1025 Commits

Author SHA1 Message Date
15b2e5c995 Remove unused row_idx in token_dispatcher (#3442)
### What this PR does / why we need it?
The `row_idx` parameter is no longer used since
PR [#2689](https://github.com/vllm-project/vllm-ascend/pull/2689), so
it is removed across multiple files to avoid unnecessary calculations and
parameter passing.

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
accuracy test passed for Qwen3 235B and DeepSeek V3 671B after this PR.


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: CaranLic <740821011@qq.com>
2025-10-15 09:08:31 +08:00
3642b64afc bugfix for mtp with multistream_moe (#3419)
### What this PR does / why we need it?
When running inference on the DeepSeek MTP layer with multistream_moe, we should pass a
boolean to indicate this feature; this PR also fixes bugs that occur in the MTP layer.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
2025-10-15 08:59:58 +08:00
c2c1db78a7 [Bugfix] fix ZeroDivisionError when prefill_tp_size > num_kv_head and fix tp_resharding README (#3437)
### What this PR does / why we need it?
Fix ZeroDivisionError when prefill_tp_size > num_kv_head. In this
situation, num_head_replica can be 0 while being used to divide another value;
this PR restricts its minimum value to 1. This PR also fixes the
tp_resharding README.
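A minimal sketch of the guard, assuming the replica count comes from an integer division (the real connector code may differ):

```
prefill_tp_size, num_kv_head = 16, 8               # prefill_tp_size > num_kv_head
num_head_replica = num_kv_head // prefill_tp_size  # integer division gives 0 here
num_head_replica = max(num_head_replica, 1)        # the fix: clamp to 1 so later divisions are safe
```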

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
By CI.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: liziyu <liziyu16@huawei.com>
Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
Co-authored-by: liziyu <liziyu16@huawei.com>
2025-10-15 08:45:44 +08:00
02c26dcfc7 [Feat] Supports Aclgraph for bge-m3 (#3171)
### What this PR does / why we need it?
[Feat] Supports Aclgraph for bge-m3

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
```
pytest -s tests/e2e/singlecard/test_embedding.py
pytest -s tests/e2e/singlecard/test_embedding_aclgraph.py
```
To start an online server with bs 10 and a sequence length of 8192 per batch, we
set --max-num-batched-tokens=8192*10 to ensure the encoder is not chunked:
```
vllm serve /home/data/bge-m3 --max_model_len 1024 --served-model-name "bge-m3" --task embed --host 0.0.0.0 --port 9095 --max-num-batched-tokens 81920 --compilation-config '{"cudagraph_capture_sizes":[8192, 10240, 20480, 40960, 81920]}'
```
For bs 10 with a sequence length of 8192 per batch, QPS improves from 85 to 104,
a 22% improvement; much of the host-bound overhead is removed.


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: xuyexiong <xuyexiong@huawei.com>
Co-authored-by: wangyongjun <1104133197@qq.com>
2025-10-14 23:07:45 +08:00
434059e417 [BugFix] Fix multimodal model support fullgraph error (#3425)
### What this PR does / why we need it?
The update_attn_params function requires the num_tokens
parameter, and num_tokens is obtained via positions.shape[0]. However,
the multimodal model uses mrope (Multidimensional Rotary Position
Embedding), which makes positions 2-D, so
positions.shape[0] retrieves an incorrect value. We resolve
this issue by replacing positions.shape[0] with maybe_padded_num_tokens.
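A toy sketch of the shape issue, assuming mrope yields a 2-D `positions` tensor (the leading dimension below is illustrative):

```
import torch

maybe_padded_num_tokens = 8
positions = torch.zeros(3, maybe_padded_num_tokens, dtype=torch.long)  # 2-D under mrope

num_tokens = positions.shape[0]        # wrong: returns the leading mrope dimension, not the token count
num_tokens = maybe_padded_num_tokens   # the fix: use the padded token count directly
```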

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: fan2956 <zhoufan53@huawei.com>
2025-10-14 21:51:09 +08:00
223cc34085 [KVCache] Refactor KVCache as page_size_bytes is ineffective (#3438)
### What this PR does / why we need it?
Refactor KVCache as page_size_bytes is ineffective.

1. Currently the `AttentionSpec` is patched, but the `page_size_bytes`
used at runtime is still the one from vLLM, so the patch actually has no
effect. This PR therefore removes the patch on `AttentionSpec`, and the
final fix will be done in vLLM.
2. Use `MLAAttentionSpec` instead of `FullAttentionSpec` to reduce the spec's
`page_size_bytes`, so that num_blocks in the spec can double (see the sketch below).
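A back-of-envelope sketch of why the smaller spec doubles num_blocks, using illustrative numbers rather than the exact vLLM formulas (full attention stores K and V per token, MLA stores a single latent):

```
block_size, num_kv_heads, head_size, dtype_bytes = 128, 1, 576, 2  # illustrative values

full_page_bytes = 2 * block_size * num_kv_heads * head_size * dtype_bytes  # K + V
mla_page_bytes = block_size * num_kv_heads * head_size * dtype_bytes       # single latent

kv_cache_bytes = 8 * 1024**3  # arbitrary KV-cache budget
print(kv_cache_bytes // full_page_bytes, kv_cache_bytes // mla_page_bytes)  # second count is 2x the first
```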

### How was this patch tested?
Test pass with Qwen3-Next and DeepSeek-V3.2-Exp

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-10-14 21:28:41 +08:00
c55d99d13e [bugfix][torchair] fix missing weight nz cast for w13_weight in torchair_w8a8_dynamic.py (#3446)
### What this PR does / why we need it?
Fix the issue of missing NZ conversion for quantized weights in GMM
after moe_dispatch operator in torchair scenario, which does not involve
aclgraph & single scenarios.

### How was this patch tested?
vllm serving passed with lower latency (~5ms TPOT with bs_per_rank=28 &
ep_size=32)

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-10-14 21:11:05 +08:00
5fe883fa43 fix the title of modelrunner's prepare inputs docs (#3457)
### What this PR does / why we need it?
Fix the wrong title of the modelrunner_prepare_inputs docs

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
pass CI

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: ChenTaoyu-SJTU <ctynb@qq.com>
2025-10-14 20:35:58 +08:00
78777237a9 [2/N][Feat] Attention and MoE weight prefetch in Qwen3MoE models (#3203)
### What this PR does / why we need it?

- Refactor and integrate a unified `WeightPrefetchMethod`
- Integrate `gate_up_proj.weight` in quantized MoE modules
- Prefetching these weights ahead of matmul-like operators improves
performance by reducing L2 cache transfer latency

### Does this PR introduce _any_ user-facing change?

Add a new config in `--additional-config` for configuration:
```json
{
    "weight_prefetch_config": {
        "enabled": true,
        "prefetch_ratio": {
            "moe": {
                "gate_up": 0.8
            }
        }
    }
}
```
This feature is enabled by default, and can be disabled through this
configuration
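A hedged usage sketch, assuming the offline `LLM` entry point forwards `additional_config` the same way the `--additional-config` flag does (the model name is illustrative):

```
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen3-30B-A3B",  # illustrative model
    additional_config={
        "weight_prefetch_config": {
            "enabled": True,
            "prefetch_ratio": {"moe": {"gate_up": 0.8}},
        },
    },
)
```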

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: yuzhup <15705211260@163.com>
2025-10-14 20:16:33 +08:00
07e39620ea [Feat] Unquantized Linear to nz and control all nz-cast (#3356)
### What this PR does / why we need it?
Currently, when execution reaches the Linear layer of models in vLLM-Ascend,
the weight format is ND in the unquantized case and the skipped-ascend case.
This PR supplements the execution logic for the Linear layer. We use a new
environment variable, VLLM_ASCEND_ENABLE_NZ. When VLLM_ASCEND_ENABLE_NZ=1 and
the CANN version is 8.3, the weights of the Linear layer will be converted
to FRACTAL_NZ, in both the unquantized case and the skipped-ascend case. We also
use VLLM_ASCEND_ENABLE_NZ to control the existing NZ conversions, such as the
w8a8-quantized case.

### Does this PR introduce _any_ user-facing change?
Add a new environment variable VLLM_ASCEND_ENABLE_NZ. If you want to use the NZ
format, set VLLM_ASCEND_ENABLE_NZ=1.
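A minimal usage sketch: the flag is read from the environment, so export it (or set it in Python) before the engine is created; the model name is illustrative.

```
import os

os.environ["VLLM_ASCEND_ENABLE_NZ"] = "1"  # opt in to FRACTAL_NZ weight casting (CANN 8.3)

from vllm import LLM

llm = LLM(model="Qwen/Qwen3-8B")  # illustrative model; Linear weights are now converted to NZ
```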

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
2025-10-14 17:39:26 +08:00
5c45c227dc [BugFix] fix qwen2.5vl quant bug (#3426)
### What this PR does / why we need it?
This PR fixes issues:
1. Resolve the issue of qwen2.5-VL quantization service startup failure:
AttributeError, 'Parameter' object has no attribute 'weight_loader'.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
- ci & e2e
- vLLM version: v0.11.0
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: elilzhu <2435754260@qq.com>
2025-10-14 17:31:26 +08:00
ee25a517d1 [BugFix] Fix the port conflict bug of running external dp with disaggregated-prefill. (#3416)
This PR fixes the port conflict bug of running external dp in
disaggregated-prefill scenario.

- vLLM version: v0.11.0

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-10-14 16:37:10 +08:00
9eb62935b8 fix pagedattention to support fullgraph. (#3436)
### What this PR does / why we need it?
Calculate in advance the workspace memory size needed for the
PagedAttention operator to avoid deadlocks during resource cleanup. This
PR requires torch_npu version 0920 or newer.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
2025-10-14 16:10:09 +08:00
22a1d91cf5 [CI] Add single request test case for aclgraph (#3392)
### What this PR does / why we need it?
This PR adds an online single-request DP2 test case for aclgraph.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
ut

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
2025-10-14 11:13:44 +08:00
4536123341 [Fix] Fix mc2_tokens_capacity-related issues (#3411)
### What this PR does / why we need it?
Replaces the hardcoded `mc2_tokens_capacity` with the max graph capture
size for a more accurate allocation.

This change ensures the capacity is correctly sized relative to the
graph capture configuration, removing a magic number and making the
setup more robust.

This PR fixes two issues:

1. <del>MC2 op restrictions differ between SoCs.</del> @Angazenn This
requires an overhaul, hence removed from this PR, please commit another
PR.
2. The hardcoded value `512` allocates too much buffer for large models.
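A conceptual sketch of the sizing change; the variable names are assumptions rather than the exact vllm-ascend fields:

```
cudagraph_capture_sizes = [1, 2, 4, 8, 16, 32, 64, 128]  # illustrative capture configuration

mc2_tokens_capacity = 512                           # old: hardcoded magic number
mc2_tokens_capacity = max(cudagraph_capture_sizes)  # new: tied to the max graph capture size
```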

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Tested in daily checks.


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-10-14 10:56:12 +08:00
19b85ef1bc [Bugfix] multi_node_pd_disaggregation_mooncake.md update (#3400)
### What this PR does / why we need it?
Update multi_node_pd_disaggregation_mooncake.md to fix issues encountered
during service startup.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By ci


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: wangxiaoteng@huawei.com <wangxiaoteng@huawei.com>
2025-10-14 09:29:35 +08:00
49b850270f [Community] Nominate new maintainers: @yiz-liu @paulyu12 @weijinqian0 @nalinaly (#3406)
I'd like to nominate 4 new maintainers for vllm-ascend: 

----

Yizhou Liu [@yiz-liu](https://github.com/yiz-liu)
----

**Review Quality**: He has completed [40+
reviews](https://github.com/vllm-project/vllm-ascend/pulls?q=is%3Apr+commenter%3Ayiz-liu)
and provided solutions or guidance for [10+
issues](https://github.com/vllm-project/vllm-ascend/issues?q=is%3Aissue%20commenter%3Ayiz-liu),
including many quality reviews such as
[#issue-3428408401](https://github.com/vllm-project/vllm-ascend/issues/3002#issue-3428408401),
[#discussion_r2224572309](https://github.com/vllm-project/vllm-ascend/pull/1803#discussion_r2224572309),
[#issuecomment-2982470226](https://github.com/vllm-project/vllm-ascend/pull/1261#issuecomment-2982470226),
[#issuecomment-2903621197](https://github.com/vllm-project/vllm-ascend/pull/836#issuecomment-2903621197),
[#issuecomment-2857678691](https://github.com/vllm-project/vllm-ascend/issues/778#issuecomment-2857678691).

**Sustained and High-Quality Contributions:** He has contributed more
than [30+
commits](https://github.com/vllm-project/vllm-ascend/commits?author=yiz-liu)
since Mar 2025; in particular, his aclgraph-, DP-, and EP-related contributions
are the main reason I nominated him. As the owner of aclgraph
support, he continuously improves aclgraph stability and performance as
well as fixes key bugs. He laid the groundwork for EP-related
functionality and delivered multiple foundational improvements.

**Community involvement:** He has a very good habit of logging
issues: https://github.com/vllm-project/vllm-ascend/issues/1649, and he is
also very active in [many
issues](https://github.com/vllm-project/vllm-ascend/issues?q=is%3Aissue%20state%3Aopen%20commenter%3Ayiz-liu%20-author%3Ayiz-liu),
helping users resolve their problems.

----

Peng Yu  [@paulyu12](https://github.com/paulyu12)
---
The main reasons for his nomination are his expertise and key
contributions to LoRA, with sustained and major contributions (initial
support/doc/bugfix) around LoRA.

**Sustained and Major Contributions:** @paulyu12 started his contribution
with [Lora and Multi-Lora
support](697908f5cd)
in Apr 2025; he has contributed about [10+ commits and
bugfixes](697908f5cd)
to vllm-ascend.
**Review Quality and Community Involvement:** He has also helped more than
10 users address [Lora related
issues](https://github.com/vllm-project/vllm-ascend/pulls?q=is%3Apr+commenter%3Apaulyu12+-author%3Apaulyu12+is%3Aclosed).

I believe his addition will further improve vLLM Ascend Lora support.

----

Jinqian Wei [@weijinqian0](https://github.com/weijinqian0)
---
The main reasons for his nomination are his key contributions to the RL
scene and the high quality of his code reviews.

**Review Quality:** He has completed [60+
reviews](https://github.com/vllm-project/vllm-ascend/pulls?q=is%3Apr+commenter%3Aweijinqian0+is%3Aopen+-author%3Aweijinqian0)
since June 2025, including high-quality reviews such as
[#comment-3284055430](https://github.com/vllm-project/vllm-ascend/pull/2791#issuecomment-3284055430),
[discussion_r2332166704](https://github.com/vllm-project/vllm-ascend/pull/2817#discussion_r2332166704), and
[discussion_r2343289692](https://github.com/vllm-project/vllm-ascend/pull/2846#discussion_r2343289692).

**Sustained and Quality Contributions:** He has a deep understanding of
the vLLM and vLLM Ascend codebases and has made solid contributions in the RL scene
(about [10+ PRs
merged](https://github.com/vllm-project/vllm-ascend/pulls?q=is%3Apr+author%3Aweijinqian0+is%3Amerged+)
and 10+ PRs merged as co-author).

- Code Refactor: As a co-author, he participated in the refactoring of
the MOE module https://github.com/vllm-project/vllm-ascend/pull/2150
https://github.com/vllm-project/vllm-ascend/pull/2706
https://github.com/vllm-project/vllm-ascend/pull/2867
- Performance Enhancement for RL: Participated as a co-author in the
design and development of the solution, contributing to the planning of
core capabilities. https://github.com/vllm-project/vllm-ascend/pull/1547
https://github.com/vllm-project/vllm-ascend/pull/2120 and so on.

So I think he's a great addition to the vLLM Ascend Maintainer team.

----

Chuanyu Qin  [@nalinaly](https://github.com/nalinaly)
---
The main reason I nominated Chuanyu Qin is that he is the initial
designer of aclgraph and torch-npu, two key components of vllm-ascend.
Considering that aclgraph will eventually become the main path for
vllm-ascend's graph mode, I propose to nominate him.

**Sustained and Major Contributions:** In fact, Chuanyu has actively helped
the users and developers of vllm-ascend since Mar 2025
([vllm-discuss#162](https://discuss.vllm.ai/t/can-ascend-officially-draft-a-documentation-on-the-vllm-ascend-adaptation-for-graph-mode/162/5)),
and also helped early users of vllm-ascend understand aclgraph. He
provided lots of help in the process of integrating aclgraph with
vllm-ascend.

**Community Involvement:** As a speaker, he also helps users
understand aclgraph and torch_npu: [《The design philosophy of torch_npu
and the high performance principle of
aclGraph》](https://github.com/PyTorch-China/pytorch-meetup/blob/main/beijing-2025/%E3%80%905%E3%80%91torch_npu%20%E7%9A%84%E8%AE%BE%E8%AE%A1%E5%93%B2%E5%AD%A6%E4%B8%8E%20aclGraph%20%E9%AB%98%E6%80%A7%E8%83%BD%E5%8E%9F%E7%90%86-%E7%A7%A6%E4%BC%A0%E7%91%9C-0920.pdf)

----

They have made active contributions to vllm-ascend or have rich experience
with Ascend AI.

Welcome!
- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-14 08:51:58 +08:00
657c08cfb2 [UT] fix skipped test_utils ut test. (#3422)
### What this PR does / why we need it?
Fix the test in `tests/ut/torchair/test_utils.py` and enable the
UT test in CI.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
vLLM version: v0.11.0rc3
vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: menogrey <1299267905@qq.com>
2025-10-14 08:31:13 +08:00
4f6d60eb06 [Feature] Add W4A4 Flat Quantization support (#3427)
Introduce W4A4 Flat Quantization for better model compression and
inference efficiency on Ascend devices.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2025-10-13 23:20:16 +08:00
6972df5951 [Feature] optimize sp & qwen3 next support sp. (#3225)
This PR will accomplish the following tasks: 
**Optimize SP**
In the old implementation, the first layer used an all_reduce and relied on
RMSNorm to split chunks. We changed it to perform reduce_scatter on the
embedding side, replacing one all_reduce operation and one chunk with one
reduce_scatter operation.
**Support Qwen3 Next**
Since Qwen3 Next includes a linear attention module, the prefix name of
this module cannot take effect directly.
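A conceptual sketch of the collective change using plain torch.distributed; it assumes an already-initialized process group and an embedding output sharded along dim 0, so it is not the actual vllm-ascend code:

```
import torch
import torch.distributed as dist

def sp_shard_old(embedding_out: torch.Tensor, tp_rank: int, tp_size: int) -> torch.Tensor:
    # old path: full-size all_reduce, then chunk for sequence parallelism
    dist.all_reduce(embedding_out)
    return embedding_out.chunk(tp_size, dim=0)[tp_rank]

def sp_shard_new(embedding_out: torch.Tensor, tp_size: int) -> torch.Tensor:
    # new path: a single reduce_scatter yields each rank's chunk directly
    local = torch.empty_like(embedding_out.chunk(tp_size, dim=0)[0])
    dist.reduce_scatter_tensor(local, embedding_out)
    return local
```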


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
2025-10-13 23:02:12 +08:00
31682961af [Feat] enable hierarchical communication for mc2 ops on A2 (#3015)
Currently, on A2, setting the environment variables
`HCCL_INTRA_PCIE_ENABLE=1` and `HCCL_INTRA_ROCE_ENABLE=0` can reduce
cross-machine communication traffic and significantly improve
communication performance.

For more details, please refer to
[document](https://www.hiascend.com/document/detail/zh/Pytorch/710/apiref/torchnpuCustomsapi/context/torch_npu-npu_moe_distribute_dispatch_v2.md)
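A minimal sketch of setting these variables before launching the engine (exporting them in the shell before `vllm serve` works the same way):

```
import os

os.environ["HCCL_INTRA_PCIE_ENABLE"] = "1"  # enable hierarchical (intra-PCIe) communication
os.environ["HCCL_INTRA_ROCE_ENABLE"] = "0"  # disable the intra-node RoCE path
# ...then start the vLLM engine in this environment.
```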

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: realliujiaxu <realliujiaxu@163.com>
2025-10-13 16:13:17 +08:00
0563106477 [Feature] mooncake connector support GQA transport (#2947)
### What this PR does / why we need it?
The previous implementation of the Mooncake connector only supported
scenarios where the Tensor Parallel sizes for the Prefill and Decode
phases were the same for MLA and GQA/MHA.

For heterogeneous TP scenarios, a single rank on a decode node needs to
pull the KV cache from multiple ranks on the prefill nodes and then
merge them (only prefill TP >= decode TP is supported for now). During this
merge, a transpose operation is required because the layouts of the KV
caches are different. To minimize transpose overhead, we use the
npu_paged_cache_load operation to extract the blocks corresponding to
the request from the KV cache. After performing the transpose, we use
_npu_reshape_and_cache to write the blocks back to their original
positions.

This process is illustrated in the diagram below.

b means block_size; this diagram illustrates the transposed KV cache layout
for one block. In the implementation, we transpose the KV cache layer by layer
for one request.

<img width="1464" height="916" alt="image"
src="https://github.com/user-attachments/assets/09d96a98-e41c-4733-9535-05544163081a"
/>

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

- vLLM version: v0.11.0
---------

Signed-off-by: chenxiao <Jaychou1620@Gmail.com>
Signed-off-by: zzy-ContiLearn <1831242919@qq.com>
Signed-off-by: zzhx1 <zzh_201018@outlook.com>
Signed-off-by: Kurumi5210 <jaychou1620@gmail.com>
Co-authored-by: zzy-ContiLearn <1831242919@qq.com>
Co-authored-by: chenxiao <cx02308786@antgroup.com>
Co-authored-by: chenxiao <Jaychou1620@Gmail.com>
Co-authored-by: zzhx1 <zzh_201018@outlook.com>
2025-10-13 15:48:37 +08:00
847d12a389 [BugFix]Fix moe load problems in torchair when using dynamic eplb (#3381)
### What this PR does / why we need it?

When using dynamic eplb, moe load is not imported. We fix this problem
by modifying the return value of hidden states in torchair.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
DeepseekV3 in A3.


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: daishixun <dsxsteven@sina.com>
2025-10-13 11:38:57 +08:00
cd69385dab Add models test and add several new models yaml (#3394)
### What this PR does / why we need it?
This PR adds accuracy CI for several new models:
- `ascend test / accuracy` is the PR-triggered check for popular models'
accuracy
- `ascend test / models` is for the accuracy report, full models test, and
nightly model test
- Add Qwen2-Audio-7B-Instruct, Qwen2-VL-7B-Instruct, Qwen3-8B,
Qwen3-VL-30B-A3B-Instruct

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Closes: https://github.com/vllm-project/vllm-ascend/pull/2330
Closes: https://github.com/vllm-project/vllm-ascend/pull/3362


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
2025-10-12 17:27:50 +08:00
d05d29ff0e Enable nightly test and add qwen3 32b test case (#3370)
### What this PR does / why we need it?
This PR adds a nightly test case for qwen3_32b bf16
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
by running the case

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: jiangyunfan1 <jiangyunfan1@h-partners.com>
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-10-12 15:46:28 +08:00
0d59a3c317 [CI] Make the test_pipeline_parallel run normally in full test (#3391)
### What this PR does / why we need it?
Make the test_pipeline_parallel take effect in full test of CI.

### Does this PR introduce _any_ user-facing change?
NA

### How was this patch tested?
NA

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-10-12 15:43:13 +08:00
bcc313e8f2 add mla_preprocess kernel (#3226)
### What this PR does / why we need it?

- Adds the `mla_preprocess` custom kernel to provide an optimized
pre-processing operator for Multi-head Latent Attention (MLA) on Ascend
NPUs.
- Wires the new kernel into the C++ extension pipeline so vLLM can
invoke it directly, cutting Python-side tensor shuffling and memory
copies that previously bottlenecked MLA compilation paths.

### Does this PR introduce any user-facing change?

- No. The change only introduces a low-level kernel; public APIs and
inference behavior remain unchanged.

### How was this patch tested?

- Dedicated Ascend kernels are not covered by our CI yet, so no extra
automated tests were added. Future MLA-focused regression runs will
cover this path.

- vLLM version: v0.11.0

Signed-off-by: Chen Chen <0109chenchen@gmail.com>
2025-10-12 07:39:45 +08:00
1b1207e3c3 [Bugfix] Add quantization param for multi-node CI (#3383)
### What this PR does / why we need it?
Add quantization param for `deepseek-w8a8` multi-node test
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-11 19:25:16 +08:00
e8c871ed0a [Test] enable external launcher and add e2e test for sleep mode in level2 (#3344)
### What this PR does / why we need it?
1. Enable tests/e2e/multicard/test_external_launcher.py
2. Add e2e test for  sleep mode in level2

### Does this PR introduce _any_ user-facing change?
not involved

### How was this patch tested?
CI passed with existing test.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: huangxialu <huangxialu1@huawei.com>
Co-authored-by: Shangwei-Li <lishangwei2@huawei.com>
2025-10-11 17:29:38 +08:00
ecb1713dfc Bugfix: Expose the user policy type interface (#3336)
This PR primarily focuses on two key changes:
1. Adjusts internal interface calls to optimize the interaction logic
between related modules.
2. Exposes an interface that allows users to select the EPLB algorithm,
enabling more flexible configuration based on specific usage scenarios.

These changes aim to enhance the usability of the system while ensuring
the stability of internal operations. Relevant unit tests have been
updated to cover the modified logic.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: Che Ruan <cr623@ic.ac.uk>
Co-authored-by: Che Ruan <cr623@ic.ac.uk>
2025-10-11 16:28:57 +08:00
e4acb2dfc7 [feat] support customized and separated hccl_buffer_size for process group initialization (#3073)
### What this PR does / why we need it?
Currently, users have to set `HCCL_BUFFSIZE` to 512~1024 to perform mc2
operators (dispatch and combine) while running moe models with large
`ep_size` and `batch_size`. This environmental variable not only affects
allocated VRAM for mc2 group, but also increases VRAM allocation for dp,
tp & ep groups, leading to significant kvcache and free_memory drops.
This PR supports to automatically calculate and set `hccl_buffer_size`
for each process group **(except mc2 group)** separately when users set
`HCCL_BUFFSIZE` for mc2 group. This can significantly reduce wasted
buffer_size set for dp, tp & ep groups.

Note that current mc2 operators can only perform communication space
partitioning based on the `HCCL_BUFFSIZE` configuration. Once they support
`hccl_buffer_size` configuration with `pg_options` while initializing the
process group, we'll calculate the required buffer size and users will no
longer need to set `HCCL_BUFFSIZE` themselves.

### Does this PR introduce _any_ user-facing change?
No. 

### How was this patch tested?
We performed E2E serving with deepseek_r1 initializing DP/TP/EP/MC2
process group and observed significant kv_cache and free_memory
increase!


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-10-11 15:55:22 +08:00
9eb103607f [1/N][CI] Add multi node test (#3359)
### What this PR does / why we need it?
This PR aims to add multi-node tests; as the first step, it adds a
`deepseek-v3` dp+tp+ep test.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-11 14:50:46 +08:00
82b6c846ca [BugFix]Fix eplb problems when using dynamic eplb. (#3364)
### What this PR does / why we need it?
When using dynamic EPLB, it will be blocked by NZ tensors. We fix these
problems by cloning the src tensor and the recv tensor.

### Does this PR introduce any user-facing change?

### How was this patch tested?
Qwen3_moe in A3.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: offline0806 <3337230449@qq.com>
Co-authored-by: offline0806 <3337230449@qq.com>
2025-10-11 14:04:02 +08:00
ca05f7d632 [Bugfix] TP size larger than KV cache head causes accuracy issues (#3366)
### What this PR does / why we need it?
Resolve the issue where, with unequal TP (Tensor Parallelism),
the TP size is larger than the number of the model's attention KV cache heads,
causing the KV cache to be duplicated, which leads to transmission
errors in the original code.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
By ci
- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
Co-authored-by: nwpu-zxr <zhouxuerong2@huawei.com>
2025-10-11 11:22:23 +08:00
ace300a549 [Bugfix] Fix the abnormal NPU memory usage in full graph mode. (#3331)
### What this PR does / why we need it?

In full graph mode, since the paged attention operator needs to be updated,
its parameters need to be retained. However, tensors such as the query, key
cache, and value cache do not need to be persistently saved, and we can
manually release this space via `weak_ref_tensor` to save memory.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: WithHades <244036962@qq.com>
2025-10-11 10:20:10 +08:00
866f5e7283 [Bugfix] Fix weight prefetching AssertionError in W8A8 MTP scene (#3361)
### What this PR does / why we need it?

- Fix `AssertionError` of `weight_prefetch_method` in the W8A8 MTP scene
- Remove the hard-coded key
(https://github.com/vllm-project/vllm-ascend/pull/3146#discussion_r2416644010)

### Does this PR introduce _any_ user-facing change?

None

### How was this patch tested?
`weight_prefetch_method is None` (tested on DeepSeek-R1-w8a8mix_MTP)

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: zhoux77899 <zhouxiang100@huawei.com>
2025-10-11 09:24:02 +08:00
8c1a4dedf3 [Bugfix]modify the enable range of _merge_multimodal_embeddings patch (#3360)
### What this PR does / why we need it?
Modify the enable range of the _merge_multimodal_embeddings patch. The
current patch is only enabled for offline inference on the platform. For
online serving, due to the addition of the worker sub-process, it
is not enabled within the sub-process.
### Does this PR introduce _any_ user-facing change?
None
### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: booker123456 <945658361@qq.com>
2025-10-11 08:37:07 +08:00
27e0f2c035 [Perf]Add YaRN custom op (#3355)
### What this PR does / why we need it?
YaRN scaling is used to improve long-sequence accuracy for models like Qwen3.
In vLLM, YaRN scaling refers to the `YaRNScalingRotaryEmbedding` class, which
inherits from the original `RotaryEmbedding`. Although
`YaRNScalingRotaryEmbedding` does not rewrite the `forward` function of
`RotaryEmbedding`, using YaRN on NPU still runs into the native
implementation of forward in `RotaryEmbedding`, rather than forward_oot
in vLLM-Ascend. Thus I register another custom op here to enable the oot
implementation for YaRN in vLLM-Ascend, similar to #3151.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: Angazenn <supperccell@163.com>
2025-10-11 08:36:20 +08:00
ee0a95e47f bugfix for mtp when running torchair in a2 (#3354)
### What this PR does / why we need it?
When the op torchair_fused_experts_with_mc2 is called, we need to pass a TP
group, but currently it is only passed in the quantized scenario; we also need
to pass it in the unquantized scenario.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
2025-10-10 23:07:24 +08:00
90e00deaa9 [Bugfix] Optimized exception throwing when stream captures exception (#3322)
### What this PR does / why we need it?
Optimized exception throwing when the stream captures an exception, resolving
a possibly misleading message.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
2025-10-10 17:09:28 +08:00
1756efa5fd [Feat][Graph] Support FULL_DECODE_ONLY mode for MLA models (#3125)
### What this PR does / why we need it?
Adds support for capturing the Multi-Layer Attention (MLA) decode
operation into an ACL graph. This improves performance by compiling the
attention kernel for single-token decoding.

Key changes include:
- Implementing the graph capture logic for the MLA kernel, including
workspace management and parameter updates.
- Modifying the rotary embedding (RoPE) handling to use pre-allocated
tensors, which is a requirement for graph capture.
- Adding a `build_for_graph_capture` method to the MLA metadata builder
to create dummy metadata during the graph compilation phase.

Known issues:
- Currently, MTP is not supported in FULL_DECODE_ONLY mode -- we're
working on a fix
- We are preparing to remove update_mla_attn_params with
auto_dispatch_capture

### Does this PR introduce _any_ user-facing change?
compilation_config={
    "cudagraph_mode": "FULL_DECODE_ONLY",
},
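A hedged end-to-end sketch of where the snippet above plugs in, assuming the offline `LLM` constructor accepts the same compilation_config dict (the model name is illustrative):

```
from vllm import LLM

llm = LLM(
    model="deepseek-ai/DeepSeek-V3",  # illustrative MLA model
    compilation_config={"cudagraph_mode": "FULL_DECODE_ONLY"},
)
```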
### How was this patch tested?


- vLLM version: v0.11.0

---------

Signed-off-by: panchao-hub <315134829@qq.com>
Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-10-10 16:31:20 +08:00
ba19dd3183 Revert PTA upgrade PR (#3352)
We noticed that torch-npu 0919 doesn't work. This PR reverts the related
changes which rely on the 0919 version.
Reverted PRs: #3295  #3205  #3102

Related: #3353

- vLLM version: v0.11.0
2025-10-10 14:09:53 +08:00
601a37aeff [Fixbug] Fix accuracy template (#3088)
### What this PR does / why we need it?
Fix empty lines between lm_eval command lines in the accuracy template

- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-10-10 09:03:21 +08:00
6ae75933da [Feat] Load balance of tokens across experts in dummy_run (#3184)
### What this PR does / why we need it?
Due to the special input data during the dummy run, the majority of
tokens are distributed on DP0TP0, which results in insufficient
available KV cache on DP0TP0.
This PR changes the `topk_ids` of the dummy_run input from all zeros to
random values.
This is a naive implementation of expert load balancing, so as to avoid
accumulating too many tokens on a single rank.
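A minimal sketch of the change, with illustrative shapes:

```
import torch

num_tokens, top_k, global_num_experts = 512, 8, 256

topk_ids = torch.zeros(num_tokens, top_k, dtype=torch.int32)  # old: every token routed to expert 0
topk_ids = torch.randint(0, global_num_experts, (num_tokens, top_k), dtype=torch.int32)  # new: spread the load
```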

### How was this patch tested?
model: DeepSeek-v3-w8a8
```bash
vllm serve DeepSeek-v3-w8a8 \
    --host 0.0.0.0 \
    --port 8004 \
    --data-parallel-size 2 \
    --tensor-parallel-size 8 \
    --quantization ascend \
    --seed 1024 \
    --enforce-eager \
    --served-model-name deepseek_v3 \
    --enable-expert-parallel \
    --disable-log-stats \
    --max-num-seqs 18 \
    --max-model-len 8192 \
    --max-num-batched-tokens 8192 \
    --trust-remote-code \
    --no-enable-prefix-caching \
    --gpu-memory-utilization 0.9 \
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
    --additional-config \
    '{"ascend_scheduler_config":{"enabled":false},"torchair_graph_config":{"enabled":false}}' 
```

The Available memory: **2728672256** -> **6771544064**
KV Cache size: **38144** -> **95232** tokens

After enabling load balance


- vLLM version: v0.11.0

---------

Signed-off-by: chenmenglong <chenmenglong1@huawei.com>
2025-10-10 09:00:07 +08:00
60b7c936c5 [Doc] Update deepseek-v3.2 doc (#3319)
### What this PR does / why we need it?
Upgrade deepseek-v3.2 doc for A2
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-10 08:55:39 +08:00
579b7e5f21 add pagedattention to support FULL_DECODE_ONLY. (#3102)
### What this PR does / why we need it?
Calculate in advance the workspace memory size needed for the
PagedAttention operator to avoid deadlocks during resource cleanup. This
PR requires torch_npu version 0920 or newer.
### How was this patch tested?


- vLLM version: v0.11.0

---------

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
2025-10-10 08:50:33 +08:00
1c2c72af8d [bugfix]change log2phy map to npu (#3339)
### What this PR does / why we need it?
Resolved the issue of EPLB failure caused by changes in the log2phy map
due to device type modifications when using MTP rotary position
encoding.

### Does this PR introduce any user-facing change?

### How was this patch tested?
https://github.com/vllm-project/vllm/commit/releases/v0.11.0


- vLLM version: v0.11.0

---------

Signed-off-by: offline0806 <3337230449@qq.com>
Co-authored-by: offline0806 <3337230449@qq.com>
2025-10-10 08:47:55 +08:00
55e23fabec [bugfix] fix connector registration failure (#3335)
### What this PR does / why we need it?
Register the connector in the plugin
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: fems14 <1804143737@qq.com>
2025-10-09 21:09:54 +08:00
ff37575936 [1/N][Feat] Add weight prefetch feature for Attention layers (#3146)
### What this PR does / why we need it?

- Refactor and integrate a unified `WeightPrefetchMethod`
- Integrate `qkv_proj.weight` and `o_proj.weight` in quantized Attention
modules
- Prefetching these weights ahead of matmul-like operators improves
performance by reducing L2 cache transfer latency

### Does this PR introduce _any_ user-facing change?

Add a new config in `--additional-config` for configuration:
```json
{
    "weight_prefetch_config": {
        "enabled": false,
        "prefetch_ratio": {
            "attn": {
                "qkv": 1.0,
                "o": 1.0
            }
        }
    }
}
```
This feature is enabled by default, and can be disabled through this
configuration

### How was this patch tested?


- vLLM version: v0.11.0

---------

Signed-off-by: yuzhup <15705211260@163.com>
Signed-off-by: zhoux77899 <zhouxiang100@huawei.com>
Co-authored-by: yuzhup <15705211260@163.com>
2025-10-09 20:38:39 +08:00
23db56a340 [Feat]Qwen3 Moe supports npu_add_rms_norm_quant op by default, update op with norm bias (#3205)
### What this PR does / why we need it?
1. Qwen3 MoE uses the add_rms_norm_quant op instead of separate 'add_rms_norm
and quant ops' in the quantization scenario.
2. The torch_npu.add_rms_norm_quant op fixes accuracy when model weights are
quantized by anti_method m4; m4 quantization is an asymmetric outlier
suppression method that generates a non-zero norm bias, and the
add_rms_norm_quant op was updated to include this parameter in its calculation
(see the reference sketch below).
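A pure-torch reference sketch of what the fused op computes (residual add, RMSNorm with an optional norm bias that is non-zero under anti_method m4, and int8 quantization); this is not the torch_npu kernel signature:

```
import torch

def add_rms_norm_quant_ref(x, residual, weight, norm_bias, scale, eps=1e-6):
    h = x + residual                                                 # residual add
    normed = h * torch.rsqrt(h.pow(2).mean(-1, keepdim=True) + eps)  # RMSNorm
    normed = normed * weight + norm_bias                             # norm bias is non-zero for m4-quantized weights
    q = torch.clamp(torch.round(normed / scale), -128, 127).to(torch.int8)
    return q, h                                                      # quantized activation and updated residual
```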

### Does this PR introduce _any_ user-facing change?
please use a torch_npu version >= torch_npu-2.7.1.dev20250919

### How was this patch tested?
1. no special parameters to set, no new envs to set.
2. use qwen3 moe quantization model to test ,such as
Qwen3-235B-A22B-W8A8, Qwen3-30B-A3B-W8A8,
Qwen3-235B-A22B-Instruct-2507-m4 (anti_method m4)

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

---------

Signed-off-by: huangdong2022 <huangdong51@huawei.com>
Signed-off-by: h30027576 <huangdong51@huawei.com>
2025-10-09 20:18:10 +08:00
81aff9c555 bugfix for mtp (#3300)
### What this PR does / why we need it?
When mtp > 1, we need to refresh cos and sin in each step.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?

- vLLM version: v0.11.0

Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
2025-10-09 19:22:46 +08:00
30c5d947c3 [bugfix]fix multistream moe in torchair (#3164)
### What this PR does / why we need it?

The multistream MoE in torchair is only validated for decode and can't be
applied to chunked prefill, so we add some checks to isolate the
scenario.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: hust17yixuan <303660421@qq.com>
2025-10-09 19:00:32 +08:00
94dd832815 [MoE] [Refactor] Combine common_fused_moe and fused_moe (#3176)
### What this PR does / why we need it?
1. Move additional functionalities from fused_moe.py to
common_fused_moe.py and remove fused_moe.py
2. Remove unnecessary custom classes from qwen3_moe.py, and it will be
completely removed after we release vllm-ascend v0.11.0

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

Qwen3-30B-A3B/Qwen3-30B-A3B-W8A8/DeepSeek-V3-W4A8-Pruing/deepseek-mtp/pangu-pro-moe-pruing:

1. Enable/Disable EP
2. Aclgraph & eager
3. SP


- vLLM version: v0.11.0

---------

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
2025-10-09 14:12:46 +08:00
a36e3da78e [Misc] Drop 0102 related lines (#3323)
### What this PR does / why we need it?
Since https://github.com/vllm-project/vllm-ascend/pull/3284 was merged, we
should discard some extra code that was previously added for version
compatibility.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-10-09 14:10:57 +08:00
1c5b302f0d [Misc] Clean up useless patch (#3320)
### What this PR does / why we need it?
1. clean up v0.10.2 support in ut and e2e tests
2. remove the v0.11.0 periodic job; we're on v0.11.0 now
3. remove useless patches for deepseek v3.2; they have already been done in
vLLM
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-09 14:07:26 +08:00
a43e2f61e1 [CI] Update vLLM to v0.11.0 (#3315)
### What this PR does / why we need it?
There are 3 steps to upgrade vllm-ascend to the newest vLLM. We'll create 3
PRs:

- [x] Upgrade vllm to v0.11.0 to make CI happy first.
- [ ] Move deepseek v3.2 to vllm way
- [ ] Then we'll add a new PR to add vllm main support.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.11.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-09 10:41:19 +08:00
f12f76d7ba Drop 0.10.2 (#3284)
Drop v0.10.2 support, we support vLLM 0.11.0rc3 now.
- vLLM version: v0.11.0rc3
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-10-09 10:28:38 +08:00
2dde1268c7 Fix doc for A2 series and cleanup note (#3307)
### What this PR does / why we need it?
Fix doc for A2 series and cleanup note

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

- vLLM version: v0.11.0rc3
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-10-01 14:39:48 +08:00
474fa737c8 [bugfix] Fix moe bug: allgather error. (#3279)
It will crash when the deepseek model is executed on A2.


- vLLM version: v0.11.0rc3
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
2025-09-30 18:45:09 +08:00
b8c58d68e1 [Doc] Add deepseek v3.2 tutorial (#3275)
Add deepseek v3.2 tutorial

- vLLM version: v0.11.0rc3
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-30 17:54:31 +08:00
4abdcdba4e upgrade pta to 0919 (#3295)
### What this PR does / why we need it?
Upgrade torch-npu to the newest POC version
### Does this PR introduce _any_ user-facing change?
yes, user need upgrade the pta version as well.
### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-30 17:14:23 +08:00
3a27b15ddc [bugfix] Fix Qwen3-30B-A3B dp parallel hung issue when running with the dp parallel example (#3287)
### What this PR does / why we need it?
Fix Qwen3-30B-A3B dp parallel hung issue when running with the dp
parallel example.
For large-parameter models of Qwen3-30B and above, weight loading alone
takes 4 to 5 minutes. Therefore, the 5-minute timeout in the current
example code implementation is too short, causing some DP instances to
be killed prematurely and eventually stuck in the DP synchronization
all-reduce operation.

### Does this PR introduce _any_ user-facing change?
NA

### How was this patch tested?
NA

vLLM version: v0.11.0rc3
vLLM main: https://github.com/vllm-project/vllm/commit/releases/v0.11.0

- vLLM version: v0.11.0rc3
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-09-30 15:30:01 +08:00
a486ff8c11 KVCache Transfer via Layer-wise Strategy in Disaggregation (#2602)
### What this PR does / why we need it?
See RFC: https://github.com/vllm-project/vllm-ascend/issues/2470. This PR
adds a new KV connector for layer-wise KV transfer.

### Does this PR introduce _any_ user-facing change?
Yes, a new KV connector is added. Users can use the layer-wise feature now.
### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: leichao.lc <leichao139636@163.com>
Signed-off-by: CaveNightingale <2859066733@qq.com>
Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
Signed-off-by: hanxinlong <50882499@qq.com>
Signed-off-by: liziyu <liziyu16@huawei.com>
Co-authored-by: CaveNightingale <2859066733@qq.com>
Co-authored-by: nwpu-zxr <zhouxuerong2@huawei.com>
Co-authored-by: wangxiaoteng <wangxiaoteng@huawei.com>
Co-authored-by: hanxinlong <50882499@qq.com>
2025-09-30 15:10:29 +08:00
f8c93d8d24 [Aclgraph][DP] Fix dp dummy run not in aclgraph error (#3208)
### What this PR does / why we need it?
When running DP in an unbalanced scenario, which means some
DP groups are executing `dummy_run`, we need to make sure they run in the
same mode as the other DP groups, thus improving performance in the DP
scenario.
### How was this patch tested?
Tested by adding log in `_dummy_run`

- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-09-30 11:14:51 +08:00
ddf4d53ca3 [bugfix] Fix bugs in _dumm_run and re-initialize kv-cache. (#3262)
### What this PR does / why we need it?
Currently we run an extra profile_run with `num_tokens ==
self.mc2_tokens_capacity`. However, when setting `max_num_batched_tokens
< self.mc2_tokens_capacity`, this will trigger an assertion error that
requires num_tokens in `_dummy_run` to be smaller than
`max_num_batched_tokens`. This PR skips this extra `profile_run` if
`self.max_num_tokens <= self.mc2_tokens_capacity` so as to avoid this
bug.

This PR also fixes a bug where `kernel_block_sizes` never equals
`[self.cache_config.block_size]`. `kernel_block_sizes` is of type
List[List[int]], so the condition should be `kernel_block_sizes !=
[[self.cache_config.block_size]]`. This also helps resolve an issue
where cpu_offload_gb cannot be enabled.
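A toy illustration of the comparison fix (values made up):

```
kernel_block_sizes = [[128]]  # List[List[int]], as produced at runtime
block_size = 128

print(kernel_block_sizes != [block_size])    # True: the old check never saw a match
print(kernel_block_sizes != [[block_size]])  # False: the fixed check compares like with like
```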

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: Angazenn <supperccell@163.com>
2025-09-30 10:54:14 +08:00
00ba071022 [Doc] Release note for v0.11.0rc0 (#3224)
### What this PR does / why we need it?
Add release note for v0.11.0rc0

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.11.0rc3
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-30 03:26:18 +08:00
81bd6e4c99 Add DeepSeek V3.2 support (#3270)
### What this PR does / why we need it?

This PR added the initial DeepSeek V3.2 support with [vLLM
v0.11.0](https://github.com/vllm-project/vllm/tree/releases/v0.11.0)
(not released yet). We will complete the vLLM adaptation as soon as
possible. This feature will be ready in the next 1-2 days.

Related doc: https://github.com/vllm-project/vllm-ascend/pull/3223 .

### Does this PR introduce _any_ user-facing change?
Yes!

### How was this patch tested?
CI passed and Run deepseek doc soon.


- vLLM version: v0.11.0rc3
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: zzzzwwjj <1183291235@qq.com>
Signed-off-by: linfeng-yuan <1102311262@qq.com>
Signed-off-by: wxsIcey <1790571317@qq.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: zzzzwwjj <1183291235@qq.com>
Co-authored-by: linfeng-yuan <1102311262@qq.com>
Co-authored-by: wxsIcey <1790571317@qq.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
2025-09-30 03:25:58 +08:00
5503a3142f Bump version to v0.11.0rc3 (#3213)
### What this PR does / why we need it?
Bump version to v0.11.0rc2 and prepare vLLM Ascend v0.11.0rc0

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed


- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-29 21:48:06 +08:00
83092d9b8b [BugFix] Fix Qwen3-Next because of vllm #24982 (#3221)
- Fixes Qwen3-Next because of vllm #24982

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
```
from vllm import LLM, SamplingParams

def main():
    prompts = [
        "窗前明月光,",
        "The president of the United States is Mr.",
        "The capital of France is",
        "The future of AI is",
        "感时花溅泪,",
        "家书抵万金啥意思?",
        "plz tell me a story: ",
    ]

    # Create a sampling params object.
    sampling_params = SamplingParams(max_tokens=100, temperature=0.6, top_k=40, top_p=0.95)
    # Create an LLM.
    llm = LLM(
        model="Qwen/Qwen3-Next-80B-A3B-Instruct",
              tensor_parallel_size=4,
              enforce_eager=True,
              trust_remote_code=True,
              max_model_len=256,
              gpu_memory_utilization=0.7,
              block_size=64
              )

    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```


- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-09-29 15:27:30 +08:00
c73dd8fecb [CI] Fix CI by addressing max_split_size_mb config (#3258)
### What this PR does / why we need it?
Fix CI by addressing max_split_size_mb config

### Does this PR introduce _any_ user-facing change?
No, test only

### How was this patch tested?
Full CI passed, especially the eagle one


- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-29 14:05:12 +08:00
69cc99d004 Add restriction conditions to the ApplyTopPTopK operator (#3254)
### What this PR does / why we need it?
Add restriction conditions to the ApplyTopPTopK operator: 1 <= K <= 1024
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: SunnyLee219 <3294305115@qq.com>
2025-09-29 14:04:58 +08:00
065486820b [Doc] add faqs:install vllm-ascend will overwrite existing torch-npu (#3245)
### What this PR does / why we need it?
Add FAQ: installing vllm-ascend will overwrite the existing torch-npu

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
2025-09-29 12:02:23 +08:00
373f84a193 [Bugfix] Fix the error "cur batch_size is invalid" during profile_run in the torchair scenario (#3243)
### What this PR does / why we need it?
Fix the error "cur batch_size is invalid" during profile_run in the
torchair scenario.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: WithHades <244036962@qq.com>
2025-09-29 11:51:07 +08:00
8870966031 [bugfix] Fix warning bug: model config is None. (#3238)
Cleanup wrong warning log error: model config is None

- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
2025-09-29 09:44:49 +08:00
15b8aff582 [CI] Add max_split_size_mb for e2e test to avoid oom (#3252)
### What this PR does / why we need it?
We add a patch for the model weight loader to avoid using vLLM weight loader
v2, since v2 leads to an unknown issue for torchair. However, this patch causes
an unknown memory usage problem. As a quick fix, let's
expand `max_split_size_mb` to a larger value to avoid the weight-load
OOM issue.
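As a sketch of what such a setting looks like, assuming the NPU caching allocator honors `PYTORCH_NPU_ALLOC_CONF` the way the CUDA allocator honors `PYTORCH_CUDA_ALLOC_CONF` (the value below is illustrative; verify the exact variable against the torch_npu docs):

```
import os

os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"  # hypothetical CI environment tweak
```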

A further solution is to remove the patch and address weight loader v2
from vLLM.

Closes: https://github.com/vllm-project/vllm-ascend/issues/3251

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-29 09:13:08 +08:00
050d202bb9 [Quickfix] Fix dp+ep+tp error when sp chunked the hidden_states (#3246)
### What this PR does / why we need it?
Fix dp+ep+tp inplace copy error when sp chunked the `hidden_states`.


### How was this patch tested?
test locally with the following scripts
```bash
python examples/offline_data_parallel.py \
        --model="Qwen/Qwen3-30B-A3B" \
        --dp-size=2 \
        --tp-size=2 \
        --enable-expert-parallel
```

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-09-29 09:12:49 +08:00
cf445c41f9 [Doc]Add qwen3_vl series guide (#3227)
### What this PR does / why we need it?
This PR provides user guide documents for Qwen3-VL 4B and
Qwen3-VL-235B-A22B.
### Does this PR introduce _any_ user-facing change?
None
### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: booker123456 <945658361@qq.com>
2025-09-28 21:35:52 +08:00
14d4ed5f0c [BugFix] Fix aclgraph accu problem in A2. (#3163)
This PR fixes an accuracy problem of aclgraph on A2. The problem was
introduced by PR #2980, which made the `all_reduce` of shared_experts
exposed to torch dynamo. This PR moves all the code into forward_impl
to shield it from torch dynamo.

- vLLM version: v0.10.2
- vLLM main:
17b4c6685c

---------

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-09-28 21:31:55 +08:00
c3fee66806 [Model] Optimizing gemma3 model's GemmaRMSNorm function (#3151)
### What this PR does / why we need it?
Before optimizing, the RMSNorm time in one decoding step is 531.5 us; after
optimizing, it is 105 us.
I closed the previous
PR (https://github.com/vllm-project/vllm-ascend/pull/2456) by mistake and
have resubmitted it now.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
b1068903fd

---------

Signed-off-by: socrahow <suzihao4@h-partners.com>
2025-09-28 21:19:10 +08:00
dd56e9306b [3/N][Refactor][Qwen3-Next] Refactor model structure and fix bug by vllm #25400 (#3142)
### What this PR does / why we need it?
Refactor model structure in qwen3_next.py to reduce code lines.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
```
from vllm import LLM, SamplingParams


def main():
    prompts = [
        "The future of AI is",
    ]

    # Create a sampling params object.
    sampling_params = SamplingParams(max_tokens=100, temperature=0.6, top_k=40, top_p=0.95)
    # Create an LLM.
    llm = LLM(
        model="Qwen/Qwen3-Next-80B-A3B-Instruct",
        tensor_parallel_size=4,
        enforce_eager=True,
        trust_remote_code=True,
        max_model_len=256,
        gpu_memory_utilization=0.7,
        block_size=64,
    )
    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    main()
```


- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-09-28 21:14:36 +08:00
4ff422c730 [CI][Bugfix] Quickfix for DPMetaData (#3234)
### What this PR does / why we need it?
Fix `DPMetadata` and `Qwen3MoeSparseMoeBlock` breakage introduced by
26a7a33b88 (diff-c1550d0a38469d039370567d8981969530cbfffc7302cd1778e7c2c8a9322dea)

NOTE: we maintain a different sp implementation in vllm-ascend from vllm, so we can
just use `cu_tokens_across_sp(1)` as `cu_tokens_across_dp_cpu`

close https://github.com/vllm-project/vllm-ascend/issues/3236,
https://github.com/vllm-project/vllm-ascend/issues/3239
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-09-28 21:11:22 +08:00
f2d8493221 [BugFix] Fix ascend scheduler assert error (#3191)
### What this PR does / why we need it?
Running a multimodal model with the ascend scheduler may trigger the assertion
error `assert (request.num_tokens - request.num_computed_tokens) == 1`.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
17b4c6685c

---------

Signed-off-by: fan2956 <zhoufan53@huawei.com>
2025-09-28 18:22:08 +08:00
68c5401ad6 [Eagle] Fix attn_mask index out of range in high concurrency situations (#3187)
### What this PR does / why we need it?
- Fixes the bug where multiple calls (maybe >100) to eagle3-qwen3-8b often incur an "attn_mask index out of range" error

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
```
 python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --served-model-name Eagle3 --port 8000  --model Qwen/Qwen3-8B   --seed 42     -tp 1  --speculative_config '{"model": "Tengyunw/qwen3_8b_eagle3", "draft_tensor_parallel_size": 1, "num_speculative_tokens": 5, "method": "eagle3"}'
```

Co-authored-by: liuruijin17
[ricklrj@outlook.com](mailto:ricklrj@outlook.com)
- vLLM version: v0.10.2
- vLLM main:
52d0cb8458

Signed-off-by: Icey <1790571317@qq.com>
2025-09-28 18:09:26 +08:00
1705501ae2 [BugFix] Fix ACLgraph bug in Qwen3_32b_int8 case (#3204)
### What this PR does / why we need it?
1. Solved the issue where sizes capture failed for the Qwen3-32b-int8
model when aclgraph, dp1, and tp4 were enabled.
2. Added the exception thrown when sizes capture fails and provided a
solution
3. Added this common problem to the FAQ doc
### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
ut

- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
2025-09-28 17:44:04 +08:00
a86ece5e39 [Bugfix][LoRA] Fix forward error and shape mismatch when using LoRA (#3153)
### What this PR does / why we need it?
Relying on #3044, this PR aims to further fix:
1. The forward error that occurred when `LogitsProcessorWithLoRA` calls
`AscendLogitsProcessor.forward`. Since `LogitsProcessorWithLoRA`
bypasses the MRO to call it, `super().forward(...)` in
`AscendLogitsProcessor.forward` will raise an error. This PR fixes it by
directly invoking `LogitsProcessor.forward(self, ...)` (see the sketch after this list);
2. The shape mismatch in `add_lora_logits` in punica_npu.py. The
`lora_a_stacked` and `lora_b_stacked` are organized as [num_loras, 1,
lora_rank, hidden_size] and [num_loras, 1, vocab_size, lora_rank] shapes
respectively, but they are misunderstood in #1583---the last two
dimensions were assumed in reverse order, which causes errors in
`bgmv_shrink` and `bgmv_expand`. This PR fixes it by reverting it to the
previous version to align with the implementation in punica_cpu.py in
vllm.
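
A minimal sketch of the MRO fix from point 1 (class and method signatures are simplified stand-ins for the vllm ones):
```
import torch


class LogitsProcessor:
    def forward(self, lm_head: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
        return hidden_states @ lm_head.t()


class AscendLogitsProcessor(LogitsProcessor):
    def forward(self, lm_head: torch.Tensor, hidden_states: torch.Tensor) -> torch.Tensor:
        # was: super().forward(...) -- breaks when LogitsProcessorWithLoRA bypasses the MRO
        return LogitsProcessor.forward(self, lm_head, hidden_states)
```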

### Dependencies
This PR depends on changes introduced by #3044 (LoRA support for
`AscendQKVParallelLinear` and `AscendMergedQKVParallelLinear` layers).

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
The LoRA-related tests, e.g., test_ilama_lora.py and
test_ilama_lora_tp2.py, use ilama-3.2-1B, and this model is regarded as
`TransformersForCausalLM`, where `embedding_modules` attribute lacks
`lm_head`. However, `LlamaForCausalLM` and most other models include
both `embed_tokens` and `lm_head` in `embedding_modules`. This attribute
contributes to `supported_lora_modules` when using LoRA in vllm.
Therefore, without `lm_head` in `embedding_modules`, current tests using
ilama-3.2-1B are unable to find the above errors since
`LogitsProcessorWithLoRA` replacing `lm_head` is skipped. Simply using
Meta-Llama-3.1-8B-Instruct can reproduce the above errors and check
whether these fixes can work. What's more, it's necessary to add more
comprehensive tests for LoRA.

- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

Signed-off-by: Zetong Li <slippersss@126.com>
2025-09-28 17:30:50 +08:00
3d21ed9ee8 [Bugfix]Fix quant_config input parameter bug in qwenvl series (#3220)
### What this PR does / why we need it?
Fix quant_config input parameter bug in qwenvl series. Currently,
non-instantiated variables should be passed.
### Does this PR introduce _any_ user-facing change?
None
### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: booker123456 <945658361@qq.com>
2025-09-28 14:08:24 +08:00
96089b5155 Add vLLM 0.11.0 release hourly job (#3215)
### What this PR does / why we need it?
Add vLLM 0.11.0 release hourly job to monitor release branch changes

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

- vLLM version: v0.10.2
- vLLM main:
https://github.com/vllm-project/vllm/commit/releases/v0.11.0

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-27 23:15:41 +08:00
859e861d92 [main][quantization] Support deepseek w4a8 per-channel quantization (#3011)
### What this PR does / why we need it?
1.Support deepseek w4a8 per-channel quantization
2.The eager mode supports converting weights to the NZ format
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
#### How to get weights using Modelslim

##### Installation steps

git clone https://gitcode.com/Ascend/msit.git
cd msit/msmodelslim
bash install.sh

##### Generate w4a8 per-channel weights

cd /example/DeepSeek
Command reference: msmodelslim/example/DeepSeek/README.md

- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

---------

Signed-off-by: Wang Kunpeng <1289706727@qq.com>
2025-09-27 21:01:16 +08:00
e9359bd8fa [CI] Pin vLLM to releases/v0.11.0 (#3211)
### What this PR does / why we need it?
- Pin vLLM commit to releases/v0.11.0 branch.
- Fix the breaking change introduced by vLLM commit
d4d9899860

### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
17b4c6685c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-27 10:41:48 +08:00
9caf6fbaf5 [Bugfix][LoRA] Fix LoRA bug after supporting Qwen3-Next (#3044)
### What this PR does / why we need it?
The LoRA e2e test uses the ilama-3.2-1B model, which uses the transformers.py model
files. Its self-attention layer names end with "\*.attn", not
"\*.self_attn".

Some other models' attention layer names also end with "\*.attn", such
as baichuan.py and bert.py.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py

- vLLM version: v0.10.2
- vLLM main:
17b4c6685c

---------

Signed-off-by: paulyu12 <507435917@qq.com>
2025-09-26 11:12:45 +08:00
8406aafaff Add e2e test related to weight updates in RL scenarios. (#2954)
### What this PR does / why we need it?
Add e2e test related to weight updates in RL scenarios.

Due to CI issues, the newly added Python test files cannot locate the
correct path. As a temporary solution, use absolute paths to add test
cases.

- vLLM version: v0.10.2
- vLLM main:
52d0cb8458

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: Shangwei-Li <lishangwei2@huawei.com>
2025-09-26 11:07:10 +08:00
d8a9cb8458 [Bugfix] fix bug when tp=1 (#3193)
### What this PR does / why we need it?
Addresses a bug in DenseOptimRowParallelOp that occurs when tensor
parallelism is not used
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
52d0cb8458
2025-09-26 10:55:32 +08:00
b72e3327a6 bugfix for mtp>1 (#3174)
### What this PR does / why we need it?
fix bugs when mtp>1, and reorder input batch when mtp is not accepted.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
by ci

- vLLM version: v0.10.2
- vLLM main:
52d0cb8458

---------

Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
2025-09-26 09:04:16 +08:00
69509bcdd6 [bugfix] fix oom in aclgraph (#3158)
### What this PR does / why we need it?
fix oom in aclgraph.

1. In the current token dispatch implementation, tensors are mounted on
class instances to facilitate parameter passing between different
methods. This approach prevents automatic recycling of these tensors and,
in some cases, may lead to out-of-memory errors. To address this issue,
we manually set these tensors to None to release the corresponding memory
(see the sketch below).

2. The `profile_run` method is designed to accurately estimate the
maximum NPU memory usage during vLLM inference. However, in certain
scenarios, MoE models perform inference via MC2, which includes
communication and consumes additional NPU memory. This leads to
inaccurate estimation by the profile run. We address this by actively
triggering MC2 during the profile run for initialization.
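
A minimal sketch of the first fix (attribute names are hypothetical):
```
import torch


class TokenDispatcher:
    def __init__(self) -> None:
        self.expanded_x: torch.Tensor | None = None    # hypothetical cached tensor
        self.topk_weights: torch.Tensor | None = None  # hypothetical cached tensor

    def release(self) -> None:
        # drop the instance references so the allocator can reclaim the buffers
        self.expanded_x = None
        self.topk_weights = None
```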

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
52d0cb8458

Signed-off-by: WithHades <244036962@qq.com>
2025-09-26 08:57:47 +08:00
621aa7d270 fix error async_scheduler can't be enabled (#3127)
### What this PR does / why we need it?
PR #2894 makes ascend_scheduler_config.enabled always `True` for
non-MLA models. When `ascend_scheduler_config.enabled=True`, it will
always initialize `AscendScheduler`, which is a subclass of `Scheduler`,
but when we enable async_scheduling, we need to initialize
`AsyncScheduler` in vllm, so async_scheduling cannot be
enabled.

### Does this PR introduce _any_ user-facing change?
not-related

### How was this patch tested?
When the user sets `async_scheduling`, it means the user doesn't want to use
`AscendScheduler`, so we shouldn't set `ascend_scheduler_config.enabled
= True`.

- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
2025-09-26 08:51:54 +08:00
14497b748d Remove qwen3 moe MC2 cumsum & cast (#3126)
### What this PR does / why we need it?
The Qwen3 moe MC2 graph currently has two redundant computational
operator implementations. After npu_moe_distribute_dispatch_v2, the
cumsum and cast operations have been added. By using
expert_token_nums_type=0 and not converting weight_scale to float32,
these two operators can be eliminated, thereby improving inference
performance.

### Does this PR introduce any user-facing change?
No

### How was this patch tested?
No need

- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

---------

Signed-off-by: florenceCH <gaoxiang120@huawei.com>
Co-authored-by: florenceCH <gaoxiang120@huawei.com>
2025-09-26 08:51:30 +08:00
2930e4a6bd [CI] Upgrade vllm to newest commit (#3182)
### What this PR does / why we need it?
Upgrade vLLM to newest commit

- Fix the aclgraph doesn't work problem, caused by
24fab45d96
- Fix PoolerOutput import error, caused by
755ed7b05b
- Fix the aclgraph weight load error to keep the same with torchair fix.
4492e3a554

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
All test should pass


- vLLM version: v0.10.2
- vLLM main:
52d0cb8458

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-26 06:18:15 +08:00
0794f64a18 Revert "[Disagg][Perf] Use NPU event sync instead of blocking tolist (#3194)
…to avoid unintentional copy ops blocking across different NPU streams,
improving disagg TTIT/TTFT (#2788)"



### What this PR does / why we need it?
This reverts commit 6995a7bc5be41908da9f64c2b8def298adf3b058. We'll add
it back once the issue is fixed.

related issue: https://github.com/vllm-project/vllm-ascend/issues/3195

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
52d0cb8458
2025-09-26 06:17:36 +08:00
31dda3f557 [Model]Add support for qwen3_vl and qwen3_vl_moe (#3103)
### What this PR does / why we need it?
This PR is for the adaptation and optimization of qwen3_vl and
qwen3_vl_moe on the Ascend platform.
### Does this PR introduce _any_ user-facing change?
None
### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
b1068903fd

---------

Signed-off-by: booker123456 <945658361@qq.com>
2025-09-25 18:50:12 +08:00
f7a3815bff [CI] Do not drop ready label when PR is merge conflict (#3173)
### What this PR does / why we need it?
The `ready` label is now used to trigger the full e2e test. If a PR is
ready and later has a merge conflict, there is no need to drop the ready label.

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Just a github action change. No need for function test.

- vLLM version: v0.10.2
- vLLM main:
52d0cb8458

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-25 18:45:19 +08:00
5d13bbe796 [BugFix]Modify eplb feature guide. (#3183)
### What this PR does / why we need it?
Revise the EPLB feature guide content. Add eplb params to ascend config.
### Does this PR introduce any user-facing change?
### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
52d0cb8458

Co-authored-by: offline0806 <3337230449@qq.com>
2025-09-25 17:01:51 +08:00
07f4710216 [BugFix] Fix dummy_run memory explosion in eager mode (#3132)
### What this PR does / why we need it?

It is a quick bugfix for the memory explosion issue that requires
further refactoring.
The dummy_run in eager mode may lead to OOM and the reason is that
`hidden_states` were not released in time.
The PR temporarily resolves the issue by manually clearing the cache,
and further refactoring will be conducted subsequently.
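
A hedged sketch of the quick fix, assuming `torch.npu.empty_cache()` mirrors `torch.cuda.empty_cache()` via torch_npu (names are illustrative):
```
import gc

import torch
import torch_npu  # registers the torch.npu backend


def dummy_run(model: torch.nn.Module, dummy_input: torch.Tensor) -> None:
    with torch.no_grad():
        hidden_states = model(dummy_input)
    del hidden_states         # release the activations promptly in eager mode
    gc.collect()
    torch.npu.empty_cache()   # return freed blocks to the device allocator
```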

Before the modification, the dummy_run's memory showed an accumulation
issue.
<img width="1796" height="207" alt="image"
src="https://github.com/user-attachments/assets/05e2b04c-2f99-4085-9eda-c78b7d9a57b0"
/>

After modification, it can be observed that the memory is released
promptly.
And it was verified that the model responded normally after a single
data input.


- vLLM version: v0.10.2
- vLLM main:
b1068903fd

---------

Signed-off-by: chenmenglong <chenmenglong1@huawei.com>
2025-09-25 16:09:44 +08:00
72f64c10b7 [bugFix] Correct the vllm interface e2e test Base container image name (#3179)
### What this PR does / why we need it?
Correct the vllm interface e2e test Base container image name

### Does this PR introduce _any_ user-facing change?
NA

### How was this patch tested?
Tests in vllm ci pipeline
- vLLM version: v0.10.2
- vLLM main:
52d0cb8458

Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-09-25 16:03:09 +08:00
2a9d02e080 [Bugfix] eagle and eagle3 spec decode failures and enable e2e test (#2979)
### What this PR does / why we need it?
- Fix the bug https://github.com/vllm-project/vllm-ascend/issues/2978
- Enable e2e test,
- Adapt to scenarios where Speculative tokens are greater than 2,
- Fix the bug that causes Eagle3 inference failures under high
concurrency and improve the acceptance rate of draft models, by
https://github.com/vllm-project/vllm-ascend/pull/2794

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
CI passed with new added/existing test.

Co-authored-by: hukongyi
[hukongyi@cmbchina.com](mailto:hukongyi@cmbchina.com)
Co-authored-by: guanyuzhu
[zhuguanyu@huawei.com](mailto:zhuguanyu@huawei.com)
Co-authored-by: liumail680
[liumail680@163.com](mailto:liumail680@163.com)


- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-09-25 14:39:12 +08:00
ac1c2cd9ac [CI] Upgrade vllm version - 0925 (#3167)
Upgrade vLLM to newest commit.

1. Remove the useless func get_state_cls, it has been removed from vLLM
already.
e6750d0b18
2. Fix ut broken by
6160ba4151


- vLLM version: v0.10.2
- vLLM main:
b1068903fd

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-25 14:20:10 +08:00
33c118c80e [core]vllm-ascend support msMonitor tool (#3123)
### What this PR does / why we need it?
vllm-ascend supports the
[msMonitor](https://gitcode.com/Ascend/mstt/tree/master/msmonitor) tool to
collect performance data for vllm-ascend.

### Does this PR introduce _any_ user-facing change?
1. Add the env MSMONITOR_USE_DAEMON;
2. Users can enable the msMonitor tool by setting MSMONITOR_USE_DAEMON=1
before running a vllm-ascend model (a usage sketch follows below);
3. MSMONITOR_USE_DAEMON and VLLM_TORCH_PROFILER_DIR cannot both be set.

### How was this patch tested?
1. Run a vllm-ascend model without MSMONITOR_USE_DAEMON=1 (or with
MSMONITOR_USE_DAEMON=0); the model runs successfully.
2. Run a vllm-ascend model with MSMONITOR_USE_DAEMON=1 and run the msMonitor
tool to collect profile data.
3. Run a vllm-ascend model with both MSMONITOR_USE_DAEMON=1 and
VLLM_TORCH_PROFILER_DIR set; an error is raised.
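
A minimal usage sketch of the switch described above (the model name is illustrative):
```
import os

# enable msMonitor daemon-mode collection; it must not be combined with the torch profiler
os.environ["MSMONITOR_USE_DAEMON"] = "1"
os.environ.pop("VLLM_TORCH_PROFILER_DIR", None)

from vllm import LLM  # noqa: E402

llm = LLM(model="Qwen/Qwen3-8B")  # illustrative model
print(llm.generate(["The future of AI is"])[0].outputs[0].text)
```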

- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

Signed-off-by: mei-feiyao <1332490378@qq.com>
2025-09-25 14:15:02 +08:00
c814b32b90 [Quant][GLM] Adapt glm quant. (#3147)
adapt glm quant
- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-09-25 11:13:29 +08:00
a055183821 [CI] Upgrade vLLM version (#3139)
Upgrade vLLM version to the newest commit.
- Fix the breaking change introduced by
969b4da3a6
- Add a patch to quickly fix torchair
de94289a98
- Fix the UT error introduced by
de94289a98

Close: https://github.com/vllm-project/vllm-ascend/issues/3138


- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
2025-09-25 07:36:51 +08:00
464270e4ca Remove useless PD check in deepseek (#3161)
### What this PR does / why we need it?
Remove useless PD check in deepseek

### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

Signed-off-by: wangxiaoteng <wangxiaoteng@huawei.com>
Co-authored-by: wangxiaoteng <wangxiaoteng@huawei.com>
2025-09-24 23:25:47 +08:00
4ee58e213b [BugFix] explicitly setting the tensor shape of otp output (#3027)
When MTP and oprojTP are enabled, it triggers the recompilation of the
torchair graph, leading to a decrease in performance, and this PR fixes
this issue.

- vLLM version: v0.10.2
- vLLM main:
486c5599e3

---------

Signed-off-by: zzhx1 <zzh_201018@outlook.com>
2025-09-24 18:44:15 +08:00
360a736dfa Add OOT platform E2E test case to be run in the vllm buildkite pipeline (#3154)
### What this PR does / why we need it?
Add OOT platform E2E test case to be run in the vllm buildkite pipeline.
Note: added test case is not run in vllm-ascend CI.

### Does this PR introduce _any_ user-facing change?
NA

- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-09-24 17:55:58 +08:00
cd1ffbb6cd [1/N][Feat] Cut down memory usage for o_proj in DeepSeek (#2931)
### What this PR does / why we need it?
To cut down the memory usage of large weight matrices, we often rely on
various linear operations:
- `ReplicatedLinear`: Stores the entire matrix, consuming excessive
memory.
- `RowParallelLinear`: Requires an `all_reduce` to merge the results,
introducing additional communication overhead and potential accuracy
loss. Each token is handled across multiple devices rather than a single
device, which is undesirable in the SP scenario.
- ...

Furthermore, in multi-way Data Parallelism (DP) configurations, layers
typically store redundant weight copies.

This PR introduces a shared-weight plugin for layers inheriting from
`LinearBase`. It offers the following advantages:
- It evenly distributes a set of layers with identical structures across
devices. Each layer retains its complete weights, eliminating redundant
memory usage.
- It supports asynchronous broadcasting to prefetch weights for upcoming
layers (see the sketch after this list).
- It preserves the custom `process_weights_after_loading()` method to
make keeping NZ format possible.
- It is compatible with any linear class that inherits from
`LinearBase`, thereby preserving all the features of the original linear
implementation.
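
A minimal sketch of the layer-distribution and prefetch idea above (hypothetical helpers; a real implementation hooks into `LinearBase` and assumes an initialized process group):
```
import torch
import torch.distributed as dist


def owner_rank(layer_idx: int, world_size: int) -> int:
    # each layer keeps its complete weight on exactly one rank (no per-rank copies)
    return layer_idx % world_size


def prefetch_layer_weight(weight: torch.Tensor, layer_idx: int):
    # non-owners receive the full weight asynchronously while the previous layer runs
    src = owner_rank(layer_idx, dist.get_world_size())
    handle = dist.broadcast(weight, src=src, async_op=True)
    return handle  # call handle.wait() right before this layer's matmul
```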

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
vLLM main:
f4a948f33f

- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

---------

Signed-off-by: clrs97 <524936896@qq.com>
Co-authored-by: CalvinXKY <kyxiezju@163.com>
2025-09-24 17:16:41 +08:00
302494c1fe [EPLB] ut for EPLB (#3035)
## UT for EPLB

Co-authored-by: Skywalker-EP 173723846@qq.com
Co-authored-by: offline0806@qq.com
Co-authored-by: dsxsteven@sina.com

## UT Description

### 1. Module Description
- Module: EPLB

### 2. Covered Source Files
- vllm_ascend/eplb/adaptor/abstract_adaptor.py
- vllm_ascend/eplb/core/eplb_device_transfer_loader.py
- vllm_ascend/eplb/core/eplb_utils.py
- vllm_ascend/eplb/core/policy/policy_abstract.py
- vllm_ascend/eplb/core/policy/policy_dynamic_ep.py
- vllm_ascend/eplb/core/policy/policy_dynamic_ep_v2.py
- vllm_ascend/eplb/core/policy/policy_factory.py

### 3. Testing Method
- Framework: pytest
- Test Data: mock data
- Test Type: unit test

### 4. Coverage
- Statement Coverage: 90%


- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

---------

Signed-off-by: tanqingshan (A)  <50050625@china.huawei.com>
Signed-off-by: tanqingshan <50050625@china.huawei.com>
Signed-off-by: daishixun <dsxsteven@sina.com>
Co-authored-by: tanqingshan (A) <t50050625@china.huawei.com>
Co-authored-by: tanqingshan <50050625@china.huawei.com>
Co-authored-by: daishixun <dsxsteven@sina.com>
Co-authored-by: dsxsteven <36877507+dsxsteven@users.noreply.github.com>
2025-09-24 17:14:38 +08:00
80524f5711 [CORE] concurrent partial prefills (#2372)
### What this PR does / why we need it?

When processing a mix of large and small requests, the TTFT of responses
is significantly reduced. Please refer to
https://github.com/vllm-project/vllm/pull/10235, which achieves the same
effect by simply limiting the number of prompt fills for long requests.
This solution can be applied to both AscendScheduler (V0) and the vLLM
Scheduler (V1). Tests show that TTFT can be significantly improved when
handling such mixed requests. However, this capability is currently
missing when Ascend Scheduler is enabled.

This benchmark used the Qwen3-8B model, with a context length of 128K,
running on a single card.

Regarding dataset selection, the sharegpt_clean dataset is used, with
its content concatenated and cropped. Small requests with token=50 and
medium requests with token=10240 were constructed (there were also large
requests with token=102400, but these were ignored because when using
the Prefill First scheduling strategy, max_num_batched_tokens will not
be set to such a large value). When loading vLLM, set
max_num_batched_tokens=22000. This length can accommodate two
medium-sized requests and some short requests, reflecting an extreme
scenario where the budget is almost entirely occupied by longer
requests.

Next, we mix 990 small requests and 100 medium requests into one type of
load scenario (hereinafter referred to as 10%), and similarly generate
load scenarios with 5% medium requests and 1% load scenarios.

Performance tests were conducted separately for enabling vLLMScheduler,
AscendScheduler, and AscendScheduler (long prompt concurrency set to 1).

- vLLM version: v0.10.2
- vLLM main:
1dfea5f4a9

---------

Signed-off-by: Csrayz <jover@cmbchina.com>
2025-09-24 17:12:55 +08:00
2d885869c5 [KVCache][Bugfix] Fix kv cache initialization error of attention layer (#3113)
### What this PR does / why we need it?
Fixes #3096 
1. Fix the kv cache initialization error of the attention layer. Some
models have layer names like `attn.attn` instead of `self_attn`, but the
initialization of kv cache tensors only checks for `self_attn` and
`attn.attn`, leading to the error `AssertionError: Some layers are
not correctly initialized`.
2. Set a default value for the input arg `sampling_metadata` in
`compute_logits` for the modeling files in vllm-ascend, thus fixing the
error `Qwen3NextForCausalLM.compute_logits() missing 1 required
positional argument: 'sampling_metadata'`.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
test locally with internlm


- vLLM version: v0.10.2
- vLLM main:
5aeb925452

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-09-24 11:32:34 +08:00
6aa4253798 [Refactor] [SP]The sequence parallelism characteristics in the MoE and Dense models are integrated into a single solution. (#3085)
### What this PR does / why we need it?

There are two sets of sp implementations for moe and dense models: one
is called sequence_parallelism, and the other is flashcomm_v1.
We did the following things:

- Merge the two sets of code with the same implementation into one.
- Remove the implementation of sequence_parallelism, as this solution
cannot support aclgraph.

### Does this PR introduce any user-facing change?

No

### How was this patch tested?

e2e&ut

- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

---------

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
2025-09-24 11:29:59 +08:00
e7618d9414 [2/N][Refactor][Qwen3-Next] remove redundant methods and patch methods in Qwen3NextGatedDeltaNet (#3082)
### What this PR does / why we need it?
remove redundant methods and patch methods in Qwen3NextGatedDeltaNet,
involving causal_conv1d_fn, causal_conv1d_update_npu, fused_gdn_gating,
fused_recurrent_gated_delta_rule, torch_chunk_gated_delta_rule, and
RMSNormGated

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
```
from vllm import LLM, SamplingParams


def main():
    prompts = [
        "The future of AI is",
    ]

    # Create a sampling params object.
    sampling_params = SamplingParams(max_tokens=100, temperature=0.6, top_k=40, top_p=0.95)
    # Create an LLM.
    llm = LLM(
        model="Qwen/Qwen3-Next-80B-A3B-Instruct",
        tensor_parallel_size=4,
        enforce_eager=True,
        trust_remote_code=True,
        max_model_len=256,
        gpu_memory_utilization=0.7,
        block_size=64,
    )
    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    main()
```

CI passed with new added/existing test.


- vLLM version: v0.10.2
- vLLM main:
5aeb925452

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-09-24 11:25:42 +08:00
eb205d9f35 [P/D][BugFix]Mooncake timeout release bug fix (#2899)
### What this PR does / why we need it?
In the P node timeout release mechanism during PD separation, the req_id
that requires timeout release is transmitted from the scheduler to the
worker. If the KV cache between PDs is transferred too quickly, the P
node's req_id may be released twice. The first release is when the D
node notifies the P node that the KV cache has been pulled, and the
second release is when the scheduler transmits the timeout release to
the worker.

To address this bug, an intermediate component is introduced to manage
the release of req_ids.

pull_kv and forward2 may occur in either order; the previous timeout
logic assumed that forward2 always happened before pull_kv.


### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

---------

Signed-off-by: baxingpiaochong <771405853@qq.com>
2025-09-24 11:22:46 +08:00
6995a7bc5b [Disagg][Perf] Use NPU event sync instead of blocking tolist to avoid unintentional copy ops blocking across different NPU streams, improving disagg TTIT/TTFT (#2788)
### What this PR does / why we need it?
When we copy the sampled valid token ids from device to host, avoid
using tolist, which would trigger a device-wide stream sync if the source
is on device. We change it to use a non-blocking copy followed by an
explicit NPU event sync.
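
A hedged sketch of the pattern, assuming `torch.npu.Event` is provided by torch_npu; the buffer handling in vllm-ascend may differ:
```
import torch
import torch_npu  # provides the torch.npu namespace on Ascend


def sampled_ids_to_list(sampled_ids_npu: torch.Tensor) -> list:
    host_buf = torch.empty(sampled_ids_npu.shape, dtype=sampled_ids_npu.dtype,
                           device="cpu", pin_memory=True)
    host_buf.copy_(sampled_ids_npu, non_blocking=True)  # async D2H copy on the current stream
    evt = torch.npu.Event()
    evt.record()        # recorded after the copy is enqueued
    evt.synchronize()   # wait only for this event, not a device-wide sync
    return host_buf.tolist()
```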

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
Bring up vLLM server
```bash
VLLM_USE_V1=1 vllm serve Qwen/Qwen2.5-14B-Instruct --disable-log-requests -tp 8 --max-num-seqs 64 --no-enable-prefix-caching --max_num_batched_tokens=8000
```
## Before:

![76218085a0cde9b2a73214e35fb7fc08](https://github.com/user-attachments/assets/38cbd02d-d380-47f8-a111-4bd859102eb1)
## After

![6c2111136673332244d3ce11060f4048](https://github.com/user-attachments/assets/957f9bf1-ec50-4f49-9318-f4876b3e3691)

As shown in the figure, the TTFT decreased


- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

---------

Signed-off-by: jesse <szxfml@gmail.com>
2025-09-24 11:21:58 +08:00
c4b976af1a [Model][VLM][Patch]Modify ascend affinity _merge_multimodal_embeddings (#3071)
### What this PR does / why we need it?

This PR aims to address the incompatibility of the `.masked_scatter_`
operation in the current `_merge_multimodal_embeddings` function on
Ascend. For now, it reverts to the previous version of the CPU
operation, which can be executed asynchronously on the device side to
enhance performance.

- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

---------

Signed-off-by: booker123456 <945658361@qq.com>
2025-09-24 10:25:28 +08:00
b1380f3b87 [Doc] modify the version compatibility between vllm and vllm-ascend (#3130)
### What this PR does / why we need it?
Modify the version compatibility between vllm and vllm-ascend: the main
branch of vllm-ascend corresponds to the v0.10.2 tag of vllm.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
f225ea7dd9

Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
2025-09-23 20:31:49 +08:00
d01fd1d1c3 [misc][torchair] fix bugs around deepseek mtp, enable_shared_expert_dp and use_cached_kv_cache_bytes (#3074)
### What this PR does / why we need it?
This miscellaneous PR contains several small fixes:
1) fix initialization and forward bugs of DeepseekMTPLayer with
`shared_expert_dp` enabled.
2) fix a tensor shape mismatch after o_proj caused by a work-around
change in NPUModelRunner.
3) avoid an unnecessary decline of kv_cache memory (default: 64MB) with
`use_cached_kv_cache_bytes` disabled.
4) fall back `fused_moe_state` from `MC2` to `All2All` since the padding
logic of `mc2_mask` is incompatible with the input hidden_states when
`shared_expert_dp` is enabled.

Once this PR is merged, users can launch disaggregated_prefill
deployments (large_ep) with `deepseek_mtp` and `shared_expert_dp` as
`v0.9.1-dev` branch. The remaining problem of kv_cache tokens decline
compared to `v0.9.1-dev` will be resolved by
https://github.com/vllm-project/vllm-ascend/pull/3073.
 
### Does this PR introduce _any_ user-facing change?

No.
### How was this patch tested?
E2E vllm serving about deepseek_mtp with torchair graph mode and
`enable_shared_expert_dp` with eager mode. Large ep deployments are also
tested with this PR.


- vLLM version: v0.10.2
- vLLM main:
5aeb925452

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-09-23 14:52:42 +08:00
0f3939e5a9 [Feature]cpu offload connector (#1659)
This PR implements cpu offload connector to enable NPU kv cache offload
to host DRAM.

- vLLM version: v0.10.2
- vLLM main:
5aeb925452

Signed-off-by: lidenghui <lidenghui1110@gmail.com>
Signed-off-by: AlvisGong <gwly0401@163.com>
Signed-off-by: CalvinXKY <kyxiezju@163.com>
Co-authored-by: AlvisGong <gwly0401@163.com>
2025-09-23 14:25:05 +08:00
96eb1ed408 [CI] Bump vLLM commit hash to 0923(f225ea7) (#3110)
### What this PR does / why we need it?
Bump vLLM commit hash to
f225ea7dd9
### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
5aeb925452

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-09-23 14:13:25 +08:00
d586255678 fix wrong --num-gpus parameter requirements, and avoid ambiguity (#3116)
fix the problem of
https://github.com/vllm-project/vllm-ascend/issues/3114
- vLLM version: v0.10.2
- vLLM main:
5aeb925452

Signed-off-by: Jianwei Mao <maojianwei2012@126.com>
2025-09-23 11:58:44 +08:00
39a85c49fa [Refactor] Rename cudagraph_support to aclgraph_support (#3104)
### What this PR does / why we need it?
Updates the `cudagraph_support` attribute to `aclgraph_support` to use
terminology appropriate for the Ascend platform (ACL graphs instead of
CUDA graphs).

This change also explicitly disables graph support for the MLA attention
backend.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
None needed.

- vLLM version: v0.10.2
- vLLM main:
5aeb925452

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-09-23 11:30:31 +08:00
d2399ab97b Fix VLLM_ASCEND_LLMDD_RPC_PORT renaming (#3108)
### What this PR does / why we need it?
This PR implements the renaming of the environment variable
VLLM_LLMDD_RPC_PORT to VLLM_ASCEND_LLMDD_RPC_PORT, as proposed and
tracked in
[#2450](https://github.com/vllm-project/vllm-ascend/pull/2450). The
renaming is intended to align the variable naming convention with other
Ascend-specific environment variables in the vllm-ascend codebase,
enhancing consistency and clarity for developers and users working with
Ascend-based deployments.

### Does this PR introduce _any_ user-facing change?
NA

### How was this patch tested?
CI passed with existing test.

- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

Signed-off-by: wyu0-0 <woshilynn@163.com>
2025-09-23 10:33:04 +08:00
29c173ab48 FlashLB algorithm (#3042)
## Purpose
This Pull Request enhances the EPLB (Expert Parallelism Load Balancing)
system by introducing a novel balancing algorithm: FlashLB.

## Motivation
1. The default algorithm adopts a two-stage greedy strategy: 
a. Replica allotment: Determine the number of expert replicas by
minimizing the maximum load per replica (Min Max Replica, MMR).
b. Replica placement: Distribute replicas across devices by repeatedly
assigning the heaviest replica to the least loaded device (Longest
Processing Time First, LPT); a sketch of this stage appears after this list.

However, this sequential process lacks inter-stage collaborative
optimization, often leading to suboptimal load balancing. For example,
in the simple case shown in the figure below: given 8 logical experts
with hotness values of 600, 560, 120, 120, 20, 10, 10, 10, and 2
replicas allocated per device across 8 devices, the EPLB algorithm
yields a maximum per-device hotness of 232, while our proposed FlashLB
algorithm can reduce this value to 205.

2. The default algorithm relies on the averaged expert hotness over a
fixed time window for optimization. While this provides a coarse
approximation of the hotness distribution, it fails to capture
oscillatory deviations and temporal correlations of expert hotness
observed across iterations in real-world scenarios, limiting
optimization quality.

3. The default algorithm periodically regenerates the expert placement
table. However, it generates the table for each individual layer, and
the new table does not account for correlations with the previous one;
these two factors collectively lead to nearly full-scale expert
reassignment.
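
A minimal sketch of the LPT placement stage referenced in point 1 above:
```
import heapq


def lpt_place(replica_hotness: list, num_devices: int) -> list:
    devices = [(0.0, d) for d in range(num_devices)]  # (load, device_id) min-heap
    heapq.heapify(devices)
    placement = [[] for _ in range(num_devices)]
    # repeatedly assign the heaviest remaining replica to the least loaded device
    for idx in sorted(range(len(replica_hotness)),
                      key=lambda i: replica_hotness[i], reverse=True):
        load, dev = heapq.heappop(devices)
        placement[dev].append(idx)
        heapq.heappush(devices, (load + replica_hotness[idx], dev))
    return placement
```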

## FlashLB Algorithm Principle
1. Joint Optimization
FlashLB achieves joint optimization of replica allotment and placement
through group-based decision-making. Each group gradually determines the
replica count and placement for a subset of experts, ensuring that the
expected inter-device load balance (considering both deployed and
pending expert replicas) is holistically optimized. To attain superior
load balancing, FlashLB employs tree search to expand the solution space
while integrating pruning and precompilation techniques for
acceleration, thereby delivering load balancing that is both
high-quality and practically efficient.

2. Multi-Shot Enhancement
FlashLB partitions each profiling interval (e.g., 1024 iterations) into
consecutive smaller sub-intervals (e.g., 16 iterations), each capturing
independent hotness measurements. It then performs multi-shot
optimization to co-optimize these sub-intervals simultaneously—enabling
adaptation to time-variant expert hotness while enhancing robustness.

3. Incremental Adjustment
To reduce the overhead of frequent expert re-deployment, FlashLB
introduces an incremental adjustment scheme operating at both
inter-layer and intra-layer levels:
a. Inter-Layer: Hotness variations are tracked at the layer level. Only
layers with fluctuations exceeding a predefined threshold trigger
re-computation of expert placement, avoiding unnecessary redeployment
for stable layers;
b. Intra-Layer (Optional): A lightweight incremental LPT algorithm
(LPT-Incremental) is applied. Instead of recomputing full placement for
all experts in a layer, it selectively adjusts only the hottest experts
or those with replica count changes, further reducing migration
overhead.

This incremental strategy significantly reduces adjustment costs while
maintaining balanced performance across layers and devices.

## Co-author:

Co-authored-by: Skywalker-EP 173723846@qq.com

- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

---------

Signed-off-by: sdmyzlp <lrwei2@petalmail.com>
Signed-off-by: Che Ruan <cr623@ic.ac.uk>
Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com>
Signed-off-by: shen-shanshan <467638484@qq.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Signed-off-by: 22dimensions <waitingwind@foxmail.com>
Signed-off-by: zhanghaiwen <zhanghaiwen@cmss.chinamobile.com>
Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Icey <1790571317@qq.com>
Signed-off-by: linfeng-yuan <1102311262@qq.com>
Signed-off-by: dependabot[bot] <support@github.com>
Signed-off-by: tangtianyi <tangtianyi4@huawei.com>
Signed-off-by: Angazenn <supperccell@163.com>
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
Signed-off-by: rjg-lyh <1318825571@qq.com>
Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Signed-off-by: fems14 <1804143737@qq.com>
Co-authored-by: sdmyzlp <117554856+sdmyzlp@users.noreply.github.com>
Co-authored-by: Che Ruan <cr623@ic.ac.uk>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Shanshan Shen <467638484@qq.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: 22dimensions <waitingwind@foxmail.com>
Co-authored-by: zhanghw0354 <zhanghaiwencmss@139.com>
Co-authored-by: zhanghaiwen <zhanghaiwen@cmss.chinamobile.com>
Co-authored-by: zhangxinyuehfad <59153331+zhangxinyuehfad@users.noreply.github.com>
Co-authored-by: Lucas Kabela <lucasakabela@gmail.com>
Co-authored-by: Li Wang <wangli858794774@gmail.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: Icey <1790571317@qq.com>
Co-authored-by: linfeng-yuan <1102311262@qq.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: tianyitang <tangtianyi4@huawei.com>
Co-authored-by: Angazenn <supperccell@163.com>
Co-authored-by: Yizhou <136800916+yiz-liu@users.noreply.github.com>
Co-authored-by: rjg-lyh <83491835+rjg-lyh@users.noreply.github.com>
Co-authored-by: weichen <132029610+Pr0Wh1teGivee@users.noreply.github.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
Co-authored-by: fems14 <74094523+fems14@users.noreply.github.com>
2025-09-23 10:27:14 +08:00
8dd53c8860 [Bugfix][PD] Auto-clear producer KV cache if no pull notification (#2174)
### What this PR does / why we need it?

This PR addresses a critical issue where Node D (decode) failures cause
Node P (prefill) to hang due to inability to release KV cache.

**Trigger Scenarios:**  
1. Node D fails mid-inference (e.g., network disconnection)  
2. Node D rejects requests at a certain stage (e.g., via API server)  
3. Load-test script termination causes Node P or D to abort queued
requests

**Root Cause Analysis:**  
1. Currently, Node D sends a "KV cache pull complete, release approved"
message to Node P
2. This message is transmitted via the worker connector. If PD
connection breaks or requests are rejected upstream, Node D cannot send
the message
3. Node P will never release KV cache without receiving this message  

**Solution:**  
Following VLLM community's approach (NIXL connector timeout mechanism),
we're implementing:
- A timeout mechanism with comprehensive warnings (a sketch follows this list)
- Updated README documentation  
- Reference: VLLM's optimization PR
[#20139](https://github.com/vllm-project/vllm/pull/20139)
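
A minimal sketch of the timeout-release idea from the list above (class and method names are hypothetical):
```
import time


class KVReleaseTracker:
    """Release producer-side KV blocks on pull confirmation or on timeout, never twice."""

    def __init__(self, timeout_s: float = 30.0) -> None:
        self.pending: dict = {}        # req_id -> deadline
        self.timeout_s = timeout_s

    def track(self, req_id: str) -> None:
        self.pending[req_id] = time.monotonic() + self.timeout_s

    def on_pull_done(self, req_id: str) -> None:
        self.pending.pop(req_id, None)  # normal path: consumer confirmed the pull

    def expired(self) -> list:
        now = time.monotonic()
        done = [r for r, t in self.pending.items() if t < now]
        for r in done:
            self.pending.pop(r, None)   # popping guards against a second release
        return done
```
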
### Does this PR introduce _any_ user-facing change?
None
### How was this patch tested?
None


- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

---------

Signed-off-by: underfituu <hzhucong@163.com>
2025-09-23 09:53:34 +08:00
704467cd9a [Bugfix][LoRA] Fix bug introduced by upstream vllm#25249 (#3095)
### What this PR does / why we need it?
Fix the impact to LoRA that
https://github.com/vllm-project/vllm/pull/25249 brought.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py

- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

---------

Signed-off-by: paulyu12 <507435917@qq.com>
2025-09-22 22:26:01 +08:00
3fa7cf6345 [Refactor][Graph] Move graph parameter logic to acl_graph module (#3101)
### What this PR does / why we need it?
This is the follow-up PR of #2128 .

Moves graph parameter management components, including `GraphParams`,
`get_graph_params`, and `set_graph_params`, from the generic `utils.py`
to the more specific `compilation/acl_graph.py`.

Additionally, extracts the `update_attn_params` logic from the
`NPUModelRunner` class into a standalone function within the `acl_graph`
module.

This refactoring improves code organization by centralizing ACL
graph-related logic into its own dedicated module, enhancing modularity
and clarity.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
None needed.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-09-22 22:23:14 +08:00
02f89d166f [CI] Update vllm version to 20250922(5aeb925) (#3091)
### What this PR does / why we need it?
This PR bumps the vllm commit hash to
5aeb925452
fix issues:
1. https://github.com/vllm-project/vllm/pull/25345 has removed v0
metadata
2. https://github.com/vllm-project/vllm/pull/25332
3. https://github.com/vllm-project/vllm/pull/25334
4. https://github.com/vllm-project/vllm/pull/23558; note that this vllm
commit updates the model register logic, which checks that all registered
models have the `vllm.model_executor.models` path, which breaks our
custom registration of the deepseek_v3 model (it doesn't exist in the
vllm model path). So I moved the deepseek_v3 model registry to deepseek_v2
as a temporary solution.

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-09-22 22:18:13 +08:00
1c9f0fe26f Fix of DeepSeek Error in KV Pool Mixed Deployment Scenario (#3087)
### What this PR does / why we need it?
A new kv_role "kv_both" is added to run mixed deployment scenarios. The
mixed deployment will involve a decode phase, where with_prefill should
be false.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
c60e6137f0

Signed-off-by: fems14 <1804143737@qq.com>
2025-09-22 20:36:41 +08:00
37a0715eda [Refactor] Adjustments to moe_comm_method selection process (#3001)
### What this PR does / why we need it?
Fix issues mentioned in
https://github.com/vllm-project/vllm-ascend/pull/2791 and some minor
refactoring.
1. Use Enum instead of string.
2. Avoid setting a new property to forward_context in
AscendFusedMoE.forward().
3. Enabling TokenDispatcherWithMoge.
4. Remove redundant code.

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?

Qwen3-30B-A3B/Qwen3-30B-A3B-W8A8/DeepSeek-V3-W4A8-Pruing/deepseek-mtp/pangu-pro-moe-pruing:
1. Enable/Disable EP
2. Aclgraph & eager


- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
2025-09-22 19:12:58 +08:00
bb1f0d5a62 [main] remove the redundant log prints in register_custom_ops.py (#3094)
### What this PR does / why we need it?
This PR removed the redundant log prints in register_custom_ops.py, in
order to make output clear.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-09-22 17:17:31 +08:00
338231acaf [Feat][Graph] Support FULL_DECODE_ONLY mode for GQA/MHA models (#2128)
Note: This depends on [vLLM
#25161](https://github.com/vllm-project/vllm/pull/25161) and the
torch\_npu release from September 30.

### What this PR does / why we need it?
This pull request adds `FULL_DECODE_ONLY` mode for GQA/MHA models (MLA
models like DeepSeek V3/R1 are not included). Key improvements include:

* **Reduced dispatch latency:** By replaying the entire model execution
graph at once, we cut overhead compared with multiple smaller replays.
* **Stabilized multi-device performance:** Capturing the whole model as
one static graph also mitigates the dispatch fluctuations across
devices.
* **Stream/resource savings:** Consolidating graph captures frees up
streams, allowing more graphs to be captured.

**Known issues:**

1. `_npu_paged_attention` currently manages its own workspace in
`torch_npu`, which can deadlock when synchronizing during graph replay —
we’re working on a fix.

There may be other corner cases. This PR is the first in a planned
series; we’ll continue to iterate and address remaining issues in
follow-ups.

This is essentially a port of #1503 and #1677, but includes two major
changes:

1. Let `graph_dispatcher` decide the graph mode instead of hard-coding
it in the backend, which decouples Full Graph and Piecewise Graph and
could make it possible to remove dynamo.
2. Adapt to the new `attn_group` logic, but leave a small hack in
`update_graph_params`; multi-attention models may or may not be fully
supported yet.

### Does this PR introduce _any_ user-facing change?
```python
compilation_config={
    "cudagraph_mode": "FULL_DECODE_ONLY",
},
```

### How was this patch tested?
Tests included.


- vLLM version: v0.10.2
- vLLM main:
9607d5eb44

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-09-22 17:14:28 +08:00
f39bd309b6 [Hybrid KV] Follow up UniformTypeKVCacheSpecs (#3070)
### What this PR does / why we need it?
Follow up the `UniformTypeKVCacheSpecs` changes introduced by
https://github.com/vllm-project/vllm/pull/25101, which support different
hidden sizes in uniform-type kvcache specs.

This also fixes the CI issue about `TypeError: AttentionGroup.__init__()
missing 1 required positional argument: 'kv_cache_spec'`

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
Tests passed with existing e2e tests.

- vLLM version: v0.10.2
- vLLM main:
c60e6137f0

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-09-22 15:02:41 +08:00
f1f2c8f5e5 [Perf] Add new npu_fused_infer_attention_score op to improve performance in splitfuse cases and resolve long-seq mask problems (#2962)
### What this PR does / why we need it?
Add the new npu_fused_infer_attention_score op to improve performance in
splitfuse cases and resolve long-seq mask problems.

1. The original op's performance is suboptimal in certain scenarios,
necessitating optimization through the _new op_
(npu_fused_infer_attention_score).
2. For ultra-long sequences (128k), the original operator will allocate
a large attn_mask, which consumes excessive CPU memory. In contrast, the
_new op_ supports a fixed-size compressed mask, effectively resolving
this issue.

NOTE1: The current PR retains the original logic and uses a version
check of the CANN package to determine whether the _new op_ can be
enabled. This ensures no impact on existing users. In future versions,
this version check and the original logic will be deprecated, and the
_new op_ scheduling will be uniformly adopted.
NOTE2: This pr relies on future CANN version, which is not available
now.
NOTE3: To enable the new op in chunked prefill, the parameter
additional_config should be set like `--additional-config
'{"ascend_scheduler_config":
{"enabled":true,"enable_chunked_prefill":true}}' \` at least.

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
CI passed




- vLLM version: v0.10.2
- vLLM main:
6c5f82e5aa

---------

Signed-off-by: tangtianyi <tangtianyi4@huawei.com>
Signed-off-by: Angazenn <supperccell@163.com>
Co-authored-by: Angazenn <supperccell@163.com>
2025-09-22 14:56:14 +08:00
c90a6d3658 [Test] Update the format of the accuracy report (#3081)
### What this PR does / why we need it?
Update the format of the accuracy report

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
c60e6137f0

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-09-22 14:10:03 +08:00
37a0b3f25e Bump actions/labeler from 5 to 6 (#3086)
Bumps [actions/labeler](https://github.com/actions/labeler) from 5 to 6.

- vLLM version: v0.10.2
- vLLM main:
c60e6137f0

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-22 14:07:37 +08:00
ffdd1a36e2 [bugfix][torchair] fix wasted NPU memory buffer allocation for quantized deepseek with unquantized MTP layer (#3068)
### What this PR does / why we need it?
While running quantized deepseek models with an unquantized MTP layer, free
NPU memory abnormally decreases by `2*HCCL_BUFFSIZE` bytes. This
results from a wasted VRAM buffer allocation caused by calling
`dist.all_to_all_single` without the correct device process group argument.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
We ran vllm online serving with quantized deepseek-r1 and an unquantized
MTP layer, and observed that free_memory increased without a redundant VRAM
buffer for the HCCL communication op (all_to_all_single).

- vLLM version: v0.10.2
- vLLM main:
6d8246aaff

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-09-22 14:06:43 +08:00
14b39d3c70 [1/N][Refactor][Qwen3-Next] remove redundant Qwen3NextSparseMoeBlock and Qwen3NextAttention (#3019)
### What this PR does / why we need it?
remove redundant Qwen3NextSparseMoeBlock and Qwen3NextAttention

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
```
from vllm import LLM, SamplingParams


def main():
    prompts = [
        "The future of AI is",
    ]

    sampling_params = SamplingParams(max_tokens=100, temperature=0.6, top_k=40, top_p=0.95)
    # Create an LLM.
    llm = LLM(
        # model="/root/.cache/modelscope/hub/models/Qwen/Qwen3-30B-A3B",
        model="Qwen/Qwen3-Next-80B-A3B-Instruct",
        tensor_parallel_size=4,
        enforce_eager=True,
        trust_remote_code=True,
        max_model_len=256,
        gpu_memory_utilization=0.7,
        block_size=64,
    )

    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


if __name__ == "__main__":
    main()
```

- vLLM version: v0.10.2
- vLLM main:
9d1c50a5ac

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-09-22 11:24:08 +08:00
88d24cce8b [CI] Enable main based lint check and light ci matrix (#3079)
### What this PR does / why we need it?
Follow-up on https://github.com/vllm-project/vllm-ascend/pull/3064:
1. limit the vllm version to the same hash for mypy
2. fix the vllm version bug for the e2e light test.
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
CI passed


- vLLM version: v0.10.2
- vLLM main:
c60e6137f0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-22 10:37:53 +08:00
693f547ccf Refactor ci to reuse base workflow and re-enable ut coverage (#3064)
### What this PR does / why we need it?
1. Refactor ci to reuse base workflow and enable main 2 hours trigger
job:
- Extract e2e test in to _e2e_test.yaml
- Reuse _e2e_test in light / full job
- Enable main 2 hours trigger job

2. Rename e2e test to ascend test to make sure action display label 
3. Re-enable ut coverage, which had been failing since
5bcb4c1528
and was disabled in
6d8bc38c7b

### Does this PR introduce _any_ user-facing change?
Only developer behavior changes:
- Every job trigger full test with vllm release and hash
- Run full job per 2 hours with vllm main
- e2e light test (30 mins): `lint` (6mins) ---> ut (10mins) --->
`v0.10.2 + main / 4 jobs` (15mins)
- e2e full test (1.5h): `ready label` ---> `v0.10.2 + main / 4 jobs`,
about 1.5h
- schedule test: 2hours ---> `v0.10.2 + main / 4 jobs`, about 1.5h 

### How was this patch tested?
CI passed


- vLLM version: v0.10.2
- vLLM main:
c60e6137f0

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-21 13:27:08 +08:00
b8b68b3dfe [CI] Upgrade vLLM to 20250920 (c60e613) and address config break (#3067)
### What this PR does / why we need it?
Bump main to
c60e6137f0

- Updated imports in `vllm.config` to
`vllm.config.model`(aed16879a9)
https://github.com/vllm-project/vllm/pull/25252

- Refactored `vllm_ascend/sample/sampler.py` to use string values for
`logprobs_mode` instead of the `LogprobsMode` enum, simplifying logprobs
mode handling and improving compatibility with recent vLLM changes
(aed16879a9)
https://github.com/vllm-project/vllm/pull/25252

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed


- vLLM version: v0.10.2
- vLLM main:
6d8246aaff

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-21 09:49:17 +08:00
12bcbd02bb [CI] Upgrade vLLM to 20250919 (6d8246aa) and fix some broken issue (#2907)
### What this PR does / why we need it?
1. This pr bump vllm commit to
6d8246aaff
2. fix the upstream change https://github.com/vllm-project/vllm/pull/24548
that aborts multi-modal kwargs, keeping both vllm main and `v0.10.2` adaptable
3. fix metadata_builder changes introduced by
https://github.com/vllm-project/vllm/pull/23693
4. fix `structured_outputs_config` changes introduced by
https://github.com/vllm-project/vllm/pull/22772
5. fix `moe_config` changes introduced by
https://github.com/vllm-project/vllm/pull/22537

Co-authored-by:  MengqingCao <cmq0113@163.com>
Co-authored-by:  Yikun Jiang <yikunkero@gmail.com>


- vLLM version: v0.10.2
- vLLM main:
c60e6137f0

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
2025-09-20 17:37:57 +08:00
53ecd89e8f [Bugfix] Remove VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE (#2969)
### What this PR does / why we need it?
This PR prepares for deleting the environment variable
`VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE`, as vllm requires `fullgraph=True`
to run.

- Fixes https://github.com/vllm-project/vllm/issues/21834

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
See CI 

- vLLM version: v0.10.2
- vLLM main:
99cc41ad50

---------

Signed-off-by: Lucas Kabela <lucaskabela@meta.com>
2025-09-20 08:22:30 +08:00
e26fe1caf1 [TEST] Speed up DS V2 accuracy test and turn up accuracy baseline (#3047)
### What this PR does / why we need it?
1. update expected accuracy for DeepSeek-V2-Lite
2. add batch size 

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Accuracy CI passed

- vLLM version: v0.10.2
- vLLM main:
838d7116ba

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-09-20 00:40:33 +08:00
a22b532d38 [Fixbug] Fix shape not match when sliding_window and dynamic batch_size (#2830)
### What this PR does / why we need it?
Fix a shape mismatch when testing LLM-Research/Phi-4-mini-instruct accuracy.

### Does this PR introduce _any_ user-facing change?

Previously, users could not set a dynamic batch_size or use lm_eval to test
accuracy with models that use sliding_window.

### How was this patch tested?
accuracy of LLM-Research/Phi-4-mini-instruct is OK:
```
vllm (pretrained=LLM-Research/Phi-4-mini-instruct,max_model_len=4096,dtype=auto,tensor_parallel_size=1), gen_kwargs: (None), limit: None, num_fewshot: 5, batch_size: auto
|Tasks|Version|     Filter     |n-shot|  Metric   |   |Value |   |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |0.8105|±  |0.0108|
|     |       |strict-match    |     5|exact_match|↑  |0.8097|±  |0.0108|
```


- vLLM version: v0.10.2
- vLLM main:
3c96e7b8a1

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-09-19 22:35:14 +08:00
cf549b976d [Test]Add unit test for compilation/acl_graph.py (#3039)
### What this PR does / why we need it?
According to issue
[#1298](https://github.com/vllm-project/vllm-ascend/issues/1298), this pull
request adds unit test code for compilation/acl_graph.py.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: v0.10.2
- vLLM main:
f2718d2948

---------

Signed-off-by: zhanghaiwen <zhanghaiwen@cmss.chinamobile.com>
Co-authored-by: zhanghaiwen <zhanghaiwen@cmss.chinamobile.com>
2025-09-19 21:31:17 +08:00
0942d9aaab [3/N][Refactor][Quantization]remove packed_modules_mapping from models (#3021)
### What this PR does / why we need it?

Some custom models in vllm-ascend define packed_modules_mapping, which
prevents keeping the same model class as the vllm community. So move these
custom packed_modules_mapping entries to quant utils.py. After this PR, some
custom models can be removed.

### Does this PR introduce _any_ user-facing change?

tested by CI

### How was this patch tested?

tested by CI

- vLLM version: v0.10.2
- vLLM main:
5089fd749c

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-09-19 20:50:14 +08:00
4ba56716f9 Increase doctest timeout to 300s and time print (#3041)
### What this PR does / why we need it?
Increase the doctest timeout to 300s and print timing. According to the time print
in https://github.com/vllm-project/vllm-ascend/pull/3045 , most of the time is
consumed in `Graph capturing`, so it's fine to increase the doctest
timeout.

This PR also adds a time log for each task.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- Run `/vllm-workspace/vllm-ascend/tests/e2e/run_doctests.sh`
- CI passed

- vLLM version: v0.10.2
- vLLM main:
a684c0124c

Closes: https://github.com/vllm-project/vllm-ascend/issues/3045

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-19 20:26:00 +08:00
8326f15ecf [CustomOp] Register AscendSharedFusedMoE custom op (#2980)
### What this PR does / why we need it?
Register `AscendSharedFusedMoE` custom op.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

`DeepSeek-V2-Lite` is a MoE model with shared experts.

Test:

```bash
vllm serve /root/.cache/modelscope/hub/models/deepseek-ai/DeepSeek-V2-Lite \
--trust-remote-code \
--enforce-eager \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.95

curl -X POST http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "/root/.cache/modelscope/hub/models/deepseek-ai/DeepSeek-V2-Lite",
        "messages": [
            {"role": "user", "content": "介绍一下联通公司?"}
        ],
        "stream": false,
        "max_tokens": 100
    }'
```

Output:

```bash
中国联合网络通信集团有限公司(简称“中国联通”)于2009年1月6日在原中国网通和原中国联通的基础上合并组建而成,在国内31个省(自治区、直辖市)和境外多个国家和地区设有分支机构,是中国唯一一家在纽约、香港、上海三地同时上市的电信运营企业,连续多年入选“世界500强企业”。\n\n中国联通主要经营固定通信业务,移动通信业务,国内
```


- vLLM version: v0.10.2
- vLLM main:
486c5599e3

---------

Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com>
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-09-19 19:05:01 +08:00
05a700d370 [Bugfix] Fix async copy bug under single expert scenario (#3005)
Add a missing barrier when no implicit synchronization by `repeat_interleave`
is available. Otherwise, the `non_blocking=True` copy of `output_splits`
and `input_splits` from NPU may fail to complete before the later
`async_all_to_all` uses them.
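
A minimal sketch of the timing hazard (illustrative only, not the actual vllm-ascend code; the synchronize call stands in for whatever barrier the dispatcher uses):

```python
import torch

def fetch_splits(output_splits_dev: torch.Tensor, input_splits_dev: torch.Tensor):
    # Non-blocking device-to-host copies are only enqueued here; the host
    # tensors may still hold stale data when read on the CPU.
    output_splits = output_splits_dev.to("cpu", non_blocking=True)
    input_splits = input_splits_dev.to("cpu", non_blocking=True)
    # Without an implicit sync (e.g. from repeat_interleave), an explicit
    # barrier is needed before the values feed the all-to-all. On Ascend this
    # would be the NPU stream's synchronize; torch.cuda is shown for illustration.
    torch.cuda.current_stream().synchronize()
    return output_splits.tolist(), input_splits.tolist()
```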

### What this PR does / why we need it?

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
ef7eefe17a

Signed-off-by: sdmyzlp <lrwei2@petalmail.com>
2025-09-19 14:05:36 +08:00
2a87b4cecb [Bugfix] Fix specdecoding in chunkedprefill scenario (#3025)
### What this PR does / why we need it?

The speculative decode phase under chunked prefill took an incorrect
path; it should always use the TND layout for speculative decoding.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
6d8246aaff

Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-09-19 14:05:08 +08:00
833cd1b698 [BugFix] Async scheduling and PP compatibility with DP (#2796)
### What this PR does / why we need it?
Based on https://github.com/vllm-project/vllm/pull/23770,
this fixes async scheduling and PP compatibility with DP. It also fixes an issue
with finished requests not being processed in async scheduling and PP cases,
and possible worker race conditions.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
544fe76b95

---------

Signed-off-by: jesse <szxfml@gmail.com>
2025-09-19 11:29:50 +08:00
0a526768f5 [Feature] Support moe multi-stream for aclgraph. (#2946)
This PR puts the calculation of the shared experts into a separate stream,
overlapping it with the routed experts.

- vLLM version: v0.10.2
- vLLM main:
fbd6523ac0

---------

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-09-19 11:06:45 +08:00
0c04bf1e36 [Fixbug] Fix accuracy for DeepSeek-V2-Lite (#3016)
### What this PR does / why we need it?

Fix accuracy for DeepSeek-V2-Lite

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
CI passed

- vLLM version: v0.10.2
- vLLM main:
66072b36db

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-09-18 23:58:23 +08:00
367edff5af [HybridKV] Fix prefill disaggregation kvcache addr alignment & use hybrid kv cache only when running qwen3_next (#3007)
### What this PR does / why we need it?
This PR fixes a few issues in prefill disaggregation:
1. Fix the prefill disaggregation kvcache address alignment issue: llmdatadist
needs the addresses of tensors to be aligned to 2M.
2. Fix a prefill disaggregation kvcache shape error: llmdatadist requires
k/v tensors with shape [num_blocks, ...], however the implementation before
this PR used [2, num_blocks, ...], which breaks prefill disaggregation.
3. Use the hybrid kv cache only when running qwen3_next, to fix an accuracy
issue in prefill disaggregation.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
Tested locally by @liziyu179 

- vLLM version: v0.10.2
- vLLM main:
4f02b77de4

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-09-18 21:43:22 +08:00
acb46f303f Fix VocabParallelEmbedding UT (#2722)
### What this PR does / why we need it?
Fix VocabParallelEmbedding UT

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: main
- vLLM main:
f592b3174b

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-09-18 19:54:01 +08:00
01592515b8 [Bugfix] Fix sleep mode level 2 (#1376)
### What this PR does / why we need it?
For sleep mode level 2, we discard both model weights and kv_cache,
but the problem is: when we discard weights, we also discard some
tensors representing the model state, i.e. `model.named_buffers()`, such as
`running_mean` / `running_var` in BatchNorm, the rope cos-sin cache, etc.
When we update the weights but forget to update the buffers as well, this
will lead to unknown issues.
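
A minimal sketch of the idea (illustrative, not the actual sleep-mode code): snapshot and restore `named_buffers()` alongside `named_parameters()` so model state is not lost.

```python
import torch

def snapshot_state(model: torch.nn.Module):
    params = {n: p.detach().clone() for n, p in model.named_parameters()}
    buffers = {n: b.detach().clone() for n, b in model.named_buffers()}
    return params, buffers

def restore_state(model: torch.nn.Module, params, buffers):
    with torch.no_grad():
        for n, p in model.named_parameters():
            p.copy_(params[n])
        for n, b in model.named_buffers():
            b.copy_(buffers[n])  # the step that was previously forgotten
```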
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
5963b98b46

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-09-18 19:51:52 +08:00
f4e3d22432 Remove chunked_prefill_for_mla and fix ring_mla bug (#2781)
### What this PR does / why we need it?
Remove the chunked-prefill-for-MLA branch in MLA, and change the dtype of
prefill_mask to avoid an accuracy problem.
### Does this PR introduce _any_ user-facing change?
NO
### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
ef7eefe17a

---------

Signed-off-by: SunnyLee219 <3294305115@qq.com>
2025-09-18 19:43:26 +08:00
79a910ef47 [bugfix][torchair] fix multistream_moe problems in torchair graph mode (#2681)
This PR fixes two problems when `multistream_moe` is enabled in torchair
graph mode:
1. check `TorchairAscendW8A8DynamicFusedMoEMethod` instead of the incorrect
`AscendW8A8DynamicFusedMoEMethod`
2. mc2_mask should be chunked no matter whether `replace_allreduce` is True or
False in the forward function of `TorchairAscendFusedMoE`

- vLLM version: v0.10.2
- vLLM main:
0fb2551c23

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-09-18 17:35:04 +08:00
4267f5d55f [Doc] Add multi-node ray backend tutorial (#2376)
### What this PR does / why we need it?
Add multi-node ray backend tutorial for Qwen235B-A3B

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
f4cd80f944

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-09-18 15:30:18 +08:00
af2a886814 refactor linear (#2867)
### What this PR does / why we need it?
The current linear.py has the following issues:

- There is redundant conditional logic in the `comm_group` and `forward`
selection for classes such as `AscendMergedColumnParallelLinear`.

- Inconsistent comm_group selection logic exists among
`AscendMergedColumnParallelLinear`, `AscendColumnParallelLinear`, and
`AscendQKVParallelLinear`.

To address these two issues, this PR encapsulates `comm_group` and
`forward` into classes and extracts the class selection logic into
common functions. For future additions of custom communication groups or
forward methods, it will only be necessary to extend
`CustomColumnParallelOp` or `CustomRowParallelOp` and add new selection
logic, as sketched below.
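
A rough sketch of that pattern (only `CustomColumnParallelOp`/`CustomRowParallelOp` come from this PR; the other names are illustrative placeholders):

```python
class CustomColumnParallelOp:
    """Base op: subclasses choose their comm group and forward behaviour."""

    def __init__(self, layer):
        self.layer = layer

    def comm_group(self) -> str:
        return "tp"  # default tensor-parallel group

    def apply(self, x):
        return x  # placeholder forward


class MLPColumnParallelOp(CustomColumnParallelOp):
    def comm_group(self) -> str:
        return "mlp_tp"  # hypothetical dedicated group


def select_column_parallel_op(prefix: str, layer) -> CustomColumnParallelOp:
    # One place to extend when adding new comm groups or forward methods.
    if "mlp" in prefix:
        return MLPColumnParallelOp(layer)
    return CustomColumnParallelOp(layer)
```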

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
dd39baf717

---------

Signed-off-by: realliujiaxu <realliujiaxu@163.com>
Co-authored-by: weijinqian0 <weijinqian@huawei.com>
2025-09-18 14:09:19 +08:00
a7f8ed38ed [Bugfix]:replace npu_incre_flash_attention with npu_fused_infer_atten… (#2901)
### What this PR does / why we need it?
[Bugfix]: replace npu_incre_flash_attention with
npu_fused_infer_attention_score in order to support tiling updates

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
2b85697031

Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: p00465316 <panchao13@huawei.com>
2025-09-18 14:06:08 +08:00
6681dde902 [Feat][Graph] Support MTP for ACL Graph (#2932)
### What this PR does / why we need it?
This PR depends on the merge of #2707 and has adapted the aclgraph
functionality to support MTP.

### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
2b85697031

---------

Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-09-18 14:05:33 +08:00
cef43b524e [Feat] A Connector that supports Mooncake store (#2913)
### What this PR does / why we need it?
Added a new connector for Mooncake store integration to enable kvcache
reuse in scenarios with system prompts or multi-turn dialogues.

### How was this patch tested?


- vLLM version: v0.10.2
- vLLM main:
5963b98b46

---------

Signed-off-by: LCAIZJ <leichao139636@163.com>
Signed-off-by: fems14 <1804143737@qq.com>
Co-authored-by: fems14 <1804143737@qq.com>
Co-authored-by: Dreamerleader <2270923832@qq.com>
Co-authored-by: Pz1116 <zpbzpb123123@gmail.com>
Co-authored-by: lizy124 <1950471827@qq.com>
Co-authored-by: zouyida2052 <zouyida2002@gmail.com>
2025-09-18 14:04:45 +08:00
723d460894 [Bugfix] fix kv nz accuracy bug (#2988)
when `enable_kv_nz` is true, output of Deepseek R1 is invalid.

- vLLM version: v0.10.2
- vLLM main:
2b85697031

Signed-off-by: realliujiaxu <realliujiaxu@163.com>
2025-09-17 21:10:25 +08:00
8bcc0ccd57 [bugfix] fix shared expert dp with hybrid kvcache (#2964)
### What this PR does / why we need it?
https://github.com/vllm-project/vllm-ascend/pull/2849 moves the
implementation of `shared_expert_dp` to torchair deepseek_modeling.
However, the call to `set_forward_context` with `enforce_eager` and
`shared_expert_dp` falls back to the implementation in
model_runner_v1.py and sets the global attn_metadata as a dictionary. This
leads to a RuntimeError when attn_metadata is retrieved from the forward
context and used in torchair_deepseek_v2.py. This PR fixes the problem
by introducing a transformation of attn_metadata in this file.
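
A hypothetical sketch of that transformation (names are illustrative): when the global attn_metadata arrives as a per-layer dict, resolve it to the entry for the current layer before use.

```python
def resolve_attn_metadata(attn_metadata, layer_name: str):
    # model_runner_v1.py may store attn_metadata as {layer_name: metadata}.
    if isinstance(attn_metadata, dict):
        return attn_metadata.get(layer_name)
    return attn_metadata
```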

Note that current E2E testing lacks the case of deepseek with
`shared_expert_dp`. We need to add an ST with `shared_expert_dp` in
testing workflow.

### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
e2e vllm serving with `enable_shared_expert_dp: true` passed.

- vLLM version: v0.10.2
- vLLM main:
de3e53a75b

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-09-17 20:01:47 +08:00
1f6465c399 Add an option of enable frozen parameter (#2869)
### What this PR does / why we need it?
Add an option to enable frozen parameters.

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
68dbde5dbb

Signed-off-by: 1Fire4 <wangdingyi2@huawei.com>
2025-09-17 12:00:44 +08:00
76844eec78 Dynamic Expert Load Balance with Zero-like-overhead (#2956)
### Motivation
Currently, dynamic expert balancing stops the world. Asynchronous expert
load balancing would be better and avoids the following problems:

Host-bound latency:
There are many CPU operations during EPLB, such as the EPLB algorithm,
creating p2p ops, and log2phy expert conversion, which take a long CPU
time (~1s).
Communication latency: the transfer time is costly without NVLink. The
weight of an expert may be transferred to multiple new positions, i.e. N
send/recv operations per expert, resulting in long latency. We measured
that batch_isend_irecv costs more than 100ms to transmit 16 experts'
weights on an Ascend A2 server.

SwiftBalancer no longer stops the world; in our tests on NPU it costs
1~2ms per layer while improving decode latency by 5ms-8ms with ep_size =
64.
The following updates have been made:
1. expert distribution recording with lower cost.
2. async CPU computing for the EPLB algorithm and other Python operations.
3. a new EPLB algorithm with less expert rebalancing and almost the same
effect.
### Proposed Change
We will gradually migrate the EPLB logic to the VLLM community and
implement a generalized design. Relevant RFC:
https://github.com/vllm-project/vllm/issues/22246
The overall workflow involves:
<img width="801" height="302"
alt="474430541-23b06f58-23bc-44a3-a1be-00f268aeb15c"
src="https://github.com/user-attachments/assets/1d73a459-1b23-4b0a-812a-bf0a75debfed"
/>
1. Record the expert distribution during forward. We use expert_token_num
after dispatch instead of topk_ids, so we get a much smaller tensor
shape, reducing the cost of HBM recording and the add operator.
2. Do an all-gather of the expert distribution. All-gather is used instead of
all-reduce for lower traffic volume (see the sketch after this list).
3. Wake up the EPLB worker process with the expert distribution when
num_iterations comes. Run the EPLB algorithm in the EPLB worker.
4. Generate p2p send/recv ops and other operations such as log2phy, which
would otherwise cost long CPU time.
5. Launch ibatch_send_recv on an async stream before forward.
6. After forward, wait for the ibatch_send_recv to finish, then update the
expert map and expert weights.
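
A rough sketch of steps 1-2 above (illustrative only): record the per-expert token counts after dispatch and all-gather them so every rank sees the global distribution.

```python
import torch
import torch.distributed as dist

def gather_expert_distribution(expert_token_num: torch.Tensor) -> torch.Tensor:
    # expert_token_num: [num_local_experts], far smaller than topk_ids.
    world_size = dist.get_world_size()
    gathered = [torch.empty_like(expert_token_num) for _ in range(world_size)]
    dist.all_gather(gathered, expert_token_num)  # all-gather: less traffic than all-reduce
    return torch.stack(gathered)  # [world_size, num_local_experts]
```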
### Co-author
Co-authored-by: raindaywhu raindaywhu@raindaywhu@ 163.con
Co-authored-by: njuyuan yuanjl19@smail.nju.edu.cn
Co-authored-by: qmkakaxi wjh1594260677@qq.com
Co-authored-by: Skywalker-EP 173723846@qq.com


- vLLM version: v0.10.2
- vLLM main:
567939953b

---------

Signed-off-by: offline0806 <z00858301@china.huawei.com>
Co-authored-by: offline0806 <z00858301@china.huawei.com>
2025-09-17 10:36:43 +08:00
ae758dda05 [Bugfix] Fix mtp torchair in pd Disaggregation scenario (#2951)
### What this PR does / why we need it?
1. In memory of #2509, Fix mtp torchair in pd Disaggregation scenario
2. fix mla bug in SpecDecoding Scenario, since num_decodes !=
num_decode_tokens


### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.2
- vLLM main:
5206ab20ba

Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-09-17 09:07:58 +08:00
6b7117dbb7 [main] addrmsnorm + quant fusion optim in Dense Models (#2772)
### What this PR does / why we need it?
This PR fuses the addrmsnorm op and the w8a8 quant op to get better perf.
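
For reference, the unfused computation that such a fused kernel replaces looks roughly like this (a math sketch, not the Ascend kernel itself):

```python
import torch

def add_rmsnorm_quant_ref(x, residual, weight, scale, eps=1e-6):
    h = x + residual                                   # residual add
    rms = torch.rsqrt(h.pow(2).mean(-1, keepdim=True) + eps)
    y = h * rms * weight                               # rmsnorm
    q = torch.clamp(torch.round(y / scale), -128, 127).to(torch.int8)  # w8a8-style activation quant
    return q, h                                        # quantized activation and new residual
```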

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: v0.10.2
- vLLM main:
0faf3cc3e8

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-09-16 22:31:38 +08:00
88ca8a051c [Feat][Graph] Support DeepSeek with ACL Graph (#2707)
### What this PR does / why we need it?
In memory of #677 , a long overdue milestone. Now DeepSeek V3/R1 should
be OK with ACL Graph.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Working on it.

- vLLM version: v0.10.2
- vLLM main:
68dbde5dbb

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-09-16 17:50:17 +08:00
3e60aa5483 Bump actions/setup-python from 5.4.0 to 6.0.0 (#2926)
Bumps [actions/setup-python](https://github.com/actions/setup-python) from 5.4.0 to 6.0.0.

- vLLM version: v0.10.2
- vLLM main:
3f3313981c

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-16 14:15:10 +08:00
1c5900327b [refactor] refactor deepseek-related files (#2849)
### What this PR does / why we need it?
This PR deletes ~2K lines of code about deepseek modeling. It falls back
from the CustomDeepseekV2 modules to the original vllm implementations and
adapts to some modifications in vllm about deepseek and moe.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
E2E  vllm serving with torchair graph mode and eager mode.

- vLLM version: v0.10.2
- vLLM main:
759ef49b15

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
Co-authored-by: yiz-liu <136800916+yiz-liu@users.noreply.github.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-09-16 14:13:07 +08:00
18ca7861f6 [Main] [Refactor] Enable MoECommMethod in Eager Mode (#2791)
### What this PR does / why we need it?
1. Replace the prepare/finalize operations in fused_moe.py with
moe_comm_method.prepare()/finalize() (see the sketch after this list).
2. Replace unified_fused_experts with moe_comm_method.fused_experts() in
fused_moe.py/w8a8_dynamic.py/w4a8_dynamic.py.
3. Add calls to _select_moe_comm_method in spec-decode proposers.
4. Currently, w4a8_dynamic does not support gatherep; use all2allv
instead.
5. Remove redundant code.
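
A minimal sketch of the call pattern in items 1-2 (method names come from the list above; the bodies are placeholders, not the real implementation):

```python
class MoECommMethodSketch:
    def prepare(self, hidden_states, router_logits):
        # dispatch / permute tokens for the selected comm method
        return hidden_states, router_logits

    def fused_experts(self, hidden_states, w1, w2, topk_weights, topk_ids):
        # run the grouped expert computation on the dispatched tokens
        return hidden_states

    def finalize(self, hidden_states):
        # combine / un-permute tokens back to their original order
        return hidden_states
```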
### Does this PR introduce _any_ user-facing change?
AllgatherEP switch is disabled in aclgraph/eager mode, just follow the
rules in modelrunner_v1._select_moe_comm_method()
### How was this patch tested?
e2e & ut


- vLLM version: v0.10.2
- vLLM main:
7f6f2c1182

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
2025-09-16 11:06:00 +08:00
0aba644633 Update max_tokens and prompt in qwen3 online doc (#2945)
### What this PR does / why we need it?
Update max_tokens and prompt in qwen3 online doc
Before:
```
"'max_tokens' or 'max_completion_tokens' is too large: 4096. This model's maximum context length is 4096 tokens and your request has 18 input tokens (4096 > 4096 - 18). None"
```

After:
```
curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
  "model": "/root/.cache/modelscope/hub/models/Qwen-SGlang/Qwen3-Next-80B-A3B-Instruct",
  "messages": [
    {"role": "user", "content": "Who are you?"}
  ],
  "temperature": 0.6,
  "top_p": 0.95,
  "top_k": 20,
  "max_tokens": 32
}'
.{"id":"chatcmpl-8ddbd65c9ddc405397219a6792feb9a0","object":"chat.completion","created":1757985049,"model":"/root/.cache/modelscope/hub/models/Qwen-SGlang/Qwen3-Next-80B-A3B-Instruct","choices":[{"index":0,"message":{"role":"assistant","content":"Hello! I am Qwen, a large-scale language model independently developed by the Tongyi Lab under Alibaba Group. I am designed to assist you in generating various","refusal":null,"annotations":null,"audio":null,"function_call":null,"tool_calls":[],"reasoning_content":null},"logprobs":null,"finish_reason":"length","stop_reason":null,"token_ids":null}],"service_tier":null,"system_fingerprint":null,"usage":{"prompt_tokens":12,"total_tokens":44,"completion_tokens":32,"prompt_tokens_details":null},"prompt_logprobs":null,"prompt_token_ids":null,"kv_transfer_params":null}
```

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- Manually test on my local env
- CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-16 09:27:50 +08:00
048bfd5553 [Release] Add release note for v0.10.2rc1 (#2921)
Add release note for v0.10.2rc1

- vLLM version: v0.10.2
- vLLM main:
b834b4cbf1

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-16 01:20:05 +08:00
c556038ef0 [New model] Qwen3-next support (#2917)
### What this PR does / why we need it?
Add Qwen3-next support.

### Does this PR introduce _any_ user-facing change?
Yes, users can use Qwen3 next.
Related doc: https://github.com/vllm-project/vllm-ascend/pull/2916 the
tutorial will be ready in
[here](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_npu_qwen3_next.html)

### How was this patch tested?
Doc CI passed

Related: https://github.com/vllm-project/vllm-ascend/issues/2884

Co-Authored-By: Angazenn <supperccell@163.com>
Co-Authored-By: zzzzwwjj <1183291235@qq.com>
Co-Authored-By: MengqingCao <cmq0113@163.com>
Co-Authored-By: linfeng-yuan <1102311262@qq.com>
Co-Authored-By: hust17yixuan <303660421@qq.com>
Co-Authored-By: SunnyLee219 <3294305115@qq.com>
Co-Authored-By: maoxx241 <maoxx241@umn.edu>


- vLLM version: v0.10.2
- vLLM main:
b834b4cbf1

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Angazenn <supperccell@163.com>
Signed-off-by: Your Name <you@example.com>
Signed-off-by: zzzzwwjj <1183291235@qq.com>
Signed-off-by: linfeng-yuan <1102311262@qq.com>
Signed-off-by: hust17yixuan <303660421@qq.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
Co-authored-by: Angazenn <supperccell@163.com>
Co-authored-by: Your Name <you@example.com>
Co-authored-by: zzzzwwjj <1183291235@qq.com>
Co-authored-by: linfeng-yuan <1102311262@qq.com>
Co-authored-by: hust17yixuan <303660421@qq.com>
2025-09-16 01:17:42 +08:00
b5ccef6115 [Doc] Add doc for Qwen3 Next (#2916)
### What this PR does / why we need it?
Add doc for Qwen3 Next

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Doc CI passed

Related: https://github.com/vllm-project/vllm-ascend/issues/2884


- vLLM version: v0.10.2
- vLLM main:
01413e0cf5

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-16 01:16:06 +08:00
aa3c4563ce fix all cards super_pod_id same on A3 & proxy support min_tokens (#2939)
### What this PR does / why we need it?
Fix all cards having the same super_pod_id on A3, and add proxy support for min_tokens.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
2*A3 gen ranktable
before:
```
"prefill_device_list": [
        {
            "server_id": "xxx",
            "device_id": "0",
            "device_ip": "xxx",
            "super_pod_id": "0",
            "super_device_id": "106758159",
            "cluster_id": "1"
        },
        {
            "server_id": "xxx",
            "device_id": "1",
            "device_ip": "xxx",
            "super_pod_id": "0",
            "super_device_id": "106758159",
            "cluster_id": "2"
        }...
```
after:
```
"prefill_device_list": [
        {
            "server_id": "xxx",
            "device_id": "0",
            "device_ip": "xxx",
            "super_pod_id": "0",
            "super_device_id": "104857600",
            "cluster_id": "1"
        },
        {
            "server_id": "xxx",
            "device_id": "1",
            "device_ip": "xxx",
            "super_pod_id": "0",
            "super_device_id": "104923137",
            "cluster_id": "2"
        }...
```

---------

Signed-off-by: liziyu <liziyu16@huawei.com>
2025-09-16 01:09:18 +08:00
382c29f3e1 [BugFix] Fix world size bug in model_runner (#2915)
- Fix world size bug in model_runner to make sure ep>16 runs with MC2 
- enable e2e test for vl

Co-Authored-By: whx-sjtu <2952154980@qq.com>
Co-Authored-By: Icey <1790571317@qq.com>
- vLLM version: v0.10.2
- vLLM main:
3e903b6cb4

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-14 12:20:25 +08:00
c5a502fd2e main add ascend scheduler support multimodal (#2844)
### What this PR does / why we need it?
On main, AscendScheduler does not support multimodal models, because it
lacks scheduled_encoder_inputs, which is needed for multimodal inference.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?
vLLM version: main@93e28e6862669e3b5cf47cea9f782a65ec47e155

- vLLM version: v0.10.2rc2
- vLLM main:
15b8fef453

---------

Signed-off-by: fan2956 <zhoufan53@huawei.com>
Co-authored-by: zhoufan2956 <zhoufan2956@163.com>
2025-09-14 09:38:51 +08:00
0747a6e68c Bump vLLM version to v0.10.2 (#2914)
### What this PR does / why we need it?
Bump vLLM version to v0.10.2

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed
- vLLM version: v0.10.2rc3
- vLLM main:
15b8fef453

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-14 06:57:59 +08:00
f97a64ba7f Bump vLLM version to v0.10.2rc3 (#2911)
### What this PR does / why we need it?
Bump vLLM version to v0.10.2rc3
https://github.com/vllm-project/vllm/compare/v0.10.2rc2...v0.10.2rc3

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

- vLLM version: v0.10.2rc2
- vLLM main:
15b8fef453

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-13 19:15:48 +08:00
8ece6956e7 Revert "Upgrade CANN version to 8.3.rc1.alpha001 (#2903)" (#2909)
### What this PR does / why we need it?
This reverts commit 339fceb89c1571eb8751157bf2d19a9b8479aed0.

### Does this PR introduce _any_ user-facing change?
Yes, use 8.2rc1 image by default

### How was this patch tested?
CI passed

- vLLM version: v0.10.2rc2
- vLLM main:
cfa3234a5b

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-13 16:21:54 +08:00
0a27705917 fix mooncake connector adxl hostname usage (#2824)
### What this PR does / why we need it?
This PR is used to adapt the hostname format for Mooncake when using
adxl. When Mooncake uses adxl, it is necessary to set
```USE_ASCEND_DIRECT``` to True in the file
```/Mooncake/mooncake-common/common.cmake``` during compilation. The
mooncake_connector obtains this config by calling
```vllm_config.kv_transfer_config.get_from_extra_config```, determines
whether Mooncake is using adxl, and selects the corresponding hostname
format.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
By CI.


- vLLM version: main
- vLLM main:
d21a36f5f9

---------

Signed-off-by: nwpu-zxr <zhouxuerong2@huawei.com>
2025-09-13 14:38:48 +08:00
d2250c80b5 Enable push trigger for image job (#2906)
### What this PR does / why we need it?
Enable push trigger for image job

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Followup on https://github.com/vllm-project/vllm-ascend/pull/2864
- vLLM version: v0.10.2rc2
- vLLM main:
89e08d6d18

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-13 12:31:36 +08:00
339fceb89c Upgrade CANN version to 8.3.rc1.alpha001 (#2903)
### What this PR does / why we need it?
Upgrade CANN version to 8.3.rc1.alpha001

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?


- vLLM version: v0.10.2rc2
- vLLM main:
89e08d6d18

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-13 12:10:21 +08:00
e57cca971c Fix the bugs about operator registration by PyTorch Dispatcher (#2786)
**Background:**

There are two principles about operator registration in PyTorch
- The same namespace can be only registered once by `TORCH_LIBRARY`
- The operator signatures can be only registered once by `def`

Considering that all custom operators defined in the current repo are
used only by Ascend, rather than having vLLM define a common operator
schema that all accelerators then follow and implement for their
respective hardware (which would be conducive to functional abstraction),
we can rename the operator registration namespace to an Ascend-specific
namespace (**_C_ascend**).

Related ISSUE: https://github.com/vllm-project/vllm-ascend/issues/2742


- vLLM version: main
- vLLM main:
f592b3174b

Signed-off-by: FFFrog <ljw1101.vip@gmail.com>
2025-09-13 11:58:52 +08:00
138e932630 Bump vLLM version to v0.10.2rc2 (#2902)
### What this PR does / why we need it?

Upgrade vLLM version to 0.10.2rc2

### Does this PR introduce _any_ user-facing change?

Yes, image will use 0.10.2rc2 vLLM

### How was this patch tested?

- vLLM version: main
- vLLM main:
f17c075884

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-13 11:39:48 +08:00
585a494baa [Core] Disable the chunked prefill feature in Non-MLA LLMs (#2894)
### What this PR does / why we need it?
This PR forcibly disables the chunked prefill feature in non-MLA models,
as the performance of the operators supporting this functionality is
currently suboptimal. Only if the user has explicitly enabled chunked
prefill in the ascend_scheduler_config do we allow this feature.
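
A hedged example of opting back in through the scheduler config (the key names follow other examples in this log; the model name is a placeholder and exact fields may differ by version):

```python
from vllm import LLM

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model
    additional_config={
        "ascend_scheduler_config": {
            "enabled": True,
            "enable_chunked_prefill": True,  # explicit opt-in keeps the feature on
        }
    },
)
```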

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with new added/existing test.

Related: https://github.com/vllm-project/vllm-ascend/pull/2659

- vLLM version: main
- vLLM main:
d21a36f5f9

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-09-12 23:17:09 +08:00
756b8a1946 Revert "[Feat] Unquantized linear nz support (#2619)" (#2896)
### What this PR does / why we need it?
This reverts commit 7b2ecc1e9a64aeda78e2137aa06abdbf2890c000.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

- vLLM version: main
- vLLM main:
64d90c3e4f

Closes: https://github.com/vllm-project/vllm-ascend/issues/2890
Closes: https://github.com/vllm-project/vllm-ascend/issues/2887
Closes: https://github.com/vllm-project/vllm-ascend/issues/2885

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-12 20:51:12 +08:00
fc2bcbe21c [Ops] Fix bug in register_custom_ops without forward_context (#2883)
### What this PR does / why we need it?
This PR fixes the bug in register_custom_ops when no forward_context is
available. We add a try-except to handle this situation.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: main
- vLLM main:
7920de0a2a

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-09-12 16:58:08 +08:00
6d8bc38c7b Enable label-based image test and use free runner to run lint (#2864)
### What this PR does / why we need it?
- Enable label-based image test and use free runner to run lint
- soft revert
26f388ba08

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?


- vLLM version: main
- vLLM main:
404c85ca72

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-12 10:49:42 +08:00
778cb72556 fix bug when rotary_dim is not 128 (#2847)
### What this PR does / why we need it?
`torch_npu.npu_apply_rotary_pos_emb` only supports head_size and
rotary_dim equal to 128. An error occurs when running GLM.
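
An illustrative guard (not the exact patch): only take the fused NPU rope kernel when its 128-dim constraint holds, otherwise fall back to a native rotary embedding.

```python
def apply_rope(q, k, cos, sin, head_size, rotary_dim, fused_rope, native_rope):
    if head_size == 128 and rotary_dim == 128:
        # constraint of torch_npu.npu_apply_rotary_pos_emb
        return fused_rope(q, k, cos, sin)
    return native_rope(q, k, cos, sin)  # generic fallback, e.g. for GLM
```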

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?

- vLLM version: main
- vLLM main:
404c85ca72

Signed-off-by: realliujiaxu <realliujiaxu@163.com>
2025-09-12 09:49:36 +08:00
f5a97e8fa5 [Quantization] register AscendQuantRMSNorm for quantization (#2856)
### What this PR does / why we need it?

modelslim will generate a self.bias for RMSNorm in quantization; since
RMSNorm in vllm has no such parameter, it's necessary
to create an AscendQuantRMSNorm.
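
A minimal sketch of an RMSNorm variant that carries the extra bias generated by modelslim (illustrative, not the registered AscendQuantRMSNorm itself):

```python
import torch

class RMSNormWithBias(torch.nn.Module):
    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(hidden_size))
        self.bias = torch.nn.Parameter(torch.zeros(hidden_size))  # parameter vanilla RMSNorm lacks
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
        return x * rms * self.weight + self.bias
```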
### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

tested by deepseek-v3.1-w8a8

<img width="2496" height="592" alt="image"
src="https://github.com/user-attachments/assets/004c6e76-3d7a-4a1f-b59f-a14304012663"
/>


- vLLM version: main
- vLLM main:
d6249d0699

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-09-11 23:14:02 +08:00
eab3635850 [Bugfix] Retrieve num_redundant_experts from eplb_config in torchair qwen3_moe.py (#2857)
### What this PR does / why we need it?
This PR addresses a configuration retrieval issue related to EPLB
(Expert Parallel Load Balancing) settings in qwen3_moe.py.

The key change is adjusting the source of num_redundant_experts to
correctly fetch from the eplb_config sub-structure within
parallel_config, rather than directly from parallel_config. This aligns
with the updated configuration hierarchy for EPLB-related parameters.

This change references `vllm_ascend/models/qwen3_moe.py`

https://github.com/vllm-project/vllm-ascend/blob/main/vllm_ascend/models/qwen3_moe.py#L255-L257

### Does this PR introduce _any_ user-facing change?
NA

### How was this patch tested?

run bash as follows and test pass
```
source /sfs_turbo/humpy/B080/cann_b080/ascend-toolkit/set_env.sh
source /sfs_turbo/humpy/B080/cann_b080/nnal/atb/set_env.sh
#export HCCL_BUFFSIZE=300

# export HCCL_SOCKET_IFNAME="eth0"
# export TP_SOCKET_IFNAME="eth0"
# export GLOO_SOCKET_IFNAME="eth0"
# export HCCL_IF_IP=33.215.118.231

export VLLM_USE_V1=1
export VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ=1
export TASK_QUEUE_ENABLE=1
# export VLLM_VERSION=0.9.1
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
export HCCL_OP_EXPANSION_MODE="AIV"
export HCCL_INTRA_PCIE_ENABLE=1
export HCCL_INTRA_ROCE_ENABLE=0

rm -rf ./.torchair_cache/
rm -rf ./dynamo_*
rm -rf /root/ascend/log/debug/plog/*

python -m vllm.entrypoints.openai.api_server \
    --model=/sfs_turbo/tzq/model/Qwen/Qwen3-235B-A22B/ \
    --served-model-name auto \
    --port 8006 \
    -tp 1 \
    -dp 16 \
    --enable_expert_parallel \
    --max-num-seqs 48 \
    --max-model-len 32768 \
    --gpu-memory-utilization 0.95 \
    --additional-config '{"torchair_graph_config":{"enabled":true,"use_cached_graph":true,"graph_batch_sizes_init":false,"graph_batch_sizes":[1, 8, 16, 24, 48]}, "ascend_scheduler_config":{"enabled":false}, "refresh":true}' \
    --kv-transfer-config \
    '{
        "kv_connector": "SharedStorageConnector",
        "kv_buffer_device": "npu",
        "kv_role": "kv_consumer",
        "kv_parallel_size": 2,
        "kv_port": "20002",
        "engine_id": "decode-'${NODE_RANK}'",
        "kv_rank": 1,
        "kv_connector_extra_config": {
            "prefill": {
                    "dp_size": 1,
                    "tp_size": 16
             },
             "decode": {
                    "dp_size": 16,
                    "tp_size": 1
             }
        }
    }' \
    2>&1 disown

```

- vLLM version: main
- vLLM main:
0ae43dbf8c

Signed-off-by: wyu0-0 <woshilynn@163.com>
2025-09-11 22:15:19 +08:00
aeffe27b30 [Perf]set moe w2_weight default to be nz (#2842)
### What this PR does / why we need it?

This PR sets the default format of GMM w2_weight in w8a8_dynamic to be
NZ to improve performance.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?


- vLLM version: main
- vLLM main:
e40827280b

---------

Signed-off-by: Angazenn <supperccell@163.com>
2025-09-11 21:40:54 +08:00
9615dea3a7 Refactor tensor_parallel and comm_utils (#2814)
### What this PR does / why we need it?
1. Move ops/comm_utils to ops/moe/comm_utils
2. Move distributed/tensor_parallel/gather_from_sequence_parallel_region
to ops/moe/comm_utils
3. Delete distributed/tensor_parallel

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
e2e & ut

- vLLM version: main
- vLLM main:
a1213fae5f

---------

Signed-off-by: wuweiqiang24 <1005334931@qq.com>
Signed-off-by: wuweiqiang24 <wuweiqiang11@huawei.com>
2025-09-11 21:26:36 +08:00
0005479b9c [main] mlp weight prefetch in Qwen Dense Models (#2816)
### What this PR does / why we need it?
This PR prefetches the weights of the MLP layers in Qwen Dense Models,
mainly to optimize performance in the decode phase.

### Does this PR introduce _any_ user-facing change?
 No.

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: main
- vLLM main:
a1213fae5f

Signed-off-by: rjg-lyh <1318825571@qq.com>
Co-authored-by: Shuming19 <313093131@qq.com>
2025-09-11 21:20:09 +08:00
c3c2221503 [Feat]support dynamic quantization in allgather (#2841)
### What this PR does / why we need it?
[Feat]support dynamic quantization in allgather
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: main
- vLLM main:
5931b7e5d9

Signed-off-by: withHades <244036962@qq.com>
Signed-off-by: WithHades <244036962@qq.com>
2025-09-11 18:47:20 +08:00
07c58669fd [Bugfix] Update lm_eval version to remove deprecated param (#2871)
### What this PR does / why we need it?
update lm_eval to fix #2865, notice that the latest lm_eval
[release](https://github.com/EleutherAI/lm-evaluation-harness/releases/tag/v0.4.9.1)
haven't include this commit, so let's first pin it to the commit before
the next release

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Closes: https://github.com/vllm-project/vllm-ascend/issues/2865

- vLLM version: main
- vLLM main:
d13360183a

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-09-11 18:39:03 +08:00
bd3dedea61 support qwen25 vl w8a8 quantization (#2778)
### What this PR does / why we need it?
support qwen25 vl w8a8 quantization
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
62f66be1f7

---------

Signed-off-by: lijiaojiao <lijiaojiao990304@163.com>
Co-authored-by: lijiaojiao <lijiaojiao990304@163.com>
2025-09-11 16:40:51 +08:00
2b9269b581 [Perf][V1] Fully overlap model execution (#2783)
This PR is based on top of
[#23569](https://github.com/vllm-project/vllm/pull/23569) and
[#24219](https://github.com/vllm-project/vllm/pull/24219).

### What this PR does / why we need it?
This PR allows the model runner to function asynchronously when using
async scheduling. This allows full overlap of the cpu operations
(including prepare_inputs) and the model forward pass. This diff is
functional and does not support speculative decoding, PP, or guided
decoding.

Expected speedup is 5-10% over the current async scheduling.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
server
```
python -m vllm.entrypoints.openai.api_server --model=Qwen3-32B\
	--trust-remote-code --enforce-eager \
	--distributed-executor-backend=mp \
	-tp=4 \
	--port 8006 \
	--max-model-len 32000 \
	--block-size 128 \
	--gpu-memory-utilization 0.99
```
client
```
python $TEST_PY --backend vllm --trust-remote-code --model Qwen3-32B \
  --dataset-name random --random-input-len 2048 --random-output-len 2048 \
  --ignore-eos\
  --num-prompts 48 --max-concurrency 48  --request-rate inf --temperature 0 \
  --metric-percentiles 90  --base-url http://localhost:8006 --save-result \
  --result-dir $PROFILER_DIR
```

benchmark test based on Qwen3-32B TPOT result:
||forward async| scheduler async |sync|
|-|-|-|-|
|avg|41.73|41.86|44.20|
|improve0|0.3%|0|0|
|improve1|5.58%|0|0|

benchmark test based on Qwen2___5-VL-7B-Instruct TPOT result:
||forward async|sync|
|-|-|-|
|avg|23.22|29.16|
|improve|20.3%|0|


- vLLM version: main
- vLLM main:
e93f4cc9e3

Signed-off-by: jiangpeng36 <jiangpeng36@huawei.com>
Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
Co-authored-by: jiangpeng36 <jiangpeng36@huawei.com>
Co-authored-by: Ronald1995 <ronaldautomobile@163.com>
2025-09-11 16:35:36 +08:00
923cdaeba3 fix ascend fused moe spelling error (#2863)
### What this PR does / why we need it?
fix ascend fused moe spelling error

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

0ae43dbf8c

- vLLM version: main
- vLLM main:
fcc0a3130a

Signed-off-by: zhaozixin <zhaozixin1@huawei.com>
Co-authored-by: zhaozixin <zhaozixin1@huawei.com>
2025-09-11 14:35:46 +08:00
b9a0a75c78 fix qwen torchair attention PrefillCacheHit (#2787)
### What this PR does / why we need it?
Fix qwen torchair attention PrefillCacheHit 
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
vLLM version: v0.10.1.1
vLLM main:
e599e2c65e

- vLLM version: main
- vLLM main:
0b9a612fa3

Signed-off-by: zhaozixin <zhaozixin1@huawei.com>
Co-authored-by: zhaozixin <zhaozixin1@huawei.com>
2025-09-11 14:26:59 +08:00
7b2ecc1e9a [Feat] Unquantized linear nz support (#2619)
### What this PR does / why we need it?
Currently, when execution reaches the Linear layer of the model in
vLLM-Ascend, the weight input format is ND in the unquantized case and
the skipped-ascend case, which is slower than FRACTAL_NZ.
This PR supplements the execution logic of the Linear layer. When
VLLM_ASCEND_ENABLE_MLP_OPTIMIZE=1 and the CANN version is 8.3, the weights
of the Linear layer will be converted to FRACTAL_NZ, in both the unquantized
case and the skipped-ascend case.

- vLLM version: main
- vLLM main:
267c80d31f

Signed-off-by: anon189Ty <Stari_Falcon@outlook.com>
2025-09-11 11:40:00 +08:00
5691104249 LLMdatadist connector adapt the distributed KV aggregation (#2718)
### What this PR does / why we need it?
The LLMdatadist connector adapts distributed KV aggregation for the main
branch. The P node is changed from returning "finish sending" only when TP0
responds to returning "finish sending" as soon as each NPU receives it.
The D node will send a finish-receive signal to the corresponding TP
rank of the P node.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
gsm8k test
2*A3 1P 1D
P: dp2 tp8 D:dp 4 tp4
P: dp2 tp8 D:dp 2 tp8


- vLLM version: main
- vLLM main:
cc99baf14d

Signed-off-by: liziyu <liziyu16@huawei.com>
2025-09-11 11:37:41 +08:00
c2fdd4b8bc [CI/UT] Fix UTs on register customop and warm up model (#2862)
### What this PR does / why we need it?
Fix UTs on register customop and warm up model

### How was this patch tested?
CI passed with existing test.

Co-authored-by: Icey <1790571317@qq.com>

- vLLM version: main
- vLLM main:
cc99baf14d

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-09-11 11:30:16 +08:00
b7df04de9b debug_aclgraph_sizes_capture (#2827)
### What this PR does / why we need it?
1. Solved the problem that, in the Qwen3 MoE model case, enabling DP would
use an extra stream, causing an ACLgraph sizes capture error.
2. After experimentation, it was found that in many cases some
operators occupy more streams than expected, so the stream buffer
in ACLgraph was not large enough. After discussion, an
extra 120 streams were added as buffer.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
ut

- vLLM version: main
- vLLM main:
0ae43dbf8c

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
2025-09-10 22:50:48 +08:00
e75b568011 [CI] Update pre_commit runner (#2850)
### What this PR does / why we need it?

Update pre_commit runner

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: main
- vLLM main:
0ae43dbf8c

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-09-10 20:23:25 +08:00
b7ee3fdad3 [Code clean] Remove the unnecessary code (#2815)
**Background:**

A dynamic library named vllm_ascend_C.so will be generated when
compiling vLLM-Ascend, so when importing vllm_ascend.vllm_ascend_C in
Python, the interpreter will search for vllm_ascend_C.so and try to find an
external symbol named PyInit_vllm_ascend_C, which is provided in
csrc/camem_allocator.cpp.

**Conclusion:**

The PyInit__C symbol is redundant.

- vLLM version: v0.10.1.1
- vLLM main:
717fc00e98

Signed-off-by: FFFrog <ljw1101.vip@gmail.com>
2025-09-10 17:19:39 +08:00
88d7af62be [main] adjust the position of warm_up_atb (#2823)
### What this PR does / why we need it?
Adjust the position of warm_up_atb.

### Does this PR introduce _any_ user-facing change?
not involved

### How was this patch tested?
CI passed with existing test.

- vLLM version: main
- vLLM main:
b23fb78623

Signed-off-by: huangxialu <huangxialu1@huawei.com>
2025-09-10 14:06:38 +08:00
22b425765a [Bugfix] Fix broken CI (#2825)
### What this PR does / why we need it?
1. Initial support for disable-tp, integrating with
[vllm-commit](https://github.com/vllm-project/vllm/pull/23024)
2. [vllm@commit](https://github.com/vllm-project/vllm/pull/23673) now
uses `bytes` to store the `BlockHash` to reduce GC overhead; this PR adds
the integration.

- vLLM version: main
- vLLM main:
e40827280b

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-09-10 13:29:29 +08:00
aa4d2a91ed Refactor AscendMultiHeadLatentAttention (#2826)
### What this PR does / why we need it?
Register AscendMultiHeadLatentAttention as CustomOP, following vllm changes

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with new added/existing test.


- vLLM version: main
- vLLM main:
b23fb78623

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-09-10 11:26:11 +08:00
168ad600b5 [main] add pd transfer for ascend scheduler (#2753)
### What this PR does / why we need it?
For offline scenarios, adjust the scheduling process to prioritize the
prefill phase of all requests, then process the decode phase of all
requests.

### How was this patch tested?

```
max_num_seqs=24,
additional_config={
    "ascend_scheduler_config":{
        "enabled": True,
        "enable_pd_transfer": True,
        "decode_max_num_seqs": 24,
        "enable_chunked_prefill": False
    }
},
```
| input | output | num prompts | max_num_seqs | dp | tp | scheduler | tps |
| ------ | ------ | ---------- | ---------------- | ---- | ---- | ---------------- | --------------- |
| dapo-math-17K | 2K | 384 | 24 | 2 | 1 | v1 | 234.06 |
| dapo-math-17K | 2K | 384 | 24 | 2 | 1 | pd transfer | 239.59(+2.4%) |
| dapo-math-17K| 2K | 384 | 24 | 4 | 1 | v1 | 222.85 |
| dapo-math-17K| 2K | 384 | 24 | 4 | 1 | pd transfer | 225.81(+1.3%) |


- vLLM version: v0.10.1.1
- vLLM main:
6fb2788163

---------

Signed-off-by: CaranLic <740821011@qq.com>
2025-09-10 08:46:39 +08:00
edf1f600ad [CI] Remove compatibility maintenance for vllm v0.10.1 and v0.10.1.1 (#2840)
### What this PR does / why we need it?
Remove compatibility maintenance for vllm v0.10.1 and v0.10.1.1

### Does this PR introduce _any_ user-facing change?
branch main of vllm-ascend will not be compatible with vllm v0.10.1 and
v0.10.1.1

### How was this patch tested?
CI passed with existing test.

- vLLM version: v0.10.1.1
- vLLM main:
6fb2788163

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-09-10 08:43:10 +08:00
93e28e6862 add weight transpose check. (#2756)
### What this PR does / why we need it?
In reinforcement learning scenarios, weight updates are required, but
the current inference applies a transpose operation to the weights,
altering their shape. This causes a shape mismatch with the training
weights, triggering an error during weight updates.
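
An illustrative sketch of such a check (assumed shapes, not the actual code): only transpose when the weight is still in the training layout, so RL weight updates see matching shapes.

```python
import torch

def maybe_transpose_for_inference(weight: torch.Tensor, inference_shape: torch.Size) -> torch.Tensor:
    if weight.shape == inference_shape:
        return weight  # already in the inference layout, keep shape stable
    return weight.transpose(0, 1).contiguous()
```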

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.10.1.1
- vLLM main:
6fb2788163

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
2025-09-09 20:33:43 +08:00
e13c4ddb42 [Fix] Fix SharedFusedMoE (#2817)
### What this PR does / why we need it?
Really strange that `register_oot` doesn't work with `SharedFusedMoE`,
so we have to add this patch, for now.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
This PR won't have any effect in DeepSeek since we currently still stick
with the old `CustomDeepseekV2`.

- vLLM version: v0.10.1.1
- vLLM main:
0cdd213641

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-09-09 18:19:56 +08:00
7a205dbaa8 [main] Optimize rope in Qwen Models (#2571)
### What this PR does / why we need it?
Optimize rope by caching sin and cos at the first layer in Qwen Models.
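
A minimal sketch of the caching idea (illustrative only): compute cos/sin once for the current positions at the first layer and reuse them for the remaining layers of the step.

```python
import torch

_step_cache = {}

def get_cos_sin(positions: torch.Tensor, inv_freq: torch.Tensor):
    key = (positions.data_ptr(), tuple(positions.shape))
    if key not in _step_cache:
        freqs = torch.outer(positions.float(), inv_freq)
        _step_cache.clear()                 # keep only the current step's entry
        _step_cache[key] = (freqs.cos(), freqs.sin())
    return _step_cache[key]
```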

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with new added/existing test.


- vLLM version: v0.10.1.1
- vLLM main:
562663a044

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: ZYang6263 <zy626375@gmail.com>
Signed-off-by: rjg-lyh <1318825571@qq.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: ZYang6263 <51255902183@stu.ecnu.edu.cn>
Co-authored-by: ZYang6263 <zy626375@gmail.com>
2025-09-09 14:28:14 +08:00
5bcb4c1528 [CI] Reduce CI time (#2801)
1. Only run light e2e test before the PR is `ready` to reduce CI time.
2. Run full test once the PR is labled `ready` and `ready for test`
3. Run lint job on self host CPU container to avoid waiting much.


- vLLM version: v0.10.1.1
- vLLM main:
6910b56da2

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-09 10:52:14 +08:00
1bbb20ea13 [main] flashcomm_v1 optim in Qwen Dense Models (#2802)
### What this PR does / why we need it?
Flashcomm_v1 optim in Qwen Dense Models.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: v0.10.1.1
- vLLM main:
5e537f45b4

Co-authored-by: 1024daniel <xxltju324@gmail.com>
2025-09-08 22:52:24 +08:00
4df8df5b94 [bugfix] fix deepseek rope sincoscache re-generation (#2744)
### What this PR does / why we need it?
The current implementation will result in duplicate generation of
`sin_cos_cache` in rope when `kv_seqlen` > 4k, because the
initialization length of the `sin_cos_cache` is only 4k.
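
An illustrative sketch of the intended behaviour (not the DeepSeek rope code itself): grow the cache once when a longer length is requested instead of regenerating it in every forward call past 4k.

```python
import torch

class SinCosCache:
    def __init__(self, inv_freq: torch.Tensor, init_len: int = 4096):
        self.inv_freq = inv_freq
        self.max_len = 0
        self.cos = self.sin = None
        self._build(init_len)

    def _build(self, seq_len: int) -> None:
        t = torch.arange(seq_len, dtype=torch.float32)
        freqs = torch.outer(t, self.inv_freq)
        self.cos, self.sin = freqs.cos(), freqs.sin()
        self.max_len = seq_len

    def get(self, seq_len: int):
        if seq_len > self.max_len:  # extend once, don't rebuild on every call
            self._build(seq_len)
        return self.cos[:seq_len], self.sin[:seq_len]
```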

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
After this PR is merged, the sin_cos_cache will not grow in the forward func,
so `test_native_rope_deepseek_forward_cache_handling` is no longer necessary.

- vLLM version: v0.10.1.1
- vLLM main:
60f0843ef8

Signed-off-by: zzzzwwjj <1183291235@qq.com>
2025-09-08 22:03:34 +08:00
7d6d9449a8 [Misc] Move lora patch file into lora module (#2797)
Clean up a useless file in the patch module. Updating the lora support list
directly is OK in vLLM Ascend; there is no need to patch vLLM.


- vLLM version: v0.10.1.1
- vLLM main:
f4962a6d55

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-08 21:42:12 +08:00
85d989a3b9 [Misc] Remove pangu model file (#2798)
vllm-ascend won't contain model files anymore. The pangu model file has
been moved to the torchair module, so the original one can be removed.

Note: After this PR, pangu only works with torchair mode then.
- vLLM version: v0.10.1.1
- vLLM main:
8c892b1831

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-08 21:30:37 +08:00
a041d4f328 [main] [refactor] refactor common_fused_moe.py (#2706)
### What this PR does / why we need it?
1. Move prepare/finalize operation from moe_comm_method to
/ops/moe/fused_moe_prepare_and_finalize
2. Adapt to token_dispatcher in moe_comm_method
3. Move
moe_comm_method/experts_selector/token_dispatcher/fused_moe_prepare_and_finalize
to /ops/moe
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
e2e & ut

- vLLM version: v0.10.1.1
- vLLM main:
f4962a6d55

Signed-off-by: weichen <calvin_zhu0210@outlook.com>
Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
2025-09-08 20:09:50 +08:00
1a82b16355 Remove unused code in fused_moe.py (#2805)
### What this PR does / why we need it?
line 408 already declares mc2_mask, so remove the duplicated unused code.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
CI passed with existing test.

- vLLM version: v0.10.1.1
- vLLM main:
60f0843ef8

Signed-off-by: machenglong <machenglong_yewu@cmss.chinamobile.com>
2025-09-08 20:05:19 +08:00
d51694a77b [2/N][Refactor][Quantization] clean quantization patch (#2785)
### What this PR does / why we need it?
quantization patch is unused code

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
tested by CI

- vLLM version: v0.10.1.1
- vLLM main:
f4962a6d55

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-09-08 17:31:53 +08:00
cd88f89267 Bump actions/github-script from 7 to 8 (#2803)
Bumps [actions/github-script](https://github.com/actions/github-script) from 7 to 8.

- vLLM version: v0.10.1.1
- vLLM main:
8c892b1831

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-09-08 14:53:26 +08:00
d3c3538ddc [Bugfix]fix bug when graph_size is not divisible by tp_size (#2719)
### What this PR does / why we need it?
fix https://github.com/vllm-project/vllm-ascend/issues/2702
- A2: skip graph_size update that makes it to tp_size because
dispatch/combine op support different batch size across EP ranks
- A3: add `max_num_reqs = max(new_graph_batch_sizes)` to fix graph_size
and max_num_reqs mismatch

### Does this PR introduce _any_ user-facing change?
Nope
### How was this patch tested?


- vLLM version: v0.10.1.1
- vLLM main:
e599e2c65e

---------

Signed-off-by: realliujiaxu <realliujiaxu@163.com>
2025-09-08 14:52:33 +08:00
dd087effcc Refector prepare_inputs in model_runner_v1.py (#2750)
### What this PR does / why we need it?
Refactor prepare_inputs in model_runner_v1.py to make it easier to read.

### Does this PR introduce _any_ user-facing change?
NO
### How was this patch tested?
PASS CI

- vLLM version: v0.10.1.1
- vLLM main:
e599e2c65e

---------

Signed-off-by: ChenTaoyu-SJTU <ctynb@qq.com>
2025-09-08 10:45:23 +08:00
c735bb0941 [Fix] Ensure metadata sync across DP ranks in eager mode (#2766)
### What this PR does / why we need it?
Removes the condition that skips metadata synchronization when
`enforce_eager` is enabled.

This change is necessary to correctly sync the `with_prefill` and
`enable_dbo` flags across all data parallel ranks, which is not required
in the base implementation. Forcing the sync operation prevents
potential inconsistencies, albeit with a minor performance impact.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Add a E2E online test case?

- vLLM version: v0.10.1.1
- vLLM main:
e599e2c65e

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-09-08 09:55:16 +08:00
2693196ef8 add gatherep select. (#2740)
### What this PR does / why we need it?
add gatherep select.

- vLLM version: v0.10.1.1
- vLLM main:
e599e2c65e

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
2025-09-08 09:15:50 +08:00
6666e5265d Added support for KV connector v1 (#2039)
### What this PR does / why we need it?
- This PR adds support for the KV connector interface in the V1
architecture, in the same way as vllm. vllm-ascend currently lacks
this support, which is also required for layerwise management of KV
caches.

- The connector interface allows using external tools and integrating them
with vLLM.

### Notes:
We are aware of Issue #684 , however that issue does not modify the
attention classes as necessary to perform layerwise management of KV
caches required for connectors like LMCache.

This PR ports the necessary code from vanilla vLLM, so the KV connector
API is the same as upstream, supporting the standard KV connector API.

EDIT: this PR originally re-implemented part of the changes to
model_runner_v1.py that were merged one hour before it was opened. I
resolved the conflicts by removing all modifications to model_runner_v1.py,
which are now largely already merged in main. This PR is now limited to
the modifications to the attention_v1 file.

### Does this PR introduce _any_ user-facing change?
The PR does not modify current APIs, but it extends the behavior of
current worker runner and attention classes to save and load KV caches.
In absence of connectors, the behavior should stay untouched.

### How was this patch tested?
- No unit test implemented yet for the worker.

- Tested together with LMCache using
https://github.com/LMCache/LMCache/blob/dev/examples/kv_cache_reuse/local_backends/offload.py
with the following models:
1. Deepseek-R1-Distill-Qwen-1.5B
2. Qwen3-30B-A3B
3. Deepseek-v2-lite
4. Llama-3.1-8B
LMCache used in both layerwise and non-layerwise mode.

- Performed LMEval on LMCache integrated with vllm-ascend.

Results without LMCache on Qwen3-8B:
|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.8400|± |0.0101|
| | |strict-match | 5|exact_match|↑ |0.8355|± |0.0102|

Results with LMCache Layerwise:
|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.8385|± |0.0101|
| | |strict-match | 5|exact_match|↑ |0.8332|± |0.0103|


- vLLM version: v0.10.1.1
- vLLM main:
50fede6634

---------

Signed-off-by: marcobarlo <barlettamarco8@gmail.com>
Signed-off-by: marcobarlo <65128997+marcobarlo@users.noreply.github.com>
2025-09-08 09:04:22 +08:00
2967e5e22a [Benchmark] Correctly kill vllm process in performance benchmark (#2782)
### What this PR does / why we need it?
vLLM now names its processes with a VLLM prefix since
https://github.com/vllm-project/vllm/pull/21445, so we should kill the
correctly named process after each benchmark iteration to avoid OOM issues.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
e599e2c65e

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-09-07 10:36:34 +08:00
a746f8274f [DOC] Qwen3 PD disaggregation user guide (#2751)
### What this PR does / why we need it?
This PR adds the documentation for the prefiller & decoder disaggregation
deployment guide.

The scenario of the guide is:
- Use 3 nodes totally and 2 NPUs on each node
- Qwen3-30B-A3B
- 1P2D
- Expert Parallel

The deployment can be used to verify the PD Disaggregation / Expert Parallel
features with slightly fewer resources.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
No.


- vLLM version: v0.10.1.1
- vLLM main:
e599e2c65e

---------

Signed-off-by: paulyu12 <507435917@qq.com>
2025-09-07 10:35:37 +08:00
b2f77d3aa8 [fix] prefill unsupport sliding window attention (#2758)
### What this PR does / why we need it?
Fix a prefill attention bug: sliding window attention is not supported, and
npu_fused_infer_attention_score only supports head_dim equal to 128, not
other values.
### Does this PR introduce _any_ user-facing change?
The prefill phase no longer uses npu_fused_infer_attention_score.
### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
e599e2c65e

---------

Signed-off-by: nsdie <yeyifan@huawei.com>
2025-09-07 10:34:38 +08:00
752e272a55 Add note for Ascend HDK version (#2765)
### What this PR does / why we need it?
Add note for Ascend HDK version

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

- vLLM version: v0.10.1.1
- vLLM main:
e599e2c65e

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-07 10:33:41 +08:00
5a7181569c [feat]: oproj tensor parallelism in pure DP and graph-mode scenarios. (#2167)
### What this PR does / why we need it?
This PR introduces tensor model parallelism for the o_proj matrix to reduce
memory consumption. It only supports graph mode in the pure-DP scenario.

On a DeepSeek R1 W8A8 PD-disaggregated Decode instance using pure DP,
oproj_tensor_parallel_size = 8 adds about 1 ms to TPOT while saving 5.8
GB of NPU memory per rank. The best trade-off was
oproj_tensor_parallel_size = 4, which saves memory without increasing TPOT.

performance data:
<img width="1442" height="442" alt="image"
src="https://github.com/user-attachments/assets/83270fc5-868a-4387-b0a9-fac29b4a376d"
/>

### Does this PR introduce _any_ user-facing change?
This PR introduces one new config in `additional_config`.
| Name | Effect | Required | Type | Constraints |
| :--- | :--- | :--- | :--- | :--- |
| oproj_tensor_parallel_size | Split the o_proj matrix along the row dimension (head num * head dim) into oproj_tensor_parallel_size pieces. | No | int | Default value is None; once this value is set, the feature will be enabled. head num * head dim must be divisible by this value. |

example

`--additional_config={"oproj_tensor_parallel_size": 8}`

### How was this patch tested?


- vLLM version: v0.10.1.1
- vLLM main:
eddaafc1c7

---------

Signed-off-by: zzhx1 <zzh_201018@outlook.com>
Co-authored-by: zzh <zzh_201018@outlook.com>
2025-09-07 10:31:32 +08:00
a58b43b72c Remove git .extraheader and fetch all commits in /vllm-workspace/vllm-ascend (#2746)
### What this PR does / why we need it?
Remove the git .extraheader setting and fetch all commits in
/vllm-workspace/vllm-ascend.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Closes: https://github.com/vllm-project/vllm-ascend/issues/2735
- vLLM version: v0.10.1.1
- vLLM main:
51d5e9be7d

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-09-05 09:45:11 +08:00
51a2aec115 Delete redundant codes related to communication (#2717)
### What this PR does / why we need it?
Delete redundant codes related to communication

### Does this PR introduce _any_ user-facing change?
Not involved.

### How was this patch tested?
Not involved.

- vLLM version: v0.10.1.1
- vLLM main:
6c7af8110a

---------

Signed-off-by: 刘哲续 <liuzhexu1@huawei.com>
Co-authored-by: 刘哲续 <liuzhexu1@huawei.com>
2025-09-05 09:39:39 +08:00
5b3646ab21 [FEATURE][MTP] Support MTP > 1 (#2708)
### What this PR does / why we need it?
[RFC:Support MTP > 1 for
DeepSeek](https://github.com/vllm-project/vllm-ascend/issues/2745)

- [x] dp1 tp16
- [x] dp4 tp4
- [x] dp2 tp 8
- [x] torchair graph

- vLLM version: v0.10.1.1
- vLLM main:
c9f7081f9c

Signed-off-by: 1092626063 <1092626063@qq.com>
2025-09-05 09:11:22 +08:00
83eb40a51c [Fix][MoE] Refine MoE communication strategy (#2734)
### What this PR does / why we need it?
Refactors the Mixture-of-Experts (MoE) communication method selection
logic. The choice between all-gather, all-to-all, and mc2 is now
determined by expert parallel configuration, SoC version (A2/A3), and
token count for better performance.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Added.


- vLLM version: v0.10.1.1
- vLLM main:
eafa8dcde6

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-09-05 09:04:04 +08:00
4c90fa79ca [Misc] Remove useless PD check in deepseek (#2739)
### What this PR does / why we need it?
Remove useless PD check in deepseek


- vLLM version: v0.10.1.1
- vLLM main:
6c7af8110a

---------

Signed-off-by: liziyu <liziyu16@huawei.com>
2025-09-04 22:22:19 +08:00
3a2a7d88db [Doc] Update accuracy reports for v0.10.1rc1 (#2755)
The accuracy results on NPU Atlas A2 have changed; updating
reports for all models (Qwen3-30B-A3B, Qwen2.5-VL-7B-Instruct,
Qwen3-8B-Base, DeepSeek-V2-Lite)

  - [Workflow run][1]
  
[1]:
https://github.com/vllm-project/vllm-ascend/actions/runs/17459225764
- vLLM version: v0.10.1.1
- vLLM main:
2b30afa442

Signed-off-by: vllm-ascend-ci <vllm-ascend-ci@users.noreply.github.com>
Co-authored-by: vllm-ascend-ci <vllm-ascend-ci@users.noreply.github.com>
2025-09-04 22:17:17 +08:00
f86596a66c allgather use fusedop. (#2689)
### What this PR does / why we need it?
Use 'npu_moe_init_routing_v2' & 'npu_moe_token_unpermute' to replace
'npu_moe_init_routing', 'npu_moe_compute_expert_tokens' &
'npu_moe_finalize_routing' to optimize performance.
### Does this PR introduce _any_ user-facing change?
| branch| tps| TTFT |TPOT |
| --- | --- | --- |--- |
|main  |733.98  | 280.05 |34.30 |
|main+fusedop  | 740.33 | 273.34 |33.99 |
### How was this patch tested?


- vLLM version: v0.10.1.1
- vLLM main:
6997a25ac6

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
2025-09-04 11:56:29 +08:00
7d47d8f4f6 [Fix] fix resources limit error when apply speculative decoding and aclgraph (#2472)
### What this PR does / why we need it?
When both speculative decoding and aclgraph are applied, and
cudagraph_capture_sizes uses the default value, it will report that the
stream resources are insufficient.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
9c99e4871f

Signed-off-by: withHades <244036962@qq.com>
2025-09-04 11:50:43 +08:00
0c0789be74 [Feat] allow using aclgraph in ray backend (#2589)
### What this PR does / why we need it?

Allow using aclgraph in ray backend, for tp + pp + aclgraph in multi
machine

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
4ba0c587ba

Signed-off-by: withHades <244036962@qq.com>
2025-09-04 11:45:56 +08:00
aff5189c87 [main] Fuse GroupedMatmul, Swiglu and DynamicQuant in W8A8_DYNAMIC quantized MoE layers (#2275)
### What this PR does / why we need it?

Fuse `GroupedMatmul`, `Swiglu` and `DynamicQuant` into one fusion
operation `GroupedMatmulSwigluQuant`.

1. extract common functions in `w4a8_dynamic.py` and `w8a8_dynamic.py`
2. if in supported occasion, use fusion operation
`npu_grouped_matmul_swiglu_quant`

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

Tested on W8A8 quantized Qwen3-235B-A22B model with `bs=16`

1. `tp=8`, `dp=1`, `moe_tp=8`, `moe_ep=1`, TPOP increased 21.54%, Output
Token Throughput increased 27.35%
<img width="3443" height="211" alt="image"
src="https://github.com/user-attachments/assets/a1a9c14d-2310-41be-9a03-36125dabae6e"
/>

3. `tp=8`, `dp=1`, `moe_tp=1`, `moe_ep=8`, TPOP increased 17.38%, Output
Token Throughput increased 6.86%
<img width="3443" height="211" alt="image"
src="https://github.com/user-attachments/assets/1ce92e92-720d-40c0-8b4d-c493e5cb10a6"
/>


- vLLM version: v0.10.1.1
- vLLM main:
6997a25ac6

---------

Signed-off-by: Ruri <33858552+zhoux77899@users.noreply.github.com>
Signed-off-by: zhoux77899 <zhouxiang100@huawei.com>
2025-09-04 11:37:32 +08:00
37f5a29cd4 [1/N][Refactor][Quantization] remove redundant quantizer class (#2680)
### What this PR does / why we need it?

The AscendQuantizer/LLMQuantizer classes are used to select the quant method
based on the quant config and a few other arguments,
but it is simpler and cleaner to replace these classes with a map, so I
removed them.

### Does this PR introduce _any_ user-facing change?
No 

### How was this patch tested?

ut and e2e test


- vLLM version: v0.10.1.1
- vLLM main:
6997a25ac6

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-09-04 11:35:14 +08:00
d4370ebc42 [Refactor] Refactor Spec Decode (#2668)
### What this PR does / why we need it?
Refactor spec decode

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with new added/existing test.


- vLLM version: v0.10.1.1
- vLLM main:
6997a25ac6

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Icey <1790571317@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-04 11:34:47 +08:00
7e16b4a7cd [ReleaseNote] Add Release Note for v0.10.1rc1 (#2635)
Add Release Note for v0.10.1rc1

- vLLM version: v0.10.1.1
- vLLM main:
b5ee1e3261

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-09-04 11:26:47 +08:00
e7409e95ee [1/N][Draft][Refactor]torchair pangu_moe modeling refactor (#2437)
### What this PR does / why we need it?

1. Similar to #2384, this PR adds a torchair-specific modeling for
pangu.
2. Fixes a bug introduced by routed_scaling_factor in #2675.
3. Removes the eager test case for pangu since there is already a
torchair test case.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?


- vLLM version: v0.10.1.1
- vLLM main:
6997a25ac6

---------

Signed-off-by: zengyanjia <z00883269@china.huawei.com>
Signed-off-by: Angazenn <supperccell@163.com>
Co-authored-by: zengyanjia <z00883269@china.huawei.com>
2025-09-04 10:39:21 +08:00
a58013440a [BugFix][MLA] Fix attn_mask bug for ring mla (#2704)
This PR fixes a bug related to the attention mask used in ring MLA. Ring
MLA now supports a compressed mask, so we can directly use a 512 *
512 attention mask.

- vLLM version: v0.10.1.1
- vLLM main:
b5ee1e3261

---------

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-09-04 10:22:46 +08:00
e11a1bbfc1 [Doc] Update news (#2736)
Refresh the news. Add meetup and official release info

- vLLM version: v0.10.1.1
- vLLM main:
b5ee1e3261

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-04 10:10:24 +08:00
984bd7c13a [Bugfix][APC] Fix accuracy issue on prefix caching with AscendScheduler (#2714)
### What this PR does / why we need it?
Fix accuracy issue on prefix caching with AscendScheduler

### How was this patch tested?
CI passed with `test_prefix_cache_with_ascend_scheduler`

- vLLM version: v0.10.1.1
- vLLM main:
6997a25ac6

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-09-04 08:22:46 +08:00
df88a2ecc8 [P/D]mooncake_connector adapted to 0.10.1 (#2664)
### What this PR does / why we need it?
In vllm version 0.10.1, a new KVOutputAggregator was added to the
executor, moving aggregation to the
executor (https://github.com/vllm-project/vllm/pull/19555). This caused
mooncake_connector to break. This change aims to fix this bug and also
adds a policy to forcibly release the KV cache when the prefill node
times out.

This PR is currently linked to a PR in vllm
(https://github.com/vllm-project/vllm/pull/23917). The vllm PR aims to
modify the finish and send count confirmation in heterogeneous TP
situations.

Many UTs were deleted because a lot of communication code was removed, so
the UTs as a whole are now more concise.

- vLLM version: v0.10.1.1
- vLLM main:
fa4311d85f

---------

Signed-off-by: baxingpiaochong <771405853@qq.com>
2025-09-04 08:22:10 +08:00
07d44ade19 bugfix: fix initialization error for mooncake in k8s (#2541)
### What this PR does / why we need it?
The details are described in this issue:
https://github.com/vllm-project/vllm-ascend/issues/2557

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?
Easy to test because we just need to echo the variable.


- vLLM version: v0.10.1.1
- vLLM main:
6997a25ac6

---------

Signed-off-by: zzy-ContiLearn <1831242919@qq.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: LCAIZJ <leichao139636@163.com>
2025-09-03 22:25:08 +08:00
41b028aa5f [Doc] add v0.9.1 release note (#2646)
Add release note for 0.9.1

- vLLM version: v0.10.1.1
- vLLM main:
8bd5844989

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-03 18:04:27 +08:00
90a75a90a9 [bugfix] fix torchair runtime error caused by configuration mismatches and missing files (#2532)
### What this PR does / why we need it?
This PR ports #2312 #2506 #2531 to main branch.

The original implementation of torchair caching forces users to prepare
everything, fix all the configuration, and enable
`use_cached_npu_graph`, which can cause problems that are confusing for
users to understand and tackle. It is better to compile the graph twice
instead of reusing the old kv caches and cached torchair graph, and the
extra compilation time is acceptable. Additionally, this PR fixes a
recompilation problem in torchair graph mode caused by the
`running_in_graph` variable in `AscendMLATorchairImpl`.

### Does this PR introduce _any_ user-facing change?
If users want to enable torchair.cache_compile with high compilation
speed, it is recommended to enable both `use_cached_kv_cache_bytes` and
`use_cached_graph` in `torchair_graph_config`. Without
`use_cached_kv_cache_bytes`, we compile the torchair computation graph
twice to avoid runtime errors caused by configuration mismatches (the
second compilation is much faster). Additionally, we've changed how the
TORCHAIR_CACHE_HOME environment variable is used, adding a suffix
directory to enhance safety and prevent accidental file deletion.

### How was this patch tested?
CI and e2e vllm serving pass.


- vLLM version: v0.10.1.1
- vLLM main:
70549c1245

---------

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-09-03 17:56:12 +08:00
5889fa1b1c [bugfix] ascend schedule encountered an incorrect req block length in the check_watermark_for_prefill function (#2508)
### What this PR does / why we need it?
Fix a bug where the Ascend scheduler computed an incorrect request block
length in the check_watermark_for_prefill function.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.10.1.1
- vLLM main:
426cc8629f

Signed-off-by: liziyu <liziyu16@huawei.com>
2025-09-03 16:54:39 +08:00
59d23c39eb [DP] External dp server starter (#2685)
This PR re-implements external-dp starter based on vllm's support for
external dp.

- vLLM version: v0.10.1.1
- vLLM main:
f38035c123

---------

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-09-03 16:30:26 +08:00
c03321781a [CI] skip unstable UT (#2716)
See #2687: we notice that test_platform and test_vocab_parallel_embedding
are unstable, so let's skip them for now.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-03 15:53:50 +08:00
3584306387 [Bugfix] Fix qwen2.5-vl-without-padding (#2623)
### What this PR does / why we need it?
Correct `AscendQwen2_5_VLForConditionalGeneration_Without_Padding`
override methods
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
42dc59dbac

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-09-03 14:38:55 +08:00
bece793be6 [CI] Disable per-PR triggering for A3 (#2710)
### What this PR does / why we need it?
Disable per-PR triggering for A3 for now; the dist test is instead
triggered via the `dist-test` label.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
136d853e65

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-09-03 11:52:34 +08:00
eaeb2efb20 [Main][Feat]Set the Profiler parameters through environment variables consistent with vLLM (#2608)
### What this PR does / why we need it?
Currently, when performing profiling in vLLM-Ascend, if you need to
obtain the Python call stack, you have to manually modify the code. The
code location is:
[worker_v1.py#L337](6c973361fc/vllm_ascend/worker/worker_v1.py (L337))
where you set with_stack to true.
Now, in vLLM, you can set whether to obtain the Python call stack
through an environment variable. The relevant PR is:
[#21803](https://github.com/vllm-project/vllm/pull/21803) and the
documentation is:
[profiling](https://docs.vllm.ai/en/latest/contributing/profiling.html?h=vllm_torch_profiler_with_stack#profile-with-pytorch-profiler)
This PR sets the profiler initialization parameters by using the same
environment variable as vLLM, eliminating the need for manual code
modification.
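
For example, a hedged sketch of configuring the profiler purely through environment variables (the variable names are taken from the linked vLLM profiling documentation and PR; the output directory is a placeholder):

```python
import os

# Set these before the vLLM engine is created; no source change is needed anymore.
os.environ["VLLM_TORCH_PROFILER_DIR"] = "/tmp/vllm_profile"   # placeholder trace directory
os.environ["VLLM_TORCH_PROFILER_WITH_STACK"] = "1"            # record the Python call stack
```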

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: v0.10.1.1
- vLLM main:
0235103cbb

---------

Signed-off-by: zhanghaiwen <zhanghaiwen@cmss.chinamobile.com>
Co-authored-by: zhanghaiwen <zhanghaiwen@cmss.chinamobile.com>
2025-09-03 10:58:08 +08:00
93754d8061 [Bugfix] Fix long context seq accuracy problem for GLM4.5 (#2601)
### What this PR does / why we need it?

Fix long context seq accuracy problem for `GLM4.5`.

When `max_tokens=1000`, there is cyclic output problem like:

```bash
00 00 00 00 00 00 00 00 00 00 00 00 00 00
```

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

```python
import os

os.environ["VLLM_USE_MODELSCOPE"] = "True"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

from vllm import LLM, SamplingParams

def main():
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ]

    # Create a sampling params object.
    sampling_params = SamplingParams(max_tokens=1000, temperature=0.0)
    # Create an LLM.
    llm = LLM(model="/root/.cache/modelscope/hub/models/ZhipuAI/GLM-4___5",
              tensor_parallel_size=8,
              enforce_eager=True,
              trust_remote_code=True,
              max_model_len=1024)

    # Generate texts from the prompts.
    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

if __name__ == "__main__":
    main()
```

- vLLM version: v0.10.1.1
- vLLM main:
0235103cbb

---------

Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com>
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-09-03 09:18:44 +08:00
b84465c525 [Perf]Enable npu_moe_gating_top_k_softmax on quantized scenarios (#2633)
### What this PR does / why we need it?
This PR enables `npu_moe_gating_top_k_softmax` when running quantized
MoE (such as W8A8). This op in fact makes no distinction between
quantized and non-quantized scenarios. Introducing this op reduces TPOT by
3~4 ms.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?


- vLLM version: v0.10.1.1
- vLLM main:
ce30dca5c4

Signed-off-by: Angazenn <supperccell@163.com>
2025-09-03 09:14:17 +08:00
24d4dad7b2 [CI] Enable MTP torchair e2e test (#2705)
enable MTP torchair e2e test

- vLLM version: v0.10.1.1
- vLLM main:
ce30dca5c4

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-03 08:57:43 +08:00
af62af3cc5 [Image] Upgrade openEuler to 24.03 (#2631)
### What this PR does / why we need it?
Upgrade openEuler to 24.03

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: v0.10.1.1
- vLLM main:
4071c76cf3

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-09-02 20:09:09 +08:00
0829b4873f [CI] recover e2e test (#2688)
1. Recover the skipped test.
2. Remove the pangu eager mode test; it's already covered by the torchair mode test.
3. Skip the pangu test until the bug is fixed.

- vLLM version: v0.10.1.1
- vLLM main:
56d04089ef

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-02 18:49:17 +08:00
f023bd52bf [CI] Make test_platform UT stable (#2696)
Make test_platform stable

- vLLM version: v0.10.1.1
- vLLM main:
56d04089ef

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-02 18:34:04 +08:00
c1e607b7b7 [Misc] Clean up useless code in rotary_embedding (#2663)
Clean up useless code which is only used for torchair in rotary_embedding

- vLLM version: v0.10.1.1
- vLLM main:
a344a5aa0a

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-02 17:25:33 +08:00
253b01b9a5 [7/N][refactor]fix torchair rope ops (#2683)
### What this PR does / why we need it?
Due to the registration mechanism, the torchair ops cannot take effect, so
we have to patch the Ascend ops to adapt them for torchair.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
vLLM version: main
vLLM main:
7ea22e42d5


- vLLM version: main
- vLLM main:
7ea22e42d5

Signed-off-by: hust17yixuan <303660421@qq.com>
2025-09-02 17:21:56 +08:00
9f1e054fe3 [Bugfix][LoRA][Operator] Fix LoRA custom operators accuracy issue (#2672)
### What this PR does / why we need it?
Fix the LoRA accuracy issue introduced by the custom AscendC operators
"bgmv_shrink, sgmv_shrink, bgmv_expand, sgmv_expand".

The bug details are:
- In the kernel function, if you want to call the GlobalTensor.GetSize
method, you have to pass the second parameter bufferSize when you call
GlobalTensor.SetGlobalBuffer first.
- Otherwise, GlobalTensor.GetSize will return a random value.
- You can refer to [this
doc](https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1alpha002/apiref/ascendcopapi/atlasascendc_api_07_00024.html).

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py

- vLLM version: v0.10.1.1
- vLLM main:
a344a5aa0a

---------

Signed-off-by: paulyu12 <paulyu0307@gmail.com>
Signed-off-by: paulyu12 <507435917@qq.com>
Co-authored-by: paulyu12 <paulyu0307@gmail.com>
2025-09-02 11:46:59 +08:00
214b32a346 [V1][BUGFIX][0.10.1] FIX mtp on main branch (#2632)
### What this PR does / why we need it?
Fix MTP torchair bug caused by torchair refactor and moe refactor

Depends on PRs:
fused moe fix: https://github.com/vllm-project/vllm-ascend/pull/2627 
torchair multi DP fix:
https://github.com/vllm-project/vllm-ascend/pull/2626

### Does this PR introduce _any_ user-facing change?
When DP is enabled, running the MTP online server requires disabling server
log stats with `--disable-log-stats`, because the current metrics do not
support multi-DP.
### How was this patch tested?


- vLLM version: v0.10.1.1
- vLLM main:
7c8271cd1e

Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-09-02 11:12:41 +08:00
fef18b60bc Refactor e2e CI (#2276)
Refactor E2E CI to make it clearer and faster:
1. Remove some useless e2e tests.
2. Remove some useless functions.
3. Make sure all tests run with VLLMRunner to avoid OOM errors.
4. Make sure all ops tests end with torch.empty_cache to avoid OOM errors.
5. Run the tests one by one to avoid resource limit errors.


- vLLM version: v0.10.1.1
- vLLM main:
a344a5aa0a

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-09-02 09:02:22 +08:00
0df059f41a [CI] Fix CI Break: upstream adds routed_scaling_factor in forward_oot interface (#2675)
### What this PR does / why we need it?
Fix CI Break: upstream adds routed_scaling_factor in forward_oot
interface, vllm-ascend needs to adapt

### Does this PR introduce _any_ user-facing change?
NA

### How was this patch tested?
E2E and UT

- vLLM version: v0.10.1.1
- vLLM main:
3e330fcb21

Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-09-01 19:02:50 +08:00
ea53f9076e support torchair mode (#2641)
### What this PR does / why we need it?
support torchair mode
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
5438967fbc

Signed-off-by: zhangdepeng <zhangdepeng2@huawei.com>
Signed-off-by: p00465316 <panchao13@huawei.com>
Co-authored-by: zhangdepeng <zhangdepeng2@huawei.com>
2025-09-01 15:49:07 +08:00
b72e34013f Add ut for mla (#2637)
### What this PR does / why we need it?
Update UT for MLA case

- vLLM version: v0.10.1.1
- vLLM main:
14b4326b94

---------

Signed-off-by: SunnyLee219 <3294305115@qq.com>
2025-09-01 14:07:57 +08:00
ad13964c71 [6/N][refactor]delete torchair in rotary ops (#2581)
### What this PR does / why we need it?
After moving the torchair-related rope ops into torchair_ops, split
torchair out of the original rope ops to make the code clean.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
vLLM version: main
vLLM main:
ab9f2cfd19


- vLLM version: v0.10.1.1
- vLLM main:
81eea3d348

Signed-off-by: hust17yixuan <303660421@qq.com>
2025-09-01 09:10:15 +08:00
c2c97f3079 [5/N][refactor]add torchair rotary ops (#2559)
### What this PR does / why we need it?
Move the torchair-related rotary ops into the torchair dir to make the code
clear. As a next step, we'll remove all torchair-related code outside of the
torchair rotary ops.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
vLLM version: main
vLLM main:
ab9f2cfd19


- vLLM version: v0.10.1.1
- vLLM main:
81eea3d348

Signed-off-by: hust17yixuan <303660421@qq.com>
2025-09-01 09:09:21 +08:00
3a5fc5ee01 [Refactor][MoE] remove redundant code after refactoring fused_moe (#2612)
### What this PR does / why we need it?
There is a lot of redundant MoE-related code here, and the structure is
not very clear.
We did the following:

- Placed the relatively independent apply_mlp code into a separate file.
- Removed the alltoall_buffer and alltoall_seq environment variables.
- Removed the code related to alltoall_buffer and alltoall_seq, retaining
the sole TokenDispatcher subclass.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
e2e&ut

- vLLM version: v0.10.1.1
- vLLM main:
4071c76cf3

---------

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
2025-08-30 22:28:50 +08:00
20ae71291d [torchair]remove aicpu op (#2640)
### What this PR does / why we need it?
remove aicpu op for torchair mode
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
vLLM version: v0.10.1.1
vLLM main:
05d839c19e
- vLLM version: v0.10.1.1
- vLLM main:
67c14906aa

Signed-off-by: zhangdepeng <zhangdepeng2@huawei.com>
Co-authored-by: zhangdepeng <zhangdepeng2@huawei.com>
2025-08-30 15:51:12 +08:00
7215454de6 bugfix for torchair graph (#2639)
### What this PR does / why we need it?
bugfix for torchair graph
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
67c14906aa

Signed-off-by: zhangdepeng <zhangdepeng2@huawei.com>
Co-authored-by: zhangdepeng <zhangdepeng2@huawei.com>
2025-08-30 15:49:48 +08:00
6f1047d5fd [CI] fix UT error. (#2644)
69f46359dd changed the VL input usage; this PR fixes the related UT failure.

- vLLM version: v0.10.1.1
- vLLM main:
d660c98c1b

---------

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
2025-08-30 12:04:01 +08:00
d3c93fba5c [3/N][Feat][Graph] Support all-to-all and quantized models with ACL Graph (#2614)
### What this PR does / why we need it?
* **Unify execution paths:** Consolidates the quantized and
non-quantized execution paths into a single `fused_experts` function,
removing duplicated logic and making the control flow clearer and easier
to maintain.
* **W8A8 dynamic quantization:** Adds support for W8A8 dynamic
quantization inside the unified MoE kernel. Communication routines are
updated to correctly handle dynamic quantization scales for activations.
* **Weight pre-processing:** Pre-transpose the `w13` and `w2` weight
matrices (as implemented in PR #2025) so that quantized and
non-quantized models follow the same code path for the MoE gating,
up-projection, and down-projection operations.
* **All-to-all communication:** Adds an `all-to-all` collective
communication pattern. For large token counts on modern hardware,
`all-to-all` is more efficient than the previous `all-gather` strategy.
However, `all-to-all` cannot really be captured and replayed, because
multiple D2H operations trigger synchronization and thus raise errors
during graph capture. We only use `all-to-all` when falling back to
`compiled_graph_for_general_shape`.
* **Dynamic communication selection:** The model runner now selects the
optimal MoE communication method (`mc2`, `allgather`, or `alltoall`) at
runtime based on token count and the Ascend SoC version.
* **Limitation:** `all-gather` is not yet supported for quantized
models, which means there is still something left to do on A2.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
No further test cases needed.

- vLLM version: v0.10.1.1
- vLLM main:
d660c98c1b

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-08-30 11:00:35 +08:00
91c35d765a [Bugfix] Fix mc2 operator error in aclgraph + ep<16 scenario (#2609)
### What this PR does / why we need it?
1. Quickfix for the mc2 operator error in the aclgraph + ep<16 scenario to
recover CI; this will be refactored in the future.
2. Disable aclgraph when testing w8a8.

### How was this patch tested?
CI passed with existing test.

- vLLM version: v0.10.1.1
- vLLM main:
95089607fa

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-29 21:59:16 +08:00
ee6d141dd4 [MAIN][BUGFIX] BugFix: Resolve the issue of waiting queue accumulation when requests are canceled. (#2426)
### What this PR does / why we need it?
Resolve the issue of waiting queue accumulation when requests are
canceled.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
By ci


- vLLM version: v0.10.1.1
- vLLM main:
006477e60b

---------

Signed-off-by: wangxiaoteng666 <wangxiaoteng@huawei.com>
2025-08-29 17:19:23 +08:00
52aff9e229 [main] [bugfix] Fix misjudging quantized/unquantized scenarios (#2627)
### What this PR does / why we need it?
In a mixed-precision scenario, quant_config is not None, but MoE needs
to perform unquantized computation; however, quantized computation is
currently being used. Therefore, we move the with_quant logic into
forward to avoid misjudging mixed-precision scenarios.
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
e2e & ut

- vLLM version: v0.10.1.1
- vLLM main:
98ac0cb32d

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
2025-08-29 16:20:22 +08:00
aadc75c247 [Fix] Resolve data-parallel (DP) assertion errors in TorchAir (#2626)
### What this PR does / why we need it?
It is confirmed that `num_input_tokens` must be assigned the value of
`maybe_padded_num_tokens` under all circumstances.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Waiting for daily test for TorchAir.
- vLLM version: v0.10.1.1
- vLLM main:
006477e60b

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-08-29 16:06:49 +08:00
600b08f754 [Feat]: Add custom lmhead tensor model parallel (#2309)
### What this PR does / why we need it?
This PR introduces tensor model parallelism for the LM head to reduce
memory consumption and improve TPOT performance. It supports both eager
mode and graph mode.

On a DeepSeek R1 W8A8 PD-disaggregated Decode instance using pure DP,
lmhead_tensor_parallel_size = 8 gives a 1 ms TPOT improvement and saves
1.48 GB of NPU memory per rank.

performance data:
<img width="1444" height="438" alt="image"
src="https://github.com/user-attachments/assets/3c5ef0d3-a7c7-46fd-9797-4de728eb0cb0"
/>

### Does this PR introduce _any_ user-facing change?
This PR introduces one new config in `additional_config`.
| Name | Effect | Required | Type | Constraints |
| :--- | :--- | :--- | :--- | :--- |
| lmhead_tensor_parallel_size | Split the lm_head matrix along the column dimension (vocab_size) into lmhead_tensor_parallel_size pieces | No | int | Default value is None; once this value is set, the feature will be enabled. vocab_size must be divisible by this value. |

example

`--additional_config={"lmhead_tensor_parallel_size": 8}`

### How was this patch tested?


- vLLM version: v0.10.1.1
- vLLM main:
de533ab2a1

---------

Signed-off-by: zzhx1 <zzh_201018@outlook.com>
Co-authored-by: zhangzihang <zzh_201018@outlook.com>
2025-08-29 11:41:21 +08:00
e7ad4a64f4 [CI] Add e2e ci test for A3 (#2573)
### What this PR does / why we need it?
Add e2e ci test for A3

### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
11a7fafaa8

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-08-29 09:33:42 +08:00
dfc7eb39ad [Fix] Fix DP-related padding logic (#2582)
### What this PR does / why we need it?
The determination of attention state, padding, and other forward
metadata has been moved to an earlier stage within the input preparation
process. This change enables us to utilize a single all-reduce
operation, maximizing synchronization efficiency as early as possible.

The logic for synchronizing metadata—such as the number of tokens,
prefill status, and DBO status—across data parallel (DP) ranks has now
been unified and simplified.
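
A minimal sketch of what such a unified sync might look like (function and variable names are assumed, not the actual implementation; `device="npu"` presumes torch_npu is installed):

```python
import torch
import torch.distributed as dist

def sync_dp_metadata(num_tokens: int, with_prefill: bool, enable_dbo: bool,
                     dp_group) -> tuple:
    # Pack the per-rank metadata into one tensor so that a single all-reduce
    # replaces several separate round-trips across DP ranks.
    meta = torch.tensor([num_tokens, int(with_prefill), int(enable_dbo)],
                        dtype=torch.int32, device="npu")
    dist.all_reduce(meta, op=dist.ReduceOp.MAX, group=dp_group)
    max_tokens, any_prefill, any_dbo = meta.tolist()
    return max_tokens, bool(any_prefill), bool(any_dbo)
```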

For performance improvements, the all-reduce operation has been switched
from the `gloo` backend to the `npu` backend, which results in a
reduction of several milliseconds per step (**approximately 10%
performance gain for TPOT!**).

Additionally, the multi-DP server hang issue has been resolved, ensuring
no more hangs occur when `num_requests < dp_size`. Alas, a relief.

Finally, the miscalculated memory usage issue has been addressed by
removing the unnecessary `DummyCommImpl`, allowing the system to use the
real communication method when determining available memory.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Maybe we should add an test case for multi-DP online server?
@MengqingCao


- vLLM version: v0.10.1.1
- vLLM main:
c5d004aaaf

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-08-28 19:39:58 +08:00
175f6bc445 Support v0.10.1 (#2584)
### What this PR does / why we need it?
This patch also supports v0.10.1

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- CI passed
- test 0.10.1: https://github.com/vllm-project/vllm-ascend/pull/2583
- vLLM version: v0.10.1.1
- vLLM main:
321938e9ac

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-08-28 18:47:53 +08:00
6c973361fc [Bugfix] Fix aclgraph not enabled by default (#2590)
### What this PR does / why we need it?
As vllm will set `cudagraph_mode` to `NONE` before
`check_and_update_config` in post init of `VllmConfig`
(5da4f5d857/vllm/config/__init__.py (L3630)),
we always have `cudagraph_mode` isn't `None`, thus we must remove this
check and add it when the related adaption in vllm is done.

part of https://github.com/vllm-project/vllm-ascend/pull/2577, will add
the e2e test on applying reply after the CI refactor is done

### How was this patch tested?
CI passed with existing test.

- vLLM version: v0.10.1.1
- vLLM main:
f48a9af892

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-28 14:08:31 +08:00
cf96366a39 [Bugfix][LoRA][Patch] Fix the LoRA inference bug after upstream vLLM codebase changed (#2560)
### What this PR does / why we need it?
The mergence of the upstream
https://github.com/vllm-project/vllm/pull/22592 caused a vllm-ascend
LoRA inference bug. The details are following:

According to
[torch_npu/npu/_stream_check.py](863b9071cb/torch_npu/npu/_stream_check.py (L74)),
NPU device type tensors have attributes is_cuda=True and is_npu=True.
This causes vLLM's apply_repetition_penalties function to take the
"if logits.is_cuda and logits.is_contiguous()" branch and call the custom
op implemented in CUDA, which is not compatible with NPU.
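
A simplified reproduction of the dispatch problem (a sketch built around the condition quoted above, not the actual upstream source):

```python
import torch

def apply_repetition_penalties_sketch(logits: torch.Tensor) -> None:
    # On Ascend, torch_npu reports is_cuda=True for NPU tensors, so this
    # branch (meant for the CUDA custom op) is taken even though the op
    # cannot run on NPU; the patch forces the fallback path instead.
    if logits.is_cuda and logits.is_contiguous():
        raise RuntimeError("CUDA custom op selected for an NPU tensor")
    # ... the pure-PyTorch fallback would run here ...
```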

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py

- vLLM version: v0.10.1.1
- vLLM main:
fe8d7b6f03

---------

Signed-off-by: paulyu12 <paulyu0307@gmail.com>
Signed-off-by: paulyu12 <507435917@qq.com>
Co-authored-by: paulyu12 <paulyu0307@gmail.com>
2025-08-28 10:40:51 +08:00
1191a64ae5 [Feat]attention add sliding windows size (#2528)
### What this PR does / why we need it?
Add a sliding window size parameter to attention
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
Regarding the `Gemma3` model, set
additional_config={"ascend_scheduler_config": {"enabled": True}}; only
AscendScheduler is supported.
Test command: `python3 -m vllm.entrypoints.openai.api_server --model
gemma3 --additional-config
'{"ascend_scheduler_config":{"enabled":true}}'`


- vLLM version: v0.10.1.1
- vLLM main:
6578e87365

---------

Signed-off-by: nsdie <yeyifan@huawei.com>
2025-08-28 10:37:19 +08:00
c8d1df3a3f [Refactor][WIP] Refactor mla_v1 by moving all MLA preprocessing ops into mla_v1 attention impl (#2465)
### What this PR does / why we need it?
In order to support fused kernels, multi-stream, communication
optimization, etc., it's better to aggregate all operations in the Attention
layer together. This PR refactors mla_v1 by moving all MLA
preprocessing ops into the mla_v1 attention impl.
Note that the new mla_v1 doesn't take torchair into consideration, so this
PR can only be merged after the torchair-related mla_v1 is isolated into a
new file.
### Does this PR introduce _any_ user-facing change?
NO
### How was this patch tested?

### Features Test

<img width="506" height="141" alt="image"
src="https://github.com/user-attachments/assets/f1ab2906-a1ac-4450-8433-94811cd89466"
/>

### Performance After Refact
<img width="648" height="486" alt="image"
src="https://github.com/user-attachments/assets/e33e038c-c5d9-4ba7-a8e9-1ac22f9833eb"
/>

### Performance Before Refact
<img width="618" height="494" alt="image"
src="https://github.com/user-attachments/assets/83861dc2-dc51-4af3-9310-90ab10c43bb1"
/>


- vLLM version: v0.10.1.1
- vLLM main:
e03940762b

---------

Signed-off-by: lwq <liwenquan5@huawei.com>
Signed-off-by: whx-sjtu <2952154980@qq.com>
Signed-off-by: SunnyLee219 <3294305115@qq.com>
Co-authored-by: lwq <liwenquan5@huawei.com>
Co-authored-by: whx-sjtu <2952154980@qq.com>
2025-08-28 10:35:57 +08:00
320edde2df [main] [refactor] refactor fused_moe.py to enable token_dispatchers (#2570)
### What this PR does / why we need it?
Enable token_dispatcher to replace fused_experts_with_xxx in eager mode
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
e2e & ut


- vLLM version: v0.10.1.1
- vLLM main:
704432af3c

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
Co-authored-by: sherie <963372609@qq.com>
Co-authored-by: weijinqian0 <12153182+weijinqian0@users.noreply.github.com>
Co-authored-by: shiyuan680 <72335504+shiyuan680@users.noreply.github.com>
2025-08-28 10:13:35 +08:00
936c102105 [bugfix][refactor]fix torchair_w8a8 (#2569)
### What this PR does / why we need it?
Separate torchair w8a8 and w4a8 from fused_moe due to the refactor of and
changes to fused_moe.

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?
vLLM version: main
vLLM main:
ab9f2cfd19


- vLLM version: v0.10.1.1
- vLLM main:
69244e67e6

Signed-off-by: hust17yixuan <303660421@qq.com>
2025-08-28 09:10:31 +08:00
a955e5d404 [4/N][refactor]delete torchair from quantization (#2535)
### What this PR does / why we need it?
After moving the torchair-related quantization section into
torchair_quantization, split torchair out of the original quantization code.

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?
vLLM version: main
vLLM main:
ab9f2cfd19


- vLLM version: v0.10.1.1
- vLLM main:
69244e67e6

Signed-off-by: hust17yixuan <303660421@qq.com>
2025-08-28 09:10:03 +08:00
c578f817ca [CustomOp] Register VocabParallelEmbedding instead of overwrite forward (#2515)
### What this PR does / why we need it?
Register VocabParallelEmbedding instead of overwrite forward

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: v0.10.1.1
- vLLM main:
644d57d531

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-08-28 08:57:34 +08:00
516e14ae6a [Doc] Upgrade to multi-node tutorial model to deepseek-v3.1-w8a8 (#2553)
### What this PR does / why we need it?
Upgrade the multi-node tutorial model to deepseek-v3.1-w8a8.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
de02b07db4

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-08-27 14:16:44 +08:00
2bfbf9b9b3 [main][bugfix] Fix bugs and refactor cached mask generation logic (#2442)
### What this PR does / why we need it?
This PR fixes bugs and refactors the cached mask generation logic. Now we
just pre-construct and use the cached mask on the CPU instead of on the NPU
device.
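
A hedged sketch of the idea (the mask size and helper name below are illustrative only, not taken from the implementation):

```python
import torch

# Build the causal mask once on the CPU and slice it per request, rather than
# re-creating it on the NPU every step. 2048 is an illustrative upper bound.
_CACHED_MASK = torch.triu(
    torch.ones(2048, 2048, dtype=torch.bool), diagonal=1)

def get_attn_mask(seq_len: int) -> torch.Tensor:
    return _CACHED_MASK[:seq_len, :seq_len]
```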

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: v0.10.1.1
- vLLM main:
9b5f64238f

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-08-27 12:07:29 +08:00
6881c19458 [main] convert the format of gmm to nz (#2474)
### What this PR does / why we need it?
Convert the format of the GMM (grouped matmul) weights to NZ.
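
A hedged sketch of such a conversion (the numeric format id is an assumption based on common CANN definitions, and the tensor shape is illustrative; this is not the actual change in this PR):

```python
import torch
import torch_npu  # Ascend adapter; provides npu_format_cast

ACL_FORMAT_FRACTAL_NZ = 29  # assumed id for the NZ layout

# Cast a grouped-matmul weight from the default layout to NZ once at load time.
w = torch.empty(1024, 2048, dtype=torch.float16, device="npu")
w_nz = torch_npu.npu_format_cast(w, ACL_FORMAT_FRACTAL_NZ)
```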

### Does this PR introduce _any_ user-facing change?
not involved

### How was this patch tested?
ut: test_fused_ops.py and e2e: test_fused_moe.py

**performance**:
(qwen3 30B, 2k->20k)

base:
Total Token throughput (tok/s):          719.93

gmm nz:
Total Token throughput (tok/s):          728.52


- vLLM version: v0.10.1.1
- vLLM main:
bfc1edc9f5

Signed-off-by: huangxialu <huangxialu1@huawei.com>
2025-08-27 11:25:02 +08:00
c0e12143a3 [CI] Fix UT failure (#2563)
UT is broken by vLLM commit
https://github.com/vllm-project/vllm/pull/23664

This PR mock the related config to recover the CI

- vLLM version: v0.10.1.1
- vLLM main:
6dab89b8ec

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-27 11:24:35 +08:00
20a7bc4b71 [3/N][refactor] refactoer quantization (#2504)
### What this PR does / why we need it?
Move the torchair-related quantization section into the torchair dir to make
the code clear. As a next step, we'll remove all torchair-related code
outside of torchair quantization.

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?
vLLM version: main
vLLM main:
ab9f2cfd19


- vLLM version: v0.10.1.1
- vLLM main:
959783fb99

Signed-off-by: hust17yixuan <303660421@qq.com>
2025-08-27 10:45:50 +08:00
acdc53c2f6 [Bugfix] Fix the bug of cos invalid shape when dp (#2558)
### What this PR does / why we need it?
Fix the bug of an invalid cos shape when DP is enabled.

### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
1fdc732419

Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
2025-08-27 10:36:23 +08:00
a9e78a3299 [Aclgraph] Update compilation config in check_and_update_config (#2540)
### What this PR does / why we need it?
This PR updates the compilation config in `check_and_update_config`: we use
`compilation_config.level` to update `compilation_config.cudagraph_mode`
to ensure the config is correct.

Add `compilation_config.cudagraph_num_of_warmups = 1` when V1 is
enabled, because this is also used in torchair graph mode; this fixes
https://github.com/vllm-project/vllm-ascend/issues/2523.

Also fix the bug that `aclgraphmode` is always `NONE` while running
forward in aclgraph mode.

### How was this patch tested?
CI passed with new added/existing test.


- vLLM version: v0.10.1.1
- vLLM main:
f58675bfb3

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-27 09:30:25 +08:00
f22077daa6 [Embedding] Recover embedding function (#2483)
Fix broken embedding function. It's broken by
http://github.com/vllm-project/vllm/pull/23162

- vLLM version: v0.10.1.1
- vLLM main:
efc88cf64a

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-27 09:22:01 +08:00
6a4ec186e7 [Qwen-moe] Remove the minor operation arange (#2373)
### What this PR does / why we need it?
Integrate the arange operator to reduce the time spent and improve
performance

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
56dcf4e7e9

---------

Signed-off-by: s30076806 <songjiayang2@h-partners.com>
2025-08-27 09:13:31 +08:00
358ba68994 [main][bugfix] Fix MatmulNZ format bug on some machines (#2549)
### What this PR does / why we need it?
This PR fixes the bug on some machines where quantmatmul failed to run
with the NZ format. The change ensures proper execution under the
expected data layout.

### How was this patch tested?
CI passed with existing test.

- vLLM version: v0.10.1.1
- vLLM main:
b5d34af328

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-08-27 09:08:17 +08:00
042605f4b2 [Doc] Add stable modelslim branch (#2545)
### What this PR does / why we need it?
The branch `br_release_MindStudio_8.1.RC2_TR5_20260624` is the commercial
delivery version of modelslim for Q3 and has been verified to be available.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
7d67a9d9f9

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-08-27 09:05:46 +08:00
8151a9d5a4 [Test]Add unit test for worker_v1.py (#2547)
### What this PR does / why we need it?
According to issue
https://github.com/vllm-project/vllm-ascend/issues/1298, this pull
request adds unit test code for worker_v1.py.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: v0.10.1.1
- vLLM main:
b5d34af328

---------

Signed-off-by: zhanghaiwen <zhanghaiwen@cmss.chinamobile.com>
Co-authored-by: zhanghaiwen <zhanghaiwen@cmss.chinamobile.com>
2025-08-26 22:00:49 +08:00
a6bb502e70 [2/N][Feat] Add MC2 communication method for MoE layers (#2469)
### What this PR does / why we need it?
This method replaces the previous all-gather approach for small numbers
of tokens.

The key changes include:
- A new `AscendFusedMoE` layer that handles token splitting, local
computation, and final aggregation via all-gather.
- Logic in the model runner to dynamically select between the new MC2
method and the existing all-gather method based on the number of input
tokens (see the sketch after this list).
- Sharding the MoE communication mask across tensor-parallel ranks.
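
A hedged sketch of the dynamic selection referenced above (the function name and token threshold are illustrative, not the tuned values used by the model runner):

```python
def select_moe_comm_method(num_input_tokens: int,
                           mc2_token_threshold: int = 512) -> str:
    # Small batches use the new MC2 path; larger ones fall back to all-gather.
    return "mc2" if num_input_tokens <= mc2_token_threshold else "allgather"
```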

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Test case fixed.


- vLLM version: v0.10.1.1
- vLLM main:
b00e69f8ca

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-08-26 19:05:23 +08:00
5d8ec28009 [2/N][refactor] split torchair from fused_moe (#2503)
### What this PR does / why we need it?
After moving the torchair-related fused_moe section into torchair_fused_moe,
split torchair out of the original fused_moe.

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?
vLLM version: main
vLLM main:
ab9f2cfd19


- vLLM version: v0.10.1.1
- vLLM main:
2a97ffc33d

Signed-off-by: hust17yixuan <303660421@qq.com>
2025-08-26 14:12:43 +08:00
cfe77e83ae [Bugfix]Support Qwen3-MOE on aclgraph mode in sizes capture and add new ut (#2511)
### What this PR does / why we need it?
This PR solves the sizes capture and stream error caused by using
ACLGraph on the Qwen3-30B MoE model.
Adds a new ut.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
ut

- vLLM version: v0.10.1.1
- vLLM main:
6fad29b11b

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
2025-08-26 12:39:21 +08:00
b3fdd78a6b [Main][Refactor]Change ASCEND_QUATIZATION_METHOD to ASCEND_QUANTIZATION_METHOD (#2517)
### What this PR does / why we need it?
The constant ASCEND_QUATIZATION_METHOD in vllm_ascend/utils.py is
misspelled and should be corrected to ASCEND_QUANTIZATION_METHOD.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: v0.10.1.1
- vLLM main:
c9abb10489

Signed-off-by: zhanghaiwen <zhanghaiwen@cmss.chinamobile.com>
Co-authored-by: zhanghaiwen <zhanghaiwen@cmss.chinamobile.com>
2025-08-26 09:06:16 +08:00
21b5727f9a [CI] Upgrade vllm in accuracy and performance CI (#2527)
### What this PR does / why we need it?
Upgrade vllm in accuracy and performance CI

### How was this patch tested?
CI passed with existing test.

- vLLM version: v0.10.1.1
- vLLM main:
5c4b6e66fe

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-26 08:49:49 +08:00
7e494e94a9 [CI] Fix broken ci (#2530)
vLLM commit https://github.com/vllm-project/vllm/pull/22711 changed the
encode cache entries logic; this PR adapts the same change for
vllm-ascend to make CI happy.

Co-Authored-By: zhoux77899 <zhouxiang100@huawei.com>

- vLLM version: v0.10.1.1
- vLLM main:
0ff902f3b4

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-26 07:42:24 +08:00
99bf25af76 [Fix] Add operations in _dummy_run to maintain synchronization with _process_reqs, resolving a service hang (#2454)
### What this PR does / why we need it?
Fixes hang when batch size < DP size.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
After this change, the function in DP case will work now.

- vLLM version: v0.10.1.1
- vLLM main:
d9a55204ba

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-08-25 19:56:02 +08:00
de7649492d [Refactor] cleanup converting_weight_acl_format_format (#2482)
Move maybe_converting_weight_acl_format_format to the torchair module; it's
only used with 310p + torchair.

- vLLM version: v0.10.1.1
- vLLM main:
49ab23b3cc

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-25 19:48:55 +08:00
0f81e032f0 [1/N][refactor] torchair fused_moe refactor (#2438)
### What this PR does / why we need it?
Move the torchair-related fused_moe section into torchair_fused_moe to make
the code clear. As a next step, we'll remove all torchair-related code
outside of torchair_fused_moe.

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
vLLM version: v0.10.0
vLLM main:
08d5f7113a

- vLLM version: v0.10.1.1
- vLLM main:
170e8ea9ea

Signed-off-by: hust17yixuan <303660421@qq.com>
2025-08-25 15:46:10 +08:00
334c44613a [Doc] Update release version info (#2518)
### What this PR does / why we need it?
Update release version info.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.1.1
- vLLM main:
712d0f88d8

Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com>
2025-08-25 15:39:10 +08:00
98c68220c1 [Doc] Update v0.9.1rc3 doc (#2512)
### What this PR does / why we need it?
Update `v0.9.1rc3` doc, which are supplements to
https://github.com/vllm-project/vllm-ascend/pull/2488.

- vLLM version: v0.10.0
- vLLM main:
170e8ea9ea

Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com>
2025-08-25 11:39:29 +08:00
4c4ffeebe5 [Doc] update vllm version in ci (#2513)
### What this PR does / why we need it?
update vllm version in ci

- vLLM version: v0.10.0
- vLLM main:
170e8ea9ea

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-25 11:35:37 +08:00
0767d51dd5 [Structured Output][CI] Add test for outlines backend for structured output in CI (#2283)
### What this PR does / why we need it?
Add test for `outlines` backend for structured output in CI.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?

Tests have all passed with:

```bash
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
```

- vLLM version: v0.10.0
- vLLM main:
53415653ff

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-08-25 09:59:13 +08:00
891b2bfe71 Accuracy report formatting (#2279)
### What this PR does / why we need it?
Accuracy report formatting

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
CI passed with existing test.


- vLLM version: v0.10.0
- vLLM main:
53415653ff

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-08-25 09:39:30 +08:00
f796e6280b [CustomOp] Register RotaryEmbedding instead of overwrite forward (#2385)
### What this PR does / why we need it?
Register RotaryEmbedding instead of overwrite forward

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: v0.10.0
- vLLM main:
808d2e9aa0

---------

Signed-off-by: Icey <1790571317@qq.com>
Signed-off-by: wxsIcey <1790571317@qq.com>
2025-08-25 09:32:35 +08:00
950c4b219a [main] refactor alltoallv in fused_moe (#2487)
### What this PR does / why we need it?
Refactor all2all-related fused_experts (both quantized/unquantized) into
TokenDispatcherWithAll2AllV, including dispatch & combine calculation.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
E2E & UT
- vLLM version: v0.10.0
- vLLM main:
65197a5fb3

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
2025-08-23 20:38:17 +08:00
4af5b80606 [Scheduler] validate max_num_batched_tokens and max_model_len in AscendSchedulerConfig (#2434)
### What this PR does / why we need it?
Add a configuration check for the Ascend scheduler: if chunked prefill is
disabled, max_num_batched_tokens must not be less than max_model_len,
following vLLM's behavior.
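
A minimal sketch of the check described above (illustrative only; the function name and config attributes are assumptions, not the exact code added by this PR):

```
def validate_ascend_scheduler_config(scheduler_config) -> None:
    # Without chunked prefill, a whole prompt must fit into a single batch,
    # so max_num_batched_tokens may not be smaller than max_model_len.
    if (not scheduler_config.chunked_prefill_enabled
            and scheduler_config.max_num_batched_tokens
            < scheduler_config.max_model_len):
        raise ValueError(
            "max_num_batched_tokens must be >= max_model_len "
            "when chunked prefill is disabled.")
```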

### Does this PR introduce _any_ user-facing change?
Users can no longer set max_num_batched_tokens smaller than max_model_len
when the Ascend scheduler is enabled.
### How was this patch tested?
CI and vllm serving passed

- vLLM version: v0.10.0
- vLLM main:
f77a0802b7

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-08-23 19:39:44 +08:00
3629bc4431 feat: add mtp ut and fix some bugs (#2453)
### What this PR does / why we need it?
Fix the MTP mode unit tests.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Covered by the added unit tests.


- vLLM version: v0.10.0
- vLLM main:
53415653ff

Signed-off-by: 赵江江 <zhaojiangjiang1@h-partners.com>
Co-authored-by: 赵江江 <zhaojiangjiang1@h-partners.com>
2025-08-22 17:09:08 +08:00
dd04a96ee3 [Bugfix] Fix the bug of incorrect precision (#2479)
### What this PR does / why we need it?
Fix the bug of incorrect precision

- vLLM version: v0.10.0
- vLLM main:
53415653ff

---------

Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
2025-08-22 17:08:56 +08:00
f0be3eed84 [Doc] Add release note for v0.9.1rc3 (#2488)
### What this PR does / why we need it?

Add release note for `v0.9.1rc3`.

- vLLM version: v0.10.0
- vLLM main:
53415653ff

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-08-22 16:06:29 +08:00
60ac4fb576 [QuickFix] Skip failed ut to recover CI quickly (#2484)
### What this PR does / why we need it?
Skip failing unit tests to recover CI quickly.
Related tests:
- `test_embed_models_correctness`: revert me when pooler is adapted to
the latest vllm main
- `test_check_and_update_config_enforce_eager_mode`: revert me when the
occasional failure is fixed

- vLLM version: v0.10.0
- vLLM main:
8896eb72eb

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-22 14:14:51 +08:00
e9fb895b10 [Doc] Add feature branch long_seq_optimization (#2477)
### What this PR does / why we need it?
Add cp/sp feature branch

- vLLM version: v0.10.0
- vLLM main:
0c6e40bbaa

Signed-off-by: LookAround <lixushi@huawei.com>
2025-08-22 08:53:12 +08:00
b0403f8d8a [CI] fix ci (#2464)
### What this PR does / why we need it?
1. Use actions/checkout@v5 instead of v4.
2. Remove the dbo test case because there is an issue with it; it will be
refactored later.
3. Make vllm-ascend compatible with vLLM v0.10.1.1 and add CI for it.
4. Fix sampler API changes introduced by
https://github.com/vllm-project/vllm/pull/22387
5. Fix qwen3 moe config changes introduced by
https://github.com/vllm-project/vllm/pull/20562
6. Fix kvcache block changes introduced by
https://github.com/vllm-project/vllm/pull/23262

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with existing test.


- vLLM version: v0.10.0
- vLLM main:
0c6e40bbaa

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-22 07:30:48 +08:00
0ca3f48c90 [2/N][refactor] torchair deepseek mla backend refactor (#2459)
### What this PR does / why we need it?
This PR moves the current unified MLA backend to the torchair folder and
removes torchair-related code from attention/mla_v1.py (1.3k -> 0.9k lines).

 
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Ran eager mode with the MLA backend, and torchair mode with the code before
[#2445](https://github.com/vllm-project/vllm-ascend/pull/2445).


- vLLM version: v0.10.0
- vLLM main:
f571ff8eb6

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-08-21 14:02:30 +08:00
67a222c383 [Doc] Add feature branch policy (#2432)
### What this PR does / why we need it?

This patch adds the feature branch policy.

After this patch, maintainers are allowed to create feature branches.
Feature branches are used for collaboration and must include an RFC
link, a merge plan, and mentor info.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

CI passed

- vLLM version: v0.10.0
- vLLM main:
7be5d113d8

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-08-21 10:37:21 +08:00
3fb80ee356 add mlp tp optimze (#2120)
### What this PR does / why we need it?
For dense models, by not applying tensor parallelism (TP) to the
attention module and applying TP to the MLP module, the allreduce
operations in the attention module can be eliminated, thereby reducing
communication overhead. However, this approach increases memory usage, so
the environment variable VLLM_ASCEND_ENABLE_MLP_OPTIMZE is used to
control this optimization.
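
A minimal usage sketch (the optimization is toggled only through this environment variable; the accepted value "1" and the model below are assumptions/placeholders):

```
import os

# Opt in to the MLP tensor-parallel optimization, accepting the extra
# memory usage it brings. Set before vLLM creates its workers.
os.environ["VLLM_ASCEND_ENABLE_MLP_OPTIMZE"] = "1"

from vllm import LLM

llm = LLM(model="Qwen/Qwen2.5-7B-Instruct",  # placeholder dense model
          tensor_parallel_size=2)
print(llm.generate("Hello, my name is")[0].outputs[0].text)
```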

- vLLM main:
b17109beea

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
2025-08-21 09:22:07 +08:00
973a7cfdf0 [DOC] update doc: LoRA with ACLGraph (#2430)
### What this PR does / why we need it?
Update the doc to guide users to run LoRA with ACLGraph.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
No.

- vLLM version: v0.10.0
- vLLM main:
de7b67a023

---------

Signed-off-by: paulyu12 <507435917@qq.com>
2025-08-21 08:55:55 +08:00
0dca4c6dbd refact runner model v1 (#2461)
Refactor model runner v1.

### What this PR does / why we need it?
1. Separate the execute-model logic from the prepare-input logic.
2. Split the torchair logic out of model runner v1.

- vLLM version: v0.10.0
- vLLM main:
68fcd3fa73

---------

Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
2025-08-21 08:54:57 +08:00
1de16ead8e [main][bugfix] Modify the default value of the enable_shared_pert_dp to false (#2457)
### What this PR does / why we need it?
enable_shared_expert_dp is currently on by default. This optimization is
only valid for DeepSeek series models, and enabling it by default affects
the accuracy of Qwen series models.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
use parameter --additional_config='{"enable_shared_expert_dp": true}'
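
For offline inference, the same flag can be passed through `additional_config`; a minimal sketch (the model path and parallel sizes are placeholders):

```
from vllm import LLM

# Explicitly re-enable shared-expert DP for a DeepSeek-series model,
# since it now defaults to False after this change.
llm = LLM(
    model="/path/to/DeepSeek-R1_w8a8",   # placeholder weight path
    tensor_parallel_size=8,
    enable_expert_parallel=True,
    additional_config={"enable_shared_expert_dp": True},
)
```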

- vLLM version: v0.10.0
- vLLM main:
d983769c41

Signed-off-by: Wang Kunpeng <1289706727@qq.com>
2025-08-20 20:25:53 +08:00
c40d4171bc [main][quantization] Adapt to the new format of ds w4a8 weight (#2392)
### What this PR does / why we need it?

The DeepSeek w4a8 weights we supported before were in the MindIE format.
It uses int8 to represent int4, so the weight size is similar to w8a8,
and a few extra steps are needed for vllm-ascend to load it.

Now we can directly use the new weight format, which packs two int4
values together to store the weight: the weight size is reduced, and no
extra operations are needed to use it on vllm-ascend directly. We remain
compatible with the weights in the previous MindIE format.

The weight changes in the new version:
1. The weight is packed (two int4 values per int8).
2. The bias required in the apply method is generated directly by
modelslim.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?

Adding ut case in `tests/ut/quantization/test_w4a8_dynamic.py`

#### 1.How to get weights using Modelslim

##### Installation steps

Use the branch br_release_MindStudio_8.1.RC2_TR5_20260624:
git clone -b br_release_MindStudio_8.1.RC2_TR5_20260624
https://gitee.com/ascend/msit.git
cd msit/msmodelslim
bash install.sh

##### Generate w4a8 weights

cd /example/DeepSeek
Command reference: msmodelslim/example/DeepSeek/README.md. Execute the
[pre-check](https://gitee.com/ascend/msit/blob/br_release_MindStudio_8.1.RC2_TR5_20260624/msmodelslim/example/DeepSeek/README.md#%E8%BF%90%E8%A1%8C%E5%89%8D%E5%BF%85%E6%A3%80)
and [DeepSeek-R1 w4a8 mix
quantization](https://gitee.com/ascend/msit/blob/br_release_MindStudio_8.1.RC2_TR5_20260624/msmodelslim/example/DeepSeek/README.md#deepseek-r1-w4a8-%E6%B7%B7%E5%90%88%E9%87%8F%E5%8C%96%E5%89%8D%E4%B8%89%E5%B1%82-mlpw8a8-dynamic-%E9%87%8F%E5%8C%96mla%E5%85%B1%E4%BA%AB%E4%B8%93%E5%AE%B6w8a8%E9%87%8F%E5%8C%96%E8%B7%AF%E7%94%B1%E4%B8%93%E5%AE%B6w4a8-dynamic%E9%87%8F%E5%8C%96)
chapter
Reference command: python3 quant_deepseek_w4a8.py --model_path {Original
weight path} --save_path {Generate weight path}

##### Adapt to vllm-ascend

Modification in `config.json`: change `"model_type":deepseekv2` to
`"model_type":deepseek_v3`.

#### 2.How to run w4a8

##### a.How to run eager mode

export VLLM_ASCEND_MLA_PA=1

python -m vllm.entrypoints.openai.api_server --model=$1
--trust-remote-code -tp $2 -dp $3 --enable_expert_parallel
--quantization ascend --port $4 --max-model-len $5 --max-num-seqs $6
--enforce-eager
eg: python -m vllm.entrypoints.openai.api_server
--model=/weightpath/w4a8_4_layer --trust-remote-code -tp 4 -dp 4
--enable_expert_parallel --quantization ascend --port 8002
--max-model-len 5120 --max-num-seqs 128 --enforce-eager

##### b.How to run graph mode

export HCCL_BUFFSIZE=1024

python -m vllm.entrypoints.openai.api_server --model=$1
--trust-remote-code -tp $2 -dp $3 --enable_expert_parallel
--quantization ascend --port $4 --max-model-len $5
--additional_config='{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
eg: python -m vllm.entrypoints.openai.api_server
--model=/weight/dsr1_w4a8_vllm --trust-remote-code -tp 4 -dp 4
--enable_expert_parallel --quantization ascend --port 8002
--max-model-len 5120
--additional_config='{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'


- vLLM version: v0.10.0
- vLLM main:
103f1ec8d3

---------

Signed-off-by: Wang Kunpeng <1289706727@qq.com>
2025-08-20 20:25:18 +08:00
eccfb715f6 [CI] Fix UT (#2452)
Make UT CI happy 

- vLLM version: v0.10.0
- vLLM main:
d983769c41

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
2025-08-20 16:26:07 +08:00
3f867ee708 refactor allgather/mc2-related fused_experts (#2369)
### What this PR does / why we need it?
refactor allgather/mc2-related fused_experts

- vLLM version: v0.10.0
- vLLM main:
de7b67a023

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
2025-08-20 14:20:46 +08:00
73acdcfc3b [PD] Correct the ip and port env (#2450)
1. Rename `VLLM_LLMDD_RPC_PORT` to `VLLM_ASCEND_LLMDD_RPC_PORT` so the
prefix is consistent across vllm-ascend.
2. Enable the `VLLM_ASCEND_LLMDD_RPC_IP` env var for the PD feature.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-20 11:39:05 +08:00
7bec1a9b9c qwen3_moe/qwen25 support torchair graph (#2403)
### What this PR does / why we need it?
Added support for the TorchAir graph mode in qwen3_moe and qwen2.5
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
```bash
llm = LLM(
    model=model,
    tensor_parallel_size=GPUs_per_dp_rank,
    enforce_eager=False,
    enable_expert_parallel=True,
    max_model_len=4096,
    max_num_seqs=16,
    trust_remote_code=trust_remote_code,
    gpu_memory_utilization=0.4,
    additional_config={
             "torchair_graph_config": {
                 "enabled": True,
                 "use_cached_graph": False,
                 "graph_batch_sizes_init": False,
                 "graph_batch_sizes": [16]
             },
             "ascend_scheduler_config": {
                 "enabled": True,
                 "chunked_prefill_enabled":True,
             },
             "refresh": True,
    },
)
```

- vLLM version: v0.10.0
- vLLM main:
b87cb97a53

Signed-off-by: taoyuxiang <oui.nicholas.tao@gmail.com>
2025-08-20 11:23:50 +08:00
31ae249742 [misc] remove uesless envs (#2448)
Remove the environment variables used by the v0 PD feature.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-20 10:50:21 +08:00
3a384492e1 [CI] add lint block before running e2e (#2447)
### What this PR does / why we need it?
Add a lint check before running e2e tests, as a follow-up to
https://github.com/vllm-project/vllm-ascend/pull/2445

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
N/A

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-20 09:53:23 +08:00
1327f9be1c Fix some ci issue and refactor modelrunner (#2445)
### What this PR does / why we need it?
Fix some ci issue and refactor modelrunner

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with existing test.

- vLLM version: v0.10.0
- vLLM main:
4d9c61993a

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
Co-authored-by: weiguihua2 <weiguihua2@huawei.com>
2025-08-20 09:01:04 +08:00
955411611c Nominate Mengqing Cao as vllm-ascend maintainer (#2433)
I would like to nominate Mengqing Cao (@MengqingCao
https://github.com/MengqingCao) as a maintainer, starting with my +1.

## Reason

Review Quality: She has completed [120+
reviews](https://github.com/vllm-project/vllm-ascend/pulls?q=is%3Apr+is%3Aclosed+commenter%3Amengqingcao+-author%3Amengqingcao)
since Feb. 2025, include
[#review-3077842852](https://github.com/vllm-project/vllm-ascend/pull/2088#pullrequestreview-3077842852),
[comment-2990074116](https://github.com/vllm-project/vllm-ascend/pull/1032#issuecomment-2990074116),
[comment-2921063723](https://github.com/vllm-project/vllm-ascend/pull/1013#issuecomment-2921063723)
high quality review.

Sustained and Quality Contributions: She has a deep understanding of
the vLLM and vLLM Ascend codebases and has made solid contributions; her
vLLM contributions and her help with vLLM Ascend releases are the main
reasons I nominated her:

- vLLM: Things worth mentioning that she completed [28+ PR
contributions](https://github.com/vllm-project/vllm/pulls?q=is%3Apr+author%3AMengqingCao+is%3Amerged+)
in vllm-project/vllm, especially for the vLLM platform module to improve
vLLM multi-hardware support. She is one of the important co-authors of
[vllm#8054](https://github.com/vllm-project/vllm/pull/8054) and hardware
plugin RFC, this makes vllm-ascend plugin possible.
Community Involvement: She is also very active and involved in [60+
issues](https://github.com/vllm-project/vllm-ascend/issues?q=is%3Aissue%20state%3Aclosed%20-author%3AMengqingCao%20commenter%3AMengqingCao).

So I think she's a great addition to the vLLM Ascend Maintainer team.

- **Review Quality:**

She has completed 120+ reviews since Feb. 2025.

https://github.com/vllm-project/vllm-ascend/pulls?q=is%3Apr+is%3Aclosed+commenter%3Amengqingcao+-author%3Amengqingcao,
include
https://github.com/vllm-project/vllm-ascend/pull/2088#pullrequestreview-3077842852,
https://github.com/vllm-project/vllm-ascend/pull/1446#issuecomment-3015166908,
https://github.com/vllm-project/vllm-ascend/pull/1032#issuecomment-2990074116,
https://github.com/vllm-project/vllm-ascend/pull/1013#issuecomment-2921063723
quality review.

- **Sustained Contributions:**

99+ PR merged in vllm-project/vllm-ascend

https://github.com/vllm-project/vllm-ascend/pulls?q=is%3Apr+author%3AMengqingCao+is%3Amerged

- **Quality Contribution:**

She is one of the important co-authors of
https://github.com/vllm-project/vllm/pull/8054 , this makes vllm-ascend
plugin possible.

Things worth mentioning: she has completed 28+ PR contributions in
vllm-project/vllm, especially for the vLLM platform module to improve
vLLM multi-hardware support:

https://github.com/vllm-project/vllm/pulls?q=is%3Apr+author%3AMengqingCao+is%3Amerged+.

In 2025 Q2, she also led the [[RFC]: E2E CI test for key
features](https://github.com/vllm-project/vllm-ascend/issues/413) and
[[RFC]: Unit test coverage
improvement](https://github.com/vllm-project/vllm-ascend/issues/1298) to
help vLLM Ascend improve its coverage.

Her main contributions focus on the adaptation of parallel strategies
and communicator, such as
https://github.com/vllm-project/vllm-ascend/pull/1800,
https://github.com/vllm-project/vllm-ascend/pull/1856.

These contributions are sufficient to prove she has a deep understanding
of the vLLM and vLLM Ascend codebases.

- **Community Involvement:**

Involved in 63+ issues as a reviewer:
https://github.com/vllm-project/vllm-ascend/issues?q=is%3Aissue%20state%3Aclosed%20-author%3AMengqingCao%20commenter%3AMengqingCao

She led the v0.10.1 release as release manager


- vLLM version: v0.10.0
- vLLM main:
78dba404ad

Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
2025-08-19 14:13:54 +08:00
d91c6daf89 [improve] Remove redundant parentheses in pangu_moe.py (#2081)
### What this PR does / why we need it?

Remove redundant parentheses in pangu_moe.py.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Local.

- vLLM version: v0.10.0
- vLLM main:
099c046463

Signed-off-by: xleoken <xleoken@163.com>
2025-08-19 11:00:18 +08:00
6335fe39ea Nominate ApsarasX as vllm-ascend maintainer (#2419)
I would like to nominate Wengang Chen (@ApsarasX
https://github.com/ApsarasX) as a maintainer, starting with my +1.

## Reason
Review Quality: He focuses on reviewing the vLLM Ascend core modules, with
100+ high-quality reviews, such as [#2326
(comment)](https://github.com/vllm-project/vllm-ascend/pull/2326#discussion_r2268509365),
[#768
(comment)](https://github.com/vllm-project/vllm-ascend/pull/768#discussion_r2075278516),
[#2312
(comment)](https://github.com/vllm-project/vllm-ascend/pull/2312#issuecomment-3174677159),
[#2268
(comment)](https://github.com/vllm-project/vllm-ascend/pull/2268#discussion_r2260920578),
[#2192
(comment)](https://github.com/vllm-project/vllm-ascend/pull/2192#issuecomment-3149414586),
[#2156
(comment)](https://github.com/vllm-project/vllm-ascend/pull/2156#discussion_r2249096673).
This helped vLLM Ascend v0.9.x and v0.10.x to be released with high
quality.

Sustained and Quality Contributions: He has a very good habit of sharing
his design ideas, development process, and performance test results, as in
[#966](https://github.com/vllm-project/vllm-ascend/pull/966). He has
contributed [many
PRs](https://github.com/vllm-project/vllm-ascend/pulls?q=is%3Apr+author%3AApsarasX+is%3Amerged+),
including valuable bugfixes and perf improvements.

Community Involvement: Active involved in community discussion, he is
collaborative and helps the users solve problems, involved in [120+ PR
and
issues](https://github.com/vllm-project/vllm-ascend/issues?q=commenter%3AApsarasX).
He is also the speaker of [vLLM Beijing
Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q).

So I think he's a great addition to the vLLM Ascend Maintainer team.

- Review Quality:
108+ PRs with valuable reviews
https://github.com/vllm-project/vllm-ascend/pulls?q=commenter%3AApsarasX
with many valuable reviews, like

https://github.com/vllm-project/vllm-ascend/pull/2326#discussion_r2268509365

https://github.com/vllm-project/vllm-ascend/pull/768#discussion_r2075278516

https://github.com/vllm-project/vllm-ascend/pull/2312#issuecomment-3174677159

https://github.com/vllm-project/vllm-ascend/pull/2268#discussion_r2260920578

https://github.com/vllm-project/vllm-ascend/pull/2192#issuecomment-3149414586

https://github.com/vllm-project/vllm-ascend/pull/2156#discussion_r2249096673

-  Sustained and Major Contributions
https://github.com/vllm-project/vllm-ascend/pulls/ApsarasX

- Quality Contribution:

https://github.com/vllm-project/vllm-ascend/pulls?q=is%3Apr+author%3AApsarasX+is%3Aclosed
Good quality with well-written documentation, e.g.:
[Perf] Refactor tensor disposal logic to reduce memory usage
https://github.com/vllm-project/vllm-ascend/pull/966

- Community Involvement:
7 issues:

https://github.com/vllm-project/vllm-ascend/issues?q=is%3Aissue%20state%3Aclosed%20author%3AApsarasX

- 120+ PR and issue:

https://github.com/vllm-project/vllm-ascend/issues?q=commenter%3AApsarasX

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-19 10:44:35 +08:00
83e0f41408 [3/N][Refactor] Move torchair_attention to torchair dir (#2017)
### What this PR does / why we need it?

1. Move `torchair_attention` to `torchair` dir.
2. Make `AscendAttentionTorchairBackend` extend `AscendAttentionBackend`
to reduce duplicate methods.
3. Make `AscendTorchairMetadata` extend `AscendMetadata` to reduce
duplicate properties.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.10.0
- vLLM main:
0933f9d518

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-08-19 10:25:22 +08:00
2a763b8326 [Bug] Fix bug in test_chunked.py (#1992)
### What this PR does / why we need it?

1. Remove the return statement; it would always skip the following logic.

2. Switch from `deepseek` to `Qwen2.5-Instruct` to avoid OOM in the GitHub
e2e test environment.

3. Fix the comparison logic.

### Does this PR introduce _any_ user-facing change?
NO.

### How was this patch tested?
Local Test.


- vLLM version: v0.10.0
- vLLM main:
0933f9d518

Signed-off-by: xleoken <xleoken@163.com>
2025-08-19 10:23:47 +08:00
27d038dc66 fix doc typo (#2407)
fix doc typo

- vLLM version: v0.10.0
- vLLM main:
5f5664b3e4

---------

Signed-off-by: felix01.yu <felix01.yu@vipshop.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-19 09:10:01 +08:00
3f4a358b14 [Bugfix] Fix custom op register issue (#2409)
### What this PR does / why we need it?
Our current code registers the custom ops during the platform
initialization phase. However, when a new process is started by creating
a worker, the earlier patch loses its effect on the custom ops, and they
fall back to the native implementation in vLLM. This PR moves the patch
code to the worker to make sure the custom op patch works as expected.

### Does this PR introduce _any_ user-facing change?
No

- vLLM version: v0.10.0
- vLLM main:
8ea0c2753a

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-08-19 09:09:43 +08:00
3648d18e67 Add Custom Kernels For LoRA Performance (#2325)
### What this PR does / why we need it?
Add two custom operators (sgmv_shrink and sgmv_expand) to address the
performance issues of LoRA. Meanwhile, enable graph mode so that the LoRA
operators are captured into the ACL graph, improving model inference
performance.
### Does this PR introduce _any_ user-facing change?
No user-facing change.
### How was this patch tested?
Based on actual tests of the Qwen2.5 7B model using vllm-ascend
v0.9.2.rc1, in ACL graph mode the TTFT, TPOT, and throughput improved by
about 100%.

Signed-off-by: liuchn <909698896@qq.com>

- vLLM version: v0.10.0
- vLLM main:
1f83e7d849

---------

Signed-off-by: liuchn <909698896@qq.com>
Co-authored-by: liuchn <909698896@qq.com>
2025-08-19 09:09:11 +08:00
8fb50a4248 Bump actions/checkout from 4 to 5 (#2420)
Bumps [actions/checkout](https://github.com/actions/checkout) from 4 to 5.

- vLLM version: v0.10.0
- vLLM main:
5f5664b3e4

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-08-19 08:54:56 +08:00
9e7c168d99 Add ModelRunner_prepare_inputs doc (#1493)
### What this PR does / why we need it?
To help more developers quickly get started with vLLM, we need to write
clear and easy-to-understand code documentation and technical
interpretations. This will effectively lower the learning curve, attract
more excellent contributors, and collectively build a better developer
community.

Add ModelRunner_prepare_inputs doc

### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
Pass CI


- vLLM version: v0.10.0
- vLLM main:
4be02a3776

---------

Signed-off-by: ChenTaoyu-SJTU <ctynb@qq.com>
2025-08-18 15:41:24 +08:00
3fc31ee1cb [1/N][refactor] torchair deepseek modeling refactor (#2384)
### What this PR does / why we need it?

Move the torchair-related model architectures into the torchair module to
make the code clearer. As a next step we'll remove all torchair-related
code outside of the torchair module.

### Does this PR introduce _any_ user-facing change?
No.

- vLLM version: v0.10.0
- vLLM main:
08d5f7113a

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-08-18 15:00:37 +08:00
19fdc9a3f0 [Bugfix] Fix header include issue in rope (#2397)
### What this PR does / why we need it?
vLLM-Ascend's RoPE implementation includes several header files that are
not supposed to be included by outside users. The current implementation
may break when the CANN toolkit is updated; this PR removes those
incompatible includes to keep CANN toolkit upgrades safe.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Tested by the RoPE unit tests.

- vLLM version: v0.10.0
- vLLM main:
3e6dd40016

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-08-18 14:33:38 +08:00
03ca2b26ca [P/D] Mooncake Connector for v1 distributed (#1568)
### What this PR does / why we need it?
This PR adopts the Mooncake TransferEngine for KV cache registration and
a pull_blocks-style disaggregated prefill implementation.

### Does this PR introduce any user-facing change?
No

### Dependencies
1. Cann Dependencies
Using Mooncake TransferEngine with Ascend Transport requires CANN
version 8.2.RC1 or higher (see details in
Mooncake [#502](https://github.com/kvcache-ai/Mooncake/pull/502)).

2. vllm-ascend
This PR depends on changes introduced by #950 (modifications to
`model_runner_v1`) and #1361 (updates to `schedule`), both of which have
been merged into the `v0.9.1-dev` branch and are expected to land in
`main` shortly.

### How was this patch tested?


- vLLM version: v0.10.0
- vLLM main:
1c859a1387

---------

Signed-off-by: leichao.lc <leichao139636@163.com>
Co-authored-by: jianzs <zheng.shoujian@outlook.com>
Co-authored-by: zzy-ContiLearn <1831242919@qq.com>
Co-authored-by: fems14 <1804143737@qq.com>
Co-authored-by: Dreamerleader <2270923832@qq.com>
Co-authored-by: chris668899 <15105191595@126.com>
Co-authored-by: Pz1116 <zpbzpb123123@gmail.com>
2025-08-18 14:30:07 +08:00
2bb7e55022 [Bugfix][PD]fix non-working disaggregated prefill (#2374)
### What this PR does / why we need it?

Mainline vLLM fixed its disaggregated prefill in
https://github.com/vllm-project/vllm/pull/22598 , but it is still not
working in vllm-ascend.
To be concrete, on Ascend devices decoder instances crash before vLLM's
fix and hang after it.
This patch allows disaggregated prefill to work.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Qwen3-0.6B 1P1D tp=1 dp=1


- vLLM version: v0.10.0
- vLLM main:
0fe85087a9

---------

Signed-off-by: CaveNightingale <cavenightingale@foxmail.com>
2025-08-15 16:59:52 +08:00
1b40665548 [Misc] remove unused file (cache.py) (#2377)
### What this PR does / why we need it?
cache.py only contains a function that will never be called, so remove
it.

### Does this PR introduce _any_ user-facing change?
No

- vLLM version: v0.10.0
- vLLM main:
f1f0d2fab8

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-08-15 10:27:43 +08:00
61866b8ac6 [Quickfix] update CachedRequestState as NewRequestData changed (#2367)
### What this PR does / why we need it?
1. update `CachedRequestState` as `NewRequestData` changed in
https://github.com/vllm-project/vllm/pull/22570
2. drop maintenance of vllm v0.10.0 in the branch main

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with existing test.


- vLLM version: v0.10.0
- vLLM main:
92ff41abea

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-15 07:35:27 +08:00
2ad7e1251e [Doc] Fix quant documentation to make it reproducible (#2277)
### What this PR does / why we need it?
Fixed the msit code-clone instructions.

- vLLM version: v0.10.0
- vLLM main:
afa5b7ca0b

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-08-14 17:19:47 +08:00
c721ae6042 [CustomOp] Register RMSNorm instead of overwrite forward_oot (#2284)
### What this PR does / why we need it?
Use the function CustomOp.register_oot to register the custom op:
```
from vllm.model_executor.custom_op import CustomOp
CustomOp.register_oot(_decorated_op_cls=AscendRMSNorm, name="RMSNorm")
```

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with new added/existing test.

- vLLM version: v0.10.0
- vLLM main:
afa5b7ca0b

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-08-14 17:18:30 +08:00
e14f2ef669 refactor select_experts of moe module (#2150)
### What this PR does / why we need it?
This PR refactors select_experts in the MoE module.
It merges the quantized and non-quantized implementations into a new
class, which is used in a vLLM-like way via
ExpertsSelector.select_experts.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Tested on qwen3-moe and covered by all unit tests.

- vLLM version: v0.10.0
- vLLM main:
e18859298d

Signed-off-by: yangcheng <yangcheng104@huawei.com>
Co-authored-by: yangcheng (AJ) <y00806874@china.huawei.com>
2025-08-14 11:50:53 +08:00
103654ccd6 [Misc] Remove redundant imported envs, using envs_ascend instead (#2193)
### What this PR does / why we need it?
Remove redundant imported `envs`, using `envs_ascend` instead.

```python
import vllm.envs as envs_vllm
import vllm_ascend.envs as envs_ascend
```

- vLLM version: v0.10.0
- vLLM main:
71683ca6f6

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-08-14 09:33:39 +08:00
55d0790597 [2/N][Refactor] Refactor V1 attention for better extensibility (#1995)
### What this PR does / why we need it?

Refactor V1 Attention for better extensibility (prepared for torchair
attention refactor).

**Main changes:**
- Move the different kinds of forward into their own methods, e.g.,
`_forward_prefill_no_cache()`, `_forward_prefill_cache_hit()`,
`_forward_decode_only()`, `_forward_v1_style()`.

### Does this PR introduce _any_ user-facing change?

No.

- vLLM version: v0.10.0
- vLLM main:
14a5d903ab

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-08-14 09:32:41 +08:00
8914d5a4b2 [Quickfix] Add the missing apply_router_weight_on_input in FusedMoE init (#2348)
### What this PR does / why we need it?
Add the missing `apply_router_weight_on_input` in FusedMoE init
Quick fix on
https://github.com/vllm-project/vllm-ascend/pull/2268#discussion_r2265828849

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with existing test.


- vLLM version: v0.10.0
- vLLM main:
6807af8f46

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-14 09:17:50 +08:00
0f7492d18e [Bugfix] fix the oom when chunkprefill with long context like 64k (#2319)
The attention mask was declared in mla.py; we don't need the splitfuse
mask for MLA chunked prefill, and this mask causes memory problems with
long contexts such as 64k or 128k.

- vLLM version: v0.10.0
- vLLM main:
14a5d903ab

---------

Signed-off-by: haojiangzheng <justineric096@gmail.com>
2025-08-13 17:15:59 +08:00
8bfd16a145 [Doc] Add container image save/load FAQ for offline environments (#2347)
### What this PR does / why we need it?

Add Docker export/import guide for air-gapped environments

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

NA

- vLLM version: v0.10.0
- vLLM main:
d16aa3dae4

Signed-off-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com>
2025-08-13 16:00:43 +08:00
992271b027 [1/N][Feat] Support MoE models with ACL Graph and refactor MoE communication logic (#2125)
### What this PR does / why we need it?
This PR refactors the MoE (Mixture of Experts) communication logic by
introducing a strategy pattern. It defines an abstract base class,
`MoECommMethod`, which encapsulates different communication strategies
for MoE layers. By decoupling the MoE implementation from any single
communication method, this change makes it simpler to add, replace, or
optimize communication strategies in the future.
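
A schematic sketch of the strategy pattern described here (the class names follow the PR text, but the method names and signatures are illustrative assumptions, not the actual interface):

```
from abc import ABC, abstractmethod

import torch


class MoECommMethod(ABC):
    """Abstract communication strategy for MoE layers."""

    @abstractmethod
    def dispatch(self, hidden_states: torch.Tensor,
                 topk_ids: torch.Tensor) -> torch.Tensor:
        """Route tokens to the ranks owning their selected experts."""

    @abstractmethod
    def combine(self, expert_output: torch.Tensor) -> torch.Tensor:
        """Bring expert outputs back to the original token layout."""


class AllGatherImpl(MoECommMethod):
    """Baseline strategy: every rank sees all tokens via all-gather."""

    def dispatch(self, hidden_states, topk_ids):
        # Placeholder: a real implementation would issue the collective here.
        return hidden_states

    def combine(self, expert_output):
        return expert_output
```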

Plan / Roadmap

1. Introduce `MoECommMethod`, implement `AllGatherImpl`, and adapt ACL
Graph handling to cover all scenarios (this PR).
2. Implement `MC2CommImpl` and `AllToAllCommImpl` to optimize
performance in specific scenarios.
3. Enable W8A8 / Int8 models to use `unified_fused_experts`.

Other notes

* Data-parallel (DP) communication currently does not work with vLLM's
dispatch/combine mechanisms; an alternative approach is required to
resolve this incompatibility.

- vLLM version: v0.10.0
- vLLM main:
f7ad6a1eb3

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-08-12 21:10:20 +08:00
1a70564e7c [5/N][Refactor] torchair model runner refactor (#2216)
There is a lot of torchair code in the model runner, which makes the code
hard to maintain. We'll create a new torchair_model_runner to split out
the torchair-related logic, following the workflow in #2203.

What this PR does:

Create a common function `_capture_model` for capture_model.

- vLLM version: v0.10.0
- vLLM main:
1891a265d3

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-12 14:24:50 +08:00
49ec6c98b7 [Doc] Update faq (#2334)
### What this PR does / why we need it?
  - Update deterministic calculation
  - Update supported devices

### Does this PR introduce _any_ user-facing change?
- Users should update ray and protobuf when using ray as the distributed
backend
- Users should switch to `export HCCL_DETERMINISTIC=true` when enabling
deterministic calculation

### How was this patch tested?
N/A

- vLLM version: v0.10.0
- vLLM main:
ea1292ad3e

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-12 14:12:53 +08:00
dc585f148a [main][prefill optimization] Optimize parallel strategies to reduce communication overhead (#2198)
### What this PR does / why we need it?
1. Shared Expert Sharding Strategy Update: Switched from TP-aligned to
pure DP for shared experts, enabling more efficient execution.
2. O_Proj AllReduce → ReduceScatter: Reduced communication overhead by
using ReduceScatter, made possible by pure DP sharding.
3. AllGather Postponed: Delayed to after the QKV down projection to reduce
synchronization impact during prefill.

### How was this patch tested?
Adding ut case in `tests/ut/attention/test_mla_v1.py`

#### How to run

use parameter `--additional_config='{"enable_shared_expert_dp": true}'`

##### a.How to run eager mode

eg:
python -m vllm.entrypoints.openai.api_server --model=/model_path
--trust-remote-code -tp 8 -dp 2 --enable_expert_parallel --port 8002
--max-model-len 5120 --max-num-batched-tokens 16384 --enforce-eager
--disable-log-requests
--additional_config='{"ascend_scheduler_config":{"enabled":true},"enable_shared_expert_dp":
true,"chunked_prefill_for_mla":true}'

##### b.How to run graph mode

eg:
python -m vllm.entrypoints.openai.api_server --model=/model_path
--trust-remote-code -tp 8 -dp 2 --enable_expert_parallel --port 8002
--max-model-len 5120 --max-num-batched-tokens 16384
--disable-log-requests
--additional_config='{"ascend_scheduler_config":{"enabled":true},"enable_shared_expert_dp":
true,"chunked_prefill_for_mla":true,"torchair_graph_config":{"enabled":true}}'


- vLLM version: v0.10.0
- vLLM main:
9edd1db02b

---------

Signed-off-by: Wang Kunpeng <1289706727@qq.com>
Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
Co-authored-by: SlightwindSec <slightwindsec@gmail.com>
2025-08-12 14:12:12 +08:00
81817908ca ut: add ci guard for ut coverage (#2317)
### What this PR does / why we need it?
Add a CI guard for UT coverage: if the UT coverage of a patch PR is below
80%, the CI will fail.

### Does this PR introduce _any_ user-facing change?
not involved

### How was this patch tested?
not involved

- vLLM version: v0.10.0
- vLLM main:
458e74eb90

---------

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
2025-08-12 08:05:01 +08:00
9c6d108330 Configure Gemini (#2298)
### What this PR does / why we need it?
This PR requests Gemini AI to review PRs.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
NA

- vLLM version: v0.10.0
- vLLM main:
14a5d903ab

Signed-off-by: QwertyJack <7554089+QwertyJack@users.noreply.github.com>
2025-08-11 22:21:29 +08:00
c8b0f5f799 [4/N][Refactor] torchair model runner refactor (#2208)
There is a lot of torchair code in the model runner, which makes the code
hard to maintain. We'll create a new torchair_model_runner to split out
the torchair-related logic, following the workflow in #2203.

What this PR does:

Create a common function `_convert_torch_foramt` for initialize_kv_cache.


- vLLM version: v0.10.0
- vLLM main:
14a5d903ab

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-11 21:39:24 +08:00
eb43a475f4 [Feat] chunkprefill mla support torchair graph (#1772)
Chunked-prefill MLA only supports eager mode now; we want to optimize it
by supporting the torchair graph. The idea is simple: when all requests
are running in decode, use the torchair graph; otherwise, for chunked
prefill or prefill-only, use eager mode.

- vLLM version: v0.10.0
- vLLM main:
ebf7605b0d

Signed-off-by: haojiangzheng <justineric096@gmail.com>
Co-authored-by: haojiangzheng <justineric096@gmail.com>
2025-08-11 19:58:59 +08:00
881e36d6a9 [3/N][Refactor] torchair model runner refactor (#2207)
There is a lot of torchair code in the model runner, which makes the code
hard to maintain. We'll create a new torchair_model_runner to split out
the torchair-related logic, following the workflow in #2203.

What this PR does:

Create common functions `_build_attention_metadata` and
`_generate_dummy_run_hidden_states` for dummy_run.

- vLLM version: v0.10.0
- vLLM main:
ebf7605b0d

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-11 18:03:19 +08:00
29aaba5f84 [Perf][MTP] Optimize reject sampler in greedy situation. (#2137)
This PR ports the optimization from PR #2002 to main and makes it cleaner.

- vLLM version: v0.10.0
- vLLM main:
afa5b7ca0b

---------

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-08-11 17:37:49 +08:00
ca274001b0 Bump actions/download-artifact from 4 to 5 (#2311)
Bumps
[actions/download-artifact](https://github.com/actions/download-artifact)
from 4 to 5.

- vLLM version: v0.10.0
- vLLM main:
ebf7605b0d

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-08-11 16:02:12 +08:00
c0f0b70813 [core] Support capture custom ops into aclgraph (#2113)
### What this PR does / why we need it?
Thanks to the PR https://github.com/vllm-project/vllm-ascend/pull/426
make vllm-ascend support the aclgraph inference to reduce the host
overhead. However, the capability of aclgraph strongly relies on the
functionality provided by `torch.compile`, which is the key feature
supported in torch 2.x . Therefore, capture custom op into aclgraph is
only possible when it can be recognize and captured by `torch.compile`.

In this PR, we register the meta implementation of current custom ops to
enable the fx graph capture. And by doing that, insert those custom ops
into aclgraph become a natural thing to the ascend runtime.
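
A small, self-contained illustration of the idea (this is not the actual vllm-ascend op; it only shows how a meta/fake implementation lets `torch.compile` trace a custom op by shape alone, assuming PyTorch >= 2.4 where `torch.library.custom_op` and `register_fake` are available):

```
import torch


@torch.library.custom_op("demo::scale_add", mutates_args=())
def scale_add(x: torch.Tensor, scale: float) -> torch.Tensor:
    # Eager implementation of the custom op.
    return x * scale + 1.0


@scale_add.register_fake
def _(x: torch.Tensor, scale: float) -> torch.Tensor:
    # Meta implementation: describes only the output shape/dtype, which is
    # all torch.compile needs to capture the op into the FX graph.
    return torch.empty_like(x)


@torch.compile
def fn(x: torch.Tensor) -> torch.Tensor:
    return scale_add(x, 2.0)


print(fn(torch.ones(4)))
```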

### Does this PR introduce _any_ user-facing change?
No user face change.

### How was this patch tested?
Tested in a unit test: we integrate the `rotary_embedding` op into a
small custom model and use `torch.compile` and aclgraph to capture and
replay it to verify its functionality.

- vLLM version: v0.10.0
- vLLM main:
1b99028069

---------

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-08-11 15:59:42 +08:00
1ab15414bb [2/N][Refactor] torchair model runner refactor (#2204)
There is a lot of torchair code in the model runner, which makes the code
hard to maintain. We'll create a new torchair_model_runner to split out
the torchair-related logic, following the workflow in #2203.

What this PR does:

Move the `torchair`-related logic into `_get_forward_metadata_across_dp`
and override it in the torchair model runner.


- vLLM version: v0.10.0
- vLLM main:
1b99028069

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-11 14:06:49 +08:00
9260910c8d [CI] Fix broken CI (#2302)
1. Disable the test_eagle_ccorrectness test; we'll re-enable it once the
OOM error is fixed.
2. Drop the transformers version limit for main, since vLLM relies on
>=4.55.0, see:
65552b476b
3. Fix the kv_connector_output bug, see:
796bae07c5

- vLLM version: v0.10.0
- vLLM main:
d1af8b7be9

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-11 11:22:32 +08:00
ee6f79c44a Add ut for test_communicator.py (#2293)
### What this PR does / why we need it?

Add ut for test_communicator.py 

- vLLM version: v0.10.0
- vLLM main:
e5ebeeba53

Signed-off-by: yangqinghao-cmss <yangqinghao_yewu@cmss.chinamobile.com>
2025-08-09 08:26:04 +08:00
3e65c406b8 Fix accuracy test create PR (#2274)
### What this PR does / why we need it?

Fix the PR-creation step of the accuracy test.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
Local testing: https://github.com/nv-action/vllm-benchmarks/pull/87

- vLLM version: v0.10.0
- vLLM main:
099c046463

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-08-08 14:12:11 +08:00
0bd5ff5299 Fix accuracy test config and add DeepSeek-V2-Lite test (#2261)
### What this PR does / why we need it?
This PR fixes the accuracy test related to
https://github.com/vllm-project/vllm-ascend/pull/2073; users can now
perform accuracy tests on multiple models simultaneously and generate
different report files by running:

```bash
cd ~/vllm-ascend
pytest -sv ./tests/e2e/models/test_lm_eval_correctness.py \
          --config-list-file ./tests/e2e/models/configs/accuracy.txt
```

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
<img width="1648" height="511" alt="image"
src="https://github.com/user-attachments/assets/1757e3b8-a6b7-44e5-b701-80940dc756cd"
/>


- vLLM version: v0.10.0
- vLLM main:
766bc8162c

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-08-08 11:09:16 +08:00
ad1083761f [CI][Quickfix] Fix AscendFusedMoE init error (#2268)
### What this PR does / why we need it?
Fix the AscendFusedMoE init error. Use `super().__init__()` instead of
`super(FusedMoE, self).__init__()` to ensure the member variables of the
base class can be accessed by the child class.

### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
CI passed with existing tests.


- vLLM version: v0.10.0
- vLLM main:
766bc8162c

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-08 10:20:23 +08:00
dceef080b1 [main] remove torch.cat and replace it by List[0] (#2153)
### What this PR does / why we need it?
torch_npu.npu_grouped_matmul:

https://www.hiascend.com/document/detail/zh/Pytorch/710/apiref/torchnpuCustomsapi/context/torch_npu-npu_grouped_matmul.md

According to the documentation, when `split_item` is 2 or 3,
`torch_npu.npu_grouped_matmul` returns a list containing one element.
Therefore, the `torch.cat` after `torch_npu.npu_grouped_matmul` is
unnecessary.
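
Schematically, the change looks like the following (not runnable as-is: it needs an Ascend NPU environment and real tensor arguments; only `split_item` is taken from the documentation cited above):

```
# Before: wrap the single-element list returned by npu_grouped_matmul
# (with split_item=2 or 3) in an unnecessary torch.cat.
hidden = torch.cat(
    torch_npu.npu_grouped_matmul(inputs, weights, split_item=2), dim=0)

# After: index the single element directly and skip the cat.
hidden = torch_npu.npu_grouped_matmul(inputs, weights, split_item=2)[0]
```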

### Does this PR introduce _any_ user-facing change?
not involved

### How was this patch tested?
ut and e2e covered: `tests/ut/ops/test_fused_ops.py`,
`tests/e2e/singlecard/ops/test_fused_moe.py`

**performance**:
(qwen3 30B, 2k->20k)

base:
Total Token throughput (tok/s):          667.76 

remove cat:
Total Token throughput (tok/s):          680.82 


- vLLM version: v0.10.0
- vLLM main:
fa00c5d75b

Signed-off-by: huangxialu <huangxialu1@huawei.com>
2025-08-07 17:20:19 +08:00
b2598c3271 enable mm allreduce test (#2192)
### What this PR does / why we need it?
This PR adds an e2e test for the npu_mm_all_reduce_base fusion
kernel.
### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
not involved

- vLLM version: v0.10.0
- vLLM main:
5d5d419ca6

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
2025-08-07 17:19:23 +08:00
4604882a3e [ReleaseNote] Release note of v0.10.0rc1 (#2225)
### What this PR does / why we need it?
Release note of v0.10.0rc1

- vLLM version: v0.10.0
- vLLM main:
8e8e0b6af1

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-07 14:46:49 +08:00
58c8d4fdcd Remove transformer pins for v0.9.1-dev (#2234)
### What this PR does / why we need it?
Remove the transformers pin for v0.9.1-dev, because we already released
v0.9.1rc2 with the right transformers version.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
doctest CI passed

- vLLM version: v0.10.0
- vLLM main:
7e6544c797

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-08-07 14:41:10 +08:00
92eebc0c9b [Doc] Update user guide for suported models (#2263)
### What this PR does / why we need it?
Update the user guide for supported models.

- vLLM version: v0.10.0
- vLLM main:
4be02a3776

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-08-07 14:39:51 +08:00
440d28a138 [Tutorial] Add qwen3 8b w4a8 tutorial (#2249)
### What this PR does / why we need it?

Add a new single-NPU quantization tutorial using the latest Qwen3
model.

- vLLM version: v0.10.0
- vLLM main:
8e8e0b6af1

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-08-07 14:39:38 +08:00
bcd0b532f5 [Doc] Update user guide for using lm-eval (#1325)
### What this PR does / why we need it?
Update the user guide for using lm-eval:
1. Add instructions for using lm-eval against an online server.
2. Add instructions for using offline datasets.

- vLLM version: v0.10.0
- vLLM main:
9edd1db02b

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-08-07 14:15:49 +08:00
dbba3cabb0 [Doc] Update tutorials for single_npu_audio and single_npu_multimodal (#2252)
### What this PR does / why we need it?
Update tutorials for single_npu_audio and single_npu_multimodal

- vLLM version: v0.10.0
- vLLM main:
6b47ef24de

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-08-07 14:08:14 +08:00
205eff2b12 [Bugfix] Disable check vllm init temporary (#2250)
### What this PR does / why we need it?
The vLLM source directory
https://github.com/vllm-project/vllm/tree/main/vllm/attention/layers does
not have an `__init__.py`, which breaks the Python source init check, so
we skip it for now.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.0
- vLLM main:
6b47ef24de

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-08-07 10:37:22 +08:00
c611291661 【main】SP For Qwen3 MoE (#2209)
### What this PR does / why we need it?
Qwen3 MoE supports SP. In scenarios like AlltoAll, AlltoAllv, and MC2,
replacing AllReduce with Reduce-Scatter and AllGather achieves
computational benefits in norm operations while saving one AllGather
communication. This feature is enabled during the P-phase and delivers
notable gains in long-sequence scenarios (e.g., 16k–25k), with
performance improvements reaching 5%–10%.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
``` 
compilation_config={
    "pass_config":{
        "enable_sequence_parallelism": True
    }
},
enable_expert_parallel=True,
```

- vLLM version: v0.10.0
- vLLM main:
9edd1db02b

---------

Signed-off-by: libaokui <libaokui@huawei.com>
Co-authored-by: libaokui <libaokui@huawei.com>
2025-08-07 09:15:49 +08:00
57b9f02185 [Bugfix] Fix disaggregated pd error (#2242)
### What this PR does / why we need it?
Fix `ascend_env has no attr VLLM_ASCEND_ENABLE_CHUNK_MC2`, remove
useless lines

- vLLM version: v0.10.0
- vLLM main:
9edd1db02b

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-08-06 19:48:10 +08:00
26fc36b0e0 [V1] MTP supports torchair (#2145)
### What this PR does / why we need it?
Support MTP  with:

- [x]  V0 Scheduler
- [x]  TorchAir
- [x]  Single DP
- [x]  Multi DP
- [x]  Disaggregate PD

Known issues:
- [ ] V1 Scheduler (chunked prefill) is not supported yet; it will be
supported in a few weeks
- [ ] vllm v0.10.0 does not support metrics with `DP > 1` right now,
need to comment out the line 171-175 in file
`vllm/vllm/v1/metrics/loggers.py`
```
            if (len(self.engine_indexes) > 1
                and vllm_config.speculative_config is not None):
            raise NotImplementedError("Prometheus metrics with Spec Decoding "
                                      "with >1 EngineCore per AsyncLLM is not "
                                      "supported yet.")
```

To start an online server with torchair enabled, here is an example:
```
python -m vllm.entrypoints.openai.api_server \
 --model="/weights/DeepSeek-R1_w8a8/" \
 --trust-remote-code \
 --max-model-len 40000 \
 --tensor-parallel-size 4 \
 --data_parallel_size 4 \
 --max-num-seqs 16 \
 --no-enable-prefix-caching \
 --enable_expert_parallel \
 --served-model-name deepseekr1 \
 --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
 --quantization ascend \
 --host 0.0.0.0 \
 --port 1234 \
 --additional-config '{"ascend_scheduler_config":{"enabled":true,"enable_chunked_prefill":false},"torchair_graph_config":{"enabled":true,"graph_batch_sizes":[16]},"enable_weight_nz_layout":true}' \
 --gpu_memory_utilization 0.9 
``` 

offline example with torchair enabled
```
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]

# Create a sampling params object.
sampling_params = SamplingParams(max_tokens=16, temperature=0)
# Create an LLM.
llm = LLM(
    model="/home/data/DeepSeek-R1_w8a8/",
    tensor_parallel_size=16,
    max_num_seqs=16,
    gpu_memory_utilization=0.9,
    distributed_executor_backend="mp",
    enable_expert_parallel=True,
    speculative_config={
        "method": "deepseek_mtp",
        "num_speculative_tokens": 1,
    },
    trust_remote_code=True,
    enforce_eager=False,
    max_model_len=2000,
    additional_config = {
       'torchair_graph_config': {
            'enabled': True,
            "graph_batch_sizes": [16],
            'enable_multistream_shared_expert': False,
        },
       "ascend_scheduler_config": {
            "enabled": True
        },
        # 'expert_tensor_parallel_size': 16,
    }
)

# Generate texts from the prompts.
# llm.start_profile()
outputs = llm.generate(prompts, sampling_params)
# llm.stop_profile()
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

- vLLM version: v0.10.0
- vLLM main:
302962e806

---------

Signed-off-by: xuyexiong <xuyexiong@huawei.com>
2025-08-06 19:37:43 +08:00
bf84f2dbfa [Doc] Support kimi-k2-w8a8 (#2162)
### What this PR does / why we need it?
In fact, the kimi-k2 model is similar to the deepseek model, and we only
need a few changes to support it. What this PR does:
1. Add kimi-k2-w8a8 deployment doc
2. Update quantization doc
3. Upgrade torchair support list
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.10.0
- vLLM main:
9edd1db02b

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-08-06 19:28:47 +08:00
875a86cbe9 ut: add example and e2e test for sleepmode in external_launcher (#2152)
### What this PR does / why we need it?
This PR adds an e2e test case to make sure sleep mode works with
external_launcher.

### Does this PR introduce _any_ user-facing change?
not involved

### How was this patch tested?
not involved


- vLLM version: v0.10.0
- vLLM main:
74333ae2f6

Signed-off-by: huangxialu <huangxialu1@huawei.com>
2025-08-06 11:11:53 +08:00
8a59367d0c [main][Feature] Support deepseek w4a8 quantization (#2172)
### What this PR does / why we need it?
Supports Deepseek-R1 w4a8 quantization.
Since R1 w4a8 uses mixed quantization, only the MOE layer uses
w4a8_dynamic quantization, so we added the w4a8_dynamic.py file, which
includes the AscendW4A8DynamicFusedMoEMethod class.
### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
Adding ut case in `tests/ut/quantization/test_w4a8_dynamic.py` and
`tests/ut/quantization/test_quantizer.py`
Adding e2e case in
`tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC`
to test deepseek w4a8_dynamic quantized model

#### 1.How to get weights using Modelslim
##### Installation steps
Use the branch master, the commit id is:
298e175d69b3b855111a1e09bbe2fcd12fdb4e24
git clone https://gitee.com/ascend/msit.git
cd msit/msmodelslim
bash install.sh

##### The required transformers environment
transformers>=4.48.2

##### Generate w4a8 weights
cd /example/DeepSeek
Command reference: msmodelslim/example/DeepSeek/README.md Execute the
[pre-check](https://gitee.com/ascend/msit/blob/master/msmodelslim/example/DeepSeek/README.md#%E8%BF%90%E8%A1%8C%E5%89%8D%E5%BF%85%E6%A3%80)
and [DeepSeek-R1 w4a8 mix
quantization](https://gitee.com/ascend/msit/blob/master/msmodelslim/example/DeepSeek/README.md#deepseek-r1-w4a8-%E6%B7%B7%E5%90%88%E9%87%8F%E5%8C%96%E5%89%8D%E4%B8%89%E5%B1%82-mlpw8a8-dynamic-%E9%87%8F%E5%8C%96mla%E5%85%B1%E4%BA%AB%E4%B8%93%E5%AE%B6w8a8%E9%87%8F%E5%8C%96%E8%B7%AF%E7%94%B1%E4%B8%93%E5%AE%B6w4a8-dynamic%E9%87%8F%E5%8C%96)
chapter
Reference command:python3 quant_deepseek_w4a8.py --model_path {Original
weight path} --save_path {Generate weight path} --mindie_format

##### Adapt to vllm-ascend
Since mindie_format generates mindie format, some adaptation
modifications are needed for vllm-ascend to use it:
`quant_model_description_w8a8_dynamic.json` rename to
`quant_model_description.json`, and add `"group_size": 256`
Modification in `config.json`:`"model_type":deepseekv2` is changed to
`"model_type":deepseek_v3`; `quantization_config` is removed;
Tip: the group_size must match the weights. If the w4a8 weights were not
generated using msmodelslim, check the group_size in quantization_config
in config.json.

#### 2.How to run w4a8
##### a.How to run eager mode
export VLLM_USE_V1=1 # v1

python -m vllm.entrypoints.openai.api_server --model=$1
--trust-remote-code -tp $2 -dp $3 --enable_expert_parallel
--quantization ascend --port $4 --max-model-len $5 --max-num-seqs $6
--enforce-eager
eg: python -m vllm.entrypoints.openai.api_server
--model=/weightpath/w4a8_4_layer --trust-remote-code -tp 4 -dp 4
--enable_expert_parallel --quantization ascend --port 8002
--max-model-len 5120 --max-num-seqs 128 --enforce-eager

##### b.How to run graph mode
export VLLM_USE_V1=1 # v1
export HCCL_BUFFSIZE=1024

python -m vllm.entrypoints.openai.api_server --model=$1
--trust-remote-code -tp $2 -dp $3 --enable_expert_parallel
--quantization ascend --port $4 --max-model-len $5
--additional_config='{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'
eg: python -m vllm.entrypoints.openai.api_server
--model=/weight/dsr1_w4a8_vllm --trust-remote-code -tp 4 -dp 4
--enable_expert_parallel --quantization ascend --port 8002
--max-model-len 5120
--additional_config='{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":true}}'


- vLLM version: v0.10.0
- vLLM main:
c494f96fbc

---------

Signed-off-by: Wang Kunpeng <1289706727@qq.com>
2025-08-06 10:17:44 +08:00
e31b31f9c3 [main][Bugfix] Fix unable to load qwen3_moe quantized weights (#2219)
### What this PR does / why we need it?

Fixes the inability to load `qwen3_moe` quantized weights due to #1994.

### Does this PR introduce _any_ user-facing change?

None

### How was this patch tested?

Add a `qwen3_moe` W8A8 quantized model in
`tests/e2e/multicard/test_qwen3_moe.py`

- vLLM version: v0.10.0
- vLLM main:
c494f96fbc

---------

Signed-off-by: zhoux77899 <zhouxiang100@huawei.com>
2025-08-06 09:08:36 +08:00
54ace9e12b Add release note for v0.9.1rc2 (#2188)
### What this PR does / why we need it?
Add release note for v0.9.1rc2

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

- vLLM version: v0.10.0
- vLLM main:
c494f96fbc

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-08-06 09:04:46 +08:00
126cdfc92b [Test] add rejection sampler ut (#2084)
### What this PR does / why we need it?
add rejection sampler ut.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
UT passed

- vLLM version: v0.10.0
- vLLM main:
586f286789

Signed-off-by: wangxiaoxin-sherie <wangxiaoxin7@huawei.com>
2025-08-05 19:03:36 +08:00
f3b50c54e8 [main][Prefill Perf] Optimize Quantized MoE Performance by Reducing All2All Communication (#2195)
This PR significantly optimizes performance for quantized Mixture of
Experts (MoE) layers by changing the order of quantization and
communication operations.

In the previous implementation, the `all2all` operation was performed on
unquantized `hidden_states` (in FP16/BF16) *before* quantization,
resulting in substantial communication overhead. By performing
quantization on each EP rank **first** and then sending the much smaller
quantized data, we reduce the communication volume by nearly 50%.

Additionally, this PR includes a minor optimization to cast `int` inputs
to `float` for the `argsort` operation, forcing it to run on a faster
NPU core instead of the AICPU.

These changes lead to a clear and significant performance gain in MoE
quantization scenarios.
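As a rough illustration of the reordering (pure-torch stand-ins, not the actual vllm-ascend kernels):

```python
import torch
import torch.distributed as dist

def quantize_int8(x: torch.Tensor):
    # Per-token dynamic quantization: int8 payload plus a per-row scale.
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127.0
    return (x / scale).round().to(torch.int8), scale

def dispatch_quant_first(hidden_states: torch.Tensor):
    # New order: quantize on each EP rank first, then all2all the int8
    # payload (plus the small scales) -> roughly half the bytes of BF16.
    q, scale = quantize_int8(hidden_states)
    q_out, scale_out = torch.empty_like(q), torch.empty_like(scale)
    dist.all_to_all_single(q_out, q)
    dist.all_to_all_single(scale_out, scale)
    return q_out, scale_out

def argsort_on_vector_core(expert_ids: torch.Tensor) -> torch.Tensor:
    # Casting int inputs to float steers argsort onto the faster vector
    # core instead of the AICPU, as described above.
    return expert_ids.float().argsort()
```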

- vLLM version: v0.10.0
- vLLM main:
7175817637

---------

Signed-off-by: SlightwindSec <slightwindsec@gmail.com>
2025-08-05 18:47:13 +08:00
292fb8f696 [1/N][Refactor] torchair model runner refactor (#2205)
There is a lot of torchair code in the model runner, which makes the code hard to maintain. We'll create a new torchair_model_runner to split out the torchair-related logic. Following the workflow in #2203, this is the first PR.

What this PR does:

create the new torchair model runner; more functions will be added later


- vLLM version: v0.10.0
- vLLM main:
586f286789

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-05 18:43:04 +08:00
458ab2db12 [BugFix] Fix the bug that qwen3 moe doesn't work with aclgraph (#2183)
What this PR does:
1. Move AscendSparseMoeBlock to the qwen3 model, since it's only used by the qwen3 model.
2. Disable AscendSparseMoeBlock if aclgraph is enabled; AscendSparseMoeBlock doesn't work with aclgraph currently.

- vLLM version: v0.10.0
- vLLM main:
cdfd6871a5

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-05 17:42:52 +08:00
583ad8f347 [main][refactor] Refactor forward metadata retrieval across DP nodes to reduce redundant padding. (#2062)
Before refactoring cross-DP decoding metadata aggregation, clean up the token-padding logic.
### What this PR does:

1. First checks whether any DP instance is in the prefill phase.

2. If in the `decode` phase and `torchair_graph_enabled` is true, pads
each DP instance's token count up to the global maximum.

3. If in the `prefill` phase, or in decode phase with graph mode
**disabled**, returns each DP instance’s original token count without
padding.

This reordering removes the previous two‐step padding/unpadding flow and
ensures padding only occurs when strictly necessary.
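Roughly, the decision logic looks like this (a sketch with illustrative names, not the actual model-runner code):

```python
import torch

def get_dp_num_tokens(local_num_tokens: int, dp_group,
                      with_prefill_local: bool,
                      torchair_graph_enabled: bool) -> int:
    """Return the token count this DP rank should use for the next step."""
    # Gather (num_tokens, with_prefill) from every DP rank.
    flags = torch.tensor([local_num_tokens, int(with_prefill_local)],
                         dtype=torch.int64)
    gathered = [torch.zeros_like(flags) for _ in range(dp_group.size())]
    torch.distributed.all_gather(gathered, flags, group=dp_group)

    any_prefill = any(bool(t[1].item()) for t in gathered)
    if any_prefill or not torchair_graph_enabled:
        # Prefill anywhere, or decode without graph mode: keep the
        # original token count, no padding.
        return local_num_tokens
    # Pure decode with torchair graph mode: pad to the global maximum.
    return max(int(t[0].item()) for t in gathered)
```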

- vLLM version: v0.10.0
- vLLM main:
bd3db7f469

Signed-off-by: yx0716 <jinyx1007@foxmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-05 17:03:36 +08:00
27c2b5c145 [Doc] Update pytorch version in README_zh doc (#2202)
### What this PR does / why we need it?

Update pytorch version in README_zh doc.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Local Test.
- vLLM version: v0.10.0
- vLLM main:
bd3db7f469

Signed-off-by: xleoken <xleoken@163.com>
2025-08-05 11:13:49 +08:00
807f0895b2 Bump torch version to 2.7.1 (#1562)
### What this PR does / why we need it?
Bump torch version to 2.7.1, and cleanup infer schema patch
https://github.com/vllm-project/vllm-ascend/commit/857f489
(https://github.com/vllm-project/vllm-ascend/pull/837), this patch
depends on also: https://github.com/vllm-project/vllm-ascend/pull/1974

### Does this PR introduce any user-facing change?
No

#### How was this patch tested?
CI passed

torch-npu 2.7.1rc1 install guide:
https://gitee.com/ascend/pytorch/tree/v2.7.1/
install depending:
```
pip3 install pyyaml
pip3 install setuptools
```
install torch-npu:

Closes: https://github.com/vllm-project/vllm-ascend/issues/1866
Closes: https://github.com/vllm-project/vllm-ascend/issues/1390


- vLLM version: v0.10.0
- vLLM main:
9af654cc38

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Signed-off-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-08-05 08:43:24 +08:00
36e450eb0f [Misc] Nit fix for disaggregated_prefill and ascend_forward_context (#2097)
We recently added the disaggregated_prefill and ascend_forward_context features in
ba3dfbd59e
and
df0ec55162.
This PR fixes some nits introduced by them to make the code clearer.
1. Drop `current_platform` usage. It can lead to obscure circular import errors in some cases.
2. Update the `set_ascend_forward_context` function to make the logic clearer, for example by removing V0 support from this function.
3. Remove the useless `self.local_rank_across_dp` in the worker.
4. Remove `soc_info.py` and use `get_ascend_soc_version` instead.
 

- vLLM version: v0.10.0
- vLLM main:
02f82fe438

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-05 08:39:02 +08:00
ad366bf908 [Bugfix] Follow vLLM Qwen-Moe/VL and KV Connector change to fix broken CI (#2181)
### What this PR does / why we need it?
This PR fixes the broken CI:
1. Adapt to the
ee2eb6ecd8
changes: in that commit, the gate and up projections in the vision MLP were fused, which improves performance by removing one matrix multiplication. So this PR does the following:
    - Specify that the two linear layers are fused as `mlp.gate_up_proj` when loading the weights.
    - Use a SiluAndMul activation function.
2. Adapt to
aefeea0fde:
update the ModelRunnerOutput parameters to match its changes.
3. Adapt to
[vllm-commit](https://github.com/vllm-project/vllm/pull/20815/files#diff-3ffb829a39ab2b3e4706aa28f5e476815f36c3a87b98d6a66514ebedc8f3ffb4R354-R356):
fix qwen moe.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.10.0
- vLLM main:
fed5849d3f

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-08-04 21:37:50 +08:00
e38fab011d [Doc][PD] Restore the default configuration items in examples/disaggregate_prefill_v1/README.md (#2165)
### What this PR does / why we need it?
- In the D node, the max-num-batched-tokens parameter can be set to a
smaller value since the D node processes at most max-num-seqs batches
concurrently. As the profile_run only needs to handle max-num-seqs
sequences at a time, we can safely set max-num-batched-tokens equal to
max-num-seqs. This optimization will help reduce activation memory
consumption.
- Restore the default configuration items for PD separation.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.0
- vLLM main:
61dcc280fa

Signed-off-by: underfituu <hzhucong@163.com>
2025-08-04 20:30:53 +08:00
957c7f108d [Bugfix][PD] Make multiple Ps and Ds work on a single machine (#2080)
(cherry picked from commit 816375e0c1071d0696dfab1a1ce35674f9f37aa0)

### What this PR does / why we need it?

Suppose you want to start a prefiller instance on NPUs `2,3` only, so you start the instance with `ASCEND_RT_VISIBLE_DEVICES=2,3`. The current code starts two workers whose ranks are `0` and `1` respectively, and they pick the first and second NPU IP addresses from the ranktable instead of the third and fourth ones. But they are actually using cards `2,3`, so they cannot link with remote instances when they attempt to transfer the KV cache.

Hence at most one prefiller instance and at most one decoder instance can currently work on a single machine, since workers always pick the first NPU IP addresses in the ranktable.

This pull request fixes the problem by picking, from the ranktable, the IPs of only those devices listed in `ASCEND_RT_VISIBLE_DEVICES`.
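A minimal sketch of the device-IP selection (the ranktable field names here are assumptions for illustration):

```python
import json
import os

def pick_visible_device_ip(ranktable_path: str, local_rank: int) -> str:
    """Map local_rank -> the IP of the physical NPU it actually uses."""
    visible = os.environ.get("ASCEND_RT_VISIBLE_DEVICES", "")
    # e.g. "2,3": the worker with local_rank 0 really sits on physical card 2.
    physical_ids = [int(x) for x in visible.split(",") if x] or None

    with open(ranktable_path) as f:
        ranktable = json.load(f)
    devices = ranktable["server_list"][0]["device"]  # assumed layout

    if physical_ids is None:
        # No restriction: positional lookup is fine.
        return devices[local_rank]["device_ip"]
    physical_id = physical_ids[local_rank]
    # Look up by physical device id instead of positional rank.
    return next(d["device_ip"] for d in devices
                if int(d["device_id"]) == physical_id)
```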

### Does this PR introduce _any_ user-facing change?

If the user uses a ranktable generated by `gen_ranktable.sh`, they should not see any change.

### How was this patch tested?
Qwen-0.6B 1P 1D, dp=2, `ASCEND_RT_VISIBLE_DEVICES=2,3` for prefiller and
`ASCEND_RT_VISIBLE_DEVICES=4,5` for decoder.


- vLLM version: v0.10.0
- vLLM main:
ad57f23f6a

Signed-off-by: CaveNightingale <cavenightingale@foxmail.com>
2025-08-04 17:22:18 +08:00
a9480d5f0a [Fix] Adjust use_aclgraph logic (#2156)
### What this PR does / why we need it?
Updates the FusedMoE method to determine whether to use ACL Graph based
on the `torchair_graph_config`

This is equivalent to #2154 on v0.9.1-dev.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
None needed.

- vLLM version: v0.10.0
- vLLM main:
ad57f23f6a

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-08-04 15:23:20 +08:00
688350a3bb [bugfixed] fix the bug when run the inference of quantized ds-w8a8-mtp (#2134)
When running inference of ds-w8a8-mtp, it reported `'ParallelLMHead' has no attribute 'params_dtype'`.

1. Add a wrapper for vocab_parallel_embedding, fixing the bug when running
deepseek-w8a8-mtp.

Signed-off-by: curryliu <120010041@link.cuhk.edu.cn>

- vLLM version: v0.10.0
- vLLM main:
ad57f23f6a

---------

Signed-off-by: curryliu <120010041@link.cuhk.edu.cn>
2025-08-04 15:16:42 +08:00
4b3a210c33 Implementation of simple load balance routing proxy server (#1953) (#2124)
### What this PR does / why we need it?
The PR is the cherry-pick from v0.9.1
https://github.com/vllm-project/vllm-ascend/pull/1953

This PR introduces a new load-balancing proxy server example for disaggregated PD. It supports a simple token- and kv_cache-aware load-balance routing strategy for the disaggregated PD system, compared with the original round-robin toy_proxy.
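A toy sketch of a token/kv-cache aware selection policy (illustrative only; the real example tracks live engine load):

```python
from dataclasses import dataclass, field

@dataclass
class Instance:
    url: str
    inflight_tokens: int = 0      # tokens currently being processed
    kv_cache_usage: float = 0.0   # fraction of kv cache blocks in use

@dataclass
class LoadBalancer:
    instances: list[Instance] = field(default_factory=list)

    def pick(self, prompt_tokens: int) -> Instance:
        # Score = projected token load, penalized by kv-cache pressure;
        # lower is better. The weights are arbitrary for this sketch.
        def score(inst: Instance) -> float:
            return (inst.inflight_tokens + prompt_tokens) * (1.0 + inst.kv_cache_usage)
        best = min(self.instances, key=score)
        best.inflight_tokens += prompt_tokens
        return best

    def finished(self, inst: Instance, prompt_tokens: int) -> None:
        inst.inflight_tokens = max(0, inst.inflight_tokens - prompt_tokens)
```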

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Tested on real workloads and with unit tests.

- vLLM version: v0.10.0
- vLLM main:
ad57f23f6a

---------

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-08-04 10:35:53 +08:00
af04ee9e7a [MoE][Dist] Fix Qwen MoE accuracy bug in DP scenario (#1856)
### What this PR does / why we need it?
Fix the Qwen MoE accuracy bug in the DP scenario.

The current implementation of `FusedMoE` in vLLM uses `All2AllManager` to manage the different all2all algorithm branches. The default branch uses `Multicast` in the `dispatch` phase and `all_reduce` in the `combine` phase, neither of which is implemented in vLLM-Ascend. This falls back to the default implementation in `base_communicator`, whose `dispatch` and `combine` operations are empty, which causes the accuracy issue.

This PR is a temporary workaround; refactoring all2all in vLLM-Ascend would be a better long-term fix.


- vLLM version: v0.10.0
- vLLM main:
ad57f23f6a

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-08-04 10:24:18 +08:00
f939381c6f [Bugfix] Adopt the new changes on disaggregated pd from vllm main branch (#2122)
### What this PR does / why we need it?
vLLM's main branch merged the PRs
https://github.com/vllm-project/vllm/pull/21072 and
https://github.com/vllm-project/vllm/pull/21473 to support the Ray backend
and fix some rebase bugs from a previous change. Those changes break
disaggregated PD in vLLM Ascend in some scenarios.

In this PR, we adopt those changes to make sure the
`llmdatddist_c_mgr_connector` works correctly on the newest vLLM main branch.

### Does this PR introduce _any_ user-facing change?

No user-facing change.

### How was this patch tested?
Relevant UTs will be added to verify the functionality of these changes.

- vLLM version: v0.10.0
- vLLM main:
ad57f23f6a

---------

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-08-04 10:08:58 +08:00
ddaded1537 Add ut for envs.py (#2131)
What this PR does / why we need it?
test vllm_ascend/envs.py, which contains environment variable definitions

Does this PR introduce any user-facing change?
N/A

How was this patch tested?
CI passed with new added test.

vLLM version: v0.10.0
vLLM main:
9532a6d563

- vLLM version: v0.10.0
- vLLM main:
b4e081cb15

---------

Signed-off-by: chengyuan <chengyuan27@huawei.com>
Co-authored-by: chengyuan <chengyuan27@huawei.com>
2025-08-02 16:53:44 +08:00
bea3d5bbb4 [Bug] Fix run bug in run_dp_server.sh (#2139)
### What this PR does / why we need it?

For the `Qwen2.5-0.5B-Instruct` model:
- the model's total number of attention heads (14) must be divisible by the tensor parallel size (4 -> 2)
- the model does not support enable-expert-parallel

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Local Test.

- vLLM version: v0.10.0
- vLLM main:
ad57f23f6a

Signed-off-by: xleoken <xleoken@163.com>
2025-08-02 16:52:12 +08:00
47f688a2f0 Change retrieving remote files to local retrieval. (#2141)
### What this PR does / why we need it?
Using vllm's AudioAsset class to retrieve remote audio
files(https://vllm-public-assets.s3.us-west-2.amazonaws.com) is not
feasible in some cases; it is recommended to switch to local retrieval.

### How was this patch tested?
vllm:main
vllm:ascend:main
results:
```bash
Adding requests: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:04<00:00,  4.62s/it]
Processed prompts: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:03<00:00,  3.01s/it, est. speed input: 79.03 toks/s, output: 6.31 toks/s]
generated_text: The sport referenced is soccer, and the nursery rhyme is 'Hey Diddle Diddle'.
```

- vLLM version: v0.10.0
- vLLM main:
ad57f23f6a

---------

Signed-off-by: yangqinghao-cmss <yangqinghao_yewu@cmss.chinamobile.com>
2025-08-02 16:51:22 +08:00
e48f32ec59 [CI] Update image for 310p ci (#2155)
### What this PR does / why we need it?
update the latest image for 310p ci test


- vLLM version: v0.10.0
- vLLM main:
ad57f23f6a

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-08-02 16:46:02 +08:00
e467fe1b77 Add qwen-vl model and sampling feature UT for 310I series (#2168)
### What this PR does / why we need it?
Add qwen-vl model and sampling feature UT for  310I series

- vLLM version: v0.10.0
- vLLM main:
e0f63e4a35

Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-08-02 11:26:12 +08:00
6e00aed4d5 [main][Feature] MoE alltoallv communication optimization for unquantized RL training scenarios (#2088)
It comes from v0.9.1-dev:
[0.9.1][Feature] MoE alltoallv communication optimization for unquantized RL training scenarios & alltoallv support dpo (#1547)

- vLLM version: v0.10.0
- vLLM main:
97608dc276

---------

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Signed-off-by: whx-sjtu <2952154980@qq.com>
Signed-off-by: curryliu <120010041@link.cuhk.edu.cn>
Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: ChenTaoyu-SJTU <ctynb@qq.com>
Signed-off-by: taoxudonghaha <justsheldon@163.com>
Signed-off-by: shen-shanshan <467638484@qq.com>
Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com>
Signed-off-by: leo-pony <nengjunma@outlook.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: whx <56632993+whx-sjtu@users.noreply.github.com>
Co-authored-by: curryliu <99582471+Irving11-BKN@users.noreply.github.com>
Co-authored-by: Li Wang <wangli858794774@gmail.com>
Co-authored-by: TaoYu Chen <ctynb@qq.com>
Co-authored-by: taoxudonghaha <justsheldon@163.com>
Co-authored-by: Shanshan Shen <467638484@qq.com>
Co-authored-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
2025-08-02 09:49:10 +08:00
f0c1f0c828 [Doc] Add qwen vl example in tutorials for 310I series (#2160)
### What this PR does / why we need it?
Add qwen vl example in tutorials for 310I series. 

Model: Qwen2.5-VL-3B-Instruct
Accuracy test result, dataset MMM-val:
| | 910B3 | 310P3 |
| --- | --- | --- |
|Summary|0.455 | 0.46 |
|--art_and_design| 0.558 | 0.566 |
|--business| 0.373 | 0.366 |
|--health_and_medicine|0.513 | 0.52 |
|--science|0.333 | 0.333 |
|--tech_and_engineering|0.362 | 0.380 |
|--humanities_and_social_science|0.691 | 0.691 |

Function test result:

1. On line:
![image](https://github.com/user-attachments/assets/d81bba61-df28-4676-a246-c5d094815ac7)
![image](https://github.com/user-attachments/assets/0be81628-9999-4ef2-93c1-898b3043e09e)

2. Offline:
![image](https://github.com/user-attachments/assets/603275c1-6ed6-4cfc-a6e2-7726156de087)

- vLLM version: v0.10.0
- vLLM main:
ad57f23f6a

---------

Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-08-02 08:58:56 +08:00
8cf97d8310 [Misc] Add extra checking to torchair_graph_config. (#1939)
### What this PR does / why we need it?

cherry-pick #1675  to main
This PR adds validation checking to torchair_graph_config for better
reliability.

Co-authored-by: whx-sjtu <2952154980@qq.com>

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?


- vLLM version: v0.10.0
- vLLM main:
2836dd73f1

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-08-01 09:24:11 +08:00
2284289880 [MISC] Cherry pick #1291 from v0.9.1-dev (#1825)
### What this PR does / why we need it?
Cherry-pick #1291 from v0.9.1-dev. This PR implements the synchronization of whether `dbo` is enabled across all DP ranks: specifically, it performs an allreduce op across the DP ranks, and `dbo` is enabled only when every DP rank has `enable_dbo` set.

Co-authored-by: shikang-hangzhou <459956190@qq.com>
Co-authored-by: wangli <wangli858794774@gmail.com>

- vLLM version: v0.10.0
- vLLM main:
2836dd73f1

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-08-01 09:08:45 +08:00
9e65da990e [Misc] Add warning for incompatible Ray backend with ACL Graph mode (#2132)
### What this PR does / why we need it?

cherry-pick #1501 from 0.9.1-dev to main

Currently, Ray is not compatible with ACL Graph, so we need to fall back
to eager mode when using the Ray backend.
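Conceptually the fallback amounts to something like this (hedged sketch, not the exact platform code):

```python
import logging

logger = logging.getLogger(__name__)

def check_and_update_config(vllm_config) -> None:
    # Ray executor + ACL Graph capture are not compatible yet, so force
    # eager mode and warn the user instead of failing later.
    parallel = vllm_config.parallel_config
    model = vllm_config.model_config
    if parallel.distributed_executor_backend == "ray" and not model.enforce_eager:
        logger.warning(
            "ACL Graph mode is not supported with the Ray backend yet; "
            "falling back to eager mode.")
        model.enforce_eager = True
```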

co-authored: Yizhou Liu <liu_yizhou@outlook.com>

- vLLM version: v0.10.0
- vLLM main:
2836dd73f1

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-08-01 09:06:09 +08:00
99fa0ac882 [BugFix] update the kv transfer config (#2121)
### What this PR does / why we need it?
The functions KVTransferConfig.from_cli and AscendHcclConnector are
missing in the latest vLLM version. To resolve this, I propose modifying
the kv_connector to use LLMDataDistCMgrConnector, which depends on [PR
#2079](https://github.com/vllm-project/vllm-ascend/pull/2079)

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
vllm:main
vllm-ascend:main
results:
```bash
Adding requests: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 374.27it/s]
Processed prompts: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 66.06it/s, est. speed input: 449.08 toks/s, output: 66.51 toks/s]
Prefill node is finished.
INFO 07-31 09:18:30 [model_runner_v1.py:2282] Graph capturing finished in 36 secs, took 0.21 GiB
INFO 07-31 09:18:30 [core.py:201] init engine (profile, create kv cache, warmup model) took 52.49 seconds
INFO 07-31 09:18:30 [factory.py:74] Creating v1 connector with name: LLMDataDistCMgrConnector and engine_id: 28c8ced8-575c-4f87-840a-48d04d0edf7e
INFO 07-31 09:18:30 [platform.py:157] PIECEWISE compilation enabled on NPU. use_inductor not supported - using only ACL Graph mode
INFO 07-31 09:18:30 [utils.py:333] Calculated maximum supported batch sizes for ACL graph: 76
INFO 07-31 09:18:30 [utils.py:359] No adjustment needed for ACL graph batch sizes: Qwen2ForCausalLM model (layers: 24) with 67 sizes
INFO 07-31 09:18:30 [llm.py:293] Supported_tasks: ['generate']
Waiting for prefill node to finish...
Adding requests: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 709.70it/s]
Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 16.23it/s, est. speed input: 109.70 toks/s, output: 260.01 toks/s]
Prompt: 'Hello, how are you today?', Generated text: " I'm a computer program, so I don't have feelings. But I can"
Prompt: 'Hi, what is your name?', Generated text: ' I am a computer programmer. I have a question about the programming language I am'
Prompt: 'Tell me a very long story.', Generated text: ' I want to read it. I want to read it. I want to read'
Prompt: 'what is your favourite book?', Generated text: " I'm sorry, but as an AI language model, I don't have personal"
Cleanup prefill resources
All process done
```

- vLLM version: v0.10.0
- vLLM main:
9cb497bfa3

Signed-off-by: yangqinghao-cmss <yangqinghao_yewu@cmss.chinamobile.com>
2025-08-01 08:56:55 +08:00
968e6791d3 [Misc] Add data preprocess functions to qwen2.5_vl_without_padding (#2148)
### What this PR does / why we need it?
Cherry pick #1705 from v0.9.1-dev
Compared with qwen2_5_vl.py, qwen2_5_vl_without_padding.py is missing some functions. The purpose of this PR is to supplement them.

add:
- rot_pos_emb(self, grid_thw: torch.Tensor)
- get_window_index(self, grid_thw)
- _process_image_input(self, image_input)
- _process_video_input(self, video_input)

Co-authored-by: zheliuyu
[15750543867@163.com](mailto:15750543867@163.com)
Co-authored-by: wangli
[wangli858794774@gmail.com](mailto:wangli858794774@gmail.com)

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.0
- vLLM main:
207b750e19

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-08-01 08:54:02 +08:00
e3b3ffb875 [Misc] Disable quantization in mindie_turbo (#2147)
### What this PR does / why we need it?
cherry pick #1749 from v0.9.1-dev
Since the interfaces in vllm-ascend have changed quickly, the quantization function in mindie_turbo is no longer needed and should be discarded.

Co-authored-by: zouyida [zouyida@huawei.com](mailto:zouyida@huawei.com)
Co-authored-by: wangli
[wangli858794774@gmail.com](mailto:wangli858794774@gmail.com)

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.10.0
- vLLM main:
207b750e19

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-08-01 08:53:00 +08:00
c62f346f5d Fixed 310p failure when using the sampler feature (#2151)
### What this PR does / why we need it?
Fixed a 310P failure when using the sampler feature.
The root cause: torch_npu.npu_top_k_top_p uses the operator aclnnApplyTopKTopP, which currently does not support 310P.
The first PR that introduced the issue is #1308.
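On SoCs without the fused operator, the same filtering can be expressed in plain PyTorch (a sketch, not the exact sampler code):

```python
import torch

def top_k_top_p(logits: torch.Tensor, k: int, p: float) -> torch.Tensor:
    """Mask logits outside top-k / nucleus top-p; works on any backend."""
    # Top-k: drop everything below the k-th largest logit per row.
    if k > 0:
        kth = torch.topk(logits, k, dim=-1).values[..., -1, None]
        logits = logits.masked_fill(logits < kth, float("-inf"))
    # Top-p: keep the smallest prefix of the sorted distribution whose
    # cumulative probability reaches p.
    if p < 1.0:
        sorted_logits, sorted_idx = torch.sort(logits, descending=True, dim=-1)
        probs = sorted_logits.softmax(dim=-1)
        cumprobs = probs.cumsum(dim=-1)
        remove = cumprobs - probs > p        # tokens strictly past the p mass
        sorted_logits = sorted_logits.masked_fill(remove, float("-inf"))
        logits = torch.full_like(logits, float("-inf")).scatter(
            -1, sorted_idx, sorted_logits)
    return logits
```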

### Does this PR introduce _any_ user-facing change?
No

- vLLM version: v0.10.0
- vLLM main:
207b750e19

Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-08-01 08:43:08 +08:00
86bdde1ca8 Enable pytest and yaml style accuracy test (#2073)
### What this PR does / why we need it?

This PR enabled pytest and yaml style accuracy test, users now can
enable accuracy test by running:

```bash
cd ~/vllm-ascend
pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
          --config ./tests/e2e/singlecard/models/configs/Qwen3-8B-Base.yaml \
          --report_output ./benchmarks/accuracy/Qwen3-8B-Base.md

pytest -sv ./tests/e2e/singlecard/models/test_lm_eval_correctness.py \
          --config-list-file ./tests/e2e/singlecard/models/configs/accuracy.txt
```

Closes: https://github.com/vllm-project/vllm-ascend/issues/1970

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?


- vLLM version: v0.10.0
- vLLM main:
2836dd73f1

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-07-31 21:39:13 +08:00
9c9a7cd90b [main] adapt usage of npu_moe_gating_top_k_softmax and remove envs.SELECT_GATING_TOPK_SOTFMAX_EXPERTS (#2112)
backport of v0.9.1-dev:
https://github.com/vllm-project/vllm-ascend/pull/1902

origin main npu_moe_gating_top_k_softmax:
https://github.com/vllm-project/vllm-ascend/pull/1355

- vLLM version: v0.10.0
- vLLM main:
055bd3978e

Signed-off-by: huangxialu <huangxialu1@huawei.com>
2025-07-31 21:05:56 +08:00
e8660d7978 ut:add ut for qwen2_5_vl (#2143)
### What this PR does / why we need it?
add ut for qwen2_5_vl

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
not involved

- vLLM version: v0.10.0
- vLLM main:
2836dd73f1

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
2025-07-31 20:46:17 +08:00
cb0a303080 ut:add e2e test for external launcher (#2091)
### What this PR does / why we need it?
This pr add e2e testcase to make sure initialize LLM by
external_launcher method is ok.

### Does this PR introduce _any_ user-facing change?
not involved
### How was this patch tested?
not involved

- vLLM version: v0.10.0
- vLLM main:
2836dd73f1

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
2025-07-31 20:37:42 +08:00
4c8842da65 [BugFix] Fix a bug of running chunked-prefill with torchair. (#1378) (#1844)
This PR fixes the bug `local variable 'decode_hs_or_q_c' referenced
before assignment` when running chunked-prefill with torchair. We should
calculate `decode_hs_or_q_c` whether or not torchair graph mode is enabled.

backport of #1378
fix https://github.com/vllm-project/vllm-ascend/issues/1369


- vLLM version: v0.10.0
- vLLM main:
0e36abf993

---------

Signed-off-by: whx-sjtu <2952154980@qq.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: whx-sjtu <2952154980@qq.com>
2025-07-31 20:08:45 +08:00
db310c6ec9 add ut for device allocator/camem and mutistream/layers (#2037)
What this PR does / why we need it?

Test device allocator/camem and multistream/layers, which contain resource allocation and stream ops.

Does this PR introduce any user-facing change?

N/A

How was this patch tested?

CI passed with the newly added tests.


- vLLM version: v0.10.0
- vLLM main:
2836dd73f1

Signed-off-by: 1024daniel <xxltju324@gmail.com>
2025-07-31 19:17:27 +08:00
2008152c48 [main][bugfix]Fix vLLM startup failure when inferring DeepSeek R1 model in DP scenario (#2020)
### What this PR does / why we need it?
Fix vLLM startup failure when inferring DeepSeek R1 model in DP
scenario.
When running vLLM inference for the DeepSeek R1 model in DP32+TP1
configuration, the vLLM service fails to start with the following error.
<img width="1786" height="918" alt="21b2011042d4f77f36f5243fa64d9c18"
src="https://github.com/user-attachments/assets/df1963fe-587e-43ca-822e-a9094d0034fb"
/>
The root cause is a missing else branch after [this line of
code](d629f0b2b5/vllm_ascend/ops/fused_moe.py (L1411)).
This PR fixes the issue.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed with new added/existing test.


- vLLM version: v0.10.0
- vLLM main:
5bbaf492a6

---------

Signed-off-by: zhanghaiwen <zhanghaiwen@cmss.chinamobile.com>
Co-authored-by: zhanghaiwen <zhanghaiwen@cmss.chinamobile.com>
2025-07-31 15:30:28 +08:00
7c90ba5fe8 [Test] add ut for decorator.py/deepseek_mtp.py (#2127)
### What this PR does / why we need it?
add ut for decorator.py/deepseek_mtp.py
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
CI passed with new tests
- vLLM version: v0.10.0
- vLLM main:
055bd3978e

---------

Signed-off-by: CaranLic <740821011@qq.com>
2025-07-31 15:21:15 +08:00
6192bc95c0 [Bugfix] fix tensor not same device in qwen2_5_vl_without_padding (#2051)
bugfix cherry-pick from v0.9.1-dev
https://github.com/vllm-project/vllm-ascend/pull/2007
### What this PR does / why we need it?
Minimum reproducing code:
```python
# test.py
from vllm import LLM, SamplingParams
 
prompts = [
    "Hello, my name is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM(model="Qwen2.5-VL-7B-Instruct", max_model_len=26240)
 
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    
```
```bash
export USE_OPTIMIZED_MODEL=0
python test.py
```
exception as follow:
```
[rank0]:   File "/home/xxx/vllm_ascend/models/qwen2_5_vl_without_padding.py", line 84, in forward
[rank0]:     q = torch_npu.npu_rotary_mul(q, cos, sin)
[rank0]:   File "/home/anaconda3/envs/xxx/lib/python3.10/site-packages/torch/_ops.py", line 1116, in __call__
[rank0]:     return self._op(*args, **(kwargs or {}))
[rank0]: RuntimeError: Expected all tensors to be on the same device, but found at least two devices, npu:0 and cpu! (when checking argument for argument r1 in method wrapper__npu_rotary_mul)
```

In `AscendQwen2_5_VisionAttention_Without_Padding`, `torch_npu.npu_rotary_mul(q, cos, sin)` fails because `cos`/`sin` are on CPU while `q` is on NPU.

`qwen2_5_vl_without_padding.py` needs this bugfix because `AscendQwen2_5_VisionTransformer_Without_Padding.rot_pos_emb` in qwen2_5_vl_without_padding.py comes from vLLM, where `inv_freq` is created on CPU.

40d86ee412/vllm/model_executor/models/qwen2_5_vl.py (L482)
```python
inv_freq = 1.0 / (theta**(torch.arange(0, dim, 2, dtype=torch.float, device='cpu') / dim))
```
`qwen2_5_vl.py` does not need the fix, because qwen2_5_vl.py rewrites `AscendQwen2_5_VisionRotaryEmbedding` and `inv_freq` is created on the device.
```python
inv_freq = 1.0 / (theta**(torch.arange(0, dim, 2, dtype=torch.float) / dim))
```
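A minimal illustration of the device difference (pure torch; the device string is illustrative):

```python
import torch

dim, theta, seqlen = 128, 10000.0, 16
device = "npu:0" if hasattr(torch, "npu") and torch.npu.is_available() else "cpu"

# Upstream version: inv_freq pinned to CPU, so the derived cos/sin end up
# on CPU and clash with the NPU-resident q inside npu_rotary_mul.
inv_freq_cpu = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float,
                                             device="cpu") / dim))

# Rewritten version: let inv_freq follow the target device so the derived
# cos/sin tensors live where q lives.
inv_freq = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=torch.float,
                                         device=device) / dim))
freqs = torch.outer(torch.arange(seqlen, device=inv_freq.device,
                                 dtype=torch.float), inv_freq)
cos, sin = freqs.cos(), freqs.sin()   # same device as q, no mismatch
```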

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
CI passed with new added/existing test.


- vLLM version: v0.10.0
- vLLM main:
18cc33dd60

Signed-off-by: pjgao <gaopengju3@huawei.com>
Co-authored-by: pjgao <gaopengju3@huawei.com>
2025-07-31 15:18:54 +08:00
72eceff94d [Bugfix] grammar_bitmask IndexError caused by outdated apply_grammar_bitmask method (#2022)
### What this PR does / why we need it?
Fix #2033 

Sync https://github.com/vllm-project/vllm/pull/14702 to solve
`grammar_bitmask` IndexError caused by outdated `apply_grammar_bitmask`
method

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Tested by upstream vllm


- vLLM version: v0.10.0
- vLLM main:
6e599eebe8

Signed-off-by: ApsarasX <apsarax@outlook.com>
2025-07-31 09:03:27 +08:00
75e28d0356 [Build][Ray] Fix protobuf version in Dockerfile (#2028)
### What this PR does / why we need it?
Fix protobuf version in Dockerfile to resolve `AttributeError: 'str'
object has no attribute 'DESCRIPTOR' when packaging message to dict`
using protobuf. will remove version specification after
https://github.com/ray-project/ray/pull/54910 is merged

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with existing test.

- vLLM version: v0.10.0
- vLLM main:
0e36abf993

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-30 22:49:20 +08:00
3386e09a40 ut:add ut for qwen2_vl.py (#2096)
### What this PR does / why we need it?
add ut for qwen2_vl.py

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
not involved

- vLLM version: v0.10.0
- vLLM main:
555e7225bc

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
2025-07-30 22:31:47 +08:00
936df1cb9b [Doc] Fix cann related urls (#2106)
### What this PR does / why we need it?
Fix cann related urls in installation doc.

### Does this PR introduce _any_ user-facing change?
The users install cann manually could use the correct url after this pr

### How was this patch tested?
N/A

- vLLM version: v0.10.0
- vLLM main:
5bbaf492a6

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-30 22:31:30 +08:00
4fcca137a7 [main][Feature] Support Qwen3 W4A8 quantization (#2060)
### What this PR does / why we need it?

Add `W4A8_DYNAMIC` quantization support for linear layers.
Dense models like Qwen3 can run inference with `W4A8_DYNAMIC` quantization.

### Does this PR introduce _any_ user-facing change?

None

### How was this patch tested?

Adding ut case in `tests/ut/quantization/test_w4a8_dynamic.py`
Adding e2e case in
`tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC`
to test qwen3 w4a8_dynamic quantized model

Note the w4a8_dynamic quantized model is quantized by `msit/msmodelslim`
of commit `d0abb0a47e1f1a473b866ad41b737fbc28fb1409`

1. Generate `W4A8_DYNAMIC` quantization weights using `msmodelslim`
```shell
git clone https://gitee.com/ascend/msit.git
cd msit/msmodelslim
git checkout d0abb0a47e1f1a473b866ad41b737fbc28fb1409
bash install.sh
```

2. Serve model using `vllm`
```shell
VLLM_USE_V1=1 python -m vllm.entrypoints.openai.api_server \
  --model vllm-ascend/Qwen3-8B-W4A8 \
  --port 8000 \
  --quantization ascend \
  --tensor_parallel_size 2 \
  --enforce-eager
```

- vLLM version: v0.10.0
- vLLM main:
4cd7fe6cea

---------

Signed-off-by: ZhouXiang <zhouxiang100@huawei.com>
2025-07-30 14:57:14 +08:00
6874d666fa [CI]Add e2e test for 310p (#1879)
### What this PR does / why we need it?
Add e2e test for 310p:
- trigger conditions: tag, labels (ready-for-test, e2e-310p-test), schedule
- image: m.daocloud.io/quay.io/ascend/cann:8.1.rc1-310p-ubuntu22.04-py3.10
- runner: linux-aarch64-310p-1, linux-aarch64-310p-4
- models: IntervitensInc/pangu-pro-moe-model, Qwen/Qwen3-0.6B-Base, Qwen/Qwen2.5-7B-Instruct

- vLLM version: v0.10.0
- vLLM main:
b917da442b

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-07-30 14:52:16 +08:00
34dd24adf2 add ut for vocab_parallel_embedding (#2067)
### What this PR does / why we need it?

Test vllm_ascend/ops/vocab_parallel_embedding.py, which contains the vocab parallel embedding forward.

CI passed with new added test.

vLLM version: v0.10.0
vLLM main:
2cc571199b


- vLLM version: v0.10.0
- vLLM main:
05cbbe20c5

Signed-off-by: chengyuan <chengyuan27@huawei.com>
Co-authored-by: chengyuan <chengyuan27@huawei.com>
2025-07-30 14:35:45 +08:00
d9f82ebfce [misc] Add reminder comment when PR submitted (#2092)
### What this PR does / why we need it?
Add reminder comment when PR submitted

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Tested locally:
https://github.com/Yikun/vllm-ascend/pull/51#issuecomment-3132425126
This will take effect after the PR is merged.


- vLLM version: v0.10.0
- vLLM main:
0e36abf993

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-30 10:14:33 +08:00
1dbb888275 [Bugfix] LoRA logits einsum dimension mismatch in add_lora_logits (#1583)
### What this PR does / why we need it?
This PR fixes a tensor shape mismatch in `add_lora_logits`.

Previously, `lora_a_stacked` was passed as shape `[num_loras, in_dim,
rank]`, which does not match the expected einsum pattern `"bi, boi ->
bo"` used in `bgmv_shrink`.

This causes runtime errors like:
RuntimeError: einsum(): subscript i has size 3 for operand 1 which does
not broadcast with previously seen size 4

![image](https://github.com/user-attachments/assets/63029479-49ae-4c3c-b995-f6805d15ad06)

This fix transposes `lora_a_stacked` and `lora_b_stacked` to match the
expected shapes:
- `lora_a`: `[num_loras, rank, in_dim]`
- `lora_b`: `[num_loras, out_dim, rank]`

All unit tests pass after this fix.
### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
```
import torch
import pytest
from unittest.mock import patch, PropertyMock, ANY
from vllm_ascend.lora.punica_wrapper.punica_npu import PunicaWrapperNPU

@pytest.fixture
def wrapper_cpu():
    cfg = {"max_num_batched_tokens": 10, "max_batches": 2, "device": "cpu"}
    w = PunicaWrapperNPU(**cfg)
    w.is_prefill = True
    w.no_lora = False
    return w

def test_add_lora_logits(wrapper_cpu):
    batch_size = 2
    hidden_size = 4
    lora_rank = 3
    vocab_size = 5
    
    y = torch.zeros(batch_size, vocab_size)
    x = torch.randn(batch_size, hidden_size)
    
    num_loras = 1
    lora_a = torch.randn(num_loras, hidden_size, lora_rank)
    lora_b = torch.randn(num_loras, lora_rank, vocab_size)
    
    with patch.object(wrapper_cpu.__class__, "sampler_indices", 
                     new_callable=PropertyMock) as mock_idx:

        mock_idx.return_value = torch.zeros(batch_size, dtype=torch.long)

        wrapper_cpu.add_lora_logits(y, x, lora_a, lora_b, scale=1.0)

        assert y.shape == (batch_size, vocab_size)
        assert not torch.allclose(y, torch.zeros_like(y))
```

Signed-off-by: hongfugui <hongfugui_yewu@cmss.chinamobile.com>
2025-07-30 09:50:36 +08:00
d80b0cca5d [CI] Fix test on pyhccl to 2 cards (#2094)
### What this PR does / why we need it?
Fix test on pyhccl to 2 cards

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with existing test.
- vLLM version: v0.10.0
- vLLM main:
0d0cc9e150

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-30 09:08:00 +08:00
9b67c87b14 [Refactor]Refactor sampler (#2050)
Refactor the Sampler implementation from the patch-based approach to inheriting from the vLLM Sampler interface.

Next step: Make the op `TopKTopPSampler` in vLLM support custom ops
register mechanism

- vLLM version: v0.10.0
- vLLM main:
61a6905ab0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-30 08:47:22 +08:00
b6a7f07c70 [Perf][MoE] Improve MoE multistream parallel performance. (#1891)
This PR refines the shared-expert multi-stream parallelism of the w8a8-dynamic-quantized MoE stage to achieve better performance.

- vLLM version: v0.10.0
- vLLM main:
2cc571199b

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-07-29 23:53:19 +08:00
4df8e0027c [e2e]Fixed the issue that pyhccl e2e cannot run continuously with other tests (#1246)
### What this PR does / why we need it?
1. Fixed the issue that pyhccl e2e cannot run continuously with other tests.
2. Cleaned up the resources occupied by the dynamic_npugraph_batchsize e2e test.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
This is a e2e test

e2e multi-cards tests local running successfully.


- vLLM version: v0.9.2
- vLLM main:
0df4d9b06b

Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-07-29 19:38:30 +08:00
61fc35184b [Doc] Add performance tuning doc to main (#1392)
### What this PR does / why we need it?
Add performance tuning doc to main.

Closes: https://github.com/vllm-project/vllm-ascend/issues/1387


- vLLM version: v0.9.1
- vLLM main:
923147b5e8

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com>
2025-07-29 19:36:34 +08:00
540336edc9 Add Custom Kernels For LoRA Performance (#1884)
### What this PR does / why we need it?
Add two custom kernels (bgmv_shrink and bgmv_expand) to improve LoRA performance.
### Does this PR introduce _any_ user-facing change?
no user-facing change
### How was this patch tested?
We add unit test files for the custom AscendC kernels; see
vllm-ascend/tests/e2e/singlecard/ops/test_bgmv_shrink.py and
vllm-ascend/tests/e2e/singlecard/ops/test_bgmv_expand.py.
Based on actual tests of the Qwen2.5 7B model using vllm-ascend v0.9.2rc1, TTFT, TPOT and throughput have improved by about 70%.

- vLLM version: v0.9.2
- vLLM main:
40d86ee412

---------

Signed-off-by: taoxudonghaha <justsheldon@163.com>
2025-07-29 19:27:50 +08:00
2da281ec5a bump default python version to 3.11 (#2072)
### What this PR does / why we need it?
Bump default python version to 3.11, see #1980 

### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
pass CI

- vLLM version: v0.10.0
- vLLM main:
12a223ef9b

Signed-off-by: ChenTaoyu-SJTU <ctynb@qq.com>
2025-07-29 19:07:17 +08:00
f60bb474f9 [CI] Enable linux-aarch64-a2 (64GB) and tp2 * 2 max-parallel to speed up CI (#2065)
### What this PR does / why we need it?
Currently our workflow takes about 3 hours in total, which seriously affects the developer experience, so an optimization is urgently needed. After this PR, the running time of the full CI is expected to be shortened to about 1h40min.

- Enable linux-aarch64-a2 (64GB) to replace linux-arm64-npu (32GB)
- Change TP4 ---> TP2 * 2 max-parallel
- Move DeepSeek-V2-Lite-W8A8 to single card test

### Does this PR introduce _any_ user-facing change?
No


- vLLM version: v0.10.0
- vLLM main:
a2480251ec

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-29 18:59:05 +08:00
ca8007f584 [Feature] Enable inference support for Deepseekr1-w8a8-MTP (#1994)
Support the inference of the Deepseekr1-w8a8-mtp model with
statically-quantized shared_head in MTP layers.

- vLLM version: v0.9.2
- vLLM main:
6eca337ce0

Signed-off-by: curryliu <120010041@link.cuhk.edu.cn>
2025-07-29 18:51:57 +08:00
98cadc2146 [Perf] Avoid performing index selection of sin/cos cache every layer (#1890)
Reduce the number of index-select operations on the sin/cos cache by avoiding re-selection in every layer.
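The idea, as a rough sketch: select the sin/cos rows for the current positions once per forward step and let every layer reuse the result (names are illustrative):

```python
import torch

class RotaryCache:
    def __init__(self, max_len: int, head_dim: int, device: str = "cpu"):
        inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2,
                                                   dtype=torch.float,
                                                   device=device) / head_dim))
        t = torch.arange(max_len, dtype=torch.float, device=device)
        freqs = torch.outer(t, inv_freq)
        self.cos_cache, self.sin_cache = freqs.cos(), freqs.sin()
        self._cos = self._sin = None

    def select_once(self, positions: torch.Tensor) -> None:
        # Called once per forward step, before the layer loop.
        self._cos = self.cos_cache.index_select(0, positions)
        self._sin = self.sin_cache.index_select(0, positions)

    def get(self):
        # Every decoder layer reuses the same selected tensors.
        return self._cos, self._sin
```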

- vLLM version: v0.10.0
- vLLM main:
656c24f1b5

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-07-29 18:06:45 +08:00
0190b68f51 [Misc]Remove PD v0 code (#2047)
Cleanup V0 disaggregated prefill code for V0 Engine.

part of https://github.com/vllm-project/vllm-ascend/issues/1620

TODO: enable v1 e2e test.

- vLLM version: v0.10.0
- vLLM main:
2cc571199b

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-28 19:09:22 +08:00
935e9d4c9d Pin transformers to fix v0.9.1 doctest (#2048)
### What this PR does / why we need it?
Pin transformers to fix v0.9.1 doctest

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
doctest passed


- vLLM version: v0.10.0
- vLLM main:
c657369841

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-28 17:51:56 +08:00
1a25b0a2dd [Test] add ut for qwen3_moe.py (#2055)
### What this PR does / why we need it?
Add ut for qwen3_moe.py

### Does this PR introduce _any_ user-facing change?
No.


- vLLM version: v0.10.0
- vLLM main:
18cc33dd60

Signed-off-by: huangxialu <huangxialu1@huawei.com>
2025-07-28 17:37:13 +08:00
e7d32ed3f1 [BugFix] Fix the problem that torchair doesn't support tp > 4. (#1508)
This PR removes the restriction that TP cannot be greater than 4 in the torchair scenario, because the newest version of CANN has fixed this bug.

- vLLM version: v0.10.0
- vLLM main:
04ff4be310

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-07-28 16:48:05 +08:00
4a008c4dac [Misc]Clean up useless import from vllm (#2049)
Clean up useless imports from vllm to make the code clearer.

- vLLM version: v0.10.0
- vLLM main:
18cc33dd60

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-28 16:01:59 +08:00
34cfdf5520 [Misc] Fix logger bug (#2024)
1. Remove useless logger
2. Fix logger bug, same problem as
https://github.com/vllm-project/vllm-ascend/pull/515

- vLLM version: v0.10.0
- vLLM main:
18cc33dd60

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-28 15:59:09 +08:00
3ad582c9a9 [Test] Add ut for files in /attention (#1944)
### What this PR does / why we need it?
Add ut for files in folder /attention
### Does this PR introduce _any_ user-facing change?
No


- vLLM version: v0.10.0
- vLLM main:
139a7f07bd

---------

Signed-off-by: lwq <liwenquan5@huawei.com>
Co-authored-by: lwq <liwenquan5@huawei.com>
2025-07-28 15:54:40 +08:00
32a9c5f694 [Feature]: implement the fusion of allreduce and matmul in prefill phase when tp is enabled (#1926)
### What this PR does / why we need it?
vLLM's RowParallelLinear forward function executes the allreduce and matmul separately. This change uses torch_npu.npu_mm_all_reduce_base to execute the allreduce and matmul as a fused kernel, which gains about a 20% performance improvement in eager mode.
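Schematically (the fused call below is a placeholder argument; the real op is `torch_npu.npu_mm_all_reduce_base`, whose exact signature is not reproduced here):

```python
import os
import torch
import torch.distributed as dist

VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE = os.getenv(
    "VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE", "0") == "1"

def row_parallel_forward(x: torch.Tensor, weight: torch.Tensor,
                         fused_mm_all_reduce=None) -> torch.Tensor:
    """RowParallelLinear-style forward: local matmul + cross-TP all-reduce."""
    if VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE and fused_mm_all_reduce is not None:
        # Fused path: one kernel performs the matmul together with the
        # all-reduce (in vllm-ascend this is torch_npu.npu_mm_all_reduce_base).
        return fused_mm_all_reduce(x, weight)
    # Default path: two separate steps.
    out = torch.matmul(x, weight.t())
    dist.all_reduce(out)
    return out
```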
### Does this PR introduce _any_ user-facing change?
This PR introduces a new env `VLLM_ASCEND_ENABLE_MATMUL_ALLREDUCE` to control whether the feature is enabled.

### How was this patch tested?
the patch is tested by adding a new test file `test_patch_linear.py` to
guard the ut


- vLLM version: v0.10.0
- vLLM main:
7728dd77bb

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
2025-07-28 15:13:37 +08:00
ba3dfbd59e [main][refactor] Refactoring forward_context and model_runner_v1 (#1979)
### What this PR does / why we need it?

A refactoring of forward_context and model_runner_v1: add context that is necessary for model inference into forward_context, and refactor the dummy_run logic to make it more reasonable.
Some details for this PR:

- Add `ascend_forward_context`;
- Update the mc2_v2 op and support the `active_mask` param;
- Update scripts in the examples dir;
- Refactor the `dummy_run` logic;
- Add soc_version for A2 and A3.

### Does this PR introduce _any_ user-facing change?

No change at user-facing.

### How was this patch tested?


- vLLM version: v0.10.0
- vLLM main:
57c22e57f9

Signed-off-by: zzzzwwjj <1183291235@qq.com>
2025-07-28 14:06:20 +08:00
e3a2443c3a [main][Doc] add mla pertoken quantization FAQ (#2018)
### What this PR does / why we need it?
When using DeepSeek series model weights generated with the --dynamic parameter, if torchair graph mode is enabled, we should modify the configuration file in the CANN package to prevent incorrect inference results.

- vLLM version: v0.10.0
- vLLM main:
7728dd77bb

---------

Signed-off-by: Wang Kunpeng <1289706727@qq.com>
2025-07-27 08:47:51 +08:00
5b579ddafe Upgrade CANN to 8.2.RC1 (A3) (#2043)
### What this PR does / why we need it?
Upgrade CANN to 8.2.RC1

### Does this PR introduce _any_ user-facing change?
Yes, A3 image are using 8.2.rc1

### How was this patch tested?
CI passed
- vLLM version: v0.10.0
- vLLM main:
de509ae8eb

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-26 23:10:27 +08:00
ed2ab8a197 [CI/Build] Upgrade CANN to 8.2.RC1 (#1653)
### What this PR does / why we need it?
Upgrade CANN to 8.2.rc1

Backport: https://github.com/vllm-project/vllm-ascend/pull/1653

### Does this PR introduce _any_ user-facing change?
Yes, docker image will use 8.2.RC1

### How was this patch tested?
CI passed

- vLLM version: v0.10.0
- vLLM main:
7728dd77bb

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-26 22:37:46 +08:00
d1c640841b [Bugfix] Fix num_hidden_layers when Qwen2-Audio 7B (#1803)
### What this PR does / why we need it?
Fix num_hidden_layers for Qwen2-Audio 7B and #1760:
```
INFO 07-15 04:38:53 [platform.py:174] PIECEWISE compilation enabled on NPU. use_inductor not supported - using only ACL Graph mode
Traceback (most recent call last):
  File "/workspace/test1.py", line 58, in <module>
    main(audio_count)
  File "/workspace/test1.py", line 38, in main
    llm = LLM(model="Qwen/Qwen2-Audio-7B-Instruct",
  File "/vllm-workspace/vllm/vllm/entrypoints/llm.py", line 271, in __init__
    self.llm_engine = LLMEngine.from_engine_args(
  File "/vllm-workspace/vllm/vllm/engine/llm_engine.py", line 494, in from_engine_args
    vllm_config = engine_args.create_engine_config(usage_context)
  File "/vllm-workspace/vllm/vllm/engine/arg_utils.py", line 1286, in create_engine_config
    config = VllmConfig(
  File "/usr/local/python3.10.17/lib/python3.10/site-packages/pydantic/_internal/_dataclasses.py", line 123, in __init__
    s.__pydantic_validator__.validate_python(ArgsKwargs(args, kwargs), self_instance=s)
  File "/vllm-workspace/vllm/vllm/config.py", line 4624, in __post_init__
    current_platform.check_and_update_config(self)
  File "/vllm-workspace/vllm-ascend/vllm_ascend/platform.py", line 180, in check_and_update_config
    update_aclgraph_sizes(vllm_config)
  File "/vllm-workspace/vllm-ascend/vllm_ascend/utils.py", line 307, in update_aclgraph_sizes
    num_hidden_layers = vllm_config.model_config.hf_config.num_hidden_layers
  File "/usr/local/python3.10.17/lib/python3.10/site-packages/transformers/configuration_utils.py", line 211, in __getattribute__
    return super().__getattribute__(key)
AttributeError: 'Qwen2AudioConfig' object has no attribute 'num_hidden_layers'
```

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

Closes: https://github.com/vllm-project/vllm-ascend/issues/1780
https://github.com/vllm-project/vllm-ascend/issues/1760
https://github.com/vllm-project/vllm-ascend/issues/1276
https://github.com/vllm-project/vllm-ascend/issues/359

- vLLM version: v0.10.0
- vLLM main:
7728dd77bb

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-07-26 20:13:00 +08:00
df0ec55162 Disaggregate prefill for kv cache register style (#950)
### What this PR does / why we need it?
This PR adopts `LLMDataDist` for kv cache registration and a `pull_blocks`-style disaggregated prefill implementation. The interface implementation mainly follows the design of the NIXL PR
https://github.com/vllm-project/vllm/pull/17751/files#diff-7eaad0b7dee0626bf29d10081b0f0c5e3ea15a4af97e7b182a4e0d35f8346953.

This PR can be tested with the following steps:
- Generate the rank table for all machines.
- Execute `toy_proxy.py` to launch the disaggregated prefill proxy server, specifying the prefill ip/port and the decode ip/port.
- Run the prefill server and the decode server.
- Send the request to the disaggregated prefill proxy.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.9.2
- vLLM main:
8d0a01a5f2

---------

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
Signed-off-by: machenglong <machenglong_yewu@cmss.chinamobile.com>
Signed-off-by: liziyu179 <3475441767@qq.com>
Signed-off-by: underfitc <hucong24@huawei.com>
Signed-off-by: zouyida2052 <zouyida@huawei.com>
Signed-off-by: liziyu <liziyu16@huawei.com>
Signed-off-by: underfituu <hzhucong@163.com>
Co-authored-by: machenglong <machenglong_yewu@cmss.chinamobile.com>
Co-authored-by: liziyu179 <3475441767@qq.com>
Co-authored-by: underfitc <hucong24@huawei.com>
Co-authored-by: zouyida2052 <zouyida@huawei.com>
Co-authored-by: liziyu <liziyu16@huawei.com>
Co-authored-by: underfituu <hzhucong@163.com>
2025-07-26 17:15:47 +08:00
17a430f7b8 Upgrade vLLM to v0.10.0 (#1927)
### What this PR does / why we need it?
- Upgrade to v0.10.0
- Drop v0.9.2 version compatibility
- Add patch for
`vllm_ascend/patch/worker/patch_common/patch_sampler_gather_logprobs.py`
as workaround of
f3a683b7c9
for v0.10.0 and also add e2e test `test_models_prompt_logprobs`
- Pin transformers<4.54.0 as workaround of
https://github.com/vllm-project/vllm-ascend/issues/2034

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- Test locally:
`VLLM_USE_MODELSCOPE=true pytest -sv
tests/e2e/singlecard/test_offline_inference.py::test_models_prompt_logprobs`
- CI passed

- vLLM version: v0.9.2
- vLLM main:
7728dd77bb

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-26 15:43:29 +08:00
2f50304c19 [Bugfix] Add get_supported_tasks interface to fix broken CI (#2023)
### What this PR does / why we need it?
Added `get_supported_tasks` interface to adapt to vllm
[changes](46d81d6951 (diff-80ee7e2a62f9dcfbb8a312dc4e3948557e97ef187290daebbcae1e28596bda29))
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.9.2
- vLLM main:
5ac3168ee3

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-26 08:20:21 +08:00
bdfb065b5d [1/2/N] Enable pymarkdown and python __init__ for lint system (#2011)
### What this PR does / why we need it?
1. Enable pymarkdown check
2. Enable python `__init__.py` check for vllm and vllm-ascend
3. Make clean code

### How was this patch tested?


- vLLM version: v0.9.2
- vLLM main:
29c6fbe58c

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-25 22:16:10 +08:00
d629f0b2b5 [CI] Remove transformers installation (#2014)
### What this PR does / why we need it?
Remove the transformers installation. The transformers version bug has been fixed by
e936e401de.
We are now safe to remove the version limit.

- vLLM version: v0.9.2
- vLLM main:
40d86ee412

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-25 15:20:37 +08:00
e561a2c6ec ut:add ut for qwen2_5_vl_without_padding.py (#1988)
### What this PR does / why we need it?
this pr is to add ut for qwen2_5_vl_without_padding.py

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
this is only a ut test


- vLLM version: v0.9.2
- vLLM main:
9c8b2c2a8a

Signed-off-by: Ronald1995 <ronaldautomobile@163.com>
2025-07-25 14:12:44 +08:00
ae560f7131 [Test] Add uts for files in /core (#1957)
### What this PR does / why we need it?

Add uts for files in folder /core

### Does this PR introduce _any_ user-facing change?

No

- vLLM version: v0.9.2
- vLLM main:
5a19a6c670

---------

Signed-off-by: lwq <liwenquan5@huawei.com>
Co-authored-by: lwq <liwenquan5@huawei.com>
2025-07-25 09:48:19 +08:00
6bc82cf6a7 Enable image push CI for build file and csrc has changes (#1977)
### What this PR does / why we need it?
- Fixes image CI

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed with new added/existing test.


- vLLM version: v0.9.2
- vLLM main:
f3137cdd81

Signed-off-by: Icey <1790571317@qq.com>
2025-07-24 21:19:41 +08:00
cfdd45ed00 [Bug] Fix duplicate 'torch.' prefix in qwen-vl (#1986)
Signed-off-by: wuzhongjian <wuzhongjian_yewu@cmss.chinamobile.com>

### What this PR does / why we need it?
Fix duplicate 'torch.' prefix in qwen2-vl, qwen2.5-vl

- vLLM version: v0.9.2
- vLLM main:
dde295a934
2025-07-24 20:16:00 +08:00
84fc7402c3 [Misc] Refactor AscendMetaData Comments to Make It Clearer (#1967)
### What this PR does / why we need it?
Refactor the comments of `AscendMetaData` to make it clearer.

- vLLM version: v0.9.2
- vLLM main:
f3137cdd81

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-24 19:31:36 +08:00
ff97740b8d Use mirror images (#1912)
### What this PR does / why we need it?
More discussion can be found
[here](https://github.com/ascend-gha-runners/docs/issues/23).

The infra team deployed an internal registry since both `m.daocloud.io` and `quay.io` suffered from unstable connection quality.

CI will benefit in both connection stability and download speed by switching to the internal registry.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
tested locally

- vLLM version: v0.9.2
- vLLM main:
6b46c4b653

---------

Signed-off-by: mywaaagh_admin <pkwarcraft@gmail.com>
2025-07-24 10:47:05 +08:00
ab7d5aca5d [Test] Add ut for files in /multistream (#1947)
### What this PR does / why we need it?
Add some uts for files in folder /multistream

### Does this PR introduce _any_ user-facing change?
No

- vLLM version: v0.9.2
- vLLM main:
b77c7d327f

Signed-off-by: lwq <liwenquan5@huawei.com>
Co-authored-by: lwq <liwenquan5@huawei.com>
2025-07-24 10:42:49 +08:00
34571ea5ae [Test] Add ut for files in /distributed (#1951)
### What this PR does / why we need it?
Add some ut for files in folder /distributed

### Does this PR introduce _any_ user-facing change?
No

- vLLM version: v0.9.2
- vLLM main:
107111a859

Signed-off-by: lwq <liwenquan5@huawei.com>
Co-authored-by: lwq <liwenquan5@huawei.com>
2025-07-24 10:36:11 +08:00
fa76a9b7bb [Bug] Add prefix parameter to parent class initialization (#1934)
Signed-off-by: wuzhongjian <wuzhongjian_yewu@cmss.chinamobile.com>

### What this PR does / why we need it?
Add prefix parameter to parent class initialization to avoid parameter
naming conflicts

### Does this PR introduce _any_ user-facing change?
No


- vLLM version: v0.9.2
- vLLM main:
32142b3c62
2025-07-24 10:28:40 +08:00
2ffe051859 [Test]add ut for deepseek_v2. (#1964)
What this PR does / why we need it?
Add uts for deepseek_v2

Does this PR introduce any user-facing change?
No

How was this patch tested?
- vLLM version: v0.9.2
- vLLM main:
f3137cdd81

---------

Signed-off-by: 张帮政 <zhangbangzheng@huawei.com>
2025-07-24 10:27:50 +08:00
846555cdb5 [Misc] Clean up useless code in attention (#1933)
Before doing the attention module refactor, we can do some code cleanup to make the next step easier.

What this PR does:

1. Remove the useless `common_prefix_len` for the attention builder.
2. Remove the useless `is_only_prefill` and `num_input_tokens` in attention metadata.
3. Remove `CommonAttentionMetadata` and use `query_start_loc` instead; `CommonAttentionMetadata` is over-designed and useless.
4. Update the attention backend input parameters to keep them the same as vLLM.
5. Rename the attention backends to the same style with an `ASCEND` prefix.

- vLLM version: v0.9.2
- vLLM main:
107111a859

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-24 10:23:34 +08:00
b5ad70e1a6 [Optimize]Change AI Vector core number getting function to glibc ABI free function (#1974)
### What this PR does / why we need it?
Change the AI Vector core number getting function to a glibc-ABI-free function. After this PR is merged, there should be no glibc ABI problems when bumping the torch version to 2.7.1.

### Does this PR introduce _any_ user-facing change?
No

- vLLM version: v0.9.2
- vLLM main:
f59ec35b7f

Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-07-24 10:00:19 +08:00
ac0bf133f4 add ut of fused_moe.py (#1930)
### What this PR does / why we need it?
add unit test for fused_moe.py

- vLLM version: v0.9.2
- vLLM main:
2dec7c1a5d

Signed-off-by: yangcheng <yangcheng104@huawei.com>
Co-authored-by: yangcheng <yangcheng104@huawei.com>
2025-07-23 16:24:09 +08:00
ac773aca43 Add UT for Patches (#1766)
### What this PR does / why we need it?
Add UT for patches in vLLM Ascend
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
Irrelevant

- vLLM version: v0.9.2
- vLLM main:
107111a859

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
2025-07-23 16:07:20 +08:00
326dcf2576 [Doc] Update support feature (#1828)
The feature support matrix is out of date. This PR refreshes the content.

- vLLM version: v0.9.2
- vLLM main:
107111a859

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-23 15:19:15 +08:00
3aa3b46bfe [V1][PP] Support pp with ray backend in V1 (#1800)
### What this PR does / why we need it?
Support pipeline parallel with ray backend in V1Engine.

Fixes #1751

### Does this PR introduce _any_ user-facing change?
Users can specify ray as the distributed backend when running inference with PP.
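
For illustration, a minimal offline sketch of how this could be used (the model name below is just a placeholder):

```python
from vllm import LLM, SamplingParams

# Hypothetical example: pipeline parallel size 2 with the ray backend on V1.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",  # placeholder model
    pipeline_parallel_size=2,
    distributed_executor_backend="ray",
)
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```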

### How was this patch tested?
CI passed with new added test.


- vLLM version: v0.9.2
- vLLM main:
32142b3c62

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-23 14:52:52 +08:00
9a3bdf2162 [main] Use AddRmsNormQuant ops in the custom model to optimize Qwen3's performance (#1806)
### What this PR does / why we need it?
Optimizes the performance of the Qwen3 quantization model by registering
a custom model and adding the AddRmsNormQuant operation. Subsequent PRs
will focus on performance optimizations based on this custom model.
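
For reference, a plain-PyTorch sketch of the math that an add + RMSNorm + quant fusion covers (illustrative only; this is not the NPU kernel or the actual vllm-ascend code):

```python
import torch


def add_rms_norm_quant_ref(x, residual, weight, scale, eps: float = 1e-6):
    # Residual add -> RMSNorm -> per-tensor int8 quantization, expressed as
    # separate eager ops; a fused AddRmsNormQuant kernel does this in one pass.
    h = x + residual
    rms = torch.rsqrt(h.pow(2).mean(dim=-1, keepdim=True) + eps)
    normed = h * rms * weight
    q = torch.clamp(torch.round(normed / scale), -128, 127).to(torch.int8)
    return q, h  # quantized activation and the updated residual
```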

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with existing test.

- vLLM version: v0.9.2
- vLLM main:
8d0a01a5f2

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-07-22 19:03:13 +08:00
ce4970eee0 [Test] Add unit test for schedule_config.py (#1590)
What this PR does / why we need it?
According to issue
https://github.com/vllm-project/vllm-ascend/issues/1298 , this pull
request adds unit test code for schedule_config.py.

Does this PR introduce any user-facing change?
No

How was this patch tested?
CI passed with new added/existing test.

- vLLM version: v0.9.2
- vLLM main:
8d0a01a5f2
2025-07-22 11:43:25 +08:00
5f0b42e414 [FOLLOWUP] Use base test to avoid patch everwhere (#1634)
### What this PR does / why we need it?
Use base test to avoid patch everwhere.

Followup here: https://github.com/vllm-project/vllm-ascend/pull/1566

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
ut ci passed

- vLLM version: v0.9.2
- vLLM main:
8d0a01a5f2

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-22 09:03:40 +08:00
33e1ea4d1a [CI] Fix broken CI (#1915)
### What this PR does / why we need it?
Fix [#21227](https://github.com/vllm-project/vllm/pull/21227) to make ci
happy

- vLLM version: v0.9.2
- vLLM main:
6b46c4b653

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-22 08:38:30 +08:00
7265dc090d [2/4][Refactor] Refactor torchair utils (#1892)
There is a lot of torchair-specific logic in the common code, which makes it
hard to maintain. We will create a new torchair module to host the
torchair-related logic. I plan to add 4 PRs.

1. Refactor worker
2. Refactor utils (this PR)
- a simple change that moves all torchair-related util functions to the
torchair module
3. Refactor model_runner
4. Refactor attention

- vLLM version: v0.9.2
- vLLM main:
8188196a1c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-21 19:43:30 +08:00
957b0b611f [Misc][V0 Deprecation] Remove V0 Model Runner (#1823)
### What this PR does / why we need it?
Remove V0 model runner.

This PR is a part of
https://github.com/vllm-project/vllm-ascend/issues/1620.

- vLLM version: v0.9.2
- vLLM main:
7ba34b1241

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-21 16:35:50 +08:00
a66ef39bb6 [Misc][V0 Deprecation] Remove Redundant Offline Distributed Inference Example (#1899)
### What this PR does / why we need it?
The file `offline_distributed_inference_npu.py` is the same as
`offline_inference_npu_tp2.py`, thus we delete one of them.

This PR is a part of
https://github.com/vllm-project/vllm-ascend/issues/1620.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.9.2
- vLLM main:
8188196a1c

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-21 12:01:45 +08:00
af56ae3ed1 [1/4][Refactor] Refactor torchair worker (#1885)
There is a lot of torchair-specific logic in the common code, which makes it
hard to maintain. We will create a new torchair module to host the
torchair-related logic. I plan to add 4 PRs.

1. Refactor worker (this PR)
- create torchair module and move torchair related code in worker to the
new module
2. Refactor utils
3. Refactor model_runner
4. Refactor attention


- vLLM version: v0.9.2
- vLLM main:
8188196a1c

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-21 11:50:46 +08:00
c32eea96b7 [Doc]Add Chinese translation for documentation (#1870)
### What this PR does / why we need it?

This PR adds a complete Chinese translation for the documentation using
PO files and the gettext toolchain. The goal is to make the
documentation more accessible to Chinese-speaking users and help the
community grow.

### Does this PR introduce any user-facing change?

Yes. This PR introduces Chinese documentation, which users can access
alongside the original English documentation. No changes to the core
code or APIs.

### How was this patch tested?

The translated documentation was built locally using the standard
documentation build process (`make html` or `sphinx-build`). I checked
the generated HTML pages to ensure the Chinese content displays
correctly and matches the original structure. No code changes were made,
so no additional code tests are required.

vLLM version: v0.9.2  
vLLM main: vllm-project/vllm@5780121

---

Please review the translation and let me know if any improvements are
needed. I am happy to update the translation based on feedback.

- vLLM version: v0.9.2
- vLLM main:
7ba34b1241

---------

Signed-off-by: aidoczh <aidoczh@163.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-21 11:26:27 +08:00
8cfd257992 [Dist][EP] Remove ETP/EP maintained in vllm-ascend (#1681)
### What this PR does / why we need it?
Remove the ETP/EP maintained in branch main. We drop this as there are no
relevant scenarios for ETP now, and we may subsequently advocate
implementing expert tensor parallelism in vLLM to support scenarios
where the experts need to be sliced.

This is a part of #1422 backport.

Fixes https://github.com/vllm-project/vllm-ascend/issues/1396
https://github.com/vllm-project/vllm-ascend/issues/1154

### Does this PR introduce _any_ user-facing change?
We'll no longer maintain ETP/EP in vllm-ascend; use the TP/EP in vLLM
instead.

### How was this patch tested?
CI passed with new added and existing test.


- vLLM version: v0.9.2
- vLLM main:
fe8a2c544a

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-21 09:08:04 +08:00
a8b316ac5b [CI] Make AttentionBackend interface compatible to fix broken CI (#1893)
vLLM commit
752c6ade2e
removed `blocksparse_params` for attention backend. This PR does the
same change to make CI happy.


- vLLM version: v0.9.2
- vLLM main:
9499e26e2a

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-21 08:21:06 +08:00
54f2b31184 [Doc] Add a doc for qwen omni (#1867)
Signed-off-by: wuzhongjian <wuzhongjian_yewu@cmss.chinamobile.com>

### What this PR does / why we need it?
Add FAQ note for qwen omni
Fixes https://github.com/vllm-project/vllm-ascend/issues/1760 issue1



- vLLM version: v0.9.2
- vLLM main:
b9a21e9173
2025-07-20 09:05:41 +08:00
2b726d8f90 [CI] Fix broken CI (#1889)
1. vLLM commit
45badd05d0
changed the pooling check logic, which broke vLLM Ascend.
2. vLLM commit
3e04107d97
requires a higher version of transformers. The transformers version bug
has been fixed by
e936e401de,
so it is now safe to remove the version limit.
3. vLLM commit
217937221b
added a new input `enable_eplb` for the FusedMoE ops

This PR fixes the broken CI.


- vLLM version: v0.9.2
- vLLM main:
6a971ed692

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-20 02:11:57 +08:00
2ee90461d0 Fix e2e data parallel test: add resource release code (#1881)
### What this PR does / why we need it?
Fix the e2e data parallel test: add resource release code and give the
engines more time to pause their processing loops before exiting.

### Does this PR introduce _any_ user-facing change?
No

- vLLM version: v0.9.2
- vLLM main:
5895afd780

Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-07-19 11:39:48 +08:00
b824525be3 Move deepseek_v3 from deepseek_v2.py (#1793)
### What this PR does / why we need it?
Before this patch, we could see
`vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM`, which is
not a friendly format.

```
WARNING 07-14 23:57:34 [registry.py:413] Model architecture DeepseekV2ForCausalLM is already registered, and will be overwritten by the new model class vllm_ascend.models.deepseek_v2:CustomDeepseekV2ForCausalLM.
WARNING 07-14 23:57:34 [registry.py:413] Model architecture DeepseekV3ForCausalLM is already registered, and will be overwritten by the new model class vllm_ascend.models.deepseek_v2:CustomDeepseekV3ForCausalLM.
WARNING 07-14 23:57:34 [registry.py:413] Model architecture Qwen3MoeForCausalLM is already registered, and will be overwritten by the new model class vllm_ascend.models.qwen3_moe:CustomQwen3MoeForCausalLM.
```


### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Local Test.


- vLLM version: v0.9.2
- vLLM main:
bcdfb2a330

Signed-off-by: xleoken <xleoken@163.com>
2025-07-19 11:37:03 +08:00
ab68d31a24 [Misc][V0 Deprecation] Remove Cache Engine Used for V0 Worker (#1878)
### What this PR does / why we need it?
This PR is a part of
https://github.com/vllm-project/vllm-ascend/issues/1620.

- vLLM version: v0.9.2
- vLLM main:
5895afd780

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-19 09:42:32 +08:00
53d2ea3789 [Bugfix]Fix the performance gap between 0.9.2rc1 and 0.9.1 (#1811)
### What this PR does / why we need it?

maybe fixes
[#1728](https://github.com/vllm-project/vllm-ascend/issues/1728#issuecomment-3065083433)

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Test Qwen3-32B tp=4 with: 

```bash
vllm serve --port 1234 Qwen/Qwen3-32B \
    --served-model-name Qwen3-32B \
    --tensor-parallel-size 4 \
    --swap-space 16 \
    --max-model-len 6000 \
    --load-format dummy \
    --disable-log-stats \
    --disable-log-requests \
```

Request batch_size=128 input/output token=1024

**In 0.9.2rc1**

```text
=====================================================
Total TPS with    prefill(tokens/s)         : 785.1395
Total TPS without prefill                   : 846.6809
Mean TPS with    prefill                    : 6.1339
Mean TPS without prefill                    : 6.6147
=====================================================
Mean TTFT(ms)                               : 10307.8123
Max  TTFT(ms)                               : 21423.0733
Min  TTFT(ms)                               : 362.3602
=====================================================
Mean TPOT(ms)                               : 151.3051
Max  TPOT(ms)                               : 159.4649
Min  TPOT(ms)                               : 140.899
=====================================================
Total Time(s)                               : 175.6032
Request Throughput(requests/s)              : 0.7289
=====================================================
```

**Apply this PR**

```text
=====================================================
Total TPS with    prefill(tokens/s)         : 811.0014
Total TPS without prefill                   : 876.4423
Mean TPS with    prefill                    : 6.3359
Mean TPS without prefill                    : 6.8472
=====================================================
Mean TTFT(ms)                               : 10263.8382
Max  TTFT(ms)                               : 21151.2547
Min  TTFT(ms)                               : 375.9136
=====================================================
Mean TPOT(ms)                               : 146.1686
Max  TPOT(ms)                               : 154.0957
Min  TPOT(ms)                               : 136.8879
=====================================================
Total Time(s)                               : 169.8579
Request Throughput(requests/s)              : 0.7536
=====================================================
```

The TPOT performance gap between these two sets of data is about 3%.

- vLLM version: v0.9.2
- vLLM main:
8dfb45ca33

Signed-off-by: lianyibo <lianyibo1@kunlunit.com>
2025-07-18 23:09:54 +08:00
574fe407eb [1/N][CustomOp] Register activation customop instead of overwrite forward_oot (#1841)
### What this PR does / why we need it?
We'll refactor `CustomOp` in vllm-ascend from this PR on.

Use the function `CustomOp.register_oot` to achieve the custom-op
registration, taking `AscendQuickGELU` as an example:
```python
from vllm_ascend.ops.activation import AscendQuickGELU
CustomOp.register_oot(_decorated_op_cls=AscendQuickGELU, name="QuickGELU")
```
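
A fuller sketch of what such an out-of-tree op could look like (the class body below is illustrative, not the actual vllm-ascend implementation):

```python
import torch
from vllm.model_executor.custom_op import CustomOp
from vllm.model_executor.layers.activation import QuickGELU


class AscendQuickGELU(QuickGELU):
    def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
        # Same math as QuickGELU (x * sigmoid(1.702 * x)); a dedicated NPU
        # kernel could be dispatched here instead of the eager formula.
        return x * torch.sigmoid(1.702 * x)


# Register it so vLLM resolves "QuickGELU" to the Ascend implementation.
CustomOp.register_oot(_decorated_op_cls=AscendQuickGELU, name="QuickGELU")
```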

This is a quick adaptation of the `CustomOp.register_oot` mechanism from
vLLM 0.9.2. As a further step, we can remove the inheritance from
`QuickGELU` and write our own `QuickGELU` entirely.

Part of https://github.com/vllm-project/vllm-ascend/pull/1647



- vLLM version: v0.9.2
- vLLM main:
8dfb45ca33

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-18 23:07:14 +08:00
8a91e6e59c [Misc][V0 Deprecation] Remove V0 Related Custom Ops (#1871)
### What this PR does / why we need it?
This PR is a part of
https://github.com/vllm-project/vllm-ascend/issues/1620.

- vLLM version: v0.9.2
- vLLM main:
ca4eb82bcb

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-18 23:06:03 +08:00
3e39d7234c [CI] Switching to infra cache server to reduce network pressure (#1792)
### What this PR does / why we need it?
This PR introduce the infra cache server to speed up apt/pip package
installation

### Does this PR introduce _any_ user-facing change?
None

### How was this patch tested?
Tested locally. With this config, the network bandwidth usage drops from
100% to 5% when a new PR is submitted.
<img width="807" height="334" alt="image"
src="https://github.com/user-attachments/assets/16f03bce-4531-4c71-ab6e-8308dc2c022c"
/>


- vLLM version: v0.9.2
- vLLM main:
8dfb45ca33

---------

Signed-off-by: mywaaagh_admin <pkwarcraft@gmail.com>
2025-07-18 18:39:25 +08:00
d08ff304cd [Misc][V0 Deprecation] Remove V0 Attention (#1835)
### What this PR does / why we need it?
This PR is a part of
https://github.com/vllm-project/vllm-ascend/issues/1620.

- vLLM version: v0.9.2
- vLLM main:
8dfb45ca33

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-18 14:10:13 +08:00
33ef5dc813 add unit test for func wrapper (#1863)
### What this PR does / why we need it?
test func wrapper file

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with new added test.

- vLLM version: v0.9.2
- vLLM main:
8dfb45ca33

Signed-off-by: lixudong <lixudong@cmss.chinamobile.com>
2025-07-18 11:05:17 +08:00
f9dfde02fd [Bugfix] Fix broken CI (#1848)
### What this PR does / why we need it?
- Fix broken commit by
[#20927](https://github.com/vllm-project/vllm/pull/20927)
- Fix broken commit by
[#20466](https://github.com/vllm-project/vllm/pull/20466)
- TODO: adapt more fully to the upstream refactoring; for now, let's
make CI happy

- vLLM version: v0.9.2
- vLLM main:
11dfdf21bf

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-17 20:10:12 +08:00
538dd357e6 Add graph mode and improve on multi_npu_moge.md (#1849)
### What this PR does / why we need it?
Add graph mode and improve on multi_npu_moge.md

### Does this PR introduce _any_ user-facing change?
yes

### How was this patch tested?
CI passed with existing tests.


- vLLM version: v0.9.2
- vLLM main:
5a7fb3ab9e

Signed-off-by: GDzhu01 <809721801@qq.com>
2025-07-17 17:53:37 +08:00
aeb5aa8b88 [Misc][V0 Deprecation] Add __main__ guard to all offline examples (#1837)
### What this PR does / why we need it?
Add `__main__` guard to all offline examples.

- vLLM version: v0.9.2
- vLLM main:
76b494444f

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-17 14:13:30 +08:00
19e37cd379 [Misc] Add fusion_result.json to .gitignore (#1836)
### What this PR does / why we need it?
Add `fusion_result.json` to `.gitignore`.



- vLLM version: v0.9.2
- vLLM main:
72ad273582

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-17 11:54:49 +08:00
875a920d4a [Platform] Add support for Atlas A3 series (#1794)
### What this PR does / why we need it?
Add support for Ascend A3 and remove latest tag

### Does this PR introduce _any_ user-facing change?
Users can run vLLM on the Atlas A3 series

### How was this patch tested?
CI passed with:

- remove latest tag test:
https://github.com/wxsIcey/wxs-vllm-ascend/actions/runs/16267635040/job/45926924765
- E2E image build for A3
- CI test on A3 with e2e test and longterm test
- Unit test is missing because real A3 hardware is needed to run it

Closes: https://github.com/vllm-project/vllm-ascend/issues/1696


- vLLM version: v0.9.2
- vLLM main:
d0dc4cfca4

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-07-17 11:13:02 +08:00
ef99fe1c54 [Test] Clean up duplicate test for ascend scheduler (#1819)
There are some duplicate tests for the Ascend scheduler. This PR removes
them to make the tests clearer.

After this PR, the single-card e2e time is reduced from 47 min to 46 min.

- vLLM version: v0.9.2
- vLLM main:
1eb2b9c102

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-16 17:57:48 +08:00
c66b0827a7 [Misc][V0 Deprecation] Remove Pooling Model Runner (#1824)
### What this PR does / why we need it?
Remove pooling model runner.

This PR is a part of
https://github.com/vllm-project/vllm-ascend/issues/1620.

- vLLM version: v0.9.2
- vLLM main:
d31a647124

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-16 17:48:21 +08:00
ba7e934b21 Remove redundant empty lines in commit msg (#1814)
### What this PR does / why we need it?
Remove redundant empty lines in commit msg

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
test locally: https://github.com/Yikun/vllm-ascend/pull/48

- vLLM version: v0.9.2
- vLLM main:
d0dc4cfca4

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-16 16:50:44 +08:00
06655002c5 [Misc][V0 Deprecation] Remove V0 Worker (#1821)
### What this PR does / why we need it?
Remove V0 worker.

This PR is a part of
https://github.com/vllm-project/vllm-ascend/issues/1620.

- vLLM version: v0.9.2
- vLLM main:
6cbc4d4bea

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-16 14:07:17 +08:00
b005def0a5 [Misc][V0 Deprecation] Remove Multi-Step Model Runner (#1820)
### What this PR does / why we need it?
Remove multi-step model runner.

This PR is a part of
https://github.com/vllm-project/vllm-ascend/issues/1620.



- vLLM version: v0.9.2
- vLLM main:
34cda778a0

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-16 14:06:49 +08:00
f9e2e9bb31 [Misc][V0 Deprecation] Remove Draft Model Runner Used for V0 Spec Decode (#1810)
### What this PR does / why we need it?
Remove draft model runner used for V0 spec decode.

This PR is a part of
https://github.com/vllm-project/vllm-ascend/issues/1620.

- vLLM version: v0.9.2
- vLLM main:
34cda778a0

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-16 10:51:23 +08:00
f96100fad5 [Misc][V0 Deprecation] Remove V0 related codes of test, example, platform (#1805)
### What this PR does / why we need it?
Remove V0 related codes of test, example, platform.

This PR is a part of
https://github.com/vllm-project/vllm-ascend/issues/1620.

- vLLM version: v0.9.2
- vLLM main:
235bfd5dfe

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-15 19:58:55 +08:00
a929699e98 [Misc][V0 Deprecation] Remove multi-step worker (#1809)
### What this PR does / why we need it?
Remove multi-step worker

This PR is a part of
https://github.com/vllm-project/vllm-ascend/issues/1620.

- vLLM version: v0.9.2
- vLLM main:
235bfd5dfe

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-15 19:48:47 +08:00
bf2549856f [CI] Fix changes CI to recover codecov (#1799)
Add the `checkout` action before `dorny/paths-filter` to make it work with
the `push` case.
It is a known issue that `dorny/paths-filter` works without `checkout`
in the `pull_request` case but fails in the `push` case. More detail is here:
https://github.com/dorny/paths-filter/issues/60#issuecomment-1464281021

The push CI works after this PR. The test result is here:

https://github.com/wangxiyuan/vllm-ascend/actions/runs/16285606468/job/45983607539
- vLLM version: v0.9.2
- vLLM main:
d4d309409f

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-15 15:01:13 +08:00
787010a637 [Test] Remove VLLM_USE_V1 in example and tests (#1733)
V1 is enabled by default, so there is no need to set it by hand now. This PR
removes the now-useless setting in examples and tests.

- vLLM version: v0.9.2
- vLLM main:
9ad0a4588b

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-15 12:49:57 +08:00
eb921d2b6f [Doc] Fix 404 error (#1797)
Fix a URL 404 error in the doc
- vLLM version: v0.9.2
- vLLM main:
9ad0a4588b

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-15 11:52:38 +08:00
7bdada58eb [Misc] Remove VLLM_USE_V1 usage in code (#1764)
We plan to remove V0 code from this version. The first step is to delete
v0 usage.

Related: https://github.com/vllm-project/vllm-ascend/issues/1620

- vLLM version: v0.9.2
- vLLM main:
61e20828da

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-15 11:52:16 +08:00
494b0f474f [CI]Fix broken CI (#1773)
This PR fixes the broken CI. It requires
https://github.com/vllm-project/vllm/pull/20900 to be merged first.

- vLLM version: v0.9.2
- vLLM main:
e8cc53af5e

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-15 00:54:20 +08:00
afcfe91dfa [Doc] Fix multi node doc (#1783)
### What this PR does / why we need it?

### Does this PR introduce _any_ user-facing change?
Pin docker image to latest release
### How was this patch tested?


- vLLM version: v0.9.2
- vLLM main:
1e9438e0b0

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-14 17:56:57 +08:00
cabfb2bc31 [Test] Resolve vllm-ascend version accuracy test (#1769)
### What this PR does / why we need it?
Resolve vllm-ascend version for accuracy test

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.9.2
- vLLM main:
66f6fbd393

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-07-14 15:43:37 +08:00
d3c6dd985a [Misc] Add include dir to .gitignore (#1771)
### What this PR does / why we need it?
Add `include` dir to `.gitignore`.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.9.2
- vLLM main:
66f6fbd393

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-07-14 12:05:29 +08:00
9cd4ac76a1 [CI] Remove benchmark patch and increase the scheduler frequency (#1762)
### What this PR does / why we need it?
This PR aims to do the following:
1. Remove the `benchmark_datasets.py` patch
2. Increase the scheduler frequency to twice per day; due to the recent
large number of daily submissions, we need to increase the default test
time (6h)
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?


- vLLM version: v0.9.2
- vLLM main:
247102f07f

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-13 20:00:35 +08:00
d118bf8a26 Update README.zh.md to fix typo (#1758)
### What this PR does / why we need it?


Update README.zh.md to fix typo

### Does this PR introduce _any_ user-facing change?


No

### How was this patch tested?


CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-12 14:01:34 +08:00
eff4b5791c Recover offline_inference_npu.py to make doctest passed (#1756)
### What this PR does / why we need it?
Rename offline_inference_npu_v1.py to offline_inference_npu.py to
recover doctest

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

- vLLM version: v0.9.2
- vLLM main:
a8593237c0

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-12 12:36:35 +08:00
8b3a483269 Add recommend version and refresh readme / contribution.md (#1757)
### What this PR does / why we need it?
Add recommend version and contribution.md

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

- vLLM version: v0.9.2
- vLLM main:
890323dc1b

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-12 12:35:40 +08:00
3c404de1b1 [Release]Update release note (#1753)
There are still issues with PP in some cases, such as aclgraph and ray.
Remove the related doc from the release note.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-11 17:58:26 +08:00
b5b7e0ecc7 [Doc] Add qwen3 embedding 8b guide (#1734)
1. Add the tutorial for qwen3-embedding-8b
2. Remove VLLM_USE_V1=1 from the docs; it is no longer needed as of 0.9.2


- vLLM version: v0.9.2
- vLLM main:
5923ab9524

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-11 17:40:17 +08:00
9c560b009a [Release] Add 0.9.2rc1 release note (#1725)
Add the release note for 0.9.2rc1; we'll release soon.

- vLLM version: v0.9.2
- vLLM main:
7bd4c37ae7

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-11 17:36:05 +08:00
1b4a2f3817 [CI] Add accuracy ci for DP and EP and TP and ETP (#1140)
### What this PR does / why we need it?

Add accuracy ci for DP and EP and TP

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.9.2
- vLLM main:
35514b682a

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-07-11 17:25:17 +08:00
d13fb0766e [Perf] add patch to optimize apply_topk_topp (#1732)
### What this PR does / why we need it?
Performance optimization for apply_top_k_top_p
### Does this PR introduce _any_ user-facing change?
Use VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION to enable this feature
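
A small usage sketch (the environment switch must be set before vLLM is initialized; the accepted value is assumed to be the usual "1"):

```python
import os

# Opt in to the top-k/top-p sampling optimization before creating the engine.
os.environ["VLLM_ASCEND_ENABLE_TOPK_TOPP_OPTIMIZATION"] = "1"
```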
### How was this patch tested?
e2e & ut

- vLLM version: v0.9.2
- vLLM main:
6a9e6b2abf

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
2025-07-11 15:32:02 +08:00
aa4240c67f Support pipeline parallel in V1 Engine (#1700)
### What this PR does / why we need it?
This patch supports pipeline parallel in V1 Engine

### Does this PR introduce _any_ user-facing change?
Yes, users can run PP in V1

### How was this patch tested?
Manually tested

- vLLM version: v0.9.2
- vLLM main:
31d5c1797f

Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
2025-07-11 15:30:51 +08:00
1cd27da5fb [Test] Remove VLLM_USE_V1 in accuracy test (#1739)
### What this PR does / why we need it?
Remove VLLM_USE_V1 in accuracy test

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-07-11 15:29:11 +08:00
ee40d3d850 use npu_moe_gating_top_k_softmax (#1355)
### What this PR does / why we need it?
The optimization for non-DeepSeek `select_experts` is to replace the
softmax+topk+to sequence with the fused `npu_moe_gating_top_k_softmax`,
which reduces the op time from 37us to 14us on bf16/fp16 for Qwen3-235B.
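
For context, a minimal sketch of the eager softmax-then-top-k routing that the fused kernel replaces (the exact `npu_moe_gating_top_k_softmax` call signature is intentionally not shown here):

```python
import torch


def select_experts_eager(router_logits: torch.Tensor, top_k: int):
    # Baseline path: softmax over experts, then top-k, then dtype casts --
    # the sequence this PR fuses into npu_moe_gating_top_k_softmax.
    probs = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
    topk_weights, topk_ids = probs.topk(top_k, dim=-1)
    return topk_weights.to(router_logits.dtype), topk_ids.to(torch.int32)
```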

- vLLM version: v0.9.2
- vLLM main:
1a4f35e2ea

---------

Signed-off-by: ttanzhiqiang <389825161@qq.com>
2025-07-11 08:55:06 +08:00
9d16c9982e rm router logits Improve TTOP 3ms (#1407)
### What this PR does / why we need it?

The previous code was:
router_logits, _ = self.gate(hidden_states)
hidden_states = get_dp_group().all_gather(hidden_states, 0)
router_logits = get_dp_group().all_gather(router_logits, 0)
I want to merge the two all_gathers into one, saving one all_gather
communication, so it becomes:
hidden_states = get_dp_group().all_gather(hidden_states, 0)
router_logits, _ = self.gate(hidden_states)

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
bash examples/run_dp_attention_etp16.sh
bash examples/run_dp_attention_etp16_benmark.sh

gsm8k accuracy verification
<img width="1809" alt="截屏2025-06-24 21 53 24"
src="https://github.com/user-attachments/assets/47eace3b-a86b-41b4-9de8-773f57fea33b"
/>



- vLLM version: v0.9.2
- vLLM main:
77f77a951e

---------

Signed-off-by: ttanzhiqiang <389825161@qq.com>
2025-07-11 08:53:17 +08:00
0fc9b56d40 [Perf] Improve MLA multistream performance (#1353)
### What this PR does / why we need it?
> Need to merge after PR #1322

According to benchmark results, this PR brings approximately 1%
performance gain.

#### Before Improvement
Profiling
<img width="1147" alt="截屏2025-06-22 14 54 47"
src="https://github.com/user-attachments/assets/4a4dc7f1-5b76-45d5-864d-dd7f8faf993c"
/>

Evaluation
```
# server launch command
python -m vllm.entrypoints.openai.api_server --model=/DeepSeek-R1-W8A8 \
    --quantization ascend \
    --served-model-name auto \
    --trust-remote-code \
    --distributed-executor-backend=mp \
    --port 8006 \
    -tp=16 \
    --max-num-seqs 24 \
    --max-model-len 32768 \
    --max-num-batched-tokens 8192 \
    --block-size 128 \
    --no-enable-prefix-caching \
    --additional-config '{"torchair_graph_config":{"enable_multistream_mla": true,"enabled":true,"use_cached_graph":true,"graph_batch_sizes":[24]},"ascend_scheduler_config":{"enabled":true},"expert_tensor_parallel_size":16}' \
    --gpu-memory-utilization 0.96

# client benchmark command
python /root/vllm/benchmarks/benchmark_serving.py --backend vllm --dataset-name random \
        --random-input-len 4096 \
        --random-output-len 1536 \
        --num-prompts 200 \
        --ignore-eos \
        --model auto \
        --tokenizer /DeepSeek-R1-W8A8 \
        --port 8006 \
        --request-rate 1 \
        --max-concurrency 24 \
        --save-result \
        --skip-initial-test \
        --metric-percentiles "50,90,99"
```

```
============ Serving Benchmark Result ============
Successful requests:                     200       
Benchmark duration (s):                  958.59    
Total input tokens:                      819200    
Total generated tokens:                  307200    
Request throughput (req/s):              0.2086    
Output token throughput (tok/s):         320.47    
Total Token throughput (tok/s):          1175.05   
---------------Time to First Token----------------
Mean TTFT (ms):                          942.70    
Median TTFT (ms):                        713.87    
P50 TTFT (ms):                           713.87    
P90 TTFT (ms):                           1363.88   
P99 TTFT (ms):                           2008.73   
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          68.96     
Median TPOT (ms):                        69.49     
P50 TPOT (ms):                           69.49     
P90 TPOT (ms):                           70.42     
P99 TPOT (ms):                           70.72     
---------------Inter-token Latency----------------
Mean ITL (ms):                           68.96     
Median ITL (ms):                         59.88     
P50 ITL (ms):                            59.88     
P90 ITL (ms):                            61.59     
P99 ITL (ms):                            68.82     
==================================================
```

#### After Improvement
Profiling
<img width="1200" alt="截屏2025-06-22 14 55 42"
src="https://github.com/user-attachments/assets/e3eb9dec-0ff0-4e5f-ab94-93c65003e51f"
/>

Evaluation
```
============ Serving Benchmark Result ============
Successful requests:                     200       
Benchmark duration (s):                  948.08    
Total input tokens:                      819200    
Total generated tokens:                  307200    
Request throughput (req/s):              0.2110    
Output token throughput (tok/s):         324.02    
Total Token throughput (tok/s):          1188.08   
---------------Time to First Token----------------
Mean TTFT (ms):                          1019.25   
Median TTFT (ms):                        714.63    
P50 TTFT (ms):                           714.63    
P90 TTFT (ms):                           1367.31   
P99 TTFT (ms):                           2661.52   
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms):                          68.14     
Median TPOT (ms):                        68.68     
P50 TPOT (ms):                           68.68     
P90 TPOT (ms):                           69.33     
P99 TPOT (ms):                           70.30     
---------------Inter-token Latency----------------
Mean ITL (ms):                           68.14     
Median ITL (ms):                         59.04     
P50 ITL (ms):                            59.04     
P90 ITL (ms):                            60.93     
P99 ITL (ms):                            66.89     
==================================================
```
### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?




- vLLM version: v0.9.2
- vLLM main:
65393ee064

Signed-off-by: ApsarasX <apsarax@outlook.com>
2025-07-11 08:51:17 +08:00
cc210f46e6 [AscendScheduler][Bugfix] Remove num_draft_tokens while allocating slots (#1718)
### What this PR does / why we need it?

Now there is no need to calculate `num_draft_tokens` when allocating
slots.

This PR follows the changes in vllm:
https://github.com/vllm-project/vllm/pull/20701

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with existing test






- vLLM version: v0.9.2
- vLLM main:
cc876d0f29

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-10 18:47:45 +08:00
011fd73a48 [CI] Make CI tracker more clear (#1720)
1. enable lint check for all changes
2. only run ut and e2e if the change touches code
3. only run ut and disable e2e if the change is ut-only
4. disable wheel build for the push case
5. run unit tests when a PR is merged
6. remove the useless pytest.ini




- vLLM version: v0.9.2
- vLLM main:
fdfd409f8f

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-10 16:03:23 +08:00
3d1e6a5929 [Doc] Update user doc index (#1581)
Add user doc index to make the user guide more clear
- vLLM version: v0.9.1
- vLLM main:
49e8c7ea25

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-10 14:26:59 +08:00
c7446438a9 [1/N][CI] Move linting system to pre-commits hooks (#1256)
### What this PR does / why we need it?

Follow vllm-project/vllm lint way:
https://github.com/vllm-project/vllm/blob/main/.pre-commit-config.yaml

Enable pre-commit to avoid some low level error  AMAP.

This pr is one step of #1241, The purpose is make linting system more
clear and convenient, on this step, Mainly did the following things:
yapf, actionlint, ruff, typos, isort, mypy, png-lint, signoff-commit,
enforce-import-regex-instead-of-re.

TODO: 
- clang-format(check for csrc with google style)
need clean code, disable for now 
- pymarkdown
need clean code, disable for now 
- shellcheck
need clean code, disable for now 

### Does this PR introduce _any_ user-facing change?

Only developer UX change:

https://vllm-ascend--1256.org.readthedocs.build/en/1256/developer_guide/contributing.html#run-lint-locally

```
pip install -r requirements-lint.txt && pre-commit install
bash format.sh
```

### How was this patch tested?

CI passed with new added/existing test.

Co-authored-by: Yikun [yikunkero@gmail.com](mailto:yikunkero@gmail.com)
Co-authored-by: wangli
[wangli858794774@gmail.com](mailto:wangli858794774@gmail.com)
- vLLM version: v0.9.1
- vLLM main:
5358cce5ff

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-10 14:17:15 +08:00
643e6f5486 [Bugfix] Fix accuracy problem caused by mask pollution (#1678)
### What this PR does / why we need it?
If a small batch of short requests is sent first, forming a chunk with a
length <128, it will corrupt the `attn_mask_cache`, causing subsequent
requests that do not form a chunk to have accuracy issues.

The root cause of this problem is the use of in-place multiplication.
Modifying it to use out-of-place multiplication will resolve the
accuracy problem.
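
A tiny illustration of the failure mode with a hypothetical cache tensor (not the actual attention-mask code):

```python
import torch

mask_cache = torch.ones(4, 4)          # hypothetical shared mask cache
bad = mask_cache[:2, :2].mul_(0.5)     # in-place: mutates the shared cache
assert mask_cache[0, 0] == 0.5         # cache is now polluted for later requests

mask_cache = torch.ones(4, 4)          # reset for comparison
good = mask_cache[:2, :2] * 0.5        # out-of-place: returns a new tensor
assert mask_cache[0, 0] == 1.0         # cache stays intact
```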


### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Yes.

- vLLM version: v0.9.2
- vLLM main:
ad6c2e1a0b

---------

Signed-off-by: ApsarasX <apsarax@outlook.com>
2025-07-10 14:06:49 +08:00
60519c71bd shared_experts+router_experts merge all_reduce(Improve TTOP 5ms) (#1395)
### What this PR does / why we need it?
When all_reduce_merge is in progress, shared_experts do not do an
all_reduce inside the MLP; instead, we wait until shared_experts +
router_experts are both completed before doing a single all_reduce.
In both prefill and decode, as long as shared_experts + router_experts
share one all_reduce, there is a benefit.
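
Roughly, the control flow becomes something like the sketch below (hypothetical `shared_experts`/`routed_experts` callables; only the placement of the all_reduce matters):

```python
from vllm.distributed import tensor_model_parallel_all_reduce


def moe_forward_merged_all_reduce(hidden_states, shared_experts, routed_experts):
    # Shared experts no longer issue their own all_reduce inside the MLP;
    # one all_reduce covers the sum of both partial outputs.
    shared_out = shared_experts(hidden_states)
    routed_out = routed_experts(hidden_states)
    return tensor_model_parallel_all_reduce(shared_out + routed_out)
```
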
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
bash examples/run_dp_attention_etp16.sh
bash examples/run_dp_attention_etp16_benmark.sh
- vLLM version: v0.9.1
- vLLM main:
977180c912

---------

Signed-off-by: ttanzhiqiang <389825161@qq.com>
2025-07-10 12:07:05 +08:00
997f156a51 Use ci_vllm_version when recording vLLM commit (#1689)
### What this PR does / why we need it?
Use ci_vllm_version when recording vllm commit

Followup on https://github.com/vllm-project/vllm-ascend/pull/1623

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- Test mannually.
$ python3 docs/source/conf.py | jq .ci_vllm_version | tr -d '"'
v0.9.2
- Test on my local repo: https://github.com/Yikun/vllm-ascend/pull/35

- vLLM version: v0.9.1
- vLLM main:
49e8c7ea25

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-10 11:07:27 +08:00
89c1a0f006 [Bugfix] Fix memory-leak caused by dist._functional_collectives.reduce_scatter_tensor (#1380)
### What this PR does / why we need it?
In some cases, `dist._functional_collectives.reduce_scatter_tensor` can
cause its input tensor not to be released immediately after the current
layer ends. Instead, it will only be released when the GPU memory usage
of the current process reaches a certain threshold (approximately once
every 15 layers).

**Before Fix**

<img width="1441" alt="截屏2025-06-24 01 26 13"
src="https://github.com/user-attachments/assets/72d5dbb3-c8c8-4778-bf64-8db7bab8aff0"
/>

**After Fix**
<img width="1475" alt="截屏2025-06-24 01 23 43"
src="https://github.com/user-attachments/assets/6c69cfcd-a469-4ee5-b8c6-210aeb3a5bdf"
/>

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?


- vLLM version: v0.9.1
- vLLM main:
9ff2af6d2b

---------

Signed-off-by: ApsarasX <apsarax@outlook.com>
2025-07-10 10:57:24 +08:00
b1c66b211f [CI] Fix lint in CI (#1712)
### What this PR does / why we need it?
Fix lint in CI
- vLLM version: v0.9.1
- vLLM main:
49e8c7ea25

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-10 10:47:18 +08:00
0c4aa2b4f1 [Doc] Add multi node data parallel doc (#1685)
### What this PR does / why we need it?
 add multi node data parallel doc
### Does this PR introduce _any_ user-facing change?
 add multi node data parallel doc
### How was this patch tested?

- vLLM version: v0.9.1
- vLLM main:
805d62ca88

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-10 09:36:37 +08:00
b4b19ea588 [Doc] Add multi-npu qwen3-MoE-32B Tutorials (#1419)
Signed-off-by: leo-pony <nengjunma@outlook.com>

### What this PR does / why we need it?
Add multi-npu qwen3-MoE-32B Tutorials
Relate RFC: https://github.com/vllm-project/vllm-ascend/issues/1248
- vLLM version: v0.9.1
- vLLM main:
5358cce5ff

---------

Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-07-10 09:06:51 +08:00
3ef45d0cc2 feat: Improve the offline_inference npu v0/v1 scripts (#1669)
### What this PR does / why we need it?

Improve
- Keep the same file name format as v1, `offline_inference_npu_v0.py`,
`offline_inference_npu_v1.py`
- Use `VLLM_USE_V1` = 0/1 clearly in py scripts
- Fix some run errors in `offline_inference_npu_v1.py`, e.g.
`deepseekv3-lite-base-latest` does not exist in ModelScope or HF.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

- vLLM version: v0.9.2
- vLLM main:
baed180aa0

Signed-off-by: xleoken <xleoken@163.com>
2025-07-09 17:03:53 +08:00
6af35f60cc [Bugfix][CI] Remove V0 Spec Decode CI (#1656)
### What this PR does / why we need it?

To solve the error in the CI of long term test:

```bash
modelscope - ERROR - Repo JackFram/llama-68m not exists on either https://www.modelscope.cn/ or https://www.modelscope.ai/
```

Replace the hf model with modelscope model.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

- vLLM version: v0.9.1
- vLLM main:
71d1d75b7a

---------

Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com>
2025-07-09 15:53:58 +08:00
b979ee353d [Misc] Code clean up (#1679)
Make model_runner_v1 more readable

- vLLM version: v0.9.2
- vLLM main:
baed180aa0

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-09 14:33:40 +08:00
392fd7239b [Misc] Add attention mask (#1673)
Move attention mark from V0 to common place.
- vLLM version: v0.9.2
- vLLM main:
b942c094e3

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-09 09:12:03 +08:00
cc1588be50 [Misc] Code clean up (#1674)
Remove useless function
- vLLM version: v0.9.2
- vLLM main:
b942c094e3

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-09 08:54:12 +08:00
830332ebfc Clean up v0.9.1 code (#1672)
vLLM has released 0.9.2. This PR drops 0.9.1 support.

- vLLM version: v0.9.1
- vLLM main:
b942c094e3

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-09 08:52:24 +08:00
0d4bc03946 Fix wheel glibc version incompatibility (#1582)
### What this PR does / why we need it?
- Fixes https://github.com/vllm-project/vllm-ascend/issues/1533

### How was this patch tested?
1. Run the image
```
docker run \
    --name cann_container \
    --device /dev/davinci6 \
    --device /dev/davinci_manager \
    --device /dev/devmm_svm \
    --device /dev/hisi_hdc \
    -v /usr/local/dcmi:/usr/local/dcmi \
    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
    -v /etc/ascend_install.info:/etc/ascend_install.info \
    -it  quay.io/ascend/cann:8.1.rc1-910b-openeuler22.03-py3.11 bash
```

2. Install package
torch=2.5.1
torch-npu=2.5.1.post1.dev20250619 
vllm=0.9.1

vllm-ascend=vllm_ascend-0.1.dev1+g02ac443-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl
Artifact download URL:
https://github.com/vllm-project/vllm-ascend/actions/runs/16039661265/artifacts/3454481370

3. Test offline script

```
from vllm import LLM, SamplingParams

import os
os.environ["VLLM_USE_V1"] = "1"

prompts = [
    "Hello, my name is",
]

llm = LLM(model="Qwen3/Qwen3-1.7B")

outputs = llm.generate(prompts)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

4. Results

![result](https://github.com/user-attachments/assets/20f9d923-00ce-4a2d-8598-9b216045705d)

- vLLM version: v0.9.2
- vLLM main:
b942c094e3

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-07-08 18:46:02 +08:00
e4e9ea02ab Upgrade vLLM version to v0.9.2 (#1652)
### What this PR does / why we need it?

This patch upgrade vLLM version to v0.9.2, this patch didn't remove the
v0.9.1 compatible code to easy review.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

- vLLM version: v0.9.1
- vLLM main:
14601f5fba
- Accuracy test with 0.9.2:
https://github.com/vllm-project/vllm-ascend/actions/runs/16121612087

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-08 14:18:17 +08:00
71de52d3a9 feat: add kv cache memory cache and skip dynamo guard (#1549)
### What this PR does / why we need it?

1. Sometimes loading the torchair cache fails because of fluctuations in
NPU memory, so this PR adds a new cache that saves the old KV cache size in
bytes to avoid a possible crash while loading the torchair graph cache.
2. When caching is enabled but the cache does not exist yet, the first
compilation introduces the overhead of Dynamo guards. In this case, we
compile twice directly to skip them (this brings 3-4 ms of TPOT
optimization).

### Does this PR introduce _any_ user-facing change?
Add a new env `VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE` to
control kv cache floating tolerance
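
A usage sketch (the value below is only an example; it is assumed to be a size in megabytes, per the variable name):

```python
import os

# Tolerate up to 512 MB of drift between the cached and current KV cache size
# when restoring the torchair graph cache.
os.environ["VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE"] = "512"
```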

### How was this patch tested?

- vLLM version: v0.9.1
- vLLM main:
1fd471e957

Signed-off-by: boying <897013703@qq.com>
2025-07-07 22:37:14 +08:00
df84cceca8 perf: use multicast to avoid padding decode request to prefill size (#1555)
### What this PR does / why we need it?
perf: use multicast to avoid padding decode request to prefill size

### How was this patch tested?

- vLLM version: v0.9.1
- vLLM main:
1fd471e957

Signed-off-by: boying <897013703@qq.com>
2025-07-07 22:36:03 +08:00
f08c4f15a2 fix spell error (#1654)
Fix the spelling errors in the code

- vLLM version: v0.9.1
- vLLM main:
923147b5e8

Signed-off-by: unicorn <unicorn@unicorns-MacBook-Pro.local>
Co-authored-by: unicorn <unicorn@unicorns-MacBook-Pro.local>
2025-07-07 20:24:42 +08:00
f2a20393a2 [CI] Fix mypy check in CI (#1655)
### What this PR does / why we need it?
Fix mypy check in CI:
https://github.com/vllm-project/vllm-ascend/actions/runs/16115919385/job/45469646509?pr=1654

Mypy failed due to a newer numpy version. We need to pin
`numpy=1.26.4` in vllm-ascend.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-07 20:19:16 +08:00
18495f44b2 [BugFix] Fix max_num_tokens_across_dp calculation bugs in attention_v1_torchair (#1636)
### What this PR does / why we need it?
This PR fixes a bug in the max_num_tokens_across_dp calculation. In the
earlier version, we computed this as graph_pad_size plus the actual
max_num_tokens. This results in different max_num_tokens_across_dp values
across DP ranks. If padding is required, this might cause wrong padding.
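
A toy illustration of why the per-rank values diverge (simplified numbers; the actual fix in this PR may differ in detail):

```python
# Two DP ranks, same graph_pad_size but different actual token counts.
graph_pad_size = 8
actual_tokens = {"dp0": 5, "dp1": 7}

# Buggy form: each rank adds its own actual count, so ranks disagree.
buggy = {rank: graph_pad_size + n for rank, n in actual_tokens.items()}
assert buggy == {"dp0": 13, "dp1": 15}   # different values across ranks

# Consistent form: agree on the global max first, then add the padding.
consistent = graph_pad_size + max(actual_tokens.values())
assert consistent == 15                  # same value on every rank
```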

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed normally.

Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
2025-07-07 20:03:02 +08:00
9c886d0a1f [EPLB] support deepseek eplb strategy (#1196)
### What this PR does / why we need it?

This PR implements the DeepSeek Expert Parallel Load Balancing (EPLB)
strategy to optimize expert distribution in vllm-ascend. The
implementation:
- Adapts the expert-map format to work with vllm-ascend's architecture
- Provides DeepSeek-provided mechanism to balance expert workload across
devices

### Does this PR introduce _any_ user-facing change?

This PR adds a new script that allows users to:
- Generate expert map configurations based on workload analysis
- Optimize expert distribution for their specific use case

### How was this patch tested?

To use this feature:
1. First collect expert heat information during model execution
2. Run the provided script to generate the expert map configuration
3. Apply the generated configuration to your vllm-ascend deployment

User example:

```bash
# expert_load_view.pt:  dumped expert heat info file
python3 examples/eplb/eplb_strategy.py --exp_name 'deepseek_demo' \
    --input_path expert_load_view.pt  --output_path examples/eplb/results/demo \
    --num_nodes 4
```

---------

Signed-off-by: ZhengWG <zwg0606@gmail.com>
2025-07-07 17:22:08 +08:00
4e29c5a808 Add ut for test_pooling_model_runner.py (#1640)
### What this PR does / why we need it?
 Add ut for test_pooling_model_runner.py

### Does this PR introduce _any_ user-facing change? N/A

### How was this patch tested?
 python -m unittest  test_pooling_model_runner.py


- vLLM version: v0.9.1
- vLLM main:
2e610deb72

---------

Signed-off-by: wangyanhui-cmss <wangyanhui_yewu@cmss.chinamobile.com>
2025-07-07 17:12:11 +08:00
493768eb30 Record vLLM commit in PR description (#1623)
### What this PR does / why we need it?
This patch enables the vllm commits recording and also cleanup unused
commit msg note in PR.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- CI passed
- Test on https://github.com/Yikun/vllm-ascend/pull/33 and vllm commit
refreshed as expected.

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-07 10:20:38 +08:00
7efa4e92fe [CI] Fix oom in chunk prefill (#1622)
### What this PR does / why we need it?
Add the resource clear logic to fix oom issue when testing
`tests/e2e/singlecard/core/ascend_scheduler`.
### Does this PR introduce _any_ user-facing change?
N/A
### How was this patch tested?
CI passed with existing test.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-07 10:14:40 +08:00
c58accc15e [Bugfix] Support Qwen3-MOE on aclgraph mode (#1381)
### What this PR does / why we need it?
Fix the shape of the `npu_moe_init_routing` input parameters to support
aclgraph mode on qwen3-moe

In addition to this PR, resolving the `gatherv3` error might be
necessary. See related PR
https://github.com/vllm-project/vllm-ascend/pull/1297
https://github.com/vllm-project/vllm-ascend/pull/1446

Thanks to @yiz-liu  for providing the idea

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Tested on Qwen3-30B-A3B

Closes: https://github.com/vllm-project/vllm-ascend/issues/1368

---------

Signed-off-by: ApsarasX <apsarax@outlook.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-06 15:29:36 +08:00
14373f65d7 [Test] Remove V0 accuracy test and enable MoE and VL test on V1 (#1574)
### What this PR does / why we need it?
Update accuracy test
1. remove accuarcy report on V0
2. add parallel and execution mode
3. add Qwen/Qwen3-30B-A3B and remove Qwen/Qwen2.5-7B-Instruct


### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-07-06 11:10:19 +08:00
0c1d239df4 Add unit test local cpu guide and enable base testcase (#1566)
### What this PR does / why we need it?
Use Base test and cleanup all manaul patch code
- Cleanup EPLB config to avoid tmp test file
- Use BaseTest with global cache
- Add license
- Add a doc to setup unit test in local env 

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-06 10:42:27 +08:00
eb390545ec [Performance] Disable JIT and nd2nz to improve performance for Atlas 300I series (#1591)
### What this PR does / why we need it?

Since running on Atlas 300I Duo was initially supported after #1333,
this PR disables the JIT compiler for the 310P and changes the data
format to NZ for the weights in the vocabulary embedding and QKV
projection layers, which helps improve performance.

See #1563 
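
For illustration, a rough sketch of the two switches involved on the 310P (not the exact code path in this PR; `29` is assumed to be the ACL_FORMAT_FRACTAL_NZ enum value):

```python
import torch
import torch_npu

# Disable JIT compilation on the 310P.
torch_npu.npu.set_compile_mode(jit_compile=False)

# Cast a weight tensor to the NZ (FRACTAL_NZ) layout.
weight = torch.randn(1024, 1024, dtype=torch.float16).npu()
weight_nz = torch_npu.npu_format_cast(weight, 29)  # 29 == ACL_FORMAT_FRACTAL_NZ (assumed)
```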

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

Test manually:
https://github.com/vllm-project/vllm-ascend/pull/1591#issuecomment-3028352339

Signed-off-by: Vincent Yuan <farawayboat@gmail.com>
2025-07-05 16:29:21 +08:00
dd22ac38b2 [CI/UT][Refactor] move e2e spec decode and deepseek acc test to per pr (#1136)
### What this PR does / why we need it?
1. run deepseek acc ut per pr --- multicard CI time increased by 9 min
2. run spec decode e2e test on v1 per pr --- singlecard CI time
increased by 3 min (partly disabled because it does not work now)
~~3. align the output of whether dbo is enabled or not~~
    The generated results with and without dbo cannot be aligned.

https://github.com/vllm-project/vllm-ascend/actions/runs/15822900528/job/44600029405?pr=1136
4. skip V0 mtp test due to failure in
https://github.com/vllm-project/vllm-ascend/actions/runs/16012172833/job/45171988816
5. fix some version conflicts
### How was this patch tested?
CI passed with new added test.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-04 18:05:45 +08:00
343955c7ac [CI] Follow vLLM FusedMoEParallelConfig interface change and clean up unused config (#1625)
This commit
78fe77534b
from vllm reverted the change for FusedMoEParallelConfig

This PR do the same to fix the CI error

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-04 17:54:33 +08:00
4e910186de [CI/UT] Unify model usage via ModelScope in CI (#1207)
### What this PR does / why we need it?
Unify Model Usage via ModelScope

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-07-04 10:52:17 +08:00
a5f33590d3 [CORE]initial support for torchair with non-mla backend (#1506)
### What this PR does / why we need it?
This PR supports torchair graph mode with non-mla backend on both 800IA2
and 300I Duo platforms. The main change is to add
`attention_v1_torchair.py` to support specific attention related
operations that are required by torchair.

### Does this PR introduce _any_ user-facing change?
Before this PR, vLLM Ascend only allowed DeepSeek to use torchair. Now we
can also use it with Pangu. Besides, we add a supported-model list to
control which types of models can use torchair.

### How was this patch tested?
We have tested it with PanguProMoE on both 800I A2 and 300I Duo platforms,
and the model generates answers normally.

---------

Signed-off-by: angazenn <zengyanjia@huawei.com>
Signed-off-by: tianyitang <tangtianyi4@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: tianyitang <tangtianyi4@huawei.com>
2025-07-03 22:21:42 +08:00
9fbd8017c0 [Quantization]300I Duo support w8a8 quantization (#1560)
### What this PR does / why we need it?
This pr supports w8a8 on 300I Duo platform. The main change is to use
`npu_quant_grouped_matmul_dequant` to replace `npu_grouped_matmul`.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
offline inference on 310p runs normally.

---------

Signed-off-by: angazenn <zengyanjia@huawei.com>
Signed-off-by: tianyitang <tangtianyi4@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: tianyitang <tangtianyi4@huawei.com>
2025-07-03 22:12:46 +08:00
6d7cb14a24 Fix lint in examples/offline_embed.py (#1618)
### What this PR does / why we need it?
Fix lint

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-03 21:40:29 +08:00
e511ddd67d [Bug] Fix wrong modescope env set order (#1611)
### What this PR does / why we need it?
The `os.environ["VLLM_USE_MODELSCOPE"] = "True"` should be placed before
module imports

If not:
```
The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/xleoken/projects/vllm-ascend/examples/offline_embed.py", line 48, in <module>
    model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")
  File "/usr/local/python3.10.17/lib/python3.10/site-packages/vllm/entrypoints/llm.py", line 243, in __init__
    self.llm_engine = LLMEngine.from_engine_args(
  File "/usr/local/python3.10.17/lib/python3.10/site-packages/vllm/engine/llm_engine.py", line 494, in from_engine_args
    vllm_config = engine_args.create_engine_config(usage_context)
  File "/usr/local/python3.10.17/lib/python3.10/site-packages/vllm/engine/arg_utils.py", line 1018, in create_engine_config
    model_config = self.create_model_config()
  File "/usr/local/python3.10.17/lib/python3.10/site-packages/vllm/engine/arg_utils.py", line 910, in create_model_config
    return ModelConfig(
  File "/usr/local/python3.10.17/lib/python3.10/site-packages/pydantic/_internal/_dataclasses.py", line 120, in __init__
    s.__pydantic_validator__.validate_python(ArgsKwargs(args, kwargs), self_instance=s)
  File "/usr/local/python3.10.17/lib/python3.10/site-packages/vllm/config.py", line 528, in __post_init__
    hf_config = get_config(self.hf_config_path or self.model,
  File "/usr/local/python3.10.17/lib/python3.10/site-packages/vllm/transformers_utils/config.py", line 321, in get_config
    config_dict, _ = PretrainedConfig.get_config_dict(
  File "/usr/local/python3.10.17/lib/python3.10/site-packages/transformers/configuration_utils.py", line 590, in get_config_dict
    config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs)
  File "/usr/local/python3.10.17/lib/python3.10/site-packages/transformers/configuration_utils.py", line 649, in _get_config_dict
    resolved_config_file = cached_file(
  File "/usr/local/python3.10.17/lib/python3.10/site-packages/transformers/utils/hub.py", line 266, in cached_file
    file = cached_files(path_or_repo_id=path_or_repo_id, filenames=[filename], **kwargs)
  File "/usr/local/python3.10.17/lib/python3.10/site-packages/transformers/utils/hub.py", line 491, in cached_files
    raise OSError(
OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.
[ERROR] 2025-07-03-15:27:10 (PID:333665, Device:-1, RankID:-1) ERR99999 UNKNOWN applicaiton exception
```
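
A minimal ordering sketch of the fix (the model name is taken from the example script above):

```python
import os

# Must be set before any vLLM import so model resolution goes through
# ModelScope instead of trying to reach huggingface.co.
os.environ["VLLM_USE_MODELSCOPE"] = "True"

from vllm import LLM  # noqa: E402  (intentionally imported after the env var)

model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")
```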

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?

Local.

Signed-off-by: xleoken <xleoken@163.com>
2025-07-03 18:50:53 +08:00
a45dfde283 [CI] Fix FusedMoEConfig and input batch failure to recover CI (#1602)
Make CI happy

1.
c1909e7e8c
changed the FusedMoEConfig initialization.
2.
48fb076cbc
changed the input batch logic.

This PR adapts vllm-ascend to these changes.

Closes: https://github.com/vllm-project/vllm-ascend/issues/1600

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-03 18:36:17 +08:00
d96da1f00c [DOC] Fix word spelling (#1595)
### What this PR does / why we need it?
Fix word spelling in DOC.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
No.

Signed-off-by: paulyu12 <507435917@qq.com>
2025-07-02 21:42:39 +08:00
9fb3d558e5 [Test]Add unit test for platform.py (#1476)
### What this PR does / why we need it?
According to issue #1298 , this pull request adds unit test code for
platform.py.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed with new added/existing test.

---------

Signed-off-by: zhanghw0354 <zhanghaiwen_yewu@cmss.chinamobile.com>
Signed-off-by: shen-shanshan <467638484@qq.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Signed-off-by: angazenn <zengyanjia@huawei.com>
Signed-off-by: zhuyilin <809721801@qq.com>
Co-authored-by: Shanshan Shen <467638484@qq.com>
Co-authored-by: Mengqing Cao <cmq0113@163.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Angazenn <92204292+Angazenn@users.noreply.github.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: Zhu Yi Lin <116337067+GDzhu01@users.noreply.github.com>
2025-07-02 17:46:06 +08:00
30bf7014d0 [Bugfix] Add func swap_states to fix MLA attention (#1580)
### What this PR does / why we need it?
MLA attention still uses the gpu_input_batch attribute `swap_states`, which leads to
the error `AttributeError: 'InputBatch' object has no attribute 'swap_states'`.

This PR fixes the MLA InputBatch error.
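
A minimal sketch of the added helper (only two illustrative fields are shown; the real InputBatch in vllm-ascend tracks many more per-request tensors):

```python
import torch


class InputBatch:
    def __init__(self, max_num_reqs: int, max_model_len: int):
        self.token_ids_cpu = torch.zeros(
            max_num_reqs, max_model_len, dtype=torch.int32)
        self.num_tokens = torch.zeros(max_num_reqs, dtype=torch.int32)

    def swap_states(self, i: int, j: int) -> None:
        # Swap the cached state of requests i and j in place so MLA attention
        # can reorder requests without rebuilding the whole batch.
        self.token_ids_cpu[[i, j]] = self.token_ids_cpu[[j, i]]
        self.num_tokens[[i, j]] = self.num_tokens[[j, i]]
```
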
### How was this patch tested?
will be tested by #1136

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-02 17:42:53 +08:00
59237ea788 [CI/UT] Add test for chunk prefill and prefix cache on v1/AscendScheduler (#1505)
### What this PR does / why we need it?
Add test for chunked prefill and prefix cache on v1/AscendScheduler

Covered scenarios:
- `Qwen/Qwen3-0.6B-Base` and `deepseek-ai/DeepSeek-V2-Lite-Chat` ---
multicard CI time increased by 19 min
- `V1 + default scheduler` vs `V1 + default scheduler + enable prefix
cache`
- `V1 + Ascend scheduler` vs `V1 + Ascend scheduler + enable prefix
cache` vs `V1 + Ascend scheduler + enable prefix cache + enable chunked
prefill`
- `Qwen/Qwen3-0.6B-Base` --- singlecard CI time increased by 8 min
- `V1 + Ascend scheduler` vs `V1 + Ascend scheduler + enable chunked
prefill`

should rebase after #1498 and #1446
### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with new added test.

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-07-02 16:57:03 +08:00
6b80c5acba Fix W8A8 fused moe bug (#1529)
### What this PR does / why we need it?
1. Drop some unused code for the W8A8 FusedMoE path.
2. Add an int8 KV cache check (a minimal sketch follows this list).
3. Add more unit tests.
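
A minimal sketch of what such a check looks like (the function and argument names here are illustrative, not the actual vllm-ascend helper):

```python
def check_int8_kv_cache(kv_cache_dtype: str, quant_has_kv_scales: bool) -> None:
    # Reject an int8 KV cache unless the quantization config actually
    # provides the KV cache scales needed to dequantize it.
    if kv_cache_dtype == "int8" and not quant_has_kv_scales:
        raise NotImplementedError(
            "int8 KV cache requires a quantization config with KV cache scales"
        )
```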

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed with new added test.

---------

Signed-off-by: zhuyilin <809721801@qq.com>
Signed-off-by: tianyitang <tangtianyi4@huawei.com>
Co-authored-by: tianyitang <tangtianyi4@huawei.com>
2025-07-02 16:40:51 +08:00
7fc1a98489 add ut for kv tansfer module (#1531)
### What this PR does / why we need it?
Test KV data transfer, covering the connector, pipe, and buffer.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with new added test.

---------

Signed-off-by: lixudong <lixudong@cmss.chinamobile.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: lixudong <lixudong@cmss.chinamobile.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
2025-07-02 16:14:52 +08:00
aa5fa07478 Only enable single version for wheel pr build (#1571)
### What this PR does / why we need it?
Only enable single version for wheel pr build to speedup PR triggered CI

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-02 14:50:34 +08:00
c3c8c9317c [DOC] add LoRA user guide (#1265)
### What this PR does / why we need it?
Add LoRA user guide to DOC. The content refers to [LoRA
Adapters](https://docs.vllm.ai/en/latest/features/lora.html).

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
No

---------

Signed-off-by: paulyu12 <507435917@qq.com>
2025-07-02 14:41:31 +08:00
f39365d2ea [Benchmark] Fix error msg upload in performance benchmark (#1559)
### What this PR does / why we need it?

Make sure that None parameters are not passed in for `--error`
### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

CI passed locally

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-07-02 14:06:08 +08:00
641a4e6092 [CI] Cache sampled token ids in model runner to fix CI error (#1573)
### What this PR does / why we need it?
vLLM change
7f280d69c9
broke vllm-ascend.

This PR fixes the broken CI.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
passed

Closes: https://github.com/vllm-project/vllm-ascend/issues/1572

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-07-02 12:11:14 +08:00
0e43813120 [ModelRunner] Use shared CachedRequestData cross request to fix ci (#1546)
### What this PR does / why we need it?

This PR (adapted from
2863befce3)
updates the CachedRequestData definition to use a single instance shared
across all requests in a batch, instead of creating a new instance per
request.

CI was broken by vLLM's model_runner change: `ERROR 07-01 09:53:53
[core.py:521] TypeError: 'CachedRequestData' object is not iterable`.
The model_runner is modified here to fix it.


### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Passing CI will verify this.

---------

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-07-02 06:05:21 +08:00
6db7dc2c85 [Benchmark] Refactor perf script to use benchmark cli (#1524)
### What this PR does / why we need it?

Since the `vllm bench` CLI is now optimized enough for production use (it supports
more datasets), we no longer need to copy vLLM code; with vLLM installed, we can
use the benchmark CLI directly.
### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-30 23:42:04 +08:00
53ec583bbb [Docs] Update Altlas 300I series doc and fix CI lint (#1537)
### What this PR does / why we need it?
- Update the Atlas 300I series doc: clean up unused parameters and enable
optimized ops
- Fix the codespell CI

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

---------

Signed-off-by: leo-pony <nengjunma@outlook.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-30 23:34:00 +08:00
a054f0f4ca [CI] change to new ds model (#1513)
Previously, the DeepSeek V3 pruning weights were not correct, so the MoE layer
was not tested. We updated to a new pruning model to enable MoE layer
compute.

This PR fixes the CI to work with the new weights.

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-30 19:02:29 +08:00
8013634e9c [Structured Output] Remove redundant check for grammar_bitmask (#1459)
### What this PR does / why we need it?
Remove the redundant check since we have already checked this at
https://github.com/vllm-project/vllm-ascend/blob/main/vllm_ascend/worker/model_runner_v1.py#L1450.


Signed-off-by: shen-shanshan <467638484@qq.com>
2025-06-30 17:39:19 +08:00
ba577dfc52 [Doc] Add Structured Output guide (#1499)
### What this PR does / why we need it?
Add Structured Output guide.


Signed-off-by: shen-shanshan <467638484@qq.com>
2025-06-30 17:21:44 +08:00
whx
f286265791 [BugFix] Address PrefillCacheHit state to fix prefix cache accuracy bug (#1498)
When using AscendScheduler with prefix cache enabled and chunked prefill
disabled, there is an accuracy problem because there is no branch in
mla_v1 to handle this scenario. This PR fixes it.

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-06-30 16:51:20 +08:00
5f8241c25c [V1][ModelRunner] Support pooling model for v1 engine (#1359)
### What this PR does / why we need it?
Change as little existing code as possible to add V1 pooling task support.
Note that I moved `vllm.v1.worker.gpu_input_batch` down into vllm-ascend:
considering the frequent changes in upstream interfaces, it is copied here
to decouple from them.
### How was this patch tested?
CI passed with newly added/existing tests. A simple test was also first
conducted locally, adapted from
https://www.modelscope.cn/models/Qwen/Qwen3-Embedding-0.6B, as shown
below:
```python
import os

import torch
from vllm import LLM


os.environ["VLLM_USE_MODELSCOPE"]="True"

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery:{query}'

# Each query must come with a one-sentence instruction that describes the task
task = 'Given a web search query, retrieve relevant passages that answer the query'

queries = [
    get_detailed_instruct(task, 'What is the capital of China?'),
    get_detailed_instruct(task, 'Explain gravity')
]
# No need to add instruction for retrieval documents
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
]
input_texts = queries + documents

model = LLM(model="Qwen/Qwen3-Embedding-0.6B", task="embed")

outputs = model.embed(input_texts)
embeddings = torch.tensor([o.outputs.embedding for o in outputs])
scores = (embeddings[:2] @ embeddings[2:].T)
print(scores.tolist())
# [[0.7620252966880798, 0.14078938961029053], [0.1358368694782257, 0.6013815999031067]]
```
---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: wangli <858794774@qq.com>
Co-authored-by: wangli <858794774@qq.com>
2025-06-30 16:31:12 +08:00
790c810bf7 Bump actions/github-script from 6 to 7 (#1519)
Bumps [actions/github-script](https://github.com/actions/github-script)
from 6 to 7.
Full changelog: https://github.com/actions/github-script/compare/v6...v7

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-06-30 16:04:41 +08:00
e4df0a4395 Add Pangu MoE Pro for 300I series docs (#1516)
### What this PR does / why we need it?
Add Pangu MoE Pro for 300I series docs

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-30 13:37:22 +08:00
cad4c693c6 Add Pangu MoE Pro docs (#1512)
### What this PR does / why we need it?
This PR add Pangu MoE Pro 72B docs

[1] https://gitcode.com/ascend-tribe/pangu-pro-moe-model

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-30 12:15:33 +08:00
75d05ee200 [Core] Fix block table shape to make Prefix cache work with Ascend scheduler (#1446)
### What this PR does / why we need it?

This fixes the shape of block_table, which was broken by the hybrid KV
groups change several weeks ago.

An error is raised when prefix cache (eager or not) and the Ascend
Scheduler are enabled at the same time; just send two identical requests
and it will reproduce.
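
A reproduction sketch along those lines (the `additional_config` key used to enable the Ascend scheduler is an assumption and may differ from the real option name):

```python
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",
    enable_prefix_caching=True,
    additional_config={"ascend_scheduler_config": {"enabled": True}},
)
# Two identical requests: the second one hits the prefix cache and used to
# trigger the block_table shape error.
prompts = ["Explain prefix caching in one sentence."] * 2
outputs = llm.generate(prompts, SamplingParams(max_tokens=16))
```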

v0.9.1: https://github.com/vllm-project/vllm-ascend/pull/1297

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Test manually

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-06-30 11:25:19 +08:00
b308a7a258 support pangumoe w8a8c8 and docs (#1477)
### What this PR does / why we need it?
Support Pangu MoE W8A8C8 quantization.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed with new added test.

Signed-off-by: zhuyilin <809721801@qq.com>
2025-06-28 18:51:07 +08:00
c59d69d9e6 [PERF]support MERRouter (#1421)
### What this PR does / why we need it?
This PR introduces an expert rearrangement algorithm for the PanguProMoE model.
Different from the original grouped top-k, it filters out the top experts
that are allocated more tokens. Therefore, we can load fewer experts when
calculating the grouped matmul.

We have tested this algorithm for PanguProMoE-72B on the 300I Duo platform and
the 800I A2 platform. On 300I Duo, we find that setting `num_voted_experts`
to 5 achieves both good performance and accuracy, while on 800I A2 we
keep it at 8 to use the original Pangu grouped top-k.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?

Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
2025-06-28 16:14:49 +08:00
8fa188111d [PERF]support H2P communication optimization for PanguProMoe (#1463)
### What this PR does / why we need it?
In this PR, we support the H2P communication optimization when running
PanguProMoE with dp_size > 1. H2P uses `reduce_scatter` and `all_gather`
to replace `all_reduce` to improve performance:

Original layer:
input_layernorm --> attn --> tp all_reduce --> post_attention_layernorm
--> dp all_gather --> moe/mlp --> dp reduce_scatter --> tp all_reduce

Now:
input_layernorm --> tp all_gather --> attn --> tp reduce_scatter -->
post_attention_layernorm --> all_rank all_gather --> moe/mlp -->
all_rank reduce_scatter

Besides, because `reduce_scatter` requires a num_tokens that is divisible by
the group size, we need to pad the sequences based on
`max_tokens_across_dp`, as sketched below.
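
A minimal padding sketch for that constraint (names are illustrative; the real code derives the target length from `max_tokens_across_dp`):

```python
import torch


def pad_to_group_multiple(hidden_states: torch.Tensor,
                          max_tokens_across_dp: int,
                          group_size: int) -> torch.Tensor:
    # reduce_scatter needs a token count divisible by the group size, so pad
    # every rank up to the smallest such multiple covering the largest batch.
    target = ((max_tokens_across_dp + group_size - 1) // group_size) * group_size
    pad_len = target - hidden_states.shape[0]
    if pad_len > 0:
        pad = hidden_states.new_zeros(pad_len, *hidden_states.shape[1:])
        hidden_states = torch.cat([hidden_states, pad], dim=0)
    return hidden_states
```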

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
This PR has been tested with both offline and online inference using
PanguProMoE-72B.

---------

Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
2025-06-28 16:10:27 +08:00
5c53cbaf2a [BugFix]Fix bugs when initializing communication groups with dp on 300I Duo (#1478)
### What this PR does / why we need it?
This PR fixes a bug where broadcast was used with cpu_group when running DP.
The `broadcast310p` patch takes effect for both the cpu_group and the
device group, but we only need it for the device group. Hence a wrapper is
added to let cpu_group use the native torch broadcast, which solves the
bug.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
With this PR, DP on 310P runs normally and generates reasonable answers.

Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
2025-06-28 16:07:52 +08:00
2cf9c4c3a2 [CI/Build] Fix version conflict on transformers (#1490)
### What this PR does / why we need it?
Fix version conflict on transformers:
`pip._vendor.pkg_resources.ContextualVersionConflict: (transformers
4.53.0 (/usr/local/python3.10.17/lib/python3.10/site-packages),
Requirement.parse('transformers<4.53.0'), {'vllm-ascend'})`
Fix
https://github.com/vllm-project/vllm-ascend/actions/runs/15933263325/job/44947231642

### Does this PR introduce _any_ user-facing change?
Fix broken build

### How was this patch tested?
CI passed with new existing test.

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-06-28 15:11:04 +08:00
5f4391652f [PromptLogprobs][V1] Support prompt logprobs to fix ceval accuracy in V1 (#1483)
### What this PR does / why we need it?
Support prompt logprobs in V1. This also enables lm_eval to test accuracy
on V1.

### Does this PR introduce _any_ user-facing change?
support prompt logprobs output

### How was this patch tested?
CI passed with accuracy test.

Using lm_eval, which relies on prompt logprobs as output to test accuracy:
```python
VLLM_USE_V1=1 lm_eval \
  --model vllm \
  --model_args pretrained=Qwen/Qwen2.5-7B-Instruct,max_model_len=4096,block_size=4 \
  --tasks ceval-valid_computer_network \
  --batch_size 8
```
After this pr, the accuracy test results of `Qwen/Qwen2.5-7B-Instruct`
on V1 is:
```bash
|           Tasks            |Version|Filter|n-shot| Metric |   |Value |   |Stderr|
|----------------------------|------:|------|-----:|--------|---|-----:|---|-----:|
|ceval-valid_computer_network|      2|none  |     0|acc     |↑  |0.7368|±  |0.1038|
|                            |       |none  |     0|acc_norm|↑  |0.7368|±  |0.1038|
```

Closes: https://github.com/vllm-project/vllm-ascend/issues/1043

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-06-28 09:38:52 +08:00
99e685532d [Doc] Add Qwen2.5-VL eager mode doc (#1394)
### What this PR does / why we need it?
Add Qwen2.5-VL eager mode doc.

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-06-28 09:08:51 +08:00
d59e7fa095 [CI] Pin transformers<4.53.0 and fix EPLB load_weights to make CI passed (#1482)
### What this PR does / why we need it?

- Fix the vLLM EPLB breakage from
e9fd658a73
by temporarily recovering load_weights back to the [v0.9.1
version](07b8fae219).

- Fix the transformers>=4.53.0 image processor breakage
Related: https://github.com/vllm-project/vllm-ascend/issues/1470

- Mirror torch_npu requirements to pyproject.toml

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-28 00:12:43 +08:00
3687676fa7 [Doc] Add guidance on how to implement and register new models (#1426)
### What this PR does / why we need it?
Add guidance on how to implement and register new models.

Modified based on PR
https://github.com/vllm-project/vllm-ascend/pull/1126; thanks to
@linfeng-yuan for the contribution.

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-06-27 16:46:49 +08:00
5571fb7118 [Misc] Add release checklist issue template (#1447)
Add the release checklist issue template.

Every release manager should create and follow the checklist to do the
release step by step.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-27 09:15:36 +08:00
5968dff4e0 [Build] Add build info (#1386)
Add a static build_info py file to show SoC and sleep mode info. It helps
keep the code clean and makes the error info more friendly for
users.

This PR also added the unit test for vllm_ascend/utils.py

This PR also added the base test class for all ut in tests/ut/base.py

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-27 09:14:43 +08:00
c563a08f0a [CI] Fix nightly benchmark (#1453)
### What this PR does / why we need it?
Sometimes the performance benchmark workflow may fail. We want to emit a
warning when a run fails and avoid uploading the dirty data from the
failed run.

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-26 19:39:18 +08:00
192dbbcc6e Optimize Patch developer guide (#1452)
### What this PR does / why we need it?
Fix some terms in the user guide.


Signed-off-by: zeshengzong <zesheng.zong@outlook.com>
2025-06-26 19:10:16 +08:00
e5eea64b66 [CI/UT] Add ut for parallel_state.py (#1460)
### What this PR does / why we need it?
 Add ut for parallel_state.py

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
 python -m unittest  test_parallel_state.py

---------

Signed-off-by: wangyanhui-cmss <wangyanhui_yewu@cmss.chinamobile.com>
2025-06-26 19:03:27 +08:00
4e2daf5ab7 [Doc] Add qwen2-audio eager mode tutorial (#1371)
### What this PR does / why we need it?
Add qwen2-audio eager mode tutorial.


Signed-off-by: shen-shanshan <467638484@qq.com>
2025-06-26 16:56:05 +08:00
1025344912 Doc Enhancement: Single NPU(Qwen3-8B) aclgraph mode + eager mode (#1374)
### What this PR does / why we need it?
Doc Enhancement: Single NPU(Qwen3-8B) aclgraph mode + eager mode.
Relate RFC: https://github.com/vllm-project/vllm-ascend/issues/1248

### Does this PR introduce _any_ user-facing change?
No changes.


### How was this patch tested?
Preview

Signed-off-by: leo-pony <nengjunma@outlook.com>
2025-06-26 16:52:54 +08:00
53c2d58ae1 Handle with_prefill_across_dp for multistream mla (#1322)
### What this PR does / why we need it?
After #1094, decode might be executed in non-compiled mode regardless of
`torchair_graph_config.enabled`, causing multistream MLA to fail, since it
assumes torchair compiled mode for decode when
`torchair_graph_config.enabled == True`.
Augment that assumption to fix this.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
Tested both offline, and by graph mode mla e2e testcase.

---------

Signed-off-by: sdmyzlp <lrwei2@petalmail.com>
2025-06-26 09:32:07 +08:00
2690697caa [Bugfix] Reset all unused positions to prevent out-of-bounds in GatherV3 (#1416)
### What this PR does / why we need it?
Reset all unused positions in `NPUModelRunner` to prevent out-of-bounds
asserts in the `GatherV3` operator.

Currently, in
[`get_splitfuse_attn_mask`](https://github.com/vllm-project/vllm-ascend/blob/main/vllm_ascend/attention/attention.py#L124),
the `position` tensor may contain values that exceed the dimensions of
the attention mask, triggering a `GatherV3` boundary check failure.
These invalid indices originate from stale “dirty” entries left over in
`position` due to padding logic in the ACL graph. Specifically, in
[`_process_reqs`](https://github.com/vllm-project/vllm-ascend/blob/main/vllm_ascend/worker/model_runner_v1.py#L989),
the variable `num_input_tokens` is always greater than or equal to
`total_num_scheduled_tokens`, so any positions not explicitly cleared
from a previous batch will persist and cause this sporadic error.

BTW, in the original vLLM implementation, masks are constructed
internally using other args, so these lingering values do not surface.
However, on the Ascend platform—where split-fuse attention requires
externally supplied masks—these residual indices become critical and
lead to this elusive, hard-to-reproduce failure.

The fix is to explicitly reset or zero out all unused entries in the
`position` tensor before passing it to `GatherV3`, ensuring that every
index lies within the valid range of the attention mask.
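
A minimal sketch of that reset (variable names follow the description above):

```python
import torch


def reset_unused_positions(positions: torch.Tensor,
                           total_num_scheduled_tokens: int,
                           num_input_tokens: int) -> None:
    # num_input_tokens >= total_num_scheduled_tokens because of ACL graph
    # padding; zero the padded tail so every index stays inside the mask.
    positions[total_num_scheduled_tokens:num_input_tokens] = 0
```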

Closes: https://github.com/vllm-project/vllm-ascend/issues/1038

### Does this PR introduce _any_ user-facing change?
No


Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-06-26 09:27:43 +08:00
06ccce1ddf [FOLLOWUP] fix name and format in accuracy test (#1288) (#1435)
### What this PR does / why we need it?
Fix the accuracy test:
1. Fix the accuracy report,
like: https://vllm-ascend--1429.org.readthedocs.build/en/1429/developer_guide/evaluation/accuracy_report/Qwen2.5-7B-Instruct-V0.html
2. Fix creating the PR for the report

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-06-26 00:26:54 +08:00
2fda60464c [Perf] Use fused ops npu_top_k_top_p (#1308)
### What this PR does / why we need it?
Use the fused op `torch_npu.npu_top_k_top_p(logits, p, k)` when p and k are
not None; otherwise fall back to the original implementation. The replacement
takes place automatically when `VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE=1`.

This patch uses `npu_top_k_top_p`, which requires
torch_npu>=2.5.1.post1.dev20250619.
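
A dispatch sketch of the behaviour described above (the fallback body is only a placeholder for the original top-k/top-p implementation, and the exact dispatch mechanics are assumed):

```python
import os

import torch


def apply_top_k_top_p(logits: torch.Tensor, p, k) -> torch.Tensor:
    use_fused = (
        p is not None
        and k is not None
        and os.getenv("VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE", "0") == "1"
    )
    if use_fused:
        import torch_npu
        # Fused kernel: applies top-k and top-p filtering in one op.
        return torch_npu.npu_top_k_top_p(logits, p, k)
    return logits  # placeholder: fall back to the original implementation
```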

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Tested by DeepSeek R1 and UT passed

Signed-off-by: Pr0Wh1teGivee <calvin_zhu0210@outlook.com>
2025-06-25 20:59:06 +08:00
e7efc7e7e7 [BugFix] Remove not using patch_eagle.py for CI. (#1385)
### What this PR does / why we need it?
This PR aims to address a long-standing **CI bug** and remove unused
code. The specific changes include:

1. **Fixing CI Bug**: Resolves the root cause of CI test failures or
instability. This often stems from incorrect environment configurations,
dependency version conflicts, or flawed test script logic. This fix
ensures the reliability and consistency of the CI pipeline.
2. **Removing `patch_eagle.py`**: Deletes the `patch_eagle.py` file,
which is no longer utilized by the project. This file was likely legacy
code, experimental code, or its functionality has since been replaced by
other modules. Its removal helps reduce codebase complexity, improves
maintainability, and prevents potential confusion.

### Does this PR introduce _any_ user-facing change?
No, this PR primarily focuses on internal CI stability maintenance and
code cleanup. It does not introduce any user-visible changes to APIs,
interfaces, or other behaviors.

### How was this patch tested?
CI passed. Specifically:

1. **Existing CI Pipelines Passed**: After fixing the CI bug, all
existing CI tests and pipelines were verified to run correctly and pass
successfully.
2. **Code Cleanup Verified**: Following the removal of `patch_eagle.py`,
it was ensured that any related functional modules (if applicable)
continue to work as expected, without introducing new regressions. This
was typically verified by running the project's main test suite.

Signed-off-by: yuancaoyaoHW <a2749322671@gmail.com>
2025-06-25 20:36:05 +08:00
941269a6c5 adjusting the communication method in graph mode (#1194)
### What this PR does / why we need it?
Communication performance optimization: replace all_reduce with
reduce_scatter + all_gather in the MLA layer's TP group, to remove the
strided slice and all_gather in the MoE layer.
When tp > 1, it is enabled during the decode phase of graph mode when
enable_multistream_moe, MLA, use_v1, and MC2 are all used.
According to the end-to-end RL inference test results, this PR brings a
3% gain in the decode stage.

**Before Improvement**
Profiling kernel_details

![image](https://github.com/user-attachments/assets/1bb5dfa1-809b-410a-90c9-c5fd23cff003)
Evaluation

![image](https://github.com/user-attachments/assets/0b8ea0c7-88e7-410f-9ef4-f0cfe910cdc7)

![image](https://github.com/user-attachments/assets/94fde910-c125-4c2e-8de4-88fc3fafc057)

**After Improvement**
Profiling kernel_details

![image](https://github.com/user-attachments/assets/55fac0e0-11f2-4654-8fd4-287949e0b29e)
Evaluation

![image](https://github.com/user-attachments/assets/e923f74b-29c4-4171-9382-40a00cf05df0)

![image](https://github.com/user-attachments/assets/5dba7967-07ea-4926-a8be-804bfd34e3e4)

### Does this PR introduce _any_ user-facing change?
Users need to configure enable_multistream_moe=True

### How was this patch tested?
Add e2e test cases to cover code logic

Signed-off-by: sharonyunyun <zhangying134@huawei.com>
2025-06-25 19:56:49 +08:00
205cb85a1e [Doc] Fix doc typo (#1424)
1. Fix the typo
2. Fix a 404 URL
3. Update the graph mode and additional config user guides

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-25 19:28:26 +08:00
ca884ef86d [Misc] Clean up uesless code for LLM initialize (#1373)
This PR aims to clean up the useless code in the LLM setup. It helps
make the code clearer.
1. Remove useless `self.xxx` properties
2. Change `set_random_seed` to `seed_everything`
3. Remove `set_custom_all_reduce`; it's only used for CUDA

This is just a code cleanup; no code logic is changed.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-25 16:20:14 +08:00
0060886a37 [CI]Update accuracy report test (#1288)
### What this PR does / why we need it?
Update the accuracy report test:
1. Record commit hashes and GitHub links for both vllm and
vllm-ascend in accuracy reports
2. Add accuracy result verification checks to ensure output correctness
3. Create PRs via the forked repository workflow

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
dense-accuracy-test:
https://github.com/vllm-project/vllm-ascend/actions/runs/15745619485
create pr via forked repository workflow:
https://github.com/zhangxinyuehfad/vllm-ascend/actions/runs/15747013719/job/44385134080
accuracy report pr:
https://github.com/vllm-project/vllm-ascend/pull/1292

Currently, the accuracy report in use is old; it needs to be merged into a
PR, retested, and updated with a new report, then #1292 can be closed.


Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-06-25 14:10:34 +08:00
15df8be937 [Doc] Add sleep mode doc (#1295)
### What this PR does / why we need it?
Add sleep related doc and example

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-25 14:07:14 +08:00
e4e0b7af05 [Doc] Add patch doc (#1414)
1. Format the developer guide content to make it clearer
2. Add the patch doc to the developer guide

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-25 12:00:45 +08:00
52317f92cb [DP] Tiny fix of dp and update example (#1273)
### What this PR does / why we need it?
Add `max_num_tokens_across_dp` to AscendMetadata to fix DP.

This PR fixes the bug introduced by
https://github.com/vllm-project/vllm-ascend/pull/1229, which added an arg
`max_num_tokens_across_dp` when dp_size > 1.
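
A minimal sketch of the metadata change (the real AscendMetadata carries the full attention metadata for a step; only the new field is shown):

```python
from dataclasses import dataclass


@dataclass
class AscendMetadata:
    # New field: the largest token count across DP ranks for this step, used
    # to size padding consistently when dp_size > 1.
    max_num_tokens_across_dp: int = 0
```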

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-06-25 11:03:04 +08:00
c1c5d56255 [Doc] Update FAQ and add test guidance (#1360)
### What this PR does / why we need it?
- Add test guidance
- Add reduce layer guidance
- Update the FAQ on deterministic calculation

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-25 09:59:23 +08:00
5f5800ba42 [Bugfix] Sync MRotaryEmbedding interface change to recover CI (#1399)
### What this PR does / why we need it?

Sync MRotaryEmbedding interface change to recover main CI
(https://github.com/vllm-project/vllm/pull/19939)

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-24 22:56:39 +08:00
6ed3f00427 [Doc] remove environment variable VLLM_ENABLE_MC2 (#1406)
### What this PR does / why we need it?
remove unused environment variable VLLM_ENABLE_MC2


Signed-off-by: liziyu <liziyu16@huawei.com>
2025-06-24 21:18:10 +08:00
20767a043c [CI/UT] Fix disaggregated prefill ci (#1313)
### What this PR does / why we need it?
Use eager mode to run disaggregated prefill ci

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with new existing test.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-06-24 17:11:00 +08:00
9cbce423ce [MISC] Remove useless patch (#1366)
### What this PR does / why we need it?
`stateless_init_dp_group` in vllm works with non-cuda platform now.
Remove this useless patch.

Which was introduced in vllm-ascend by
e74331a1ed
(v0.8.4rc2)
vLLM upstream merged:
3e472d882a
(v0.8.0)

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-24 10:05:59 +08:00
5177bef87a support fused_moe_allgather_ep (#1335)
### What this PR does / why we need it?
support fused_moe_allgather_ep

### How was this patch tested?
It was tested by UT.

Signed-off-by: lyj-jjj <liuyingjun5@huawei.com>
2025-06-23 22:03:38 +08:00
917c6b71af [TEST][DOC] Fix doctest and add system package installation (#1375)
### What this PR does / why we need it?
- Fix
[doctest](https://github.com/vllm-project/vllm-ascend/actions/workflows/vllm_ascend_doctest.yaml?query=event%3Aschedule)
- add system package installation
- Add doc for run doctests
- Cleanup all extra steps in .github/workflows/vllm_ascend_doctest.yaml
- Change schedule job from 4 ---> 12 hours

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- doctest CI passed
- Local test with
`/vllm-workspace/vllm-ascend/tests/e2e/run_doctests.sh`.

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-23 20:50:33 +08:00
08cfc7cb4b Modify installation.md for adding pip extra index of torch-npu (#1272)
### What this PR does / why we need it?
Modify installation.md for adding pip extra index of torch-npu

### How was this patch tested?
No need

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-06-23 15:37:50 +08:00
e1123172d1 [Doc] Add reinstall instructions doc (#1303)
Add a new FAQ: if users re-install vllm-ascend with pip, the `build`
folder should be removed first.

---------

Signed-off-by: rjg-lyh <1318825571@qq.com>
Signed-off-by: weiguihua <weiguihua2@huawei.com>
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
2025-06-23 14:06:27 +08:00
15592c0d48 [bugfix] fix accuracy prolem for deepseek V3/R1 models with torchair graph in long sequence predictions (#1331)
### What this PR does / why we need it?
Fix the issue of insufficient cached cosine and sine length in MLA's
TorchAir graph mode, which causes accuracy deviation during
long-sequence inference.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
We tested the accuracy of this patch with DeepSeek R1 e2e benchmark
serving, and got a score of 83.33 on the AIME2024 dataset with the
DP4TP4EP16 setting.

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-06-23 09:52:27 +08:00
f04c6763d8 [Bugfix] fix env variable in dbo (#1284)
### What this PR does / why we need it?
Fix the env variable in DBO to enable DBO in the DeepSeek-V3 model. Besides, we
have fixed a known issue in deepseek-dbo.


### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
This patch can be tested with newly added e2e tests:
[tests/multicard/test_offline_inference_distributed.py](https://github.com/vllm-project/vllm-ascend/pull/1285/files#diff-7cd2e6b1bda6b8ad1bedb3276971fe7064aeae4dc0efd41c301c4ede2158c57e).
It can be verified with pytest.

---------

Signed-off-by: zhuohuan <zxdu1997@gmail.com>
2025-06-23 09:07:57 +08:00
21fb68a03a [CI] Update guided decoding ut (#1312)
### What this PR does / why we need it?
Update guided decoding ut.

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-06-23 09:06:20 +08:00
339d6894f6 [CI/UT][bugfix] fix v0 spec decode (#1321)
### What this PR does / why we need it?
1. [PR913](https://github.com/vllm-project/vllm-ascend/pull/913)
introduced an error that caused V0's spec decode function to fail.
[PR1109](https://github.com/vllm-project/vllm-ascend/pull/1109) wanted
to fix this problem. Unfortunately, the fix broke the ngram function. I
fixed the ngram function in this PR. **PS**: Q: Why was the ngram problem
not found when PR1109 was merged? A: The newly introduced problem only
appears when tp>1, and the cases on CI are all tp=1.
2. In versions after 0.7.3, vllm-ascend deleted some spec decode UTs to
avoid CI taking too long, including the eagle speculative UTs, which left CI
unable to cover the eagle function. I added
it (`test_eagle_correctness.py`) back in this PR.
3. Because of the reason mentioned in 2, the current version of Eagle
has a problem. I located and fixed it. It was because vLLM's
`draft_model_runner.py` was changed and vllm-ascend was not synchronized
in time.
4. Currently, the UTs of V0 and V1 are mixed in the spec_decode
directory. I split them into two directories: spec_decode_v0 and
spec_decode_v1.
5. I found that
`vllm.spec_decode.multi_step_worker.MultiStepWorker.set_include_gpu_probs_tensor`
and
`vllm.spec_decode.multi_step_worker.MultiStepWorker.set_should_modify_greedy_probs_inplace`
have changed in vLLM, so I removed them in this PR.

### Does this PR introduce _any_ user-facing change?
This PR fixes the functions of ngram and eagle spec decode in the v0
engine

### How was this patch tested?
tested by CI

Signed-off-by: mengwei805 <mengwei25@huawei.com>
2025-06-23 09:05:13 +08:00
7e6efbf2a9 update torch-npu to 2.5.1.post1.dev20250619 (#1347)
### What this PR does / why we need it?
This PR updates torch_npu to the newest release version,
2.5.1.post1.dev20250619.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

CI tests will guarantee the update.

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-06-23 09:02:09 +08:00
4447e53d7a [Doc] Change not to no in faqs.md (#1357)
### What this PR does / why we need it?

Change not to no in faqs.md.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?

Local Test

Signed-off-by: xleoken <xleoken@163.com>
2025-06-23 09:01:00 +08:00
a95afc011e [CI] Enable merge trigger unit test and accuracy test schedule job (#1345)
### What this PR does / why we need it?
- Enable merge trigger unit test and accuracy test schedule job
- Pin lm-eval==0.4.8 to resolve Qwen3 8B accuracy
### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-22 17:21:57 +08:00
2e5f312530 Cleanup ununsed doc (#1352)
### What this PR does / why we need it?
Clean up unused doc for the MoGE model; we will add it back when the MoGE
model is ready.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-22 15:05:30 +08:00
c30ddb8331 Bump v0.9.1rc1 release (#1349)
### What this PR does / why we need it?
Bump v0.9.1rc1 release

Closes: https://github.com/vllm-project/vllm-ascend/pull/1341
Closes: https://github.com/vllm-project/vllm-ascend/pull/1334

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed


---------

Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Signed-off-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: shen-shanshan <467638484@qq.com>
2025-06-22 13:15:36 +08:00
097e7149f7 [Platform] Add initial experimental support for Altlas 300I series (#1333)
### What this PR does / why we need it?
Add initial experimental support for Ascend 310P. This patch squashes the
PRs below into one to help validation:

- https://github.com/vllm-project/vllm-ascend/pull/914
- https://github.com/vllm-project/vllm-ascend/pull/1318
- https://github.com/vllm-project/vllm-ascend/pull/1327


### Does this PR introduce _any_ user-facing change?
Users can run vLLM on the Atlas 300I Duo series

### How was this patch tested?
CI passed with:
- E2E image build for 310P
- CI test on A2 with e2e test and longterm test
- Unit tests are missing because a real 310P image is needed to run them;
they will be added in a separate PR later.
- Manually e2e test:
- Qwen2.5-7b-instruct, Qwen2.5-0.5b, Qwen3-0.6B, Qwen3-4B, Qwen3-8B:
https://github.com/vllm-project/vllm-ascend/pull/914#issuecomment-2942989322
  - Pangu MGoE 72B


The patch has been tested locally on Ascend 310P hardware to ensure that
the changes do not break existing functionality and that the new
features work as intended.

#### ENV information

CANN, NNAL version: 8.1.RC1
> [!IMPORTANT]  
> PTA 2.5.1 version >= torch_npu-2.5.1.post1.dev20250528 to support NZ
format and calling NNAL operators on 310P

#### Code example

##### Build vllm-ascend from source code

```shell
# download source code as vllm-ascend
cd vllm-ascend
export SOC_VERSION=Ascend310P3
pip install -v -e .
cd ..
```

##### Run offline inference

```python
from vllm import LLM, SamplingParams
prompts = ["水的沸点是100摄氏度吗?请回答是或者否。", "若腋下体温为38摄氏度,请问这人是否发烧?请回答是或者否。",
           "水的沸点是100摄氏度吗?请回答是或者否。", "若腋下体温为38摄氏度,请问这人是否发烧?请回答是或者否。"]

# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.0, top_p=0.95, max_tokens=10)
# Create an LLM.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",
    max_model_len=4096,
    max_num_seqs=4,
    dtype="float16", # IMPORTANT cause some ATB ops cannot support bf16 on 310P
    disable_custom_all_reduce=True,
    trust_remote_code=True,
    tensor_parallel_size=2,
    compilation_config={"custom_ops":['none', "+rms_norm", "+rotary_embedding"]},
)

# Generate texts from the prompts.
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

```

---------

Signed-off-by: Vincent Yuan <farawayboat@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: Vincent Yuan <farawayboat@gmail.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: leo-pony <nengjunma@outlook.com>
Co-authored-by: shen-shanshan <467638484@qq.com>
2025-06-21 09:00:16 +08:00
2009fdb8da [Test] Enable code cov for V1 and enable push trigger (#1164)
### What this PR does / why we need it?
- Enable code cov for V1
- Enable push triggered job

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-21 00:01:05 +08:00
2f1266d451 Support Pangu Pro MoE model (#1204)
### What this PR does / why we need it?
Support Pangu Pro MoE model (https://arxiv.org/abs/2505.21411)

### Does this PR introduce _any_ user-facing change?
Yes, new model supported

### How was this patch tested?
Test locally

---------

Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
2025-06-20 23:59:59 +08:00
00ae250f3c [V1][eagle3] Support eagle3 proposer for v1 (#1032)
### What this PR does / why we need it?
This PR implements the Eagle Proposer feature for vLLM v1, which enables
more efficient speculative decoding by using a draft model to predict
potential future tokens.
- The implementation includes the core Eagle algorithm integration with
vLLM's existing architecture, allowing for faster inference while
maintaining output quality.
- This is needed to significantly improve the generation speed of large
language models without compromising on the quality of generated text.

### Does this PR introduce any user-facing change?
Yes, this PR introduces a new speculative decoding mode that can be
enabled via configuration.
- Users can now choose to use the Eagle Proposer by setting appropriate flags
in the inference configuration.
- The API remains backward compatible, with the new functionality being
opt-in.

### How was this patch tested?
CI passed with new unit tests added for the Eagle Proposer functionality.
- Benchmark tests were conducted comparing generation speed and quality
with and without the Eagle Proposer.
- Integration tests were performed with various model architectures to
ensure compatibility.
- Manual testing was done using different prompt scenarios to verify
output quality remains consistent.
- We tested the acceptance rate on one Ascend 910B NPU; the results are
basically consistent with those shown here:
https://github.com/vllm-project/vllm/pull/16937
- Currently, we support scenarios where num_spec_tokens <= 2. When
num_spec_tokens > 2, issues such as insufficient GPU memory and operator
computation errors may occur. We will address this in subsequent
updates.
- We will add support for Eagle v1 in future updates.

### Acceptance Test Script
```bash
SCRIPT="/offline/eagle.py"
DATASET="ShareGpt"
MODEL=Meta-Llama-3.1-8B-Instruct
DRAFT=EAGLE3-LLaMA3.1-Instruct-8B

CUDA_VISIBLE_DEVICES="0" VLLM_USE_V1=1 $PYTHON $SCRIPT \
    --dataset $DATASET \
    --num_spec_tokens 2 \
    --max_num_seqs 1 \
    --model_dir $MODEL \
    --eagle_dir $DRAFT \
    --tp 1 \
    --num_prompts 80
```
### Acceptance Test Results
```bash
██████████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [21:22<00:00, 16.03s/it, est. speed input: 4.72 toks/s, output: 13.56 toks/s]
-------------------------------------------------------------------------------------
mean acceptance length: 1.63
-------------------------------------------------------------------------------------
total_counts: 8062
acceptance at token 0: 1.00 (8062 times)
acceptance at token 1: 0.70 (5612 times)
acceptance at token 2: 0.47 (3765 times)
```

Closes: https://github.com/vllm-project/vllm-ascend/issues/1004

---------

Signed-off-by: yuancaoyaoHW <a2749322671@gmail.com>
2025-06-20 17:19:54 +08:00
45be1aac0c [CI] Add codespell check for doc (#1314)
Add a codespell check for doc-only PRs

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-20 16:48:14 +08:00
761bd3d9d7 Add user guide for quantization (#1206)
### What this PR does / why we need it?

Add user guide for quantization

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Preview

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-06-20 15:53:25 +08:00
2c7dd85fd8 [Fix] Fix the token-wise padding mechanism (#1300)
### What this PR does / why we need it?
Fix the token-wise padding mechanism.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-06-20 14:46:17 +08:00
b350edae9a [UT] refactor test_expert_load_balancer and fix broken CI (#1293)
Refactor test_expert_load_balancer to keep the UT code style.

This PR also fixes the breaking change from
https://github.com/vllm-project/vllm/pull/16188/files#diff-e2942ece30a5c580437694ffb964bfc664b510c59244c08e5921b8f5cefb4280

This is just a quick fix. We'll support embedding on V1 later.

Closes: https://github.com/vllm-project/vllm-ascend/issues/1299

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-20 01:02:52 +08:00
ebb2a70dbb static EPLB fix bug, add unit test (#1186)
### What this PR does / why we need it?
1. Add a static EPLB unit test.
2. Fix bug: a Tensor cannot be directly judged by an if statement.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?
Run the unit test.

---------

Signed-off-by: songshanhu07 <1763685535@qq.com>
2025-06-18 19:46:56 +08:00
2cd8ecdc4f [Bugfix][Spec Decode] Enable ACL_OP_INIT_MODE=1 directly only when using V0 spec decode (#1258)
### What this PR does / why we need it?

Enable `ACL_OP_INIT_MODE=1` directly only when using V0 spec decode.

Find more details at **mengwei805**'s comment in
https://github.com/vllm-project/vllm-ascend/pull/1123.

### Does this PR introduce _any_ user-facing change?

The user will not be aware of `VLLM_ASCEND_ACL_OP_INIT_MODE`
(`ACL_OP_INIT_MODE`).

### How was this patch tested?

Test scripts:

```python
from vllm import LLM, SamplingParams

prompts = [
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

llm = LLM(
    model="Qwen/Qwen2.5-1.5B-Instruct",
    tensor_parallel_size=1,
    speculative_config={
        "method": "ngram",
        "num_speculative_tokens": 5,
        "prompt_lookup_max": 4,
    },
)
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

Results:

```
Adding requests: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 76.70it/s]
Processed prompts: 100%|███████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.33it/s, est. speed input: 6.64 toks/s, output: 21.26 toks/s]
Prompt: 'The future of AI is', Generated text: ' bright\n\n04/15/2020\n\nBy: James'
```

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-06-18 17:50:20 +08:00
db2f630aeb [bugfix] fix deepseek with mc2 (#1268)
### What this PR does / why we need it?

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

Signed-off-by: zzzzwwjj <1183291235@qq.com>
2025-06-18 00:58:38 +08:00
d7e19ed57a [BugFix] fix length of sin/cos cache in rope (#1266)
This PR fixes the bug that constructs shorter sin/cos cache than model's
max positional embedding.

Closes: https://github.com/vllm-project/vllm-ascend/issues/1038

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-06-17 23:14:25 +08:00
afc8edb046 [Bugfix]: Pass scaling args to mc2 (#1202)
Pass `expert_scale` and `expand_scale` args to the dispatch and combine
functions.

Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
2025-06-17 22:16:44 +08:00
f8029945c3 [Bugfix] Remove cuda related lines and add additional pip mirror (#1252)
### What this PR does / why we need it?
- For npu environment, we should use `PYTORCH_NPU_ALLOC_CONF ` rather
than `PYTORCH_CUDA_ALLOC_CONF`
- Add `PIP_EXTRA_INDEX_URL` to make nightly_benchmarks happy


---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-17 21:25:40 +08:00
23ca68d0c8 [refactor] Refactoring AscendFusedMoE (#1229)
### What this PR does / why we need it?
This PR is used for resolved [issue
1147](https://github.com/vllm-project/vllm-ascend/issues/1147)
1. Move fused_moe code into one file `fused_moe.py`.
2. Integrate branch conditions into function `get_fused_moe_state`.

### Does this PR introduce _any_ user-facing change?
1. This PR has removed the env `VLLM_ENABLE_MC2`, because I think this
env is useless, we can make judgments based on the current scenario
without this env, it will only increase complexity.
2. This PR has removed the env `USING_LCCL_COM`, because this env has
already expired.
3. `additional_config.expert_tensor_parallel_size` has already expired,
and now we also use parameter `enable_expert_parallel`, consistent with
the vLLM.

### How was this patch tested?

Signed-off-by: zzzzwwjj <1183291235@qq.com>
2025-06-17 17:49:03 +08:00
05dec7eda9 [Doc] Refactor and init user story page (#1224)
### What this PR does / why we need it?
This PR refactor the user stories page:
- Move it to community
- Add initial info of LLaMA-Factory, Huggingface/trl, MindIE Turbo,
GPUStack, verl
- Add a new page for LLaMA-Factory

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Preview locally

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-17 09:36:35 +08:00
9d3cbc0953 [Doctest] add installation doctest (#1179)
### What this PR does / why we need it?
Install doctest

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Related: https://github.com/vllm-project/vllm-ascend/pull/983

Co-authored-by: wangli <wangli858794774@gmail.com>

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: wangli <wangli858794774@gmail.com>
2025-06-17 08:52:26 +08:00
96fa7ff63b [DP][V1] Fix rank set in DP scenario & Bump torch-npu version to 2.5.1.post1.dev20250528 (#1235)
### What this PR does / why we need it?
1. Fix rank set in the DP scenario. The new POC version of torch-npu supports
setting `ASCEND_RT_VISIBLE_DEVICES` dynamically, thus we could use the
rank set in `DPEngineCoreProc` directly instead of calculating local
rank across dp by hand in the patched `_init_data_parallel`

Closes: https://github.com/vllm-project/vllm-ascend/issues/1170

2. Bump torch-npu version to 2.5.1.post1.dev20250528

Closes: https://github.com/vllm-project/vllm-ascend/pull/1242
Closes: https://github.com/vllm-project/vllm-ascend/issues/1232


### How was this patch tested?
CI passed with new added test.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Icey <1790571317@qq.com>
Co-authored-by: Icey <1790571317@qq.com>
2025-06-16 23:09:53 +08:00
f5404dc650 Fix the device error when using ray as vllm-acend backend (#884)
1. Remove RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES
2. Add lazy init for vllm_ascend_C

Signed-off-by: zhuo97 <1103045176@qq.com>
2025-06-16 21:03:16 +08:00
69b817ed65 [CI] Add unit test framework (#1201)
This PR added the unit test framework to enable UT for vLLM Ascend. Unit
tests run on CPU machines. They run once the lint check passes, the
same as the e2e tests.

For unit tests, this PR created a new folder called `ut` under the `tests`
module. All test files in `ut` should mirror the code layout in
`vllm-ascend`. File names should start with the `test_` prefix. For
example, in this PR, `test_ascend_config.py` is added to test
`ascend_config.py`.

A new file `worker/test_worker_v1.py` is also added as a placeholder.
This file should be the unit test for `vllm-ascend/worker/worker_v1.py`.

Additionally, a new `fake_weight` folder is added; it contains the
config.json from `facebook/opt-125m`, so that the tests will not always
visit Hugging Face.

TODO:
We should add all the unit test files one by one in the future.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-16 18:32:28 +08:00
966557a2a3 [Build] Speedup image build (#1216)
### What this PR does / why we need it?
1. Rename workflow name to show OS info
2. Speedup image build:
- PR: only arm64 build on openEuler arm64, only amd64 build on Ubuntu
amd64
- Push/Tag: still keep origin logic use qemu on amd64

This PR actually drops the e2e image build per PR, but I think it's fine
considering it's stable enough; if we still meet some problem we can revert
this PR

43-44mins ---> about 8-10 mins

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-16 09:02:53 +08:00
4ce860a2be [CI] Make e2e test to be preemptible and simple (#1217)
### What this PR does / why we need it?
This PR makes the e2e test simple. Even though it brings some repeated code between
single card and multicard, we will no longer struggle with
max-parallel, matrix and concurrency:
1. This PR makes the e2e test preemptible and simple:
- lint ---> e2e (2 parallel) ---> e2e multi-card (1 parallel)
- Any new push to a PR cancels the previous job, whatever the job
is: lint / e2e / multi-cards
2. Use ModelScope rather than hf-mirror
3. Resolve errors like `Canceling since a higher priority waiting
request for pr-XXXX-limit-npu-4 exists`

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed
- lint ---> e2e (2 parallel) ---> e2e multi-card (1 parallel)
- e2e test will canceled by update patch

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-15 22:07:43 +08:00
4270682383 Waiting for BMM NZ support(Improve TPOP 2ms performance) (#1131)
### What this PR does / why we need it?
W_UV/W_UK_T cannot be converted to nz, because this position will be
fused into transposebatchmatmul, which does not support nz. The weights
are actually converted back to nd in each run.

### Does this PR introduce _any_ user-facing change?
Use #1098 as the baseline: p90 TPOT improves from 90.79ms to 88.58ms, a ~2ms TPOT improvement

### How was this patch tested?
use #1101

---------

Signed-off-by: ttanzhiqiang <389825161@qq.com>
2025-06-15 19:57:02 +08:00
0d2074a1ec [Doc] fix VLLM_USE_V1 value in graph mode docs (#1226)
os.environ["VLLM_USE_V1"] must be assigned with str, not other type.


![image](https://github.com/user-attachments/assets/9d337ae5-00e5-4179-832e-c6c917dd5798)

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-06-15 15:41:11 +08:00
ab5d110fcc vllm-ascend support chunked prefill (#1172)
### What this PR does / why we need it?
vllm-ascend supports chunked prefill for MLA


---------

Signed-off-by: fems14 <1804143737@qq.com>
2025-06-14 22:31:16 +08:00
a3b5af8307 [CI/UT][Graph] Add ut for torchair graph mode (#1103)
### What this PR does / why we need it?
Add ut for torchair graph mode on DeepSeekV3

### How was this patch tested?
CI passed with new added test.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-06-14 16:59:00 +08:00
94a52cf577 Add ShouJian Zheng (@jianzs) as vLLM Ascend maintainer (#1203)
### What this PR does / why we need it?

Add @jianzs as vLLM Ascend maintainer

@jianzs
----
I would like to nominate Shoujian Zheng (@jianzs
<https://github.com/jianzs>) as a maintainer, starting with my +1.

- He focuses on code quality and good design, with solid reviews in the P/D
disaggregation and DeepSeek improvement areas: about 30+ high quality reviews, such
as #issuecomment-2811764833, #discussion_r2069927605 and
#pullrequestreview-2820996674. This is the most important reason why I nominated
him, because helping community developers complete PRs with high quality and
continuously ensuring the quality of the codebase is one of the important
responsibilities of a maintainer. We believe he is a great addition.
- Shoujian's main expertise is distributed inference. He has a lot of production
experience with AI infra. He has very good habits, explains all changes in great
detail (#issue-3023082580) and shares results openly
(#issuecomment-2853140443). High quality PRs: #706, #774, #852.
- Community involvement: actively involved in community discussion, he is
collaborative and helps users solve problems, involved in 30+ PRs and issues,
such as #issuecomment-2911934292 and #issuecomment-2833523571.

Reference:
[1] https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html
[2] https://vllm-ascend.readthedocs.io/en/latest/community/governance.html

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-13 18:25:50 +08:00
47b507b180 [CI] Recover ut for ascend scheduler only in ci of v1. (#1180)
The last PR [#943 ](https://github.com/vllm-project/vllm-ascend/pull/943)
wrongly enabled the AscendScheduler UT in the V0 CI; this PR fixes the problem
and only runs that UT in the V1 CI.

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-06-13 07:51:23 +08:00
e72f94e38f Support multistream of MLA vector operations (#1135)
### What this PR does / why we need it?
Move all vector operations to a secondary stream, with the expected
overlapping being:
```
              | q_rmsnorm |                  | kv_norm_rope_cache |       | q_rope |
| matmul W_DQ | matmul W_DKV | index | index |    matmul W_UQ     | split | matmul W_KV_T |
```

Currently, the `IndexByTensor` operators introduced by computation of
`cos` and `sin` can't be offloaded to the secondary stream due to a
known bug of graph fusion optimization pass. So we instead keep it in
the main stream, only requires it be computed before `matmul W_UQ` to
avoid hindering later overlapping. The problem may be solved by later
optimization (#993), which hoists the computation of `cos` and `sin` up
to the first layer.

### Does this PR introduce _any_ user-facing change?
Controlled by `torchair_graph_config.enable_multistream_mla`, defaulted
to False.
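A minimal sketch of how this might be enabled (the launch form and model path are placeholders borrowed from other entries in this log; only the flag name comes from this PR):

```bash
# Hypothetical launch; only enable_multistream_mla is taken from this PR
python -m vllm.entrypoints.openai.api_server \
    --model=/path/to/DeepSeek-R1-W8A8 \
    --additional-config '{"torchair_graph_config": {"enabled": true, "enable_multistream_mla": true}}'
```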

### How was this patch tested?
Tested on 1x16 910 node, with tailored 2 layer DSKv2.

Signed-off-by: sdmyzlp <lrwei2@petalmail.com>
2025-06-12 21:42:09 +08:00
55c0e68883 [Doc] Add Referer header for CANN package download url. (#1192)
### What this PR does / why we need it?
fix the CANN download url

### Does this PR introduce _any_ user-facing change?
no, do not have any user-facing change

### How was this patch tested?
run the **wget** command and cann package is rightly downloaded.

---------

Signed-off-by: wan_danfeng <wonderful199082@126.com>
2025-06-12 21:22:23 +08:00
c6e2a5fb40 [fix] fix bug in 1p1d disaggregated_prefill example (#1184)
### What this PR does / why we need it?
fix  bug in 1p1d  disaggregated_prefill  example

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Tested with python find_device_ips.py and run disaggregated_prefill
example


Signed-off-by: wangyanhui-cmss <wangyanhui_yewu@cmss.chinamobile.com>
2025-06-12 19:40:58 +08:00
37f4469a03 [CI][Benchmark] Add qwen2.5-7b test (#1104)
### What this PR does / why we need it?
- Add the qwen2.5-7b performance benchmark; this is a sub-PR of #1099 for the
v1 test and needs more verification
- Fix get commit time after checkout

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-12 10:47:30 +08:00
dd207cb261 [CI][Benchmark] Add new model and v1 test to perf benchmarks (#1099)
### What this PR does / why we need it?
- Add qwen2.5-7b-instruct test
- Add v1 test
---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-12 10:46:41 +08:00
2498d297ae add custom ascendc kernel vocabparallelembedding (#796)
This PR adds custom AscendC kernel vocabparallelembedding support in
vllm-ascend; the related CMakeLists and setuptools changes are also added in this PR.

pytest -s benchmarks/ops/ben_vocabparallelembedding.py
pytest -s tests/ops/test_vocabparallelembedding.py

---------

Signed-off-by: ttanzhiqiang <389825161@qq.com>
2025-06-12 10:44:33 +08:00
3393d53b36 [Scheduler][MTP] Add support for speculative decoding in AscendScheduler. (#943)
This PR adds support for speculative decoding in AscendScheduler.
It also includes part of the support for disaggregated prefill; full support
will be merged in a follow-up PR.

---------

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-06-11 20:55:44 +08:00
4f5964420e [CI] Upgrade vllm to 0.9.1 (#1165)
1. upgrade vllm to 0.9.1; 0.9.0 is no longer supported on the main branch.
Keep the doc at 0.9.0 until we publish the first 0.9.1 release.
2. disable V0 test for PR
3. move actionlint check to lint job

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-11 16:33:11 +08:00
e46dc142bf Enable kvcache_nz for the decode process in torchair graph mode (#1098)
### What this PR does / why we need it?
Enable kvcache_nz for the decode process in torchair graph mode, which
reduces the time consumed by FA in long sequences.

### Does this PR introduce _any_ user-facing change?
To enable kvcache_nz, set
`additional_config.torchair_graph_config.enable_kv_nz=True`.
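A minimal sketch (model path and other flags are placeholders; the option is passed through `--additional-config`, as in other entries in this log):

```bash
# Hypothetical launch command; only the enable_kv_nz flag is taken from this PR
python -m vllm.entrypoints.openai.api_server \
    --model=/path/to/DeepSeek-R1-W8A8 \
    --additional-config '{"torchair_graph_config": {"enabled": true, "enable_kv_nz": true}}'
```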

### How was this patch tested?
1. Tested on the deepseek model:
with batchsize 64 and seq_len 1k+3k, the total FA time for 61 layers improves from
20.80ms -> 19.76ms
2. operator precision test:

[aclnnFusedInferAttentionScoreV3_result.csv](https://github.com/user-attachments/files/20664138/aclnnFusedInferAttentionScoreV3_result.csv)
3. tpot test from @ttanzhiqiang, and curl one result is normal

https://github.com/vllm-project/vllm-ascend/pull/1098#issuecomment-2948542159

https://github.com/vllm-project/vllm-ascend/pull/1098#issuecomment-2954496588

---------

Signed-off-by: chenwaner <861645847@qq.com>
2025-06-11 14:09:28 +08:00
4153a5091b [Doc] Fix the config parameter name "enable" in graph_mode.md. (#1159)
Fix the doc typo in graph_mode.md

Signed-off-by: yzim <43207690+yzim@users.noreply.github.com>
2025-06-11 11:03:37 +08:00
980cd81466 etp best a2 (#1101)
### What this PR does / why we need it?
Best performance for single-machine, 16-card DeepSeek-R1 with attention
(tp8/dp2) and MoE (ETP).

Relies on:
- vllm-ascend commit id: da9acfca6053352730fce75fb772e214755d0341
- vllm commit id: b124e1085b1bf977e3dac96d99ffd9d8ddfdb6cc
- https://github.com/vllm-project/vllm-ascend/pull/910
- [Reduce _npu_flash_attention mask to 128x128 for memory savings]
  https://github.com/vllm-project/vllm-ascend/pull/1100
- [Reduce memory usage by splitting tokens in fused_experts]


---------

Signed-off-by: ttanzhiqiang <389825161@qq.com>
2025-06-11 10:40:50 +08:00
860a5ef7fd provide an e2e guide for execute duration profiling (#1113)
### What this PR does / why we need it?
provide an e2e guide for execute duration profiling


Signed-off-by: depeng1994 <depengzhang@foxmail.com>
2025-06-11 10:02:11 +08:00
7bdc606677 Support multistream of shared experts in FusedMoE (#997)
Contains on #1111 for completeness.

### What this PR does / why we need it?
Implement multi-stream parallelism for MoE layers with shared experts,
where computation of shared experts will be overlapped with expert token
dispatch and combine. Also, when multi-stream is enabled, weights of
shared experts will be force to replicate across all cards, regardless
of any tensor parallelism configurations, to avoid AllReduce operations.

With the expected overlaping being:
```
| shared gate_up | shared act |              | shared down |
|    dispatch    | routed gate_up, act, down |   combine   |
```


### Does this PR introduce _any_ user-facing change?
No.


### How was this patch tested?
Tested on 1x16 910 node, with tailored 2 layer DSKv2.

---------

Signed-off-by: sdmyzlp <lrwei2@petalmail.com>
2025-06-11 09:18:38 +08:00
04abfd8721 [CI] Skip test_v1_spec_decode.py::test_ngram_correctness to make longterm CI pass (#1163)
[CI] Skip test_v1_spec_decode.py::test_ngram_correctness to make
longterm CI pass

Related: https://github.com/vllm-project/vllm-ascend/issues/1162

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-06-11 07:31:13 +08:00
8b48daaa44 [CI] rename Qwen2.5-0.5B-Instruct-W8A8 model (#1145)
1. rename vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8-new to
vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-06-11 06:18:32 +08:00
8dd686dfa2 [MLA][Graph] Improve assertion on Graph mode with MLA (#933)
### What this PR does / why we need it?
Improve assertion on Graph mode with MLA.

When running deepseek with graph mode, the fused MLA op only supports
`numHeads / numKvHeads ∈ {32, 64, 128}`, thus we improve the assertion
info here to avoid confusing users.

### Does this PR introduce _any_ user-facing change?
Adjusting tp size is required when running deepseek-v3/r1 with graph
mode. deepseek-v2-lite is not supported in graph mode.

### How was this patch tested?
Test locally as the CI machine could not run V3 due to the HBM limits.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-06-10 22:26:53 +08:00
291c216898 fix torchair execute issue on padding data, and mtp padding logic (#1160)
### What this PR does / why we need it?
The former PR https://github.com/vllm-project/vllm-ascend/pull/736
selects the valid tokens inside `input_ids` and `position_ids`, which breaks
the necessary padding required by torchair. In this PR, we apply the
padding logic after the multimodal part.


Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-06-10 22:20:40 +08:00
95414bae70 [CI] Run e2e after pre check pass (#1132)
Make sure the lint test passes before starting the e2e test to save compute
resources.

Updated the patch doc to make sure the CI works as expected.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-10 17:18:09 +08:00
b75cb788dd [Bugfix] add compilation/__init__.py to fix import error (#1152)
1. Add `__init__.py` for vllm_ascend/compilation to make sure it's a
python module
2. Fix model runner bug to keep the same with vllm
3. Add release note for 0.9.0rc2

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-10 17:14:25 +08:00
e68e81f2ce [CI] Make accuracy CI and report work (#1078)
### What this PR does / why we need it?
Make accuracy CI and report work

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Manually reviewed

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-06-10 14:35:44 +08:00
71aee6f97d Update 0.9.0rc1 contributors info (#1148)
### What this PR does / why we need it?
Update 0.9.0rc1 contributors info

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-10 13:29:09 +08:00
5cd5d64242 [CI] remove old quantization model (#1003)
Remove old quantization models; new models will be added to the test cases
later.

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-06-10 10:07:36 +08:00
706de02317 [fix] fix compatibility for non-EPLB scenarios (#1142)
### What this PR does / why we need it?
Fix incompatibility problem for non-EPLB scenarios in #1116 

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Tested with online serving and e2e CI.

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-06-10 08:39:24 +08:00
571f88f85e [Doc] Update 0.9.0rc1 release date (#1139)
1. Update 0.9.0rc1 release date
2. Update feature and model support list
3. Add DP known issue to  release note

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-09 22:51:02 +08:00
cd2f14a1b3 [MTP][V1] Adapt mtp with graph mode in v1. (#1023)
Adapts deepseek mtp with torch air graph mode in v1.

---------

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-06-09 22:21:42 +08:00
5ac4872f5e [Doc] Add 0.9.0rc1 release note (#1106)
Add the release note for v0.9.0rc1

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-09 19:39:21 +08:00
6b853f15fe Add static EPLB (#1116)
### What this PR does / why we need it?
Add EPLB expert map import capabilities.
### Does this PR introduce _any_ user-facing change?
When importing the EPLB expert map, you need to pass the expert map file via
the vLLM additional_config argument.
### How was this patch tested?
1. You need to collect expert hotness and generate an expert placement
file based on the hotness and the EPLB algorithm, or you can directly
use an existing expert placement table.
2. When launching vLLM, enable EPLB and pass the configuration via the
command-line argument (see the sketch below):
      --additional-config '{"expert_map_path": "/xxx/xxx/xx.json"}'
Co-authored-by: songshanhu07 <1763685535@qq.com>

---------

Signed-off-by: songshanhu07 <1763685535@qq.com>
Signed-off-by: Yuxiao-Xu <664988918@qq.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: songshanhu07 <1763685535@qq.com>
Co-authored-by: Xu Yuxiao <xuyuxiao2@huawei.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-09 19:28:11 +08:00
cb341c7bcd [CI] Fix PD job (#1129)
Fix e2e test for Pd job

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-09 16:34:41 +08:00
e63fc6f280 Init vLLM Ascend maintainers info (#1124)
### What this PR does / why we need it?
As plus of https://github.com/vllm-project/vllm-ascend/pull/1070, this
patch adds `Nominating and Removing Maintainers` section (reference some
design from [PyTorch
Governance](https://docs.pytorch.org/docs/stable/community/governance.html))

Below are key info about existing maintainers:

## @wangxiyuan: 
- Super active, high quality reviewer: [450+ PRs
reviewed](https://github.com/vllm-project/vllm-ascend/pulls?q=commenter%3Awangxiyuan).
- One of the top contributors, he also actively contributes [50+ commits
](https://github.com/vllm-project/vllm-ascend/pulls?q=is%3Apr+is%3Aclosed+review%3Aapproved+author%3Awangxiyuan+)
with good quality, and he dares to [refactor the
code](https://github.com/vllm-project/vllm-ascend/pulls?q=is%3Apr+author%3Awangxiyuan+is%3Aclosed+refactor),
which also shows his deep understanding of vLLM and vLLM Ascend.
- He leads the [[RFC]: Hardware
pluggable](https://github.com/vllm-project/vllm/issues/11162) feature,
which made the vllm-ascend project possible.
- Active community involvement across the WeChat group, Slack and GitHub issues.
Involved in [150+
issues](https://github.com/vllm-project/vllm-ascend/issues?q=is%3Aissue%20state%3Aopen%20commenter%3Awangxiyuan)
and helps users. He is also a speaker at the vLLM Beijing meetup, helping more
users understand vLLM Ascend.
- Release manager of
[v0.7.1rc1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.1rc1),
[v0.7.3rc1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3rc1),
[v0.7.3rc2](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3rc2),
[v0.8.4rc1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.8.4rc1),
[v0.7.3.post1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3.post1).

## @Yikun: 
- Highly active code reviewer: [190+ PRs
reviewed](https://github.com/vllm-project/vllm-ascend/pulls?q=commenter%3AYikun),
especially helping new developers get onboarded.
- One of the top contributors with sustained contributions: [50+
commits](https://github.com/vllm-project/vllm-ascend/pulls?q=is%3Apr+is%3Aclosed+review%3Aapproved+author%3AYikun+)
since the first day of vLLM Ascend.
- High quality contributions around the vLLM compatibility guarantee; he
also maintains the [CI
](https://github.com/vllm-project/vllm-ascend/pull/1040) and [test
framework](https://github.com/vllm-project/vllm-ascend/pull/730).
- Active community involvement across the local group and GitHub issues: involved in
[170+
issues](https://github.com/vllm-project/vllm-ascend/issues?q=is%3Aissue%20state%3Aopen%20commenter%3AYikun).
He is also the main organizer of the vLLM Beijing Meetup and a speaker at [PyTorch
Day China
2025](https://pytorchdaychina2025.sched.com/event/2401V/poster-session)
to help vLLM Ascend grow.
- Release manager of
[v0.8.4rc2](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.8.4rc2),
[v0.8.5rc1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.8.5rc1),
[v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3).

## @ganyi1996ppo 
- Highly active, high quality reviewer: [90+ PRs
reviewed](https://github.com/vllm-project/vllm-ascend/pulls?q=commenter%3Aganyi1996ppo).
He has a deep understanding of Ascend operators, can always find key
issues, understands the codebase deeply, and shows good code quality and
sound judgement.
- Major, high quality contributions: [10+
commits](https://github.com/vllm-project/vllm-ascend/pulls?q=is%3Apr+is%3Aclosed+review%3Aapproved+author%3Aganyi1996ppo).
- He is the main contributor of [Custom AscendC op
support](https://github.com/vllm-project/vllm-ascend/pull/371),
[Deepseekv3 performance
optimization](https://github.com/vllm-project/vllm-ascend/pull/598).
- Community involvement: involved in [11+ issues helping
users](https://github.com/vllm-project/vllm-ascend/issues?q=is%3Aissue%20state%3Aopen%20commenter%3Aganyi1996ppo),
and shared a [custom ops
topic](https://www.bilibili.com/video/BV1Z25az3EqS/?share_source=copy_web&vd_source=72ef9c665af5f2f1370abe26ce1f719f&t=1342)
at the vLLM Ascend weekly meeting.


### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Preview

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-09 16:32:58 +08:00
d2f87ed9cc [Patch] Remove spec_decode.metrics patch (#1016)
### What this PR does / why we need it?
Remove `spec_decode.metrics` patch as this has been resolved in
https://github.com/vllm-project/vllm/pull/16983 (include in vllm
`v0.9.0`).

Returns a CUDA event recording when the copy is complete **--after
modified-->** Returns a device event (NPU Event for vllm-ascend)
recording when the copy is complete.

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-06-09 15:05:11 +08:00
6003afa6d2 [BugFix] Fix data parallel (#940)
### What this PR does / why we need it?
With this PR, we can migrate to the native `data_parallel.py` in vllm
examples and remove the version in vllm-ascend.

At present, `ASCEND_RT_VISIBLE_DEVICES` introduces considerable
difficulties; therefore, we must employ a temporary workaround and
manually specify the device.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-06-09 14:08:18 +08:00
eec6068187 [Bugfix] Set ACL_OP_INIT_MODE env var default to 0 (#1123)
### What this PR does / why we need it?

Set `ACL_OP_INIT_MODE` env var default to `0`, since vllm-ascend may
have problems in some scenarios when setting it to `1`.

Plus, the guide https://github.com/vllm-project/vllm-ascend/issues/734
has also been updated.

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-06-09 14:07:37 +08:00
4976b48b98 [Build] Move numba/quart to requirements and update DS baseline and sync graph typo fix (#1121)
### What this PR does / why we need it?
1. The dependency was introduced by
https://github.com/vllm-project/vllm-ascend/pull/874
- Move numba/quart from requirements-dev to requirements
- Align pyproject.toml with requirements

2. This patch also fixes the deepseek accuracy baseline that
https://github.com/vllm-project/vllm-ascend/pull/1118 did not address.
According to https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite, the
gsm8k score is about `41.1`.

3. This also sync the vLLM upstream changes:
eaa2e51088

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed
vllm ascend test (basic workflow)
vllm longterm test (spec decode)

Closes: https://github.com/vllm-project/vllm-ascend/issues/1120

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-08 22:33:37 +08:00
f1543d5e0d [bugfix] fix deepseek accuracy (#1118)
### What this PR does / why we need it?
fix deepseek accuracy in the mixed-parallelism case.


Signed-off-by: zzzzwwjj <1183291235@qq.com>
2025-06-07 21:11:36 +08:00
c8742146d3 [CherryPick] Add unpadded Qwen2.5-VL for verl scenario (#1095)
Add unpadded Qwen2.5-VL for verl scenario.

When using vllm-ascend for verl scenario, set `USE_OPTIMIZED_QWEN2_5_VL`
(default `1`) to `0` to use unpadded Qwen2.5-VL to avoid errors.
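For illustration (the variable name and values come from the text above; exporting it from the shell is an assumption about how it is usually set):

```bash
# Use the unpadded Qwen2.5-VL path in verl scenarios (default is 1)
export USE_OPTIMIZED_QWEN2_5_VL=0
```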

This is cherry-picked from 0.7.3-dev

Signed-off-by: shen-shanshan <467638484@qq.com>
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: Shanshan Shen <467638484@qq.com>
2025-06-07 19:45:46 +08:00
b80a484864 Fix typo of VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE (#1112)
### What this PR does / why we need it?
Fix typo of VLLM_ASCEND_ENABLE_TOPK_OPTIMIZE

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

CI passed

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-06-07 19:45:33 +08:00
20dedba5d1 Add qwen2.5 vl multimodal feature for vllm-ascend v1 (#736)
### What this PR does / why we need it?

The current vllm-ascend does not support multimodal models in
vllm-ascend v1 yet. So I changed the `model_runner_v1.py` file, using the
MRoPE feature and so on, to support this. It is still not
perfect, since the Ascend operator does not support `window/full attn`
to reduce Memcpy operations, so it would run out of memory if the input
embedding is too large; therefore we can't use `self._profile_multimodal()` for
profiling, since it uses a big dummy input (i.e. images) as the multimodal
input.

Fixes: https://github.com/vllm-project/vllm-ascend/issues/514

### Does this PR introduce _any_ user-facing change?

No, this feature does not require any user-facing change.

### How was this patch tested?

I tested this offline on my 910B3 machine with my own fork, and it works
well.

---------

Signed-off-by: cty <ctynb@qq.com>
2025-06-07 16:53:19 +08:00
87ebaef4e4 [perf]: support dual-batch overlap(dbo) for deepseek (#941)
### What this PR does / why we need it?
Based on the design of dual-batch overlap proposed by Deepseek team and
also the implementation of fused moe in VLLM project, we implement the
multi-stream(also known as dual-batch) overlap for deepseek+mla on
Ascend NPU. We split the input batch of model into two microbatches and
then overlap the comp/comm ops in attention and moe layers using two
streams to improve the performance. Our approach can be easily extended
when adding dispatch/combine communications for moe layer.
Compared with the previously proposed
[draft](https://github.com/vllm-project/vllm-ascend/pull/842), we use
one stream for computation ops and the other for communication ops,
separately. In our opinion, it is beneficial for arranging the order of
executing different ops and thus avoiding the contention of
computation/communication resources.

ref: [overlap for
llama](https://github.com/vllm-project/vllm/pull/15787/files)
ref: [dbo in
sglang](https://github.com/sgl-project/sglang/pull/4068/files#diff-b4937569fc71f6ad215181b633b2f89c7183a2b4ac39e41fc22635599a9be7de)

### Does this PR introduce _any_ user-facing change?
Adding an env variable "VLLM_ENABLE_DBO". Users can enable dbo by
setting "VLLM_ASCEND_ENABLE_DBO=1"
See /examples/offline_dualbatch_overlap_npu.py for more info.
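A minimal sketch (assuming the example script path above is relative to the repository root):

```bash
# Enable dual-batch overlap, then run the example referenced above
export VLLM_ASCEND_ENABLE_DBO=1
python examples/offline_dualbatch_overlap_npu.py
```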

### How was this patch tested?

This patch can be tested with vllm-0.9.0 using its online service with
benchmark tests. We have decoupled the func of dbo from vllm and it
should be able to run without any modification to the code of vllm(some
modifications is better to implement in vllm though).



Any advice/discussion is welcome.

### Performance Benchmark

We have ran the benchmark_serving script of vllm to test the performance
after using dual-batch overlap.

`python -m vllm.entrypoints.openai.api_server \
 --model=DeepSeek-R1-W8A8 \
 --trust-remote-code \
 --distributed-executor-backend=mp \
 -tp=16 \
 --port 8006 \
 --max-num-seqs 390 \
 --max-model-len 32768 \
 --max-num-batched-tokens 65536 \
 --block-size 128 \
 --compilation_config 0 \
 --gpu-memory-utilization 0.90 \
 --disable-log-requests \
--additional-config
'{"expert_tensor_parallel_size":1,"enable_inter_dp_scheduling":true,"init_torchair_graph_batch_sizes":true,"trace_recompiles":true,"ascend_scheduler_config":{},"enable_graph_mode":false}'`

and run benchmark with the parameters of :
`--dataset-name random --random-input-len 4096 --random-output-len 1
--num-prompts 200 --max-concurrency 8 --request-rate 5
--metric-percentiles 90`

1. test with the version using allgather+allreduce in Ascend 910B (tp16
ep16 + deepseek r1 w8a8)

2. test with the version using alltoall: 

prefill qps: 0.90 -> 1.01
Mean TTFT:8226->7432ms

The overlap approach when using alltoall communication can be further
optimized by overlapping micro-batch1's moe comp with micro-batch2's
dispatch a2a comm

---------

Signed-off-by: zhuohuan <zxdu1997@gmail.com>
2025-06-07 16:46:58 +08:00
3640c60b0e Avoid unfused Transpose in DeepSeekV3 EP256 MoE layer (#1091)
### What this PR does / why we need it?

View optimization in torchair (defaulted to on for Transpose with any of
its axis being 1) prevents the weight Transpose to be fused with later
GroupedMatmul, which decrease the performance of MoE layer when expert
parallelism equals the total number of experts (e.g. EP256 for DSKv3).
Add an option to solve this problem by disabling the optimization.

### Does this PR introduce _any_ user-facing change?

Controlled by
`additional_config.torchair_graph_config.enable_view_optimize`,
defaulted to `True`.
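A minimal sketch of turning the optimization off (everything except the flag itself is a placeholder borrowed from other entries in this log):

```bash
# Hypothetical launch; only enable_view_optimize comes from this PR
python -m vllm.entrypoints.openai.api_server \
    --model=/path/to/DeepSeek-V3 \
    --additional-config '{"torchair_graph_config": {"enabled": true, "enable_view_optimize": false}}'
```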

### How was this patch tested?

Tested on 1x16 910 node, with tailored 2 layer DSKv2.

Signed-off-by: sdmyzlp <lrwei2@petalmail.com>
2025-06-07 14:28:20 +08:00
8d00775fce [SpecDecode][CI] Set default values to fix spec decode and fix multicard CI (#1109)
### What this PR does / why we need it?
- Set default values to fix spec decode
- To avoid oom, we need to run the test in a single process

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- CI passed, espcecially multicards CI
- For spec decode test, long term CI passed

Closes: https://github.com/vllm-project/vllm-ascend/pull/1105

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
Co-authored-by: mengwei805 <mengwei25@huawei.com>
2025-06-07 11:23:30 +08:00
e9ada685ec [CI]Moe alltoall communication optimization (#1067)
[CI]Moe alltoall communication optimization
The DeepSeek V3/R1 model has 256 routing experts. During parallel
inference, if the load of an EP rank is high, the overall communication
and computing time is slowed down, which becomes a weakness of parallel
inference because the load is unevenly distributed. However, the data
volume in the prefill phase is large, and the inter-card communication
time consumption/calculation time consumption and the data volume are
closely related to each other. Therefore, less non-linear precision loss
can be used to obtain a near-linear performance improvement.

During parallel inference, global synchronization occurs during
communication. As a result, the card with low load completes the
calculation first and waits for the card with the highest load to
complete the calculation. Therefore, if the load is unbalanced, the card
with high load slows down the overall time consumption. Significant
performance gains can be achieved by discarding a small number of
tokens, which is unacceptable in some precision-sensitive scenarios.
However, similar to quantification, it is a solution that uses an
acceptable precision loss in some scenarios for performance. In
addition, a trade-off between performance and precision can be achieved
by configuring a proportion of discarded tokens.

We performed the test on A3. The batch size is 8 (B), the prompt length is
3.5K tokens (S), and the parallel configuration is as follows: AttnDP=2,
AttnTP=8, MoeTP=1, and MoeEP=16. In this scenario, we got a 10%-15%
performance gain.

Plus, in the next version, we'll have an alltoallv MoE.

---------

Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>
Co-authored-by: weijinqian_v1 <weijinqian@huawei.com>
2025-06-07 10:15:56 +08:00
a2552e10e4 [Worker][V1] Support sleep mode for v1 (#1084)
### What this PR does / why we need it?
 Support sleep mode for v1

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-06 21:54:02 +08:00
0395ab30be [Doc] Add graph mode user doc (#1083)
Add graph mode user guide doc.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-06 21:14:34 +08:00
9a4eb94ca9 [Misc] Adjust the default profiler configuration (#1097)
### What this PR does / why we need it?
When profiling, it is often necessary to disable the call stack to
reduce profiling overhead, and adjust the profiler_level to level1 to
obtain more detailed operator and communication information.

Therefore, it is recommended to modify the default profiling
configuration.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
No

Signed-off-by: ApsarasX <apsarax@outlook.com>
2025-06-06 20:25:59 +08:00
5d0e9fd19a [Misc] Add ACL_OP_INIT_MODE env var and set default to 1 (#597)
### What this PR does / why we need it?
Fix the bug in torch 2.5.1 that raises a segmentation fault when
`pin_memory` is enabled while creating a tensor using `torch.tensor`.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-06-06 20:22:51 +08:00
11a7df4270 [ModelRunner] Support embedding inputs (#916)
### What this PR does / why we need it?
- Adds support for passing prompt_embeds to LLM.generate as
```python
llm.generate({"prompt_embeds": input_embeds}, sampling_params)
```
or
```python
llm.generate(
    [{"prompt_embeds": input_embeds} for input_embeds in inputs_embeds], sampling_params
)
```
- Add `prompt_embeds` to examples

### How was this patch tested?
CI passed with new added/existing test.
and I have tested with the example script in this PR, and the output
looks good:
```bash

[Single Inference Output]
------------------------------
The capital of France is Paris. Paris is the largest city in France and is
------------------------------
Adding requests: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 3966.87it/s]
Processed prompts: 100%|█████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  3.99it/s, est. speed input: 177.08 toks/s, output: 63.91 toks/s]

[Batch Inference Outputs]
------------------------------
Q1: Please tell me about the capital of France.
A1: The capital of France is Paris. It is located in the northern part of the

Q2: When is the day longest during the year?
A2: The day is longest during the year at the summer solstice. This typically occurs

Q3: Where is bigger, the moon or the sun?
A3: The sun is significantly bigger than the moon. 

The sun has a diameter of

------------------------------
```

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-06 20:21:13 +08:00
c7f1c59911 feat: support compile multiple batch graph (#1085)
### What this PR does / why we need it?

Support compiling multiple batch graphs with different code objects to avoid
cache invalidation.

### How was this patch tested?

```
export VLLM_ENABLE_MC2=0
export VLLM_USE_V1=1
export TASK_QUEUE_ENABLE=1

source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh

nohup python -m vllm.entrypoints.openai.api_server --model=/mnt/deepseek/DeepSeek-R1-W8A8-VLLM \
    --quantization ascend \
    --served-model-name auto \
    --trust-remote-code \
    --distributed-executor-backend=mp \
    --port 8006 \
    -tp=8 \
    -dp=2 \
    --no-enforce-eager \
    --max-num-seqs 24 \
    --max-model-len 32768 \
    --max-num-batched-tokens 32768 \
    --block-size 128 \
    --no-enable-prefix-caching \
    --additional-config '{"torchair_graph_config": {"enabled": true,"use_cached_graph": true,"graph_batch_sizes": [8,16,24]},"ascend_scheduler_config": {"enabled":true,"chunked_prefill_enabled":false},"expert_tensor_parallel_size":16}' \
    --gpu-memory-utilization 0.95 &> run.log &
disown
```

Signed-off-by: boying <897013703@qq.com>
2025-06-06 20:17:51 +08:00
c46632439a [Bugfix][DP] Add with_prefill_across_dp to AscendMetadata to fix dp (#1094)
### What this PR does / why we need it?
Add `with_prefill_across_dp` to AscendMetadata to fix dp

This pr fixes the bug introduced by #1012, which add an arg
`with_prefill_across_dp` when dp_size > 1.

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-06-06 19:20:33 +08:00
0b12c2acf7 [Kernel] Remove cumsum in groupedmatmul (#987)
### What this PR does / why we need it?
Remove the cumsum operator in MoE to improve performance.

### How was this patch tested?
it should be tested on a case with mc2 operator and graph mode enabled

Signed-off-by: zhky <hahazhky@163.com>
Co-authored-by: 洪炜杰 <hongweijie1@huawei.com>
2025-06-06 19:17:27 +08:00
dab19d5dca [BugFix] Fix ascend config check (#1092)
Fix the ascend config check logic:
1. refactor check_ascend_config to make it clear:
    1. torchair graph should not work with enforce_eager=True
    2. aclgraph should not work with torchair graph
2. add refresh config for the RLHF case
3. fix a typo in the model runner
4. change the expert_tensor_parallel_size default to 0 to keep the same as
before

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-06 18:54:37 +08:00
973f993a13 [Misc] fix initialize_kv_cache (#1102)
The KV cache manager has been changed by
f8a1a2d108

This PR adapts the change in vllm-ascend to make CI happy.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-06 16:46:23 +08:00
c94afd79ce [Doc] Update the description for env (#1079)
Add descriptions for the env vars to make them clearer for users

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-06 09:48:43 +08:00
6b094a2bd4 [ModelRunner]Add profile execute duration observation (#1013)
### What this PR does / why we need it?
We need to **observe the time consumed in each stage of inference
(including pre-processing, model forward, etc.), without any performance
loss**.
Therefore, we use the event timestamp mechanism of the NPU to mark any
stage during the execution of the NPU device (this marking operation is
executed asynchronously, with no performance loss).
Additionally, we provide a blocking synchronization API
`pop_captured_sync` to be called at an appropriate time, to print the
time consumed in all observed stages.

**The model_runner_v1.py file only changed 5 lines, all of which were
`ProfileExecuteDuration()` calls; nothing else was changed, although
more changes are shown due to the alignment issue.**

### Does this PR introduce _any_ user-facing change?
Use the env `VLLM_MODEL_EXECUTE_TIME_OBSERVE` to enable this feature
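For illustration (the value `1` is an assumption; the text above only names the variable):

```bash
# Assumption: "1" enables the observation; pop_captured_sync is then called as described above
export VLLM_MODEL_EXECUTE_TIME_OBSERVE=1
```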

### How was this patch tested?

Tested in deepseek model,Print like this:
```
5691:(IntegratedWorker pid=1502285) Profile execute duration [Decode]: [post process]:14.17ms [prepare input and forward]:9.57ms [forward]:4.14ms
5695:(IntegratedWorker pid=1502285) Profile execute duration [Decode]: [post process]:14.29ms [prepare input and forward]:10.19ms [forward]:4.14ms
5697:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.81ms [prepare input and forward]:10.29ms [forward]:3.99ms
5701:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.10ms [prepare input and forward]:10.62ms [forward]:4.33ms
5705:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.65ms [prepare input and forward]:9.58ms [forward]:4.20ms
5709:(IntegratedWorker pid=1502343) Profile execute duration [Decode]: [post process]:14.43ms [prepare input and forward]:9.88ms [forward]:4.20ms
5711:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.89ms [prepare input and forward]:10.49ms [forward]:4.19ms
5715:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.14ms [prepare input and forward]:11.21ms [forward]:4.18ms
5719:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.71ms [prepare input and forward]:10.15ms [forward]:4.42ms
5723:(IntegratedWorker pid=1502401) Profile execute duration [Decode]: [post process]:14.62ms [prepare input and forward]:10.31ms [forward]:4.25ms
5725:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.12ms [prepare input and forward]:10.33ms [forward]:4.24ms
5729:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.58ms [prepare input and forward]:10.85ms [forward]:4.32ms
5733:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:14.32ms [prepare input and forward]:9.79ms [forward]:4.28ms
5737:(IntegratedWorker pid=1502462) Profile execute duration [Decode]: [post process]:15.06ms [prepare input and forward]:9.89ms [forward]:4.32ms
5739:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.62ms [prepare input and forward]:10.48ms [forward]:4.27ms
5743:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.60ms [prepare input and forward]:10.71ms [forward]:4.61ms
5747:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:14.21ms [prepare input and forward]:10.10ms [forward]:4.52ms
5751:(IntegratedWorker pid=1502524) Profile execute duration [Decode]: [post process]:15.03ms [prepare input and forward]:10.00ms [forward]:4.42ms

```

---------

Signed-off-by: depeng1994 <depengzhang@foxmail.com>
2025-06-06 09:29:34 +08:00
78431b3469 [perf]Support MOE Multi-stream in Deepseek (#947)
### What this PR does / why we need it?
Support MOE inner Multi-stream for Deepseek. 
This feature requires graph mode with mc2 enabled.

---------

Signed-off-by: David9857 <985700846@qq.com>
2025-06-05 23:39:38 +08:00
908a851a77 optimize the function of computing topk and topp in sampler. (#970)
### What this PR does / why we need it?
Optimize the performance of calculation logic in sampler and deepseekv2.

### Does this PR introduce _any_ user-facing change?
Added VLLM_ENABLE_TOPK_OPTIMZE config in sampler

### How was this patch tested?
pytest test_sampler.py

Signed-off-by: wangxiaoxin (A) <wangxiaoxin7@huawei.com>
Co-authored-by: wangxiaoxin (A) <wangxiaoxin7@huawei.com>
Co-authored-by: ZhengWG <zwg0606@gmail.com>
2025-06-05 16:42:18 +08:00
e1ab6d318e [Misc] Refactor additional_config (#1029)
More and more config options are being added to additional_config. This PR
provides a new AscendConfig to manage these config options in an easier
way, making the code cleaner and more readable.

This PR also added the `additional_config` doc for users.

Added test_ascend_config.py to make sure the new AscendConfig works
as expected.

TODO: Add e2e test with torchair and deepseek once the CI resource is
available.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-05 16:28:01 +08:00
7737aaa40f [CI] Add accuracy test for Qwen2.5-VL-3B-Instruct (#766)
### What this PR does / why we need it?
Add accuracy test for Qwen2.5-VL-3B-Instruct


Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-06-05 15:09:20 +08:00
b4cb0eecb6 [CI] Hotfix on benchmark results path (#1076)
### What this PR does / why we need it?
Fix benchmark results path

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-05 12:53:46 +08:00
fd136e6762 Add vLLM Ascend project governance docs (#1070)
### What this PR does / why we need it?
Add vLLM Ascend project governance and first contributors docs

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Preview

Closes: https://github.com/vllm-project/vllm-ascend/issues/828
Closes: https://github.com/vllm-project/vllm-ascend/issues/929

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-05 11:56:51 +08:00
31dd471574 [CI] Add workflow_dispatch and use main benchmarks directly (#1071)
### What this PR does / why we need it?

This is for the benchmark iteration, which changes the benchmark
scripts while checking out each commit. So we need to ensure the benchmark
scripts are always available.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Manaully

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-05 10:29:30 +08:00
9e855b70be Adjust concurrency group for each npu workflow (#1068)
### What this PR does / why we need it?
Adjust the concurrency group for each NPU workflow:
- PD and benchmarks share static-08-01, so only one job can run on it at a
time
- for other jobs, one PR/schedule should have only 1 job running

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-05 09:17:04 +08:00
afc4c0cd03 [Bugfix] Fix deepseek precision issue and add acc ci for it (#905)
### What this PR does / why we need it?
Fix the deepseek precision issue on V0 and add an accuracy CI for it
Fixes https://github.com/vllm-project/vllm-ascend/issues/1062
### How was this patch tested?
CI passed with new added test.

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-06-04 20:26:44 +08:00
da9acfca60 feat: support data parallel for deepseek (#1012)
### What this PR does / why we need it?
feat: support data parallel for deepseek

### Does this PR introduce _any_ user-facing change?
Yes, support dp for deepseek

### How was this patch tested?

```
export VLLM_ENABLE_MC2=0
export VLLM_USE_V1=1
export TASK_QUEUE_ENABLE=1

source /usr/local/Ascend/ascend-toolkit/set_env.sh
source /usr/local/Ascend/nnal/atb/set_env.sh

nohup python -m vllm.entrypoints.openai.api_server
--model=/path/to/DeepSeek-R1-W8A8 \
    --quantization ascend \
    --served-model-name auto \
    --trust-remote-code \
    --distributed-executor-backend=mp \
    --port 8006 \
    -tp=8 \
    -dp=2 \
    --max-num-seqs 24 \
    --max-model-len 4096 \
    --max-num-batched-tokens 4096 \
    --block-size 128 \
    -O 0 \
    --no-enable-prefix-caching \
--additional-config
'{"torchair_graph_batch_sizes":[24],"expert_tensor_parallel_size":16,"ascend_scheduler_config":{},"enable_graph_mode":true}'
\
    --gpu-memory-utilization 0.95 &> run.log &
disown
```

Signed-off-by: boying <897013703@qq.com>
2025-06-04 18:31:41 +08:00
517811449e [CI] Re-enable sleep mode test and skip failure breaking CI (#990)
### What this PR does / why we need it?

- Re-enable sleep mode test
- Fix nightly performance benchmark workflow
- Fix model-runner-v1 bug for upstream
[change](https://github.com/vllm-project/vllm/pull/18654)
---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-04 16:24:16 +08:00
eb2701e0b2 [CI] Remove workflow_dispatch and change schedule time (#1056)
### What this PR does / why we need it?

- Remove workflow_dispatch 
-  Change schedule time to 2:00 UTC+8
### Does this PR introduce _any_ user-facing change?


### How was this patch tested?
CI passed

---------

Signed-off-by: wangli <858794774@qq.com>
Co-authored-by: wangli <858794774@qq.com>
2025-06-04 01:19:20 +08:00
06fb5a8d81 [CI][Bugfix] Upgrade escli to v0.2.1 to fix benchmark deps (#1055)
### What this PR does / why we need it?

Update escli-tool to v0.2.1 to fix a deps bug

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: wangli <858794774@qq.com>
2025-06-04 01:03:56 +08:00
76dacf3fa0 [CI][Benchmark] Optimize performance benchmark workflow (#1039)
### What this PR does / why we need it?

This is a post patch of #1014, for some convenience optimization
- Set cached dataset path for speed
- Use pypi to install escli-tool
- Add benchmark results convert script to have a developer-friendly
result
- Patch the `benchmark_dataset.py` to disable streaming load for
internet
- Add more trigger ways for different purposes: `pr` for debug,
`schedule` for daily test, `dispatch` and `pr-labeled` for manual testing
of a single (current) commit
- Disable latency test for `qwen-2.5-vl`, (This script does not support
multi-modal yet)

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-06-03 23:38:34 +08:00
543380ceae [CI] Add merge conflict label job (#1050)
Add a bot to label merge conflicts; it helps developers and maintainers
keep code review and updates clear.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-03 17:32:31 +08:00
f24375f318 Enable accuracy test for PR labeled with "*accuracy-test" (#1040)
### What this PR does / why we need it?
This PR enables accuracy tests for PRs labeled with "*accuracy-test" and
workflow_dispatch.

Only one model test runs for each test type to reduce execution time.

- The dense test costs about `25mins` to complete (gsm8k 7mins, ~mmlu
3h24mins,~ cEval 18mins)
- The vl test costs about `40mins` to complete


In the future, we might consider enabling all test jobs as a nightly scheduled
job.

Below are the main changes:
- the dense/vl accuracy test will be triggered by labeling
`accuracy-test` and `ready-for-test`
- the dense accuracy test will be triggered by labeling
`dense-accuracy-test` and `ready-for-test`
- the vl accuracy test will be triggered by labeling `vl-accuracy-test`
and `ready-for-test`
- accuracy test will also be triggered by workflow_dispatch
- Support V1 and V0 for qwen and V0 for VL

For PR tests we also generate a summary in the test summary.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- CI passed with accuracy-test label
- Preview:
https://github.com/vllm-project/vllm-ascend/actions/runs/15407628722?pr=1040

Closes: https://github.com/vllm-project/vllm-ascend/pull/953

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: hfadzxy <starmoon_zhang@163.com>
2025-06-03 15:38:13 +08:00
068c3a0167 [Bugfix] Add verification for quant_action.choices to avoid TypeError (#1046)
### What this PR does / why we need it?

When I run vllm-ascend, I get this error msg:

```bash
Traceback (most recent call last):
  File "/home/sss/software/miniconda3/envs/vllm-v1/bin/vllm", line 8, in <module>
    sys.exit(main())
  File "/home/sss/github/vllm-project/vllm/vllm/entrypoints/cli/main.py", line 50, in main
    cmd.subparser_init(subparsers).set_defaults(
  File "/home/sss/github/vllm-project/vllm/vllm/entrypoints/cli/serve.py", line 101, in subparser_init
    serve_parser = make_arg_parser(serve_parser)
  File "/home/sss/github/vllm-project/vllm/vllm/entrypoints/openai/cli_args.py", line 254, in make_arg_parser
    parser = AsyncEngineArgs.add_cli_args(parser)
  File "/home/sss/github/vllm-project/vllm/vllm/engine/arg_utils.py", line 1582, in add_cli_args
    current_platform.pre_register_and_update(parser)
  File "/home/sss/github/vllm-project/vllm-ascend/vllm_ascend/platform.py", line 80, in pre_register_and_update
    if ASCEND_QUATIZATION_METHOD not in quant_action.choices:
TypeError: argument of type 'NoneType' is not iterable
[ERROR] 2025-06-03-02:53:42 (PID:6005, Device:-1, RankID:-1) ERR99999 UNKNOWN applicaiton exception
```

This is because the `choices` attribute in `quant_action` can be `None`
and we don't check it.

```bash
# quant_action
_StoreAction(option_strings=['--quantization', '-q'], dest='quantization', nargs=None, const=None, default=None, type=<class 'str'>, choices=None, required=False, help='Method used to quantize the weights. If `None`, we first check the\n`quantization_config` attribute in the model config file. If that is\n`None`, we assume the model weights are not quantized and use `dtype` to\ndetermine the data type of the weights.', metavar=None)
```

Thus, I have added a check for `choices` to handle the scenario of
`choices=None`.
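
A minimal, self-contained sketch of the guard described above (an assumed shape of the fix, not the exact vllm-ascend patch; the constant name is taken from the traceback and defined locally here):

```python
import argparse

ASCEND_QUATIZATION_METHOD = "ascend"  # defined locally for illustration

parser = argparse.ArgumentParser()
quant_action = parser.add_argument("--quantization", "-q", type=str, default=None)

# `quant_action.choices` is None here, so a bare membership test raises
# TypeError. Checking for None first avoids the crash and still extends
# `choices` when argparse actually populated it.
if quant_action.choices is not None and \
        ASCEND_QUATIZATION_METHOD not in quant_action.choices:
    quant_action.choices.append(ASCEND_QUATIZATION_METHOD)
```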

### Does this PR introduce _any_ user-facing change?
Yes, the vllm server with ascend quantization works now.

### How was this patch tested?
by the `vllm serve --quantization ascend` command.

Related: https://github.com/vllm-project/vllm/issues/19004

Signed-off-by: shen-shanshan <467638484@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-03 11:44:45 +08:00
93860574bb [ModelRunner][MultiModal] Remove legacy input mapper/processor from V0 (#951)
### What this PR does / why we need it?
Remove legacy input mapper/processor from V0.

Find more details at
https://github.com/vllm-project/vllm-ascend/issues/673 and
https://github.com/vllm-project/vllm/pull/15686.

### Does this PR introduce _any_ user-facing change?
no.

### How was this patch tested?
Launch online service:

```bash
vllm serve Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--max_model_len 32768 \
--max-num-batched-tokens 32768
```

Query the server:

```bash
curl http://localhost:8000/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
    "model": "Qwen/Qwen2.5-VL-7B-Instruct",
    "messages": [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": [
        {"type": "image_url", "image_url": {"url": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png"}},
        {"type": "text", "text": "What is the text in the illustrate?"}
    ]}
    ]
    }'
```

Result:

```bash
{"id":"chatcmpl-619e70733ed148b3be3a0b6524ee0ef3","object":"chat.completion","created":1748226332,"model":"/home/sss/.cache/modelscope/hub/models/Qwen/Qwen2___5-VL-7B-Instruct","choices":[{"index":0,"message":{"role":"assistant","reasoning_content":null,"content":"The text in the illustration reads \"TONGYI Qwen.\"","tool_calls":[]},"logprobs":null,"finish_reason":"stop","stop_reason":null}],"usage":{"pro
```

Signed-off-by: shen-shanshan <467638484@qq.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-06-03 11:32:03 +08:00
6ec64a3f96 [bugfix] some bugs maybe fail to run (#896)
### What this PR does / why we need it?
Solve the bug where the graph mode was the same for p and d, and some other
bugs.
### Does this PR introduce _any_ user-facing change?
No, there wouldn't be.
### How was this patch tested?
Follow the end-to-end test

Signed-off-by: ningbenzhe1 <ningbenzhe@huawei.com>
2025-06-03 11:07:33 +08:00
92bc5576d8 Skip benchmarks/** in vllm ascend test (#1041)
### What this PR does / why we need it?
Skip benchmarks/** in vllm ascend test to reduce CI cost

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-06-01 19:01:26 +08:00
507ae627ca feat: support compile torchair graph while warming up (#839)
### What this PR does / why we need it?
feat: support compile torchair graph while warming up

Signed-off-by: boying <897013703@qq.com>
2025-05-31 06:03:03 +08:00
d9fb027068 [CI] Add benchmark workflows (#1014)
### What this PR does / why we need it?

Add benchmark workflows

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Run locally

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-05-30 22:42:44 +08:00
5a1689fc64 [Fix] Fix update_aclgraph_sizes when running MoE models (#913)
### What this PR does / why we need it?
Fix update_aclgraph_sizes when running MoE models.

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-05-30 15:17:11 +08:00
3442fbdb23 [1/N][UT][v1 MTP] add basic v1 mtp features (#890)
### What this PR does / why we need it?
add basic v1 mtp features
please merge it after
https://github.com/vllm-project/vllm-ascend/pull/874 and
https://github.com/vllm-project/vllm-ascend/pull/844.

### Does this PR introduce _any_ user-facing change?
Now we support basic v1 MTP; only TP, eager mode, and k=1 are supported.
We will continue to expand to more scenarios.

### How was this patch tested?
Tested locally.

Signed-off-by: XWFAlone <xuewenfei2@huawei.com>
Co-authored-by: mengwei805 <mengwei25@huawei.com>
Co-authored-by: JC-ut0 <xuyexiong@huawei.com>
2025-05-30 08:59:58 +08:00
5903547d09 [doc] add 0.7.3.post1 release note (#1008)
Add release note for 0.7.3.post1
Add the missing release note back for 0.7.3

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-05-29 17:38:34 +08:00
c464c32b81 add doc for offline quantization inference (#1009)
add example for offline inference with quantized model

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-05-29 17:32:42 +08:00
05a471001b bugfix for qwen2_5_vl (#805)
### What this PR does / why we need it?
The interface of qwen2.5vl changed from column linear to qkv linear,
which made our weight pad func behave abnormally, so we optimize the
split_qkv func to fix this bug.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
with CI

Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
2025-05-29 17:20:39 +08:00
a93bed4535 [aclgraph] implement NPUPiecewiseBackend to enable aclgraph (#836)
### What this PR does / why we need it?
1. Implement `NPUPiecewiseBackend` to enable aclgraph
2. Enable aclgraph by default in V1, but raise an error when running
deepseek and a warning when running models other than qwen

### How was this patch tested?
CI passed with the new UT

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-05-29 11:58:26 +08:00
cc74b97f74 [Bugfix][V1] Fix deepseek with v1 (#958)
### What this PR does / why we need it?
Fix deepseek with v1. This error was introduced by
https://github.com/vllm-project/vllm-ascend/pull/945, and this PR fixes
the block table of MLA.

### How was this patch tested?
CI passed with the newly added test.

Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-05-29 11:57:43 +08:00
e3c7f71462 [Perf] Refactor tensor disposal logic to reduce memory usage (#966)
### What this PR does / why we need it?
1. In previous PRs https://github.com/vllm-project/vllm-ascend/pull/580
https://github.com/vllm-project/vllm-ascend/pull/784, I saved GPU memory
by promptly deleting unnecessary tensors. For tensors passed from
upper-layer functions, I used a list container to transfer the parameter
and then popped the tensor from the list within the inner function to
achieve deletion. Recently, I discovered a better implementation in
sglang—the `dispose_tensor` function and I recommend adopting this
approach.
2. Dispose `hidden_states` and `residual` from the previous layer once
they're no longer used.
3. Avoid to generate `self.inputs_embeds` in `ModelRunnerV1` in
non-multimodal scenarios.

With the aforementioned optimizations, using the DeepSeek-R1-W8A8 model
under the conditions of `TP=16` and `max-model-len=32768`, we can save
1.3GB of npu memory.

**Reference**: https://github.com/sgl-project/sglang/pull/6147
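
A minimal sketch of what such a `dispose_tensor` helper does (based on the referenced sglang approach; a sketch, not the exact vLLM-Ascend code):

```python
import torch

def dispose_tensor(x: torch.Tensor) -> None:
    # Swap the tensor's storage for an empty one so its device memory can be
    # reclaimed immediately, even while other Python references to the tensor
    # object are still alive.
    x.set_(torch.empty((0,), device=x.device, dtype=x.dtype))

hidden_states = torch.randn(1024, 4096)
residual = hidden_states + 1.0
dispose_tensor(hidden_states)  # free the old activation once it is no longer needed
```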

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

---------

Signed-off-by: ApsarasX <apsarax@outlook.com>
2025-05-29 11:48:26 +08:00
6eddbd2521 [CI/UT][PD Disaggregate] Initialize PD Disaggregate UT (#889)
Initialize PD Disaggregate UT

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-05-29 10:17:12 +08:00
f6e5decc10 [CI] upgrade to vllm 0.9.0 (#959)
Upgrade to vllm 0.9.0.
0.8.5 will not be supported any more.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-05-28 21:18:41 +08:00
e2a0c19cea [CI] Refactor CI (#952)
1. remove some useless test func and file
2. fix format.sh problem
3. enable full test for singlecard and multicard
4. move long term tests to the long_term folder. This kind of test only
runs when labeled and in the daily test. Includes: spec decode and accuracy tests

## After refactor:
There are 4 test modules
- `singlecard`: contains the test running on one NPU. It'll be run for
each PR and daily test.
- `multicard`: contains the test running on multi NPUs. It'll be run for
each PR and daily test.
- `long_term`: contains the test that cost much time(Now include `spec
decode` and `accuracy` test). It'll be run for the PR with
`long-term-test` labeled and daily test.
- `e2e`: contains the test for doc and pd feature. It'll be run for the
PR with `pd-test` labeled and daily test.

## Todo:
1. Some tests are skipped; they should be fixed and re-enabled in the
future.
2. The pyhccl test for multicard doesn't work at all. It should be enabled
as well.
3. Ensure long-term-test passes in the daily test.

### Known issue
Now, the `ready` label is required to start the pd test or long term test. And
when `long-term-test` or `pd-test` is labeled after the other, the previously
labeled test will be re-run. So the labeled tests should be run in
the following steps:

1. Decide which tests need to run, then label them: `long-term-test` or
`pd-test` or both.
2. Add the `ready-for-test` label; the tests will then be run.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-05-28 06:31:35 +08:00
9f5ab59e30 [WIP][BugFix]Fix accuracy issues caused by wrong etp_size passed into FusedMoEParallelConfig when using vLLM 0.9.0 (#961)
### What this PR does / why we need it?
This PR fixes accuracy issues incurred by the code that adapts to
`FusedMoEParallelConfig` in vLLM 0.9.0. The `tp_size` used to
split weights is wrongly passed. The root cause is that the vLLM community
and vLLM-Ascend use different methods to decide whether to use
Expert Parallel.

vLLM:
vLLM uses a flag `enable_expert_parallel` to indicate whether to use EP
and uses the following code to decide `ep_size`:
```
        use_ep = (dp_size_ * tp_size_ > 1
                  and vllm_parallel_config.enable_expert_parallel)

        dp_size = dp_size_
        dp_rank = get_dp_group().rank_in_group if dp_size > 1 else 0
        tp_size, tp_rank = flatten_tp_across_dp(dp_rank)

        if not use_ep:
            return FusedMoEParallelConfig(tp_size=tp_size,
                                          tp_rank=tp_rank,
                                          dp_size=dp_size,
                                          dp_rank=dp_rank,
                                          ep_size=1,
                                          ep_rank=0,
                                          use_ep=False)
        # DP + EP / TP + EP / DP + TP + EP
        assert use_ep
        # In EP, each device owns a set of experts fully. There is no tensor
        # parallel update tp_size, tp_rank, ep_size and ep_rank to reflect that.
        ep_size = tp_size
        ep_rank = tp_rank
        return FusedMoEParallelConfig(tp_size=1,
                                      tp_rank=0,
                                      dp_size=dp_size,
                                      dp_rank=dp_rank,
                                      ep_size=ep_size,
                                      ep_rank=ep_rank,
                                      use_ep=True)
```

vLLM-Ascend:
vLLM-Ascend uses `etp` to specify Tensor Parallel in MoE.
```
            self.ep_size = get_ep_group().world_size
            self.tp_size = get_etp_group().world_size
            self.dp_size = (dp_size if dp_size is not None else
                            get_dp_group().world_size)
```

So there will be conflicts if we simply combine these pieces of code.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
2025-05-27 15:16:17 +08:00
01e3d59eae add workflow to build and release wheel (#775)
### What this PR does / why we need it?

This is a continuing work of #716.
This PR add workflow to build and release wheel, and also release source
to PYPI.
We have 3 conditions to trigger the workflow:

1. PR to `main` and `*-dev`
2. push to `main` and `*-dev`
3. push tag with name of `v*`

Release to PYPI will only be done under condition 3. Under condition 1
and 2, it will generate .tar.gz and build .whl, upload to github
artifacts but will not release.

update:
Will build .whl and upload to github artifacts with scheduled task.


### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
All triggered conditions are well tested with my fork repo.

---------

Signed-off-by: Shuqiao Li <celestialli@outlook.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-05-26 14:18:26 +08:00
a0c3e9ba50 [Bugfix] Adjust inputbatch to be compatible with latest vllm (#945)
Adjust inputbatch to be compatible with the latest vllm, as the kvcache group
feature has been redone in https://github.com/vllm-project/vllm/pull/18593

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-05-26 10:33:28 +08:00
1f9fb869ad [BugFix] Fix accuracy bugs for unquantized deepseekv3 models (#897)
### What this PR does / why we need it?
This PR fixes two accuracy bugs incurred by PR #819 when running
deepseekv3 series models:
1. #819 adds `all_to_all` communication in quantized cases, but
`all_gather` && `reduce_scatter` are removed in both the quantized and
unquantized cases. When running unquantized deepseekv3 models with
`ep_size == world_size`, the moe modules fail to communicate. Therefore,
this PR adds `all_to_all` communication in the unquantized situation to
solve this accuracy issue.
2. Use `ep_size` rather than `dp_size` to decide whether to use
`all_to_all` in moe.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with new added/existing test.

---------

Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
2025-05-24 14:29:36 +08:00
17f05b1089 [Feature] Add CustomQwen3MoeForCausalLM model (#925)
Tweak packed_modules_mapping to support W8A8 weights.

### What this PR does / why we need it?

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-05-23 15:50:48 +08:00
df58fb80ee Spec decode support for V1 Engine (#874)
### What this PR does / why we need it?
Make spec decode work with the V1 Engine.
- Currently, Ascend does not support Triton kernels, so PyTorch is used
to rewrite the `rejection_sampler.py` Triton kernel. However, PyTorch is
not as fast as Triton, so Ascend C will be used to implement this
function in the future.
- Currently, spec decode supports only the ngram algorithm. The eagle
algorithm needs to be further adapted.
### Does this PR introduce _any_ user-facing change?
No user-facing change.

### How was this patch tested?
Tested by `tests/singlecard/spec_decode/e2e/test_v1_spec_decode.py` and
`tests/sample/test_rejection_sampler.py`, which test the base function of the
rejection sampler and the e2e function of spec decode.

Signed-off-by: ponix-j <657511300@qq.com>
2025-05-23 14:25:46 +08:00
a970b27e2d [WIP][Perf]remove unnecessary padding before MLA V1 prefill (#917)
### What this PR does / why we need it?
Currently, the implementation for MLA V1 pads q, k, v to `head_dim` 256
to conform to the early MLA kernel. But the new MLA kernel supports
`head_dim` values that can't be divided by 128, so we can remove those
unnecessary paddings to boost the performance.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?

Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
2025-05-23 14:14:06 +08:00
dc6172efd3 update attention nz and mla nz(Improve TPOP 6ms performance) (#909)
### What this PR does / why we need it?
Update attention nz and mla nz modules to improve TPOP 6ms performance
Convert W_UV and W_UK_T to NPU format in mla_v1.py
Convert layer.weight to NPU format in w8a8.py

Signed-off-by: ttanzhiqiang <389825161@qq.com>
2025-05-23 10:18:10 +08:00
7153d8890b [Feature] Impl v1 disaggregated prefill in ascend scheduler (#852)
Implement save kv cache logic for v1 disaggregated prefill in ascend
scheduler

This PR adds support for saving kv cache in the ascend scheduler, which
is part of the v1 disaggregated prefill design. The load functionality
is not yet implemented.

Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
2025-05-23 10:15:29 +08:00
b434f37b46 [V1] Revert the default value of enable_chunked_prefill in additional… (#935)
### What this PR does / why we need it?
Revert the default value of enable_chunked_prefill to 'False' in
additional_scheduler_config. In engine v1, enable_chunked_prefill is
forcibly set to True in VllmConfig, which causes it to be perceived as
True in check_and_update_config(). As a result, when the v0 scheduler is
enabled, the chunked prefill feature remains active, leading to the
failure of the v0 scheduler and causing it to fall back to the native v1
scheduling logic.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with new added/existing test.

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-05-23 10:06:50 +08:00
46df67a5e9 [bugfix] Improve log level and info for custom ops build (#937)
### What this PR does / why we need it?
Fix the bug of #703, where vllm wrongly raised the ERROR: Failed to
import vllm_ascend_C: No module named 'vllm_ascend.vllm_ascend_C'. The
format for reporting vllm_ascend_C import failures is unified as a warning:
("Failed to import vllm_ascend_C:%s", e).

### Does this PR introduce _any_ user-facing change?
No

---------

Signed-off-by: yangpuPKU <604425840@qq.com>
2025-05-23 10:05:57 +08:00
8ddc0a1002 [DOC] mark v1 multi-lora functional (#932)
### What this PR does / why we need it?
Update feature support for lora

### Does this PR introduce _any_ user-facing change?

no

### How was this patch tested?
preview

Signed-off-by: paulyu <paulyu0307@gmail.com>
Co-authored-by: paulyu <paulyu0307@gmail.com>
2025-05-22 19:53:14 +08:00
0f53b138f6 [V1][LoRA][Test] V1 Engine LoRA support & e2e test (#893)
### What this PR does / why we need it?

Add V1Engine LoRA support.
Add LoRA e2e test on single card and multiple cards.

### Does this PR introduce _any_ user-facing change?
support lora for V1

### How was this patch tested?

CI passed with new added test

---------

Signed-off-by: jesse <szxfml@gmail.com>
Signed-off-by: paulyu <paulyu0307@gmail.com>
Signed-off-by: paulyu12 <507435917@qq.com>
Co-authored-by: jesse <szxfml@gmail.com>
Co-authored-by: paulyu <paulyu0307@gmail.com>
2025-05-22 19:20:51 +08:00
7aa4f85f10 [Bugfix][kvcache] revert multiple kv cache groups (#923)
Revert multiple kv cache groups related changes as this feature is
reverted in vllm https://github.com/vllm-project/vllm/pull/18459

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-05-22 15:15:33 +08:00
b4d6672d01 [BugFix] Fix chunked prefill bugs in engine v1 (#844)
### What this PR does / why we need it?
Fix the bugs when running the deepseek model in engine v1.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
CI passed with new added/existing test.

---------

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-05-22 10:33:50 +08:00
a73bd6caf4 [Fix] Set div_mode to False and fix view_as position (#912)
### What this PR does / why we need it?

Set div_mode to False to use the ACLNN kernel, which is crucial when
using ACL Graph.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-05-22 09:57:25 +08:00
58b413752b [Doc] Support XLM-RoBERTa-based and MiniCPM3 model (#820)
### What this PR does / why we need it?
support XLM-RoBERTa-based and MiniCPM3 model

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-05-21 15:44:54 +08:00
d5401a08be [DOC] update modelslim version (#908)
1. update modelslim version to fix deepseek related issues
2. add note for "--quantization ascend"

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-05-21 09:12:02 +08:00
5cf9ff18e9 [Performance]: Custom AscendC Kernel of Multi-Step Prepare Input (#814)
### What this PR does / why we need it?

- According to https://github.com/vllm-project/vllm-ascend/issues/807,
we submit this pull request for a custom AscendC kernel for multi-step
prepare input.
- Also, a bug we found in multi_step_runner.py when using multi-step on
the V0 Engine is fixed.


### Does this PR introduce _any_ user-facing change?

no user-facing change


### How was this patch tested?
We add a unit test file and an offline inference file to test the custom
AscendC kernel. See test/ops/test_multi_step.py and
examples/offline_multi_step.py

---------

Signed-off-by: wan_danfeng <wonderful199082@126.com>
2025-05-20 09:31:30 +08:00
00e0243561 enable online serving quantization (#877)
For online serving, the "ascend" quantization method is not natively a
choice, so we need to add the "ascend" quantization method to the quantization
methods list so that users can enable quantization using the "vllm serve
--quantization ascend" command.

---------

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-05-17 17:36:04 +08:00
a8730e7a3c [Doc] update quantization docs with QwQ-32B-W8A8 example (#835)
1. replace the deepseek-v2-lite model with the more practical QwQ 32B model
2. fix some incorrect commands
3. replace the modelslim version with a more formal tag

Signed-off-by: 22dimensions <waitingwind@foxmail.com>
2025-05-17 15:25:17 +08:00
7326644513 [CI] Fix qwen2.5 vl CI failure (#888)
The [vllm
commit](67da5720d4)
changed the input and rotary position embedding for qwen 2.5 vl, which
breaks CI. This PR quickly fixes the CI failure for qwen2.5 vl.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-05-17 05:13:32 +08:00
df16c4f2bc [CI/UT] Ignore vllm/tests/test_vllm_port.py (#887)
Ignore `vllm/tests/test_vllm_port.py` in UT as it is not related to
vllm-ascend and it is breaking CI

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-05-16 18:52:59 +08:00
7a325b2e2d [Bugfix][Model] Fix fusedmoe and make modelrunner_v1 compatible with latest vllm (#867)
### What this PR does / why we need it?
This PR fixes the CI failure caused by vllm.
1. add moe_config for fused_moe
2. adjust the kv cache group change from vllm. Currently vllm-ascend
doesn't support this feature; this is just a quick fix for backward
compatibility

fix: #872

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-05-16 12:14:55 +08:00
fd515cd60b [Doc][BugFix]Fix Release Compatibility Matrix (#865)
### What this PR does / why we need it?
Fix Release Compatibility Matrix

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-05-15 15:38:38 +08:00
1e67089bc9 [BugFix]add all2all when dp_size > 1 && downgrade npu_dequant_swiglu_quant (#819)
### What this PR does / why we need it?
1. This PR introduces a native `all_to_all` communication operator to fix
`allgather` bugs when dp_size > 1. Besides, it adds a naive
implementation of force-load-balance when doing profile runs.
2. The operator `npu_dequant_swiglu_quant` only supports input
hidden_states with dtype `torch.int32`. This tensor occupies space of
`global_bs * seq_len * topk * hidden_size`, which might be very large as
`ep_size` grows. Therefore we need to disable this operator and use
original `swiglu` && `quantize`.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
By performing offline inference:

![image](https://github.com/user-attachments/assets/e003d5dc-0753-41ae-9303-e87f73ac6828)

---------

Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
2025-05-15 09:19:55 +08:00
68fb63428b [CI] Patch torch.library.infer_schema for fused moe ops to fix CI (#854)
Make sure the pytorch infer_schema check is patched before any case that
uses fused moe ops:
1. model registration
2. quantization loading
3. fused moe UT

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-05-14 19:49:09 +08:00
508242425c [CI][1/N] Add basic ci for PD disaggregation (#830)
### What this PR does / why we need it?
Add basic CI for PD disaggregation, and enable it on schedule and when
labeled with `module:pd`

- Updated `.github/actionlint.yaml` to add a new self-hosted runner
configuration: `linux-arm64-npu-static-8`.
- Introduced a new GitHub Actions workflow
`.github/workflows/vllm_ascend_test_pd.yaml` for PD disaggregation
testing:
- Scheduled to run daily at 23:00 UTC and triggered by pull request
label `module:pd`.
- Added steps for basic installation; other steps will be added in a
followup PR

Related: https://github.com/vllm-project/vllm-ascend/issues/841

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
- CI passed
- No trigger by default
<img width="847" alt="image"
src="https://github.com/user-attachments/assets/23aa128f-526d-447f-91c8-8ebf6be8400f"
/>
- Trigger only if we tag with pd
<img width="930" alt="image"
src="https://github.com/user-attachments/assets/aef1caca-2029-48e8-a6e6-860136adcd37"
/>

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-05-14 18:04:16 +08:00
59e02502b1 [CI] Add e2e test frame work and doctest (#730)
### What this PR does / why we need it?
Add quickstart doctest CI

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
- CI passed
- Run `/vllm-ascend/tests/e2e/run_doctests.sh`
Related: https://github.com/vllm-project/vllm-ascend/issues/725

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-05-14 09:27:54 +08:00
857f489cbf [CI] Patch torch.library.infer_schema for torch 2.5 backward compatibility (#837)
Patch torch.library.infer_schema for torch 2.5 backward compatibility

- Introduced a new module `patch_utils` under
`vllm_ascend/patch/worker/patch_common/`.
- Added a function `ascend_direct_register_custom_op` to handle custom
operator registration with backward compatibility for PyTorch < 2.7
(such as torch 2.5.1).
- Implemented type conversion logic for annotations to ensure
compatibility across different PyTorch versions.
- Registered the function `ascend_direct_register_custom_op` to
`utils.direct_register_custom_op`.

- Updated `__init__.py` to include `patch_utils` as the first import.
- Ensured `patch_utils` is available for use in other patch files and
skipped isort checks for `patch_utils` import.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-05-14 09:20:55 +08:00
e564470338 [Attention][Kernel]moe support for llama4 and mllama4 (#740)
### What this PR does / why we need it?
moe support for llama4 and mllama4 in vllm-ascend

### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?
start sever:
python -m vllm.entrypoints.openai.api_server --model
/data/nfs/benchmark/tokenizer/Llama-4-Scout-17B-16E-Instruct \
--max-num-seqs=256 \
--max-model-len=8192 \
--tensor-parallel-size=8 \
--block-size=128 \
--dtype bfloat16 \
--host=0.0.0.0 \
--port=8000 \
--gpu-memory-utilization=0.9 \
--trust-remote-code

client:
python online_server.py --model-path
/data/nfs/benchmark/tokenizer/Llama-4-Scout-17B-16E-Instruct
--image-path /data/nfs/w60040464/cherry_blossom.jpg --docker-ip
7.242.108.253 --served-port 8000 --text "what is the content of this
image?"

result:
{'id': 'chatcmpl-2b709a5d2e1a4017991ec4ba8248686a', 'object':
'chat.completion', 'created': 1747056823, 'model':
'/data/nfs/benchmark/tokenizer/Llama-4-Scout-17B-16E-Instruct',
'choices': [{'index': 0, 'message': {'role': 'assistant',
'reasoning_content': None, 'content': 'The image depicts a tower, likely
Tokyo Skytree, framed by branches of a cherry blossom tree. The tower is
white and has a distinctive shape, with a large sphere at the top and a
long, thin spire extending from it. The branches of the cherry blossom
tree are in the foreground, with pink flowers blooming on them. The
background is a clear blue sky.\n\n**Key Features:**\n\n* **Tower:**
White, spherical shape at the top, long thin spire\n', 'tool_calls':
[]}, 'logprobs': None, 'finish_reason': 'length', 'stop_reason': None}],
'usage': {'prompt_tokens': 2340, 'total_tokens': 2440,
'completion_tokens': 100, 'prompt_tokens_details': None},
'prompt_logprobs': None}

Signed-off-by: chenxu <chenxu68@huawei.com>
Co-authored-by: chenxu <chenxu68@huawei.com>
Co-authored-by: evian <eviantai@u.nus.edu>
2025-05-13 19:12:40 +08:00
217211d8a3 [Misc][Doc] Add the latest stable release url (#826)
### What this PR does / why we need it?
 Add the latest stable release url

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-05-13 12:53:23 +08:00
c6ac399091 [Bugfix] Fix the method of importing environment variables in DeepSee… (#817)
### What this PR does / why we need it?
Fix the method of importing environment variables in DeepSeek model to
support successful compilation via aclgraph.

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-05-13 12:52:30 +08:00
6193ba679b [CI] add codespell CI and fix format.sh (#827)
1. Fix format check error to make format.sh work
2. Add codespell check CI 
3. Add the missing required package for vllm-ascend.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-05-12 22:04:48 +08:00
5998704c08 [BugFix] Fix ascend scheduler bugs. (#822)
This PR fixes two bugs in AscendScheduler:
1. When running with high concurrency, the length of the running queue may
exceed the limit of max_num_seqs.
2. When some requests are preempted and recomputation is activated, the
logic for computing new tokens is wrong.

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-05-12 21:15:17 +08:00
701b0fd95e [Enhancement] Add padding for ACL Graph (#803)
### What this PR does / why we need it?
Add padding for ACL Graph and refactor graph batch size adjustments to
utils.py

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-05-12 20:26:22 +08:00
efabd722eb feat: support torchair graph mode in v1 engine (#789)
### What this PR does / why we need it?
support torchair graph mode with v1 engine

---------

Signed-off-by: boying <897013703@qq.com>
2025-05-12 19:14:07 +08:00
4a2505f81f [accuracy test]Update cann version and huggingface-hub version for Qwen3 (#823)
### What this PR does / why we need it?
1.  update cann version to 8.1.0 for multimodal
2.  fix huggingface-hub version to adapt to qwen3
3.  change Qwen3-8B to Qwen-8B-Base,

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-05-12 19:12:48 +08:00
5305a2ccf9 [Bugfix] Tweak distributed process group initialization and add dummy… (#816)
fix batch execution method to enable DP in V1

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-05-12 17:31:29 +08:00
4df1e99614 [CI] Re-enable vllm-empty/tests/benchmarks (#812)
### What this PR does / why we need it?
Since
[#17962](https://github.com/vllm-project/vllm/pull/17962?notification_referrer_id=NT_kwDOCexQHLUxNjM0MTM3OTEwNDoxNjY0ODE5NDg#event-17608938997)
has merged, the vllm openapi server can now launch normally on python==3.10,
so we re-enable the related tests

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-05-12 15:50:48 +08:00
8e4e791fcd [CI] Add deepseek-v2-lite test (#631)
### What this PR does / why we need it?
Add deepseek-v2-lite test, part of #499 
---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-05-12 14:59:17 +08:00
cdece86f2c [Bugfix] Add max_num_batched_tokens to InputBatch to make main CI pass (#806)
### What this PR does / why we need it?

1. Fix V1 error found by
[nightly_ci](https://github.com/vllm-project/vllm-ascend/actions/runs/14950004754/job/41998136610),
broken by [[v1] Pass BlockTable and KVCacheSpec to
AttentionMetadataBuilders
#17483](https://github.com/vllm-project/vllm/pull/17483), making the
`InputBatch` parameters consistent with vllm.
2. Disable the benchmark and fix it upstream.

### Does this PR introduce _any_ user-facing change?

No


### How was this patch tested?

CI passed

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-05-12 00:36:56 +08:00
218f21de21 [Benchmarks] Add qwen2.5-7b test (#763)
### What this PR does / why we need it?
- Add qwen2.5-7b test
- Optimize the documentation to be more developer-friendly 

Signed-off-by: xuedinge233 <damow890@gmail.com>
Co-authored-by: xuedinge233 <damow890@gmail.com>
2025-05-10 09:47:42 +08:00
19c8e134e4 [CI/UT] fix spec ut in vllm-ascend main and vllm main (#759)
### What this PR does / why we need it?
#### 1. fix spec ut in vllm-ascend main and vllm main
As https://github.com/vllm-project/vllm-ascend/pull/694 and
https://github.com/vllm-project/vllm-ascend/pull/749 verify, the spec UT now
passes on vllm-ascend main with vllm 0.8.5, but CI fails on vllm-ascend main
with vllm main.

I found the reason is a triton bug,
https://github.com/triton-lang/triton/issues/2266, but I didn't figure
out why the bug did not affect vllm-ascend main with vllm 0.8.5;
maybe the usage of triton changed between vllm 0.8.5 and the latest main.

As the bug describes, I changed the minimum block_size in the UT from 8 to
16, and the modification is verified locally to be effective.

#### 2. Modify the skip form of some cases
I modified some commented-out cases to use the skipif form, which is more
standardized.

### Does this PR introduce _any_ user-facing change?
None

### How was this patch tested?
CI

Signed-off-by: mengwei805 <mengwei25@huawei.com>
2025-05-10 09:45:56 +08:00
58d2f85c4a [CI] Fix schedule trigger bug (#757)
### What this PR does / why we need it?
This PR aims to fix nightly ci
[broken](https://github.com/vllm-project/vllm-ascend/actions/runs/14848150987)
We have a workflow containing multiple triggers:

- push events (to the default branch)
- pull requests (against the default branch)
- scheduled events
Our paths-filter action works great for the first two use-cases,
detecting the context and base to compare against. However, it fails for
scheduled events giving the error `This action requires 'base' input to
be configured or 'repository.default_branch' to be set in the event
payload.`
For the scheduling trigger event, we choose to skip this filter
because we don't need its results:
```
      - name: Check for changes in Speculative Decode
        if: github.event_name != 'schedule'
```

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-05-10 09:45:07 +08:00
804ebb17bd [Doc] Move Release Compatibility Matrix to top and remove v0.7.x rc info (#799)
### What this PR does / why we need it?
- Move Release Compatibility Matrix to top 
- Remove v0.7.x rc info because the v0.7.3 final release was already published
- Rename vllm-ascend to vLLM Ascend

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Preview

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-05-09 16:41:50 +08:00
fa99f89e93 [Core] Support the features of prefix cache and chunked prefill in v0/v1 (#782)
### What this PR does / why we need it?
Support the features of prefix cache and chunked prefill in v0/v1.

---------

Signed-off-by: rjg-lyh <1318825571@qq.com>
2025-05-09 16:39:28 +08:00
324f819b92 [Perf] Optimize fused_experts quantization code to save npu memory (#784)
### What this PR does / why we need it?
In the w8a8 quantization code of `fused_experts`, the output of almost
every operator is assigned a new variable name. If we want to save NPU
memory, we manually `del` these variables to end their lifecycle, which
fills the code with `del` statements and looks inelegant.
Therefore, I plan to name the output of most operators
`hidden_states`, thereby ending the lifecycle of the previous
`hidden_states` (see the sketch below).
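
A small standalone illustration of the rebinding pattern (plain torch ops are used here purely as stand-ins for the real quantized MoE operators):

```python
import torch

x = torch.randn(4, 8)
w1, w2 = torch.randn(8, 8), torch.randn(8, 8)

# Rebinding the same name drops the last reference to the previous
# intermediate tensor, so its memory is freed without an explicit `del`.
hidden_states = torch.matmul(x, w1)
hidden_states = torch.nn.functional.silu(hidden_states)  # matmul output freed here
hidden_states = torch.matmul(hidden_states, w2)          # silu output freed here
```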

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

Signed-off-by: ApsarasX <apsarax@outlook.com>
2025-05-09 15:09:37 +08:00
2c685e3b61 [Bugfix] Correct method call for _set_cos_sin_cache (#774)
This change ensures proper functionality for longer sequences by
correctly invoking the _set_cos_sin_cache method with self as the first
argument.

For example, with DeepSeek R1, if this change isn't made, the program
will crash when the input sequence exceeds 4096.
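
A minimal illustration of the call pattern in question (a toy class, not the actual rotary embedding code):

```python
class Rope:
    def _set_cos_sin_cache(self, seq_len: int) -> None:
        self.cached_len = seq_len

rope = Rope()
rope._set_cos_sin_cache(4096)        # bound call: `self` is passed implicitly
Rope._set_cos_sin_cache(rope, 8192)  # unbound call: `self` must be passed explicitly
# Rope._set_cos_sin_cache(8192)      # bug pattern: 8192 would be treated as `self`
```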

Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
2025-05-09 12:55:57 +08:00
5301649108 [Doc] Add notes for OOM in FAQs (#786)
### What this PR does / why we need it?
add notes for OOM in faqs.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

---------

Signed-off-by: zzzzwwjj <1183291235@qq.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-05-08 16:28:29 +08:00
6c020883a8 [WIP]Add Func: aclgraph_batch_size auto-adjust to different model (#771)
### What this PR does / why we need it?
This PR adds a new function: aclgraph_batch_size can dynamically adjust to
different models. Before this PR, the aclgraph_batch_sizes given from
vllm to vllm-ascend were always too large, which may result in an ERROR while
running on different models, with the message: "The resources are
insufficient".
Now, with this PR, the code can dynamically adjust aclgraph_batch_sizes
depending on the model hidden_layer_nums and parallel config, for example:
a. for Qwen2.5-7B, the aclgraph_batch_size length is 33 in total;
b. for Qwen2.5-72B, the aclgraph_batch_size length is 11 in total;

Signed-off-by: chris668899 <15105191595@126.com>
2025-05-08 16:23:33 +08:00
2e3520e285 [Bugfix] Fix output tensor shape in vanilla_chunked_prefill and update import paths for model_loader (#773)
### What this PR does / why we need it?
Fix output tensor shape in vanilla_chunked_prefill function.

### Does this PR introduce _any_ user-facing change?
None.

### How was this patch tested?
Run offline inference on DeepSeek models.

---------

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-05-08 14:19:26 +08:00
ec27af346a [Doc] Add 0.8.5rc1 release note (#756)
### What this PR does / why we need it?
Add 0.8.5rc1 release note and bump vllm version to v0.8.5.post1

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?

CI passed

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-05-06 23:46:35 +08:00
2cd036ee8e [Bugfix] fix accuracy problem for quantized deepseek models (#768)
### What this PR does / why we need it?

The root cause of the bug is that numerical computations involving NaNs
cannot eliminate them. We addressed it by using `masked_fill_` to
eliminate the NaNs while avoiding the memory-wasting `torch.where` approach
(see the sketch below).
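
A minimal sketch of the two alternatives (a standalone example, not the actual deepseek quantization code):

```python
import torch

x = torch.tensor([1.0, float("nan"), 3.0])

# torch.where materializes a brand-new tensor of the same size:
cleaned = torch.where(torch.isnan(x), torch.zeros_like(x), x)

# masked_fill_ overwrites the NaN entries in place, avoiding the extra allocation:
x.masked_fill_(torch.isnan(x), 0.0)
```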

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
This patch was tested with vllm v0.8.5 and vllm-ascend master. I ran the
deepseek_v3 model with offline inference scripts
(examples/dp_offline/run_dp.sh & data_parallel.py).

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-05-06 22:09:56 +08:00
d6e9417652 [Bugfix] Fix masked_fill_ function typo (#769)
### What this PR does / why we need it?
Fix a function name typo: change `mask_fill_` to `masked_fill_`

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: ApsarasX <apsarax@outlook.com>
2025-05-06 21:54:52 +08:00
afe1767c17 [Core] Cleanup triton patch which has been fixed in vllm (#764)
### What this PR does / why we need it?
- Revert "Re-patch TritonPlaceholder on main to make CI happy (#753)"
because upstream main CI already merged:
https://github.com/vllm-project/vllm/pull/17446
- Keep 0.8.5.post1 compatible

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-05-06 18:52:15 +08:00
b0dbe5f8e1 [Bug fix] fix a typo in setup.py (#762)
### What this PR does / why we need it?
Fix a typo in setup.py. Currently, it does not affect any
functionality or interfaces.

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-05-06 17:01:26 +08:00
5897dc5bbe [Build] Bump vLLM version to v0.8.5.post1 (#755)
### What this PR does / why we need it?
Bump vllm version to v0.8.5.post1

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-05-06 11:44:12 +08:00
d6bfae8eee support 32K model len on deepseek r1 W8A8 (#728)
### What this PR does / why we need it?

Optimize NPU memory usage.
https://github.com/vllm-project/vllm-ascend/issues/723

vllm v0.8.4.rc2 and DeepSeek R1 can only support a model length of 16K.
When attempting to run with a model length of 32K, an "Out of Memory"
(OOM) error will occur.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: sunbaosong <13793883820@163.com>
2025-05-06 10:12:07 +08:00
79538b5d73 Upgrade CANN version to 8.1.rc1 (#747)
### What this PR does / why we need it?

Make the CANN version bump separate from
https://github.com/vllm-project/vllm-ascend/pull/708

- Upgrade CANN version to 8.1.rc1
- Add prefix to speed up download
`m.daocloud.io/quay.io/ascend/cann:8.1.rc1-910b-ubuntu22.04-py3.10`
- Address trailing space in Dockerfile.openEuler
- Add note for `/workspace` and `/vllm-workspace` as followup of
https://github.com/vllm-project/vllm-ascend/pull/741

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?

CI passed

Co-authored-by: MengqingCao <cmq0113@163.com>

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
2025-05-06 05:44:18 +08:00
d7e1110c8e Re-patch TritonPlaceholder on main to make CI happy (#753)
### What this PR does / why we need it?
Re-patch TritonPlaceholder on main to make CI happy
- Add triton patch back until
https://github.com/vllm-project/vllm/pull/17446 resolved
- Move patch_main before patch_common to resolve minicpm triton import
issue
- Add `0.8.5` and `0.8.5.post1` to make patch work on 0.8.5 all versions

Related:
- https://github.com/vllm-project/vllm-ascend/pull/704
- https://github.com/vllm-project/vllm-ascend/pull/690

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
All CI passed include main

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-05-05 23:22:24 +08:00
d2ead057ae Re-enable Speculative Decode test for vLLM v0.8.5 (#749)
### What this PR does / why we need it?
Re-enable Speculative Decode test for vLLM v0.8.5

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-05-02 14:44:48 +08:00
8b194ad12e [Disaggregated Prefill] P2P Disaggregated Prefill based on llm_datadist (#694)
### What this PR does / why we need it?
- This PR proposes a P2P version of Disaggregated Prefill based on
llm_datadist which manages data transfer.

- This solution reconstructs the previous offline single-node Disaggregated
Prefill solution, and supports multi-node and online serving now.

- Currently this solution supports 1P1D situation of Deepseek hybrid
parallelism (P: TP+EP, D: DP+EP). Note that xPyD situation is considered
in the solution design, and will be supported soon within v1 engine.

---------

Signed-off-by: hw_whx <wanghexiang7@huawei.com>
Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
Co-authored-by: hw_whx <wanghexiang7@huawei.com>
Co-authored-by: ganyi <pleaplusone.gy@gmail.com>
2025-05-01 22:31:36 +08:00
84e2ed898b performance optimization, usability optimization and API compatibility adjustments for deepseek with npu graph mode (#731)
### What this PR does / why we need it?
1. Improve inference speed and usability for deepseek models with NPU
graph mode.
2. Modify some code to adapt to CANN 8.1.RC1.beta1.
3. Add a switch for NPU graph mode and its cache.

### Does this PR introduce _any_ user-facing change?
This PR provides an experimental configuration to enable NPU graph mode
for Deepseek models. Users can set
additional_config={'enable_graph_mode': True} to try this feature. Note
that this feature is currently only supported on the V0 engine.


### How was this patch tested?
This patch was tested with the newest torch_npu 2.5.1
(https://pypi.org/project/torch-npu/#files) and CANN 8.1.RC1.beta1
toolkit&nnal&kernels
(https://www.hiascend.com/developer/download/community/result?module=cann)
released in 25/30 April.

Signed-off-by: linfeng-yuan <1102311262@qq.com>
2025-05-01 13:51:42 +08:00
399b03830d [Build][Bugfix] Fix source code path to avoid reference error (#726)
### What this PR does / why we need it?
Fix source code path to avoid reference error in docker image
fix https://github.com/vllm-project/vllm-ascend/issues/725

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-04-30 17:38:13 +08:00
3a628891ab [Feature] Add quant description file for new quant model generated by modelslim (#719)
### What this PR does / why we need it?
After discussing the quantization model format with MindStudio, we
decided to support another quant format which may be used in the new modelslim
tool. In that case, `quantization_config` may be removed from the
`config.json` file and `quant_model_description.json` will be used for
quantization configuration.
### Does this PR introduce _any_ user-facing change?
Yes, using the latest quantization format

### How was this patch tested?
Test locally

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-04-30 16:51:56 +08:00
affca6f348 [Test] Add accuracy test report workflow (#542)
### What this PR does / why we need it?
1. Provide accuracy test report for development branch release.
2. Models and datasets for accuracy test:
    
| Model | datasets |
| --- | --- |
| Qwen2.5-7B-Instruct | ceval-val, gsm8k, mmlu |
| Qwen3-8B | ceval-val, gsm8k, mmlu |
| Llama-3.1-8B-Instruct | ceval-val, gsm8k, mmlu |
| Qwen2.5-VL-7B-Instruct | mmmu_val |

### Does this PR introduce _any_ user-facing change?
This PR will display the accuracy test report of the release version in
docs/source/developer_guide/accuracy_report:
Qwen2.5-7B-Instruct.md
Qwen3-8B.md
Llama-3.1-8B-Instruct.md
Qwen2.5-VL-7B-Instruct.md

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-04-30 14:53:58 +08:00
ba9714ccee Optimize qwen2_vl and qwen2_5_vl (#701)
### What this PR does / why we need it?
Optimize qwen2_vl and qwen2_5_vl.

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
Tested this PR on a 1080p picture with tp=1, bs=1 on Qwen2-VL and
Qwen2.5-VL: every FA op's duration dropped from 11ms to 9ms, giving
roughly a 22% perf boost.

---------

Signed-off-by: zouyida2052 <zouyida@huawei.com>
Signed-off-by: zouyida2052 <zouyida2002@gmail.com>
Co-authored-by: zouyida2052 <zouyida@huawei.com>
2025-04-30 14:22:38 +08:00
90aabaeb2e [Doc] Add benchmark guide (#635)
### What this PR does / why we need it?
 Add benchmark developer guide

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-04-30 09:17:59 +08:00
f8350569e6 [CI] upgrade vllm to 0.8.5 (#715)
1. Upgrade vllm to 0.8.5
2. Drop 0.8.4 support
3. Keep doc to 0.8.4rc2 until we release 0.8.5

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-30 09:15:50 +08:00
95e7aa4736 [Platform] format platform to make it more clear (#610)
Platform should only contain the functions that come from vLLM. This PR
moves the unrelated functions to the right place to make platform more
clear.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-30 09:03:10 +08:00
b917361ca5 [MISC] Clean up torch_npu (#688)
torch_npu 2.5.1 supports autoload now. This patch does:
1. remove useless torch_npu imports
2. replace `torch_npu.npu` with `torch.npu`.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-29 18:03:38 +08:00
0329fad927 [Perf] Deepseekv3 performance optimization for eager mode (#598)
### What this PR does / why we need it?
Deepseek v3 currently adopts vanilla chunked prefill on the MLA part, which is
inefficient for computing but necessary for chunked prefill. Since PR
https://github.com/vllm-project/vllm-ascend/pull/543 brought the v0 scheduler
into vllm-ascend, we can now adopt torch_npu._npu_flash_attention inside
the MLA backend for a further performance boost. There was also some
redundant computation inside the rope, which is removed as well. This PR
should bring some performance gain for deepseek eager mode inference.

---------

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-04-29 17:12:03 +08:00
87975fa058 [Bugfix] Fix early return in CustomDeepseekV2MoE.forward during profile_run (#682)
### What this PR does / why we need it?

Fix #674 to avoid KVCache overallocation and OOM risks.

### Does this PR introduce _any_ user-facing change?

No

### How was this patch tested?

Signed-off-by: ApsarasX <apsarax@outlook.com>
2025-04-29 17:06:19 +08:00
7aee9228f0 [CI] Add nightly CI (#668)
### What this PR does / why we need it?
Add nightly CI for basic functionality and model usability

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-04-29 16:35:52 +08:00
d6be63e11d [CI] Add Qwen3-0.6B-Base test (#717)
### What this PR does / why we need it?
Add Qwen3-0.6B-Base for integration test

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-04-29 14:35:19 +08:00
0dae55a9a3 [MISC] fix format check error (#654)
This PR makes format.sh work as expected.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-29 11:14:19 +08:00
1fce70a2fb [Model] Support common fused moe ops for moe model, such as Qwen3Moe (#709)
vllm-ascend currently only supports MoE for deepseek. We should add common MoE
support back.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-28 21:57:01 +08:00
40bd602485 [Feature] Use reshape_and_cache fused op (#706)
Replace the torch function with the reshape_and_cache fused op for better
performance. The `reshape_and_cache` function wasn't working because it
expected a torch.int32 tensor, but a torch.int64 tensor was provided.

Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
2025-04-28 21:54:42 +08:00
d39855b075 Update installation and tutorial doc (#711)
### What this PR does / why we need it?
Update installation and tutorial doc

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?
preview

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-04-28 21:52:17 +08:00
5995d23532 [Doc] Add 0.8.4rc2 release note (#705)
Add 0.8.4rc2 release note

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-28 21:51:35 +08:00
54c0e63df7 [MTP] follow custom deepseek modeling changes to support graph mode (#636)
### What this PR does / why we need it?

As the custom deepseek modeling made some changes to support graph mode in
https://github.com/vllm-project/vllm-ascend/pull/585, I follow it to
change the custom deepseek_mtp modeling.

Some modifications for k>1 were also not carried over by
https://github.com/vllm-project/vllm-ascend/pull/429, so I add them now.

In order to better take care of the MTP feature in the vllm-ascend
repository, I added cases related to graph mode (torchair), but I skip them
since torchair cannot correctly clean up memory in vllmrunner.

I also add some cases for MTP quantization weights, but the test weight is
not ready, so I skip them and will enable them when the test quant weights
are ready.

https://github.com/vllm-project/vllm-ascend/pull/648 did not completely
fix the sample change
(https://github.com/vllm-project/vllm-ascend/issues/660) issue, so I
added the relevant changes.

### Does this PR introduce _any_ user-facing change?
Now you can use the following method to run MTP with deepseek v3/r1 float or
quant weights in eager mode:
```python
llm = LLM(
    model="wemaster/deepseek_mtp_main_random_bf16",
    tensor_parallel_size=2,
    speculative_config={
        "num_speculative_tokens": 1,
    },
    enforce_eager=True,
    trust_remote_code=True,
    disable_log_stats=False,
    gpu_memory_utilization=0.8,
    max_model_len=64,
)
```

Or run MTP with deepseek v3/r1 float or quant weights in graph
mode (torchair):
```python
llm = LLM(
    model="wemaster/deepseek_mtp_main_random_bf16",
    tensor_parallel_size=2,
    speculative_config={
        "num_speculative_tokens": 1,
    },
    trust_remote_code=True,
    additional_config={
        'enable_graph_mode': True,
    },
    disable_log_stats=False,
    gpu_memory_utilization=0.8,
    max_model_len=64,
)
```

Notes:
1. We now support k>1, so you can set num_speculative_tokens > 1 if there
is sufficient redundant computing power.
2. MTP is not supported in V1; we will support it when vLLM does in
https://github.com/vllm-project/vllm/issues/13500.
3. If running MTP fails with a `segmentation fault`, you can follow the v0.7.3
patch https://github.com/vllm-project/vllm-ascend/pull/236, file
`vllm_ascend/patch/patch_metrics.py`, method
`__npu_async_metrics_collector_init__`.

### How was this patch tested?
Local tests passed and it was tested by CI.

Signed-off-by: mengwei805 <mengwei25@huawei.com>
2025-04-28 21:18:53 +08:00
be9e3e8545 [Bugfix] Fix triton placeholder patch period (#704)
Fix triton placeholder patch period

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-04-28 18:52:03 +08:00
58f9d932d3 [Doc] Update faqs (#699)
### What this PR does / why we need it?
Update faqs to make it more clear


Signed-off-by: wangli <wangli858794774@gmail.com>
2025-04-28 18:48:23 +08:00
d0a0c81ced [Doc] Add deepseek-v2-lite w8a8 quantization tutorial (#630)
### What this PR does / why we need it?
Add deepseek-v2-lite w8a8 quantization tutorial

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-04-28 17:14:26 +08:00
5de3646522 [MISC] Make vllm version configurable (#651)
Sometimes, users install a dev/editable version of vllm. In this case, we
should make sure vllm-ascend works as well.

This PR adds a new env `VLLM_VERSION`. It's used by developers who edit
vllm. In this case, developers should set this env to make sure the right
vllm version is installed and used.
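
A minimal sketch of how a developer might pin this before vLLM is imported (the version value is an assumption and must match the vLLM actually installed; exporting `VLLM_VERSION` in the shell works the same way):

```python
import os

# Hypothetical value: point vllm-ascend at the vLLM version you actually installed from source.
os.environ.setdefault("VLLM_VERSION", "0.8.5")

import vllm  # imported after the env var is set so the plugin sees it  # noqa: E402
```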

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-28 14:19:06 +08:00
8849cf1eda Bump actions/setup-python from 5.5.0 to 5.6.0 (#697)
Bumps [actions/setup-python](https://github.com/actions/setup-python)
from 5.5.0 to 5.6.0.

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-04-28 14:06:38 +08:00
ee7a0e2cd4 Update openEuler dockerfile for COMPILE_CUSTOM_KERNELS=1 (#689)
### What this PR does / why we need it?
Update openEuler dockerfile for COMPILE_CUSTOM_KERNELS=1

### Does this PR introduce _any_ user-facing change?
No

Signed-off-by: Icey <1790571317@qq.com>
2025-04-28 11:45:46 +08:00
38f34e359f [Fix] fix deepseek v0 attention eager mode (#671)
### What this PR does / why we need it?
`reshape_and_cache_siso` seems have some funcitonality issues, use torch
op combination replace this custom op by default.


---------

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-04-28 08:53:06 +08:00
413657ae43 [FOLLOWUP][DOC] Fix pip install cmd in installation.md (#680)
### What this PR does / why we need it?
Fix pip install cmd in installation.md

Followup on: https://github.com/vllm-project/vllm-ascend/pull/661

### Does this PR introduce _any_ user-facing change?
No, doc only

### How was this patch tested?
Preview

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-04-27 18:37:25 +08:00
2e20797934 [BUILD] Upgrade torch-npu to 2.5.1 (#661)
### What this PR does / why we need it?
The torch-npu 2.5.1 are published:
https://pypi.org/project/torch-npu/2.5.1/
It's time to remove all torch-npu dev version from vllm-ascend code base

### Does this PR introduce _any_ user-facing change?
Yes, using torch-npu 2.5.1

### How was this patch tested?
- [ ] CI passed
- [ ] Manually test
- [ ] Grep all `dev2025`

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-04-27 17:28:29 +08:00
fa4a5d980e [Bugfix] Remove redundant tensor creation and unused code (#656)
### What this PR does / why we need it?
Eliminated duplicate `block_table` tensor initialization and cleaned up
unused code segments. This resolves an issue where the second creation
was overwriting the first, potentially leading to unexpected behavior.

Signed-off-by: Jade Zheng <zheng.shoujian@outlook.com>
2025-04-27 14:09:16 +08:00
ba3d8aae94 [Model][MiniCPM] support MiniCPM (#645)
### What this PR does / why we need it?
This pr support minicpm in branch main. see
https://github.com/vllm-project/vllm-ascend/pull/164


### How was this patch tested?
test locally with minicpm

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-04-27 11:27:24 +08:00
742f679c7d Remove prompt string from engine core data structures (#663)
### What this PR does / why we need it?
vLLM Ascend side followup on:
[Core] Remove prompt string from engine core data structures

df6f3ce883

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-04-26 23:15:58 +08:00
c99c4c8c70 [Doc] Update feature support list (#650)
1. Remove the Chinese doc. The content is out of date and we don't have
enough time to maintain it.
2. Update the feature support matrix. Refresh the content and add V1 status.

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-04-26 10:27:29 +08:00
3879d9cad9 [CI] Fix sample backward compatibility problem (#648)
b411418ff0
This vllm commit changed the sample usage. This PR adapts the change for
main and makes sure it works for 0.8.4 as well.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-25 11:53:26 +08:00
d785e78563 [V1] Make V1 engine backward compatible (#637)
### What this PR does / why we need it?
Enforce eager mode in the V1 engine ahead of the upcoming CANN and
torch_npu releases.

### Does this PR introduce _any_ user-facing change?
After this change, users will no longer need to manually set
enforce_eager=True.

### How was this patch tested?
Test it with regular offline inference examples.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-04-24 17:20:11 +08:00
bd70ce828c [CI] Add qwen2.5-vl test (#643)
### What this PR does / why we need it?
Part of #499 
Add qwen2.5-vl test on single npu, v1 engine is excluded because
qwen2.5-vl has some problems with v1 now, at the same time, this test
can also make #639 more credible

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-04-24 17:12:12 +08:00
a9c6b52205 [Bugfix] Fix qwen2.5-vl position input bug (#639)
### What this PR does / why we need it?
Fix the qwen2.5-vl position input bug; fix #625 `TypeError: 'NoneType' object
is not iterable`

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-04-24 15:21:57 +08:00
866ce7168c [Benchmark] Download model from modelscope (#634)
### What this PR does / why we need it?
- Running the benchmark scripts will download the model from ModelScope

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-04-24 14:48:24 +08:00
05bdcbeae4 support aclgraph (#426)
### What this PR does / why we need it?
This PR supports the access of vllm-ascend to the piecewise_graph feature
provided by the v1 engine.

1. Register unifiled_ascend_attention_with_output for piecewise_graph to
split the graph.
2. Support NPUGraph to accelerate kernel launch.

### Does this PR introduce _any_ user-facing change?
npugraph is supported by default. Users can disable the npugraph feature by
configuring enforce_eager.
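
A minimal sketch of opting back out of graph capture (the model name is a placeholder; `enforce_eager` is the standard vLLM engine argument mentioned above):

```python
from vllm import LLM

# Placeholder model; enforce_eager=True falls back to eager execution and skips NPU graph capture.
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", enforce_eager=True)
```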

This has corresponding requirements for the versions of torch_npu and
CANN, and they need to support graph capture.

### How was this patch tested?
It is turned on by default.

---------

Signed-off-by: Bug Hunter Yan <yanpq@zju.edu.cn>
Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
Co-authored-by: Yizhou Liu <liu_yizhou@outlook.com>
2025-04-23 20:56:24 +08:00
5c6d05a59e support deepseek quant & mix-parallel with graphmode (#585)
### What this PR does / why we need it?
1. support deepseek with w8a8 quant;
2. support deepseek with mix-parallel(multi-DP, EP+TP);
3. support deepseek with graphmode.
---------

Signed-off-by: wen-jie666 <wenjie39@huawei.com>
Signed-off-by: Yizhou Liu <liuyizhou5@h-partners.com>
Signed-off-by: libaokui <libaokui@huawei.com>
Signed-off-by: linfeng-yuan <1102311262@qq.com>
Co-authored-by: wen-jie666 <wenjie39@huawei.com>
2025-04-23 16:23:25 +08:00
e74331a1ed Add dp initialize patch with hccl backend (#626)
### What this PR does / why we need it?
Add dp stateless process group initialization path with hccl backend as
vllm-ascend patch.
### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

---------

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-04-23 15:47:51 +08:00
848e041a54 Using EvalScope evaluation (#611)
### What this PR does / why we need it?
Using EvalScope to run an evaluation (including eval and test):
-
https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html#basic-usage
-
https://evalscope.readthedocs.io/en/latest/get_started/basic_usage.html#model-api-service-evaluation

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Test locally

---------

Signed-off-by: RongRongStudio <82669040+RongRongStudio@users.noreply.github.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-04-23 00:50:09 +08:00
4a0ce3660e [Misc] Remove some parts of metrics patch (#603)
### What this PR does / why we need it?
Remove some parts of metrics patch, since the `cuda` hard code has been
fixed by https://github.com/vllm-project/vllm/pull/14411.

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-22 18:45:21 +08:00
cf6ab42ee2 [CI]Add guided decoding test (#422)
### What this PR does / why we need it?
After extensive testing, we are happy to say that guided_decoding is
fully supported on NPU. In this PR, we integrate guided_decoding into our
tests, which mainly does the following (see the sketch below):
1. test v0 supported backends, including `"outlines",
"lm-format-enforcer", "xgrammar"`
2. test v1 supported backends, including `"guidance", "xgrammar"`
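
A minimal sketch of what a guided-decoding request might look like with vLLM's offline API (the model and prompt are placeholders, and the `GuidedDecodingParams` import path is an assumption for the vLLM version in use):

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams  # assumed import path

# Constrain the output to one of two choices using the xgrammar backend (one of the backends tested here).
guided = GuidedDecodingParams(choice=["positive", "negative"], backend="xgrammar")

llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")  # placeholder model
outputs = llm.generate(
    ["The sentiment of 'I love this movie' is"],
    SamplingParams(temperature=0, max_tokens=5, guided_decoding=guided),
)
print(outputs[0].outputs[0].text)
```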

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-04-22 17:50:06 +08:00
538a69c145 [Patch] format patch module to make it more clear (#601)
Format the patch module to make it clearer.
Add the patch doc description; new patches must follow this guide.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-22 14:13:00 +08:00
ad845bfe82 fix doc to mention env setting for v0.7.3-dev (#602)
### What this PR does / why we need it?
fix doc to mention env setting for v0.7.3-dev

Signed-off-by: Shuqiao Li <celestialli@outlook.com>
2025-04-22 14:11:41 +08:00
d12a057df8 Add note for deepseek related docs and remove unnecessary comments (#590)
### What this PR does / why we need it?
Add notes for deepseek's patch and remove some of the unnecessary
comments

---------

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-04-22 09:59:09 +08:00
c5850d302d [Doc] Update installation (#596)
Many users face a failed installation when using `pip install -e .`.
This is mainly introduced by the released `torch-npu` version conflicting
with `torch>=2.5.1`. This conflict mainly exists in the temp env of the
pyproject build.
This PR updates the installation tutorial to use `python setup.py develop`
as a quick fix.

cc @wangxiyuan

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-04-22 09:04:20 +08:00
a8d633f629 [Bugfix] fix import error (#600)
### What this PR does / why we need it?
Fix the import error that
https://github.com/vllm-project/vllm-ascend/issues/592 mentioned.

Signed-off-by: paulyu <paulyu0307@gmail.com>
Co-authored-by: paulyu <paulyu0307@gmail.com>
2025-04-22 08:57:25 +08:00
0ae9ee0f8a [BUGFIX] main-sd-bugfix && [UT] add mtp UT (#593)
### What this PR does / why we need it?
This PR fixes some bugs about spec decode / MTP.
It also adds an MTP e2e UT `test_mtp_correctness.py`.

**vllm_ascend/attention/attention.py**
1. add support for `self.attn_mask_cache` having only 1 element, to cover the
scene in which both spec decode and chunked prefill are enabled.

**vllm_ascend/distributed/parallel_state.py**
1. remove 2 asserts because the spec decode worker would call init_worker
twice

**vllm_ascend/models/deepseek_mtp.py**
1. remove unused params;
2. add support w8a8 in `CustomDeepSeekMTP`

**vllm_ascend/quantization/quant_config.py**
1. use `AscendUnquantizedFusedMoEMethod` instead of
`UnquantizedFusedMoEMethod`

**other**
1. replace `from vllm.logger import init_logger` with `from vllm.logger
import logger` across the whole vllm-ascend project



### Does this PR introduce _any_ user-facing change?


### How was this patch tested?

Signed-off-by: mengwei805 <mengwei25@huawei.com>
2025-04-21 19:25:51 +08:00
5442b463fd add doc for patch_config (#574)
### What this PR does / why we need it?
add doc for patch_config
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
No code changed.

Signed-off-by: Shuqiao Li <celestialli@outlook.com>
2025-04-21 10:33:38 +08:00
96d6fa7c90 [Docker] Fix openEuler image suffix (#586)
### What this PR does / why we need it?
There was a bug when we released v0.8.4rc1 (the openEuler image tag was wrongly
set to 0.8.4rc1); according to the doc of docker-meta-action, it should
append a suffix:
```
tags: |
  type=pep440,enable=true,priority=900,prefix=,suffix=,pattern=,value=
```

This patch just fix openEuler image suffix to make pep440 tag rule work.

This patch also removes the cache step, because the cache step adds more
than 10 minutes of export time but saves little time on the next trigger.

### Does this PR introduce _any_ user-facing change?
Yes, the docker image tag is now set correctly.

### How was this patch tested?
I tested in my fork repo by setting the default branch:
- released a tag: v0.7.88rc1 (pep440 tag)
- The log shows `--label
org.opencontainers.image.version=v0.7.88rc1-openeuler`, which follows the right rule


https://github.com/Yikun/vllm-ascend/actions/runs/14560411481/job/40842950165#step:9:205

Related: https://github.com/vllm-project/vllm-ascend/pull/489

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-04-21 08:55:26 +08:00
12cae04db9 [quantization] Support w8a8 quantization (#580)
### What this PR does / why we need it?

Add a `VLLMAscendQuantizer` to support w8a8 static (W8A8) and dynamic
quantization on linear and moe (W8A8_DYNAMIC). The quantizer will be enabled
if a model has a [quantize
field](https://huggingface.co/vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8/blob/main/config.json#L27).
If MindIE Turbo is installed, the MindIE Turbo Quantizer will apply;
otherwise VLLMAscendQuantizer is used directly.

- This patch fixes the installation docs to make installation work
- This patch enables norm quantization by patching `RMSNorm.__init__`,
`RMSNorm.forward_oot`, `NPUModelRunnerBase.load_model`
- Add `AscendW8A8LinearMethod` for W8A8
- Add `AscendW8A8DynamicLinearMethod` and
`AscendW8A8DynamicFusedMoEMethod` for W8A8_DYNAMIC
- Add an e2e test for `vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8`

### Does this PR introduce _any_ user-facing change?
Yes, w8a8 quantization is supported. After this patch, users can
use the commands below to run w8a8 models:

```
vllm serve /root/.cache/modelscope/hub/Qwen/Qwen2.5-7B-Instruct-w8a8 --served-model-name "qwen2.5-7B"
```

### How was this patch tested?
0. CI passed: add e2e test for `vllm-ascend/Qwen2.5-0.5B-Instruct-w8a8`
1. From @Yikun:
I tested Qwen2.5-0.5B-Instruct-w8a8 functionally and all is well; please
refer to
https://github.com/vllm-project/vllm-ascend/pull/580#issuecomment-2816747613

2. From @dingdingchaomian:
Tested with the qwen2.5-72b-instruct and deepseek-v2-lite-chat models; both
models were quantized using Ascend's msmodelslim tool:
- Qwen2.5-72b-instruct was tested twice, once for w8a8 static and once
for w8a8 dynamic.
- Deepseek-v2-lite-chat was tested once because its quantization uses
both static and dynamic w8a8.

Models were tested using both offline inference and online serving, and
both work well. The inference code is exactly the same as the
examples in
https://vllm-ascend.readthedocs.io/en/latest/quick_start.html, with the
model path and tensor parallel number changed.

---------

Signed-off-by: dingdingchaomian <wangce21@huawei.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: dingdingchaomian <wangce21@huawei.com>
Co-authored-by: Angazenn <zengyanjia@huawei.com>
Co-authored-by: liujiaxu <liujiaxu4@huawei.com>
Co-authored-by: ApsarasX <apsarax@outlook.com>
Co-authored-by: ganyi1996ppo <pleaplusone.gy@gmail.com>
2025-04-20 18:14:05 +08:00
1a1f9a6d89 port deepseekv2 and mtp to main branch (#429)
### What this PR does / why we need it?
This PR ports all the deepseek graph mode code and mtp code from v0.7.3
to the main branch
---------

Signed-off-by: SidaoY <1024863041@qq.com>
Signed-off-by: linfeng-yuan <1102311262@qq.com>
Signed-off-by: Yizhou Liu <liuyizhou5@h-partners.com>
Signed-off-by: mengwei805 <mengwei25@huawei.com>
Signed-off-by: libaokui <libaokui@huawei.com>
Signed-off-by: q00832892 <qiaoyang19@huawei.com>
Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
Co-authored-by: SidaoY <1024863041@qq.com>
Co-authored-by: linfeng-yuan <1102311262@qq.com>
Co-authored-by: Yizhou Liu <liuyizhou5@h-partners.com>
Co-authored-by: mengwei805 <mengwei25@huawei.com>
Co-authored-by: libaokui <libaokui@huawei.com>
2025-04-19 17:38:18 +08:00
086423dc35 [Docker] Bump Dockerfile version to v0.8.4 (#577)
### What this PR does / why we need it?
Bump Dockerfile version to v0.8.4

### Does this PR introduce _any_ user-facing change?
docker image are using v0.8.4 version vLLM

### How was this patch tested?
CI passed

Closes: https://github.com/vllm-project/vllm-ascend/pull/571

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-04-18 19:15:17 +08:00
a127cc83f8 catch ImportError when C code not compiled (#575)
### What this PR does / why we need it?
Found a problem where an ImportError is raised rather than a ModuleNotFoundError.


### Does this PR introduce _any_ user-facing change?
No


### How was this patch tested?
CI passed

Signed-off-by: Shuqiao Li <celestialli@outlook.com>
2025-04-18 18:11:49 +08:00
985b0548b0 [Doc] Update v0.8.4 release note, add contents for structured output feature (#576)
### What this PR does / why we need it?
Update v0.8.4 release note:

- Add contents for structured output feature.
- Remove redundant `(` in spec decoding.

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?
Preview

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-18 17:44:16 +08:00
65c1f4579f [V1][Structured Output] Add apply_grammar_bitmask() method to model runner (#555)
### What this PR does / why we need it?
Add `apply_grammar_bitmask()` method to model runner.

This method is necessary for `xgrammar` structured output.

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-18 16:47:55 +08:00
2c903bc7ac [Doc] Update doc for custom ops build (#570)
- update doc about custom ops compile

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-04-18 15:35:10 +08:00
b91f9a5afd [Doc][Build] Update build doc and faq (#568)
Update build doc and faq about deepseek w8a8

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-04-18 14:16:41 +08:00
e66ded5679 [Doc] Add release note for 0.8.4rc1 (#557)
Add release note for 0.8.4rc1, we'll release 0.8.4rc1 now.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-18 13:24:36 +08:00
7eeff60715 [Doc] Update FAQ doc (#561)
### What this PR does / why we need it?
Update FAQ doc to make `docker pull` more clear


Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-18 13:13:13 +08:00
84563fc65d Add sleep mode feature for Ascend NPU (#513)
### What this PR does / why we need it?
This PR adds the sleep mode feature for vllm-ascend. When it sleeps, we do
mainly two things:

- offload model weights
- discard kv cache

RLHF tools (such as https://github.com/volcengine/verl and
https://github.com/OpenRLHF/OpenRLHF) have a strong need for sleep mode
to accelerate the training process.

This PR may solve #375 and #320 .

### Does this PR introduce _any_ user-facing change?
No existing user interfaces are changed.
Users will have two new methods (`sleep()` and `wake_up()`) to use.

### How was this patch tested?
This PR is tested with Qwen/Qwen2.5-0.5B-Instruct.

At first, we have free NPU memory M1.

After `llm = LLM("Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True)`
executed, we have free NPU memory M2. M2 < M1.

Then we call `llm.sleep(level=1)`, we have free NPU memory M3.

We have M3 > M2, M3 is very close to M1.

Plus, we have the same output tokens before sleep and after wake up,
with the config of `SamplingParams(temperature=0, max_tokens=10)` and
with the same input tokens of course.
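
A minimal sketch of the intended usage based on the description above (the prompt is a placeholder; the model name, `enable_sleep_mode`, `sleep(level=1)`, `wake_up()` and the sampling config are taken from this test, while the memory checks are omitted):

```python
from vllm import LLM, SamplingParams

llm = LLM("Qwen/Qwen2.5-0.5B-Instruct", enable_sleep_mode=True)
params = SamplingParams(temperature=0, max_tokens=10)

before = llm.generate(["The capital of France is"], params)

llm.sleep(level=1)  # offload model weights and discard KV cache to free NPU memory
llm.wake_up()       # restore the weights before serving again

after = llm.generate(["The capital of France is"], params)
# With temperature=0, the outputs before sleep and after wake_up should match.
```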


This PR is utilizing the CMake procedure of #371 , thanks a lot.

Signed-off-by: Shuqiao Li <celestialli@outlook.com>
2025-04-18 13:11:39 +08:00
42c7fbb10e [Misc] Fix import error and address nits to make CI happy (#563)
1. Add a `vllm_version_is` function to check the vllm version.
2. `ensure_kv_transfer_initialized` and `get_kv_transfer_group` have
been moved elsewhere in the vllm main branch via
3408e47159
, this patch fixes the import error.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-18 12:23:32 +08:00
66a0837963 adopt rope in vllm-ascend (#530)
### What this PR does / why we need it?
Adopt custom kernel rotary embedding in actual model inference,
customized rotary_embedding will generate contiguous query and key in
the cpp side to reduce the overhead of two contiguous and index_select
compared with rotary_embedding in torch_npu. For now, rotary_embedding
can only support the scenario of `is_neox = true`, non-neox version rope
will be updated soon in the future.
---------

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-04-18 08:56:05 +08:00
whx
23f85e3f74 [BugFix] Fix scheduler problems in last PR. (#558)
This PR fixes scheduler problems in the last PR:
1. change the position of the DT test to validate it.
2. fix the format of the copyright.

Signed-off-by: whx-sjtu <2952154980@qq.com>
2025-04-18 08:49:48 +08:00
6ee7f5cf71 [SpecDecode] Add spec decode support (#500)
### What this PR does / why we need it?
Backport: https://github.com/vllm-project/vllm-ascend/pull/252
This supports speculative decoding on Ascend, including speculating with
a draft model, by matching n-grams in the prompt, using MLP speculators,
and using EAGLE based draft models.

Backport: https://github.com/vllm-project/vllm-ascend/pull/423
Spec decode MultiStepWorker fully supports TP1DraftModelRunner, supports
running the draft_model_runner with multi-step prepare on the NPU directly,
and supports draft_model_runner using MLA.

1. Before this PR, `MultiStepWorker` would not step into the branch
using NPU prepare, but only into the branch using CPU prepare (`line 52`
of `vllm_ascend/patch/patch_multi_step_worker.py`). Although this has
`no effect` on the `correct operation` of speculative decoding and the
performance of the two branches is basically the same as in the current
version, I support entering this branch in this PR. In general, there
are two main changes in `patch_multi_step_worker.py`: first, the
`is_cuda_like()` check is removed and the `TP1DraftModelRunner`
rewritten in vllm_ascend is used; second, the
`supports_gpu_multi_step()` function is made to return true on NPU
devices so that the outer MultiStepWorker works correctly.

2. Before this PR, `TP1DraftModelRunner` only supported Attention on NPU,
but not MLA. The relevant adaptation is in
`vllm_ascend/worker/draft_model_runner.py`. Although I don't know why
the `input_positions` of `model_input.attn_metadata` in vllm-ascend
needs to be added in `execute_model`, it is done in `model_runner.py`,
so I also made corresponding changes. Otherwise, when atten_backend is
MLA, it complains that input_positions cannot be found.

3. I commented out two lines in `draft_model_runner.py` at `line118` to
support the scenario of K>1.
  ```
  # lora_mapping=model_input.lora_mapping,
  # lora_requests=model_input.lora_requests,
  ```
I added comments. In the future, when vllm-ascend supports the LoRA feature,
the changes here can be restored.

TODO:
- [ ] revert the patch when the related issues are addressed in vllm

### How was this patch tested?
CI passed with new added test.
- e2e test for medusa proposer:
tests/singlecard/spec_decode/e2e/test_medusa_correctness.py
- e2e test for mlp proposer:
tests/singlecard/spec_decode/e2e/test_mlp_correctness.py
- e2e test for n-gram proposer:
tests/singlecard/spec_decode/e2e/test_ngram_correctness.py

Tests for patched files:
- tests/singlecard/spec_decode/test_dynamic_spec_decode.py
- tests/singlecard/spec_decode/test_multi_step_worker.py
- tests/singlecard/spec_decode/test_ngram_worker.py
- tests/singlecard/spec_decode/test_spec_decode_worker.py

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: mengwei805 <mengwei25@huawei.com>
2025-04-17 20:16:32 +08:00
b71f193cb0 [Model][Doc] Update model support list (#552)
Update model support list
cc @Yikun plz help review, thanks!

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-04-17 19:32:20 +08:00
whx
20dff4deff [Scheduler] Add AscendScheduler. (#543)
This PR adds AscendScheduler to vllm v1 engine.
This scheduler currently supports v0-style prefill-first scheduling
strategy.
In the future more schedule methods will be supported by this scheduler.

---------

Signed-off-by: hw_whx <wanghexiang7@huawei.com>
Co-authored-by: hw_whx <wanghexiang7@huawei.com>
2025-04-17 19:31:50 +08:00
697908f5cd [Platform][Worker][ModelRunner] Add LoRA & Multi-LoRA support (#521)
### What this PR does / why we need it?
According to this RFC [[RFC]: Join the MultiLora and MultiLora Dynammic
Serving feature develop
#396](https://github.com/vllm-project/vllm-ascend/issues/396) and this
[vLLM Ascend Roadmap Q2 2025
#448](https://github.com/vllm-project/vllm-ascend/issues/448), we pull
request the relevant code to support (1) Multi-LoRA and (2) Multi-LoRA
Dynamic Serving.

LoRA reference is here: [LoRA
reference](https://docs.vllm.ai/en/latest/features/lora.html)

### Does this PR introduce _any_ user-facing change?

The following OpenAI-compatible HTTP APIs will be supported (see the sketch below):
/v1/load_lora_adapter
/v1/unload_lora_adapter
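
A minimal sketch of calling these dynamic-serving endpoints (the server URL, adapter name and path are placeholders; the JSON field names are an assumption based on vLLM's dynamic LoRA serving docs):

```python
import requests

base = "http://localhost:8000"  # placeholder server address

# Load a LoRA adapter at runtime.
requests.post(f"{base}/v1/load_lora_adapter",
              json={"lora_name": "my_adapter", "lora_path": "/path/to/my_adapter"})

# ... send completion requests with model="my_adapter" ...

# Unload it when it is no longer needed.
requests.post(f"{base}/v1/unload_lora_adapter", json={"lora_name": "my_adapter"})
```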

### How was this patch tested?
git clone https://github.com/vllm-project/vllm.git
cd vllm/examples/offline_inference/ && python3 multilora_inference.py

---------

Signed-off-by: paulyu <paulyu0307@gmail.com>
Co-authored-by: paulyu <paulyu0307@gmail.com>
2025-04-17 16:48:46 +08:00
9935d45728 [CI]Add model basic accuracy test(Qwen2.5-0.5B-Instruct) (#460)
### What this PR does / why we need it?
Add model basic accuracy test(Qwen2.5-0.5B-Instruct)

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-04-17 14:59:56 +08:00
c3d1a3782a Add pyhccl (#503)
This is the first step to support trl vllm serve on Ascend NPU
https://github.com/vllm-project/vllm-ascend/issues/459.
This PR can work properly only when
https://github.com/vllm-project/vllm/pull/16464 is merged into vLLM.

---------

Signed-off-by: hzji210@gmail.com <hzji210@gmail.com>
2025-04-17 14:57:52 +08:00
64fdf4cbef [Doc]Update faq (#536)
### What this PR does / why we need it?
update performance and accuracy faq

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-04-17 14:56:51 +08:00
6061f33670 [Bugfix][Model] Fix api in DeepSeek model (#545)
### What this PR does / why we need it?
Fix api in DeepSeekV2, aligning with the latest code of the main branch
in vllm.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
Test locally with deepseek-v2-lite, and will add CI by @Potabk.
Plz update the model UT after this pr is merged, thx! cc @Potabk

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-04-17 11:56:05 +08:00
9859e7313f [CI]Add global env to runner (#537)
### What this PR does / why we need it?
- add `HF_TOKEN` as global var to the runner
- add `HF_ENDPOINT` as global var to the runner
- change concurrency group, rely on current pr num

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-04-17 10:08:00 +08:00
00de2ee6ad [Doc] update faq about progress bar display issue (#538)
### What this PR does / why we need it?
update faq about progress bar display issue

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
2025-04-16 16:07:08 +08:00
fe13cd9ea5 [Doc] update faq about w8a8 (#534)
update faq about w8a8

---------

Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-04-16 09:37:21 +08:00
415ed027fa [V1][Platform] Remove supports_structured_output() in platform (#531)
### What this PR does / why we need it?
Remove `supports_structured_output()` in platform. This method is no longer needed, because upstream has deleted it.

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-16 09:30:33 +08:00
bbe7ccd366 [MISC] Add patch module (#526)
This PR added a patch module for vllm:
1. platform patch: the patch will be registered when the platform is loaded
2. worker patch: the patch will be registered when the worker is started.

The details are:
1. patch_common: patches for both main and the 0.8.4 version
2. patch_main: patches for the main version
3. patch_0_8_4: patches for the 0.8.4 version
2025-04-16 09:28:58 +08:00
434749d299 [CI] update 0.8.3 to 0.8.4 (#528)
Update 0.8.3 CI to 0.8.4

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-16 09:26:30 +08:00
13480d1238 [CI]Fix workflow (#532)
### What this PR does / why we need it?
Make the linux-npu-4 runner run in parallel for now.


Signed-off-by: wangli <wangli858794774@gmail.com>
2025-04-15 19:55:41 +08:00
bcbc04f92b [Doc] Add environment variables doc (#519)
### What this PR does / why we need it?
Add environment variables doc.
---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-15 16:09:36 +08:00
44a8301424 [Feature] Add PD separation feature (#432)
### What this PR does / why we need it?
Adapt Disaggregated Prefill feature onto Ascend device

### Does this PR introduce _any_ user-facing change?

no

### How was this patch tested?

The test usage has been provided along with the PR, in
examples/offline_disaggregated_prefill_npu.py.
To run it, do this:
```
export PROMPT_DEVICE_ID=0,1
export DECODE_DEVICE_ID=2,3
python examples/offline_disaggregated_prefill_npu.py
```

---------

Signed-off-by: ZihuiQian <qianzihui@huawei.com>
Co-authored-by: ZihuiQian <qianzihui@huawei.com>
2025-04-15 15:11:35 +08:00
c7f6584d75 [V1] clean up V1 code (#505)
Clean up V1 code:
1. remove useless code.
2. format code to be clear.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-15 10:24:02 +08:00
f6af1d2471 [MISC] fix logger (#515)
The logger in vllm-ascend doesn't work. This PR fixes the issue.

Fix: https://github.com/vllm-project/vllm-ascend/issues/431

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-15 10:18:05 +08:00
5c6d79687c [Doc] Update FAQ (#518)
Update FAQ

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-15 10:17:56 +08:00
5fa70b6393 [Build] Update doc (#509)
1. install torch-npu before vllm-ascend to ensure custom ops build
success.
2. set `COMPILE_CUSTOM_KERNELS=0` if users want to disable custom ops
build.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-14 14:38:50 +08:00
11ecbfdb31 [Doc] Update FAQ doc (#504)
### What this PR does / why we need it?
Update FAQ doc.
---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-14 11:11:40 +08:00
9c7428b3d5 [CI] enable custom ops build (#466)
### What this PR does / why we need it?
This PR enables the custom ops build by default.

### Does this PR introduce _any_ user-facing change?

Yes, users installing vllm-ascend from source will now trigger the custom ops
build step.

### How was this patch tested?
By image build and e2e CI

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-04-12 10:24:53 +08:00
d05ea17427 Add openEuler based container image for vLLM Ascend (#489)
### What this PR does / why we need it?

Provide users with openEuler-based vllm images, and modify the quick
start readme accordingly.

### Does this PR introduce _any_ user-facing change?

None

### How was this patch tested?

There is no need for performing any test.

---------

Signed-off-by: Icey <1790571317@qq.com>
2025-04-10 14:30:49 +08:00
afdbf77483 [CI] Add new runner and enable QwQ multinpu test (#417)
### What this PR does / why we need it?

- Add a new runner to the continuous integration system and keep the
original CI runner until the new runner runs stably
- Add distributed test cases

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-04-08 16:52:45 +08:00
5d6239306b [DOC] Update multi_node.md (#468)
### What this PR does / why we need it?
- Added instructions for verifying multi-node communication environment.
- Included explanations of Ray-related environment variables for
configuration.
- Provided detailed steps for launching services in a multi-node
environment.
### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
manually tested.

Signed-off-by: jinyuxin <jinyuxin2@huawei.com>
2025-04-08 14:19:57 +08:00
f6cf92e7d5 [quant][bugfix] fix deepseek quant bug (#478)
see #465

Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: zzzzwwjj <1183291235@qq.com>
2025-04-08 09:15:56 +08:00
579d858a20 Set torchvision<0.21.0 to match torch/torch_npu version (#479)
### What this PR does / why we need it?
Set torchvision<0.21.0 to match torch/torch_npu version to resolve
`RuntimeError: operator torchvision::nms does not exist`.

Closes: https://github.com/vllm-project/vllm-ascend/issues/477

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-04-08 09:15:42 +08:00
1d88dacf9f [V1][Platform] Add supports_structured_output() method to Platform (#475)
### What this PR does / why we need it?
Add `supports_structured_output()` method to Platform, find more details
at https://github.com/vllm-project/vllm/pull/16148.

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-04-07 19:11:51 +08:00
adabdeea7f Set numpy < 2.0.0 to resolve numpy VersionConflict (#476)
### What this PR does / why we need it?
vLLM bumps numpy version to 2.x:
8427f70493
, this will cause a
`pip._vendor.pkg_resources.ContextualVersionConflict: (numpy 2.2.4
(/usr/local/python3.10/lib/python3.10/site-packages),
Requirement.parse('numpy==1.26.4'), {'vllm-ascend'})` failure when vllm
ascend install. This PR resolved the issue by:
- Set numpy < 2.0.0 to resolve numpy VersionConflict
- Sync requirements and toml 
- Reorder


### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Closes: https://github.com/vllm-project/vllm-ascend/issues/473

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-04-07 16:07:21 +08:00
344228a5da [deepseek][bugfix] support deepseek quant (#469)
- support deepseek quant
  - add w8a8_dynamic quant
see #391

Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: zzzzwwjj <1183291235@qq.com>
2025-04-07 10:56:12 +08:00
3f9752f8ee [Bugfix]Lazy import vllm config (#462)
### What this PR does / why we need it?
Lazy import vllm config  to avoid circular imports

---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-04-03 16:03:08 +08:00
ce8259975e [core] Support custom ascendc kernels in vllm-ascend (#233)
This PR adds custom AscendC kernel rotary_embedding support in
vllm-ascend; the related CMakeLists and setuptools changes are also added in this PR.

Related: https://github.com/vllm-project/vllm-ascend/issues/156

---------

Signed-off-by: ganyi <pleaplusone.gy@gmail.com>
2025-04-03 14:52:34 +08:00
14d9a64047 [ModelRunner][V1] Optimize V1 attention mask (#442)
### What this PR does / why we need it?
Pre-construct a mask matrix to improve the efficiency of attention mask
construction during inference.

Note that the length of the matrix needs to be carefully balanced: a
matrix that is too large will consume excessive VRAM, while a matrix
that is too small will require dynamic concatenation during inference,
leading to performance degradation.

Therefore, an environment variable is added here to dynamically set the
size of the pre-constructed mask matrix based on requirements.

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
Co-authored-by: didongli182 <didongli@huawei.com>
2025-04-02 10:33:53 +08:00
94bf9c379e [Doc]Add developer guide for using lm-eval (#456)
### What this PR does / why we need it?
Add developer guide for using lm-eval

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
test manually

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-04-01 23:43:51 +08:00
78083d405e Bump actions/setup-python from 5.4.0 to 5.5.0 (#440)
Bumps [actions/setup-python](https://github.com/actions/setup-python)
from 5.4.0 to 5.5.0.

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-04-01 14:34:33 +08:00
2dbd763584 [CI] Fix mypy CI (#443)
### What this PR does / why we need it?
Fix CI by updating mypy and pinning the numpy version

_the modification of model_runner_v1 is just to make CI happy_

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-04-01 09:25:33 +08:00
c42e21a5aa [Docs] Add install system dependencies in install doc (#438)
### What this PR does / why we need it?
Add install system dependencies in install doc

Resolve:
```
$ pip install vllm==v0.7.3
CMake Error at CMakeLists.txt:14 (project):
  No CMAKE_CXX_COMPILER could be found.
  Tell CMake where to find the compiler by setting either the environment
  variable "CXX" or the CMake cache entry CMAKE_CXX_COMPILER to the full path
  to the compiler, or to the compiler name if it is in the PATH.
// ... ...
  note: This error originates from a subprocess, and is likely not a problem with pip.
  ERROR: Failed building wheel for vllm
Failed to build vllm
ERROR: Failed to build installable wheels for some pyproject.toml based projects (vllm)
```

Closes: https://github.com/vllm-project/vllm-ascend/issues/439 


### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-03-31 14:17:55 +08:00
7beb4339dc [Doc]Add developer guide for using OpenCompass (#368)
### What this PR does / why we need it?
Add developer guide for using OpenCompass

### Does this PR introduce _any_ user-facing change?
no
### How was this patch tested?

test manually

---------

Signed-off-by: hfadzxy <starmoon_zhang@163.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-03-31 00:24:25 +08:00
b6499ed97d [CI] Use CI pool (#428)
Use CI pool instead of self-host for e2e test to speed up CI.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-03-29 12:42:59 +08:00
ca8b1c3e47 [Doc] Add 0.7.3rc2 release note (#419)
Add 0.7.3rc2 release note. We'll release 0.7.3rc2 right now.

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-03-29 09:02:08 +08:00
31f29b9f30 [Core] Make V1 work and enable V1 engine test (#389)
1. Make sure the version is a string before parsing in collect_env
2. Add basic V1 engine test

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-03-28 19:34:23 +08:00
57a84bb7be [Bug Fix] Fix bug of platform for parameter checking (#411)
Fix bug in platform.py to avoid the None value of config parameters.

Signed-off-by: wuhuikx <wuhui_csu@163.com>
2025-03-28 16:31:27 +08:00
b1557abab6 fix multistep bug,remove uselesscodes (#355)
1. remove useless code in attention.py
2. multistep now uses StatefulModelInputForNPU and no longer uses
StatefulModelInput

Signed-off-by: new-TonyWang <wangtonyyu222@gmail.com>
2025-03-28 09:55:35 +08:00
1864c40520 Add vLLM Ascend Weekly meeting link (#400)
### What this PR does / why we need it?
Add vLLM Ascend Weekly meeting link

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Preview

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-03-27 09:00:21 +08:00
4804b74e95 Update 110-user-story.yml (#402)
Fix a few typos in issue template

Signed-off-by: Zhenyu Zheng <zheng.zhenyu@outlook.com>
2025-03-27 08:58:57 +08:00
0b5a9643fd Add an example for user stories (#399)
Add an example for user stories and fix some typos

Add a new section, user stories, in the docs to collect user stories of
vllm-ascend; also add an example and the issue template to collect user
stories.

Signed-off-by: Zhenyu Zheng <zheng.zhenyu@outlook.com>
2025-03-26 16:25:57 +08:00
122505208f FastPatch: Optimized Patch Embedding for Qwen2VL (#345)
### What this PR does / why we need it?
We proposed the FastPatch method, which optimized patch embedding
(Conv3D) for Qwen2VL.


### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
We've tested it on the benchmark; it meets our satisfaction and is better
than the original patch_embed layer.


---------

Signed-off-by: baifanxxx <baifanxxx@gmail.com>
Signed-off-by: zouyida <zouyida@huawei.com>
Co-authored-by: zouyida <zouyida@huawei.com>
2025-03-26 14:28:20 +08:00
d4accf4ec2 [Doc][Model] update LLaVA 1.6 support (#373)
update LLaVA 1.6 support

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-03-26 09:07:55 +08:00
6295d2e9bc [CI/Build][Doc] upgrade torch-npu to 0320 (#392)
### What this PR does / why we need it?
This pr upgrades torch-npu to 0320, so that #321,
https://github.com/vllm-project/vllm-ascend/issues/267#issuecomment-2745045743
could be fixed, and #372 should be reverted after this pr

### Does this PR introduce _any_ user-facing change?
upgrade torch-npu to 0320

### How was this patch tested?
tested locally with long seq inferencing.

---------

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-03-26 09:04:12 +08:00
3fb3b5cf75 [Doc] Update model support doc (add QwQ-32B) (#388)
### What this PR does / why we need it?

Update model support doc (add QwQ-32B)


Signed-off-by: Shanshan Shen <87969357+shen-shanshan@users.noreply.github.com>
2025-03-25 11:40:50 +08:00
8996733307 [CI] fix vllm test (#365)
fix vllm test

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-03-24 16:09:06 +08:00
89ca63a2c2 [Bugfix] Disable torch.compile() (#370)
### What this PR does / why we need it?
To resolve this
[patch](https://github.com/vllm-project/vllm-ascend/pull/236/files#diff-43b96b39b5a52fe209d86449ad703a7ff5e1349ebaf1aa12ece8d82163ee5b61R24-R49),
we need to set the `torch.compile()` backend to `eager` to disable
compilation, using the default pytorch way.


---------

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-03-21 15:55:51 +08:00
9a175ca0fc [Doc]Add benchmark scripts (#74)
### What this PR does / why we need it?
The purpose of this PR is to add benchmark scripts for NPU, so developers
can easily run performance tests on their own machines with one line of
code.


---------

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-03-21 15:54:34 +08:00
befbee5883 Update README and add collect_env info (#369)
1. Doc: Fix error link
2. Doc: make Chinese version the same with english
3. remove useless file `test.py`
4. update `collect_env.py`
5. Fix v1 import error

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-03-21 15:43:43 +08:00
243ed4da69 Add vLLM forum info and update readme (#366)
### What this PR does / why we need it?
Add vLLM forum info and update readme

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-03-21 09:32:42 +08:00
c06af8b2e0 [V1][Core] Add support for V1 Engine (#295)
### What this PR does / why we need it?
Add support for V1 Engine.

Please note that this is just the initial version, and there may be some
places that need to be fixed or optimized in the future; feel free to leave
comments for us.

### Does this PR introduce _any_ user-facing change?

To use V1 Engine on NPU device, you need to set the env variable shown
below:

```bash
export VLLM_USE_V1=1
export VLLM_WORKER_MULTIPROC_METHOD=spawn
```

If you are using vllm for offline inferencing, you must add a `__main__`
guard like:

```python
if __name__ == '__main__':

    llm = vllm.LLM(...)
```

Find more details
[here](https://docs.vllm.ai/en/latest/getting_started/troubleshooting.html#python-multiprocessing).

### How was this patch tested?
I have tested the online serving with `Qwen2.5-7B-Instruct` using this
command:

```bash
vllm serve Qwen/Qwen2.5-7B-Instruct --max_model_len 26240
```

Query the model with input prompts:

```bash
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "Qwen/Qwen2.5-7B-Instruct",
        "prompt": "The future of AI is",
        "max_tokens": 7,
        "temperature": 0
    }'
```

---------

Signed-off-by: shen-shanshan <467638484@qq.com>
Co-authored-by: didongli182 <didongli@huawei.com>
2025-03-20 19:34:44 +08:00
663dca7578 [CI] fix race condition problem (#353)
fix race condition problem

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-03-19 17:04:36 +08:00
441a62e937 [Doc] Fix bugs of installation doc and format tool (#330)
### What this PR does / why we need it?
Fix bugs of installation doc and format tool.

### Does this PR introduce _any_ user-facing change?
no.

### How was this patch tested?
no.

Signed-off-by: shen-shanshan <467638484@qq.com>
2025-03-14 10:21:35 +08:00
ac1ba1d8d2 [Build] Fix x86 image build (#327)
Install cpu version of pytorch in x86 to reduce image size

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-03-14 09:41:57 +08:00
c25631ec7b [Doc] Add the release note for 0.7.3rc1 (#285)
Add the release note for 0.7.3rc1

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-03-13 17:57:06 +08:00
41aba1cfc1 [Doc]Fix tutorial doc expression (#319)
Fix tutorial doc expression

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-03-13 15:24:05 +08:00
59ea23d0d3 [Doc] Add Single NPU (Qwen2.5-VL-7B) tutorial (#311)
Run vllm-ascend on Single NPU

What this PR does / why we need it?
Add vllm-ascend tutorial doc for Qwen/Qwen2.5-VL-7B-Instruct model
Inference/Serving doc

Does this PR introduce any user-facing change?
no

How was this patch tested?
no

Signed-off-by: xiemingda <xiemingda1002@gmail.com>
2025-03-12 20:37:12 +08:00
7330416de3 [BugFix] Fix bugs when using ascend quantization (#275)
### What this PR does / why we need it?
It fixes the following bugs:
1. When searching for a specific linear quantization implementation from a
tool (such as MindIE-Turbo), the mapping of packed linear is required to
identify the corresponding quant type.
2. The exception is narrowed down to ImportError when importing
MindIETurboQuantizer to better throw other errors.
3. The api of AscendKVCacheMethod.apply is aligned with that in
AscendAttentionBackendImpl.

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
By performing offline inference:

![image](https://github.com/user-attachments/assets/d63804cf-c060-451f-9cb0-d012e06b5333)

---------

Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
2025-03-12 11:33:21 +08:00
5c7a95b01d [Attn] Support encoder-only attention with torch sdpa (#290)
### What this PR does / why we need it?
Support encoder-only attention with torch sdpa.
Fixes
https://github.com/vllm-project/vllm-ascend/pull/229#issuecomment-2695942741
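
A minimal sketch of what "encoder-only attention with torch sdpa" means here:
bidirectional (non-causal) attention computed with
`torch.nn.functional.scaled_dot_product_attention`. The shapes below are
illustrative only, not the actual vllm-ascend implementation.

```python
import torch
import torch.nn.functional as F

batch, heads, seq_len, head_dim = 1, 8, 128, 64
q = torch.randn(batch, heads, seq_len, head_dim)
k = torch.randn(batch, heads, seq_len, head_dim)
v = torch.randn(batch, heads, seq_len, head_dim)

# Encoder-only attention is bidirectional: every token attends to every token,
# so the causal mask is disabled.
out = F.scaled_dot_product_attention(q, k, v, is_causal=False)
```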

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
Test locally with `pytest
vllm-project/vllm/tests/entrypoints/openai/test_score.py`
**Note**: Since torch compile on NPU is still a work in progress, we need
to comment out the following code to make the UT run:

https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/vocab_parallel_embedding.py#L138

result:
```bash
/home/xxx/miniconda3/envs/atb/lib/python3.10/site-packages/pytest_asyncio/plugin.py:207: PytestDeprecationWarning: The configuration option "asyncio_default_fixture_loop_scope" is unset.
The event loop scope for asynchronous fixtures will default to the fixture caching scope. Future versions of pytest-asyncio will default the loop scope for asynchronous fixtures to function scope. Set the default fixture loop scope explicitly in order to avoid unexpected behavior in the future. Valid fixture loop scopes are: "function", "class", "module", "package", "session"

  warnings.warn(PytestDeprecationWarning(_DEFAULT_FIXTURE_LOOP_SCOPE_UNSET))
================================================================================== test session starts ===================================================================================
platform linux -- Python 3.10.16, pytest-8.3.4, pluggy-1.5.0
rootdir: /home/xxx/code/vllm-cpu/vllm
configfile: pyproject.toml
plugins: shard-0.1.2, rerunfailures-15.0, asyncio-0.25.3, anyio-4.8.0, mock-3.14.0, forked-1.6.0, typeguard-4.3.0
asyncio: mode=strict, asyncio_default_fixture_loop_scope=None
collected 8 items                                                                                                                                                                        
Running 8 items in this shard

tests/entrypoints/openai/test_score.py ........                                                                                                                                    [100%]

==================================================================================== warnings summary ====================================================================================
../../../miniconda3/envs/atb/lib/python3.10/site-packages/torch_npu/dynamo/torchair/__init__.py:8
  /home/cmq/miniconda3/envs/atb/lib/python3.10/site-packages/torch_npu/dynamo/torchair/__init__.py:8: DeprecationWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html
    import pkg_resources

-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html
======================================================================== 8 passed, 1 warning in 131.42s (0:02:11) ========================================================================
```

This UT will be included in CI when the torch compile feature is done.

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-03-12 08:57:29 +08:00
12aa7115b5 bugfix for qwen2_vl (#301)
### What this PR does / why we need it?
This PR fixes the error raised when running inference with Qwen2_VL.
### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
We've tested it on benchmarks; the results meet our expectations and match the
GPU results.
---------

Signed-off-by: zouyida <zouyida@huawei.com>
2025-03-12 08:39:50 +08:00
9450e9811b [CI] Uninstall triton in dockerfile (#298)
Triton doesn't work with Ascend. We should make sure it's uninstalled in the
Dockerfile.


Related: https://github.com/vllm-project/vllm-ascend/issues/291

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-03-12 07:14:57 +08:00
0db6670bfa [Feature] Implement EP-compatible fused_moe (#121)
### What this PR does / why we need it?

Enable Expert-Parallel for ascend devices.

### Does this PR introduce _any_ user-facing change?

Enable EP: add `enable_expert_parallel=True` in your offline inference scripts,
like this:
```python
llm = LLM(
    model="/path/to/model",
    trust_remote_code=True,
    tensor_parallel_size=4,
    max_model_len=4096,
    enforce_eager=True,
    distributed_executor_backend="mp",
    enable_expert_parallel=True,
)
```

### How was this patch tested?

Please use the `main` branch of vLLM.

---------

Signed-off-by: Yizhou Liu <liuyizhou5@h-partners.com>
Co-authored-by: Yizhou Liu <liuyizhou5@h-partners.com>
2025-03-11 21:08:02 +08:00
4c9d78a035 support multistep decode (#299)
Add multi-step scheduler support for vllm-ascend.
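
A hedged usage example, assuming vLLM's `num_scheduler_steps` engine argument
is the knob for this feature (the model name and step count are illustrative):

```python
from vllm import LLM

# Multi-step scheduling lets the scheduler plan several decode steps at once,
# reducing per-step host overhead.
llm = LLM(model="Qwen/Qwen2.5-7B-Instruct", num_scheduler_steps=8)
```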

Signed-off-by: new-TonyWang <wangtonyyu222@gmail.com>
2025-03-11 19:20:06 +08:00
whx
feb6bdb12e [Platform][Model Runner] Add hash of request_ids; Change blocksize back to 128. (#293)
This PR changes the initial value of blocksize back to 128 and adds a hash of
the request-id list in the model runner, for implementing a sampling-param
cache in the sampler (see the sketch below).
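
A hedged sketch of the request-id hash; the helper name is illustrative, the
actual implementation lives in the model runner:

```python
def hash_request_ids(request_ids: list[str]) -> int:
    # A stable key for the current batch: if the set and order of requests is
    # unchanged, previously prepared sampling params can be reused.
    return hash(tuple(request_ids))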

Signed-off-by: hw_whx <wanghexiang7@huawei.com>
Co-authored-by: hw_whx <wanghexiang7@huawei.com>
2025-03-11 18:50:28 +08:00
007aeaa48b [Doc] Change distributed_executor_backend to mp (#287)
### What this PR does / why we need it?
Fix `ValueError: Unrecognized distributed executor backend tp. Supported
values are 'ray', 'mp' 'uni', 'external_launcher' or custom ExecutorBase
subclass.`
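
A minimal example of the documented fix (the model name is illustrative):

```python
from vllm import LLM

# "mp" (multiprocessing) is a supported distributed executor backend;
# "tp" is not a valid value, which is what triggered the error above.
llm = LLM(
    model="Qwen/Qwen2.5-7B-Instruct",
    tensor_parallel_size=2,
    distributed_executor_backend="mp",
)
```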

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Test on my local node

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-03-10 11:27:26 +08:00
38334f5daa [Docs] Re-arch on doc and make QwQ doc work (#271)
### What this PR does / why we need it?
Re-arch the tutorials, move single npu / multi npu / multi node to the index.
- Unify the docker run cmd
- Use a dropdown to hide the build-from-source installation doc
- Re-arch tutorials to include Qwen/QwQ/DeepSeek
- Make the QwQ doc work

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI test



Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-03-10 09:27:48 +08:00
18bb8d1f52 Adapt vLLM requirements changes to fix main CI (#279)
### What this PR does / why we need it?
Adapt vLLM requirements changes:
206e2577fa (diff-01ec17406c969585ed075609a2bbf2f2f4fe3e3def36946694abe6d4eb60a6f2)

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-03-09 16:07:45 +08:00
268da28961 Pin modelscope<1.23.0 on vLLM v0.7.3 (#272)
### What this PR does / why we need it?
Pin modelscope<1.23.0 on vLLM v0.7.3 to resolve:
https://github.com/vllm-project/vllm/pull/13807

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-03-09 15:59:42 +08:00
be58d5f3d8 Bump torch_npu version to dev20250308.3 (#276)
### What this PR does / why we need it?
Bump torch_npu version to dev20250308.3 to fix performance regression on
multi-stream case:
e04c580d07
.


### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-03-09 15:59:15 +08:00
91f7d8115d [CI/Build] Bump torch_npu to dev20250307.3 (#265)
Update the torch-npu version to fix torch npu `exponential_` accuracy.
With this update, the precision issue when setting `temperature > 0` is
fixed.

---------

Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-03-07 20:34:07 +08:00
faf8cd89cb register qwen2_vl to rewrite qwen2_vl forward (#241)
Add the qwen2-vl ascend implementation.

---------
Signed-off-by: zouyida <zouyida@huawei.com>
2025-03-07 15:41:47 +08:00
35cb7b5234 [CI] Add dispatch job to leverage dynamic devices (#251)
### What this PR does / why we need it?
Add a dispatch job to dispatch jobs to dynamic devices; it includes 2 stages as
below:

The dispatch job will spend about an extra `10s * parallel number + 30s`
waiting for other jobs to launch containers and release the lock.

- **Stage 1: Acquire lock**
Add a dispatch job; this job uses lockfile to acquire locks and then gets
the device number dynamically.
- **Stage 2.1: Launch container with dynamic device**
Pass the device number via output and start the container job with the
dynamic device.
- **Stage 2.2: Release lock**
Once the job has started, release the lock.

In the backend, we use multiple paths to set up multiple self-hosted runners
as a load balancer:
```
$ pwd
/home/action
$ ll | grep actions
drwx------   6 action action 4096 Mar  7 08:55 actions-runner-01
drwx------   6 action action 4096 Mar  7 08:55 actions-runner-02
drwx------   6 action action 4096 Mar  7 08:55 actions-runner-03
drwx------   6 action action 4096 Mar  7 08:56 actions-runner-04
drwx------   4 action action 4096 Jan 24 22:08 actions-runner-05
drwx------   4 action action 4096 Jan 24 22:08 actions-runner-06
```

```
adduser -G docker action
su action
pip3 install docker prettytable
sudo yum install procmail
```

### Does this PR introduce _any_ user-facing change?
NO

### How was this patch tested?
- CI passed
- E2E tested manually, triggered 3 jobs in parallel:
- [1st
job](https://github.com/vllm-project/vllm-ascend/actions/runs/13711345757/job/38348309297)
dispatch to /dev/davinci2.
- [2nd
job](https://github.com/vllm-project/vllm-ascend/actions/runs/13711348739/job/38348316250)
dispatch to /dev/davinci3
- [3rd
job](https://github.com/vllm-project/vllm-ascend/actions/runs/13711351493/job/38348324551)
dispatch to /dev/davinci4

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-03-07 09:47:13 +08:00
3217f0d10f [Feature] Modify description and api for ascend quantization (#243)
### What this PR does / why we need it?
1. It adds more description for classes in quant_config.py
2. It renames AscendQKVQuantAttentionMethod to AscendKVCacheMethod to
align with vLLM naming style.
3. It modifies the process when AscendLinearMethod or
AscendKVCacheMethod calls create_weights.


### Does this PR introduce _any_ user-facing change?
Yes. When creating weights, AscendLinearMethod now uses the get_weight,
get_pertensor_param and get_perchannel_param APIs from the linear quant
implementation, while AscendKVCacheMethod passes the layer into the linear
quant implementation (see the sketch below).
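
A hedged sketch of the weight-creation flow. The three getter names come from
the PR text; the signatures, return types (dicts of tensors), and wiring below
are assumptions for illustration only.

```python
import torch


def create_weights(layer: torch.nn.Module, quant_impl, input_size: int,
                   output_size: int) -> None:
    # Packed weight plus per-tensor / per-channel quantization parameters,
    # all provided by the linear quant implementation.
    params = {}
    params.update(quant_impl.get_weight(input_size, output_size))
    params.update(quant_impl.get_pertensor_param())
    params.update(quant_impl.get_perchannel_param(output_size))
    for name, value in params.items():
        layer.register_parameter(
            name, torch.nn.Parameter(value, requires_grad=False))
```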

### How was this patch tested?
By performing offline inference

---------

Signed-off-by: angazenn <zengyanjia@huawei.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
2025-03-06 15:17:25 +08:00
cff08f9df8 [Doc] Add initial FAQs (#247)
### What this PR does / why we need it?
Add initial FAQs

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
Preview

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-03-06 10:42:42 +08:00
dcd0005058 [Fix] Remove npu_group_topk before CANN version update (#242)
Remove npu_group_topk before CANN version update.

Signed-off-by: SidaoY <1024863041@qq.com>
2025-03-06 09:02:46 +08:00
whx
0d3463400a [Performance] Change the shape of kv_cache to avoid view of k_cache and v_cache. (#204)
This PR changes the shape of the kv cache to avoid views of k_cache and
v_cache.
What's more, it caches the metadata of k_cache and v_cache to avoid
duplicate slice operations and improve performance (a hedged sketch of the
layout idea follows).
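
A hedged sketch of the idea; the dimension names and exact layout are
illustrative, not necessarily the layout adopted by this PR:

```python
import torch

num_blocks, block_size, num_kv_heads, head_size = 16, 128, 8, 64

# One tensor holds both caches; index 0 is K, index 1 is V, so k_cache and
# v_cache are plain indexing results rather than view()/reshape() outputs.
kv_cache = torch.zeros(2, num_blocks, block_size, num_kv_heads, head_size)
k_cache, v_cache = kv_cache[0], kv_cache[1]
```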

Signed-off-by: hw_whx <wanghexiang7@huawei.com>
2025-03-05 10:51:07 +08:00
562fa673e5 [Bugfix] Exclude collect_env.py from CODESPELL check in format.sh (#240)
### What this PR does / why we need it?
Exclude `collect_env.py` from the `CODESPELL` check in `format.sh`,
otherwise it will report the error shown below:

```bash
vLLM yapf: Done
vLLM mypy:
Running mypy on vllm_ascend
Success: no issues found in 18 source files
Running mypy on examples
Success: no issues found in 3 source files
Running mypy on tests
Success: no issues found in 3 source files
vLLM mypy: Done
collect_env.py:410: CANN ==> CAN
```

### Does this PR introduce _any_ user-facing change?
no.

### How was this patch tested?
no.

Signed-off-by: Shanshan Shen <467638484@qq.com>
2025-03-04 17:14:00 +08:00
503f5045ff [ModelRunner] Remove redundant profile_run() in model runner (#224)
### What this PR does / why we need it?
Remove redundant `profile_run()` in model runner.

### Does this PR introduce _any_ user-facing change?
no.

### How was this patch tested?
no.

---------

Signed-off-by: Shanshan Shen <467638484@qq.com>
2025-03-04 16:58:33 +08:00
ae49bfd13a [Core] Support pooling (#229)
This PR added pooling support for vllm-ascend

Tested with `bge-base-en-v1.5` by encode:
```
from vllm import LLM

# Sample prompts.
prompts = [
  "Hello, my name is",
  "The president of the United States is",
  "The capital of France is",
  "The future of AI is",
]
# Create an LLM.
model = LLM(model="./bge-base-en-v1.5", enforce_eager=True)
# Generate embedding. The output is a list of EmbeddingRequestOutputs.
outputs = model.encode(prompts)
# Print the outputs.
for output in outputs:
    print(output.outputs.embedding)  # list of 768 floats
```

Tested by embedding:
```
from vllm import LLM, SamplingParams

llm = LLM(model="./bge-base-en-v1.5", task="embed")
(output,) = llm.embed("Hello, my name is")

embeds = output.outputs.embedding
print(f"Embeddings: {embeds!r} (size={len(embeds)})")
```

Related: https://github.com/vllm-project/vllm-ascend/issues/200

## Known issue
The accuracy is not correct since this feature relies on `enc-dec`
support. It'll be done in a following PR by @MengqingCao

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-03-04 15:59:34 +08:00
8fda31cafe [Doc] Update Feature Support doc (#234)
### What this PR does / why we need it?
Update Feature Support doc.

### Does this PR introduce _any_ user-facing change?
no.

### How was this patch tested?
no.

---------

Signed-off-by: Shanshan Shen <467638484@qq.com>
2025-03-04 14:18:32 +08:00
b9f0e25c16 [Misc] Add collect_env.py scripts for bug reporting (#175)
### What this PR does / why we need it?
Add the `collect_env.py` script from vLLM and remove `nvidia`, `gpu`,
and `cuda` related code, so that users of vllm-ascend can collect their env
info when reporting bugs.

### Does this PR introduce _any_ user-facing change?
no.

### How was this patch tested?
Running `python collect_env.py` works


Signed-off-by: Shanshan Shen <467638484@qq.com>
2025-03-04 14:14:37 +08:00
839dac8d60 Install wget to fix image build (#231)
### What this PR does / why we need it?

Install `wget` to fix image build

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

---------

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-03-04 09:01:23 +08:00
b64ee7d346 [Dist] Set device as rank (#202)
### What this PR does / why we need it?
The rank returned by `torch.distributed.get_rank(device_group)` is the
local rank, but the rank (i.e. the rank in the process group (PG)) is expected.
Thus we change to use `torch.npu.current_device()` to set the device (a
minimal sketch follows the table below).

```python
    # difference between `local_rank` and `rank_in_group`:
    # if we have a group of size 4 across two nodes:
    # Process | Node | Rank | Local Rank | Rank in Group
    #   0     |   0  |  0   |     0      |       0
    #   1     |   0  |  1   |     1      |       1
    #   2     |   1  |  2   |     0      |       2
    #   3     |   1  |  3   |     1      |       3
```
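
A minimal sketch of the device-setting change, assuming `torch_npu` is
installed so the `torch.npu` namespace is registered:

```python
import torch
import torch_npu  # noqa: F401  # registers the torch.npu namespace

# Use the device index already bound to this process rather than the
# group-local rank returned by torch.distributed.get_rank(device_group).
device = torch.device(f"npu:{torch.npu.current_device()}")
```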

Tested by @wwfu109 with
`vllm/tests/distributed/test_customops::test_multi_process_tensor_parallel_pipeline_parallel`

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-03-03 09:23:13 +08:00
ebe14f20cf Recover vllm-ascend dev image (#209)
### What this PR does / why we need it?
Recover vllm-ascend dev image

### Does this PR introduce _any_ user-facing change?
No
### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-03-03 09:08:41 +08:00
6e358c4bef Add Document Branch Policy (#217)
### What this PR does / why we need it?
Add Document Branch Policy

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Related: https://github.com/vllm-project/vllm-ascend/issues/214

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-03-03 09:07:39 +08:00
46740958f2 Add ray to docker image (#197)
### What this PR does / why we need it?
Add ray to docker image to make `ray` work

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed

Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
2025-02-28 15:23:18 +08:00
81dfaae88b Bump docker/setup-buildx-action from 2 to 3 (#191)
Bumps
[docker/setup-buildx-action](https://github.com/docker/setup-buildx-action)
from 2 to 3.

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-02-28 09:06:46 +08:00
a710a7563a Bump docker/setup-qemu-action from 2 to 3 (#192)
Bumps
[docker/setup-qemu-action](https://github.com/docker/setup-qemu-action)
from 2 to 3.

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-02-28 09:06:13 +08:00
a5564ed5d8 Bump actions/setup-python from 5.3.0 to 5.4.0 (#193)
Bumps [actions/setup-python](https://github.com/actions/setup-python)
from 5.3.0 to 5.4.0.

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-02-27 20:05:15 +08:00
whx
14bca9911a [CI] Fix unsolved bugs caused by pta api change. (#190)
This PR fix some unsolved bugs caused by pta api change.

Signed-off-by: hw_whx <wanghexiang7@huawei.com>
Co-authored-by: hw_whx <wanghexiang7@huawei.com>
2025-02-27 19:52:28 +08:00
6aed83335c [CI] Add dependabot support and labeler workflow (#162)
Add dependabot support and labeler workflow

---------

Signed-off-by: Yuanhao Ji <jiyuanhao@apache.org>
2025-02-27 19:46:31 +08:00
03dc5c01fd [Doc] update multinode doc (#181)
Update multinode doc
fix #167 #168

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-02-27 19:29:49 +08:00
1715230867 [CI] Upgrade to newest pta.(MLA and FusedMoE) (#189)
Upgrade to newest pta.(MLA and FusedMoE)

---------

Signed-off-by: SidaoY <1024863041@qq.com>
2025-02-27 18:50:52 +08:00
c131e43e7d [Worker]Lazy import torch_npu (#184)
### What this PR does / why we need it?
To avoid unnecessary delays, we only import torch_npu when profiling is
enabled (a minimal sketch of the pattern is shown below).
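
A minimal sketch of the lazy-import pattern; the helper name and the profiler
call are illustrative rather than the actual vllm-ascend code:

```python
def maybe_start_profiler(profiling_enabled: bool):
    if not profiling_enabled:
        return None
    # Deferred import: the torch_npu import cost is only paid when profiling
    # is actually requested.
    import torch_npu
    return torch_npu.profiler.profile()
```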

Signed-off-by: wangli <wangli858794774@gmail.com>
2025-02-27 16:52:11 +08:00
6042c210bc [CI] upgrade to newest pta (#187)
Upgrade to newest torch-npu

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: angazenn <zengyanjia@huawei.com>
2025-02-27 16:40:23 +08:00
fd18ae6494 [MOE] fix #176 (#179)
Fix #176
We need to set `topk_group` and `num_expert_group` to `0` if they are
`None`, as in the sketch below.
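
A minimal sketch of the guard; the helper name is illustrative:

```python
def normalize_group_args(topk_group, num_expert_group):
    # The NPU fused-MoE kernel expects integers, so None becomes 0.
    topk_group = 0 if topk_group is None else topk_group
    num_expert_group = 0 if num_expert_group is None else num_expert_group
    return topk_group, num_expert_group
```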

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-02-27 14:21:08 +08:00
ee43179767 [ModelRunner] Fix cuda hard code in model runner (#155)
### What this PR does / why we need it?
1. Fix the hard-coded CUDA device in the model runner.
2. Fix tutorials doc rendering error.

### Does this PR introduce _any_ user-facing change?
no.

### How was this patch tested?
no.

Signed-off-by: Shanshan Shen <467638484@qq.com>
2025-02-27 14:16:46 +08:00
94cd66bba7 [CI][UT]enable multimodal ut (#158)
enable multimodal ut

---------

Signed-off-by: zouyida <zouyida@huawei.com>
2025-02-27 14:14:43 +08:00
94483775e1 [CI] fix hf_token (#180)
Fix the bug introduced by #173

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-02-26 17:29:31 +08:00
1c238b930d [worker] remove unused assertion (#161)
### What this PR does / why we need it?
Remove unused assertion in `NPUWorker`, as this has been moved to
`Executor` in vLLM:

aabeb2688f/vllm/executor/uniproc_executor.py (L43)

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with existing test.

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-02-26 16:11:36 +08:00
78530c0667 [CI/Build] add HF_TOKEN for model downloading (#173)
### What this PR does / why we need it?
Add `HF_TOKEN` for downloading models that require access rights from
huggingface hub. This will fix the CI error in #123 and #76

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-02-26 15:35:03 +08:00
7776f2e6a4 [ModelRunner] remove padding for vlm inputs (#150)
### What this PR does / why we need it?
Remove padding for vlm inputs.
We don't need padded inputs now; this padding will break the input
preparation of VLMs.

### Does this PR introduce _any_ user-facing change?
N/A

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-02-26 10:26:39 +08:00
79fbb20b4d [ModelRunner] remove unused args (follow vllm changes) (#159)
### What this PR does / why we need it?
The arg list of `Attention.forward()` is changed by
https://github.com/vllm-project/vllm/pull/13555.
The unused args `kv_caches` and `attn_metadata` are removed.

### Does this PR introduce _any_ user-facing change?
N/A

### How was this patch tested?
CI passed with existing test.

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-02-25 17:51:09 +08:00
51ae37b22a [Doc] update readme (#147)
Fix doc issue in README

---------

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
Co-authored-by: Yikun Jiang <yikunkero@gmail.com>
2025-02-25 11:00:58 +08:00
3a7882208f [CI] enable test if pytest.ini changes (#151)
enable test if pytest.ini changes

Signed-off-by: MengqingCao <cmq0113@163.com>
2025-02-24 16:47:05 +08:00
d0b3cb4fa7 modify:Eliminate redundant operations in the code to improve performance (#137)
### What this PR does / why we need it?
Eliminate redundant operations in the code to improve performance

### Does this PR introduce _any_ user-facing change?
No

### How was this patch tested?
CI passed
---------

Signed-off-by: Yaphets24 <d_mym0618@163.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: MengqingCao <cmq0113@163.com>
2025-02-22 17:43:42 +08:00
616 changed files with 117097 additions and 5702 deletions

6
.gemini/config.yaml Normal file

@ -0,0 +1,6 @@
# https://developers.google.com/gemini-code-assist/docs/customize-gemini-behavior-github
have_fun: false # Just review the code
code_review:
comment_severity_threshold: HIGH # Reduce quantity of comments
pull_request_opened:
summary: false # Don't summarize the PR in a separate comment

45
.github/Dockerfile.buildwheel vendored Normal file

@ -0,0 +1,45 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
ARG PY_VERSION=3.11
FROM quay.io/ascend/manylinux:8.0.0-910b-manylinux_2_28-py${PY_VERSION}
ARG COMPILE_CUSTOM_KERNELS=1
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
rm -rf /var/cache/yum
WORKDIR /workspace
COPY . /workspace/vllm-ascend/
# Install req
RUN python3 -m pip install -r vllm-ascend/requirements.txt --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip install twine
# Install vllm-ascend
RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
cd vllm-ascend && \
python3 setup.py bdist_wheel && \
ls -l dist
CMD ["/bin/bash"]


@ -0,0 +1,37 @@
name: 📚 User Story
description: Apply for a user story to be displayed on https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html
title: "[User Story]: "
labels: ["user-story"]
body:
- type: textarea
attributes:
label: 📚 Title
description: >
A clear title about what your user story is about.
validations:
required: true
- type: textarea
attributes:
label: About / Introduction
description: >
A brief introduction about the background of your use case, like your scenario, hardware size etc.
- type: textarea
attributes:
label: Business Challenges
description: >
Tell us what kind of challenges you faced in this user story.
- type: textarea
attributes:
label: Solving challenges with vLLM Ascend and benefits
description: >
Tell us how vLLM Ascend helped you overcome the challenges, including details like how you use it, what version you used, hardware info, etc., and what kind of benefits you get from using vLLM Ascend
- type: textarea
attributes:
label: Extra Info
description: >
Any extra information you want to include in this story
- type: markdown
attributes:
value: >
Thanks for contributing 🎉!


@ -14,9 +14,7 @@ body:
description: |
Please run the following and paste the output below.
```sh
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
wget https://raw.githubusercontent.com/vllm-project/vllm-ascend/main/collect_env.py
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```


@ -40,7 +40,7 @@ body:
attributes:
label: Any Other Things.
description: >
Any other things you would like to mention.
Any other things you would like to mention, such as feature branch request.
validations:
required: false
- type: markdown


@ -0,0 +1,104 @@
name: Release Checklist
description: Generate a release checklist issue when preparing a new release. (Used by the release team)
title: "[Release]: Release checklist for v"
body:
- type: textarea
attributes:
description: >
Brief info for the new release.
label: Release Checklist
value: >
**Release Version**:
**Release Branch**:
**Release Date**:
**Release Manager**:
- type: textarea
attributes:
description: >
Release notes.
label: Prepare Release Note
value: >
- [ ] Create a new issue for release feedback
- [ ] Upgrade vllm version to the new version for CI and Dockerfile
- [ ] Write the release note PR.
- [ ] Update the feedback issue link in docs/source/faqs.md
- [ ] Add release note to docs/source/user_guide/release_notes.md
- [ ] Update release version in README.md and README.zh.md
- [ ] Update version info in docs/source/community/versioning_policy.md
- [ ] Update contributor info in docs/source/community/contributors.md
- [ ] Update package version in docs/conf.py
- type: textarea
attributes:
description: >
Make sure the code is merged.
label: PR need Merge
value: >
- [ ] PR link1
- [ ] PR link2
- [ ] ...
- type: textarea
attributes:
description: >
Make sure the new Feature/Function is tested
label: Functional Test
value: >
- [ ] Feature1
- [ ] Bug1
- [ ] ...
- type: textarea
attributes:
description: >
Make sure the doc is updated.
label: Doc Test
value: >
- [ ] Tutorial is updated.
- [ ] User Guide is updated.
- [ ] Developer Guide is updated.
- type: textarea
attributes:
description: >
Make sure the artifacts are ready
label: Prepare Artifacts
value: >
- [ ] Docker image is ready.
- [ ] Wheel package is ready.
- type: textarea
attributes:
description: >
Start to release.
label: Release Step
value: >
- [ ] Release note PR is merged.
- [ ] Post the release on GitHub release page.
- [ ] Generate official doc page on https://app.readthedocs.org/dashboard/
- [ ] Wait for the wheel package to be available on https://pypi.org/project/vllm-ascend
- [ ] Wait for the docker image to be available on https://quay.io/ascend/vllm-ascend
- [ ] Upload 310p wheel to Github release page
- [ ] Broadcast the release news (By message, blog , etc)
- [ ] Close this issue


@ -25,4 +25,3 @@ CI passed with new added/existing test.
If it was tested in a way different from regular unit tests, please clarify how you tested step by step, ideally copy and paste-able, so that other reviewers can test and check, and descendants can verify in the future.
If tests were not added, please describe why they were not added and/or why it was difficult to add.
-->

21
.github/actionlint.yaml vendored Normal file

@ -0,0 +1,21 @@
self-hosted-runner:
# Labels of self-hosted runner in array of strings.
labels:
- linux-aarch64-a2-0
- linux-aarch64-a2-1
- linux-aarch64-a2-2
- linux-aarch64-a2-4
- linux-aarch64-a2-8
- linux-arm64-npu-static-8
- linux-aarch64-310p-1
- linux-aarch64-310p-2
- linux-aarch64-310p-4
- ubuntu-24.04-arm
- linux-aarch64-a3-1
- linux-aarch64-a3-2
- linux-aarch64-a3-4
- linux-aarch64-a3-8
- linux-amd64-cpu-0
- linux-amd64-cpu-8
- linux-amd64-cpu-16
- linux-aarch64-a3-0

10
.github/dependabot.yml vendored Normal file

@ -0,0 +1,10 @@
version: 2
updates:
- package-ecosystem: "github-actions"
directory: "/"
schedule:
# Check for updates to GitHub Actions every week
interval: "weekly"
open-pull-requests-limit: 2
reviewers:
- "Yikun"

59
.github/format_pr_body.sh vendored Executable file

@ -0,0 +1,59 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/.github/scripts/cleanup_pr_body.sh
#!/bin/bash
set -eux
# ensure 3 arguments are passed
if [ "$#" -ne 3 ]; then
echo "Usage: $0 <pr_number> <vllm_version> <vllm_commit>"
exit 1
fi
PR_NUMBER=$1
VLLM_VERSION=$2
VLLM_COMMIT=$3
OLD=/tmp/orig_pr_body.txt
NEW=/tmp/new_pr_body.txt
FINAL=/tmp/final_pr_body.txt
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
cp "${OLD}" "${NEW}"
# Remove notes in pr description and add vLLM version and commit
sed -i '/<!--/,/-->/d' "${NEW}"
sed -i '/- vLLM .*$/d' "${NEW}"
{
echo ""
echo "- vLLM version: $VLLM_VERSION"
echo "- vLLM main: $VLLM_COMMIT"
} >> "${NEW}"
# Remove redundant empty lines
uniq "${NEW}" > "${FINAL}"
# Run this only if ${NEW} is different than ${OLD}
if ! cmp -s "${OLD}" "${FINAL}"; then
echo
echo "Updating PR body:"
echo
cat "${NEW}"
gh pr edit --body-file "${FINAL}" "${PR_NUMBER}"
else
echo "No changes needed"
fi

38
.github/labeler.yml vendored Normal file

@ -0,0 +1,38 @@
---
documentation:
- changed-files:
- any-glob-to-any-file:
- 'docs/**'
- '**/*.md'
ci/build:
- changed-files:
- any-glob-to-any-file:
- '.github/actions/*.yml'
- '.github/workflows/*.yml'
'module:tests':
- changed-files:
- any-glob-to-any-file:
- 'tests/**'
'module:tools':
- changed-files:
- any-glob-to-any-file:
- 'tools/**'
'module:ops':
- changed-files:
- any-glob-to-any-file:
- 'vllm_ascend/ops/**'
'module:quantization':
- changed-files:
- any-glob-to-any-file:
- 'vllm_ascend/quantization/**'
'module:core':
- changed-files:
- any-glob-to-any-file:
- 'vllm_ascend/*.py'

175
.github/workflows/_accuracy_test.yaml vendored Normal file

@ -0,0 +1,175 @@
name: 'accuracy test'
on:
workflow_call:
inputs:
vllm:
required: true
type: string
vllm-ascend:
required: false
type: string
default: main
runner:
required: true
type: string
image:
required: true
type: string
model_name:
required: true
type: string
upload:
required: false
type: boolean
default: false
jobs:
accuracy_tests:
runs-on: ${{ inputs.runner }}
name: ${{ inputs.model_name }} accuracy
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
env:
VLLM_USE_MODELSCOPE: True
# 1. If version specified (work_dispatch), do specified branch accuracy test
# 2. If no version (labeled PR), do accuracy test by default ref:
# The branch, tag or SHA to checkout. When checking out the repository that
# triggered a workflow, this defaults to the reference or SHA for that event.
# Otherwise, uses the default branch.
GHA_VLLM_ASCEND_VERSION: ${{ inputs.vllm-ascend }}
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Set model name as output
id: set_output
run: |
echo "model_name=${{ inputs.model_name }}" >> $GITHUB_OUTPUT
- name: Config mirrors
run: |
sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
apt-get update -y
apt install git -y
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ inputs.vllm }}
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Resolve vllm-ascend version
run: |
VERSION_INPUT="${{ inputs.vllm-ascend }}"
if [[ "$VERSION_INPUT" == "latest" ]]; then
TAGS=$(git ls-remote --tags --sort=-v:refname https://github.com/vllm-project/vllm-ascend "v*" | cut -f2 | sed 's|refs/tags/||')
LATEST_TAG=$(echo "$TAGS" | head -n1)
if [[ -z "$LATEST_TAG" ]]; then
RESOLVED_VERSION="main"
else
RESOLVED_VERSION="$LATEST_TAG"
fi
else
RESOLVED_VERSION="$VERSION_INPUT"
fi
echo "GHA_VLLM_ASCEND_VERSION=$RESOLVED_VERSION" >> $GITHUB_ENV
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm-ascend
path: ./vllm-ascend
ref: ${{ env.GHA_VLLM_ASCEND_VERSION }}
- name: Install vllm-project/vllm-ascend
working-directory: ./vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
- name: Get vLLM commit hash and URL
working-directory: ./vllm-empty
run: |
VLLM_COMMIT=$(git rev-parse --short=7 HEAD)
echo "VLLM_COMMIT=$VLLM_COMMIT" >> $GITHUB_ENV
- name: Get vLLM-Ascend commit hash and URL
working-directory: ./vllm-ascend
run: |
VLLM_ASCEND_COMMIT=$(git rev-parse --short=7 HEAD)
echo "VLLM_ASCEND_COMMIT=$VLLM_ASCEND_COMMIT" >> $GITHUB_ENV
- name: Collect version info
run: |
for dir in /usr/local/Ascend/ascend-toolkit/*; do
dname=$(basename "$dir")
if [ "$dname" != "latest" ]; then
TOOLKIT_DIR="$dname"
break
fi
done
INFO_FILE="/usr/local/Ascend/ascend-toolkit/${TOOLKIT_DIR}/$(uname -i)-linux/ascend_toolkit_install.info"
GHA_CANN_VERSION=$(grep "version=" "$INFO_FILE" \
| head -n1 \
| cut -d'=' -f2 \
| tr -d '"')
{
echo "GHA_CANN_VERSION=$GHA_CANN_VERSION"
pip show torch | grep "Version:" | awk '{print "GHA_TORCH_VERSION="$2}'
pip show torch_npu | grep "Version:" | awk '{print "GHA_TORCH_NPU_VERSION="$2}'
pip show vllm | grep "Version:" | awk '{print "GHA_VLLM_VERSION="$2}' | sed 's/+.*//'
} >> "$GITHUB_ENV"
- name: Run accuracy test
id: report
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
VLLM_VERSION: ${{ env.GHA_VLLM_VERSION }}
VLLM_COMMIT: ${{ env.VLLM_COMMIT }}
VLLM_ASCEND_VERSION: ${{ env.GHA_VLLM_ASCEND_VERSION || github.ref }}
VLLM_ASCEND_COMMIT: ${{ env.VLLM_ASCEND_COMMIT }}
CANN_VERSION: ${{ env.GHA_CANN_VERSION }}
TORCH_VERSION: ${{ env.GHA_TORCH_VERSION }}
TORCH_NPU_VERSION: ${{ env.GHA_TORCH_NPU_VERSION }}
run: |
model_base_name=$(basename ${{ inputs.model_name }})
markdown_name="${model_base_name}"
echo "markdown_name=$markdown_name" >> $GITHUB_OUTPUT
mkdir -p ./benchmarks/accuracy
pytest -sv ./tests/e2e/models/test_lm_eval_correctness.py \
--config ./tests/e2e/models/configs/${{ inputs.model_name }}.yaml
- name: Generate step summary
if: ${{ always() }}
run: |
cat ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md >> $GITHUB_STEP_SUMMARY
- name: Upload Report
if: ${{ inputs.upload == true }}
uses: actions/upload-artifact@v4
with:
name: "report-${{ env.GHA_VLLM_ASCEND_VERSION }}-${{ steps.report.outputs.markdown_name }}"
path: ./benchmarks/accuracy/${{ steps.report.outputs.markdown_name }}.md
if-no-files-found: warn
retention-days: 90
overwrite: true

105
.github/workflows/_e2e_nightly.yaml vendored Normal file

@ -0,0 +1,105 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: 'e2e nightly test'
on:
workflow_call:
inputs:
vllm:
required: true
type: string
runner:
required: true
type: string
image:
required: false
type: string
default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11"
tests:
required: true
type: string
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 1 card / 4 cards test type
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
e2e-nightly:
name: e2e-nightly
runs-on: ${{ inputs.runner }}
container:
image: ${{ inputs.image }}
env:
VLLM_USE_MODELSCOPE: True
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
apt-get update -y
apt install git -y
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ inputs.vllm }}
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
- name: Run vllm-project/vllm-ascend test
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
run: |
# TODO: enable more tests
pytest -sv ${{ inputs.tests }}

196
.github/workflows/_e2e_test.yaml vendored Normal file

@ -0,0 +1,196 @@
name: 'e2e test'
on:
workflow_call:
inputs:
vllm:
required: true
type: string
runner:
required: true
type: string
image:
required: true
type: string
type:
required: true
type: string
jobs:
e2e:
name: singlecard
runs-on: ${{ inputs.runner }}-1
container:
image: ${{ inputs.image }}
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
apt-get update -y
apt install git -y
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ inputs.vllm }}
path: ./vllm-empty
fetch-depth: 1
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
- name: Run vllm-project/vllm-ascend test
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
if: ${{ inputs.type == 'light' }}
run: |
pytest -sv tests/e2e/singlecard/test_aclgraph.py
pytest -sv tests/e2e/singlecard/test_quantization.py
pytest -sv tests/e2e/singlecard/test_vlm.py::test_multimodal_vl
- name: Run e2e test
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
if: ${{ inputs.type == 'full' }}
run: |
# We found that running aclgraph tests in a batch causes an AclmdlRICaptureBegin error, so we run
# the tests separately.
pytest -sv tests/e2e/singlecard/test_aclgraph.py
pytest -sv tests/e2e/singlecard/test_ascend_scheduler.py
pytest -sv tests/e2e/singlecard/test_bge_model.py
pytest -sv tests/e2e/singlecard/test_camem.py
pytest -sv tests/e2e/singlecard/test_chunked.py
pytest -sv tests/e2e/singlecard/test_embedding.py
pytest -sv tests/e2e/singlecard/test_embedding_aclgraph.py
pytest -sv tests/e2e/singlecard/test_guided_decoding.py
pytest -sv tests/e2e/singlecard/test_ilama_lora.py
pytest -sv tests/e2e/singlecard/test_profile_execute_duration.py
pytest -sv tests/e2e/singlecard/test_quantization.py
pytest -sv tests/e2e/singlecard/test_sampler.py
pytest -sv tests/e2e/singlecard/test_vlm.py
# ------------------------------------ v1 spec decode test ------------------------------------ #
pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py
pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_torchair_correctness.py
# Fix me: OOM error
#pytest -sv tests/e2e/singlecard/spec_decode_v1/test_v1_spec_decode.py
pytest -sv tests/e2e/singlecard/ops/
e2e-2-cards:
name: multicard
runs-on: ${{ inputs.runner }}-2
container:
image: ${{ inputs.image }}
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
apt-get update -y
apt install git -y
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ inputs.vllm }}
path: ./vllm-empty
fetch-depth: 1
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
- name: Run vllm-project/vllm-ascend test (light)
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
if: ${{ inputs.type == 'light' }}
run: |
pytest -sv tests/e2e/multicard/test_qwen3_moe.py::test_models_distributed_Qwen3_MOE_TP2_WITH_EP
- name: Run vllm-project/vllm-ascend test (full)
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
if: ${{ inputs.type == 'full' }}
run: |
pytest -sv tests/e2e/multicard/test_data_parallel.py
pytest -sv tests/e2e/multicard/test_expert_parallel.py
pytest -sv tests/e2e/multicard/test_external_launcher.py
pytest -sv tests/e2e/multicard/test_fused_moe_allgather_ep.py
pytest -sv tests/e2e/multicard/test_ilama_lora_tp2.py
# To avoid oom, we need to run the test in a single process.
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_QwQ
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W8A8
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen3_W4A8DYNAMIC
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W4A8DYNAMIC
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_sp_for_qwen3_moe
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_flashcomm_v1
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight
pytest -sv tests/e2e/multicard/test_pipeline_parallel.py
pytest -sv tests/e2e/multicard/test_prefix_caching.py
pytest -sv tests/e2e/multicard/test_qwen3_moe.py
pytest -sv tests/e2e/multicard/test_torchair_graph_mode.py

72
.github/workflows/accuracy_test.yaml vendored Normal file

@ -0,0 +1,72 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
# This test will be triggered:
# - PR labeled with: 'accuracy-test' & 'ready-for-test'
name: ascend test / accuracy
on:
pull_request:
branches:
- 'main'
- '*-dev'
types: [ labeled, synchronize ]
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
run:
name: ""
strategy:
matrix:
# Only top series models should be listed in here
include:
- runner: a2-1
model_name: Qwen3-8B
- runner: a2-1
model_name: Qwen2.5-VL-7B-Instruct
- runner: a2-1
model_name: Qwen2-Audio-7B-Instruct
- runner: a2-2
model_name: Qwen3-30B-A3B
- runner: a2-2
model_name: Qwen3-VL-30B-A3B-Instruct
- runner: a2-2
model_name: DeepSeek-V2-Lite
fail-fast: false
# test will be triggered when tag 'accuracy-test' & 'ready-for-test'
if: >-
${{
contains(github.event.pull_request.labels.*.name, 'accuracy-test') &&
contains(github.event.pull_request.labels.*.name, 'ready-for-test')
}}
uses: ./.github/workflows/_accuracy_test.yaml
with:
vllm: v0.11.0
runner: linux-aarch64-${{ matrix.runner }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
model_name: ${{ matrix.model_name }}


@ -1,59 +0,0 @@
#
# Adapted from vllm-project/vllm/blob/main/.github
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
name: Lint GitHub Actions workflows
on:
push:
branches:
- 'main'
- '*-dev'
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
- '.github/workflows/matchers/actionlint.json'
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- '.github/workflows/*.ya?ml'
- '.github/workflows/actionlint.*'
- '.github/workflows/matchers/actionlint.json'
env:
LC_ALL: en_US.UTF-8
defaults:
run:
shell: bash
permissions:
contents: read
jobs:
actionlint:
runs-on: ubuntu-latest
steps:
- name: "Checkout"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: "Run actionlint"
run: |
echo "::add-matcher::.github/workflows/matchers/actionlint.json"
tools/actionlint.sh -color

57
.github/workflows/format_pr_body.yaml vendored Normal file

@ -0,0 +1,57 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: format / pr body
on:
# The PR updated when PR opened and push new commits
pull_request_target:
types: [opened, synchronize]
branches:
- 'main'
permissions:
pull-requests: write
jobs:
update-description:
name: update vLLM version
runs-on: ubuntu-latest
steps:
- name: Get vLLM version
run: |
VLLM_COMMIT=v0.11.0
echo "VLLM_COMMIT=https://github.com/vllm-project/vllm/commit/$VLLM_COMMIT" >> $GITHUB_ENV
- name: Checkout repository
uses: actions/checkout@ff7abcd0c3c05ccf6adc123a8cd1fd4fb30fb493 # v4.2.2
- name: Set up Python
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
- name: Get vLLM release version
run: |
VLLM_VERSION=$(python3 docs/source/conf.py | jq .ci_vllm_version | tr -d '"')
echo "VLLM_VERSION=$VLLM_VERSION" >> $GITHUB_ENV
- name: Update PR description
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
bash .github/format_pr_body.sh "${{ github.event.number }}" "${{ env.VLLM_VERSION }}" "${{ env.VLLM_COMMIT }}"


@ -0,0 +1,135 @@
name: 'image / openEuler / 310p'
# This is a docker build check and publish job:
# 1. PR Triggered docker image build check
# - is for image build check
# - Enable on main/*-dev branch
# - push: ${{ github.event_name != 'pull_request' }} ==> false
# 2. branches push trigger image publish
# - is for branch/dev/nightly image
# - commits are merge into main/*-dev ==> vllm-ascend:main-310p-openeuler / vllm-ascend:*-dev-310p-openeuler
# 3. tags push trigger image publish
# - is for final release image
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3-310p-openeuler / vllm-ascend:v1.2.3rc1-310p-openeuler
on:
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- '.github/workflows/image_310p_openeuler.yml'
- 'Dockerfile.310p.openEuler'
- 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
types: [ labeled ]
push:
# Publish the image when tagging; the Dockerfile in the tag will be built as the tag image
branches:
- 'main'
- '*-dev'
tags:
- 'v*'
paths:
- '.github/workflows/image_310p_openeuler.yml'
- 'Dockerfile.310p.openEuler'
- 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
# only cancel in-progress runs of the same workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build:
name: vllm-ascend image build
# Only arm64 build on openEuler arm64, only amd64 build on Ubuntu amd64
# Push event or PR with both 'ready' and 'ready-for-test' labels
runs-on: >-
${{
github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
'ubuntu-latest' ||
'ubuntu-24.04-arm'
}}
if: ${{ github.event_name == 'push' || (contains(github.event.pull_request.labels.*.name, 'ready') && contains(github.event.pull_request.labels.*.name, 'ready-for-test')) }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Print
run: |
lscpu
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# TODO(yikun): add more hub image and a note on release policy for container image
images: |
quay.io/ascend/vllm-ascend
# Note for test case
# https://github.com/marketplace/actions/docker-metadata-action#typeref
# 1. branch job publish per main/*-dev branch commits
# 2. main and dev pull_request is build only, so the tag pr-N-310p-openeuler is fine
# 3. only pep440 matched tag will be published:
# - v0.7.1 --> v0.7.1-310p-openeuler
# - pre/post/dev: v0.7.1rc1-310p-openeuler/v0.7.1rc1-310p-openeuler/v0.7.1rc1.dev1-310p-openeuler/v0.7.1.post1-310p-openeuler, no latest
# which follow the rule from vLLM with prefix v
# TODO(yikun): the post release might be considered as latest release
tags: |
type=ref,event=branch,suffix=-310p-openeuler
type=ref,event=pr,suffix=-310p-openeuler
type=pep440,pattern={{raw}},suffix=-310p-openeuler
flavor:
latest=false
- name: Free up disk space
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
with:
tool-cache: true
docker-images: false
- name: Build - Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Build - Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Publish - Login to Quay Container Registry
if: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ vars.QUAY_USERNAME }}
password: ${{ secrets.QUAY_PASSWORD }}
- name: Build and push 310p
uses: docker/build-push-action@v6
with:
platforms: >-
${{
github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
'linux/amd64,linux/arm64' ||
'linux/arm64'
}}
# use the current repo path as the build context, ensure .git is contained
context: .
# only trigger when tag, branch/main push
push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta.outputs.tags }}
file: Dockerfile.310p.openEuler
build-args: |
PIP_INDEX_URL=https://pypi.org/simple
provenance: false

131
.github/workflows/image_310p_ubuntu.yml vendored Normal file

@ -0,0 +1,131 @@
name: 'image / Ubuntu / 310p'
# This is a docker build check and publish job:
# 1. PR Triggered docker image build check
# - is for image build check
# - Enable on main/*-dev branch
# - push: ${{ github.event_name != 'pull_request' }} ==> false
# 2. branches push trigger image publish
# - is for branch/dev/nightly image
# - commits are merge into main/*-dev ==> vllm-ascend:main-310p / vllm-ascend:*-dev-310p
# 3. tags push trigger image publish
# - is for final release image
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3-310p / vllm-ascend:v1.2.3rc1-310p
on:
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- '.github/workflows/image_310p_ubuntu.yml'
- 'Dockerfile.310p'
- 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
types: [ labeled ]
push:
# Publish the image when tagging; the Dockerfile in the tag will be built as the tag image
branches:
- 'main'
- '*-dev'
tags:
- 'v*'
paths:
- '.github/workflows/image_310p_ubuntu.yml'
- 'Dockerfile.310p'
- 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
# only cancel in-progress runs of the same workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build:
name: vllm-ascend image build
# Only arm64 build on openEuler arm64, only amd64 build on Ubuntu amd64
# Push event or PR with both 'ready' and 'ready-for-test' labels
runs-on: ubuntu-latest
if: ${{ github.event_name == 'push' || (contains(github.event.pull_request.labels.*.name, 'ready') && contains(github.event.pull_request.labels.*.name, 'ready-for-test')) }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Print
run: |
lscpu
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# TODO(yikun): add more hub image and a note on release policy for container image
images: |
quay.io/ascend/vllm-ascend
# Note for test case
# https://github.com/marketplace/actions/docker-metadata-action#typeref
# 1. branch job publishes per main/*-dev branch commit
# 2. main and dev pull_request is build only, so the tag pr-N is fine
# 3. only pep440 matched tag will be published:
# - v0.7.1 --> v0.7.1-310p
# - pre/post/dev: v0.7.1rc1-310p/v0.7.1rc1-310p/v0.7.1rc1.dev1-310p/v0.7.1.post1-310p, no latest
# which follow the rule from vLLM with prefix v
# TODO(yikun): the post release might be considered as latest release
tags: |
type=ref,event=branch,suffix=-310p
type=ref,event=pr,suffix=-310p
type=pep440,pattern={{raw}},suffix=-310p
flavor: |
latest=false
- name: Free up disk space
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
with:
tool-cache: true
docker-images: false
- name: Build - Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Build - Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Publish - Login to Quay Container Registry
if: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ vars.QUAY_USERNAME }}
password: ${{ secrets.QUAY_PASSWORD }}
- name: Build and push 310p
uses: docker/build-push-action@v6
with:
platforms: >-
${{
github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
'linux/amd64,linux/arm64' ||
'linux/amd64'
}}
# use the current repo path as the build context, ensure .git is contained
context: .
file: Dockerfile.310p
# only trigger when tag, branch/main push
push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta.outputs.tags }}
build-args: |
PIP_INDEX_URL=https://pypi.org/simple
provenance: false
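The `platforms` expression above is a GitHub-expressions ternary: multi-arch only when a push happens inside vllm-project, single-arch for PR build checks. A small sketch of the same decision outside of Actions (the variables stand in for the `github` context):

```
event="${1:-pull_request}"   # stand-in for github.event_name
owner="${2:-some-fork}"      # stand-in for github.repository_owner

if [[ "$event" == "push" && "$owner" == "vllm-project" ]]; then
  platforms="linux/amd64,linux/arm64"   # publish: build both architectures
else
  platforms="linux/amd64"               # PR check: amd64 only on this runner
fi
echo "platforms: $platforms"
```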

.github/workflows/image_a3_openeuler.yml

@ -0,0 +1,135 @@
name: 'image / openEuler / a3'
# This is a docker build check and publish job:
# 1. PR Triggered docker image build check
# - is for image build check
# - Enable on main/*-dev branch
# - push: ${{ github.event_name != 'pull_request' }} ==> false
# 2. branches push trigger image publish
# - is for branch/dev/nightly image
# - commits are merged into main/*-dev ==> vllm-ascend:main / vllm-ascend:*-dev
# 3. tags push trigger image publish
# - is for final release image
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3-a3-openeuler / vllm-ascend:v1.2.3rc1-a3-openeuler
on:
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- '.github/workflows/image_a3_openeuler.yml'
- 'Dockerfile.a3.openEuler'
- 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
types: [ labeled ]
push:
# Publish image when tagging; the Dockerfile at that tag will be built as the tag image
branches:
- 'main'
- '*-dev'
tags:
- 'v*'
paths:
- '.github/workflows/image_a3_openeuler.yml'
- 'Dockerfile.a3.openEuler'
- 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
# only cancel in-progress runs of the same workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build:
name: vllm-ascend image build
# Only arm64 build on openEuler arm64, only amd64 build on Ubuntu amd64
# Push event or PR with both 'ready' and 'ready-for-test' labels
runs-on: >-
${{
github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
'ubuntu-latest' ||
'ubuntu-24.04-arm'
}}
if: ${{ github.event_name == 'push' || (contains(github.event.pull_request.labels.*.name, 'ready') && contains(github.event.pull_request.labels.*.name, 'ready-for-test')) }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Print
run: |
lscpu
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# TODO(yikun): add more hub image and a note on release policy for container image
images: |
quay.io/ascend/vllm-ascend
# Note for test case
# https://github.com/marketplace/actions/docker-metadata-action#typeref
# 1. branch job publishes per main/*-dev branch commit
# 2. main and dev pull_request is build only, so the tag pr-N-a3-openeuler is fine
# 3. only pep440 matched tag will be published:
# - v0.7.1 --> v0.7.1-a3-openeuler
# - pre/post/dev: v0.7.1rc1-a3-openeuler/v0.7.1rc1-a3-openeuler/v0.7.1rc1.dev1-a3-openeuler/v0.7.1.post1-a3-openeuler, no latest
# which follow the rule from vLLM with prefix v
# TODO(yikun): the post release might be considered as latest release
tags: |
type=ref,event=branch,suffix=-a3-openeuler
type=ref,event=pr,suffix=-a3-openeuler
type=pep440,pattern={{raw}},suffix=-a3-openeuler
flavor: |
latest=false
- name: Free up disk space
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
with:
tool-cache: true
docker-images: false
- name: Build - Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Build - Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Publish - Login to Quay Container Registry
if: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ vars.QUAY_USERNAME }}
password: ${{ secrets.QUAY_PASSWORD }}
- name: Build and push a3
uses: docker/build-push-action@v6
with:
platforms: >-
${{
github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
'linux/amd64,linux/arm64' ||
'linux/arm64'
}}
# use the current repo path as the build context, ensure .git is contained
context: .
# only trigger when tag, branch/main push
push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta.outputs.tags }}
file: Dockerfile.a3.openEuler
build-args: |
PIP_INDEX_URL=https://pypi.org/simple
provenance: false
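The `runs-on` expression in this job follows the same pattern: pushes from vllm-project land on `ubuntu-latest` (amd64, with arm64 cross-built through QEMU), while PR checks run natively on the arm64 runner. A sketch of that selection with placeholder inputs:

```
event="${1:-pull_request}"
owner="${2:-some-fork}"

if [[ "$event" == "push" && "$owner" == "vllm-project" ]]; then
  runner="ubuntu-latest"      # publish job: amd64 host, arm64 via QEMU
else
  runner="ubuntu-24.04-arm"   # PR check: native arm64 build
fi
echo "runs-on: $runner"
```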

.github/workflows/image_a3_ubuntu.yml

@ -0,0 +1,131 @@
name: 'image / Ubuntu / a3'
# This is a docker build check and publish job:
# 1. PR Triggered docker image build check
# - is for image build check
# - Enable on main/*-dev branch
# - push: ${{ github.event_name != 'pull_request' }} ==> false
# 2. branches push trigger image publish
# - is for branch/dev/nightly image
# - commits are merged into main/*-dev ==> vllm-ascend:main / vllm-ascend:*-dev
# 3. tags push trigger image publish
# - is for final release image
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3-a3|vllm-ascend:v1.2.3rc1-a3
on:
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- '.github/workflows/image_a3_ubuntu.yml'
- 'Dockerfile.a3'
- 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
types: [ labeled ]
push:
# Publish image when tagging; the Dockerfile at that tag will be built as the tag image
branches:
- 'main'
- '*-dev'
tags:
- 'v*'
paths:
- '.github/workflows/image_a3_ubuntu.yml'
- 'Dockerfile.a3'
- 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
# only cancel in-progress runs of the same workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build:
name: vllm-ascend image build
# Only arm64 build on openEuler arm64, only amd64 build on Ubuntu amd64
# Push event or PR with both 'ready' and 'ready-for-test' labels
runs-on: ubuntu-latest
if: ${{ github.event_name == 'push' || (contains(github.event.pull_request.labels.*.name, 'ready') && contains(github.event.pull_request.labels.*.name, 'ready-for-test')) }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Print
run: |
lscpu
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# TODO(yikun): add more hub image and a note on release policy for container image
images: |
quay.io/ascend/vllm-ascend
# Note for test case
# https://github.com/marketplace/actions/docker-metadata-action#typeref
# 1. branch job publishes per main/*-dev branch commit
# 2. main and dev pull_request is build only, so the tag pr-N-a3 is fine
# 3. only pep440 matched tag will be published:
# - v0.7.1 --> v0.7.1-a3
# - pre/post/dev: v0.7.1rc1-a3/v0.7.1rc1-a3/v0.7.1rc1.dev1-a3/v0.7.1.post1-a3, no latest
# which follow the rule from vLLM with prefix v
# TODO(yikun): the post release might be considered as latest release
tags: |
type=ref,event=branch,suffix=-a3
type=ref,event=pr,suffix=-a3
type=pep440,pattern={{raw}},suffix=-a3
flavor: |
latest=false
- name: Free up disk space
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
with:
tool-cache: true
docker-images: false
- name: Build - Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Build - Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Publish - Login to Quay Container Registry
if: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ vars.QUAY_USERNAME }}
password: ${{ secrets.QUAY_PASSWORD }}
- name: Build and push a3
uses: docker/build-push-action@v6
with:
platforms: >-
${{
github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
'linux/amd64,linux/arm64' ||
'linux/amd64'
}}
# use the current repo path as the build context, ensure .git is contained
context: .
file: Dockerfile.a3
# only trigger when tag, branch/main push
push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta.outputs.tags }}
build-args: |
PIP_INDEX_URL=https://pypi.org/simple
provenance: false
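Only tags that look like a pep440 version with a leading `v` end up as release image tags (here with the `-a3` suffix). A rough, simplified check of that rule — the real matching is done by the metadata-action `type=pep440` pattern, not by this regex, and the sample tags are illustrative:

```
pep440='^v[0-9]+(\.[0-9]+)*((a|b|rc)[0-9]+)?(\.post[0-9]+)?(\.dev[0-9]+)?$'
is_release_tag() { [[ "$1" =~ $pep440 ]]; }

for t in v0.7.1 v0.7.1rc1 v0.7.1rc1.dev1 v0.7.1.post1 nightly-build; do
  if is_release_tag "$t"; then
    echo "$t -> published as ${t}-a3 (no latest)"
  else
    echo "$t -> skipped"
  fi
done
```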

.github/workflows/image_openeuler.yml

@ -0,0 +1,134 @@
name: 'image / openEuler'
# This is a docker build check and publish job:
# 1. PR Triggered docker image build check
# - is for image build check
# - Enable on main/*-dev branch
# - push: ${{ github.event_name != 'pull_request' }} ==> false
# 2. branches push trigger image publish
# - is for branch/dev/nightly image
# - commits are merged into main/*-dev ==> vllm-ascend:main-openeuler / vllm-ascend:*-dev-openeuler
# 3. tags push trigger image publish
# - is for final release image
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3-openeuler / vllm-ascend:v1.2.3rc1-openeuler
on:
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- '.github/workflows/image_openeuler.yml'
- 'Dockerfile.openEuler'
- 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
types: [ labeled ]
push:
# Publish image when tagging; the Dockerfile at that tag will be built as the tag image
branches:
- 'main'
- '*-dev'
tags:
- 'v*'
paths:
- '.github/workflows/image_openeuler.yml'
- 'Dockerfile.openEuler'
- 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
# only cancel in-progress runs of the same workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build:
name: vllm-ascend image build
# Only arm64 build on openEuler arm64, only amd64 build on Ubuntu amd64
# Push event or PR with both 'ready' and 'ready-for-test' labels
runs-on: >-
${{
github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
'ubuntu-latest' ||
'ubuntu-24.04-arm'
}}
if: ${{ github.event_name == 'push' || (contains(github.event.pull_request.labels.*.name, 'ready') && contains(github.event.pull_request.labels.*.name, 'ready-for-test')) }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Print
run: |
lscpu
- name: Docker meta
id: meta
uses: docker/metadata-action@v5
with:
# TODO(yikun): add more hub image and a note on release policy for container image
images: |
quay.io/ascend/vllm-ascend
# Note for test case
# https://github.com/marketplace/actions/docker-metadata-action#typeref
# 1. branch job publishes per main/*-dev branch commit
# 2. main and dev pull_request is build only, so the tag pr-N-openeuler is fine
# 3. only pep440 matched tag will be published:
# - v0.7.1 --> v0.7.1-openeuler
# - pre/post/dev: v0.7.1rc1-openeuler/v0.7.1rc1-openeuler/v0.7.1rc1.dev1-openeuler/v0.7.1.post1-openeuler, no latest
# which follow the rule from vLLM with prefix v
# TODO(yikun): the post release might be considered as latest release
tags: |
type=ref,event=branch,suffix=-openeuler
type=ref,event=pr,suffix=-openeuler
type=pep440,pattern={{raw}},suffix=-openeuler
flavor: |
latest=true
- name: Free up disk space
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
with:
tool-cache: true
docker-images: false
- name: Build - Set up QEMU
uses: docker/setup-qemu-action@v3
- name: Build - Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Publish - Login to Quay Container Registry
if: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ vars.QUAY_USERNAME }}
password: ${{ secrets.QUAY_PASSWORD }}
- name: Build and push 910b
uses: docker/build-push-action@v6
with:
platforms: >-
${{
github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
'linux/amd64,linux/arm64' ||
'linux/arm64'
}}
# use the current repo path as the build context, ensure .git is contained
context: .
# only trigger when tag, branch/main push
push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta.outputs.tags }}
file: Dockerfile.openEuler
build-args: |
PIP_INDEX_URL=https://pypi.org/simple
provenance: false
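All of these image workflows share the same concurrency shape: one group per workflow and ref with `cancel-in-progress: true`, so a newer push to the same branch or PR supersedes the build that is still running, while different refs build in parallel. A tiny sketch of the group key (refs are placeholders):

```
workflow="image / openEuler"
for ref in refs/heads/main refs/pull/123/merge; do
  echo "concurrency group: ${workflow}-${ref}"   # same key => the newer run cancels the older one
done
```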


@ -1,4 +1,4 @@
name: 'image'
name: 'image / Ubuntu'
# This is a docker build check and publish job:
# 1. PR Triggered docker image build check
# - is for image build check
@ -9,16 +9,23 @@ name: 'image'
# - commits are merged into main/*-dev ==> vllm-ascend:main / vllm-ascend:*-dev
# 3. tags push trigger image publish
# - is for final release image
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3|latest / vllm-ascend:v1.2.3rc1
# - Publish when tag with v* (pep440 version) ===> vllm-ascend:v1.2.3 / vllm-ascend:v1.2.3rc1
on:
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- '.github/workflows/image.yml'
- '.github/workflows/image_ubuntu.yml'
- 'Dockerfile'
- 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
types: [ labeled ]
push:
# Publish image when tagging; the Dockerfile at that tag will be built as the tag image
branches:
@ -27,17 +34,34 @@ on:
tags:
- 'v*'
paths:
- '.github/workflows/image.yml'
- '.github/workflows/image_ubuntu.yml'
- 'Dockerfile'
- 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
# only cancel in-progress runs of the same workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
build:
name: vllm-ascend image
name: vllm-ascend image build
# Only arm64 build on openEuler arm64, only amd64 build on Ubuntu amd64
# Push event or PR with both 'ready' and 'ready-for-test' labels
runs-on: ubuntu-latest
if: ${{ github.event_name == 'push' || (contains(github.event.pull_request.labels.*.name, 'ready') && contains(github.event.pull_request.labels.*.name, 'ready-for-test')) }}
steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0
persist-credentials: false
- name: Print
run: |
@ -63,6 +87,8 @@ jobs:
type=ref,event=branch
type=ref,event=pr
type=pep440,pattern={{raw}}
flavor: |
latest=true
- name: Free up disk space
uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1
@ -71,31 +97,35 @@ jobs:
docker-images: false
- name: Build - Set up QEMU
uses: docker/setup-qemu-action@v2
# TODO(yikun): remove this after https://github.com/docker/setup-qemu-action/issues/198 resolved
with:
image: tonistiigi/binfmt:qemu-v7.0.0-28
uses: docker/setup-qemu-action@v3
- name: Build - Set up Docker Buildx
uses: docker/setup-buildx-action@v2
uses: docker/setup-buildx-action@v3
- name: Publish - Login to Quay Container Registry
if: ${{ github.event_name == 'push' }}
if: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
uses: docker/login-action@v3
with:
registry: quay.io
username: ${{ vars.QUAY_USERNAME }}
password: ${{ secrets.QUAY_PASSWORD }}
- name: Build and push
- name: Build and push 910b
uses: docker/build-push-action@v6
with:
platforms: linux/amd64,linux/arm64
cache-from: type=gha
cache-to: type=gha,mode=max
platforms: >-
${{
github.event_name == 'push' && github.repository_owner == 'vllm-project' &&
'linux/amd64,linux/arm64' ||
'linux/amd64'
}}
# use the current repo path as the build context, ensure .git is contained
context: .
file: Dockerfile
# only trigger when tag, branch/main push
push: ${{ github.event_name != 'pull_request' }}
push: ${{ github.event_name == 'push' && github.repository_owner == 'vllm-project' }}
labels: ${{ steps.meta.outputs.labels }}
tags: ${{ steps.meta.outputs.tags }}
build-args: |
PIP_INDEX_URL=https://pypi.org/simple
PIP_INDEX_URL=https://pypi.org/simple
provenance: false
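The most security-relevant change in this diff is the `push:` condition: the old rule pushed for any non-PR event, the new one only for push events inside the vllm-project org. A sketch contrasting the two under example inputs:

```
event="schedule"    # example: a scheduled run
owner="some-fork"   # example: a fork

old_push=$([[ "$event" != "pull_request" ]] && echo true || echo false)
new_push=$([[ "$event" == "push" && "$owner" == "vllm-project" ]] && echo true || echo false)
echo "old rule would push: $old_push"   # true: any non-PR event qualified
echo "new rule would push: $new_push"   # false: only org pushes qualify
```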


@ -0,0 +1,20 @@
name: "Merge Conflict Labeler"
on:
# So that PRs touching the same files as the push are updated
push:
# So that the `dirtyLabel` is removed if conflicts are resolved
# We recommend `pull_request_target` so that github secrets are available.
# In `pull_request` we wouldn't be able to change labels of fork PRs
pull_request_target:
types: [synchronize]
jobs:
main:
runs-on: ubuntu-latest
steps:
- name: check if prs are dirty
uses: eps1lon/actions-label-merge-conflict@v3
with:
dirtyLabel: "merge-conflicts"
repoToken: "${{ secrets.GITHUB_TOKEN }}"
commentOnDirty: "This pull request has conflicts, please resolve those before we can evaluate the pull request."
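Conceptually the action asks GitHub whether the PR is still mergeable and labels it when it is not. A hypothetical sketch of that check with plain curl and jq (the real action also handles the `null` state while GitHub is still computing mergeability, retries, and the label/comment updates; GITHUB_TOKEN and the PR number are placeholders):

```
repo="vllm-project/vllm-ascend"
pr=123   # placeholder PR number
mergeable=$(curl -s -H "Authorization: Bearer $GITHUB_TOKEN" \
  "https://api.github.com/repos/${repo}/pulls/${pr}" | jq -r '.mergeable')
# .mergeable is true/false, or null while GitHub is still computing it.
if [[ "$mergeable" == "false" ]]; then
  echo "would add the merge-conflicts label and post the dirty comment"
fi
```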

.github/workflows/labeler.yml

@ -0,0 +1,18 @@
name: Pull Request Labeler
on: pull_request_target
jobs:
label:
name: Label
runs-on: ubuntu-latest
permissions:
contents: read
pull-requests: write
steps:
- name: Label the PR
uses: actions/labeler@v6
with:
repo-token: ${{ secrets.GITHUB_TOKEN }}
configuration-path: .github/labeler.yml
sync-labels: true

.github/workflows/multi_node_test.yaml

@ -0,0 +1,109 @@
name: 'e2e test / multi-dp'
on:
schedule:
- cron: "0 */4 * * *"
workflow_dispatch:
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
e2e:
# This is a runner with no NPU for k8s controller
runs-on: linux-aarch64-a3-0
container:
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
env:
KUBECONFIG: /tmp/kubeconfig
KUBECTL: /root/.cache/.kube/kubectl
NAMESPACE: vllm-project
LEADER_POD: vllm-0
steps:
- name: Install system dependencies
run: |
# configure apt and pip source
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
apt-get update -y && apt-get install -y git curl
TOKEN=`echo -n "x-access-token:${{ secrets.ADMIN_PTA }}" | base64`
git config --global http.https://gh-proxy.test.osinfra.cn/.extraheader "AUTHORIZATION: basic $TOKEN"
- name: Install kubectl
run: |
install -o root -g root -m 0755 $KUBECTL /usr/local/bin/kubectl
# get kubeconfig from secret
echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
- name: Checkout code
uses: actions/checkout@v4
- name: Prepare scripts
run: |
# prepare for lws entrypoint scripts
install -D tests/e2e/multi_node/scripts/run.sh /root/.cache/tests/run.sh
- name: Launch cluster
run: |
kubectl apply -f tests/e2e/multi_node/scripts/lws.yaml
- name: Waiting for pod ready
run: |
echo "waiting for Pod [$LEADER_POD] in namespace [$NAMESPACE] to Ready..."
while true; do
# get pod status
READY_STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.containerStatuses[*].ready}')
if [[ "$READY_STATUS" == "true" ]]; then
echo "✅ Pod [$LEADER_POD] is Ready!"
break
else
echo "Pod [$LEADER_POD] not ready, waiting..."
sleep 3
fi
done
- name: Stream logs and monitor pod health
run: |
set -euo pipefail
echo "🚀 Start streaming logs for Pod [$LEADER_POD] ..."
kubectl logs -f "$LEADER_POD" -n "$NAMESPACE" &
LOG_PID=$!
echo "Start monitoring Pod [$LEADER_POD] status ..."
while true; do
STATUS=$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" -o jsonpath='{.status.phase}')
if [[ "$STATUS" != "Running" && "$STATUS" != "Succeeded" ]]; then
echo "❌ Pod [$LEADER_POD] exited abnormally with status: $STATUS"
kubectl describe pod "$LEADER_POD" -n "$NAMESPACE" || true
kubectl logs "$LEADER_POD" -n "$NAMESPACE" --previous --all-containers || true
kill $LOG_PID || true
exit 1
fi
sleep 5
done &
MONITOR_PID=$!
wait $LOG_PID || true
kill $MONITOR_PID || true
- name: Post process
if: always()
run: |
kubectl get pods -n $NAMESPACE
kubectl delete -f tests/e2e/multi_node/scripts/lws.yaml
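The readiness loop above spins forever if the pod never becomes Ready. A variant with an explicit timeout (the 30-minute budget is an arbitrary assumption; it reuses the same kubectl/kubeconfig setup as the job) fails the run instead of hanging until the runner is reclaimed:

```
NAMESPACE=vllm-project
LEADER_POD=vllm-0
deadline=$((SECONDS + 1800))   # 30 min budget (assumption)

until [[ "$(kubectl get pod "$LEADER_POD" -n "$NAMESPACE" \
      -o jsonpath='{.status.containerStatuses[*].ready}')" == "true" ]]; do
  (( SECONDS < deadline )) || { echo "Pod [$LEADER_POD] never became Ready" >&2; exit 1; }
  echo "Pod [$LEADER_POD] not ready, waiting..."
  sleep 3
done
echo "Pod [$LEADER_POD] is Ready"
```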


@ -1,78 +0,0 @@
#
# Adapted from vllm-project/vllm/blob/main/.github
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
name: mypy
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- 'main'
- '*-dev'
paths:
- '**/*.py'
- '.github/workflows/mypy.yaml'
- 'tools/mypy.sh'
- 'mypy.ini'
pull_request:
branches:
- 'main'
- '*-dev'
# This workflow is only relevant when one of the following files changes.
# However, we have github configured to expect and require this workflow
# to run and pass before GitHub will auto-merge a pull request. Until GitHub
# allows more flexible auto-merge policy, we can just run this on every PR.
# It doesn't take that long to run, anyway.
paths:
- '**/*.py'
- '.github/workflows/mypy.yaml'
- 'tools/mypy.sh'
- 'mypy.ini'
jobs:
mypy:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.9", "3.10", "3.11", "3.12"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
pip install -r requirements-dev.txt
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
path: vllm-empty
- name: Install vllm-project/vllm from source
working-directory: vllm-empty
run: |
pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
VLLM_TARGET_DEVICE=empty pip install .
- name: Mypy
run: |
echo "::add-matcher::.github/workflows/matchers/mypy.json"
tools/mypy.sh 1 ${{ matrix.python-version }}


@ -0,0 +1,206 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
name: 'ascend test / performance'
# This workflow runs nightly benchmarks for vllm-ascend.
on:
schedule:
# Run benchmarks at 20:00 and 03:00 Beijing time (UTC+8)
- cron: "0 12 * * *"
- cron: "0 19 * * *"
workflow_dispatch:
# Allow manual triggering of the workflow
pull_request:
types: [ labeled ]
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only 1 job can runs on static-8-01-cards
concurrency:
group: static-8-01-cards
cancel-in-progress: false
jobs:
test:
if: ${{ contains(github.event.pull_request.labels.*.name, 'performance-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
name: Benchmarks/vLLM=${{ matrix.vllm_branch }}, vLLM-Ascend=${{ matrix.vllm_ascend_branch }}, use_v1=${{ matrix.vllm_use_v1 }}
runs-on: 'linux-arm64-npu-static-8'
strategy:
matrix:
include:
- vllm_branch: v0.11.0
vllm_ascend_branch: main
vllm_use_v1: 1
max-parallel: 1
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
volumes:
- /usr/local/dcmi:/usr/local/dcmi
- /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
- /usr/local/Ascend/driver/:/usr/local/Ascend/driver/
# Use self-host cache speed up pip and model download
- /home/action/.cache:/github/home/.cache/
options: >-
--device /dev/davinci0
--device /dev/davinci1
--device /dev/davinci_manager
--device /dev/devmm_svm
--device /dev/hisi_hdc
env:
VLLM_USE_MODELSCOPE: True
ES_OM_DOMAIN: ${{ secrets.ES_OM_DOMAIN }}
ES_OM_AUTHORIZATION: ${{ secrets.ES_OM_AUTHORIZATION }}
VLLM_USE_V1: ${{ matrix.vllm_use_v1 }}
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
# keep using tuna's proxy since linux-arm64-npu-static-8 is in another region
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
- name: Install system dependencies
run: |
apt-get update -y
apt-get -y install git jq wget curl lsof gcc g++ cmake libnuma-dev
- name: Config git
run: |
git config --global --add safe.directory "$GITHUB_WORKSPACE"
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
path: ./vllm-empty
ref: ${{ matrix.vllm_branch }}
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -e .
pip install -r benchmarks/requirements-bench.txt
- name: Run current commit benchmarks
if: github.event_name != 'schedule' && github.event_name != 'workflow_dispatch'
run: |
# Sometimes we only want to run benchmarks on the current commit
# This is useful for debugging or a release benchmark
bash benchmarks/scripts/run-performance-benchmarks.sh
# Convert the benchmark results to markdown format
python3 benchmarks/scripts/convert_json_to_markdown.py
- name: Generate step summary
if: github.event_name != 'schedule' && github.event_name != 'workflow_dispatch'
run: |
cat ./benchmarks/results/benchmark_results.md >> $GITHUB_STEP_SUMMARY
- name: Upload benchmark artifacts
if: github.event_name != 'schedule' && github.event_name != 'workflow_dispatch'
uses: actions/upload-artifact@v4
with:
name: "benchmark-performance-${{ matrix.vllm_branch }}-${{ matrix.vllm_ascend_branch }}-report"
path: ./benchmarks/results/benchmark_results.md
if-no-files-found: warn
retention-days: 90
overwrite: true
- name: Install elastic_tool
if: github.event_name != 'pull_request'
run: |
pip install escli-tool==0.2.3
- name: Collect pr info from vllm-project/vllm-ascend
if: github.event_name != 'pull_request'
run: |
# Only get the pull requests which may influence performance
git log --pretty=format:"%H %s" -- '**/*.py' ':!docs/*' ':!tests/*' ':!examples/*' ':!benchmarks/*' > commit_log.txt
escli check commit_log.txt
- name: Prepare benchmark script in advance
if: github.event_name != 'pull_request'
# This is for the benchmark iteration, which changes the benchmark scripts while checking out each commit.
# We need to ensure the benchmark scripts are always available.
run: |
# Prepare the benchmark script in advance
mkdir -p /github/home/benchmarks
cp -r benchmarks/* /github/home/benchmarks/
- name: Run benchmark iteration
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
if: github.event_name != 'pull_request'
run: |
while IFS= read -r line || [[ -n "$line" ]]; do
commit_id=${line%% *}
commit_title=${line#* }
git checkout $commit_id
commit_time=$(git show -s --format=%cd $commit_id --date=iso-strict)
commit_time_no_tz=${commit_time::19}
pip install -e .
echo "------------------------"
echo "commit_id: $commit_id"
echo "commit_title: $commit_title"
echo "commit_time: $commit_time_no_tz"
echo "vllm branch: ${{ matrix.vllm_branch }}"
echo "vllm-ascend branch: ${{ matrix.vllm_ascend_branch }}"
echo "------------------------"
cd /github/home
ERROR_MSG=""
if ! bash benchmarks/scripts/run-performance-benchmarks.sh; then
ERROR_MSG="Benchmark failed to run"
fi
# send the result to es
escli add --vllm_branch ${{ matrix.vllm_branch }} \
--vllm_ascend_branch ${{ matrix.vllm_ascend_branch }} \
--commit_id $commit_id \
--commit_title "$commit_title" \
--created_at "$commit_time_no_tz" \
--res_dir ./benchmarks/results \
--error "$ERROR_MSG" \
--extra_feat '{"VLLM_USE_V1": "${{ matrix.vllm_use_v1 }}"}'
rm -rf ./benchmarks/results
cd -
done < commit_log.txt
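The loop above leans on `git log --pretty=format:"%H %s"` putting the full hash before the first space and the subject after it. A standalone sketch of just that parsing step, fed with made-up commits:

```
printf '%s\n' \
  "0123456789abcdef0123456789abcdef01234567 Fix scheduler regression" \
  "89abcdef0123456789abcdef0123456789abcdef [Perf] Speed up MoE dispatch" |
while IFS= read -r line; do
  commit_id=${line%% *}     # everything before the first space: the hash
  commit_title=${line#* }   # everything after it: the subject
  echo "id=${commit_id:0:8} title=${commit_title}"
done
```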

.github/workflows/pre-commit.yml

@ -0,0 +1,43 @@
name: pre-commit
on:
workflow_call:
inputs:
vllm:
required: true
type: string
permissions:
contents: read
jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: "3.11"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
path: ./vllm-empty
ref: ${{ inputs.vllm }}
- name: Install vllm
working-directory: vllm-empty
run: |
pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
VLLM_TARGET_DEVICE=empty pip install .
- name: Install vllm-ascend dev
run: |
pip install -r requirements-dev.txt --extra-index-url https://download.pytorch.org/whl/cpu
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
env:
SHELLCHECK_OPTS: "--exclude=SC2046,SC2006,SC2086" # Exclude SC2046, SC2006, SC2086 for actionlint
with:
extra_args: --all-files --hook-stage manual
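The same checks can be reproduced locally. The workflow installs vllm and requirements-dev.txt first so hooks that import the code can resolve it; the pre-commit invocation itself boils down to roughly:

```
pip install pre-commit
pre-commit run --all-files --hook-stage manual
```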

.github/workflows/release_code.yml

@ -0,0 +1,75 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: build / sdist
on:
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- '.github/workflows/release_code.yml'
- 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
push:
tags:
- 'v*'
jobs:
build:
name: release code
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.11"]
steps:
- uses: actions/checkout@ff7abcd0c3c05ccf6adc123a8cd1fd4fb30fb493 # v4.2.2
- name: Print
run: |
lscpu
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python3 -m pip install twine setuptools_scm
- name: Generate tar.gz
run: |
python3 setup.py sdist
ls dist
- name: Archive tar.gz
uses: actions/upload-artifact@v4
with:
name: vllm-ascend-src
path: dist/*
- name: Release
if: startsWith(github.ref, 'refs/tags/')
run: |
python3 -m twine upload dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
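A local dry run of the sdist release, using `twine check` to validate the archive metadata without uploading anything (versions come from setuptools_scm, so a git checkout is assumed):

```
python3 -m pip install twine setuptools_scm
python3 setup.py sdist
ls dist
python3 -m twine check dist/*
```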

.github/workflows/release_whl.yml

@ -0,0 +1,119 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: build / wheel
on:
schedule:
# Runs at 23:00 UTC (7:00 AM Beijing) every day
- cron: '0 23 * * *'
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- '.github/workflows/release_whl.yml'
- '.github/Dockerfile.buildwheel'
- 'vllm_ascend/**'
- 'setup.py'
- 'pyproject.toml'
- 'requirements.txt'
- 'cmake/**'
- 'CMakeLists.txt'
- 'csrc/**'
push:
tags:
- 'v*'
jobs:
build:
name: build and release wheel
strategy:
matrix:
os: [ubuntu-24.04, ubuntu-24.04-arm]
# PR only trigger latest version
python-version: ${{ fromJSON(
(github.event_name == 'pull_request' && '["3.11"]') ||
'["3.9", "3.10", "3.11"]'
) }}
runs-on: ${{ matrix.os }}
steps:
- uses: actions/checkout@ff7abcd0c3c05ccf6adc123a8cd1fd4fb30fb493 # v4.2.2
- name: Print
run: |
lscpu
- name: Build wheel
run: |
ls
docker build -f ./.github/Dockerfile.buildwheel \
--build-arg PY_VERSION=${{ matrix.python-version }} \
-t wheel:v1 .
docker run --rm \
-u $(id -u):$(id -g) \
-v $(pwd):/outpwd \
wheel:v1 \
bash -c "cp -r /workspace/vllm-ascend/dist /outpwd"
ls dist
- name: Set up Python ${{ matrix.python-version }}
if: startsWith(github.ref, 'refs/tags/')
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: ${{ matrix.python-version }}
- name: Repair wheels with auditwheel
run: |
python3 -m pip install auditwheel
python3 -m pip install patchelf
mkdir -p dist/repaired
for whl in dist/*.whl; do
auditwheel repair "$whl" -w dist/repaired/ \
--exclude libplatform.so \
--exclude libregister.so \
--exclude libge_common_base.so \
--exclude libc10.so \
--exclude libc_sec.so \
--exclude "libascend*.so" \
--exclude "libtorch*.so" \
--exclude "liberror_manager.so"
done
rm -f dist/*.whl
mv dist/repaired/*.whl dist/
rmdir dist/repaired
ls dist
- name: Verify automatic platform tags
run: |
cd dist
for wheel in *.whl; do
echo "verification file: $wheel"
auditwheel show "$wheel"
done
- name: Archive wheel
uses: actions/upload-artifact@v4
with:
name: vllm-ascend-${{ matrix.os }}-py${{ matrix.python-version }}-wheel
path: dist/*
- name: Release
if: startsWith(github.ref, 'refs/tags/')
run: |
python3 -m pip install twine
python3 -m twine upload --verbose dist/* -u __token__ -p ${{ secrets.PYPI_TOKEN }}
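A quick local smoke test of a repaired wheel before (or instead of) uploading; the `vllm_ascend` package name matches this repo, the exact filename is whatever `ls dist` printed, and importing may still require the CANN / torch-npu runtime to be installed:

```
python3 -m pip install dist/vllm_ascend-*.whl
python3 -c "import vllm_ascend; print(vllm_ascend.__file__)"   # may need the Ascend runtime present
```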

.github/workflows/reminder_comment.yml

@ -0,0 +1,26 @@
name: PR Reminder Comment Bot
permissions:
pull-requests: write
on:
pull_request_target:
types: [opened]
jobs:
pr_reminder:
runs-on: ubuntu-latest
steps:
- name: Remind to run full CI on PR
uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
with:
script: |
github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: context.issue.number,
body: '👋 Hi! Thank you for contributing to the vLLM Ascend project. The following points will speed up your PR merge:\n\n' +
'- A PR should do only one thing, smaller PRs enable faster reviews.\n' +
'- Every PR should include unit tests and end-to-end tests to ensure it works and is not broken by other future PRs.\n' +
'- Write the commit message by fulfilling the PR description to help reviewer and future developers understand.\n\n' +
'If CI fails, you can run linting and testing checks locally according [Contributing](https://vllm-ascend.readthedocs.io/zh-cn/latest/developer_guide/contribution/index.html) and [Testing](https://vllm-ascend.readthedocs.io/zh-cn/latest/developer_guide/contribution/testing.html).'
})
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
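The github-script step wraps the standard "create an issue comment" REST call. Roughly the same request with plain curl, where OWNER, REPO and PR_NUMBER are placeholders:

```
curl -s -X POST \
  -H "Authorization: Bearer $GITHUB_TOKEN" \
  -H "Accept: application/vnd.github+json" \
  "https://api.github.com/repos/OWNER/REPO/issues/PR_NUMBER/comments" \
  -d '{"body": "👋 Hi! Thank you for contributing to the vLLM Ascend project."}'
```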


@ -1,59 +0,0 @@
#
# Adapted from vllm-project/vllm/blob/main/.github
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
name: ruff
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- 'main'
- '*-dev'
paths:
- "**/*.py"
- requirements-lint.txt
- .github/workflows/matchers/ruff.json
- .github/workflows/ruff.yml
pull_request:
branches:
- 'main'
- '*-dev'
jobs:
ruff:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements-lint.txt
- name: Analysing the code with ruff
run: |
echo "::add-matcher::.github/workflows/matchers/ruff.json"
ruff check --output-format github .
- name: Run isort
run: |
isort . --check-only


@ -1,56 +0,0 @@
#
# Adapted from vllm-project/vllm/blob/main/.github
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
name: Lint shell scripts
on:
push:
branches:
- 'main'
- '*-dev'
paths:
- '**/*.sh'
- '.github/workflows/shellcheck.yml'
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- '**/*.sh'
- '.github/workflows/shellcheck.yml'
env:
LC_ALL: en_US.UTF-8
defaults:
run:
shell: bash
permissions:
contents: read
jobs:
shellcheck:
runs-on: ubuntu-latest
steps:
- name: "Checkout"
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
with:
fetch-depth: 0
- name: "Check shell scripts"
run: |
tools/shellcheck.sh

.github/workflows/vllm_ascend_dist.yaml

@ -0,0 +1,100 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: 'e2e test / a3-test'
on:
workflow_call:
pull_request:
types: [ labeled ]
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 8 cards test type
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
e2e:
# only trigger the e2e test after lint passed and the pull request change is e2e related.
if: ${{ contains(github.event.pull_request.labels.*.name, 'dist-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'workflow_dispatch' }}
strategy:
matrix:
os: [linux-aarch64-a3-8]
vllm_version: [v0.11.0]
name: vLLM Ascend test
runs-on: ${{ matrix.os }}
container:
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
env:
DEBIAN_FRONTEND: noninteractive
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
apt-get update -y
apt install git -y
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ matrix.vllm_version }}
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
run: |
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
pip install -r requirements-dev.txt
pip install -v -e .
- name: Run vllm-project/vllm-ascend test for V1 Engine
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
run: |
# TODO: enable more tests
pytest -sv tests/e2e/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_multistream_moe


@ -0,0 +1,87 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: 'ascend test / doctest'
on:
workflow_dispatch:
pull_request:
branches:
- 'main'
- '*-dev'
paths:
# If we are changing the doctest we should do a PR test
- '.github/workflows/vllm_ascend_doctest.yaml'
- 'tests/e2e/doctests/**'
- 'tests/e2e/common.sh'
- 'tests/e2e/run_doctests.sh'
schedule:
# Runs every 12 hours
- cron: '0 */12 * * *'
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
jobs:
test:
strategy:
# Each version should be tested
fail-fast: false
matrix:
vllm_version: [v0.9.1-dev, v0.9.1-dev-openeuler, main, main-openeuler]
name: vLLM Ascend test
runs-on: linux-aarch64-a2-1
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/vllm-ascend:${{ matrix.vllm_version }}
steps:
- name: Check NPU/CANN and git info
run: |
echo "====> Print NPU/CANN info"
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
echo "====> Print vllm-ascend git info"
cd /vllm-workspace/vllm-ascend
git --no-pager log -1 || true
echo "====> Print vllm git info"
cd /vllm-workspace/vllm
git --no-pager log -1 || true
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Run vllm-ascend/tests/e2e/run_doctests.sh
run: |
# PWD: /__w/vllm-ascend/vllm-ascend
# Make sure e2e tests are latest
echo "Replacing /vllm-workspace/vllm-ascend/tests/e2e ..."
rm -rf /vllm-workspace/vllm-ascend/tests/e2e
mkdir -p /vllm-workspace/vllm-ascend/tests
# Overwrite e2e and examples
cp -r tests/e2e /vllm-workspace/vllm-ascend/tests/
cp -r examples /vllm-workspace/vllm-ascend/
# Simulate container to enter directory
cd /workspace
# Run real test
echo "Test:"
/vllm-workspace/vllm-ascend/tests/e2e/run_doctests.sh


@ -1,6 +1,5 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -13,30 +12,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: 'e2e test'
name: 'ascend test'
on:
push:
branches:
- 'main'
- '*-dev'
paths:
- '*.txt'
- '**/*.py'
- '.github/workflows/vllm_ascend_test.yaml'
- '!docs/**'
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- '*.txt'
- '**/*.py'
- '.github/workflows/vllm_ascend_test.yaml'
- '!docs/**'
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
@ -44,78 +32,126 @@ defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 1 card / 4 cards test type
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
test:
name: vLLM Ascend test (self-host)
runs-on: ascend-arm64 # actionlint-ignore: runner-label
lint:
uses: ./.github/workflows/pre-commit.yml
with:
vllm: v0.11.0
container:
image: quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
volumes:
- /usr/local/dcmi:/usr/local/dcmi
- /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
- /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/
# Use self-host cache speed up pip and model download
- /home/action/actions-runner/_work/cache:/github/home/.cache/
options: >-
--device /dev/davinci6
--device /dev/davinci_manager
--device /dev/devmm_svm
--device /dev/hisi_hdc
env:
HF_ENDPOINT: https://hf-mirror.com
changes:
runs-on: ubuntu-latest
outputs:
e2e_tracker: ${{ steps.filter.outputs.e2e_tracker }}
ut_tracker: ${{ steps.filter.outputs.ut_tracker }}
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- uses: actions/checkout@v4
- uses: dorny/paths-filter@v3
id: filter
with:
filters: |
e2e_tracker:
- '.github/workflows/vllm_ascend_test.yaml'
- 'vllm_ascend/**'
- 'csrc/**'
- 'cmake/**'
- 'tests/e2e/**'
- 'CMakeLists.txt'
- 'setup.py'
- 'requirements.txt'
- 'requirements-dev.txt'
- 'requirements-lint.txt'
- 'packages.txt'
ut_tracker:
- 'tests/ut/**'
- name: Config mirrors
run: |
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install system dependencies
ut:
needs: [lint, changes]
name: unit test
# only trigger the unit test after lint passed and the change is e2e- or ut-related.
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
runs-on: ubuntu-latest
container:
image: quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
strategy:
matrix:
vllm_version: [v0.11.0]
steps:
- name: Install packages
run: |
apt-get update -y
apt-get -y install `cat packages.txt`
- name: Install dependencies
run: |
pip install -r requirements-dev.txt
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ matrix.vllm_version }}
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
VLLM_TARGET_DEVICE=empty python3 -m pip install . --extra-index https://download.pytorch.org/whl/cpu/
python3 -m pip uninstall -y triton
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install vllm-project/vllm-ascend
run: |
pip install -e .
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
python3 -m pip install -r requirements-dev.txt --extra-index https://download.pytorch.org/whl/cpu/
python3 -m pip install -v . --extra-index https://download.pytorch.org/whl/cpu/
- name: Install pta
- name: Run unit test
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
TORCH_DEVICE_BACKEND_AUTOLOAD: 0
run: |
mkdir pta
cd pta
wget https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/Daily/v2.5.1/20250218.4/pytorch_v2.5.1_py310.tar.gz
tar -xvf pytorch_v2.5.1_py310.tar.gz
pip install ./torch_npu-2.5.1.dev20250218-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl
cd ..
rm -rf pta
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
pytest -sv --cov --cov-report=xml:unittests-coverage.xml tests/ut \
--ignore=tests/ut/test_platform.py \
--ignore=tests/ut/patch/worker/patch_common/test_patch_minicpm.py \
--ignore=tests/ut/core/test_scheduler.py \
--ignore=tests/ut/kv_connector/test_llmdatadist_connector.py \
--ignore=tests/ut/kv_connector/test_mooncake_connector.py \
--ignore=tests/ut/kv_connector/test_remote_decode_lifecycle.py \
--ignore=tests/ut/kv_connector/test_remote_prefill_lifecycle.py \
--ignore=tests/ut/torchair/models/test_torchair_deepseek_v2.py \
- name: Run vllm-project/vllm-ascend test
run: |
pytest -sv tests
- name: Upload coverage to Codecov
# only upload coverage when commits merged
if: github.event_name == 'push' && github.ref == 'refs/heads/main'
uses: codecov/codecov-action@v5
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
with:
flags: unittests
name: vllm-ascend
verbose: true
- name: Run vllm-project/vllm test
run: |
pytest -sv
e2e-light:
name: e2e-light
strategy:
matrix:
vllm_version: [v0.11.0]
# Note (yikun): If CI resources are limited we can split this into two chained jobs
needs: [lint, changes]
# only trigger the e2e test after lint passed and the pull request change is e2e related.
if: ${{ github.event_name == 'pull_request' && needs.lint.result == 'success' && needs.changes.outputs.e2e_tracker == 'true' && !contains(github.event.pull_request.labels.*.name, 'ready') }}
uses: ./.github/workflows/_e2e_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
runner: linux-aarch64-a2
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
type: light
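The `changes` job above gates everything else on two path filters. A rough sketch of what that gating boils down to — the file list is illustrative, the globs mirror the filter (which additionally watches the workflow files themselves):

```
e2e_tracker=false; ut_tracker=false
for f in vllm_ascend/ops/moe.py tests/ut/test_scheduler.py docs/index.md; do
  case "$f" in
    vllm_ascend/*|csrc/*|cmake/*|tests/e2e/*|CMakeLists.txt|setup.py|requirements*.txt|packages.txt)
      e2e_tracker=true ;;
  esac
  case "$f" in
    tests/ut/*) ut_tracker=true ;;
  esac
done
echo "e2e_tracker=$e2e_tracker ut_tracker=$ut_tracker"   # both true for this example list
```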


@ -0,0 +1,117 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: 'e2e test / 310p-test'
on:
push:
tags:
- 'v*'
schedule:
# Runs every 6 hours
- cron: '0 */6 * * *'
pull_request:
types: [ labeled ]
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 1 card / 4 cards test type
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
e2e:
# e2e-310p-test is triggered when the PR carries both the 'e2e-310p-test' and 'ready-for-test' labels, or by the schedule/push events
if: >-
${{
(contains(github.event.pull_request.labels.*.name, 'e2e-310p-test')) &&
contains(github.event.pull_request.labels.*.name, 'ready-for-test') ||
github.event_name == 'schedule' || github.event_name == 'push'
}}
strategy:
max-parallel: 2
matrix:
os: [linux-aarch64-310p-1, linux-aarch64-310p-4]
vllm_version: [v0.11.0]
name: 310p e2e test
runs-on: ${{ matrix.os }}
container:
# TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-310p-ubuntu22.04-py3.11
env:
VLLM_LOGGING_LEVEL: ERROR
VLLM_USE_MODELSCOPE: True
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
sed -Ei 's@(ports|archive).ubuntu.com@cache-service.nginx-pypi-cache.svc.cluster.local:8081@g' /etc/apt/sources.list
pip config set global.index-url http://cache-service.nginx-pypi-cache.svc.cluster.local/pypi/simple
pip config set global.trusted-host cache-service.nginx-pypi-cache.svc.cluster.local
apt-get update -y
apt install git -y
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install git vim wget net-tools gcc g++ cmake libnuma-dev curl gnupg2
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ matrix.vllm_version }}
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
run: |
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/x86_64-linux/devlib
export SOC_VERSION=ASCEND310P3
pip install -r requirements-dev.txt
pip install -v -e .
- name: Run e2e test
env:
VLLM_WORKER_MULTIPROC_METHOD: spawn
VLLM_USE_MODELSCOPE: True
PYTORCH_NPU_ALLOC_CONF: max_split_size_mb:256
run: |
if [[ "${{ matrix.os }}" == "linux-aarch64-310p-1" ]]; then
pytest -sv tests/e2e/310p/test_offline_inference_310p.py
else
pytest -sv tests/e2e/310p/test_offline_inference_parallel_310p.py
fi

View File

@ -0,0 +1,80 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: 'ascend test / full'
on:
pull_request:
branches:
- 'main'
- '*-dev'
types: [ labeled, synchronize ]
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 1 card / 4 cards test type
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
changes:
runs-on: ubuntu-latest
if: ${{ contains(github.event.pull_request.labels.*.name, 'ready') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') }}
outputs:
e2e_tracker: ${{ steps.filter.outputs.e2e_tracker }}
ut_tracker: ${{ steps.filter.outputs.ut_tracker }}
steps:
- uses: actions/checkout@v4
- uses: dorny/paths-filter@v3
id: filter
with:
filters: |
e2e_tracker:
- '.github/workflows/vllm_ascend_test.yaml'
- '.github/workflows/_e2e_test.yaml'
- 'vllm_ascend/**'
- 'csrc/**'
- 'cmake/**'
- 'tests/e2e/**'
- 'CMakeLists.txt'
- 'setup.py'
- 'requirements.txt'
- 'requirements-dev.txt'
- 'requirements-lint.txt'
- 'packages.txt'
ut_tracker:
- 'tests/ut/**'
e2e-test:
name: e2e-full
strategy:
matrix:
vllm_version: [v0.11.0]
needs: [changes]
if: ${{ needs.changes.outputs.e2e_tracker == 'true' }}
uses: ./.github/workflows/_e2e_test.yaml
with:
vllm: ${{ matrix.vllm_version }}
runner: linux-aarch64-a2
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
type: full

View File

@ -0,0 +1,45 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: 'ascend test / vllm main'
on:
# Run 1-card and 2-card e2e tests every 2 hours
schedule:
- cron: '0 */2 * * *'
workflow_dispatch:
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 1 card / 4 cards test type
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
e2e-test:
uses: ./.github/workflows/_e2e_test.yaml
with:
vllm: main
runner: linux-aarch64-a2
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
type: full

View File

@ -0,0 +1,177 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
# This test will be triggered:
# 1. schedule
# 2. pull_request change the related files
# 3. workflow_dispatch with models input
name: ascend test / models
on:
schedule:
# Runs every 6 hours
- cron: '0 */6 * * *'
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- '.github/workflows/vllm_ascend_test_models.yaml'
- 'tests/e2e/models/test_lm_eval_correctness.py'
workflow_dispatch:
inputs:
vllm-ascend-version:
description: 'vllm-ascend:'
required: true
type: choice
# Currently supported vLLM versions
options:
- latest
- main
default: main
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
jobs:
run:
strategy:
matrix:
include:
- model_name: Qwen3-8B
runner: a2-1
- model_name: Qwen2.5-VL-7B-Instruct
runner: a2-1
- model_name: Qwen2-Audio-7B-Instruct
runner: a2-1
- model_name: Qwen3-30B-A3B
runner: a2-2
- model_name: Qwen3-VL-30B-A3B-Instruct
runner: a2-2
- model_name: DeepSeek-V2-Lite
runner: a2-2
fail-fast: false
uses: ./.github/workflows/_accuracy_test.yaml
with:
vllm: v0.11.0
runner: linux-aarch64-${{ matrix.runner }}
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
model_name: ${{ matrix.model_name }}
upload: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}
create_pr:
runs-on: ubuntu-latest
needs: run
if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}
env:
UPSTREAM_REPO: vllm-project/vllm-ascend
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
repository: vllm-ascend-ci/vllm-ascend
token: ${{ secrets.PAT_TOKEN }}
ref: main
- name: Add upstream remote
run: |
git remote add upstream https://github.com/${{ env.UPSTREAM_REPO }}.git
git fetch upstream
git remote -v
- name: Set Git user info dynamically
run: |
git config user.name "${{ github.actor }}"
git config user.email "${{ github.actor }}@users.noreply.github.com"
- name: Create or switch to branch
run: |
TIMESTAMP=$(date +%Y%m%d%H%M%S)
BRANCH_NAME="auto-pr/accuracy-report-${TIMESTAMP}"
echo "BRANCH_NAME=${BRANCH_NAME}" >> $GITHUB_ENV
git checkout -B "${BRANCH_NAME}" upstream/main
- name: Download only current run reports
uses: actions/download-artifact@v5
with:
path: ./docs/source/developer_guide/evaluation/accuracy_report
pattern: report-*
github-token: ${{ secrets.GITHUB_TOKEN }}
run-id: ${{ github.run_id }}
- name: Delete old report
run: |
find ./docs/source/developer_guide/evaluation/accuracy_report -maxdepth 1 -type f -name '*.md' ! -name 'index.md' -delete
find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 2 -type f -name '*.md' -exec mv -f {} ./docs/source/developer_guide/evaluation/accuracy_report \;
find ./docs/source/developer_guide/evaluation/accuracy_report -mindepth 1 -type d -empty -delete
- name: Update accuracy_report/index.md
run: |
REPORT_DIR="./docs/source/developer_guide/evaluation/accuracy_report"
INDEX_MD="$REPORT_DIR/index.md"
{
echo "# Accuracy Report"
echo ""
echo ":::{toctree}"
echo ":caption: Accuracy Report"
echo ":maxdepth: 1"
for report in "$REPORT_DIR"/*.md; do
filename="$(basename "$report" .md)"
if [ "$filename" != "index" ]; then
echo "$filename"
fi
done
echo ":::"
} > "$INDEX_MD"
- name: push accuracy report
env:
GITHUB_TOKEN: ${{ secrets.PAT_TOKEN }}
run: |
git add ./docs/source/developer_guide/evaluation/accuracy_report/*.md
git commit -s -m "[Doc] Update accuracy reports for ${{ env.BRANCH_NAME }}"
git push -f origin "${{ env.BRANCH_NAME }}"
- name: Create PR in upstream via API
uses: actions/github-script@v8
with:
github-token: ${{ secrets.PAT_TOKEN }}
script: |
const pr = await github.rest.pulls.create({
owner: 'vllm-project',
repo: 'vllm-ascend',
head: `vllm-ascend-ci:${{ env.BRANCH_NAME }}`,
base: 'main',
title: `[Doc] Update accuracy reports for ${{ env.BRANCH_NAME }}`,
body: `The accuracy results running on NPU Atlas A2 have changed, updating reports for: All models
- [Workflow run][1]
[1]: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`
});
core.info(`Created PR #${pr.data.number}`);

View File

@ -0,0 +1,58 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
name: 'ascend test / nightly'
on:
schedule:
# Run test at 24:00 Beijing time (UTC+8)
- cron: "0 16 * * *"
workflow_dispatch:
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- 'tests/e2e/nightly/**'
- '.github/workflows/vllm_ascend_test_nightly.yaml'
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only cancel in-progress runs of the same workflow
# and ignore the lint / 1 card / 4 cards test type
concurrency:
group: ascend-nightly-${{ github.ref }}
cancel-in-progress: true
jobs:
qwen3-32b:
strategy:
matrix:
# should add A3 chip runner when available
os: [linux-aarch64-a2-4]
# Note (yikun): If CI resources are limited, we can split the job into two chained jobs:
# only trigger the e2e test after lint has passed and the pull request change is e2e related.
uses: ./.github/workflows/_e2e_nightly.yaml
with:
vllm: v0.11.0
runner: ${{ matrix.os }}
tests: tests/e2e/nightly/models/test_qwen3_32b.py

View File

@ -0,0 +1,112 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
name: 'e2e test / pd-disaggregation'
on:
schedule:
# Runs at 23:00 UTC (7:00 AM Beijing) every day
- cron: '0 23 * * *'
pull_request:
types: [ labeled ]
# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly
# declared as "shell: bash -el {0}" on steps that need to be properly activated.
# It's used to activate ascend-toolkit environment variables.
defaults:
run:
shell: bash -el {0}
# only 1 job can run on static-8-01-cards
concurrency:
group: static-8-01-cards
cancel-in-progress: false
jobs:
prefilling-decoding-disaggregation:
# pd-test will be triggered when both the 'pd-test' and 'ready-for-test' labels are present, or by a schedule job
if: ${{ contains(github.event.pull_request.labels.*.name, 'pd-test') && contains(github.event.pull_request.labels.*.name, 'ready-for-test') || github.event_name == 'schedule' }}
strategy:
matrix:
vllm_version: [
main,
v0.9.1
]
name: vLLM Ascend prefilling decoding disaggregation test
runs-on: linux-arm64-npu-static-8
container:
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
volumes:
- /usr/local/dcmi:/usr/local/dcmi
- /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
- /usr/local/Ascend/driver/:/usr/local/Ascend/driver/
# Use self-hosted cache to speed up pip and model downloads
- /home/action/.cache:/github/home/.cache/
options: >-
--device /dev/davinci0
--device /dev/davinci1
--device /dev/davinci_manager
--device /dev/devmm_svm
--device /dev/hisi_hdc
env:
VLLM_USE_MODELSCOPE: True
steps:
- name: Check npu and CANN info
run: |
npu-smi info
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
- name: Config mirrors
run: |
# keep using tuna's proxy since linux-arm64-npu-static-8 is in another region
sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list
pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
apt-get update -y
apt install git -y
git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf https://github.com/
- name: Checkout vllm-project/vllm-ascend repo
uses: actions/checkout@v4
- name: Install system dependencies
run: |
apt-get -y install `cat packages.txt`
apt-get -y install gcc g++ cmake libnuma-dev
- name: Checkout vllm-project/vllm repo
uses: actions/checkout@v4
with:
repository: vllm-project/vllm
ref: ${{ matrix.vllm_version }}
path: ./vllm-empty
- name: Install vllm-project/vllm from source
working-directory: ./vllm-empty
run: |
VLLM_TARGET_DEVICE=empty pip install -e .
- name: Install vllm-project/vllm-ascend
env:
PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
run: |
pip install -r requirements-dev.txt
pip install -v -e .
- name: Run vllm-project/vllm-ascend PD Disaggregation edge test
run: |
git config --global --add safe.directory /__w/vllm-ascend/vllm-ascend
bash tests/e2e/pd_disaggreate/run_edge_case_test.sh

View File

@ -1,57 +0,0 @@
#
# Adapted from vllm-project/vllm/blob/main/.github
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
name: yapf
on:
# Trigger the workflow on push or pull request,
# but only for the main branch
push:
branches:
- 'main'
- '*-dev'
paths:
- "**/*.py"
- .github/workflows/yapf.yml
pull_request:
branches:
- 'main'
- '*-dev'
paths:
- "**/*.py"
- .github/workflows/yapf.yml
jobs:
yapf:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ["3.12"]
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install toml
pip install yapf==0.32.0
- name: Running yapf
run: |
yapf --diff --recursive .

6
.gitignore vendored
View File

@ -196,3 +196,9 @@ kernel_meta/
# version file generated by setuptools-scm
/vllm_ascend/_version.py
# build info file generated by setup.py
/vllm_ascend/_build_info.py
/vllm_ascend/include/
# generated by CANN
fusion_result.json

151
.pre-commit-config.yaml Normal file
View File

@ -0,0 +1,151 @@
default_install_hook_types:
- pre-commit
- commit-msg
default_stages:
- pre-commit # Run locally
- manual # Run in CI
exclude: 'examples/.*' # Exclude examples from all hooks by default
repos:
- repo: https://github.com/codespell-project/codespell
rev: v2.4.1
hooks:
- id: codespell
args: [
--toml, pyproject.toml,
'--skip', 'tests/e2e/multicard/test_torchair_graph_mode.py,csrc/mla_preprocess/**,tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**,./vllm_ascend.egg-info/**,.github/**,typos.toml',
'-L', 'CANN,cann,NNAL,nnal,ASCEND,ascend,EnQue,CopyIn,ArchType,AND'
]
additional_dependencies:
- tomli
- repo: https://github.com/google/yapf
rev: v0.43.0
hooks:
- id: yapf
args: [--in-place, --verbose]
# Keep the same list from yapfignore here to avoid yapf failing without any inputs
exclude: '(.github|benchmarks|examples|docs)/.*'
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.11.7
hooks:
- id: ruff
args: [--output-format, github, --fix]
- id: ruff-format
files: ^(benchmarks|examples)/.*
- repo: https://github.com/crate-ci/typos
rev: v1.32.0
hooks:
- id: typos
args: [
"--force-exclude",
"--exclude", "csrc/mla_preprocess/**"
]
- repo: https://github.com/PyCQA/isort
rev: 6.0.1
hooks:
- id: isort
# - repo: https://github.com/pre-commit/mirrors-clang-format
# rev: v20.1.3
# hooks:
# - id: clang-format
# files: ^csrc/.*\.(cpp|hpp|cc|hh|cxx|hxx)$
# types_or: [c++]
# args: [--style=google, --verbose]
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.29
hooks:
- id: pymarkdown
args: [fix]
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
hooks:
- id: actionlint
- repo: local
hooks:
# For local development, you can run mypy using tools/mypy.sh script if needed.
# - id: mypy-local
# name: Run mypy for local Python installation
# entry: tools/mypy.sh 0 "local"
# language: system
# types: [python]
# stages: [pre-commit] # Don't run in CI
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
entry: tools/mypy.sh 1 "3.9"
# Use system python because vllm installation is required
language: system
types: [python]
stages: [manual] # Only run in CI
- id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.10
entry: tools/mypy.sh 1 "3.10"
# Use system python because vllm installation is required
language: system
types: [python]
stages: [manual] # Only run in CI
- id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.11
entry: tools/mypy.sh 1 "3.11"
# Use system python because vllm installation is required
language: system
types: [python]
stages: [manual] # Only run in CI
- id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.12
entry: tools/mypy.sh 1 "3.12"
# Use system python because vllm installation is required
language: system
types: [python]
stages: [manual] # Only run in CI
# FIXME: enable shellcheck
# - id: shellcheck
# name: Lint shell scripts
# entry: tools/shellcheck.sh
# language: script
# types: [shell]
- id: png-lint
name: Lint PNG exports from excalidraw
entry: tools/png-lint.sh
language: script
types: [png]
- id: signoff-commit
name: Sign-off Commit
entry: bash
args:
- -c
- |
if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
fi
language: system
verbose: true
stages: [commit-msg]
- id: check-filenames
name: Check for spaces in all filenames
entry: bash
args:
- -c
- 'git ls-files | grep " " && echo "Filenames should not contain spaces!" && exit 1 || exit 0'
language: system
always_run: true
pass_filenames: false
- id: enforce-import-regex-instead-of-re
name: Enforce import regex as re
entry: python tools/enforce_regex_import.py
language: python
types: [python]
pass_filenames: false
additional_dependencies: [regex]
- id: python-init
name: Enforce __init__.py in Python packages
entry: python tools/check_python_src_init.py
language: python
types: [python]
pass_filenames: false
# Keep `suggestion` last
- id: suggestion
name: Suggestion
entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
language: system
verbose: true
pass_filenames: false
# Insert new entries above the `suggestion` entry

104
CMakeLists.txt Normal file
View File

@ -0,0 +1,104 @@
cmake_minimum_required(VERSION 3.16)
project(vllm_ascend_C)
# include(CheckCXXcompilerFlag)
# check_cxx_compiler_flag("-std=c++17", COMPILER_SUPPORTS_CXX17)
set(CMAKE_CXX_STANDARD 17)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
# Suppress potential warnings about unused manually-specified variables
set(ignoreMe "${VLLM_PYTHON_PATH}")
# TODO: Add 3.12 back when torch-npu supports 3.12
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11")
find_package(pybind11 REQUIRED)
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
set(VLLM_ASCEND_INSTALL_PATH "${CMAKE_INSTALL_PREFIX}")
find_package(Torch REQUIRED)
set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu")
set(SOC_VERSION ${SOC_VERSION})
message(STATUS "Detected SOC version: ${SOC_VERSION}")
if (NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "Release" CACHE STRINGS "Build type Release/Debug (default Release)" FORCE)
endif()
if (CMAKE_INSTALL_PREFIX STREQUAL /usr/local)
set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRINGS "path to install()")
endif()
set(ASCEND_CANN_PACKAGE_PATH ${ASCEND_HOME_PATH})
if(EXISTS ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/tools/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/compiler/tikcpp/ascendc_kernel_cmake)
elseif(EXISTS ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake)
set(ASCENDC_CMAKE_DIR ${ASCEND_HOME_PATH}/ascendc_devkit/tikcpp/samples/cmake)
else()
message(FATAL_ERROR "ascendc_kernel_cmake does not exist, please check whether the cann package is installed.")
endif()
include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
file(GLOB KERNEL_FILES
${CMAKE_CURRENT_SOURCE_DIR}/csrc/kernels/*.cpp)
ascendc_library(vllm_ascend_kernels SHARED
${KERNEL_FILES}
${CMAKE_CURRENT_SOURCE_DIR}/csrc/mla_preprocess/op_kernel/mla_preprocess_kernel.cpp
)
message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}")
file(GLOB VLLM_ASCEND_SRC
${CMAKE_CURRENT_SOURCE_DIR}/csrc/*.cpp)
include_directories(
${pybind11_INCLUDE_DIRS}
${PYTHON_INCLUDE_PATH}
${TORCH_INCLUDE_DIRS}
${TORCH_NPU_PATH}/include
${ASCEND_HOME_PATH}/include
${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform
${ASCEND_HOME_PATH}/x86_64-linux/include/experiment/platform
)
set(
INCLUDES
${TORCH_INCLUDE_DIRS}
${TORCH_NPU_INCLUDE_DIRS}
${ASCEND_HOME_PATH}/include
${ASCEND_HOME_PATH}/aarch64-linux/include/experiment/platform
)
pybind11_add_module(vllm_ascend_C ${VLLM_ASCEND_SRC})
target_link_directories(
vllm_ascend_C
PRIVATE
${TORCH_NPU_PATH}/lib/
${ASCEND_HOME_PATH}/lib64
)
target_link_libraries(
vllm_ascend_C
PUBLIC
${TORCH_LIBRARIES}
libtorch_npu.so
vllm_ascend_kernels
ascendcl
tiling_api
register
platform
ascendalog
dl
)
target_link_options(vllm_ascend_C PRIVATE "-Wl,-rpath,$ORIGIN:$ORIGIN/lib")
install(TARGETS vllm_ascend_C vllm_ascend_kernels DESTINATION ${VLLM_ASCEND_INSTALL_PATH})

View File

@ -125,4 +125,3 @@ Community Impact Guidelines were inspired by
For answers to common questions about this code of conduct, see the
[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
[Contributor Covenant translations](https://www.contributor-covenant.org/translations).

3
CONTRIBUTING.md Normal file
View File

@ -0,0 +1,3 @@
# Contributing to vLLM Ascend
You may find information about contributing to vLLM Ascend on [Developer Guide - Contributing](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html), including a step-by-step guide to help you set up the development environment, contribute your first PR, and test locally.

View File

@ -1,6 +1,5 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@ -13,35 +12,49 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.0.0-910b-ubuntu22.04-py3.10
FROM quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN apt-get update -y && \
apt-get install -y python3-pip git vim && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
WORKDIR /workspace
COPY . /workspace/vllm-ascend/
COPY . /vllm-workspace/vllm-ascend/
RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM main
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
RUN git clone --depth 1 $VLLM_REPO /workspace/vllm
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton is installed by vllm, but it doesn't work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend main
RUN python3 -m pip install /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/
# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope
RUN python3 -m pip install modelscope
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

61
Dockerfile.310p Normal file
View File

@ -0,0 +1,61 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-310p-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton is installed by vllm, but it doesn't work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export SOC_VERSION=ASCEND310P3 && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

59
Dockerfile.310p.openEuler Normal file
View File

@ -0,0 +1,59 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-310p-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
rm -rf /var/cache/yum
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton is installed by vllm, but it doesn't work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
export SOC_VERSION=ASCEND310P3 && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

60
Dockerfile.a3 Normal file
View File

@ -0,0 +1,60 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton is installed by vllm, but it doesn't work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

58
Dockerfile.a3.openEuler Normal file
View File

@ -0,0 +1,58 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-a3-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
rm -rf /var/cache/yum
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton is installed by vllm, but it doesn't work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

58
Dockerfile.openEuler Normal file
View File

@ -0,0 +1,58 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-910b-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
rm -rf /var/cache/yum
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton is installed by vllm, but it doesn't work correctly on Ascend, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

View File

@ -10,7 +10,7 @@ vLLM Ascend Plugin
</h3>
<p align="center">
| <a href="https://www.hiascend.com/en/"><b>About Ascend</b></a> | <a href="https://vllm-ascend.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack (#sig-ascend)</b></a> |
| <a href="https://www.hiascend.com/en/"><b>About Ascend</b></a> | <a href="https://vllm-ascend.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://slack.vllm.ai"><b>#sig-ascend</b></a> | <a href="https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support"><b>Users Forum</b></a> | <a href="https://tinyurl.com/vllm-ascend-meeting"><b>Weekly Meeting</b></a> |
</p>
<p align="center">
@ -19,80 +19,72 @@ vLLM Ascend Plugin
---
*Latest News* 🔥
- [2025/09] We released the new official version [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! Please follow the [official guide](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/tutorials/large_scale_ep.html) to start deploying large-scale Expert Parallelism (EP) on Ascend.
- [2025/08] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q) with vLLM and Tencent! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
- [2025/06] [User stories](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html) page is now live! It kicks off with LLaMA-Factory/verl/TRL/GPUStack to demonstrate how vLLM Ascend assists Ascend users in enhancing their experience across fine-tuning, evaluation, reinforcement learning (RL), and deployment scenarios.
- [2025/06] [Contributors](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) page is now live! All contributions deserve to be recorded, thanks for all contributors.
- [2025/05] We've released first official version [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)! We collaborated with the vLLM community to publish a blog post sharing our practice: [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
- [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/VtxO9WXa5fC-mKqlxNUJUQ) with vLLM team! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
- [2025/02] vLLM community officially created [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU.
- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
---
## Overview
vLLM Ascend plugin (`vllm-ascend`) is a backend plugin for running vLLM on the Ascend NPU.
vLLM Ascend (`vllm-ascend`) is a community maintained hardware plugin for running vLLM seamlessly on the Ascend NPU.
This plugin is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU with vLLM.
It is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU with vLLM.
By using the vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Experts, embedding, and multi-modal LLMs, can run seamlessly on the Ascend NPU.
## Prerequisites
- Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series
- Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series, Atlas 800I A3 Inference series, Atlas A3 Training series, Atlas 300I Duo (Experimental)
- OS: Linux
- Software:
* Python >= 3.9
* CANN >= 8.0.RC2
* PyTorch >= 2.4.0, torch-npu >= 2.4.0
* Python >= 3.9, < 3.12
* CANN >= 8.2.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html))
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
* vLLM (the same version as vllm-ascend)
Find out more about how to set up your environment step by step [here](docs/source/installation.md).
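A quick way to sanity-check these prerequisites is to query the NPU driver, the CANN toolkit, and the installed Python packages. The commands below are only an illustrative sketch and assume CANN is installed under the default `/usr/local/Ascend` path:
```bash
# Check that the NPU devices and driver are visible
npu-smi info
# Check the installed CANN toolkit version (path may differ on your system)
cat /usr/local/Ascend/ascend-toolkit/latest/"$(uname -i)"-linux/ascend_toolkit_install.info
# Check the PyTorch and torch-npu versions
python3 -c "import torch, torch_npu; print(torch.__version__, torch_npu.__version__)"
```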
## Getting Started
> [!NOTE]
> Currently, we are actively collaborating with the vLLM community to support the Ascend backend plugin; once supported, you can use the one-line command `pip install vllm vllm-ascend` to complete the installation.
Please use the following recommended versions to get started quickly:
Installation from source code:
```bash
# Install vllm main branch according:
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html#build-wheel-from-source
git clone --depth 1 https://github.com/vllm-project/vllm.git
cd vllm
pip install -r requirements-build.txt
VLLM_TARGET_DEVICE=empty pip install .
# Install vllm-ascend main branch
git clone https://github.com/vllm-project/vllm-ascend.git
cd vllm-ascend
pip install -e .
```
Run the following command to start the vLLM server with the [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) model:
```bash
# export VLLM_USE_MODELSCOPE=true to speed up download
vllm serve Qwen/Qwen2.5-0.5B-Instruct
curl http://localhost:8000/v1/models
```
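Once the server is up, you can also send a simple request to the OpenAI-compatible completions endpoint; the payload below is only an illustrative example:
```bash
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "Qwen/Qwen2.5-0.5B-Instruct",
        "prompt": "Hello, my name is",
        "max_tokens": 32,
        "temperature": 0
    }'
```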
Please refer to [QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details.
| Version | Release type | Doc |
|------------|--------------|--------------------------------------|
|v0.11.0rc0|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|v0.9.1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
## Contributing
See [CONTRIBUTING](docs/source/developer_guide/contributing.md) for more details, which is a step-by-step guide to help you set up the development environment, build, and test.
See [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) for more details, which is a step-by-step guide to help you set up the development environment, build, and test.
We welcome and value any contributions and collaborations:
- Please feel free to comment [here](https://github.com/vllm-project/vllm-ascend/issues/19) about your usage of the vLLM Ascend plugin.
- Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues).
- Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues)
- Please use [User forum](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support) for usage questions and help.
## Branch
vllm-ascend has a main branch and dev branches.
- **main**: main branch, corresponds to the vLLM main branch, and is continuously monitored for quality through Ascend CI.
- **vX.Y.Z-dev**: development branch, created alongside some new vLLM releases. For example, `v0.7.1-dev` is the dev branch for vLLM `v0.7.1`.
- **vX.Y.Z-dev**: development branch, created alongside some new vLLM releases. For example, `v0.7.3-dev` is the dev branch for vLLM `v0.7.3`.
Below are the maintained branches:
| Branch | Status | Note |
|------------|--------------|--------------------------------------|
| main | Maintained | CI commitment for vLLM main branch |
| v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version |
| main | Maintained | CI commitment for vLLM main branch and vLLM v0.11.0 tag |
| v0.7.1-dev | Unmaintained | Only doc fixed is allowed |
| v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version; only bug fixes are allowed and no new release tags any more. |
| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.1 version |
| rfc/feature-name | Maintained | [Feature branches](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#feature-branches) for collaboration |
Please refer to [Versioning policy](docs/source/developer_guide/versioning_policy.md) for more details.
Please refer to [Versioning policy](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html) for more details.
## Weekly Meeting
- vLLM Ascend Weekly Meeting: https://tinyurl.com/vllm-ascend-meeting
- Wednesday, 15:00 - 16:00 (UTC+8, [Convert to your timezone](https://dateful.com/convert/gmt8?t=15))
## License

View File

@ -10,7 +10,7 @@ vLLM Ascend Plugin
</h3>
<p align="center">
| <a href="https://www.hiascend.com/en/"><b>关于昇腾</b></a> | <a href="https://vllm-ascend.readthedocs.io/en/latest/"><b>官方文档</b></a> | <a href="https://slack.vllm.ai"><b>开发者 Slack (#sig-ascend)</b></a> |
| <a href="https://www.hiascend.com/en/"><b>关于昇腾</b></a> | <a href="https://vllm-ascend.readthedocs.io/en/latest/"><b>官方文档</b></a> | <a href="https://slack.vllm.ai"><b>#sig-ascend</b></a> | <a href="https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support"><b>用户论坛</b></a> | <a href="https://tinyurl.com/vllm-ascend-meeting"><b>社区例会</b></a> |
</p>
<p align="center">
@ -20,11 +20,18 @@ vLLM Ascend Plugin
---
*Latest News* 🔥
- [2025/09] We released the new official version [v0.9.1](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.9.1)! Please follow the [official guide](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/tutorials/large_scale_ep.html) to start deploying large-scale Expert Parallelism (EP) on Ascend.
- [2025/08] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/7n8OYNrCC_I9SJaybHA_-Q) together with vLLM and Tencent! Please find the slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
- [2025/06] The [User stories](https://vllm-ascend.readthedocs.io/en/latest/community/user_stories/index.html) page is now live! It showcases LLaMA-Factory/verl/TRL/GPUStack user stories demonstrating how vLLM Ascend helps Ascend users improve their experience across fine-tuning, evaluation, reinforcement learning (RL), and deployment scenarios.
- [2025/06] The [Contributors](https://vllm-ascend.readthedocs.io/en/latest/community/contributors.html) page is now live! All contributions deserve to be recorded; thanks to all contributors.
- [2025/05] We released the first official version [v0.7.3](https://github.com/vllm-project/vllm-ascend/releases/tag/v0.7.3)! We collaborated with the vLLM community to publish a blog post sharing our practice: [Introducing vLLM Hardware Plugin, Best Practice from Ascend NPU](https://blog.vllm.ai/2025/05/12/hardware-plugin.html).
- [2025/03] We hosted the [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/CGDuMoB301Uytnrkc2oyjg) with the vLLM team! You can find the slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF).
- [2025/02] The vLLM community officially created the [vllm-project/vllm-ascend](https://github.com/vllm-project/vllm-ascend) repo for running vLLM seamlessly on the Ascend NPU.
- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
---
## Overview
The vLLM Ascend plugin (`vllm-ascend`) is a backend plugin for running vLLM seamlessly on the Ascend NPU.
The vLLM Ascend plugin (`vllm-ascend`) is a community-maintained backend plugin for running vLLM seamlessly on the Ascend NPU.
This plugin is the recommended approach for supporting the Ascend backend within the vLLM community. It follows the principles outlined in [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing support for the Ascend NPU in vLLM in a decoupled way.
@ -32,68 +39,52 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个让vLLM在Ascend NPU无缝运行的
## Prerequisites
- Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series
- Hardware: Atlas 800I A2 Inference series, Atlas A2 Training series, Atlas 800I A3 Inference series, Atlas A3 Training series, Atlas 300I Duo (experimental support)
- OS: Linux
- Software:
* Python >= 3.9
* CANN >= 8.0.RC2
* PyTorch >= 2.4.0, torch-npu >= 2.4.0
* Python >= 3.9, < 3.12
* CANN >= 8.2.rc1 (for the Ascend HDK version, refer to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html))
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
* vLLM (same version as vllm-ascend)
You can learn how to prepare your environment step by step [here](docs/source/installation.md).
## Getting Started
> [!NOTE]
> Currently, we are actively collaborating with the vLLM community to support the Ascend backend plugin. Once supported, you can complete the installation with the one-line command: `pip install vllm vllm-ascend`.
Please use the following recommended versions to get started quickly:
Installation from source code:
```bash
# Install the vllm main branch; reference doc:
# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html#build-wheel-from-source
git clone --depth 1 https://github.com/vllm-project/vllm.git
cd vllm
pip install -r requirements-build.txt
VLLM_TARGET_DEVICE=empty pip install .
| Version | Release type | Doc |
|------------|--------------|--------------------------------------|
|v0.11.0rc0| Latest release candidate |See [QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|v0.9.1| Latest official/stable version |See [QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
# Install the vllm-ascend main branch
git clone https://github.com/vllm-project/vllm-ascend.git
cd vllm-ascend
pip install -e .
```
## Contributing
See the [CONTRIBUTING](https://vllm-ascend.readthedocs.io/en/latest/developer_guide/contribution/index.html) document for more information on setting up the development environment, functional testing, and PR submission conventions.
Run the following command to start the vLLM server with the [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) model:
```bash
# Set the environment variable VLLM_USE_MODELSCOPE=true to speed up download
vllm serve Qwen/Qwen2.5-0.5B-Instruct
curl http://localhost:8000/v1/models
```
See [QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details.
## Branch
We welcome and value any contributions and collaborations:
- Please let us know about any bugs you encounter by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues).
- Please use the [Users Forum](https://discuss.vllm.ai/c/hardware-support/vllm-ascend-support) for usage questions and help.
## Branching Policy
vllm-ascend has a main branch and dev branches.
- **main**: the main branch, corresponding to the vLLM main branch, continuously monitored for quality through Ascend CI.
- **vX.Y.Z-dev**: development branches, created alongside some new vLLM releases; for example, `v0.7.1-dev` is the vllm-ascend dev branch for vLLM `v0.7.1`.
- **vX.Y.Z-dev**: development branches, created alongside some new vLLM releases; for example, `v0.7.3-dev` is the vllm-ascend dev branch for vLLM `v0.7.3`.
Below are the maintained branches:
| Branch | Status | Note |
|------------|------------|---------------------|
| main | Maintained | CI commitment for the vLLM main branch |
| v0.7.3-dev | Maintained | CI commitment for vLLM v0.7.3 |
| main | Maintained | CI commitment for the vLLM main branch and the latest vLLM release v0.11.0 |
| v0.7.1-dev | Unmaintained | Only doc fixes are allowed |
| v0.7.3-dev | Maintained | CI commitment for vLLM v0.7.3; only bug fixes are allowed and no new release tags will be published |
| v0.9.1-dev | Maintained | CI commitment for vLLM v0.9.1 |
|rfc/feature-name| Maintained | [Feature branches](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html#feature-branches) created for collaboration |
Please refer to the [Versioning policy](docs/source/developer_guide/versioning_policy.zh.md) for more details.
Please refer to the [Versioning policy](https://vllm-ascend.readthedocs.io/en/latest/community/versioning_policy.html) for more details.
## Contributing
For more details, see [CONTRIBUTING](docs/source/developer_guide/contributing.zh.md), which helps you set up the development environment, build, and test in more detail.
## Weekly Meeting
We welcome and value any contributions and collaborations:
- You can share feedback about your usage experience [here](https://github.com/vllm-project/vllm-ascend/issues/19).
- Please let us know about any bugs you encounter by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues).
- vLLM Ascend Weekly Community Meeting: https://tinyurl.com/vllm-ascend-meeting
- Wednesday, 15:00 - 16:00 (UTC+8, [convert to your timezone](https://dateful.com/convert/gmt8?t=15))
## License
Apache License 2.0, as shown in the [LICENSE](./LICENSE) file.
Apache License 2.0, as shown in the [LICENSE](./LICENSE) file.

175
benchmarks/README.md Normal file
View File

@ -0,0 +1,175 @@
# Introduction
This document outlines the benchmarking methodology for vllm-ascend, aimed at evaluating the performance under a variety of workloads. The primary goal is to help developers assess whether their pull requests improve or degrade vllm-ascend's performance.
# Overview
**Benchmarking Coverage**: We measure latency, throughput, and fixed-QPS serving on the Atlas 800I A2 (see [quick_start](../docs/source/quick_start.md) to learn more about the supported devices list), with different models (coming soon).
- Latency tests
- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- Models: Qwen2.5-7B-Instruct, Qwen3-8B.
- Evaluation metrics: end-to-end latency (mean, median, p99).
- Throughput tests
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: Qwen2.5-VL-7B-Instruct, Qwen2.5-7B-Instruct, Qwen3-8B.
- Evaluation metrics: throughput.
- Serving tests
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Models: Qwen2.5-VL-7B-Instruct, Qwen2.5-7B-Instruct, Qwen3-8B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
**Benchmarking Duration**: about 800 seconds for a single model.
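For intuition, the latency test described above corresponds roughly to an invocation like the one below. This is only a sketch: the flag names assume the upstream `vllm bench latency` CLI rather than the exact scripts shipped in this repository, so treat it as an illustration.
```shell
# Illustrative latency run: 32 input tokens, 128 output tokens, batch size 8
vllm bench latency \
    --model Qwen/Qwen3-8B \
    --input-len 32 \
    --output-len 128 \
    --batch-size 8 \
    --load-format dummy
```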
# Quick Use
## Prerequisites
Before running the benchmarks, ensure the following:
- vllm and vllm-ascend are installed and properly set up in an NPU environment, as these scripts are specifically designed for NPU devices.
- Install necessary dependencies for benchmarks:
```shell
pip install -r benchmarks/requirements-bench.txt
```
- For performance benchmarks, it is recommended to set the [load-format](https://github.com/vllm-project/vllm-ascend/blob/5897dc5bbe321ca90c26225d0d70bff24061d04b/benchmarks/tests/latency-tests.json#L7) to `dummy`. This constructs random weights for the given model instead of downloading them from the internet, which greatly reduces benchmark time.
- If you want to customize the benchmarks, feel free to add your own models and parameters to the [JSON](https://github.com/vllm-project/vllm-ascend/tree/main/benchmarks/tests) files. Let's take `Qwen2.5-VL-7B-Instruct` as an example:
```json
[
{
"test_name": "serving_qwen2_5vl_7B_tp1",
"qps_list": [
1,
4,
16,
"inf"
],
"server_parameters": {
"model": "Qwen/Qwen2.5-VL-7B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"trust_remote_code": "",
"max_model_len": 16384
},
"client_parameters": {
"model": "Qwen/Qwen2.5-VL-7B-Instruct",
"backend": "openai-chat",
"dataset_name": "hf",
"hf_split": "train",
"endpoint": "/v1/chat/completions",
"dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
"num_prompts": 200
}
}
]
```
This JSON is parsed by the benchmark script into server parameters and client parameters. The configuration defines a test case named `serving_qwen2_5vl_7B_tp1`, designed to evaluate the performance of the `Qwen/Qwen2.5-VL-7B-Instruct` model under different request rates. The test includes both server and client parameters, summarized below (with a small parsing sketch after the list); for more parameter details, see the vLLM benchmark [cli](https://github.com/vllm-project/vllm/tree/main/vllm/benchmarks).
- **Test Overview**
- Test Name: serving_qwen2_5vl_7B_tp1
- Queries Per Second (QPS): The test is run at four different QPS levels: 1, 4, 16, and inf (infinite load, typically used for stress testing).
- Server Parameters
- Model: Qwen/Qwen2.5-VL-7B-Instruct
- Tensor Parallelism: 1 (no model parallelism is used; the model runs on a single device or node)
- Swap Space: 16 GB (used to handle memory overflow by swapping to disk)
- disable_log_stats: disables logging of performance statistics.
- disable_log_requests: disables logging of individual requests.
- Trust Remote Code: enabled (allows execution of model-specific custom code)
- Max Model Length: 16,384 tokens (maximum context length supported by the model)
- Client Parameters
- Model: Qwen/Qwen2.5-VL-7B-Instruct (same as the server)
- Backend: openai-chat (the client uses the OpenAI-compatible chat API format)
- Dataset Source: Hugging Face (hf)
- Dataset Split: train
- Endpoint: /v1/chat/completions (the REST API endpoint to which chat requests are sent)
- Dataset Path: lmarena-ai/vision-arena-bench-v0.1 (the benchmark dataset used for evaluation, hosted on Hugging Face)
- Number of Prompts: 200 (the total number of prompts used during the test)
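To make the parsing step concrete, the following Python sketch performs the same transformation as the `json2args` helper in `benchmarks/scripts/run-performance-benchmarks.sh`: each key becomes a `--flag` with underscores replaced by dashes, and empty-string values act as bare boolean switches. The function name is an illustrative assumption and is not part of the benchmark tooling.
```python
import json

def json_to_args(params: dict) -> str:
    """Convert a parameter dict into CLI-style args (sketch of json2args)."""
    parts = []
    for key, value in params.items():
        flag = "--" + key.replace("_", "-")
        # Empty strings (e.g. "disable_log_stats": "") act as boolean switches.
        parts.append(flag if value == "" else f"{flag} {value}")
    return " ".join(parts)

server_parameters = json.loads("""
{
  "model": "Qwen/Qwen2.5-VL-7B-Instruct",
  "tensor_parallel_size": 1,
  "swap_space": 16,
  "disable_log_stats": "",
  "max_model_len": 16384
}
""")
print(json_to_args(server_parameters))
# --model Qwen/Qwen2.5-VL-7B-Instruct --tensor-parallel-size 1 --swap-space 16 --disable-log-stats --max-model-len 16384
```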
## Run benchmarks
### Use benchmark script
The provided scripts automatically execute performance tests for serving, throughput, and latency. To start the benchmarking process, run the following command from the vllm-ascend root directory:
```shell
bash benchmarks/scripts/run-performance-benchmarks.sh
```
Once the script completes, you can find the results in the benchmarks/results folder. The output files may resemble the following:
```shell
.
|-- serving_qwen2_5_7B_tp1_qps_1.json
|-- serving_qwen2_5_7B_tp1_qps_16.json
|-- serving_qwen2_5_7B_tp1_qps_4.json
|-- serving_qwen2_5_7B_tp1_qps_inf.json
|-- latency_qwen2_5_7B_tp1.json
|-- throughput_qwen2_5_7B_tp1.json
```
These files contain detailed benchmarking results for further analysis.
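If you want to post-process the results programmatically, the sketch below loads one serving result file and prints a few common fields. The field names follow those consumed by the results-to-markdown conversion script in this folder (`request_throughput`, `output_throughput`, `median_ttft_ms`, `median_itl_ms`); the exact file name shown is only an example.
```python
import json
from pathlib import Path

# Illustrative path: the real file name depends on your test_name and QPS setting.
result_file = Path("benchmarks/results/serving_qwen2_5_7B_tp1_qps_4.json")

with result_file.open() as f:
    result = json.load(f)

# These keys match the serving fields used by the results-to-markdown script.
for key in ("request_throughput", "output_throughput", "median_ttft_ms", "median_itl_ms"):
    print(f"{key}: {result.get(key)}")
```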
### Use benchmark cli
For more flexible and customized use, a benchmark CLI is also provided to run online/offline benchmarks.
Similarly, let's take the `Qwen2.5-VL-7B-Instruct` benchmark as an example:
#### Online serving
1. Launch the server:
```shell
vllm serve Qwen2.5-VL-7B-Instruct --max-model-len 16789
```
2. Run the performance tests using the CLI:
```shell
vllm bench serve --model Qwen2.5-VL-7B-Instruct \
--endpoint-type "openai-chat" --dataset-name hf \
--hf-split train --endpoint "/v1/chat/completions" \
--dataset-path "lmarena-ai/vision-arena-bench-v0.1" \
--num-prompts 200 \
--request-rate 16
```
#### Offline
- **Throughput**
```shell
vllm bench throughput --output-json results/throughput_qwen2_5_7B_tp1.json \
--model Qwen/Qwen2.5-7B-Instruct --tensor-parallel-size 1 --load-format dummy \
--dataset-path /github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json \
--num-prompts 200 --backend vllm
```
- **Latency**
```shell
vllm bench latency --output-json results/latency_qwen2_5_7B_tp1.json \
--model Qwen/Qwen2.5-7B-Instruct --tensor-parallel-size 1 \
--load-format dummy --num-iters-warmup 5 --num-iters 15
```

View File

@ -0,0 +1,158 @@
from typing import Tuple
import numpy as np
import pytest
import torch
import torch_npu # noqa: F401
import vllm # noqa: F401
import vllm_ascend.platform # noqa: F401
def benchmark_npu(fn, num_iterations=100, num_warmup_iterations=50):
"""
Benchmark function for NPU operations
Args:
fn: Function to benchmark
num_iterations: Number of timing iterations
num_warmup_iterations: Number of warmup iterations
Returns:
float: Minimum elapsed time in seconds
"""
start = torch.npu.Event(enable_timing=True)
end = torch.npu.Event(enable_timing=True)
times = np.zeros(num_iterations + num_warmup_iterations)
# Run iterations
for i in range(num_warmup_iterations + num_iterations):
with torch.no_grad():
start.record()
fn() # Execute the function
end.record()
torch.npu.synchronize()
times[i] = start.elapsed_time(end)
# Remove warmup iterations and convert to seconds
times = times[num_warmup_iterations:]
elapsed_time = np.amin(times) / 1000
return elapsed_time
def get_masked_input_and_mask_ref(
input_: torch.Tensor,
org_vocab_start_index: int,
org_vocab_end_index: int,
num_org_vocab_padding: int,
added_vocab_start_index: int,
added_vocab_end_index: int,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Reference implementation for verification"""
org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ < org_vocab_end_index)
added_vocab_mask = (input_ >= added_vocab_start_index) & (
input_ < added_vocab_end_index
)
added_offset = (
added_vocab_start_index
- (org_vocab_end_index - org_vocab_start_index)
- num_org_vocab_padding
)
valid_offset = (org_vocab_start_index * org_vocab_mask) + (
added_offset * added_vocab_mask
)
vocab_mask = org_vocab_mask | added_vocab_mask
masked_input = vocab_mask * (input_ - valid_offset)
return masked_input, ~vocab_mask
DTYPES = [torch.int32]
SHAPES = [(3, 4, 5)]
DEVICES = [f"npu:{0}"]
SEEDS = [0]
@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("device", DEVICES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
def test_get_masked_input_and_mask(
shape: Tuple[int, ...],
dtype: torch.dtype,
device: str,
seed: int,
) -> None:
# Set random seed and device
torch.manual_seed(seed)
torch.set_default_device(device)
# Generate random input tensor
input_tensor = torch.randint(0, 1000, shape, dtype=dtype)
# Test parameters
test_case = {
"org_start": 100,
"org_end": 200,
"padding": 0,
"added_start": 300,
"added_end": 400,
}
# Define reference function
def ref_fn():
return get_masked_input_and_mask_ref(
input_tensor,
test_case["org_start"],
test_case["org_end"],
test_case["padding"],
test_case["added_start"],
test_case["added_end"],
)
# Define custom function
def custom_fn():
return torch.ops._C_ascend.get_masked_input_and_mask(
input_tensor,
test_case["org_start"],
test_case["org_end"],
test_case["padding"],
test_case["added_start"],
test_case["added_end"],
)
# Get results for correctness testing
ref_masked_input, ref_mask = ref_fn()
custom_masked_input, custom_mask = custom_fn()
# Benchmark both implementations
ref_time = benchmark_npu(ref_fn)
custom_time = benchmark_npu(custom_fn)
# Print performance results
print("\nPerformance Results:")
print(f"Reference implementation: {ref_time * 1000:.3f} ms")
print(f"Custom implementation: {custom_time * 1000:.3f} ms")
print(f"Speedup: {ref_time / custom_time:.2f}x")
# Compare results for correctness
ref_masked_input = ref_masked_input.to(dtype)
print("\nResults comparison:")
print("custom_masked_input:", custom_masked_input)
print("ref_masked_input:", ref_masked_input)
print("custom_mask:", custom_mask)
print("ref_mask:", ref_mask)
torch.testing.assert_close(
custom_masked_input,
ref_masked_input,
rtol=1e-5,
atol=1e-5,
msg=f"Masked input mismatch for case: {test_case}",
)
torch.testing.assert_close(
custom_mask,
ref_mask,
rtol=1e-5,
atol=1e-5,
msg=f"Mask mismatch for case: {test_case}",
)

View File

@ -0,0 +1,4 @@
pandas
datasets
modelscope
tabulate

View File

@ -0,0 +1,188 @@
import argparse
import json
import os
from pathlib import Path
import pandas as pd
from tabulate import tabulate
CUR_PATH = Path(__file__).parent.resolve()
# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
"test_name": "Test name",
"avg_latency": "Mean latency (ms)",
"P50": "Median latency (ms)",
"P99": "P99 latency (ms)",
}
# throughput tests and the keys that will be printed into markdown
throughput_results = []
throughput_results_column_mapping = {
"test_name": "Test name",
"num_requests": "Num of reqs",
"total_num_tokens": "Total num of tokens",
"elapsed_time": "Elapsed time (s)",
"requests_per_second": "Tput (req/s)",
"tokens_per_second": "Tput (tok/s)",
}
# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
"request_rate": "Request rate (req/s)",
"request_throughput": "Tput (req/s)",
"output_throughput": "Output Tput (tok/s)",
"median_ttft_ms": "TTFT (ms)",
"median_tpot_ms": "TPOT (ms)",
"median_itl_ms": "ITL (ms)",
}
def read_markdown(file):
if os.path.exists(file):
with open(file) as f:
return f.read() + "\n"
else:
return f"{file} not found.\n"
def results_to_json(latency, throughput, serving):
return json.dumps(
{
"latency": latency.to_dict(),
"throughput": throughput.to_dict(),
"serving": serving.to_dict(),
}
)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Process the results of the benchmark tests."
)
parser.add_argument(
"--results_folder",
type=str,
default="../results/",
help="The folder where the benchmark results are stored.",
)
parser.add_argument(
"--output_folder",
type=str,
default="../results/",
help="The folder where the benchmark results are stored.",
)
parser.add_argument(
"--markdown_template",
type=str,
default="./perf_result_template.md",
help="The template file for the markdown report.",
)
parser.add_argument(
"--tag", default="main", help="Tag to be used for release message."
)
parser.add_argument(
"--commit_id", default="", help="Commit ID to be used for release message."
)
args = parser.parse_args()
results_folder = (CUR_PATH / args.results_folder).resolve()
output_folder = (CUR_PATH / args.output_folder).resolve()
markdown_template = (CUR_PATH / args.markdown_template).resolve()
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file) as f:
raw_result = json.loads(f.read())
if "serving" in str(test_file):
# this result is generated via `benchmark_serving.py`
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
serving_results.append(raw_result)
continue
elif "latency" in f.name:
# this result is generated via `benchmark_latency.py`
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# get different percentiles
for perc in [10, 25, 50, 75, 90, 99]:
# Multiply 1000 to convert the time unit from s to ms
raw_result.update(
{f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
)
raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
# add the result to raw_result
latency_results.append(raw_result)
continue
elif "throughput" in f.name:
# this result is generated via `benchmark_throughput.py`
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
throughput_results.append(raw_result)
continue
print(f"Skipping {test_file}")
serving_results.sort(key=lambda x: (len(x["test_name"]), x["test_name"]))
latency_results = pd.DataFrame.from_dict(latency_results)
serving_results = pd.DataFrame.from_dict(serving_results)
throughput_results = pd.DataFrame.from_dict(throughput_results)
raw_results_json = results_to_json(
latency_results, throughput_results, serving_results
)
# remapping the key, for visualization purpose
if not latency_results.empty:
latency_results = latency_results[list(latency_column_mapping.keys())].rename(
columns=latency_column_mapping
)
if not serving_results.empty:
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
columns=serving_column_mapping
)
if not throughput_results.empty:
throughput_results = throughput_results[
list(throughput_results_column_mapping.keys())
].rename(columns=throughput_results_column_mapping)
processed_results_json = results_to_json(
latency_results, throughput_results, serving_results
)
# get markdown tables
latency_md_table = tabulate(
latency_results, headers="keys", tablefmt="pipe", showindex=False
)
serving_md_table = tabulate(
serving_results, headers="keys", tablefmt="pipe", showindex=False
)
throughput_md_table = tabulate(
throughput_results, headers="keys", tablefmt="pipe", showindex=False
)
# document the result
print(output_folder)
with open(output_folder / "benchmark_results.md", "w") as f:
results = read_markdown(markdown_template)
results = results.format(
latency_tests_markdown_table=latency_md_table,
throughput_tests_markdown_table=throughput_md_table,
serving_tests_markdown_table=serving_md_table,
benchmarking_results_in_json_string=processed_results_json,
)
f.write(results)

View File

@ -0,0 +1,31 @@
## Online serving tests
- Input length: randomly sample 200 prompts from [ShareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json) and [lmarena-ai/vision-arena-bench-v0.1](https://huggingface.co/datasets/lmarena-ai/vision-arena-bench-v0.1/tree/main)(multi-modal) dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct
- Evaluation metrics: throughput, TTFT (median time to the first token), ITL (median inter-token latency), TPOT (median time per output token).
{serving_tests_markdown_table}
## Offline tests
### Latency tests
- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct
- Evaluation metrics: end-to-end latency.
{latency_tests_markdown_table}
### Throughput tests
- Input length: randomly sample 200 prompts from [ShareGPT](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/blob/main/ShareGPT_V3_unfiltered_cleaned_split.json) and [lmarena-ai/vision-arena-bench-v0.1](https://huggingface.co/datasets/lmarena-ai/vision-arena-bench-v0.1/tree/main)(multi-modal) dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- Models: Qwen/Qwen3-8B, Qwen/Qwen2.5-VL-7B-Instruct
- Evaluation metrics: throughput.
{throughput_tests_markdown_table}

View File

@ -0,0 +1,323 @@
#!/bin/bash
set -e
check_npus() {
# shellcheck disable=SC2155
declare -g npu_count=$(npu-smi info -l | grep "Total Count" | awk -F ':' '{print $2}' | tr -d ' ')
if [[ -z "$npu_count" || "$npu_count" -eq 0 ]]; then
echo "Need at least 1 NPU to run benchmarking."
exit 1
else
echo "found NPU conut: $npu_count"
fi
npu_type=$(npu-smi info | grep -E "^\| [0-9]+" | awk -F '|' '{print $2}' | awk '{$1=$1;print}' | awk '{print $2}')
echo "NPU type is: $npu_type"
}
ensure_sharegpt_downloaded() {
local FILE="/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json"
local DIR
DIR=$(dirname "$FILE")
if [ ! -f "$FILE" ]; then
echo "$FILE not found, downloading from hf-mirror ..."
mkdir -p "$DIR"
wget -O "$FILE" https://hf-mirror.com/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
if [ $? -ne 0 ]; then
echo "Download failed!" >&2
return 1
fi
echo "Download completed and saved to $FILE"
else
echo "$FILE already exists."
fi
}
json2args() {
# transforms the JSON string to command line args, and '_' is replaced to '-'
# example:
# input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
# output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
local json_string=$1
local args
args=$(
echo "$json_string" | jq -r '
to_entries |
map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
join(" ")
'
)
echo "$args"
}
wait_for_server() {
local waited=0
local timeout_sec=1200
while (( waited < timeout_sec )); do
if curl -s -X GET localhost:8000/health > /dev/null; then
return 0
fi
echo "Waiting for vllm server to start..."
sleep 1
((waited++))
done
echo "Timeout waiting for server"
return 1
}
get_cur_npu_id() {
npu-smi info -l | awk -F ':' '/NPU ID/ {print $2+0; exit}'
}
kill_npu_processes() {
ps -aux
lsof -t -i:8000 | xargs -r kill -9
pgrep python3 | xargs -r kill -9
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pgrep VLLM | xargs -r kill -9
sleep 4
rm -rf ~/.config/vllm
}
update_json_field() {
local json_file="$1"
local field_name="$2"
local field_value="$3"
jq --arg value "$field_value" \
--arg key "$field_name" \
'.[$key] = $value' "$json_file" > "${json_file}.tmp" && \
mv "${json_file}.tmp" "$json_file"
}
run_latency_tests() {
# run latency tests using `benchmark_latency.py`
# $1: a json file specifying latency test cases
local latency_test_file
latency_test_file=$1
# Iterate over latency tests
jq -c '.[]' "$latency_test_file" | while read -r params; do
# get the test name, and append the NPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^latency_ ]]; then
echo "In latency-test.json, test_name must start with \"latency_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
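# Usage example (illustrative): TEST_SELECTOR="latency_qwen3" bash benchmarks/scripts/run-performance-benchmarks.sh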
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get arguments
latency_params=$(echo "$params" | jq -r '.parameters')
latency_args=$(json2args "$latency_params")
latency_command="vllm bench latency \
--output-json $RESULTS_FOLDER/${test_name}.json \
$latency_args"
echo "Running test case $test_name"
echo "Latency command: $latency_command"
# run the benchmark
eval "$latency_command"
# echo model_name to result file
model_name=$(echo "$latency_params" | jq -r '.model')
update_json_field "$RESULTS_FOLDER/${test_name}.json" "model_name" "$model_name"
kill_npu_processes
done
}
run_throughput_tests() {
# run throughput tests using `benchmark_throughput.py`
# $1: a json file specifying throughput test cases
local throughput_test_file
throughput_test_file=$1
# Iterate over throughput tests
jq -c '.[]' "$throughput_test_file" | while read -r params; do
# get the test name, and append the NPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^throughput_ ]]; then
echo "In throughput-test.json, test_name must start with \"throughput_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get arguments
throughput_params=$(echo "$params" | jq -r '.parameters')
throughput_args=$(json2args "$throughput_params")
throughput_command="vllm bench throughput \
--output-json $RESULTS_FOLDER/${test_name}.json \
$throughput_args"
echo "Running test case $test_name"
echo "Throughput command: $throughput_command"
# run the benchmark
eval "$throughput_command"
# echo model_name to result file
model_name=$(echo "$throughput_params" | jq -r '.model')
update_json_field "$RESULTS_FOLDER/${test_name}.json" "model_name" "$model_name"
kill_npu_processes
done
}
run_serving_tests() {
# run serving tests using `benchmark_serving.py`
# $1: a json file specifying serving test cases
local serving_test_file
serving_test_file=$1
# Iterate over serving tests
jq -c '.[]' "$serving_test_file" | while read -r params; do
# get the test name, and append the NPU type back to it.
test_name=$(echo "$params" | jq -r '.test_name')
if [[ ! "$test_name" =~ ^serving_ ]]; then
echo "In serving-test.json, test_name must start with \"serving_\"."
exit 1
fi
# if TEST_SELECTOR is set, only run the test cases that match the selector
if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
echo "Skip test case $test_name."
continue
fi
# get client and server arguments
server_params=$(echo "$params" | jq -r '.server_parameters')
client_params=$(echo "$params" | jq -r '.client_parameters')
server_args=$(json2args "$server_params")
client_args=$(json2args "$client_params")
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
# check if the server model and client model are aligned
server_model=$(echo "$server_params" | jq -r '.model')
client_model=$(echo "$client_params" | jq -r '.model')
if [[ $server_model != "$client_model" ]]; then
echo "Server model and client model must be the same. Skip testcase $test_name."
continue
fi
server_command="python3 \
-m vllm.entrypoints.openai.api_server \
$server_args"
# run the server
echo "Running test case $test_name"
echo "Server command: $server_command"
bash -c "$server_command" &
server_pid=$!
# wait until the server is alive
if wait_for_server; then
echo ""
echo "vllm server is up and running."
else
echo ""
echo "vllm failed to start within the timeout period."
fi
# iterate over different QPS
for qps in $qps_list; do
# remove the surrounding single quote from qps
if [[ "$qps" == *"inf"* ]]; then
echo "qps was $qps"
qps="inf"
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
$client_args"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
bash -c "$client_command"
done
# clean up
kill -9 $server_pid
kill_npu_processes
done
}
cleanup() {
rm -rf ./vllm_benchmarks
}
cleanup_on_error() {
echo "An error occurred. Cleaning up results folder..."
rm -rf $RESULTS_FOLDER
}
main() {
START_TIME=$(date +%s)
check_npus
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)
(which lsof) || (apt-get update && apt-get install -y lsof)
# get the current IP address, required by benchmark_serving.py
# shellcheck disable=SC2155
export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
# turn off the reporting of the status of each request, to clean up the terminal output
export VLLM_LOG_LEVEL="WARNING"
# set env
export VLLM_USE_MODELSCOPE=True
# prepare for benchmarking
cd benchmarks || exit 1
trap cleanup EXIT
QUICK_BENCHMARK_ROOT=./
declare -g RESULTS_FOLDER=results
mkdir -p $RESULTS_FOLDER
trap cleanup_on_error ERR
ensure_sharegpt_downloaded
# benchmarks
run_serving_tests $QUICK_BENCHMARK_ROOT/tests/serving-tests.json
run_latency_tests $QUICK_BENCHMARK_ROOT/tests/latency-tests.json
run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/throughput-tests.json
END_TIME=$(date +%s)
ELAPSED_TIME=$((END_TIME - START_TIME))
echo "Total execution time: $ELAPSED_TIME seconds"
}
main "$@"

View File

@ -0,0 +1,23 @@
[
{
"test_name": "latency_qwen3_8B_tp1",
"parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"max_model_len": 16384,
"num_iters_warmup": 5,
"num_iters": 15
}
},
{
"test_name": "latency_qwen2_5_7B_tp1",
"parameters": {
"model": "Qwen/Qwen2.5-7B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
"num_iters": 15
}
}
]

View File

@ -0,0 +1,78 @@
[
{
"test_name": "serving_qwen2_5vl_7B_tp1",
"qps_list": [
1,
4,
16,
"inf"
],
"server_parameters": {
"model": "Qwen/Qwen2.5-VL-7B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"trust_remote_code": "",
"max_model_len": 16384
},
"client_parameters": {
"model": "Qwen/Qwen2.5-VL-7B-Instruct",
"endpoint_type": "openai-chat",
"dataset_name": "hf",
"hf_split": "train",
"endpoint": "/v1/chat/completions",
"dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
"num_prompts": 200,
"no_stream": ""
}
},
{
"test_name": "serving_qwen3_8B_tp1",
"qps_list": [
1,
4,
16,
"inf"
],
"server_parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "Qwen/Qwen3-8B",
"endpoint_type": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_qwen2_5_7B_tp1",
"qps_list": [
1,
4,
16,
"inf"
],
"server_parameters": {
"model": "Qwen/Qwen2.5-7B-Instruct",
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
"model": "Qwen/Qwen2.5-7B-Instruct",
"endpoint_type": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
}
]

View File

@ -0,0 +1,38 @@
[
{
"test_name": "throughput_qwen3_8B_tp1",
"parameters": {
"model": "Qwen/Qwen3-8B",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
},
{
"test_name": "throughput_qwen2_5vl_7B_tp1",
"parameters": {
"model": "Qwen/Qwen2.5-VL-7B-Instruct",
"tensor_parallel_size": 1,
"backend": "vllm-chat",
"dataset_name": "hf",
"hf_split": "train",
"max_model_len": 16384,
"dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
"num_prompts": 200
}
},
{
"test_name": "throughput_qwen2_5_7B_tp1",
"parameters": {
"model": "Qwen/Qwen2.5-7B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset_path": "/github/home/.cache/datasets/ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200,
"backend": "vllm"
}
}
]

133
cmake/utils.cmake Normal file
View File

@ -0,0 +1,133 @@
#
# Attempt to find the python package that uses the same python executable as
# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`.
#
macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
set(Python_EXECUTABLE ${EXECUTABLE})
find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
if (NOT Python_FOUND)
message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
endif()
set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}")
set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN})
if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST)
message(FATAL_ERROR
"Python version (${_VER}) is not one of the supported versions: "
"${_SUPPORTED_VERSIONS_LIST}.")
endif()
message(STATUS "Found python matching: ${EXECUTABLE}.")
endmacro()
#
# Run `EXPR` in python. The standard output of python is stored in `OUT` and
# has trailing whitespace stripped. If an error is encountered when running
# python, a fatal message `ERR_MSG` is issued.
#
function (run_python OUT EXPR ERR_MSG)
execute_process(
COMMAND
"${PYTHON_EXECUTABLE}" "-c" "${EXPR}"
OUTPUT_VARIABLE PYTHON_OUT
RESULT_VARIABLE PYTHON_ERROR_CODE
ERROR_VARIABLE PYTHON_STDERR
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(NOT PYTHON_ERROR_CODE EQUAL 0)
message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}")
endif()
set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
endfunction()
# Run `EXPR` in python after importing `PKG`. Use the result of this to extend
# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
macro (append_cmake_prefix_path PKG EXPR)
run_python(_PREFIX_PATH
"import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path")
list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH})
endmacro()
# This cmake function is adapted from vLLM's cmake/utils.cmake.
# Define a target named `GPU_MOD_NAME` for a single extension. The
# arguments are:
#
# DESTINATION <dest> - Module destination directory.
# LANGUAGE <lang> - The GPU language for this module, e.g CUDA, HIP,
# etc.
# SOURCES <sources> - List of source files relative to CMakeLists.txt
# directory.
#
# Optional arguments:
#
# ARCHITECTURES <arches> - A list of target GPU architectures in cmake
# format.
# Refer `CMAKE_CUDA_ARCHITECTURES` documentation
# and `CMAKE_HIP_ARCHITECTURES` for more info.
# ARCHITECTURES will use cmake's defaults if
# not provided.
# COMPILE_FLAGS <flags> - Extra compiler flags passed to NVCC/hip.
# INCLUDE_DIRECTORIES <dirs> - Extra include directories.
# LIBRARIES <libraries> - Extra link libraries.
# WITH_SOABI - Generate library with python SOABI suffix name.
# USE_SABI <version> - Use python stable api <version>
#
# Note: optimization level/debug info is set via cmake build type.
#
function (define_gpu_extension_target GPU_MOD_NAME)
cmake_parse_arguments(PARSE_ARGV 1
GPU
"WITH_SOABI"
"DESTINATION;LANGUAGE;USE_SABI"
"SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
# Add hipify preprocessing step when building with HIP/ROCm.
if (GPU_LANGUAGE STREQUAL "HIP")
hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
endif()
if (GPU_WITH_SOABI)
set(GPU_WITH_SOABI WITH_SOABI)
else()
set(GPU_WITH_SOABI)
endif()
if (GPU_USE_SABI)
Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
else()
Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
endif()
if (GPU_LANGUAGE STREQUAL "HIP")
# Make this target dependent on the hipify preprocessor step.
add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
endif()
if (GPU_ARCHITECTURES)
set_target_properties(${GPU_MOD_NAME} PROPERTIES
${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
endif()
set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
target_compile_options(${GPU_MOD_NAME} PRIVATE
$<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
target_compile_definitions(${GPU_MOD_NAME} PRIVATE
"-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
${GPU_INCLUDE_DIRECTORIES})
target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})
# Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
# dependencies that are not necessary and may not be installed.
if (GPU_LANGUAGE STREQUAL "CUDA")
target_link_libraries(${GPU_MOD_NAME} PRIVATE CUDA::cudart CUDA::cuda_driver)
else()
target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
endif()
install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION} COMPONENT ${GPU_MOD_NAME})
endfunction()

28
codecov.yml Normal file
View File

@ -0,0 +1,28 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
coverage:
status:
# Patch coverage is mandatory and must be >= 80%
patch:
default:
target: 80%
# non-voting
project:
default:
# non-voting
informational: true

489
collect_env.py Normal file
View File

@ -0,0 +1,489 @@
#
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/collect_env.py
#
import datetime
import locale
import os
import re
import subprocess
import sys
from collections import namedtuple
from vllm.envs import environment_variables
try:
import torch
TORCH_AVAILABLE = True
except (ImportError, NameError, AttributeError, OSError):
TORCH_AVAILABLE = False
# System Environment Information
SystemEnv = namedtuple(
'SystemEnv',
[
'torch_version',
'is_debug_build',
'gcc_version',
'clang_version',
'cmake_version',
'os',
'libc_version',
'python_version',
'python_platform',
'pip_version', # 'pip' or 'pip3'
'pip_packages',
'conda_packages',
'cpu_info',
'vllm_version', # vllm specific field
'vllm_ascend_version', # vllm ascend specific field
'env_vars',
'npu_info', # ascend specific field
'cann_info', # ascend specific field
])
DEFAULT_CONDA_PATTERNS = {
"torch",
"numpy",
"soumith",
"mkl",
"magma",
"optree",
"transformers",
"zmq",
"pynvml",
}
DEFAULT_PIP_PATTERNS = {
"torch",
"numpy",
"mypy",
"flake8",
"optree",
"onnx",
"transformers",
"zmq",
"pynvml",
}
def run(command):
"""Return (return-code, stdout, stderr)."""
shell = True if type(command) is str else False
p = subprocess.Popen(command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
shell=shell)
raw_output, raw_err = p.communicate()
rc = p.returncode
if get_platform() == 'win32':
enc = 'oem'
else:
enc = locale.getpreferredencoding()
output = raw_output.decode(enc)
err = raw_err.decode(enc)
return rc, output.strip(), err.strip()
def run_and_read_all(run_lambda, command):
"""Run command using run_lambda; reads and returns entire output if rc is 0."""
rc, out, _ = run_lambda(command)
if rc != 0:
return None
return out
def run_and_parse_first_match(run_lambda, command, regex):
"""Run command using run_lambda, returns the first regex match if it exists."""
rc, out, _ = run_lambda(command)
if rc != 0:
return None
match = re.search(regex, out)
if match is None:
return None
return match.group(1)
def run_and_return_first_line(run_lambda, command):
"""Run command using run_lambda and returns first line if output is not empty."""
rc, out, _ = run_lambda(command)
if rc != 0:
return None
return out.split('\n')[0]
def get_conda_packages(run_lambda, patterns=None):
if patterns is None:
patterns = DEFAULT_CONDA_PATTERNS
conda = os.environ.get('CONDA_EXE', 'conda')
out = run_and_read_all(run_lambda, "{} list".format(conda))
if out is None:
return out
return "\n".join(line for line in out.splitlines()
if not line.startswith("#") and any(name in line
for name in patterns))
def get_gcc_version(run_lambda):
return run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)')
def get_clang_version(run_lambda):
return run_and_parse_first_match(run_lambda, 'clang --version',
r'clang version (.*)')
def get_cmake_version(run_lambda):
return run_and_parse_first_match(run_lambda, 'cmake --version',
r'cmake (.*)')
def _parse_version(version, version_tuple):
version_str = version_tuple[-1]
if isinstance(version_str, str) and version_str.startswith('g'):
if '.' in version_str:
git_sha = version_str.split('.')[0][1:]
date = version_str.split('.')[-1][1:]
return f"{version} (git sha: {git_sha}, date: {date})"
else:
git_sha = version_str[1:] # type: ignore
return f"{version} (git sha: {git_sha})"
return version
def get_vllm_version():
from vllm import __version__, __version_tuple__
return _parse_version(__version__, __version_tuple__)
def get_vllm_ascend_version():
from vllm_ascend._version import __version__, __version_tuple__
return _parse_version(__version__, __version_tuple__)
def get_cpu_info(run_lambda):
rc, out, err = 0, '', ''
if get_platform() == 'linux':
rc, out, err = run_lambda('lscpu')
elif get_platform() == 'win32':
rc, out, err = run_lambda(
'wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \
CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE'
)
elif get_platform() == 'darwin':
rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
cpu_info = 'None'
if rc == 0:
cpu_info = out
else:
cpu_info = err
return cpu_info
def get_platform():
if sys.platform.startswith('linux'):
return 'linux'
elif sys.platform.startswith('win32'):
return 'win32'
elif sys.platform.startswith('cygwin'):
return 'cygwin'
elif sys.platform.startswith('darwin'):
return 'darwin'
else:
return sys.platform
def get_mac_version(run_lambda):
return run_and_parse_first_match(run_lambda, 'sw_vers -productVersion',
r'(.*)')
def get_windows_version(run_lambda):
system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
wmic_cmd = os.path.join(system_root, 'System32', 'Wbem', 'wmic')
findstr_cmd = os.path.join(system_root, 'System32', 'findstr')
return run_and_read_all(
run_lambda,
'{} os get Caption | {} /v Caption'.format(wmic_cmd, findstr_cmd))
def get_lsb_version(run_lambda):
return run_and_parse_first_match(run_lambda, 'lsb_release -a',
r'Description:\t(.*)')
def check_release_file(run_lambda):
return run_and_parse_first_match(run_lambda, 'cat /etc/*-release',
r'PRETTY_NAME="(.*)"')
def get_os(run_lambda):
from platform import machine
platform = get_platform()
if platform == 'win32' or platform == 'cygwin':
return get_windows_version(run_lambda)
if platform == 'darwin':
version = get_mac_version(run_lambda)
if version is None:
return None
return 'macOS {} ({})'.format(version, machine())
if platform == 'linux':
# Ubuntu/Debian based
desc = get_lsb_version(run_lambda)
if desc is not None:
return '{} ({})'.format(desc, machine())
# Try reading /etc/*-release
desc = check_release_file(run_lambda)
if desc is not None:
return '{} ({})'.format(desc, machine())
return '{} ({})'.format(platform, machine())
# Unknown platform
return platform
def get_python_platform():
import platform
return platform.platform()
def get_libc_version():
import platform
if get_platform() != 'linux':
return 'N/A'
return '-'.join(platform.libc_ver())
def get_pip_packages(run_lambda, patterns=None):
"""Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages."""
if patterns is None:
patterns = DEFAULT_PIP_PATTERNS
# People generally have `pip` as `pip` or `pip3`
# But here it is invoked as `python -mpip`
def run_with_pip(pip):
out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
return "\n".join(line for line in out.splitlines()
if any(name in line for name in patterns))
pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
out = run_with_pip([sys.executable, '-mpip'])
return pip_version, out
def get_npu_info(run_lambda):
return run_and_read_all(run_lambda, 'npu-smi info')
def get_cann_info(run_lambda):
out = run_and_read_all(run_lambda, 'lscpu | grep Architecture:')
cpu_arch = str(out).split()[-1]
return run_and_read_all(
run_lambda,
'cat /usr/local/Ascend/ascend-toolkit/latest/{}-linux/ascend_toolkit_install.info'
.format(cpu_arch))
def get_env_vars():
env_vars = ''
secret_terms = ('secret', 'token', 'api', 'access', 'password')
report_prefix = ("TORCH", "PYTORCH", "ASCEND_", "ATB_")
for k, v in os.environ.items():
if any(term in k.lower() for term in secret_terms):
continue
if k in environment_variables:
env_vars = env_vars + "{}={}".format(k, v) + "\n"
if k.startswith(report_prefix):
env_vars = env_vars + "{}={}".format(k, v) + "\n"
return env_vars
def get_env_info():
run_lambda = run
pip_version, pip_list_output = get_pip_packages(run_lambda)
if TORCH_AVAILABLE:
version_str = torch.__version__
debug_mode_str = str(torch.version.debug)
else:
version_str = debug_mode_str = 'N/A'
sys_version = sys.version.replace("\n", " ")
conda_packages = get_conda_packages(run_lambda)
return SystemEnv(
torch_version=version_str,
is_debug_build=debug_mode_str,
python_version='{} ({}-bit runtime)'.format(
sys_version,
sys.maxsize.bit_length() + 1),
python_platform=get_python_platform(),
pip_version=pip_version,
pip_packages=pip_list_output,
conda_packages=conda_packages,
os=get_os(run_lambda),
libc_version=get_libc_version(),
gcc_version=get_gcc_version(run_lambda),
clang_version=get_clang_version(run_lambda),
cmake_version=get_cmake_version(run_lambda),
cpu_info=get_cpu_info(run_lambda),
vllm_version=get_vllm_version(),
vllm_ascend_version=get_vllm_ascend_version(),
env_vars=get_env_vars(),
npu_info=get_npu_info(run_lambda),
cann_info=get_cann_info(run_lambda),
)
env_info_fmt = """
PyTorch version: {torch_version}
Is debug build: {is_debug_build}
OS: {os}
GCC version: {gcc_version}
Clang version: {clang_version}
CMake version: {cmake_version}
Libc version: {libc_version}
Python version: {python_version}
Python platform: {python_platform}
CPU:
{cpu_info}
Versions of relevant libraries:
{pip_packages}
{conda_packages}
""".strip()
# both the above code and the following code use `strip()` to
# remove leading/trailing whitespaces, so we need to add a newline
# in between to separate the two sections
env_info_fmt += "\n"
env_info_fmt += """
vLLM Version: {vllm_version}
vLLM Ascend Version: {vllm_ascend_version}
ENV Variables:
{env_vars}
NPU:
{npu_info}
CANN:
{cann_info}
""".strip()
def pretty_str(envinfo):
def replace_nones(dct, replacement='Could not collect'):
for key in dct.keys():
if dct[key] is not None:
continue
dct[key] = replacement
return dct
def replace_bools(dct, true='Yes', false='No'):
for key in dct.keys():
if dct[key] is True:
dct[key] = true
elif dct[key] is False:
dct[key] = false
return dct
def prepend(text, tag='[prepend]'):
lines = text.split('\n')
updated_lines = [tag + line for line in lines]
return '\n'.join(updated_lines)
def replace_if_empty(text, replacement='No relevant packages'):
if text is not None and len(text) == 0:
return replacement
return text
def maybe_start_on_next_line(string):
# If `string` is multiline, prepend a \n to it.
if string is not None and len(string.split('\n')) > 1:
return '\n{}\n'.format(string)
return string
mutable_dict = envinfo._asdict()
# Replace True with Yes, False with No
mutable_dict = replace_bools(mutable_dict)
# Replace all None objects with 'Could not collect'
mutable_dict = replace_nones(mutable_dict)
# If either of these are '', replace with 'No relevant packages'
mutable_dict['pip_packages'] = replace_if_empty(
mutable_dict['pip_packages'])
mutable_dict['conda_packages'] = replace_if_empty(
mutable_dict['conda_packages'])
# Tag conda and pip packages with a prefix
# If they were previously None, they'll show up as ie '[conda] Could not collect'
if mutable_dict['pip_packages']:
mutable_dict['pip_packages'] = prepend(
mutable_dict['pip_packages'], '[{}] '.format(envinfo.pip_version))
if mutable_dict['conda_packages']:
mutable_dict['conda_packages'] = prepend(
mutable_dict['conda_packages'], '[conda] ')
mutable_dict['cpu_info'] = envinfo.cpu_info
mutable_dict['npu_info'] = envinfo.npu_info
mutable_dict['cann_info'] = envinfo.cann_info
return env_info_fmt.format(**mutable_dict)
def get_pretty_env_info():
return pretty_str(get_env_info())
def main():
print("Collecting environment information...")
output = get_pretty_env_info()
print(output)
if TORCH_AVAILABLE and hasattr(torch, 'utils') and hasattr(
torch.utils, '_crash_handler'):
minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
if sys.platform == "linux" and os.path.exists(minidump_dir):
dumps = [
os.path.join(minidump_dir, dump)
for dump in os.listdir(minidump_dir)
]
latest = max(dumps, key=os.path.getctime)
ctime = os.path.getctime(latest)
creation_time = datetime.datetime.fromtimestamp(ctime).strftime(
'%Y-%m-%d %H:%M:%S')
msg = "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time) + \
"if this is related to your bug please include it when you file a report ***"
print(msg, file=sys.stderr)
if __name__ == '__main__':
main()

338
csrc/camem_allocator.cpp Normal file
View File

@ -0,0 +1,338 @@
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <iostream>
extern "C" {
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <sys/types.h>
#include "acl/acl.h"
// Global references to Python callables
// NOTE: this is borrowed reference, so we don't need to DECREF them.
// This brings the limitation that the allocator needs to be singleton.
static PyObject* g_python_malloc_callback = nullptr;
static PyObject* g_python_free_callback = nullptr;
// ---------------------------------------------------------------------------
// Helper functions:
void ensure_context(unsigned long long device) {
aclrtContext pctx;
aclrtGetCurrentContext(&pctx);
if (!pctx) {
// Ensure device context.
aclrtCreateContext(&pctx, device);
aclrtSetCurrentContext(pctx);
}
}
void create_and_map(unsigned long long device, ssize_t size, void* d_mem,
aclrtDrvMemHandle* p_memHandle) {
ensure_context(device);
// Define memory allocation properties
aclrtPhysicalMemProp prop = {};
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
prop.memAttr = ACL_HBM_MEM_HUGE;
prop.location.id = device;
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
prop.reserve = 0;
// Allocate memory using aclrtMallocPhysical
aclError error_code = aclrtMallocPhysical(p_memHandle, size, &prop, 0);
if (error_code != 0) {
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl;
return;
}
error_code = aclrtMapMem(d_mem, size, 0, *p_memHandle, 0);
if (error_code != 0) {
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl;
return;
}
}
void unmap_and_release(unsigned long long device, ssize_t size,
void* d_mem,
aclrtDrvMemHandle* p_memHandle) {
// std::cout << "unmap_and_release: device=" << device << ", size=" << size <<
// ", d_mem=" << d_mem << ", p_memHandle=" << p_memHandle << std::endl;
ensure_context(device);
aclError error_code = aclrtUnmapMem(d_mem);
if (error_code != 0) {
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl;
return;
}
error_code = aclrtFreePhysical(*p_memHandle);
if (error_code != 0) {
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl;
return;
}
}
PyObject* create_tuple_from_c_integers(unsigned long long a,
unsigned long long b,
unsigned long long c,
unsigned long long d) {
// Create a new tuple of size 4
PyObject* tuple = PyTuple_New(4);
if (!tuple) {
return NULL; // Return NULL on failure
}
// Convert integers to Python objects and set them in the tuple
PyTuple_SetItem(
tuple, 0,
PyLong_FromUnsignedLongLong(a)); // Steals reference to the PyLong
PyTuple_SetItem(tuple, 1, PyLong_FromUnsignedLongLong(b));
PyTuple_SetItem(tuple, 2, PyLong_FromUnsignedLongLong(c));
PyTuple_SetItem(tuple, 3, PyLong_FromUnsignedLongLong(d));
// Note: PyTuple_SetItem "steals" a reference to each object,
// so we do not need to Py_DECREF the PyLong objects explicitly.
return tuple; // Return the created tuple
}
// ---------------------------------------------------------------------------
// Our exported C functions that call Python:
__attribute__ ((visibility("default"))) void* my_malloc(ssize_t size, int device, aclrtStream stream) {
ensure_context(device);
// first allocation, align the size, and reserve an address, and also allocate
// a aclrtDrvMemHandle
// Define memory allocation properties
aclrtPhysicalMemProp prop = {};
prop.handleType = ACL_MEM_HANDLE_TYPE_NONE ;
prop.allocationType = ACL_MEM_ALLOCATION_TYPE_PINNED;
prop.memAttr = ACL_HBM_MEM_HUGE;
prop.location.id = device;
prop.location.type = ACL_MEM_LOCATION_TYPE_DEVICE;
prop.reserve = 0;
// Check if the allocation is supported
size_t granularity;
aclError error_code = aclrtMemGetAllocationGranularity(&prop,
ACL_RT_MEM_ALLOC_GRANULARITY_MINIMUM,
&granularity);
if (error_code != 0) {
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl;
return nullptr;
}
size_t alignedSize = ((size + granularity - 1) / granularity) * granularity;
void *d_mem;
error_code = aclrtReserveMemAddress(&d_mem, alignedSize, 0, nullptr, 0);
if (error_code != 0) {
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl;
return nullptr;
}
// allocate the aclrtDrvMemHandle
aclrtDrvMemHandle* p_memHandle =
(aclrtDrvMemHandle*)malloc(sizeof(aclrtDrvMemHandle));
if (!g_python_malloc_callback) {
std::cerr << "ERROR: g_python_malloc_callback not set.\n";
return nullptr;
}
// Acquire GIL (not in stable ABI officially, but often works)
PyGILState_STATE gstate = PyGILState_Ensure();
PyObject* arg_tuple = create_tuple_from_c_integers(
(unsigned long long)device, (unsigned long long)alignedSize,
(unsigned long long)d_mem, (unsigned long long)p_memHandle);
// Call g_python_malloc_callback
PyObject* py_result =
PyObject_CallFunctionObjArgs(g_python_malloc_callback, arg_tuple, NULL);
Py_DECREF(arg_tuple);
if (!py_result) {
PyErr_Print();
PyGILState_Release(gstate);
return nullptr;
}
PyGILState_Release(gstate);
// do the final mapping
create_and_map(device, alignedSize, d_mem, p_memHandle);
return (void*)d_mem;
}
__attribute__ ((visibility("default"))) void my_free(void* ptr, ssize_t size, int device, aclrtStream stream) {
// get memory handle from the pointer
if (!g_python_free_callback) {
std::cerr << "ERROR: g_python_free_callback not set.\n";
return;
}
// Acquire GIL (not in stable ABI officially, but often works)
PyGILState_STATE gstate = PyGILState_Ensure();
PyObject* py_ptr =
PyLong_FromUnsignedLongLong(reinterpret_cast<unsigned long long>(ptr));
PyObject* py_result =
PyObject_CallFunctionObjArgs(g_python_free_callback, py_ptr, NULL);
if (!py_result || !PyTuple_Check(py_result) || PyTuple_Size(py_result) != 4) {
PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
return;
}
unsigned long long recv_device, recv_size;
unsigned long long recv_d_mem, recv_p_memHandle;
// Unpack the tuple into four C integers
if (!PyArg_ParseTuple(py_result, "KKKK", &recv_device, &recv_size,
&recv_d_mem, &recv_p_memHandle)) {
// PyArg_ParseTuple sets an error if it fails
return;
}
PyGILState_Release(gstate);
// recv_size == size
// recv_device == device
// Free memory
void *d_mem = (void*)recv_d_mem;
// allocate the aclrtDrvMemHandle
aclrtDrvMemHandle* p_memHandle =
(aclrtDrvMemHandle*)recv_p_memHandle;
unmap_and_release(device, size, d_mem, p_memHandle);
// free address and the handle
aclError error_code = aclrtReleaseMemAddress(d_mem);
if (error_code != 0) {
std::cerr << "acl Error, code: " << error_code << " at " << __FILE__ << ":" \
<< __LINE__ << std::endl;
return;
}
free(p_memHandle);
}
// ---------------------------------------------------------------------------
// Python extension boilerplate:
// Python-exposed function: init_module(python_malloc, python_free)
static PyObject* py_init_module(PyObject* self, PyObject* args) {
PyObject* malloc_callback = nullptr;
PyObject* free_callback = nullptr;
if (!PyArg_ParseTuple(args, "OO", &malloc_callback, &free_callback)) {
return nullptr;
}
if (!PyCallable_Check(malloc_callback) || !PyCallable_Check(free_callback)) {
PyErr_SetString(PyExc_TypeError, "Both arguments must be callables");
return nullptr;
}
// Save the Python callables
// This module does not handle GC of these objects, so they must be kept alive
// outside of this module.
g_python_malloc_callback = malloc_callback;
g_python_free_callback = free_callback;
Py_RETURN_NONE;
}
static PyObject* python_unmap_and_release(PyObject* self, PyObject* args) {
if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) {
PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
return nullptr;
}
unsigned long long recv_device, recv_size;
unsigned long long recv_d_mem, recv_p_memHandle;
// Unpack the tuple into four C integers
if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
&recv_p_memHandle)) {
// PyArg_ParseTuple sets an error if it fails
return nullptr;
}
void *d_mem_ptr = (void*)recv_d_mem;
aclrtDrvMemHandle* p_memHandle =
(aclrtDrvMemHandle*)recv_p_memHandle;
unmap_and_release(recv_device, recv_size, d_mem_ptr, p_memHandle);
Py_RETURN_NONE;
}
static PyObject* python_create_and_map(PyObject* self, PyObject* args) {
if (!args || !PyTuple_Check(args) || PyTuple_Size(args) != 4) {
PyErr_SetString(PyExc_TypeError, "Expected a tuple of size 4");
return nullptr;
}
unsigned long long recv_device, recv_size;
unsigned long long recv_d_mem, recv_p_memHandle;
// Unpack the tuple into four C integers
if (!PyArg_ParseTuple(args, "KKKK", &recv_device, &recv_size, &recv_d_mem,
&recv_p_memHandle)) {
// PyArg_ParseTuple sets an error if it fails
return nullptr;
}
void *d_mem_ptr = (void*)recv_d_mem;
aclrtDrvMemHandle* p_memHandle =
(aclrtDrvMemHandle*)recv_p_memHandle;
create_and_map(recv_device, recv_size, d_mem_ptr, p_memHandle);
Py_RETURN_NONE;
}
static PyMethodDef module_methods[] = {
{"init_module", (PyCFunction)py_init_module, METH_VARARGS,
"Initialize module with python_malloc and python_free callables."},
{"python_create_and_map", (PyCFunction)python_create_and_map, METH_VARARGS,
"Create and map memory on the device."},
{"python_unmap_and_release", (PyCFunction)python_unmap_and_release,
METH_VARARGS, "Unmap and release memory on the device."},
{NULL, NULL, 0, NULL} // sentinel
};
static struct PyModuleDef camem_allocator_module = {
PyModuleDef_HEAD_INIT, "camem_allocator",
"CANN-mem-based allocator for NPUPluggableAllocator", -1, module_methods};
PyMODINIT_FUNC PyInit_vllm_ascend_C(void) {
// Initialize the module
PyObject* module = PyModule_Create(&camem_allocator_module);
if (!module) {
return NULL;
}
return module;
}
} // extern "C"

View File

@ -0,0 +1,369 @@
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel_operator.h"
#include "types.h"
template <typename scalar_t>
class BGMVExpand {
public:
using X_T = float;
using W_T = scalar_t;
using Y_T = scalar_t;
static constexpr uint64_t LORA_RANK_8 = 8;
static constexpr uint64_t LORA_RANK_16 = 16;
static constexpr uint64_t LORA_RANK_32 = 32;
static constexpr uint64_t LORA_RANK_64 = 64;
static constexpr uint64_t SUPPORTED_RANKS[] = {LORA_RANK_8, LORA_RANK_16, LORA_RANK_32, LORA_RANK_64};
static constexpr int32_t BUFFER_NUM = 2;
// The vector unit reads 8 blocks (32 bytes each and 256 bytes in total) of contiguous data each time.
static constexpr int32_t NUM_BYTES_PER_REPEAT = 256;
static constexpr int32_t NUM_BLOCKS_PER_REPEAT = 8;
// The maximum number of elements in a single iteration is 256 / sizeof(intermediate data type).
static constexpr int32_t NUM_ELEMENTS_PER_REPEAT = NUM_BYTES_PER_REPEAT / sizeof(float);
// Mask is used to control the elements that participate in computation in each iteration.
static constexpr int32_t MASK_COUNT = NUM_BYTES_PER_REPEAT / sizeof(float);
// Refer to numOutputElementsPerInputTile_ initialization for the constraints on the following constants.
static constexpr int32_t W_IN_TILE_NUM_ELEMENTS = 8192;
static constexpr int32_t Y_OUT_TILE_NUM_ELEMENTS = 4096;
static constexpr int32_t BLOCK_REDUCE_NUM_REPEATS = W_IN_TILE_NUM_ELEMENTS / NUM_ELEMENTS_PER_REPEAT;
// BlockReduceSum would generate (BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT) floats,
// so we need to read them all and apply PairReduceSum.
static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_16 =
(BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT;
// The second PairReduceSum for rank=32 needs half of the repetitions used for rank=16;
// the same holds for rank=64. Ranks greater than 64 are not supported.
static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_32 = (PAIR_REDUCE_NUM_REPEATS_16 + 1) / 2;
public:
__aicore__ inline BGMVExpand(AscendC::TPipe* pipe) : pipe_(pipe) {}
__aicore__ inline void Init(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,
uint32_t indicesSize, __gm__ void* yIn, __gm__ void* yOut,
uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank,
uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim)
{
batchSize_ = batchSize;
numTokensPerCore_ = numTokensPerCore;
maxLoRARank_ = maxLoRARank;
outputHiddenDim_ = outputHiddenDim;
sliceOffset_ = sliceOffset;
outputFullDim_ = outputFullDim;
singleLoRAWeightLen_ = maxLoRARank_ * outputHiddenDim_;
xGm_.SetGlobalBuffer((__gm__ X_T *)x);
wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
yInGm_.SetGlobalBuffer((__gm__ Y_T *)yIn);
yOutGm_.SetGlobalBuffer((__gm__ Y_T *)yOut);
indicesGm_.SetGlobalBuffer((__gm__ int64_t *)indices, indicesSize);
pipe_->InitBuffer(inQueueX_, 1, NUM_ELEMENTS_PER_REPEAT * sizeof(X_T));
pipe_->InitBuffer(inQueueW_, BUFFER_NUM, W_IN_TILE_NUM_ELEMENTS * sizeof(W_T));
pipe_->InitBuffer(inQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T));
pipe_->InitBuffer(outQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T));
pipe_->InitBuffer(dupBufferX_, NUM_ELEMENTS_PER_REPEAT * sizeof(float));
pipe_->InitBuffer(tmpBufferW_, W_IN_TILE_NUM_ELEMENTS * sizeof(float));
pipe_->InitBuffer(inBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float));
pipe_->InitBuffer(tmpBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float));
// Each compute iteration would generate not one, but several output elements.
// Therefore, the following variable would determine how many output elements are calculated in each iteration.
numOutputElementsPerInputTile_ = BLOCK_REDUCE_NUM_REPEATS * (NUM_ELEMENTS_PER_REPEAT / maxLoRARank_);
numStreamInPerOutputTile_ = Y_OUT_TILE_NUM_ELEMENTS / numOutputElementsPerInputTile_;
}
__aicore__ inline void Process()
{
int64_t blockIdx = AscendC::GetBlockIdx();
int64_t startIdx = blockIdx * numTokensPerCore_;
int64_t endIdx = startIdx + numTokensPerCore_;
if (endIdx > batchSize_) {
endIdx = batchSize_;
}
for (int64_t idx = startIdx; idx < endIdx; idx++) {
yOffset_ = outputFullDim_ * idx + sliceOffset_;
// Set up LoRA index
CopyInIndex(idx);
if (reqLoRAIndex_ < 0) {
continue;
}
reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;
CopyInX(idx);
int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS;
for (int32_t i = 0; i < numStreamOut; i++) {
CopyInY(i);
for (int32_t j = 0; j < numStreamInPerOutputTile_; j++) {
CopyInW(i * numStreamInPerOutputTile_ + j);
Compute(j * numOutputElementsPerInputTile_);
}
ScaleOutput();
CopyOut(i);
}
ComputeLastIteration();
}
}
private:
__aicore__ inline void CopyInIndex(const int64_t idx)
{
// Look up the LoRA index
reqLoRAIndex_ = indicesGm_.GetValue(idx);
}
__aicore__ inline void ComputeLastIteration()
{
int32_t remainingY = outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS;
if (remainingY == 0) {
return;
}
int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS;
int32_t remainingW = remainingY * maxLoRARank_;
int32_t numCompleteWTileInForLastIteration = remainingW / W_IN_TILE_NUM_ELEMENTS;
int32_t remainingWForLastRepeat = remainingW % W_IN_TILE_NUM_ELEMENTS;
CopyInY(numStreamOut, remainingY);
int32_t outputIdx = 0;
for (outputIdx = 0; outputIdx < numCompleteWTileInForLastIteration; outputIdx++) {
CopyInW(numStreamOut * numStreamInPerOutputTile_ + outputIdx);
Compute(outputIdx * numOutputElementsPerInputTile_);
}
if (remainingWForLastRepeat != 0) {
CopyInW(numStreamOut * numStreamInPerOutputTile_ + numCompleteWTileInForLastIteration,
remainingWForLastRepeat);
int32_t lastRepeatCount = remainingWForLastRepeat / NUM_ELEMENTS_PER_REPEAT;
int32_t pairReduceRepeat16 =
(lastRepeatCount * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT;
int32_t pairReduceRepeat32 = (pairReduceRepeat16 + 1) / 2;
int32_t lastComputeOutputElement = outputIdx * numOutputElementsPerInputTile_;
Compute(lastComputeOutputElement, lastRepeatCount, pairReduceRepeat16, pairReduceRepeat32);
}
ScaleOutput(remainingY);
CopyOut(numStreamOut, remainingY);
}
__aicore__ inline void CopyInX(const int64_t idx)
{
AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
if constexpr (std::is_same_v<X_T, float>) {
DataCopy(xLocal, xGm_[maxLoRARank_ * idx], maxLoRARank_);
} else {
uint16_t blockLen = static_cast<uint16_t>(maxLoRARank_ * sizeof(X_T));
DataCopyPad(xLocal, xGm_[maxLoRARank_ * idx], {1, blockLen, 0, 0}, {});
}
inQueueX_.EnQue(xLocal);
xLocal = inQueueX_.DeQue<X_T>();
AscendC::LocalTensor<float> xDup = dupBufferX_.Get<float>();
// As we are generating multiple output elements with one API invocation,
// we need to duplicate the X vector multiple times to fill one NUM_BYTES_PER_REPEAT
if constexpr (std::is_same_v<X_T, float>) {
for (int32_t i = 0; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
for (int32_t j = 0; j < maxLoRARank_; j++) {
float entry = xLocal.GetValue(j);
xDup.SetValue(i + j, entry);
}
}
} else {
Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_);
pipe_barrier(PIPE_V);
for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
for (int32_t j = 0; j < maxLoRARank_; j++) {
float entry = xDup.GetValue(j);
xDup.SetValue(i + j, entry);
}
}
}
inQueueX_.FreeTensor(xLocal);
}
__aicore__ inline void CopyInY(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
{
AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.AllocTensor<Y_T>();
DataCopy(yInLocal, yInGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], numElements);
inQueueY_.EnQue(yInLocal);
}
__aicore__ inline void CopyInW(int32_t progress, int32_t numElements = W_IN_TILE_NUM_ELEMENTS)
{
AscendC::LocalTensor<W_T> wLocal = inQueueW_.AllocTensor<W_T>();
DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + progress * W_IN_TILE_NUM_ELEMENTS], numElements);
inQueueW_.EnQue(wLocal);
}
__aicore__ inline void ScaleOutput(int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
{
AscendC::LocalTensor<float> yLocal = tmpBufferY_.Get<float>();
AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.DeQue<Y_T>();
AscendC::LocalTensor<float> yInLocalFP32 = inBufferY_.Get<float>();
Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements);
pipe_barrier(PIPE_V);
inQueueY_.FreeTensor(yInLocal);
Add(yLocal, yLocal, yInLocalFP32, numElements);
pipe_barrier(PIPE_V);
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements);
pipe_barrier(PIPE_V);
outQueueY_.EnQue<Y_T>(yOutLocal);
}
__aicore__ inline void Compute(int32_t progress,
int32_t blockReduceRepeatCount=BLOCK_REDUCE_NUM_REPEATS,
int32_t pairReduceRepeat16=PAIR_REDUCE_NUM_REPEATS_16,
int32_t pairReduceRepeat32=PAIR_REDUCE_NUM_REPEATS_32)
{
AscendC::LocalTensor<float> yLocal = tmpBufferY_.Get<float>();
AscendC::LocalTensor<float> xDup = dupBufferX_.Get<float>();
AscendC::LocalTensor<W_T> wLocal = inQueueW_.DeQue<W_T>();
AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_);
pipe_barrier(PIPE_V);
inQueueW_.FreeTensor(wLocal);
Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_);
pipe_barrier(PIPE_V);
if (maxLoRARank_ == LORA_RANK_8) {
BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
} else if (maxLoRARank_ == LORA_RANK_16) {
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
} else if (maxLoRARank_ == LORA_RANK_32) {
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
} else if (maxLoRARank_ == LORA_RANK_64) {
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
}
}
__aicore__ inline void CopyOut(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
{
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.DeQue<Y_T>();
DataCopy(yOutGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], yOutLocal, numElements);
outQueueY_.FreeTensor(yOutLocal);
}
private:
AscendC::TPipe* pipe_;
AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueY_, inQueueW_;
AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueX_;
AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> outQueueY_;
AscendC::TBuf<AscendC::QuePosition::VECCALC> tmpBufferW_, dupBufferX_, inBufferY_, tmpBufferY_;
AscendC::GlobalTensor<X_T> xGm_;
AscendC::GlobalTensor<W_T> wGm_;
AscendC::GlobalTensor<Y_T> yInGm_;
AscendC::GlobalTensor<Y_T> yOutGm_;
AscendC::GlobalTensor<int64_t> indicesGm_;
uint32_t batchSize_;
uint32_t numTokensPerCore_;
uint32_t maxLoRARank_;
uint32_t outputHiddenDim_;
uint32_t sliceOffset_;
uint32_t outputFullDim_;
uint32_t singleLoRAWeightLen_;
int64_t reqLoRAIndex_;
uint64_t reqLoRAWeightOffset_;
uint32_t numOutputElementsPerInputTile_;
uint32_t numStreamInPerOutputTile_;
uint64_t yOffset_;
// The block stride is set to 1, so the 8 blocks within one repeat are processed contiguously.
// The repeat stride is 8, so the vector unit reads 8 consecutive blocks in the first repeat
// and the next 8 consecutive blocks in the second repeat.
AscendC::UnaryRepeatParams castParams_ = {1, 1, 8, 4};
// For each repeat of BlockReduceSum and PairReduceSum we should move forward only one block,
// so we set dstRepStride = 1.
AscendC::UnaryRepeatParams reduceSumParams_ = {1, 1, 1, 8};
// When the repeat stride is 0, the vector unit repeatedly reads and computes the first 8 consecutive blocks.
// xDup is reused by every repeat, so we set src0RepStride = 0.
AscendC::BinaryRepeatParams dotProductParams_ = {1, 1, 1, 8, 0, 8};
};
#define BGMV_EXPAND_TYPE_DECLARE(TYPE) \
extern "C" __global__ __aicore__ void bgmv_expand_##TYPE(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,\
uint32_t indicesSize, __gm__ void* yIn, __gm__ void* yOut,\
uint32_t batchSize, uint32_t numTokensPerCore, \
uint32_t maxLoRARank, uint32_t outputHiddenDim, \
uint32_t sliceOffset, uint32_t outputFullDim) \
{ \
AscendC::TPipe pipe; \
BGMVExpand<TYPE> op(&pipe); \
op.Init(x, weight, indices, indicesSize, yIn, yOut, batchSize, numTokensPerCore, maxLoRARank, \
outputHiddenDim, sliceOffset, outputFullDim); \
op.Process(); \
}
// declare kernels for all supported dtypes
BGMV_EXPAND_TYPE_DECLARE(half)
#if (__CCE_AICORE__ >= 220)
BGMV_EXPAND_TYPE_DECLARE(bfloat16_t)
#endif
namespace vllm_ascend {
extern void bgmv_expand_impl(AscendType type, void* stream, void* x, void* weight, void* indices, uint32_t indicesSize,
void* yIn, void* yOut, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank,
uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim)
{
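// One vector core handles numTokensPerCore tokens, so launch ceil(batchSize / numTokensPerCore) blocks.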
uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore;
if (type == AscendType::FP16) {
bgmv_expand_half<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, yIn, yOut, batchSize, numTokensPerCore,
maxLoRARank, outputHiddenDim, sliceOffset, outputFullDim);
} else if (type == AscendType::BF16) {
#if (__CCE_AICORE__ >= 220)
bgmv_expand_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, yIn, yOut, batchSize,
numTokensPerCore, maxLoRARank, outputHiddenDim,
sliceOffset, outputFullDim);
#endif
} else {
return;
}
}
} // namespace vllm_ascend
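As a sanity check on the tiling arithmetic above, here is a small host-side sketch (plain C++, no AscendC) of what one weight tile contributes in `BGMVExpand::Compute`: `x` is duplicated to fill each 64-element repeat, multiplied elementwise against the tile, and the BlockReduceSum/PairReduceSum cascade collapses every group of `rank` products into one output element. It emulates only the arithmetic, not the hardware stride bookkeeping, and the tile size is shrunk for readability:

```cpp
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Sum every group of `group` consecutive elements into one value
// (stand-in for BlockReduceSum when group == 8 and PairReduceSum when group == 2).
static std::vector<float> reduce_groups(const std::vector<float>& v, std::size_t group) {
    std::vector<float> out;
    for (std::size_t i = 0; i < v.size(); i += group) {
        float s = 0.0f;
        for (std::size_t j = 0; j < group && i + j < v.size(); ++j) s += v[i + j];
        out.push_back(s);
    }
    return out;
}

int main() {
    const std::size_t kRepeatElems = 64;               // NUM_ELEMENTS_PER_REPEAT
    const std::size_t kRepeats = 4;                    // small stand-in for BLOCK_REDUCE_NUM_REPEATS
    const std::size_t tileElems = kRepeats * kRepeatElems;
    for (std::size_t rank : {8u, 16u, 32u, 64u}) {     // SUPPORTED_RANKS
        std::vector<float> x(rank), w(tileElems);
        for (std::size_t i = 0; i < rank; ++i) x[i] = 0.5f + 0.25f * i;
        for (std::size_t i = 0; i < tileElems; ++i) w[i] = static_cast<float>(i % 7) - 3.0f;

        // xDup: x repeated to fill each 64-element repeat, then the elementwise products.
        std::vector<float> prod(tileElems);
        for (std::size_t i = 0; i < tileElems; ++i) prod[i] = x[i % rank] * w[i];

        // Reduction cascade, mirroring the rank-specific branches in Compute().
        std::vector<float> r = reduce_groups(prod, 8);              // BlockReduceSum
        if (rank == 16) r = reduce_groups(r, 2);                    // one PairReduceSum
        if (rank == 32) r = reduce_groups(reduce_groups(r, 2), 2);  // two PairReduceSums
        if (rank == 64) r = reduce_groups(r, 8);                    // second BlockReduceSum

        // Reference: each output element is a rank-sized dot product.
        const std::size_t numOut = tileElems / rank;                // numOutputElementsPerInputTile_
        assert(r.size() == numOut);
        for (std::size_t o = 0; o < numOut; ++o) {
            float ref = 0.0f;
            for (std::size_t j = 0; j < rank; ++j) ref += x[j] * w[o * rank + j];
            assert(std::fabs(r[o] - ref) < 1e-2f);
        }
    }
    return 0;
}
```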

View File

@@ -0,0 +1,252 @@
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel_operator.h"
#include "types.h"
template <typename scalar_t>
class BGMVShrink {
public:
using X_T = scalar_t;
using W_T = scalar_t;
using Y_T = float;
static constexpr uint64_t BUFFER_NUM = 1;
static constexpr uint64_t TILE_LENGTH = 11776; // optimal performance tile length
public:
__aicore__ inline BGMVShrink(AscendC::TPipe *pipe) : pipe_(pipe) {}
__aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *indices, uint32_t indicesSize, __gm__ void *y,
uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
uint32_t maxLoRARank, float scale)
{
batchSize_ = batchSize;
numTokensPerCore_ = numTokensPerCore;
inputHiddenDim_ = inputHiddenDim;
maxLoRARank_ = maxLoRARank;
scale_ = scale;
singleLoRAWeightLen_ = inputHiddenDim_ * maxLoRARank_;
incremental_ = inputHiddenDim_ > TILE_LENGTH;
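// Non-incremental mode (hidden dim fits in one tile): X is copied in and upcast once per token.
// Incremental mode: X tiles are re-streamed for every LoRA rank row (see ProcessImpl<INCREMENTAL_MODE>).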
xGm_.SetGlobalBuffer((__gm__ X_T *)x);
yOutGm_.SetGlobalBuffer((__gm__ Y_T *)y);
wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
indicesGm_.SetGlobalBuffer((__gm__ int64_t *)indices, indicesSize);
pipe_->InitBuffer(inQueueX_, BUFFER_NUM, TILE_LENGTH * sizeof(X_T));
pipe_->InitBuffer(inQueueW_, BUFFER_NUM, TILE_LENGTH * sizeof(W_T));
pipe_->InitBuffer(tmpBufferX_, TILE_LENGTH * sizeof(float));
pipe_->InitBuffer(tmpBufferW_, TILE_LENGTH * sizeof(float));
pipe_->InitBuffer(outQueueY_, 1, maxLoRARank_ * sizeof(Y_T));
pipe_->InitBuffer(outBufferY_, maxLoRARank_ * sizeof(float));
}
__aicore__ inline void Process()
{
int64_t blockIdx = AscendC::GetBlockIdx();
int64_t startIdx = blockIdx * numTokensPerCore_;
int64_t endIdx = startIdx + numTokensPerCore_;
if (endIdx > batchSize_) {
endIdx = batchSize_;
}
for (int64_t idx = startIdx; idx < endIdx; idx++) {
// set up LoRA index
CopyInIndex(idx);
if (reqLoRAIndex_ < 0) {
continue;
}
reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;
if (incremental_) {
ProcessImpl<true>(idx);
} else {
ProcessImpl<false>(idx);
}
ScaleOutput();
CopyOut(idx);
}
}
private:
template <bool INCREMENTAL_MODE>
__aicore__ inline void ProcessImpl(const int64_t idx)
{
AscendC::LocalTensor<float> yOutLocal = outBufferY_.Get<float>();
if constexpr (!INCREMENTAL_MODE) {
CopyInX(idx, 0, inputHiddenDim_);
AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_);
pipe_barrier(PIPE_V);
inQueueX_.FreeTensor(xLocal);
}
for (int i = 0; i < maxLoRARank_; i++) {
float acc(0);
for (int32_t j = 0; j < inputHiddenDim_ / TILE_LENGTH; j++) {
if constexpr (INCREMENTAL_MODE) {
CopyInX(idx, j);
}
CopyInW(i, j);
Compute<INCREMENTAL_MODE>(acc);
}
CopyAndComputeLastIteration<INCREMENTAL_MODE>(idx, i, acc);
yOutLocal.SetValue(i, acc);
}
}
__aicore__ inline void CopyInIndex(const int64_t idx)
{
// look up the LoRA index
reqLoRAIndex_ = indicesGm_.GetValue(idx);
}
__aicore__ inline void CopyInX(const int64_t idx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
{
AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
DataCopy(xLocal, xGm_[inputHiddenDim_ * idx + colIdx * TILE_LENGTH], numElements);
inQueueX_.EnQue(xLocal);
}
__aicore__ inline void CopyInW(int32_t rowIdx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
{
AscendC::LocalTensor<W_T> wLocal = inQueueW_.AllocTensor<W_T>();
DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + rowIdx * inputHiddenDim_ + colIdx * TILE_LENGTH], numElements);
inQueueW_.EnQue(wLocal);
}
template <bool INCREMENTAL_MODE>
__aicore__ inline void Compute(float &acc, int32_t numElements = TILE_LENGTH)
{
AscendC::LocalTensor<W_T> wLocal = inQueueW_.DeQue<W_T>();
AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
if constexpr (INCREMENTAL_MODE) {
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements);
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
pipe_barrier(PIPE_V);
inQueueX_.FreeTensor(xLocal);
inQueueW_.FreeTensor(wLocal);
} else {
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
pipe_barrier(PIPE_V);
inQueueW_.FreeTensor(wLocal);
}
// elementwise product of one tile of X and W
Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements);
pipe_barrier(PIPE_V);
// ReduceSum generates a single number: the sum of all products in this tile
ReduceSum<float>(wTmpTensor, wTmpTensor, wTmpTensor, numElements);
pipe_barrier(PIPE_V);
acc += wTmpTensor.GetValue(0);
}
template <bool INCREMENTAL_MODE>
__aicore__ inline void CopyAndComputeLastIteration(const int64_t idx, int32_t rowIdx, float &acc)
{
int32_t colIdx = inputHiddenDim_ / TILE_LENGTH;
int32_t remaining = inputHiddenDim_ % TILE_LENGTH;
if (remaining == 0) {
return;
}
if constexpr (INCREMENTAL_MODE) {
CopyInX(idx, colIdx, remaining);
}
CopyInW(rowIdx, colIdx, remaining);
Compute<INCREMENTAL_MODE>(acc, remaining);
}
__aicore__ inline void ScaleOutput()
{
AscendC::LocalTensor<float> yLocal = outBufferY_.Get<float>();
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
Muls(yOutLocal, yLocal, scale_, maxLoRARank_);
pipe_barrier(PIPE_V);
outQueueY_.EnQue<Y_T>(yOutLocal);
}
__aicore__ inline void CopyOut(const int64_t idx)
{
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.DeQue<Y_T>();
DataCopy(yOutGm_[maxLoRARank_ * idx], yOutLocal, maxLoRARank_);
outQueueY_.FreeTensor(yOutLocal);
}
private:
AscendC::TPipe *pipe_;
AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueX_, inQueueW_;
AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueY_;
AscendC::TBuf<AscendC::QuePosition::VECCALC> tmpBufferX_, tmpBufferW_, outBufferY_;
AscendC::GlobalTensor<X_T> xGm_;
AscendC::GlobalTensor<W_T> wGm_;
AscendC::GlobalTensor<int64_t> indicesGm_;
AscendC::GlobalTensor<Y_T> yOutGm_;
uint32_t batchSize_;
uint32_t numTokensPerCore_;
uint32_t inputHiddenDim_;
uint32_t maxLoRARank_;
float scale_;
uint32_t singleLoRAWeightLen_;
int64_t reqLoRAIndex_;
uint64_t reqLoRAWeightOffset_;
bool incremental_;
};
#define BGMV_SHRINK_TYPE_DECLARE(TYPE) \
extern "C" __global__ __aicore__ void bgmv_shrink_##TYPE(__gm__ void* x, __gm__ void* weight, __gm__ void* indices,\
uint32_t indicesSize, __gm__ void* y, uint32_t batchSize, \
uint32_t numTokensPerCore, uint32_t inputHiddenDim, \
uint32_t maxLoRARank, float scale) \
{ \
AscendC::TPipe pipe; \
BGMVShrink<TYPE> op(&pipe); \
op.Init(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale); \
op.Process(); \
}
// declare kernels for all supported dtypes
BGMV_SHRINK_TYPE_DECLARE(half)
#if (__CCE_AICORE__ >= 220)
BGMV_SHRINK_TYPE_DECLARE(bfloat16_t)
#endif
namespace vllm_ascend {
extern void bgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight, void* indices, uint32_t indicesSize,
void* y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
uint32_t maxLoRARank, float scale)
{
uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore;
if (type == AscendType::FP16) {
bgmv_shrink_half<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore,
inputHiddenDim, maxLoRARank, scale);
} else if (type == AscendType::BF16) {
#if (__CCE_AICORE__ >= 220)
bgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, indices, indicesSize, y, batchSize, numTokensPerCore,
inputHiddenDim, maxLoRARank, scale);
#endif
} else {
return;
}
}
} // namespace vllm_ascend
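For reference, the per-token math of the shrink kernel above reduces to `y = scale * (A · x)`, with `A` the `[maxLoRARank x inputHiddenDim]` LoRA-A slice selected by the token's LoRA index. A scalar sketch in plain C++ (the tiling mirrors `Compute()` plus `CopyAndComputeLastIteration()`; function and parameter names are illustrative, not from the kernel):

```cpp
#include <cstddef>
#include <vector>

// Scalar reference for one token: y[r] = scale * dot(x, A[r, :]), computed tile by tile.
std::vector<float> bgmv_shrink_reference(const std::vector<float>& x,      // [inputHiddenDim]
                                         const std::vector<float>& loraA,  // [rank * inputHiddenDim], row-major
                                         std::size_t rank, float scale,
                                         std::size_t tile = 11776 /* TILE_LENGTH */) {
    const std::size_t hidden = x.size();
    std::vector<float> y(rank, 0.0f);
    for (std::size_t r = 0; r < rank; ++r) {
        float acc = 0.0f;                      // same role as `acc` in ProcessImpl()
        for (std::size_t col = 0; col < hidden; col += tile) {
            const std::size_t n = (hidden - col < tile) ? (hidden - col) : tile;  // full tile or tail
            for (std::size_t j = 0; j < n; ++j) {
                acc += x[col + j] * loraA[r * hidden + col + j];
            }
        }
        y[r] = scale * acc;                    // ScaleOutput(): Muls by `scale`
    }
    return y;
}
```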

View File

@@ -0,0 +1,378 @@
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
*/
#include "kernel_operator.h"
#include "kernel_tensor_impl.h"
#include "kernel_type.h"
#include "types.h"
#include "utils.h"
using vllm_ascend::AccType;
template<typename scalar_t>
class GetMaskedInputAndMask {
public:
__aicore__ inline GetMaskedInputAndMask() {}
__aicore__ inline ~GetMaskedInputAndMask() {
pipe.Reset();
}
__aicore__ inline void Init(
__gm__ scalar_t* input,
__gm__ scalar_t* masked_input,
__gm__ bool* mask_out,
const int64_t org_vocab_start_index,
const int64_t org_vocab_end_index,
const int64_t num_org_vocab_padding,
const int64_t added_vocab_start_index,
const int64_t added_vocab_end_index,
const int64_t size)
{
// Initialize basic parameters
input_ = input;
masked_input_ = masked_input;
mask_out_ = mask_out;
org_vocab_start_index_ = org_vocab_start_index;
org_vocab_end_index_ = org_vocab_end_index;
size_ = ((size + 31) / 32) * 32;
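// Round the element count up to a multiple of 32 so the DataCopy calls below move whole 32-byte blocks.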
added_offset_ = added_vocab_start_index -
(org_vocab_end_index - org_vocab_start_index) -
num_org_vocab_padding;
added_vocab_start_index_ = added_vocab_start_index;
added_vocab_end_index_ = added_vocab_end_index;
// Initialize global tensors
inputGlobal.SetGlobalBuffer(input);
maskedOutputGlobal.SetGlobalBuffer(masked_input);
maskOutGlobal.SetGlobalBuffer(mask_out);
// Initialize queues
pipe.InitBuffer(inQueue, 1, size_ * sizeof(scalar_t));
pipe.InitBuffer(outQueue, 1, size_ * sizeof(scalar_t));
pipe.InitBuffer(maskQueue, 1, size_ * sizeof(bool));
// Initialize calculation buffers
// NOTE: calc_buf_1 and calc_buf_2 are also used for int16 casting on older archs.
pipe.InitBuffer(calc_buf_1, size_ * sizeof(float));
pipe.InitBuffer(calc_buf_2, size_ * sizeof(float));
// Initialize result queues
pipe.InitBuffer(result_ge_que, BUFFER_NUM, size_ * sizeof(float));
pipe.InitBuffer(result_le_que, BUFFER_NUM, size_ * sizeof(float));
pipe.InitBuffer(result_org_mask_que, BUFFER_NUM, size_ * sizeof(float));
pipe.InitBuffer(result_add_mask_que, BUFFER_NUM, size_ * sizeof(float));
// Initialize temporary buffers
pipe.InitBuffer(start_buf, size_ * sizeof(float));
pipe.InitBuffer(end_buf, size_ * sizeof(float));
pipe.InitBuffer(inputFloat_buf, size_ * sizeof(float)); // Also used for half intermediate in casting
pipe.InitBuffer(validOffset_buf, size_ * sizeof(float));
pipe.InitBuffer(vocabMask_buf_, size_ * sizeof(int8_t));
pipe.InitBuffer(ones_buf_, size_ * sizeof(float));
}
__aicore__ inline void Process()
{
CopyIn();
Compute();
CopyOut();
}
private:
__aicore__ inline void CopyIn()
{
AscendC::LocalTensor<scalar_t> inputLocal = inQueue.AllocTensor<scalar_t>();
AscendC::DataCopy(inputLocal, inputGlobal, size_);
inQueue.EnQue(inputLocal);
}
__aicore__ inline void CompareWithValue(
AscendC::LocalTensor<int8_t>& result,
const AscendC::LocalTensor<float>& input,
const AscendC::LocalTensor<float>& compare_value,
bool is_greater_equal) {
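// Branch-free 0/1 compare: the non-negative difference is clamped to a tiny epsilon,
// scaled back up to ~1.0 by the MAX_MUL_* constants, and |value - 1| then maps
// {~1, 0} -> {0, 1}; the float result is cast to int8 through a half intermediate.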
AscendC::LocalTensor<float> compute_buf = calc_buf_1.Get<float>();
if (is_greater_equal) {
AscendC::Max(compute_buf, input, compare_value, size_);
AscendC::Sub(compute_buf, compare_value, compute_buf, size_);
} else {
AscendC::Max(compute_buf, input, compare_value, size_);
AscendC::Sub(compute_buf, compute_buf, compare_value, size_);
}
AscendC::Abs(compute_buf, compute_buf, size_);
AscendC::Mins(compute_buf, compute_buf, MIN_ACCURACY_FP32, size_);
AscendC::Muls(compute_buf, compute_buf, MAX_MUL_1_FP32, size_);
AscendC::Muls(compute_buf, compute_buf, MAX_MUL_1_FP32, size_);
AscendC::Muls(compute_buf, compute_buf, MAX_MUL_2_FP32, size_);
AscendC::Adds(compute_buf, compute_buf, NEGATIVE_ONE_FP32, size_);
AscendC::Abs(compute_buf, compute_buf, size_);
AscendC::LocalTensor<half> compute_buf_fp16 = calc_buf_2.Get<half>();
AscendC::Cast(compute_buf_fp16, compute_buf, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Cast(result, compute_buf_fp16, AscendC::RoundMode::CAST_NONE, size_);
}
__aicore__ inline void ComputeRangeMask(
AscendC::LocalTensor<int8_t>& range_mask,
const AscendC::LocalTensor<float>& input,
const float start_value,
const float end_value) {
AscendC::LocalTensor<float> start_value_tensor = start_buf.Get<float>();
AscendC::LocalTensor<float> end_value_tensor = end_buf.Get<float>();
AscendC::Duplicate(start_value_tensor, start_value, size_);
AscendC::Duplicate(end_value_tensor, end_value, size_);
AscendC::LocalTensor<int8_t> ge_result = result_ge_que.AllocTensor<int8_t>();
AscendC::LocalTensor<int8_t> lt_result = result_le_que.AllocTensor<int8_t>();
CompareWithValue(ge_result, start_value_tensor, input, true);
CompareWithValue(lt_result, input, end_value_tensor, false);
#if (__CCE_AICORE__ >= 220)
AscendC::And(range_mask, ge_result, lt_result, size_);
#else
{
// WORKAROUND for older arch
// No direct int8->int16 cast. Use half as intermediate.
// No direct int8 And. Use int16 And.
AscendC::LocalTensor<int16_t> ge_result_i16 = calc_buf_1.Get<int16_t>();
AscendC::LocalTensor<int16_t> lt_result_i16 = calc_buf_2.Get<int16_t>();
AscendC::LocalTensor<int16_t> range_mask_i16 = ge_result_i16;
// Use a temporary buffer for half type
AscendC::LocalTensor<half> tmp_half = inputFloat_buf.Get<half>();
// 1. Cast inputs: int8_t -> half -> int16_t
AscendC::Cast(tmp_half, ge_result, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Cast(ge_result_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Cast(tmp_half, lt_result, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Cast(lt_result_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
// 2. Perform And on int16_t tensors
AscendC::And(range_mask_i16, ge_result_i16, lt_result_i16, size_);
// 3. Cast result back: int16_t -> half -> int8_t
AscendC::Cast(tmp_half, range_mask_i16, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Cast(range_mask, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
}
#endif
}
__aicore__ inline void Compute() {
AscendC::LocalTensor<scalar_t> inputLocal = inQueue.DeQue<scalar_t>();
AscendC::LocalTensor<scalar_t> maskedLocal = outQueue.AllocTensor<scalar_t>();
AscendC::LocalTensor<int8_t> maskLocal = maskQueue.AllocTensor<int8_t>();
AscendC::LocalTensor<float> inputFloat = inputFloat_buf.Get<float>();
AscendC::Cast(inputFloat, inputLocal, AscendC::RoundMode::CAST_NONE, size_);
AscendC::LocalTensor<int8_t> orgVocabMask = result_org_mask_que.AllocTensor<int8_t>();
ComputeRangeMask(orgVocabMask,
inputFloat,
static_cast<float>(org_vocab_start_index_),
static_cast<float>(org_vocab_end_index_));
AscendC::LocalTensor<int8_t> addedVocabMask = result_add_mask_que.AllocTensor<int8_t>();
ComputeRangeMask(addedVocabMask,
inputFloat,
static_cast<float>(added_vocab_start_index_),
static_cast<float>(added_vocab_end_index_));
AscendC::LocalTensor<float> validOffset = validOffset_buf.Get<float>();
AscendC::LocalTensor<float> constOrgStartIndex = start_buf.Get<float>();
AscendC::Duplicate(constOrgStartIndex, float(org_vocab_start_index_), size_);
AscendC::LocalTensor<half> orgVocabMask_fp16;
AscendC::LocalTensor<float> orgVocabMask_fp32;
AscendC::Cast(orgVocabMask_fp16, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Cast(orgVocabMask_fp32, orgVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Mul(validOffset, constOrgStartIndex, orgVocabMask_fp32, size_);
AscendC::LocalTensor<float> addedOffset;
AscendC::LocalTensor<float> addedOffsetTensor = end_buf.Get<float>();
AscendC::Duplicate(addedOffsetTensor, float(added_offset_), size_);
AscendC::LocalTensor<half> addedVocabMask_fp16;
AscendC::LocalTensor<float> addedVocabMask_fp32;
AscendC::Cast(addedVocabMask_fp16, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Cast(addedVocabMask_fp32, addedVocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Mul(addedOffset, addedOffsetTensor, addedVocabMask_fp32, size_);
AscendC::Add(validOffset, validOffset, addedOffset, size_);
AscendC::LocalTensor<int8_t> vocabMask = vocabMask_buf_.Get<int8_t>();
#if (__CCE_AICORE__ >= 220)
AscendC::Or(vocabMask,
orgVocabMask,
addedVocabMask,
size_);
#else
{
// WORKAROUND for older arch
// No direct int8->int16 cast. Use half as intermediate.
// No direct int8 Or. Use int16 Or.
AscendC::LocalTensor<int16_t> orgVocabMask_i16 = calc_buf_1.Get<int16_t>();
AscendC::LocalTensor<int16_t> addedVocabMask_i16 = calc_buf_2.Get<int16_t>();
AscendC::LocalTensor<int16_t> vocabMask_i16 = orgVocabMask_i16;
// Use a temporary buffer for half type. inputFloat_buf is free now.
AscendC::LocalTensor<half> tmp_half = inputFloat_buf.Get<half>();
// 1. Cast inputs: int8_t -> half -> int16_t
AscendC::Cast(tmp_half, orgVocabMask, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Cast(orgVocabMask_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Cast(tmp_half, addedVocabMask, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Cast(addedVocabMask_i16, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
// 2. Perform Or on int16_t tensors
AscendC::Or(vocabMask_i16, orgVocabMask_i16, addedVocabMask_i16, size_);
// 3. Cast result back: int16_t -> half -> int8_t
AscendC::Cast(tmp_half, vocabMask_i16, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Cast(vocabMask, tmp_half, AscendC::RoundMode::CAST_NONE, size_);
}
#endif
AscendC::Sub(inputFloat, inputFloat, validOffset, size_);
AscendC::LocalTensor<half> vocabMask_fp16;
AscendC::LocalTensor<float> vocabMask_fp32;
AscendC::Cast(vocabMask_fp16, vocabMask, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Cast(vocabMask_fp32, vocabMask_fp16, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Mul(inputFloat, inputFloat, vocabMask_fp32, size_);
AscendC::Cast(maskedLocal, inputFloat, AscendC::RoundMode::CAST_CEIL, size_);
outQueue.EnQue(maskedLocal);
AscendC::LocalTensor<float> ones_tensor = ones_buf_.Get<float>();
AscendC::Duplicate(ones_tensor, (float)1, size_);
AscendC::LocalTensor<float> maskLocal_fp32;
AscendC::Sub(maskLocal_fp32, ones_tensor, vocabMask_fp32, size_);
AscendC::LocalTensor<half> maskLocal_fp16;
AscendC::Cast(maskLocal_fp16, maskLocal_fp32, AscendC::RoundMode::CAST_NONE, size_);
AscendC::Cast(maskLocal, maskLocal_fp16, AscendC::RoundMode::CAST_NONE, size_);
maskQueue.EnQue(maskLocal);
inQueue.FreeTensor(inputLocal);
}
__aicore__ inline void CopyOut()
{
AscendC::LocalTensor<scalar_t> maskedLocal = outQueue.DeQue<scalar_t>();
AscendC::LocalTensor<bool> maskLocal = maskQueue.DeQue<bool>();
AscendC::DataCopy(maskedOutputGlobal, maskedLocal, size_);
AscendC::DataCopy(maskOutGlobal, maskLocal, size_);
outQueue.FreeTensor(maskedLocal);
maskQueue.FreeTensor(maskLocal);
}
private:
static constexpr int32_t BUFFER_NUM = 2;
AscendC::TPipe pipe;
AscendC::TQue<AscendC::TPosition::VECIN, 1> inQueue;
AscendC::TQue<AscendC::TPosition::VECOUT, 1> outQueue, maskQueue;
AscendC::GlobalTensor<scalar_t> inputGlobal, maskedOutputGlobal;
AscendC::GlobalTensor<bool> maskOutGlobal;
AscendC::TBuf<AscendC::TPosition::VECCALC> calc_buf_1;
AscendC::TBuf<AscendC::TPosition::VECCALC> calc_buf_2;
AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> result_ge_que;
AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> result_le_que;
AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> result_org_mask_que;
AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> result_add_mask_que;
// Temporary buffers
AscendC::TBuf<AscendC::TPosition::VECCALC> start_buf;
AscendC::TBuf<AscendC::TPosition::VECCALC> end_buf;
AscendC::TBuf<AscendC::TPosition::VECCALC> inputFloat_buf;
AscendC::TBuf<AscendC::TPosition::VECCALC> validOffset_buf;
AscendC::TBuf<AscendC::TPosition::VECCALC> vocabMask_buf_;
AscendC::TBuf<AscendC::TPosition::VECCALC> ones_buf_;
__gm__ scalar_t *input_, *masked_input_;
__gm__ bool *mask_out_;
int64_t size_;
int64_t org_vocab_start_index_, org_vocab_end_index_;
int64_t added_vocab_start_index_, added_vocab_end_index_;
int64_t added_offset_;
static constexpr float MIN_ACCURACY_FP32 = 1.1754943508222875e-38;
static constexpr float MAX_MUL_1_FP32 = 1125899906842624;
static constexpr float MAX_MUL_2_FP32 = 67108864;
static constexpr float NEGATIVE_ONE_FP32 = -1.0f;
};
extern "C" __global__ __aicore__ void get_masked_input_and_mask_kernel(
__gm__ int32_t* input,
__gm__ int32_t* masked_input,
__gm__ bool* mask_out,
const int64_t org_vocab_start_index,
const int64_t org_vocab_end_index,
const int64_t num_org_vocab_padding,
const int64_t added_vocab_start_index,
const int64_t added_vocab_end_index,
const int64_t size,
const uint32_t loop_cnt,
const uint32_t aiv_num)
{
{
GetMaskedInputAndMask<int32_t> op{};
for (int64_t i = AscendC::GetBlockIdx(); i < loop_cnt; i += aiv_num) {
op.Init(input + i * size/loop_cnt,
masked_input + i * size/loop_cnt,
mask_out + i * size/loop_cnt,
org_vocab_start_index, org_vocab_end_index,
num_org_vocab_padding, added_vocab_start_index,
added_vocab_end_index, size/loop_cnt);
op.Process();
}
} // op destructor called here
}
namespace vllm_ascend {
void get_masked_input_and_mask_impl(
void* stream,
void* input,
void* masked_input,
void* mask_out,
const int64_t org_vocab_start_index,
const int64_t org_vocab_end_index,
const int64_t num_org_vocab_padding,
const int64_t added_vocab_start_index,
const int64_t added_vocab_end_index,
const int64_t size,
const uint32_t loop_cnt,
const uint32_t aiv_num)
{
get_masked_input_and_mask_kernel<<<aiv_num, nullptr, stream>>>(
static_cast<int32_t*>(input),
static_cast<int32_t*>(masked_input),
static_cast<bool*>(mask_out),
org_vocab_start_index,
org_vocab_end_index,
num_org_vocab_padding,
added_vocab_start_index,
added_vocab_end_index,
size,
loop_cnt,
aiv_num);
}
} // namespace vllm_ascend
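The float-compare tricks above are just a branch-free way to evaluate a simple per-token rule. A scalar sketch of that rule in plain C++, assuming the half-open `[start, end)` vocab ranges of the Python-side implementation this kernel replaces; the function name is illustrative:

```cpp
#include <cstdint>
#include <utility>

// Returns {masked_input, mask_out} for one token id.
std::pair<int32_t, bool> masked_input_and_mask(int32_t token,
                                               int64_t org_start, int64_t org_end,
                                               int64_t num_org_vocab_padding,
                                               int64_t added_start, int64_t added_end) {
    const bool in_org   = token >= org_start   && token < org_end;
    const bool in_added = token >= added_start && token < added_end;
    // Offset that maps an added-vocab id right after the (padded) original vocab.
    const int64_t added_offset =
        added_start - (org_end - org_start) - num_org_vocab_padding;   // added_offset_ in Init()
    const int64_t valid_offset = org_start * in_org + added_offset * in_added;
    const int32_t masked = static_cast<int32_t>((token - valid_offset) * (in_org || in_added));
    const bool mask_out = !(in_org || in_added);   // true => token is outside this shard's vocab
    return {masked, mask_out};
}
```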

View File

@@ -0,0 +1,372 @@
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel_operator.h"
#include <stdio.h>
#include "types.h"
#include "utils.h"
using vllm_ascend::AccType;
using vllm_ascend::local_mem_copy;
template <typename scalar_t, bool isNeox> class RotaryEmbedding {
// NOTE(ganyi): we use 512 B as the load stride for the pipe; we need to find another way to
// retrieve this size from the runtime to support more SoCs
#if (__CCE_AICORE__ >= 220)
static int constexpr loadSize = 512;
#else
static int constexpr loadSize = 1024 * 4;
#endif
using dst_t = scalar_t;
using acc_t = typename AccType<scalar_t>::type;
// only half tensors have a cast instruction to int8, so acc_dst_t is hard-coded as half
using local_scalar_t = AscendC::LocalTensor<scalar_t>;
using local_acc_t = AscendC::LocalTensor<acc_t>;
using local_dst_t = AscendC::LocalTensor<dst_t>;
public:
__aicore__ inline RotaryEmbedding()
{
}
// Allocate buffers for the input and output queues and the temp buffers used during the kernel compute;
// this init runs once per vector core taking part in the kernel.
__aicore__ inline void init(__gm__ int64_t *positions, __gm__ void *queryDst, __gm__ void *keyDst,
__gm__ scalar_t *query, __gm__ scalar_t *key, __gm__ scalar_t *cosSinCache,
const int rotDim, const int64_t dstQueryStride,
const int64_t dstKeyStride, const int64_t queryStride, const int64_t keyStride,
const int numHeads, const int numKvHeads, const int headSize, AscendC::TPipe *pipe)
{
pipe_ = pipe;
rotDim_ = rotDim;
// query stride and key stride are used to handle strided tensors that are not contiguous along the num_tokens dim
queryStride_ = queryStride;
keyStride_ = keyStride;
dstQueryStride_ = dstQueryStride;
dstKeyStride_ = dstKeyStride;
numHeads_ = numHeads;
numKvHeads_ = numKvHeads;
headSize_ = headSize;
embedDim_ = rotDim / 2;
pipe_->InitBuffer(inQue_, 1 /* buffer_num */, loadSize /* buffer_size */);
pipe_->InitBuffer(inQueSinCos_, 1 /* buffer_num */, rotDim_ * sizeof(scalar_t) /* buffer_size */);
pipe_->InitBuffer(outQue_, 1 /* buffer_num */, loadSize /* buffer_size */);
// 2 temporary calculation buffer
calcTmpBufferOffset_ = 0;
// 1 upcast buffer for bf16 (headSize)
upcastInputBufferOffset_ = calcTmpBufferOffset_ + sizeof(acc_t) * embedDim_ * 2;
// 1 upcast temp buffer for bf16 (2 * embed_dim)
upcastTempBufferOffset_ = upcastInputBufferOffset_ + sizeof(acc_t) * headSize_;
// 2 sin cos upcast buffer for bf16
cosSinUpcastBufferOffset_ = upcastTempBufferOffset_ + sizeof(acc_t) * 2 * embedDim_;
// bf16 path: needs 2 cos/sin upcast buffers;
// fp16 path: needs 2 temporary calculation buffers.
tempBufferSize_ = cosSinUpcastBufferOffset_ + 2 * embedDim_ * sizeof(acc_t);
// We need to consider upcasting bf16 to fp32, so we might need 4 buffers just in case:
// 2 temporary buffers, 2 input buffers, 1 cos buffer, 1 sin buffer, 2 scale buffers (headSize),
// 2 zp buffers (headSize, int8), 1 dst_temp buffer (headSize, int32).
pipe_->InitBuffer(calcBuf_, tempBufferSize_ /* buffer_size */);
if constexpr (!std::is_same_v<scalar_t, acc_t>) {
pipe_->InitBuffer(copyBuf_, loadSize);
}
}
__aicore__ inline void update_mem_offset(__gm__ int64_t *positions, __gm__ void *queryDst, __gm__ void *keyDst,
__gm__ scalar_t *query, __gm__ scalar_t *key, __gm__ scalar_t *cosSinCache,
const int rotDim, const int64_t dstQueryStride, const int64_t dstKeyStride,
const int64_t queryStride, const int64_t keyStride, const int numHeads,
const int numKvHeads, const int headSize, const int64_t idx)
{
int64_t pos = positions[idx];
cosSin_.SetGlobalBuffer(cosSinCache + pos * rotDim_, rotDim_);
query_.SetGlobalBuffer(query + queryStride * idx, headSize * numHeads_);
key_.SetGlobalBuffer(key + keyStride * idx, headSize * numKvHeads_);
queryDst_.SetGlobalBuffer(reinterpret_cast<__gm__ dst_t *>(queryDst) + dstQueryStride * idx,
headSize * numHeads_);
keyDst_.SetGlobalBuffer(reinterpret_cast<__gm__ dst_t *>(keyDst) + dstKeyStride * idx, headSize * numKvHeads_);
}
// compute per head for neox on bf16
template <typename acc_t_, typename std::enable_if<!std::is_same_v<acc_t_, scalar_t>, void>::type * = nullptr>
__aicore__ inline void
neox_compute(local_scalar_t src, local_dst_t dst, AscendC::LocalTensor<acc_t_> sin, AscendC::LocalTensor<acc_t_> cos,
AscendC::LocalTensor<acc_t_> upcastInputBuffer, AscendC::LocalTensor<acc_t_> calcTmpBuffer)
{
// slice dst
local_dst_t dstX = dst;
local_dst_t dstY = dst[embedDim_];
// slice src
local_scalar_t srcX = src;
local_scalar_t srcY = src[embedDim_];
// slice temp buffer
local_acc_t calcTmpBufferX = calcTmpBuffer;
local_acc_t calcTmpBufferY = calcTmpBuffer[embedDim_];
// slice upcast input buffer
local_acc_t upcastBufferX = upcastInputBuffer;
local_acc_t upcastBufferY = upcastBufferX[embedDim_];
// dst x calc
Cast(upcastInputBuffer, src, AscendC::RoundMode::CAST_NONE, headSize_);
Mul(calcTmpBufferX, upcastBufferX, cos, embedDim_);
Mul(calcTmpBufferY, upcastBufferY, sin, embedDim_);
Sub(calcTmpBufferX, calcTmpBufferX, calcTmpBufferY, embedDim_);
Cast(dstX, calcTmpBufferX, AscendC::RoundMode::CAST_TRUNC, embedDim_);
// dst y calc
Mul(calcTmpBufferX, upcastBufferX, sin, embedDim_);
Mul(calcTmpBufferY, upcastBufferY, cos, embedDim_);
Add(calcTmpBufferX, calcTmpBufferX, calcTmpBufferY, embedDim_);
Cast(dstY, calcTmpBufferX, AscendC::RoundMode::CAST_TRUNC, embedDim_);
}
// compute per head output for neox
template <typename acc_t_, typename std::enable_if<std::is_same_v<acc_t_, scalar_t>, void>::type * = nullptr>
__aicore__ inline void
neox_compute(local_scalar_t src, local_dst_t dst, AscendC::LocalTensor<acc_t_> sin, AscendC::LocalTensor<acc_t_> cos,
AscendC::LocalTensor<acc_t_> upcastInputBuffer, AscendC::LocalTensor<acc_t_> calcTmpBuffer)
{
// slice dst buffer
local_dst_t dstX = dst;
local_dst_t dstY = dst[embedDim_];
// slice src buffer
local_scalar_t srcX = src;
local_scalar_t srcY = src[embedDim_];
// slice temp buffer
local_acc_t calcTmpBufferX = calcTmpBuffer;
local_acc_t calcTmpBufferY = calcTmpBuffer[embedDim_];
// dst x calc
Mul(calcTmpBufferX, srcX, cos, embedDim_);
Mul(calcTmpBufferY, srcY, sin, embedDim_);
Sub(dstX, calcTmpBufferX, calcTmpBufferY, embedDim_);
// dst y calc
Mul(calcTmpBufferX, srcX, sin, embedDim_);
Mul(calcTmpBufferY, srcY, cos, embedDim_);
Add(dstY, calcTmpBufferX, calcTmpBufferY, embedDim_);
}
__aicore__ inline void compute_qk(AscendC::GlobalTensor<scalar_t> srcG, AscendC::GlobalTensor<dst_t> dstG,
local_acc_t localCos, local_acc_t localSin, local_acc_t upcastInputBuffer,
local_acc_t calcTmpBuffer, int loopCnt, int tailHeads, int loadStride,
int headNumPerLoad)
{
for (int loopNum = 0; loopNum < loopCnt; ++loopNum) {
local_scalar_t src = inQue_.AllocTensor<scalar_t>();
local_dst_t dst = outQue_.AllocTensor<dst_t>();
AscendC::DataCopy(src, srcG[loopNum * loadStride], loadStride);
inQue_.EnQue(src);
local_scalar_t srcDeque = inQue_.DeQue<scalar_t>();
if constexpr (!std::is_same_v<scalar_t, acc_t>) {
int elem_num = loadStride / sizeof(scalar_t);
AscendC::LocalTensor<acc_t> upBuffer = copyBuf_.GetWithOffset<acc_t>(elem_num, 0);
Cast(upBuffer, srcDeque, AscendC::RoundMode::CAST_TRUNC, elem_num);
Cast(dst, upBuffer, AscendC::RoundMode::CAST_TRUNC, elem_num);
} else {
local_mem_copy(dst, srcDeque, loadStride);
}
for (int i = 0; i < headNumPerLoad; ++i) {
neox_compute(srcDeque[i * headSize_], dst[i * headSize_], localSin, localCos, upcastInputBuffer,
calcTmpBuffer);
}
outQue_.EnQue(dst);
local_dst_t dstDeque = outQue_.DeQue<dst_t>();
AscendC::DataCopy(dstG[loopNum * loadStride], dstDeque, loadStride);
outQue_.FreeTensor(dstDeque);
inQue_.FreeTensor(srcDeque);
}
// process tail
{
local_scalar_t src = inQue_.AllocTensor<scalar_t>();
local_dst_t dst = outQue_.AllocTensor<dst_t>();
AscendC::DataCopy(src, srcG[loopCnt * loadStride], tailHeads * headSize_);
inQue_.EnQue(src);
local_scalar_t srcDeque = inQue_.DeQue<scalar_t>();
if constexpr (!std::is_same_v<scalar_t, acc_t>) {
int elem_num = tailHeads * headSize_ / sizeof(scalar_t);
AscendC::LocalTensor<acc_t> upBuffer = copyBuf_.GetWithOffset<acc_t>(elem_num, 0);
Cast(upBuffer, srcDeque, AscendC::RoundMode::CAST_TRUNC, elem_num);
Cast(dst, upBuffer, AscendC::RoundMode::CAST_TRUNC, elem_num);
} else {
local_mem_copy(dst, srcDeque, tailHeads * headSize_);
}
for (int i = 0; i < tailHeads; ++i) {
neox_compute(srcDeque[i * headSize_], dst[i * headSize_], localSin, localCos, upcastInputBuffer,
calcTmpBuffer);
}
outQue_.EnQue(dst);
local_dst_t dstDeque = outQue_.DeQue<dst_t>();
AscendC::DataCopy(dstG[loopCnt * loadStride], dstDeque, tailHeads * headSize_);
outQue_.FreeTensor(dstDeque);
inQue_.FreeTensor(srcDeque);
}
}
__aicore__ inline void compute_function()
{
local_scalar_t cosSinLocal = inQueSinCos_.AllocTensor<scalar_t>();
AscendC::DataCopy(cosSinLocal, cosSin_, embedDim_ * 2);
inQueSinCos_.EnQue(cosSinLocal);
local_scalar_t localSinCosDeque = inQueSinCos_.DeQue<scalar_t>();
local_scalar_t localCos = localSinCosDeque;
local_scalar_t localSin = localSinCosDeque[embedDim_];
local_acc_t calcTmpBuffer;
local_acc_t upcastInputBuffer;
local_acc_t upcastTempBuffer;
local_acc_t cosSinUpcastBuffer;
local_acc_t scaleBuffer;
local_acc_t offsetBuffer;
calcTmpBuffer = calcBuf_.GetWithOffset<acc_t>(embedDim_ * 2, calcTmpBufferOffset_);
upcastInputBuffer = calcBuf_.GetWithOffset<acc_t>(headSize_, upcastInputBufferOffset_);
upcastTempBuffer = calcBuf_.GetWithOffset<acc_t>(embedDim_ * 2, upcastTempBufferOffset_);
cosSinUpcastBuffer = calcBuf_.GetWithOffset<acc_t>(embedDim_ * 2, cosSinUpcastBufferOffset_);
local_acc_t cosAccBuffer;
local_acc_t sinAccBuffer;
if constexpr (!std::is_same_v<scalar_t, acc_t>) {
Cast(cosSinUpcastBuffer, localSinCosDeque, AscendC::RoundMode::CAST_NONE, 2 * embedDim_);
cosAccBuffer = cosSinUpcastBuffer;
sinAccBuffer = cosSinUpcastBuffer[embedDim_];
} else {
cosAccBuffer = localCos;
sinAccBuffer = localSin;
}
constexpr const int loadSizeByElem = loadSize / sizeof(scalar_t);
int64_t headNumPerLoad = loadSizeByElem / headSize_;
int64_t loopCnt = numHeads_ / headNumPerLoad;
int64_t tailHeads = numHeads_ - loopCnt * headNumPerLoad;
int64_t loadStride = headNumPerLoad * headSize_;
int64_t loopCntKv = numKvHeads_ / headNumPerLoad;
int64_t tailHeadsKv = numKvHeads_ - loopCntKv * headNumPerLoad;
compute_qk(query_, queryDst_, cosAccBuffer, sinAccBuffer, upcastInputBuffer,
calcTmpBuffer, loopCnt, tailHeads, loadStride, headNumPerLoad);
compute_qk(key_, keyDst_, cosAccBuffer, sinAccBuffer, upcastInputBuffer, calcTmpBuffer,
loopCntKv, tailHeadsKv, loadStride, headNumPerLoad);
inQueSinCos_.FreeTensor(localSinCosDeque);
}
private:
AscendC::TPipe *pipe_;
AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQue_, inQueSinCos_;
AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQue_;
AscendC::TBuf<AscendC::TPosition::VECCALC> calcBuf_;
AscendC::TBuf<AscendC::TPosition::VECCALC> copyBuf_;
AscendC::GlobalTensor<dst_t> queryDst_;
AscendC::GlobalTensor<dst_t> keyDst_;
AscendC::GlobalTensor<scalar_t> query_;
AscendC::GlobalTensor<scalar_t> key_;
AscendC::GlobalTensor<scalar_t> cosSin_;
int rotDim_;
int embedDim_;
int64_t queryStride_;
int64_t keyStride_;
int64_t dstQueryStride_;
int64_t dstKeyStride_;
int numHeads_;
int numKvHeads_;
int headSize_;
int calcTmpBufferOffset_;
int upcastInputBufferOffset_;
int upcastTempBufferOffset_;
int cosSinUpcastBufferOffset_;
int tempBufferSize_;
};
// Note: we need to use a macro to instantiate all the target functions here, because the current build system does not support template calls in cpp.
// We use C-style symbols here for kernel compilation; C++-style kernel entries may lead to compilation failures.
#define ROPE_CUSTOM_KERNEL_TYPE_DECLARE(TYPE, NEOX) \
extern "C" __global__ __aicore__ void rope_custom_##NEOX##_##TYPE( \
__gm__ int64_t* positions, __gm__ void* queryDst, __gm__ void* keyDst, __gm__ TYPE* query, __gm__ TYPE* key, \
__gm__ TYPE* cosSinCache, const int rotDim, const int64_t queryStride, const int64_t keyStride, \
const int64_t dstQueryStride, const int64_t dstKeyStride, const int numHeads, const int numKvHeads, \
const int headSize, const int64_t numTokens, const int loopNum, const int coreNum) \
{ \
AscendC::TPipe pipe; \
RotaryEmbedding<TYPE, NEOX> op{}; \
op.init(positions, queryDst, keyDst, query, key, cosSinCache, rotDim, dstQueryStride, dstKeyStride, \
queryStride, keyStride, numHeads, numKvHeads, headSize, &pipe); \
for (int64_t i = AscendC::GetBlockIdx(); i < numTokens; i += coreNum) { \
op.update_mem_offset(positions, queryDst, keyDst, query, key, cosSinCache, rotDim, dstQueryStride, dstKeyStride, \
queryStride, keyStride, numHeads, numKvHeads, headSize, i); \
op.compute_function(); \
} \
}
#define ROPE_CUSTOM_KERNEL_DECLARE(TYPE) \
ROPE_CUSTOM_KERNEL_TYPE_DECLARE(TYPE, true); \
ROPE_CUSTOM_KERNEL_TYPE_DECLARE(TYPE, false);
// Declare all the kernel entries here
ROPE_CUSTOM_KERNEL_DECLARE(half)
#if (__CCE_AICORE__ >= 220)
ROPE_CUSTOM_KERNEL_DECLARE(bfloat16_t)
#endif
namespace vllm_ascend {
#define ROTARY_EMBEDDING_KERNEL_CALL(TYPE) \
if (isNeox) \
rope_custom_true_##TYPE<<<blockDim, nullptr, stream>>>( \
positions, queryDst, keyDst, reinterpret_cast<TYPE *>(query), reinterpret_cast<TYPE *>(key), \
reinterpret_cast<TYPE *>(cosSinCache), rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, \
numHeads, numKvHeads, headSize, numTokens, loopCnt, blockDim); \
else \
rope_custom_false_##TYPE<<<blockDim, nullptr, stream>>>( \
positions, queryDst, keyDst, reinterpret_cast<TYPE *>(query), reinterpret_cast<TYPE *>(key), \
reinterpret_cast<TYPE *>(cosSinCache), rotDim, queryStride, keyStride, dstQueryStride, dstKeyStride, \
numHeads, numKvHeads, headSize, numTokens, loopCnt, blockDim);
// Maximum block count the runtime allows when launching an AscendC kernel;
// we use it to cap the block dim.
static const int64_t maxParallelSize = 65535;
extern void rotary_embedding_impl(AscendType type, bool isNeox, void *stream, int64_t *positions, void *queryDst,
void *keyDst, void *query, void *key, void *cosSinCache, const int rotDim,
const int64_t queryStride, const int64_t keyStride, const int64_t dstQueryStride,
const int64_t dstKeyStride, const int numHeads, const int numKvHeads,
const int headSize, const int64_t numTokens, const uint32_t loopCnt,
uint32_t aivNum)
{
int blockDim = maxParallelSize > numTokens ? numTokens : maxParallelSize;
if (type == AscendType::FP16) {
ROTARY_EMBEDDING_KERNEL_CALL(half);
}
#if (__CCE_AICORE__ >= 220)
else if (type == AscendType::BF16) {
ROTARY_EMBEDDING_KERNEL_CALL(bfloat16_t);
}
#endif
else {
return;
}
}
} // namespace vllm_ascend
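Per head, `neox_compute` applies the standard neox-style rotation. Here is a scalar sketch in plain C++ under the cache layout used above, where the row for position `pos` stores `embedDim` cos values followed by `embedDim` sin values (`cosRow = cache + pos * rotDim`, `sinRow = cosRow + embedDim`); elements beyond `rotDim` in each head are copied through unchanged by `compute_qk`. Names are illustrative:

```cpp
#include <cstddef>

// Rotate the first rotDim (= 2 * embedDim) elements of one head:
//   x = src[0 : embedDim], y = src[embedDim : 2 * embedDim]
//   dstX = x * cos - y * sin,  dstY = x * sin + y * cos
void rope_neox_one_head(const float* cosRow, const float* sinRow,  // each embedDim long
                        const float* src, float* dst, std::size_t embedDim) {
    for (std::size_t i = 0; i < embedDim; ++i) {
        const float x = src[i];
        const float y = src[embedDim + i];
        dst[i]            = x * cosRow[i] - y * sinRow[i];
        dst[embedDim + i] = x * sinRow[i] + y * cosRow[i];
    }
}
```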

View File

@@ -0,0 +1,389 @@
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel_operator.h"
#include "types.h"
template <typename scalar_t>
class SGMVExpand {
public:
using X_T = float;
using W_T = scalar_t;
using Y_T = scalar_t;
static constexpr uint64_t LORA_RANK_8 = 8;
static constexpr uint64_t LORA_RANK_16 = 16;
static constexpr uint64_t LORA_RANK_32 = 32;
static constexpr uint64_t LORA_RANK_64 = 64;
static constexpr uint64_t SUPPORTED_RANKS[] = {LORA_RANK_8, LORA_RANK_16, LORA_RANK_32, LORA_RANK_64};
static constexpr int32_t BUFFER_NUM = 2;
// The vector unit reads 8 blocks (32 bytes each and 256 bytes in total) of contiguous data each time.
static constexpr int32_t NUM_BYTES_PER_REPEAT = 256;
static constexpr int32_t NUM_BLOCKS_PER_REPEAT = 8;
// The maximum number of elements in a single iteration is 256 / sizeof(intermediate data type).
static constexpr int32_t NUM_ELEMENTS_PER_REPEAT = NUM_BYTES_PER_REPEAT / sizeof(float);
// Mask is used to control the elements that participate in computation in each iteration.
static constexpr int32_t MASK_COUNT = NUM_BYTES_PER_REPEAT / sizeof(float);
// Refer to numOutputElementsPerInputTile_ initialization for the constraints on the following constants.
static constexpr int32_t W_IN_TILE_NUM_ELEMENTS = 8192;
static constexpr int32_t Y_OUT_TILE_NUM_ELEMENTS = 4096;
static constexpr int32_t BLOCK_REDUCE_NUM_REPEATS = W_IN_TILE_NUM_ELEMENTS / NUM_ELEMENTS_PER_REPEAT;
// BlockReduceSum would generate (BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT) floats,
// so we need to read them all and apply PairReduceSum.
static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_16 =
(BLOCK_REDUCE_NUM_REPEATS * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT;
// The second PairReduceSum for rank=32 needs half of the repetitions used for rank=16;
// the same holds for rank=64. Ranks greater than 64 are not supported.
static constexpr int32_t PAIR_REDUCE_NUM_REPEATS_32 = (PAIR_REDUCE_NUM_REPEATS_16 + 1) / 2;
public:
__aicore__ inline SGMVExpand(AscendC::TPipe* pipe) : pipe_(pipe) {}
__aicore__ inline void Init(__gm__ void* x, __gm__ void* weight, __gm__ void* loraIndices, uint32_t loraIndicesSize,
__gm__ void* seqLen, uint32_t seqLenSize, __gm__ void* yIn, __gm__ void* yOut,
uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank,
uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim)
{
batchSize_ = batchSize;
numTokensPerCore_ = numTokensPerCore;
maxLoRARank_ = maxLoRARank;
outputHiddenDim_ = outputHiddenDim;
sliceOffset_ = sliceOffset;
outputFullDim_ = outputFullDim;
singleLoRAWeightLen_ = maxLoRARank_ * outputHiddenDim_;
xGm_.SetGlobalBuffer((__gm__ X_T *)x);
wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
yInGm_.SetGlobalBuffer((__gm__ Y_T *)yIn);
yOutGm_.SetGlobalBuffer((__gm__ Y_T *)yOut);
loraIndicesGm_.SetGlobalBuffer((__gm__ int64_t *)loraIndices, loraIndicesSize);
seqLenGm_.SetGlobalBuffer((__gm__ int64_t *)seqLen, seqLenSize);
pipe_->InitBuffer(inQueueX_, 1, NUM_ELEMENTS_PER_REPEAT * sizeof(X_T));
pipe_->InitBuffer(inQueueW_, BUFFER_NUM, W_IN_TILE_NUM_ELEMENTS * sizeof(W_T));
pipe_->InitBuffer(inQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T));
pipe_->InitBuffer(outQueueY_, BUFFER_NUM, Y_OUT_TILE_NUM_ELEMENTS * sizeof(Y_T));
pipe_->InitBuffer(dupBufferX_, NUM_ELEMENTS_PER_REPEAT * sizeof(float));
pipe_->InitBuffer(tmpBufferW_, W_IN_TILE_NUM_ELEMENTS * sizeof(float));
pipe_->InitBuffer(inBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float));
pipe_->InitBuffer(tmpBufferY_, Y_OUT_TILE_NUM_ELEMENTS * sizeof(float));
// Each compute iteration would generate not one, but several output elements.
// Therefore, the following variable would determine how many output elements are calculated in each iteration.
numOutputElementsPerInputTile_ = BLOCK_REDUCE_NUM_REPEATS * (NUM_ELEMENTS_PER_REPEAT / maxLoRARank_);
numStreamInPerOutputTile_ = Y_OUT_TILE_NUM_ELEMENTS / numOutputElementsPerInputTile_;
}
__aicore__ inline void Process()
{
int64_t blockIdx = AscendC::GetBlockIdx();
int64_t startIdx = blockIdx * numTokensPerCore_;
int64_t endIdx = startIdx + numTokensPerCore_;
if (endIdx > batchSize_) {
endIdx = batchSize_;
}
for (int64_t idx = startIdx; idx < endIdx; idx++) {
yOffset_ = outputFullDim_ * idx + sliceOffset_;
// Set up LoRA index
CopyInIndex(idx);
if (reqLoRAIndex_ < 0) {
continue;
}
reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;
CopyInX(idx);
int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS;
for (int32_t i = 0; i < numStreamOut; i++) {
CopyInY(i);
for (int32_t j = 0; j < numStreamInPerOutputTile_; j++) {
CopyInW(i * numStreamInPerOutputTile_ + j);
Compute(j * numOutputElementsPerInputTile_);
}
ScaleOutput();
CopyOut(i);
}
ComputeLastIteration();
}
}
private:
__aicore__ inline void CopyInIndex(const int64_t idx)
{
// Look up the LoRA index
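// Tokens are grouped by sequence: walk the per-sequence lengths until the running total
// covers idx; that sequence's entry in loraIndices gives the adapter for this token.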
int64_t weightIdx = idx;
uint64_t i = 0;
for (; i < seqLenGm_.GetSize(); i++) {
int64_t repeatValue = seqLenGm_.GetValue(i);
if (weightIdx >= repeatValue) {
weightIdx -= repeatValue;
continue;
}
break;
}
reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? loraIndicesGm_.GetValue(i) : -1;
}
__aicore__ inline void ComputeLastIteration()
{
int32_t remainingY = outputHiddenDim_ % Y_OUT_TILE_NUM_ELEMENTS;
if (remainingY == 0) {
return;
}
int32_t numStreamOut = outputHiddenDim_ / Y_OUT_TILE_NUM_ELEMENTS;
int32_t remainingW = remainingY * maxLoRARank_;
int32_t numCompleteWTileInForLastIteration = remainingW / W_IN_TILE_NUM_ELEMENTS;
int32_t remainingWForLastRepeat = remainingW % W_IN_TILE_NUM_ELEMENTS;
CopyInY(numStreamOut, remainingY);
int32_t outputIdx = 0;
for (outputIdx = 0; outputIdx < numCompleteWTileInForLastIteration; outputIdx++) {
CopyInW(numStreamOut * numStreamInPerOutputTile_ + outputIdx);
Compute(outputIdx * numOutputElementsPerInputTile_);
}
if (remainingWForLastRepeat != 0) {
CopyInW(numStreamOut * numStreamInPerOutputTile_ + numCompleteWTileInForLastIteration,
remainingWForLastRepeat);
int32_t lastRepeatCount = remainingWForLastRepeat / NUM_ELEMENTS_PER_REPEAT;
int32_t pairReduceRepeat16 =
(lastRepeatCount * NUM_BLOCKS_PER_REPEAT + NUM_ELEMENTS_PER_REPEAT - 1) / NUM_ELEMENTS_PER_REPEAT;
int32_t pairReduceRepeat32 = (pairReduceRepeat16 + 1) / 2;
int32_t lastComputeOutputElement = outputIdx * numOutputElementsPerInputTile_;
Compute(lastComputeOutputElement, lastRepeatCount, pairReduceRepeat16, pairReduceRepeat32);
}
ScaleOutput(remainingY);
CopyOut(numStreamOut, remainingY);
}
__aicore__ inline void CopyInX(const int64_t idx)
{
AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
if constexpr (std::is_same_v<X_T, float>) {
DataCopy(xLocal, xGm_[maxLoRARank_ * idx], maxLoRARank_);
} else {
uint16_t blockLen = static_cast<uint16_t>(maxLoRARank_ * sizeof(X_T));
DataCopyPad(xLocal, xGm_[maxLoRARank_ * idx], {1, blockLen, 0, 0}, {});
}
inQueueX_.EnQue(xLocal);
xLocal = inQueueX_.DeQue<X_T>();
AscendC::LocalTensor<float> xDup = dupBufferX_.Get<float>();
// Since one API invocation produces multiple output elements, the X vector is duplicated
// enough times to fill one full repeat (NUM_BYTES_PER_REPEAT bytes).
if constexpr (std::is_same_v<X_T, float>) {
for (int32_t i = 0; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
for (int32_t j = 0; j < maxLoRARank_; j++) {
float entry = xLocal.GetValue(j);
xDup.SetValue(i + j, entry);
}
}
} else {
Cast(xDup, xLocal, AscendC::RoundMode::CAST_NONE, maxLoRARank_);
pipe_barrier(PIPE_V);
for (int32_t i = maxLoRARank_; i < NUM_ELEMENTS_PER_REPEAT; i += maxLoRARank_) {
for (int32_t j = 0; j < maxLoRARank_; j++) {
float entry = xDup.GetValue(j);
xDup.SetValue(i + j, entry);
}
}
}
inQueueX_.FreeTensor(xLocal);
}
__aicore__ inline void CopyInY(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
{
AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.AllocTensor<Y_T>();
DataCopy(yInLocal, yInGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], numElements);
inQueueY_.EnQue(yInLocal);
}
__aicore__ inline void CopyInW(int32_t progress, int32_t numElements = W_IN_TILE_NUM_ELEMENTS)
{
AscendC::LocalTensor<W_T> wLocal = inQueueW_.AllocTensor<W_T>();
DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + progress * W_IN_TILE_NUM_ELEMENTS], numElements);
inQueueW_.EnQue(wLocal);
}
__aicore__ inline void ScaleOutput(int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
{
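// No extra scaling happens here: the accumulated LoRA partial results in tmpBufferY_ are added to the
// original yIn values and cast back to Y_T for output.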
AscendC::LocalTensor<float> yLocal = tmpBufferY_.Get<float>();
AscendC::LocalTensor<Y_T> yInLocal = inQueueY_.DeQue<Y_T>();
AscendC::LocalTensor<float> yInLocalFP32 = inBufferY_.Get<float>();
Cast(yInLocalFP32, yInLocal, AscendC::RoundMode::CAST_NONE, numElements);
pipe_barrier(PIPE_V);
inQueueY_.FreeTensor(yInLocal);
Add(yLocal, yLocal, yInLocalFP32, numElements);
pipe_barrier(PIPE_V);
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
Cast(yOutLocal, yLocal, AscendC::RoundMode::CAST_RINT, numElements);
pipe_barrier(PIPE_V);
outQueueY_.EnQue<Y_T>(yOutLocal);
}
__aicore__ inline void Compute(int32_t progress,
int32_t blockReduceRepeatCount=BLOCK_REDUCE_NUM_REPEATS,
int32_t pairReduceRepeat16=PAIR_REDUCE_NUM_REPEATS_16,
int32_t pairReduceRepeat32=PAIR_REDUCE_NUM_REPEATS_32)
{
AscendC::LocalTensor<float> yLocal = tmpBufferY_.Get<float>();
AscendC::LocalTensor<float> xDup = dupBufferX_.Get<float>();
AscendC::LocalTensor<W_T> wLocal = inQueueW_.DeQue<W_T>();
AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, MASK_COUNT, blockReduceRepeatCount, castParams_);
pipe_barrier(PIPE_V);
inQueueW_.FreeTensor(wLocal);
Mul(wTmpTensor, xDup, wTmpTensor, MASK_COUNT, blockReduceRepeatCount, dotProductParams_);
pipe_barrier(PIPE_V);
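// Reduce each group of maxLoRARank_ products down to a single output element.
// Rank 8 needs one block reduce; ranks 16/32/64 chain additional block/pair reduces.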
if (maxLoRARank_ == LORA_RANK_8) {
BlockReduceSum(yLocal[progress], wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
} else if (maxLoRARank_ == LORA_RANK_16) {
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
} else if (maxLoRARank_ == LORA_RANK_32) {
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
PairReduceSum(wTmpTensor, wTmpTensor, pairReduceRepeat16, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
PairReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat32, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
} else if (maxLoRARank_ == LORA_RANK_64) {
BlockReduceSum(wTmpTensor, wTmpTensor, blockReduceRepeatCount, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
BlockReduceSum(yLocal[progress], wTmpTensor, pairReduceRepeat16, MASK_COUNT,
reduceSumParams_.dstRepStride, reduceSumParams_.srcBlkStride, reduceSumParams_.srcRepStride);
pipe_barrier(PIPE_V);
}
}
__aicore__ inline void CopyOut(int32_t progress, int32_t numElements = Y_OUT_TILE_NUM_ELEMENTS)
{
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.DeQue<Y_T>();
DataCopy(yOutGm_[yOffset_ + progress * Y_OUT_TILE_NUM_ELEMENTS], yOutLocal, numElements);
outQueueY_.FreeTensor(yOutLocal);
}
private:
AscendC::TPipe* pipe_;
AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueY_, inQueueW_;
AscendC::TQue<AscendC::QuePosition::VECIN, 1> inQueueX_;
AscendC::TQue<AscendC::QuePosition::VECOUT, BUFFER_NUM> outQueueY_;
AscendC::TBuf<AscendC::QuePosition::VECCALC> tmpBufferW_, dupBufferX_, inBufferY_, tmpBufferY_;
AscendC::GlobalTensor<X_T> xGm_;
AscendC::GlobalTensor<W_T> wGm_;
AscendC::GlobalTensor<Y_T> yInGm_;
AscendC::GlobalTensor<Y_T> yOutGm_;
AscendC::GlobalTensor<int64_t> loraIndicesGm_;
AscendC::GlobalTensor<int64_t> seqLenGm_;
uint32_t batchSize_;
uint32_t numTokensPerCore_;
uint32_t maxLoRARank_;
uint32_t outputHiddenDim_;
uint32_t sliceOffset_;
uint32_t outputFullDim_;
uint32_t singleLoRAWeightLen_;
int64_t reqLoRAIndex_;
uint64_t reqLoRAWeightOffset_;
uint32_t numOutputElementsPerInputTile_;
uint32_t numStreamInPerOutputTile_;
uint64_t yOffset_;
// The block stride is set to 1, so the 8 blocks within a repeat are processed contiguously.
// The repeat stride is 8, so the vector unit reads 8 consecutive blocks in the first repeat
// and the next 8 consecutive blocks in the second repeat.
AscendC::UnaryRepeatParams castParams_ = {1, 1, 8, 4};
// For each repeat in BlockReduceSum and PairReduceSum the destination advances by only one block,
// so dstRepStride is set to 1.
AscendC::UnaryRepeatParams reduceSumParams_ = {1, 1, 1, 8};
// When the repeat stride is 0, the vector unit repeatedly reads and computes the same first 8 blocks.
// xDup is reused for every repeat, so src0RepStride is set to 0.
AscendC::BinaryRepeatParams dotProductParams_ = {1, 1, 1, 8, 0, 8};
};
#define SGMV_EXPAND_TYPE_DECLARE(TYPE) \
extern "C" __global__ __aicore__ void sgmv_expand_##TYPE(__gm__ void* x, __gm__ void* weight, \
__gm__ void* loraIndices, uint32_t loraIndicesSize, \
__gm__ void* seqLen, uint32_t seqLenSize, \
__gm__ void* yIn, __gm__ void* yOut, \
uint32_t batchSize, uint32_t numTokensPerCore, \
uint32_t maxLoRARank, uint32_t outputHiddenDim, \
uint32_t sliceOffset, uint32_t outputFullDim) \
{ \
AscendC::TPipe pipe; \
SGMVExpand<TYPE> op(&pipe); \
op.Init(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, \
yIn, yOut, batchSize, numTokensPerCore, maxLoRARank, \
outputHiddenDim, sliceOffset, outputFullDim); \
op.Process(); \
}
// declare all dtype kernel
SGMV_EXPAND_TYPE_DECLARE(half)
#if (__CCE_AICORE__ >= 220)
SGMV_EXPAND_TYPE_DECLARE(bfloat16_t)
#endif
namespace vllm_ascend {
extern void sgmv_expand_impl(AscendType type, void* stream, void* x, void* weight,
void* loraIndices, uint32_t loraIndicesSize,
void* seqLen, uint32_t seqLenSize,
void* yIn, void* yOut, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t maxLoRARank,
uint32_t outputHiddenDim, uint32_t sliceOffset, uint32_t outputFullDim)
{
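// Each AI core handles numTokensPerCore tokens; round up so every token in the batch is covered.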
uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore;
if (type == AscendType::FP16) {
sgmv_expand_half<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize,
yIn, yOut, batchSize,
numTokensPerCore, maxLoRARank, outputHiddenDim, sliceOffset,
outputFullDim);
} else if (type == AscendType::BF16) {
#if (__CCE_AICORE__ >= 220)
sgmv_expand_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize,
seqLen, seqLenSize, yIn, yOut, batchSize,
numTokensPerCore, maxLoRARank, outputHiddenDim,
sliceOffset, outputFullDim);
#endif
} else {
return;
}
}
} // namespace vllm_ascend
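Both kernels in this patch resolve a token's adapter id through the same seqLen/loraIndices walk (CopyInIndex above). The host-side C++ reference below mirrors that logic; it is an illustrative sketch added for clarity, not part of the patch.
#include <cstdint>
#include <vector>
// Reference of the device-side CopyInIndex loop: map a flat token index to its request's LoRA id.
// seqLen[i] tokens in a row share loraIndices[i]; a negative id means "no LoRA applied".
inline int64_t lookup_lora_index(int64_t tokenIdx,
                                 const std::vector<int64_t> &seqLen,
                                 const std::vector<int64_t> &loraIndices)
{
    for (size_t i = 0; i < seqLen.size(); ++i) {
        if (tokenIdx < seqLen[i]) {
            return loraIndices[i]; // token falls inside request i
        }
        tokenIdx -= seqLen[i];     // skip past request i's tokens
    }
    return -1; // past the last request: the kernels skip such tokens
}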

@ -0,0 +1,275 @@
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel_operator.h"
#include "types.h"
template <typename scalar_t>
class SGMVShrink {
public:
using X_T = scalar_t;
using W_T = scalar_t;
using Y_T = float;
static constexpr uint64_t BUFFER_NUM = 1;
static constexpr uint64_t TILE_LENGTH = 11776; // optimal performance tile length
public:
__aicore__ inline SGMVShrink(AscendC::TPipe *pipe) : pipe_(pipe) {}
__aicore__ inline void Init(__gm__ void *x, __gm__ void *weight, __gm__ void *loraIndices, uint32_t loraIndicesSize,
__gm__ void *seqLen, uint32_t seqLenSize,
__gm__ void *y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
uint32_t maxLoRARank, float scale)
{
batchSize_ = batchSize;
numTokensPerCore_ = numTokensPerCore;
inputHiddenDim_ = inputHiddenDim;
maxLoRARank_ = maxLoRARank;
scale_ = scale;
singleLoRAWeightLen_ = inputHiddenDim_ * maxLoRARank_;
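// Stream X in tile by tile (incremental mode) only when the hidden dimension does not fit in one tile.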
incremental_ = inputHiddenDim_ > TILE_LENGTH;
xGm_.SetGlobalBuffer((__gm__ X_T *)x);
yOutGm_.SetGlobalBuffer((__gm__ Y_T *)y);
wGm_.SetGlobalBuffer((__gm__ W_T *)weight);
loraIndicesGm_.SetGlobalBuffer((__gm__ int64_t *)loraIndices, loraIndicesSize);
seqLenGm_.SetGlobalBuffer((__gm__ int64_t *)seqLen, seqLenSize);
pipe_->InitBuffer(inQueueX_, BUFFER_NUM, TILE_LENGTH * sizeof(X_T));
pipe_->InitBuffer(inQueueW_, BUFFER_NUM, TILE_LENGTH * sizeof(W_T));
pipe_->InitBuffer(tmpBufferX_, TILE_LENGTH * sizeof(float));
pipe_->InitBuffer(tmpBufferW_, TILE_LENGTH * sizeof(float));
pipe_->InitBuffer(outQueueY_, 1, maxLoRARank_ * sizeof(Y_T));
pipe_->InitBuffer(outBufferY_, maxLoRARank_ * sizeof(float));
}
__aicore__ inline void Process()
{
int64_t blockIdx = AscendC::GetBlockIdx();
int64_t startIdx = blockIdx * numTokensPerCore_;
int64_t endIdx = startIdx + numTokensPerCore_;
if (endIdx > batchSize_) {
endIdx = batchSize_;
}
for (int64_t idx = startIdx; idx < endIdx; idx++) {
// set up LoRA index
CopyInIndex(idx);
if (reqLoRAIndex_ < 0) {
continue;
}
reqLoRAWeightOffset_ = reqLoRAIndex_ * singleLoRAWeightLen_;
if (incremental_) {
ProcessImpl<true>(idx);
} else {
ProcessImpl<false>(idx);
}
ScaleOutput();
CopyOut(idx);
}
}
private:
template <bool INCREMENTAL_MODE>
__aicore__ inline void ProcessImpl(const int64_t idx)
{
AscendC::LocalTensor<float> yOutLocal = outBufferY_.Get<float>();
if constexpr (!INCREMENTAL_MODE) {
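// Non-incremental mode: X fits into UB, so load and cast it once and reuse it for every rank row.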
CopyInX(idx, 0, inputHiddenDim_);
AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, inputHiddenDim_);
pipe_barrier(PIPE_V);
inQueueX_.FreeTensor(xLocal);
}
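// One output element per LoRA rank row: accumulate the dot product of X with row i of the LoRA weight.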
for (int i = 0; i < maxLoRARank_; i++) {
float acc(0);
for (int32_t j = 0; j < inputHiddenDim_ / TILE_LENGTH; j++) {
if constexpr (INCREMENTAL_MODE) {
CopyInX(idx, j);
}
CopyInW(i, j);
Compute<INCREMENTAL_MODE>(acc);
}
CopyAndComputeLastIteration<INCREMENTAL_MODE>(idx, i, acc);
yOutLocal.SetValue(i, acc);
}
}
__aicore__ inline void CopyInIndex(const int64_t idx)
{
// look up the LoRA index
int64_t weightIdx = idx;
uint64_t i = 0;
for (; i < seqLenGm_.GetSize(); i++) {
int64_t repeatValue = seqLenGm_.GetValue(i);
if (weightIdx >= repeatValue) {
weightIdx -= repeatValue;
continue;
}
break;
}
reqLoRAIndex_ = (i < seqLenGm_.GetSize()) ? loraIndicesGm_.GetValue(i) : -1;
}
__aicore__ inline void CopyInX(const int64_t idx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
{
AscendC::LocalTensor<X_T> xLocal = inQueueX_.AllocTensor<X_T>();
DataCopy(xLocal, xGm_[inputHiddenDim_ * idx + colIdx * TILE_LENGTH], numElements);
inQueueX_.EnQue(xLocal);
}
__aicore__ inline void CopyInW(int32_t rowIdx, int32_t colIdx, int32_t numElements = TILE_LENGTH)
{
AscendC::LocalTensor<W_T> wLocal = inQueueW_.AllocTensor<W_T>();
DataCopy(wLocal, wGm_[reqLoRAWeightOffset_ + rowIdx * inputHiddenDim_ + colIdx * TILE_LENGTH], numElements);
inQueueW_.EnQue(wLocal);
}
template <bool INCREMENTAL_MODE>
__aicore__ inline void Compute(float &acc, int32_t numElements = TILE_LENGTH)
{
AscendC::LocalTensor<W_T> wLocal = inQueueW_.DeQue<W_T>();
AscendC::LocalTensor<float> xTmpTensor = tmpBufferX_.Get<float>();
AscendC::LocalTensor<float> wTmpTensor = tmpBufferW_.Get<float>();
if constexpr (INCREMENTAL_MODE) {
AscendC::LocalTensor<X_T> xLocal = inQueueX_.DeQue<X_T>();
Cast(xTmpTensor, xLocal, AscendC::RoundMode::CAST_NONE, numElements);
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
pipe_barrier(PIPE_V);
inQueueX_.FreeTensor(xLocal);
inQueueW_.FreeTensor(wLocal);
} else {
Cast(wTmpTensor, wLocal, AscendC::RoundMode::CAST_NONE, numElements);
pipe_barrier(PIPE_V);
inQueueW_.FreeTensor(wLocal);
}
// element-wise product of one tile of X and one tile of W
Mul(wTmpTensor, xTmpTensor, wTmpTensor, numElements);
pipe_barrier(PIPE_V);
// ReduceSum collapses the tile to a single value: the partial dot product for this tile
ReduceSum<float>(wTmpTensor, wTmpTensor, wTmpTensor, numElements);
pipe_barrier(PIPE_V);
acc += wTmpTensor.GetValue(0);
}
template <bool INCREMENTAL_MODE>
__aicore__ inline void CopyAndComputeLastIteration(const int64_t idx, int32_t rowIdx, float &acc)
{
int32_t colIdx = inputHiddenDim_ / TILE_LENGTH;
int32_t remaining = inputHiddenDim_ % TILE_LENGTH;
if (remaining == 0) {
return;
}
if constexpr (INCREMENTAL_MODE) {
CopyInX(idx, colIdx, remaining);
}
CopyInW(rowIdx, colIdx, remaining);
Compute<INCREMENTAL_MODE>(acc, remaining);
}
__aicore__ inline void ScaleOutput()
{
AscendC::LocalTensor<float> yLocal = outBufferY_.Get<float>();
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.AllocTensor<Y_T>();
Muls(yOutLocal, yLocal, scale_, maxLoRARank_);
pipe_barrier(PIPE_V);
outQueueY_.EnQue<Y_T>(yOutLocal);
}
__aicore__ inline void CopyOut(const int64_t idx)
{
AscendC::LocalTensor<Y_T> yOutLocal = outQueueY_.DeQue<Y_T>();
DataCopy(yOutGm_[maxLoRARank_ * idx], yOutLocal, maxLoRARank_);
outQueueY_.FreeTensor(yOutLocal);
}
private:
AscendC::TPipe *pipe_;
AscendC::TQue<AscendC::QuePosition::VECIN, BUFFER_NUM> inQueueX_, inQueueW_;
AscendC::TQue<AscendC::QuePosition::VECOUT, 1> outQueueY_;
AscendC::TBuf<AscendC::QuePosition::VECCALC> tmpBufferX_, tmpBufferW_, outBufferY_;
AscendC::GlobalTensor<X_T> xGm_;
AscendC::GlobalTensor<W_T> wGm_;
AscendC::GlobalTensor<int64_t> loraIndicesGm_;
AscendC::GlobalTensor<int64_t> seqLenGm_;
AscendC::GlobalTensor<Y_T> yOutGm_;
uint32_t batchSize_;
uint32_t numTokensPerCore_;
uint32_t inputHiddenDim_;
uint32_t maxLoRARank_;
float scale_;
uint32_t singleLoRAWeightLen_;
int64_t reqLoRAIndex_;
uint64_t reqLoRAWeightOffset_;
bool incremental_;
};
#define SGMV_SHRINK_TYPE_DECLARE(TYPE) \
extern "C" __global__ __aicore__ void sgmv_shrink_##TYPE(__gm__ void* x, __gm__ void* weight, \
__gm__ void* loraIndices, uint32_t loraIndicesSize, \
__gm__ void* seqLen, uint32_t seqLenSize, \
__gm__ void* y, uint32_t batchSize, \
uint32_t numTokensPerCore, uint32_t inputHiddenDim, \
uint32_t maxLoRARank, float scale) \
{ \
AscendC::TPipe pipe; \
SGMVShrink<TYPE> op(&pipe); \
op.Init(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize, \
y, batchSize, numTokensPerCore, inputHiddenDim, maxLoRARank, scale); \
op.Process(); \
}
// declare all dtype kernel
SGMV_SHRINK_TYPE_DECLARE(half)
#if (__CCE_AICORE__ >= 220)
SGMV_SHRINK_TYPE_DECLARE(bfloat16_t)
#endif
namespace vllm_ascend {
extern void sgmv_shrink_impl(AscendType type, void* stream, void* x, void* weight,
void* loraIndices, uint32_t loraIndicesSize,
void* seqLen, uint32_t seqLenSize,
void* y, uint32_t batchSize, uint32_t numTokensPerCore, uint32_t inputHiddenDim,
uint32_t maxLoRARank, float scale)
{
uint32_t blockDim = (batchSize + numTokensPerCore - 1) / numTokensPerCore;
if (type == AscendType::FP16) {
sgmv_shrink_half<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize, seqLen, seqLenSize,
y, batchSize,
numTokensPerCore, inputHiddenDim, maxLoRARank,
scale);
} else if (type == AscendType::BF16) {
#if (__CCE_AICORE__ >= 220)
sgmv_shrink_bfloat16_t<<<blockDim, nullptr, stream>>>(x, weight, loraIndices, loraIndicesSize,
seqLen, seqLenSize,
y, batchSize,
numTokensPerCore, inputHiddenDim, maxLoRARank,
scale);
#endif
} else {
return;
}
}
} // namespace vllm_ascend
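sgmv_shrink_impl and sgmv_expand_impl are the two halves of the usual SGMV LoRA update: shrink projects activations down to the LoRA rank (applying scale), expand projects back up and accumulates into a slice of the base output. The sketch below only illustrates the intended call order; the wrapper name, buffer names, in-place yIn/yOut aliasing, and the dtype of the rank-sized intermediate are assumptions, not something these files define.
// Hypothetical host-side composition of the two kernels (illustrative only).
void apply_lora_slice(void *stream,
                      void *x,           // [numTokens, inputHiddenDim] activations; FP16 assumed here
                      void *loraA,       // shrink (LoRA-A) weights for all adapters
                      void *loraB,       // expand (LoRA-B) weights for all adapters
                      void *loraIndices, uint32_t loraIndicesSize,
                      void *seqLen, uint32_t seqLenSize,
                      void *scratch,     // [numTokens, maxRank] intermediate; dtype assumed compatible with both kernels
                      void *yInOut,      // [numTokens, outputFullDim] base output; in-place update assumed allowed
                      uint32_t numTokens, uint32_t numTokensPerCore,
                      uint32_t inputHiddenDim, uint32_t outputHiddenDim,
                      uint32_t maxRank, uint32_t sliceOffset, uint32_t outputFullDim,
                      float scale)
{
    using vllm_ascend::AscendType;
    // Stage 1: scratch = scale * (x @ A_i^T) per token, where A_i is the token's adapter.
    vllm_ascend::sgmv_shrink_impl(AscendType::FP16, stream, x, loraA,
                                  loraIndices, loraIndicesSize, seqLen, seqLenSize,
                                  scratch, numTokens, numTokensPerCore,
                                  inputHiddenDim, maxRank, scale);
    // Stage 2: yInOut[:, sliceOffset : sliceOffset + outputHiddenDim] += scratch @ B_i^T.
    vllm_ascend::sgmv_expand_impl(AscendType::FP16, stream, scratch, loraB,
                                  loraIndices, loraIndicesSize, seqLen, seqLenSize,
                                  yInOut, yInOut, numTokens, numTokensPerCore,
                                  maxRank, outputHiddenDim, sliceOffset, outputFullDim);
}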

csrc/kernels/types.h
@ -0,0 +1,25 @@
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
namespace vllm_ascend {
enum struct AscendType {
FP16 = 0,
BF16 = 1,
FP32 = 2,
};
}

csrc/kernels/utils.h
@ -0,0 +1,51 @@
/*
* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "kernel_type.h"
namespace vllm_ascend {
template <typename scalar_t> struct AccType;
#if (__CCE_AICORE__ >= 220)
template <> struct AccType<bfloat16_t> {
using type = float;
};
#endif
template <> struct AccType<half> {
using type = half;
};
template <> struct AccType<float> {
using type = float;
};
template <> struct AccType<int8_t> {
using type = int;
};
template <typename scalar_t>
__aicore__ inline void local_mem_copy(AscendC::LocalTensor<scalar_t> dst, AscendC::LocalTensor<scalar_t> src, int size)
{
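// One vector Copy repeat covers 256 bytes (256 / sizeof(scalar_t) elements):
// move the full repeats first, then one final repeat for the tail elements.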
constexpr int loadSize = 256 / sizeof(scalar_t);
int loopCnt = size / loadSize;
int tailSize = size % loadSize;
if (loopCnt)
AscendC::Copy(dst, src, loadSize, loopCnt, {1, 1, 8, 8});
AscendC::Copy(dst[loopCnt * loadSize], src[loopCnt * loadSize], tailSize, 1, {1, 1, 8, 8});
}
} // namespace vllm_ascend

@ -0,0 +1,698 @@
// Adapted from
// https://gitee.com/ascend/ascend-transformer-boost.git
// https://gitee.com/ascend/op-plugin.git
//
// Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
// This file is a part of the CANN Open Software.
// Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
// Please refer to the License for details. You may not use this file except in compliance with the License.
// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
// See LICENSE in the root of the software repository for the full text of the License.
//
#include <fstream>
#include <iostream>
#include <math.h>
#include <stdexcept>
#include "acl/acl.h"
// #include "defines.h"
// #include "torch_helper.h"
#include "tiling/platform/platform_ascendc.h"
#include "tiling/mla_preprocess_tiling.h"
// #include "aclrtlaunch_mla_preprocess.h"
// namespace sglang {
namespace mlapo {
constexpr uint32_t DIM_2 = 2;
constexpr uint32_t AXES_ALIGN_SIZE = 512;
constexpr uint32_t BASE_BLOCK_STEP = 2;
constexpr uint32_t CONST_16 = 16;
constexpr uint32_t CONST_32 = 32;
constexpr uint32_t CONST_128 = 128;
constexpr uint32_t CONST_256 = 256;
constexpr uint32_t CONST_512 = 512;
constexpr uint32_t L1_BUFFER_SIZE = 524288;
constexpr uint32_t L1_PINGPONG_BUFFER_LEN = 262144;
constexpr uint32_t L0AB_PINGPONG_BUFFER_LEN = 131072;
constexpr uint32_t L1_SCALE_SIZE = 4096;
constexpr uint32_t L1_BIAS_SIZE = 2048;
constexpr uint32_t L0C_SIZE = 128 * 1024;
constexpr uint32_t CONCAT_SIZE = 512;
constexpr uint32_t HIDDEN_STRATE = 7168;
constexpr uint32_t HIDDEN_STRATE_ROPE = 192;
constexpr uint32_t HIDDEN_STRATE_MM = 2112;
constexpr uint32_t HIDDEN_STRATE_RMS = 1536;
constexpr uint32_t UB_SIZE = 196352;
constexpr uint32_t HEADDIM = 64;
constexpr uint32_t FP32_REPEAT_MASK = 64;
constexpr uint32_t FP16_REPEAT_MASK = 128;
constexpr int32_t NUM1 = 1;
constexpr int32_t NUM2 = 2;
constexpr int32_t NUM3 = 3;
constexpr int32_t NUM4 = 4;
constexpr int32_t NUM8 = 8;
constexpr uint32_t INDEX_WDQKV = 5;
constexpr uint32_t INDEX_WUQ = 18;
constexpr uint32_t INDEX_WUK = 20;
constexpr uint32_t MAX_SUPPORT_TOKEN_NUMS = 1024;
inline uint32_t CeilDiv(const uint32_t dividend, const uint32_t divisor)
{
if (divisor == 0) {
return UINT32_MAX;
}
return (dividend + divisor - 1) / divisor;
}
inline uint32_t RoundUp(const uint32_t val, const uint32_t align = 16)
{
if (align == 0) {
return 0;
}
return (val + align - 1) / align * align;
}
inline uint32_t RoundDown(const uint32_t val, const uint32_t align = 16)
{
if (align == 0) {
return 0;
}
return val / align * align;
}
template <typename T = uint32_t>
inline T Max(const T a, const T b)
{
return a > b ? a : b;
}
template <typename T = uint32_t>
inline T Min(const T a, const T b)
{
return a < b ? a : b;
}
struct MlaPreprocess {
enum class QuantMode : int32_t {
PER_TENSOR_ASYMM_QUANT = 0,
PER_TOKEN_SYMM_QUANT,
PER_TOKEN_ASYMM_QUANT,
NO_QUANT
};
};
using QuantMode = MlaPreprocess::QuantMode;
struct PlatformInfo {
uint32_t coreNum;
uint32_t coreNumAic;
uint32_t coreNumAiv;
uint64_t ubSize;
uint64_t l1Size;
uint64_t l2Size;
uint64_t l0aSize;
uint64_t l0bSize;
uint64_t l0cSize;
};
struct OpParam {
uint32_t N;
uint32_t headNum;
int32_t cacheMode;
QuantMode quantMode;
caffe2::TypeMeta inDtype;
};
class PpMatmulTilingApi
{
public:
PpMatmulTilingApi(struct PlatformInfo &platformInfo, uint32_t numBatch, uint32_t m, uint32_t k, uint32_t n,
bool transA, bool transB, bool enDequant, bool deqOnTheFly)
: platformInfo_(platformInfo),
numBatch_(numBatch),
m_(m),
k_(k),
n_(n),
transA_(transA),
transB_(transB),
enDequant_(enDequant),
deqOnTheFly_(deqOnTheFly)
{
inDataSize_ = enDequant ? sizeof(uint8_t) : sizeof(uint16_t);
}
void GetTilingData(PpMatmulTilingData &tiling);
private:
void GetTileSize();
float GetCost(const uint32_t m0, const uint32_t n0);
void UpdateTileSize(const uint32_t m0, const uint32_t n0);
void Swizzle();
uint32_t ComputeL1AbSize();
uint32_t ComputeK0ForABpingpong(uint32_t l1AbSize);
bool IsLoadAllAmat(uint32_t l1AbSize);
uint32_t ComputeK0ForOnlyBpingpong(uint32_t l1AbSize);
private:
uint32_t numBatch_{0};
uint32_t m_{0};
uint32_t k_{0};
uint32_t n_{0};
uint32_t m0_{0};
uint32_t k0_{0};
uint32_t n0_{0};
uint32_t mLoop_{0};
uint32_t kLoop_{0};
uint32_t nLoop_{0};
uint32_t coreLoop_{0};
uint32_t swizzleCount_{0};
uint32_t blockDim_{0};
uint32_t swizzleDirect_{0};
uint32_t inDataSize_{0};
uint32_t b0matPingPongBufferLen_{L1_PINGPONG_BUFFER_LEN};
bool transA_{false};
bool transB_{false};
bool enDequant_{false};
bool enShuffleK_{false};
bool enLoadAllAmat_{false};
bool deqOnTheFly_{false};
struct PlatformInfo platformInfo_;
};
void PpMatmulTilingApi::GetTilingData(PpMatmulTilingData &tiling)
{
GetTileSize();
tiling.numBatch = numBatch_;
tiling.m = m_;
tiling.k = k_;
tiling.n = n_;
tiling.m0 = m0_;
tiling.k0 = k0_;
tiling.n0 = n0_;
tiling.mLoop = mLoop_;
tiling.kLoop = kLoop_;
tiling.nLoop = nLoop_;
tiling.coreLoop = coreLoop_;
tiling.swizzleCount = swizzleCount_;
tiling.swizzleDirect = swizzleDirect_;
tiling.enShuffleK = static_cast<uint32_t>(enShuffleK_);
tiling.blockDim = blockDim_;
tiling.enLoadAllAmat = static_cast<uint32_t>(enLoadAllAmat_);
tiling.b0matPingPongBufferLen = b0matPingPongBufferLen_;
}
void PpMatmulTilingApi::GetTileSize()
{
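// Search all candidate (m0, n0) base-block shapes that fit in L0C and keep the one with the lowest
// estimated cost, then derive k0 so that the double-buffered A/B tiles fit in L1.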
bool priFlag = !(m_ < n_);
uint32_t roundBase = pow(2, ceil(log(CeilDiv(priFlag ? n_ : m_, CONST_16)))) * CONST_16;
uint32_t priAxes = RoundUp(priFlag ? m_ : n_, CONST_16);
uint32_t subAxes = RoundUp(priFlag ? n_ : m_, roundBase);
float minCost = __FLT_MAX__;
uint32_t maxAxes0 = AXES_ALIGN_SIZE;
uint32_t maxPriAxes0 = Min(maxAxes0, priAxes);
uint32_t maxSubAxes0 = Min(maxAxes0, subAxes);
for (uint32_t priAxes0 = CONST_16; priAxes0 <= maxPriAxes0; priAxes0 *= BASE_BLOCK_STEP) {
for (uint32_t subAxes0 = CONST_16; subAxes0 <= maxSubAxes0; subAxes0 *= BASE_BLOCK_STEP) {
if (priAxes0 * subAxes0 * sizeof(float) > platformInfo_.l0cSize) {
continue;
}
uint32_t newM0 = priFlag ? priAxes0 : subAxes0;
uint32_t newN0 = priFlag ? subAxes0 : priAxes0;
if (newN0 > CONST_256 && enDequant_) {
continue;
}
float cost = GetCost(newM0, newN0);
if (cost < minCost) {
minCost = cost;
UpdateTileSize(newM0, newN0);
}
}
}
Swizzle();
uint32_t l1AbSize = ComputeL1AbSize();
k0_ = ComputeK0ForABpingpong(l1AbSize);
kLoop_ = CeilDiv(k_, k0_);
}
uint32_t PpMatmulTilingApi::ComputeK0ForOnlyBpingpong(uint32_t l1AbSize)
{
enLoadAllAmat_ = true;
b0matPingPongBufferLen_ = static_cast<uint32_t>(
static_cast<float>((l1AbSize - RoundUp(m_, CONST_16) * RoundUp(k_, CONST_32) * inDataSize_) / DIM_2));
uint32_t k0MaxB0 =
static_cast<uint32_t>(static_cast<float>(b0matPingPongBufferLen_ / (RoundUp(n0_, CONST_16) * inDataSize_)));
uint32_t k0B0 = k0MaxB0 < CONST_512 ? RoundDown(k0MaxB0, CONST_32) : RoundDown(k0MaxB0, CONST_512);
return k0B0 > CONST_512 ? RoundDown(k0B0, CONST_512) : k0B0;
}
bool PpMatmulTilingApi::IsLoadAllAmat(uint32_t l1AbSize)
{
return (coreLoop_ > blockDim_) && enDequant_ && (kLoop_ > 1) &&
(l1AbSize > RoundUp(m_, CONST_16) * RoundUp(k_, CONST_32) * inDataSize_) && (mLoop_ == 1);
}
uint32_t PpMatmulTilingApi::ComputeK0ForABpingpong(uint32_t l1AbSize)
{
uint32_t k0Max = static_cast<uint32_t>(static_cast<float>(l1AbSize / DIM_2) / ((m0_ + n0_) * inDataSize_));
uint32_t tmpK0;
if (enDequant_) {
tmpK0 = k0Max < CONST_512 ? RoundDown(k0Max, CONST_32) : RoundDown(k0Max, CONST_512);
} else {
tmpK0 = k0Max < CONST_256 ? RoundDown(k0Max, CONST_16) : RoundDown(k0Max, CONST_256);
}
if (tmpK0 > CONST_512) {
tmpK0 = RoundDown(tmpK0, CONST_512);
}
return tmpK0;
}
uint32_t PpMatmulTilingApi::ComputeL1AbSize()
{
if (enDequant_ && deqOnTheFly_) {
return L1_BUFFER_SIZE;
}
return enDequant_ ? (L1_BUFFER_SIZE - L1_BIAS_SIZE - L1_SCALE_SIZE) : L1_BUFFER_SIZE;
}
float PpMatmulTilingApi::GetCost(const uint32_t m0, const uint32_t n0)
{
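// Heuristic cost model: larger tiles are cheaper, but the A/B coefficients are inflated
// when a tile stream no longer fits in L2 or when the stride pattern is unfavorable.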
float aCoef = 1.0;
float bCoef = 1.0;
float bwCoef = 5.0;
uint32_t mLoop = CeilDiv(m_, m0);
uint32_t nLoop = CeilDiv(n_, n0);
if (mLoop == 0 || nLoop == 0) {
return __FLT_MAX__;
}
uint32_t rqdNumCore = numBatch_ * mLoop * nLoop;
uint32_t blockDim = Min(rqdNumCore, platformInfo_.coreNumAic);
uint32_t mOnce = blockDim < nLoop ? m0 : blockDim / nLoop * m0;
uint32_t nOnce = blockDim < nLoop ? platformInfo_.coreNumAic * n0 : n_;
if (mOnce * k_ * sizeof(uint16_t) > platformInfo_.l2Size) {
aCoef = bwCoef;
}
if (nOnce * k_ * sizeof(uint16_t) > platformInfo_.l2Size) {
bCoef = bwCoef;
}
if (transA_ && m0 % CONST_256 == 0) {
aCoef *= NUM2;
}
if (!transB_ && n0 % CONST_256 == 0) {
bCoef *= NUM2;
}
return 1 / (aCoef * static_cast<float>(n0)) + 1 / (bCoef * static_cast<float>(m0));
}
void PpMatmulTilingApi::UpdateTileSize(const uint32_t m0, const uint32_t n0)
{
m0_ = m0;
n0_ = n0;
mLoop_ = CeilDiv(m_, m0_);
nLoop_ = CeilDiv(n_, n0_);
coreLoop_ = numBatch_ * mLoop_ * nLoop_;
const uint32_t maxNumCubeCore = platformInfo_.coreNumAic;
if (mLoop_ == 1 && transB_ && coreLoop_ % maxNumCubeCore < maxNumCubeCore / NUM4 * NUM3) {
uint32_t tmpM0 = RoundUp(m_, CONST_16);
uint32_t maxN0 = L0C_SIZE / (tmpM0 * sizeof(float));
if (enDequant_) {
maxN0 = maxN0 < CONST_256 ? maxN0 : CONST_256;
}
uint32_t x = CeilDiv(n_, maxNumCubeCore);
uint32_t y = CeilDiv(x, maxN0);
uint32_t tmpN0 = RoundUp(CeilDiv(x, y), CONST_16);
uint32_t rqdL0cSize = tmpM0 * tmpN0 * sizeof(float);
if (rqdL0cSize < L0C_SIZE && (tmpM0 + tmpN0) * CONST_256 * inDataSize_ < L1_BUFFER_SIZE) {
m0_ = tmpM0;
n0_ = tmpN0;
nLoop_ = CeilDiv(n_, n0_);
coreLoop_ = numBatch_ * nLoop_;
}
}
blockDim_ = Min(coreLoop_, maxNumCubeCore);
}
void PpMatmulTilingApi::Swizzle()
{
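// Choose the swizzle direction (1 = Nz, 0 = Zn) and count that minimize the combined A/B tile traffic.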
float minCost = m_ * k_ + k_ * n_;
for (uint32_t i = 1; i <= blockDim_; ++i) {
int c = static_cast<int32_t>((blockDim_ + i - 1) / i);
float cost;
// B0 + A < A0 + B
if (i * n0_ + m_ < m0_ * c + n_) {
swizzleDirect_ = 1; // Nz
cost = n0_ * i + m0_ * c;
if (cost <= minCost) {
minCost = cost;
swizzleCount_ = i;
}
} else {
swizzleDirect_ = 0; // Zn
cost = m0_ * i + n0_ * c;
if (cost < minCost) {
minCost = cost;
swizzleCount_ = i;
}
}
}
}
class MlaPreprocessTiling
{
public:
MlaPreprocessTiling(struct PlatformInfo &platformInfo, struct OpParam &opParam, MlaTilingData *tilingData)
{
this->tilingData = tilingData;
this->platformInfo = platformInfo;
this->opParam = opParam;
}
void Init();
void RmsNormQuantTiling();
void RopeConcatTiling();
void EinSumQuantTiling();
void SetTilingKey();
void SetMlapoWorkSpace();
private:
MlaTilingData *tilingData;
struct PlatformInfo platformInfo;
struct OpParam opParam;
};
void MlaPreprocessTiling::RmsNormQuantTiling()
{
tilingData->rmsNumCore1 = platformInfo.coreNumAiv;
tilingData->rmsNumCol1 = HIDDEN_STRATE;
tilingData->rmsNumRow1 = opParam.N;
tilingData->rmsQuantMin1 = -CONST_128;
tilingData->rmsNumCore2 = platformInfo.coreNumAiv;
tilingData->rmsNumCol2 = HIDDEN_STRATE_MM;
tilingData->rmsNumRow2 = opParam.N;
tilingData->rmsQuantMin2 = -CONST_128;
}
void MlaPreprocessTiling::RopeConcatTiling()
{
uint32_t ntokens = opParam.N;
uint32_t hiddenSizeQ = HEADDIM * opParam.headNum;
uint32_t headDim = HEADDIM;
uint32_t headNumQ = hiddenSizeQ / headDim;
uint32_t concatSize = CONCAT_SIZE;
uint32_t maxCore = platformInfo.coreNumAiv;
uint32_t maxUbSize = platformInfo.ubSize;
uint32_t allHeadNum = ntokens * headNumQ;
uint32_t tempCore = (allHeadNum + maxCore - 1) / maxCore;
uint32_t realCore = (allHeadNum + tempCore - 1) / tempCore; // actual number of cores used for this op
uint32_t nlCoreRun = (allHeadNum + realCore - 1) / realCore; // number of heads handled by each leading core
uint32_t lCoreRun = allHeadNum - (realCore - 1) * nlCoreRun; // number of heads handled by the tail core
uint32_t dataTypeSize = 2;
// Calculate how many rows can be moved at a time: q 4+2, reverseq 4, neg 4, sin 4+2, cos 4+2, plus concat 2
uint32_t allSize =
headDim * (3 * (4 + dataTypeSize) + 2 * 4) + concatSize * dataTypeSize; // buffers for the higher-precision ROPE computation
uint32_t maxNPerLoopForUb = maxUbSize / allSize; // the maximum number of rows at a time for UB
uint32_t preCoreLoopTime = (nlCoreRun + maxNPerLoopForUb - 1) / maxNPerLoopForUb; // Number of cycles of front core
uint32_t preCoreLoopNLast =
nlCoreRun -
(preCoreLoopTime - 1) * maxNPerLoopForUb; // rows of data processed in the last batch of the front core
uint32_t lastCoreLoopTime = (lCoreRun + maxNPerLoopForUb - 1) / maxNPerLoopForUb; // Number of cycles of tail core
uint32_t lastCoreLoopNLast =
lCoreRun -
(lastCoreLoopTime - 1) * maxNPerLoopForUb; // rows of data processed in the last batch of the tail core
tilingData->hiddenSizeQ = hiddenSizeQ;
tilingData->headNumQ = headNumQ;
tilingData->headDim = headDim;
tilingData->concatSize = concatSize;
tilingData->rotaryCoeff = NUM2;
tilingData->ntokens = ntokens;
tilingData->realCore = realCore;
tilingData->nlCoreRun = nlCoreRun;
tilingData->lCoreRun = nlCoreRun;
tilingData->maxNPerLoopForUb = maxNPerLoopForUb;
tilingData->preCoreLoopTime = preCoreLoopTime;
tilingData->preCoreLoopNLast = preCoreLoopNLast;
tilingData->lastCoreLoopTime = lastCoreLoopTime;
tilingData->lastCoreLoopNLast = lastCoreLoopNLast;
}
void MlaPreprocessTiling::EinSumQuantTiling()
{
uint32_t aivCore = platformInfo.coreNumAiv;
uint32_t ubSize = UB_SIZE - 1024;
// input shape
uint32_t esqBatch = opParam.N; // tokenNum
uint32_t esqHeadNum = opParam.headNum; // headNum
uint32_t esqColNum = AXES_ALIGN_SIZE; // 512
// split core
uint32_t esqFrontCore = esqBatch % aivCore;
uint32_t esqTailCore = aivCore - esqFrontCore;
uint32_t esqFrontCoreBatch = CeilDiv(esqBatch, aivCore);
uint32_t esqTailCoreBatch = esqBatch / aivCore;
// split UB --> compute H', the number of head rows handled in one UB iteration
uint32_t splitFactor = 0;
uint32_t esqHeadPerLoop = 0; // The number of head rows per UB calculation
uint32_t repeatMask = 0;
if (opParam.inDtype == at::kBFloat16 || opParam.quantMode == QuantMode::PER_TOKEN_SYMM_QUANT) {
// Move all scales in at once, broadcast them, and cache them: H * 32 bytes
uint32_t scaleUb = RoundUp(esqHeadNum) * CONST_32;
// bf16 input [H', colNum](f16 + fp32 + int8), ub reuse
splitFactor = esqColNum * (sizeof(uint16_t) + sizeof(float) + sizeof(uint8_t));
splitFactor *= NUM2;
esqHeadPerLoop = (ubSize - scaleUb) / splitFactor; // 26
repeatMask = FP32_REPEAT_MASK;
} else {
// fp16 input [H', colNum](fp16*2 + int8) + [H', 1](fp16) + [H', 16](fp16)
splitFactor =
esqColNum * (NUM2 * sizeof(uint16_t) + sizeof(uint8_t)) + sizeof(uint16_t) + (CONST_16 * sizeof(uint16_t));
esqHeadPerLoop = ubSize / splitFactor;
repeatMask = FP16_REPEAT_MASK;
esqHeadPerLoop = RoundDown(esqHeadPerLoop);
}
uint32_t esqUbHeadLoop = esqHeadNum / esqHeadPerLoop; // number of complete UB iterations over the heads
uint32_t esqHeadTail = esqHeadNum % esqHeadPerLoop; // head rows left over for the final UB iteration
uint32_t esqColLoop = esqColNum / repeatMask; // full repeat-mask passes per row
uint32_t esqColTail =
esqColNum % repeatMask; // leftover columns when colNum is not aligned to the 64/128 repeat mask
tilingData->esqFrontCore = esqFrontCore;
tilingData->esqTailCore = esqTailCore;
tilingData->esqFrontCoreBatch = esqFrontCoreBatch;
tilingData->esqTailCoreBatch = esqTailCoreBatch;
tilingData->esqHeadNum = esqHeadNum;
tilingData->esqColNum = esqColNum;
tilingData->esqUbHeadLoop = esqUbHeadLoop;
tilingData->esqHeadPerLoop = esqHeadPerLoop;
tilingData->esqHeadTail = esqHeadTail;
tilingData->esqColLoop = esqColLoop;
tilingData->esqColTail = esqColTail;
}
void MlaPreprocessTiling::SetMlapoWorkSpace()
{
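// All intermediate stages share one scratch stride: the largest per-token intermediate multiplied by
// the token count; s1..s5 are laid out back to back at that stride.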
uint64_t s1wsFactor =
static_cast<uint64_t>(opParam.cacheMode == 2 ? std::max(HIDDEN_STRATE * sizeof(int8_t),
opParam.headNum * AXES_ALIGN_SIZE * sizeof(uint16_t))
: HIDDEN_STRATE * sizeof(int8_t));
uint64_t workSizeS1 = s1wsFactor;
uint64_t workSizeS2 = opParam.headNum * HIDDEN_STRATE_ROPE * sizeof(uint16_t);
uint64_t workSizeS3 = HIDDEN_STRATE_MM * sizeof(uint16_t);
uint64_t workSizeS4 = std::max(opParam.headNum * HIDDEN_STRATE_ROPE, HIDDEN_STRATE_MM) * sizeof(uint32_t);
uint64_t maxWorkspaceSize = workSizeS1;
maxWorkspaceSize = std::max(maxWorkspaceSize, workSizeS2);
maxWorkspaceSize = std::max(maxWorkspaceSize, workSizeS3);
maxWorkspaceSize = std::max(maxWorkspaceSize, workSizeS4);
maxWorkspaceSize *= static_cast<uint64_t>(opParam.N);
uint64_t pertokenWorkspace = static_cast<uint64_t>(opParam.N) * sizeof(float) * 2;
uint64_t userWorkspaceSize;
if (opParam.inDtype == at::kBFloat16 || opParam.quantMode == QuantMode::PER_TOKEN_SYMM_QUANT) {
userWorkspaceSize = 4 * maxWorkspaceSize + pertokenWorkspace;
} else {
userWorkspaceSize = 3 * maxWorkspaceSize;
}
tilingData->userWorkspaceSize = userWorkspaceSize;
tilingData->s1Offset = 0;
tilingData->s2Offset = tilingData->s1Offset + maxWorkspaceSize;
tilingData->s3Offset = tilingData->s2Offset + maxWorkspaceSize;
tilingData->s4Offset = tilingData->s3Offset + maxWorkspaceSize;
tilingData->s5Offset = tilingData->s4Offset + maxWorkspaceSize;
}
void MlaPreprocessTiling::SetTilingKey()
{
uint64_t tilingKey = (static_cast<uint64_t>(opParam.inDtype == at::kBFloat16)) << 8;
tilingKey |= static_cast<uint64_t>(opParam.cacheMode);
tilingKey |= (static_cast<uint64_t>(opParam.quantMode) << 3);
tilingData->tilingKey = tilingKey;
}
void MlaPreprocessTiling::Init()
{
tilingData->numCore = platformInfo.coreNumAic;
tilingData->n = opParam.N;
bool deqOnTheFly = false;
if (opParam.inDtype == at::kBFloat16 || opParam.quantMode == QuantMode::PER_TOKEN_SYMM_QUANT) {
deqOnTheFly = true;
}
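// Tile the three matmul stages: mm1 (N x 7168 -> 2112), mm2 (N x 1536 -> headNum x 192), mm3 (per-head N x 128 -> 512).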
PpMatmulTilingApi mm1TilingApi(platformInfo,
1, // numBatch
opParam.N, // m
HIDDEN_STRATE, // k
HIDDEN_STRATE_MM, // n
false, // transA
true, // transB
true, // enDequant
deqOnTheFly); // in bf16.cce?
mm1TilingApi.GetTilingData(tilingData->mm1);
PpMatmulTilingApi mm2TilingApi(platformInfo,
1, // numBatch
opParam.N, // m
HIDDEN_STRATE_RMS, // k
opParam.headNum * HIDDEN_STRATE_ROPE, // n
false, // transA
true, // transB
true, // enDequant
deqOnTheFly); // in bf16.cce?
mm2TilingApi.GetTilingData(tilingData->mm2);
PpMatmulTilingApi mm3TilingApi(platformInfo,
opParam.headNum, // numBatch
opParam.N, // m
CONST_128, // k
CONCAT_SIZE, // n
false, // transA
false, // transB
false, // enDequant
deqOnTheFly); // in bf16.cce?
mm3TilingApi.GetTilingData(tilingData->mm3);
RmsNormQuantTiling();
RopeConcatTiling();
EinSumQuantTiling();
SetMlapoWorkSpace();
SetTilingKey();
return;
}
std::unordered_map<c10::string_view, uint16_t> cache_mode_map = {
{"krope_ctkv", 1}, {"int8_nzcache", 2}, {"nzcache", 3}};
std::unordered_map<c10::string_view, uint16_t> quant_mode_map = {
{"per_tensor_quant_asymm", 0},
{"per_token_quant_symm", 1},
};
template <typename MapType>
inline int get_op_mode(const MapType &mode_map, c10::optional<c10::string_view> mode_opt, c10::string_view default_mode,
const char *mode_name)
{
c10::string_view mode_str = mode_opt.value_or(default_mode);
auto it = mode_map.find(mode_str);
TORCH_CHECK(it != mode_map.end(), "Unsupported ", mode_name, " value: '", mode_str, "'");
return it->second;
}
// std::tuple<at::Tensor &, at::Tensor &, at::Tensor &, at::Tensor &> mla_preprocess(
// const at::Tensor &hiddenState, const at::Tensor &gamma0, const at::Tensor &beta0, const at::Tensor &wdqkv,
// const at::Tensor &descale0, const at::Tensor &gamma1, const at::Tensor &beta1, const at::Tensor &wuq,
// const at::Tensor &descale1, const at::Tensor &gamma2, const at::Tensor &cos, const at::Tensor &sin,
// const at::Tensor &wuk, const at::Tensor &kv_cache, const at::Tensor &kv_cache_rope, const at::Tensor &slotmapping,
// const at::Tensor &quant_scale0, const at::Tensor &quant_offset0, const at::Tensor &bias0,
// const at::Tensor &quant_scale1, const at::Tensor &quant_offset1, const at::Tensor &bias1,
// const c10::optional<at::Tensor> &ctkv_scale, const c10::optional<at::Tensor> &q_nope_scale,
// c10::optional<c10::string_view> cache_mode, c10::optional<c10::string_view> quant_mode, at::Tensor &q_out0,
// at::Tensor &kv_cache_out0, at::Tensor &q_out1, at::Tensor &kv_cache_out1)
std::tuple<at::Tensor, at::Tensor, uint32_t> mla_preprocess_tiling(
const at::Tensor &hiddenState,
const at::Tensor &wuk,
c10::optional<c10::string_view> cache_mode,
c10::optional<c10::string_view> quant_mode
)
{
auto cacheMode = get_op_mode(cache_mode_map, cache_mode, "krope_ctkv", "cache_mode");
auto quantMode = get_op_mode(quant_mode_map, quant_mode, "per_token_quant_symm", "quant_mode");
platform_ascendc::PlatformAscendC *platformAscendC = platform_ascendc::PlatformAscendCManager::GetInstance();
struct PlatformInfo platformInfo;
platformInfo.coreNum = platformAscendC->GetCoreNum();
platformInfo.coreNumAic = platformAscendC->GetCoreNumAic();
platformInfo.coreNumAiv = platformAscendC->GetCoreNumAiv();
platformAscendC->GetCoreMemSize(platform_ascendc::CoreMemType::UB, platformInfo.ubSize);
platformAscendC->GetCoreMemSize(platform_ascendc::CoreMemType::L1, platformInfo.l1Size);
platformAscendC->GetCoreMemSize(platform_ascendc::CoreMemType::L2, platformInfo.l2Size);
platformAscendC->GetCoreMemSize(platform_ascendc::CoreMemType::L0_A, platformInfo.l0aSize);
platformAscendC->GetCoreMemSize(platform_ascendc::CoreMemType::L0_B, platformInfo.l0bSize);
platformAscendC->GetCoreMemSize(platform_ascendc::CoreMemType::L0_C, platformInfo.l0cSize);
int32_t N = hiddenState.sizes()[0];
int32_t headNum = wuk.sizes()[0];
OpParam opParam;
opParam.N = N;
opParam.headNum = headNum;
opParam.cacheMode = static_cast<int32_t>(cacheMode);
opParam.quantMode = static_cast<QuantMode>(quantMode);
opParam.inDtype = hiddenState.options().dtype();
MlaTilingData tilingData;
MlaPreprocessTiling mlaTiling(platformInfo, opParam, &tilingData);
mlaTiling.Init();
uint32_t blockDim = platformInfo.coreNumAic;
// workspace
uint64_t system_workspace_size = static_cast<uint64_t>(platformAscendC->GetLibApiWorkSpaceSize());
uint64_t workspace_size = system_workspace_size + tilingData.userWorkspaceSize;
auto options = at::TensorOptions().dtype(at::kByte).device(hiddenState.options().device());
auto workspace_tensor = at::empty({static_cast<int64_t>(workspace_size)}, options);
// tiling
int32_t bIndex = N - 1;
uint32_t tilingSize = sizeof(MlaTilingData);
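// Tiling blobs are cached on device for every batch size up to MAX_SUPPORT_TOKEN_NUMS and indexed by (N - 1),
// so repeated calls with the same N overwrite and reuse the same slot.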
static auto global_tiling_data =
at::empty({tilingSize * MAX_SUPPORT_TOKEN_NUMS},
at::TensorOptions().dtype(at::kByte).device(hiddenState.options().device()));
if (bIndex >= 0 && bIndex < MAX_SUPPORT_TOKEN_NUMS) {
aclrtMemcpy(global_tiling_data.data_ptr<uint8_t>() + (tilingSize * bIndex), tilingSize, &tilingData, tilingSize,
ACL_MEMCPY_HOST_TO_DEVICE);
} else {
// Handle the case where bIndex is out of range
TORCH_CHECK(false, "bIndex is out of range: ", bIndex);
}
at::Tensor tiling = at::from_blob(
global_tiling_data.data_ptr<uint8_t>() + (tilingSize * bIndex),
tilingSize,
at::kByte);
return std::make_tuple(workspace_tensor, tiling, blockDim);
}
} // namespace mlapo
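A hypothetical caller of mla_preprocess_tiling would consume the returned tuple roughly as follows; the tensor shapes in the comments are inferred from the tiling constants above, and the kernel launch that uses these results lives outside this file.
// Illustrative only: derive workspace, tiling and launch grid for a batch before dispatching the kernel.
// `hidden_states` ([N, 7168]) and `wuk` ([headNum, 128, 512]) are placeholder tensors supplied by the caller.
void prepare_mla_preprocess(const at::Tensor &hidden_states, const at::Tensor &wuk)
{
    auto result = mlapo::mla_preprocess_tiling(hidden_states, wuk,
                                               c10::string_view("krope_ctkv"),
                                               c10::string_view("per_token_quant_symm"));
    at::Tensor workspace = std::get<0>(result); // byte tensor sized for system + user scratch
    at::Tensor tiling = std::get<1>(result);    // view of the cached MlaTilingData blob for this N
    uint32_t blockDim = std::get<2>(result);    // number of cube cores to launch with
    (void)workspace; (void)tiling; (void)blockDim; // handed to the kernel launch, which is outside this file
}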

@ -0,0 +1,95 @@
// Adapted from
// https://gitee.com/ascend/ascend-transformer-boost
//
// Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved.
// This file is a part of the CANN Open Software.
// Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
// Please refer to the License for details. You may not use this file except in compliance with the License.
// THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
// INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
// See LICENSE in the root of the software repository for the full text of the License.
//
#ifndef MLAPREPROCESS_TILING_H
#define MLAPREPROCESS_TILING_H
#include <cstdint>
struct PpMatmulTilingData {
uint32_t numBatch{0};
uint32_t m{0};
uint32_t k{0};
uint32_t n{0};
uint32_t m0{0};
uint32_t k0{0};
uint32_t n0{0};
uint32_t mLoop{0};
uint32_t kLoop{0};
uint32_t nLoop{0};
uint32_t coreLoop{0};
uint32_t swizzleCount{0};
uint32_t swizzleDirect{0};
uint32_t enShuffleK{0};
uint32_t blockDim{0};
uint32_t enLoadAllAmat{0};
uint32_t b0matPingPongBufferLen{0};
};
struct MlaTilingData {
uint32_t tilingKey{0};
uint64_t userWorkspaceSize{0};
uint64_t s1Offset{0};
uint64_t s2Offset{0};
uint64_t s3Offset{0};
uint64_t s4Offset{0};
uint64_t s5Offset{0};
uint32_t numCore{0};
uint32_t n{0};
uint32_t perTaskNum{0};
uint32_t resTaskNum{0};
PpMatmulTilingData mm1;
PpMatmulTilingData mm2;
PpMatmulTilingData mm3;
// rms1
uint32_t rmsNumCore1{0};
uint32_t rmsNumCol1{0};
uint32_t rmsNumRow1{0};
uint32_t rmsQuantMin1{0};
// rms2
uint32_t rmsNumCore2{0};
uint32_t rmsNumCol2{0};
uint32_t rmsNumRow2{0};
uint32_t rmsQuantMin2{0};
uint32_t hiddenSizeQ{0};
uint32_t headNumQ{0};
uint32_t headDim{0};
uint32_t concatSize{0};
uint32_t rotaryCoeff{0};
uint32_t ntokens{0};
uint32_t realCore{0};
uint32_t nlCoreRun{0};
uint32_t lCoreRun{0};
uint32_t maxNPerLoopForUb{0};
uint32_t preCoreLoopTime{0};
uint32_t preCoreLoopNLast{0};
uint32_t lastCoreLoopTime{0};
uint32_t lastCoreLoopNLast{0};
// EinSumQuant
uint32_t esqFrontCore{0};
uint32_t esqTailCore{0};
uint32_t esqFrontCoreBatch{0};
uint32_t esqTailCoreBatch{0};
uint32_t esqHeadNum{0};
uint32_t esqColNum{0};
uint32_t esqUbHeadLoop{0};
uint32_t esqHeadPerLoop{0};
uint32_t esqHeadTail{0};
uint32_t esqColLoop{0};
uint32_t esqColTail{0};
};
#endif // MLAPREPROCESS_TILING_H

@ -0,0 +1,25 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#ifndef INCLUDE_COMMON_H
#define INCLUDE_COMMON_H
#define CONST_2 2
#define SET_FLAG(trigger, waiter, e) AscendC::SetFlag<AscendC::HardEvent::trigger##_##waiter>((e))
#define WAIT_FLAG(trigger, waiter, e) AscendC::WaitFlag<AscendC::HardEvent::trigger##_##waiter>((e))
#define PIPE_BARRIER(pipe) AscendC::PipeBarrier<PIPE_##pipe>()
#ifndef __force_inline__
#define __force_inline__ inline __attribute__((always_inline))
#endif
#endif

@ -0,0 +1,121 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#ifndef INCLUDE_COMMON_FUNC_H
#define INCLUDE_COMMON_FUNC_H
#include <limits>
#include <type_traits>
#ifdef __CCE_KT_TEST__
#include "stub_def.h"
#include "stub_fun.h"
#else
#include "kernel_macros.h"
#endif
template <uint32_t ALIGN, typename T = uint32_t>
inline __aicore__ T RoundUp(const T val)
{
static_assert(ALIGN != 0, "align must not be zero");
static_assert(std::is_arithmetic<T>::value, "T must be an arithmetic type");
T align = ALIGN;
if (val + align - 1 < val) {
return val;
}
return (val + align - 1) / align * align;
}
template <typename T>
inline __aicore__ T RoundUp(const T val, const T align)
{
static_assert(std::is_arithmetic<T>::value, "T must be an arithmetic type");
if (align == 0 || val + align - 1 < val) {
return val;
}
return (val + align - 1) / align * align;
}
template <uint32_t DIVISOR, typename T = uint32_t>
inline __aicore__ T CeilDiv(const T dividend)
{
static_assert(DIVISOR != 0, "align must not be zero");
static_assert(std::is_arithmetic<T>::value, "T must be an arithmetic type");
T divisor = DIVISOR;
if (dividend + divisor - 1 < dividend) {
return dividend;
}
return (dividend + divisor - 1) / divisor;
}
template <typename T>
constexpr T T_MAX = std::numeric_limits<T>::max();
template <typename T>
inline __aicore__ T CeilDiv(const T dividend, const T divisor)
{
static_assert(std::is_arithmetic<T>::value, "T must be an arithmetic type");
if (divisor == 0 || dividend + divisor - 1 < dividend) {
return T_MAX<T>;
}
return (dividend + divisor - 1) / divisor;
}
template <typename T>
__aicore__ inline T Min(const T lhs, const T rhs)
{
return lhs < rhs ? lhs : rhs;
}
template <typename Dtype>
__aicore__ __attribute__((always_inline)) inline uint32_t BlockSize()
{
return 32 / sizeof(Dtype);
}
template <typename Dtype>
__aicore__ __attribute__((always_inline)) inline uint32_t MatrixSize()
{
return 512 / sizeof(Dtype);
}
template <typename Dtype>
__aicore__ __attribute__((always_inline)) inline uint64_t BlockSizeRoundUp(uint64_t num)
{
return (num + BlockSize<Dtype>() - 1) / BlockSize<Dtype>() * BlockSize<Dtype>();
}
template <typename Dtype>
__aicore__ __attribute__((always_inline)) inline uint64_t NumBlocksRoundUp(uint64_t num)
{
return (num + BlockSize<Dtype>() - 1) / BlockSize<Dtype>();
}
template <typename Dtype>
__aicore__ __attribute__((always_inline)) inline uint64_t MatrixSizeRoundUp(uint64_t num)
{
return (num + MatrixSize<Dtype>() - 1) / MatrixSize<Dtype>() * MatrixSize<Dtype>();
}
template <typename Dtype>
__aicore__ __attribute__((always_inline)) inline uint64_t NumMatrixsRoundUp(uint64_t num)
{
return (num + MatrixSize<Dtype>() - 1) / MatrixSize<Dtype>();
}
template <typename Dtype>
__aicore__ __attribute__((always_inline)) inline uint64_t L0HalfSize()
{
return 32 * 1024 / sizeof(Dtype);
}
#endif

@ -0,0 +1,36 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#ifndef INCLUDE_HARDWARE_H
#define INCLUDE_HARDWARE_H
enum class ArchType { ASCEND_V220, ASCEND_V200, ASCEND_M200 };
template <ArchType ArchTag>
struct HardwareInfo {
static uint32_t const l2BW = 5;
static uint32_t const hbmBW = 1;
static uint32_t const supportMix = 0;
static uint32_t const l1Size = 512 * 1024;
static uint32_t const l0ASize = 64 * 1024;
static uint32_t const l0BSize = 64 * 1024;
static uint32_t const l0CSize = 128 * 1024;
static uint32_t const l2Size = 192 * 1024 * 1024;
static uint32_t const biasSize = 1024;
static uint32_t const fixBufSize = 7 * 1024;
static uint32_t const ubSize = 192 * 1024;
static uint32_t const fractalSize = 512;
static uint32_t const l1l0BlockSize = 32;
static uint32_t const btBlockSize = 64;
static uint32_t const fbBlockSize = 128;
};
#endif

@ -0,0 +1,92 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#ifndef INCLUDE_ITERATOR_H
#define INCLUDE_ITERATOR_H
#include "common_func.h"
#include "hardware.h"
#include "kernel_operator.h"
#include "layout.h"
#include "mem.h"
/////////////////////////////////////////////////////
// gm_to_l1
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DataType, DataFormat FormatInGM, DataFormat FormatInL1>
struct gm_to_l1 {
__aicore__ gm_to_l1(AscendC::LocalTensor<DataType> l1Tensor, AscendC::GlobalTensor<DataType> gmTensor,
uint32_t nTileActual, uint32_t nTileCeil, uint32_t nVal, uint32_t dTileActual,
uint32_t dTileCeil, uint32_t dVal) {};
};
/////////////////////////////////////////////////////
// l1_to_l0_a
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DataType, bool IsTransPose, DataFormat DFmtIn, DataFormat DFmtOut>
struct l1_to_l0_a {
__aicore__ l1_to_l0_a(AscendC::LocalTensor<DataType> l0Tensor, AscendC::LocalTensor<DataType> l1Tensor,
uint32_t mTileCeil, uint32_t kPartCeil, uint32_t mSrcStride, uint32_t kSrcStride,
uint32_t mDstStride, uint32_t kDstStride) {};
};
/////////////////////////////////////////////////////
// l1_to_l0_b
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DataType, bool IsTransPose, DataFormat DFmtIn, DataFormat DFmtOut>
struct l1_to_l0_b {
__aicore__ l1_to_l0_b(AscendC::LocalTensor<DataType> l0Tensor, AscendC::LocalTensor<DataType> l1Tensor,
uint32_t nTileCeil, uint32_t kPartCeil, uint32_t nSrcStride, uint32_t kSrcStride,
uint32_t nDstStride, uint32_t kDstStride) {};
};
/////////////////////////////////////////////////////
// l0c_to_gm
/////////////////////////////////////////////////////
template <ArchType ArchTag, DataFormat OutFormatType, typename OutDataType, typename L0CDataType>
struct l0c_to_gm {
__aicore__ l0c_to_gm(AscendC::GlobalTensor<OutDataType> gmTensor, AscendC::LocalTensor<L0CDataType> l0cTensor,
uint32_t mTileActual, uint32_t nTileActual, uint32_t mTileCeil, uint32_t nActual,
uint8_t unitFlag = 0) {};
};
/////////////////////////////////////////////////////
// l0c_to_l1
/////////////////////////////////////////////////////
template <ArchType ArchTag, DataFormat LayoutOut, typename ElementOut, typename ElementIn>
struct l0c_to_l1 {
__aicore__ l0c_to_l1(AscendC::LocalTensor<ElementOut> l1Tensor, AscendC::LocalTensor<ElementIn> l0cTensor,
AscendC::LocalTensor<uint64_t> deqTensor, uint32_t mTileActual, uint32_t nTileActual,
uint32_t mTileCeil, uint32_t nActual) {};
};
template <ArchType ArchTag, typename DataType>
struct l1_to_bt {
__aicore__ l1_to_bt(uint64_t dst, const AscendC::LocalTensor<DataType> &src, uint16_t convControl, uint16_t nBurst,
uint16_t lenBurst, uint16_t srcGap, uint16_t dstGap) {};
};
template <ArchType ArchTag, typename DataType>
struct l1_to_fb {
__aicore__ l1_to_fb(AscendC::LocalTensor<DataType> &dst, AscendC::LocalTensor<DataType> &src, uint16_t burstNum,
uint16_t burstLen, uint16_t srcGap, uint16_t dstGap) {};
};
#include "iterators/gm_to_l1_iterator.inc"
#include "iterators/gm_to_ub_iterator.inc"
#include "iterators/l0c_to_gm_iterator.inc"
#include "iterators/l0c_to_l1_iterator.inc"
#include "iterators/l0c_to_ub_iterator.inc"
#include "iterators/l1_to_bt_iterator.inc"
#include "iterators/l1_to_fb_iterator.inc"
#include "iterators/l1_to_l0_iterator.inc"
#include "iterators/l1_to_ub_iterator.inc"
#endif

@ -0,0 +1,162 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "../iterator.h"
// Partial specialization for V220, ND_in, ND_out
template <ArchType ArchTag, typename DataType>
struct gm_to_l1<ArchTag, DataType, DataFormat::ND, DataFormat::ND> {
using HardwareParams = HardwareInfo<ArchTag>;
static constexpr uint32_t BLOCK_SIZE = HardwareParams::l1l0BlockSize / sizeof(DataType);
__aicore__ gm_to_l1(AscendC::LocalTensor<DataType> l1Tensor,
AscendC::GlobalTensor<DataType> gmTensor,
uint32_t nTileActual,
uint32_t nTileCeil,
uint32_t nVal,
uint32_t dTileActual,
uint32_t dTileCeil,
uint32_t dVal)
{
AscendC::DataCopy(l1Tensor, // dst
gmTensor, // src
AscendC::DataCopyParams(1, // nBurst
CeilDiv<BLOCK_SIZE>(nTileActual * dTileActual), // lenBurst
0, // srcGap
0)); // dstGap
};
};
// Partial specialization for NZ_in, NZ_out
template <ArchType ArchTag, typename DataType>
struct gm_to_l1<ArchTag, DataType, DataFormat::NZ, DataFormat::NZ> {
using HardwareParams = HardwareInfo<ArchTag>;
static constexpr uint32_t BLOCK_SIZE = HardwareParams::l1l0BlockSize / sizeof(DataType);
static constexpr uint32_t STRIDE_LIMIT = 65536;
__aicore__ gm_to_l1(AscendC::LocalTensor<DataType> l1Tensor,
AscendC::GlobalTensor<DataType> gmTensor,
uint32_t nTileActual,
uint32_t nTileCeil,
uint32_t nVal,
uint32_t dTileActual,
uint32_t dTileCeil,
uint32_t dVal)
{
uint64_t srcStride = nVal - nTileCeil;
if (srcStride < STRIDE_LIMIT) {
AscendC::DataCopy(l1Tensor, // dst
gmTensor, // src
AscendC::DataCopyParams(dTileCeil / BLOCK_SIZE, // nBurst
nTileCeil, // lenBurst
srcStride, // srcGap
0)); // dstGap
} else {
for (uint64_t i = 0; i < dTileCeil / BLOCK_SIZE; i++) {
uint64_t dstOffset = i * nTileCeil * BLOCK_SIZE;
uint64_t srcOffset = i * nVal * BLOCK_SIZE;
AscendC::DataCopy(l1Tensor[dstOffset], // dst
gmTensor[srcOffset], // src
AscendC::DataCopyParams(1, // nBurst
nTileCeil, // lenBurst
0, // srcGap
0)); // dstGap
}
}
};
};
// Partial specialization for ND_in, NZ_out
template <ArchType ArchTag, typename DataType>
struct gm_to_l1<ArchTag, DataType, DataFormat::ND, DataFormat::NZ> {
using HardwareParams = HardwareInfo<ArchTag>;
static constexpr uint32_t BLOCK_SIZE = HardwareParams::l1l0BlockSize / sizeof(DataType);
static constexpr uint32_t STRIDE_LIMIT = 65536;
__aicore__ gm_to_l1(AscendC::LocalTensor<DataType> l1Tensor,
AscendC::GlobalTensor<DataType> gmTensor,
uint32_t nTileActual,
uint32_t nTileCeil,
uint32_t nVal,
uint32_t dTileActual,
uint32_t dTileCeil,
uint32_t dVal)
{
if (dVal < STRIDE_LIMIT) {
AscendC::DataCopy(l1Tensor,
gmTensor,
AscendC::Nd2NzParams(1, // ndNum
nTileActual, // nValue
dTileActual, // dValue
0, // srcNdMatrixStride, unused
dVal, // srcDValue
nTileCeil, // dstNzC0Stride
1, // dstNzNStride
0)); // dstNzMatrixStride, unused
} else {
for (uint32_t i = 0; i < nTileActual; i++) {
AscendC::DataCopy(l1Tensor[i * BLOCK_SIZE],
gmTensor[i * dVal],
AscendC::Nd2NzParams(1, // ndNum
1, // nValue
dTileActual, // dValue
0, // srcNdMatrixStride, unused
0, // srcDValue
nTileCeil, // dstNzC0Stride
0, // dstNzNStride
0)); // dstNzMatrixStride, unused
}
}
};
};
// Partial specialization for ND_in, ZN_out
template <ArchType ArchTag, typename DataType>
struct gm_to_l1<ArchTag, DataType, DataFormat::ND, DataFormat::ZN> {
using HardwareParams = HardwareInfo<ArchTag>;
static constexpr uint32_t BLOCK_SIZE = HardwareParams::l1l0BlockSize / sizeof(DataType);
static constexpr uint32_t STRIDE_LIMIT = 65536;
__aicore__ gm_to_l1(AscendC::LocalTensor<DataType> l1Tensor,
AscendC::GlobalTensor<DataType> gmTensor,
uint32_t nTileActual,
uint32_t nTileCeil,
uint32_t nVal,
uint32_t dTileActual,
uint32_t dTileCeil,
uint32_t dVal)
{
if (dVal < STRIDE_LIMIT) {
AscendC::DataCopy(l1Tensor,
gmTensor,
AscendC::Nd2NzParams(1, // ndNum
nTileActual, // nValue
dTileActual, // dValue
0, // srcNdMatrixStride, unused
dVal, // srcDValue
nTileCeil, // dstNzC0Stride
1, // dstNzNStride
0)); // dstNzMatrixStride, unused
} else {
for (uint32_t i = 0; i < nTileActual; ++i) {
AscendC::DataCopy(l1Tensor[i * BLOCK_SIZE],
gmTensor[i * dVal],
AscendC::Nd2NzParams(1, // ndNum
1, // nValue
dTileActual, // dValue
0, // srcNdMatrixStride, unused
0, // srcDValue
nTileCeil, // dstNzC0Stride
0, // dstNzNStride
0)); // dstNzMatrixStride, unused
}
}
};
};
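
Not part of the original file: a minimal usage sketch of the ND-to-NZ path above. It assumes the caller has already bound an L1 `LocalTensor` and a GM `GlobalTensor` and computed the tile geometry; parameter names mirror the iterator's own, and the wrapper name is hypothetical.
```
// Hedged sketch: load a row-major (ND) GM tile into L1 in NZ fractal layout via the
// ND -> NZ specialization above. The iterator's constructor issues the Nd2Nz DataCopy
// as a side effect; all geometry values are assumed to be precomputed by the caller.
template <typename DType>
__aicore__ inline void LoadNdTileToL1Nz(AscendC::LocalTensor<DType> l1Tile,
                                        AscendC::GlobalTensor<DType> gmSrc,
                                        uint32_t nTileActual, uint32_t nTileCeil, uint32_t nVal,
                                        uint32_t dTileActual, uint32_t dTileCeil, uint32_t dVal)
{
    gm_to_l1<ArchType::ASCEND_V220, DType, DataFormat::ND, DataFormat::NZ>(
        l1Tile, gmSrc, nTileActual, nTileCeil, nVal, dTileActual, dTileCeil, dVal);
}
```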


@ -0,0 +1,89 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "../iterator.h"
template <ArchType ArchTag, typename DType> struct gm_to_ub {
__aicore__ inline gm_to_ub(AscendC::LocalTensor<DType> dstTensor, AscendC::GlobalTensor<DType> srcTensor,
uint8_t sid, uint16_t nBurst, uint16_t lenBurst, uint16_t srcStride, uint16_t dstStride)
{
AscendC::DataCopy(dstTensor, srcTensor, AscendC::DataCopyParams(nBurst, lenBurst, srcStride, dstStride));
};
};
template <ArchType ArchTag, typename DType> struct gm_to_ub_align {
__aicore__ inline gm_to_ub_align(AscendC::LocalTensor<DType> dstTensor, AscendC::GlobalTensor<DType> srcTensor,
uint8_t sid, uint16_t nBurst, uint32_t lenBurst, uint8_t leftPaddingNum,
uint8_t rightPaddingNum, uint32_t srcGap, uint32_t dstGap)
{
AscendC::DataCopyPad(dstTensor, srcTensor, AscendC::DataCopyExtParams(nBurst, lenBurst, srcGap, dstGap, 0),
AscendC::DataCopyPadExtParams<DType>(false, leftPaddingNum, rightPaddingNum, 0));
};
};
template <ArchType ArchTag, typename DType> struct ub_to_ub {
__aicore__ inline ub_to_ub(AscendC::LocalTensor<DType> dstTensor, AscendC::LocalTensor<DType> srcTensor,
uint8_t sid, uint16_t nBurst, uint16_t lenBurst, uint16_t srcStride, uint16_t dstStride)
{
AscendC::DataCopy(dstTensor, srcTensor, AscendC::DataCopyParams(nBurst, lenBurst, srcStride, dstStride));
};
};
template <ArchType ArchTag, typename DataType, DataFormat InDataFormat = DataFormat::ND,
DataFormat OutDataFormat = DataFormat::ND>
struct ub_to_gm {
__aicore__ inline ub_to_gm(AscendC::GlobalTensor<DataType> dstTensor, AscendC::LocalTensor<DataType> srcTensor,
uint8_t sid, uint16_t nBurst, uint16_t lenBurst, uint16_t srcStride, uint16_t dstStride)
{
AscendC::DataCopy(dstTensor, srcTensor, AscendC::DataCopyParams(nBurst, lenBurst, srcStride, dstStride));
};
};
template <ArchType ArchTag, typename DataType> struct ub_to_gm<ArchTag, DataType, DataFormat::NZ, DataFormat::NZ> {
using HardwareParams = HardwareInfo<ArchTag>;
static constexpr uint32_t BLOCK_SIZE = HardwareParams::l1l0BlockSize / sizeof(DataType);
__aicore__ ub_to_gm(AscendC::GlobalTensor<DataType> gmTensor, AscendC::LocalTensor<DataType> ubTensor,
uint32_t nTileActual, uint32_t nTileCeil, uint32_t nVal, uint32_t dTileActual,
uint32_t dTileCeil, uint32_t dVal)
{
constexpr uint32_t STRIDE_LIMIT = 65536;
uint64_t dstStride = nVal - nTileCeil;
if (dstStride < STRIDE_LIMIT) {
AscendC::DataCopy(gmTensor, // dst
ubTensor, // src
AscendC::DataCopyParams(dTileCeil / BLOCK_SIZE, // nBurst
nTileCeil, // lenBurst
0, // srcGap
dstStride)); // dstGap
} else {
for (uint64_t i = 0; i < dTileCeil / BLOCK_SIZE; ++i) {
uint64_t dstOffset = i * nVal * BLOCK_SIZE;
uint64_t srcOffset = i * nTileCeil * BLOCK_SIZE;
AscendC::DataCopy(gmTensor[dstOffset], // dst
ubTensor[srcOffset], // src
AscendC::DataCopyParams(1, // nBurst
nTileCeil, // lenBurst
0, // srcGap
0)); // dstGap
}
}
};
};
template <ArchType ArchTag, typename DType> struct ub_to_gm_align {
__aicore__ inline ub_to_gm_align(AscendC::GlobalTensor<DType> dstTensor, AscendC::LocalTensor<DType> srcTensor,
uint8_t sid, uint16_t nBurst, uint32_t lenBurst, uint8_t leftPaddingNum,
uint8_t rightPaddingNum, uint32_t srcGap, uint32_t dstGap)
{
AscendC::DataCopyPad(dstTensor, srcTensor, AscendC::DataCopyExtParams(nBurst, lenBurst, srcGap, dstGap, 0));
};
};
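
Not part of the original file: a hedged sketch of the padded copy path above, useful when row lengths are not 32-byte aligned. The wrapper name and the zero gaps/padding values are illustrative assumptions.
```
// Hedged sketch: copy `rows` rows of `bytesPerRow` bytes each from GM into UB using the
// DataCopyPad-based gm_to_ub_align iterator above (sid = 0, no padding elements,
// contiguous source and destination).
template <typename DType>
__aicore__ inline void LoadUnalignedRowsToUb(AscendC::LocalTensor<DType> ubDst,
                                             AscendC::GlobalTensor<DType> gmSrc,
                                             uint16_t rows, uint32_t bytesPerRow)
{
    gm_to_ub_align<ArchType::ASCEND_V220, DType>(ubDst, gmSrc, /*sid=*/0, rows, bytesPerRow,
                                                 /*leftPaddingNum=*/0, /*rightPaddingNum=*/0,
                                                 /*srcGap=*/0, /*dstGap=*/0);
}
```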


@ -0,0 +1,228 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "../iterator.h"
constexpr uint32_t BLOCK_NUM = 16;
constexpr uint32_t BLOCK_SIZE_INT8 = 32;
template <>
struct l0c_to_gm<ArchType::ASCEND_V220, DataFormat::ND, half, float> {
/**
* @brief Copy data from the L0C buffer to global memory, partially specialized for ND output with a half destination and a float source.
*
* @param gmTensor the destination tensor on global memory, which is stored in ND format.
* @param l0cTensor the source tensor on L0C buffer, which is stored in FRACTAL_NZ format.
* @param mTileActual the m-direction size of the matrix in L0C buffer.
* @param nTileActual the n-direction size of the matrix in L0C buffer.
* @param srcStride the source stride between adjacent fractal matrices along the n-direction, in units of C0_SIZE.
* @param dstStride the leading dimension of the destination matrix, in units of elements.
*/
__aicore__ l0c_to_gm(AscendC::GlobalTensor<half> gmTensor,
AscendC::LocalTensor<float> l0cTensor,
uint32_t mTileActual,
uint32_t nTileActual,
uint32_t srcStride,
uint32_t dstStride,
uint8_t unitFlag = 0)
{
#ifdef __DAV_C220_CUBE__
auto intriParams = AscendC::FixpipeParamsV220(nTileActual, // nSize
mTileActual, // mSize
srcStride, // srcStride
dstStride, // dstStride
false); // enRelu
intriParams.quantPre = QuantMode_t::F322F16;
intriParams.unitFlag = unitFlag;
AscendC::Fixpipe<half, float, AscendC::CFG_ROW_MAJOR>(gmTensor, l0cTensor, intriParams);
#else
AscendC::FixpipeParams<float> intriParams(
(nTileActual + BLOCK_NUM - 1) / AscendC::BLOCK_CUBE,
static_cast<uint16_t>(mTileActual * BLOCK_NUM * sizeof(float) / BLOCK_SIZE_INT8),
0,
dstStride);
intriParams.nz2ndParams = {true, 1, 0, 0, static_cast<uint16_t>(nTileActual)};
intriParams.quantParams = {QuantMode_t::F322F16};
AscendC::Fixpipe(gmTensor, l0cTensor, intriParams);
#endif
};
};
template <>
struct l0c_to_gm<ArchType::ASCEND_V220, DataFormat::ND, half, int32_t> {
__aicore__ l0c_to_gm(AscendC::GlobalTensor<half> gmTensor,
AscendC::LocalTensor<int32_t> l0cTensor,
uint32_t mTileActual,
uint32_t nTileActual,
uint32_t srcStride,
uint32_t dstStride,
uint8_t unitFlag = 0)
{
#ifdef __DAV_C220_CUBE__
auto intriParams = AscendC::FixpipeParamsV220(nTileActual, // nSize
mTileActual, // mSize
srcStride, // srcStride
dstStride, // dstStride
false); // enRelu
intriParams.quantPre = QuantMode_t::VDEQF16;
intriParams.unitFlag = unitFlag;
AscendC::Fixpipe<half, int32_t, AscendC::CFG_ROW_MAJOR>(gmTensor, l0cTensor, intriParams);
#else
AscendC::FixpipeParams<int32_t> intriParams(
(nTileActual + BLOCK_NUM - 1) / AscendC::BLOCK_CUBE,
static_cast<uint16_t>(mTileActual * BLOCK_NUM * sizeof(float) / BLOCK_SIZE_INT8),
0,
dstStride);
intriParams.nz2ndParams = {true, 1, 0, 0, static_cast<uint16_t>(nTileActual)};
intriParams.quantParams = {QuantMode_t::VDEQF16};
AscendC::Fixpipe(gmTensor, l0cTensor, intriParams);
#endif
};
};
#ifdef __DAV_C220_CUBE__
template <>
struct l0c_to_gm<ArchType::ASCEND_V220, DataFormat::ND, __bf16, float> {
__aicore__ l0c_to_gm(AscendC::GlobalTensor<__bf16> gmTensor,
AscendC::LocalTensor<float> l0cTensor,
uint32_t mTileActual,
uint32_t nTileActual,
uint32_t srcStride,
uint32_t dstStride,
uint8_t unitFlag = 0)
{
#ifdef __DAV_C220_CUBE__
auto intriParams = AscendC::FixpipeParamsV220(nTileActual, // nSize
mTileActual, // mSize
srcStride, // srcStride
dstStride, // dstStride
false); // enRelu
intriParams.quantPre = QuantMode_t::F322BF16;
intriParams.unitFlag = unitFlag;
AscendC::Fixpipe<__bf16, float, AscendC::CFG_ROW_MAJOR>(gmTensor, l0cTensor, intriParams);
#else
AscendC::FixpipeParams<float> intriParams(
(nTileActual + BLOCK_NUM - 1) / AscendC::BLOCK_CUBE,
static_cast<uint16_t>(mTileActual * BLOCK_NUM * sizeof(float) / BLOCK_SIZE_INT8),
0,
dstStride);
intriParams.nz2ndParams = {true, 1, 0, 0, static_cast<uint16_t>(nTileActual)};
intriParams.quantParams = {QuantMode_t::F322BF16};
AscendC::Fixpipe(gmTensor, l0cTensor, intriParams);
#endif
};
};
#endif
// Partial specialization ND, float
template <>
struct l0c_to_gm<ArchType::ASCEND_V220, DataFormat::ND, float, float> {
__aicore__ l0c_to_gm(AscendC::GlobalTensor<float> gmTensor,
AscendC::LocalTensor<float> l0cTensor,
uint32_t mTileActual,
uint32_t nTileActual,
uint32_t srcStride,
uint32_t dstStride,
uint8_t unitFlag = 0)
{
#ifdef __DAV_C220_CUBE__
auto intriParams = AscendC::FixpipeParamsV220(nTileActual, // nSize
mTileActual, // mSize
srcStride, // srcStride
dstStride, // dstStride
false); // enRelu
intriParams.quantPre = QuantMode_t::NoQuant;
intriParams.unitFlag = unitFlag;
AscendC::Fixpipe<float, float, AscendC::CFG_ROW_MAJOR>(gmTensor, l0cTensor, intriParams);
#else
AscendC::FixpipeParams<float> intriParams(
(nTileActual + BLOCK_NUM - 1) / AscendC::BLOCK_CUBE,
static_cast<uint16_t>(mTileActual * BLOCK_NUM * sizeof(float) / BLOCK_SIZE_INT8),
0,
dstStride);
intriParams.nz2ndParams = {true, 1, 0, 0, static_cast<uint16_t>(nTileActual)};
intriParams.quantParams = {QuantMode_t::NoQuant};
AscendC::Fixpipe(gmTensor, l0cTensor, intriParams);
#endif
};
};
template <>
struct l0c_to_gm<ArchType::ASCEND_V220, DataFormat::NZ, half, float> {
__aicore__ l0c_to_gm(AscendC::GlobalTensor<half> gmTensor,
AscendC::LocalTensor<float> l0cTensor,
uint32_t mTileActual,
uint32_t nTileActual,
uint32_t srcStride,
uint32_t dstStride,
uint8_t unitFlag = 0)
{
#ifdef __DAV_C220_CUBE__
auto intriParams = AscendC::FixpipeParamsV220(nTileActual, // nSize
mTileActual, // mSize
srcStride, // srcStride
dstStride, // dstStride
false); // enRelu
intriParams.quantPre = QuantMode_t::F322F16;
intriParams.unitFlag = unitFlag;
AscendC::Fixpipe<half, float, AscendC::CFG_NZ>(gmTensor, l0cTensor, intriParams);
#else
AscendC::FixpipeParams<float> intriParams(
(nTileActual + BLOCK_NUM - 1) / AscendC::BLOCK_CUBE,
static_cast<uint16_t>(mTileActual * BLOCK_NUM * sizeof(float) / BLOCK_SIZE_INT8),
0,
dstStride - (nTileActual * sizeof(half) / sizeof(float)));
intriParams.quantParams = {QuantMode_t::F322F16};
AscendC::Fixpipe(gmTensor, l0cTensor, intriParams);
#endif
};
};
template <>
struct l0c_to_gm<ArchType::ASCEND_V220, DataFormat::ND, int32_t, int32_t> {
__aicore__ l0c_to_gm(AscendC::GlobalTensor<int32_t> gmTensor,
AscendC::LocalTensor<int32_t> l0cTensor,
uint32_t mTileActual,
uint32_t nTileActual,
uint32_t srcStride,
uint32_t dstStride,
uint8_t unitFlag = 0)
{
#ifdef __DAV_C220_CUBE__
auto intriParams = AscendC::FixpipeParamsV220(nTileActual, // nSize
mTileActual, // mSize
srcStride, // srcStride
dstStride, // dstStride
false); // enRelu
intriParams.quantPre = QuantMode_t::NoQuant;
intriParams.unitFlag = unitFlag;
AscendC::Fixpipe<int32_t, int32_t, AscendC::CFG_ROW_MAJOR>(gmTensor, l0cTensor, intriParams);
#else
AscendC::FixpipeParams<int32_t> intriParams(
(nTileActual + BLOCK_NUM - 1) / AscendC::BLOCK_CUBE,
static_cast<uint16_t>(mTileActual * BLOCK_NUM * sizeof(float) / BLOCK_SIZE_INT8),
0,
dstStride);
intriParams.nz2ndParams = {true, 1, 0, 0, static_cast<uint16_t>(nTileActual)};
intriParams.quantParams = {QuantMode_t::NoQuant};
AscendC::Fixpipe(gmTensor, l0cTensor, intriParams);
#endif
};
};
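
Not part of the original file: a usage sketch of the ND/half/float specialization documented above, assuming an Mmad accumulation has already landed in L0C; the wrapper name is hypothetical and the stride semantics follow the doc comment above.
```
// Hedged sketch: write an (m x n) fp32 accumulator tile from L0C to GM as half, letting
// the fixpipe perform the F322F16 conversion. srcStride is the fractal stride along the
// n-direction in units of C0_SIZE; dstStride is the leading dimension of the GM tile.
__aicore__ inline void StoreAccTileToGm(AscendC::GlobalTensor<half> gmDst,
                                        AscendC::LocalTensor<float> l0cAcc,
                                        uint32_t m, uint32_t n,
                                        uint32_t srcStride, uint32_t dstStride)
{
    l0c_to_gm<ArchType::ASCEND_V220, DataFormat::ND, half, float>(
        gmDst, l0cAcc, m, n, srcStride, dstStride, /*unitFlag=*/0);
}
```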


@ -0,0 +1,42 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "../iterator.h"
/////////////////////////////////////////////////////
// l0c_to_l1
/////////////////////////////////////////////////////
// Partial specialization ZN, half, int32_t
template <ArchType ArchTag>
struct l0c_to_l1<ArchTag, DataFormat::ZN, half, int32_t> {
using ElementOut = half;
using ElementIn = int32_t;
__aicore__ l0c_to_l1(AscendC::LocalTensor<ElementOut> l1Tensor,
AscendC::LocalTensor<ElementIn> l0cTensor,
AscendC::LocalTensor<uint64_t> deqTensor,
uint32_t mTileActual,
uint32_t nTileActual,
uint32_t mTileCeil,
uint32_t nActual)
{
constexpr uint32_t BLOCK_NUM = 16;
constexpr uint32_t BLOCK_SIZE = 32;
AscendC::FixpipeParams<ElementIn> intriParams(
(nTileActual + BLOCK_NUM - 1) / AscendC::BLOCK_CUBE,
static_cast<uint16_t>(mTileActual * BLOCK_NUM * sizeof(float) / BLOCK_SIZE),
0,
mTileCeil - static_cast<uint16_t>(mTileActual * BLOCK_NUM * sizeof(float) / BLOCK_SIZE) *
sizeof(ElementOut) / sizeof(ElementIn));
intriParams.nz2ndParams = {false, 1, 0, 0, static_cast<uint16_t>(nTileActual)};
intriParams.quantParams = {QuantMode_t::VDEQF16};
AscendC::Fixpipe(l1Tensor, l0cTensor, deqTensor, intriParams);
};
};


@ -0,0 +1,71 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "../iterator.h"
/////////////////////////////////////////////////////
// l0c_to_ub
/////////////////////////////////////////////////////
// Primary template: generic L0C -> UB copy (no dequant)
template <ArchType ArchTag, typename ElementIn, typename ElementOut, bool MatrixMode = true>
struct l0c_to_ub {
__aicore__ l0c_to_ub(AscendC::LocalTensor<ElementOut> ubTensor,
AscendC::LocalTensor<ElementIn> l0cTensor,
uint16_t nBurst,
uint16_t lenBurst,
uint16_t srcStride,
uint16_t dstStride)
{
constexpr auto mode =
MatrixMode ? AscendC::BlockMode::BLOCK_MODE_MATRIX : AscendC::BlockMode::BLOCK_MODE_VECTOR;
AscendC::DataCopy(ubTensor,
l0cTensor,
AscendC::DataCopyParams(nBurst, // count
lenBurst, // len
srcStride, // srcStrideIn
dstStride), // dstStrideIn
AscendC::DataCopyEnhancedParams(mode, // blockModeIn
AscendC::DeqScale::DEQ_NONE, // deqScaleIn
0, // deqValueIn
0, // sidStoreModeIn
false, // isReluIn
pad_t::PAD_NONE, // padModeIn
0) // padValueIn
);
};
};
template <ArchType ArchTag>
struct l0c_to_ub<ArchTag, int32_t, half> {
__aicore__ l0c_to_ub(AscendC::LocalTensor<half> ubTensor,
AscendC::LocalTensor<int32_t> l0cTensor,
uint16_t nBurst,
uint16_t lenBurst,
uint16_t srcStride,
uint16_t dstStride)
{
AscendC::DataCopy(ubTensor,
l0cTensor,
AscendC::DataCopyParams(nBurst, // count
lenBurst, // len
srcStride, // srcStrideIn
dstStride), // dstStrideIn
AscendC::DataCopyEnhancedParams(AscendC::BlockMode::BLOCK_MODE_MATRIX, // blockModeIn
AscendC::DeqScale::VDEQ16, // deqScaleIn
0, // deqValueIn
0, // sidStoreModeIn
false, // isReluIn
pad_t::PAD_NONE, // padModeIn
0) // padValueIn
);
};
};
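
Not part of the original file: a hedged sketch of the int32-to-half dequant path above; the burst geometry and the wrapper name are illustrative.
```
// Hedged sketch: move an int32 accumulator tile from L0C into UB, dequantizing to half
// through the VDEQ16 specialization above (contiguous bursts, strides of 0).
__aicore__ inline void MoveAccTileToUb(AscendC::LocalTensor<half> ubDst,
                                       AscendC::LocalTensor<int32_t> l0cSrc,
                                       uint16_t nBurst, uint16_t lenBurst)
{
    l0c_to_ub<ArchType::ASCEND_V220, int32_t, half>(ubDst, l0cSrc, nBurst, lenBurst,
                                                    /*srcStride=*/0, /*dstStride=*/0);
}
```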


@ -0,0 +1,39 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "../iterator.h"
/////////////////////////////////////////////////////
// l1_to_bt
/////////////////////////////////////////////////////
// Partial specialization for V220
template <typename DataType>
struct l1_to_bt<ArchType::ASCEND_V220, DataType> {
__aicore__ l1_to_bt(uint64_t dst,
const AscendC::LocalTensor<DataType> &src,
uint16_t convControl,
uint16_t nBurst,
uint16_t lenBurst,
uint16_t srcGap,
uint16_t dstGap)
{
AscendC::LocalTensor<DataType> dstTensor;
dstTensor.InitBuffer(dst, nBurst * lenBurst);
dstTensor.address_.logicPos = static_cast<uint8_t>(AscendC::TPosition::C2);
AscendC::DataCopy(dstTensor,
src,
AscendC::DataCopyParams(nBurst, // nBurst
lenBurst, // lenBurst
srcGap, // srcGap
dstGap)); // dstGap
}
};


@ -0,0 +1,36 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "../iterator.h"
/////////////////////////////////////////////////////
// l1_to_fb
/////////////////////////////////////////////////////
// Partial specialization for V220
template <typename DataType>
struct l1_to_fb<ArchType::ASCEND_V220, DataType> {
__aicore__ l1_to_fb(AscendC::LocalTensor<DataType> &dst,
AscendC::LocalTensor<DataType> &src,
uint16_t burstNum,
uint16_t burstLen,
uint16_t srcGap,
uint16_t dstGap)
{
dst.address_.logicPos = static_cast<uint8_t>(AscendC::TPosition::C2PIPE2GM);
AscendC::DataCopy(dst,
src,
AscendC::DataCopyParams(burstNum, // nBurst
burstLen, // lenBurst
srcGap, // srcGap
dstGap)); // dstGap
}
};


@ -0,0 +1,310 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "../iterator.h"
/////////////////////////////////////////////////////
// l1_to_l0_a
/////////////////////////////////////////////////////
// Partial specialization for vector
template <ArchType ArchTag, typename DataType, bool IsTransPose>
struct l1_to_l0_a<ArchTag, DataType, IsTransPose, DataFormat::VECTOR, DataFormat::VECTOR> {
using HardwareParams = HardwareInfo<ArchTag>;
static constexpr uint32_t FRACTAL_SIZE = HardwareParams::fractalSize / sizeof(DataType);
__aicore__ l1_to_l0_a(AscendC::LocalTensor<DataType> l0Tensor,
AscendC::LocalTensor<DataType> l1Tensor,
uint32_t mTileCeil,
uint32_t kPartCeil,
uint32_t mSrcStride,
uint32_t kSrcStride,
uint32_t mDstStride,
uint32_t kDstStride)
{
AscendC::LoadData(l0Tensor,
l1Tensor,
AscendC::LoadData2dParams(0, // baseIdx
kPartCeil, // repeat
kSrcStride, // srcStride
0, // sid
kDstStride, // dstStride
IsTransPose, // transpose
0)); // addrCalMode
};
};
// Partial specialization for no transpose, not vector
template <ArchType ArchTag, typename DataType>
struct l1_to_l0_a<ArchTag, DataType, false, DataFormat::ZN, DataFormat::ZZ> {
using HardwareParams = HardwareInfo<ArchTag>;
static constexpr uint32_t BLOCK_SIZE = HardwareParams::l1l0BlockSize / sizeof(DataType);
static constexpr uint32_t FRACTAL_SIZE = HardwareParams::fractalSize / sizeof(DataType);
static constexpr uint32_t BLOCK_NUM_PER_FRACTAL = HardwareParams::fractalSize / HardwareParams::l1l0BlockSize;
__aicore__ l1_to_l0_a(AscendC::LocalTensor<DataType> l0Tensor,
AscendC::LocalTensor<DataType> l1Tensor,
uint32_t mTileCeil,
uint32_t kPartCeil,
uint32_t mSrcStride,
uint32_t kSrcStride,
uint32_t mDstStride,
uint32_t kDstStride)
{
for (uint32_t i = 0; i < mTileCeil / BLOCK_NUM_PER_FRACTAL; i++) {
AscendC::LoadData(l0Tensor[i * mDstStride * FRACTAL_SIZE], // dst
l1Tensor[i * mSrcStride * FRACTAL_SIZE], // src
AscendC::LoadData2dParams(0, // baseIdx
static_cast<uint16_t>(kPartCeil / BLOCK_SIZE), // repeat
kSrcStride, // srcStride
0, // sid
kDstStride - 1, // dstStride
false, // transpose
0)); // addrCalMode
}
};
};
// Partial specialization for transpose, not vector
template <ArchType ArchTag, typename DataType>
struct l1_to_l0_a<ArchTag, DataType, true, DataFormat::ZN, DataFormat::ZZ> {
using HardwareParams = HardwareInfo<ArchTag>;
static constexpr uint32_t BLOCK_SIZE = HardwareParams::l1l0BlockSize / sizeof(DataType);
static constexpr uint32_t FRACTAL_SIZE = HardwareParams::fractalSize / sizeof(DataType);
static constexpr uint32_t BLOCK_NUM_PER_FRACTAL = HardwareParams::fractalSize / HardwareParams::l1l0BlockSize;
__aicore__ l1_to_l0_a(AscendC::LocalTensor<DataType> l0Tensor,
AscendC::LocalTensor<DataType> l1Tensor,
uint32_t mTileCeil,
uint32_t kPartCeil,
uint32_t mSrcStride,
uint32_t kSrcStride,
uint32_t mDstStride,
uint32_t kDstStride)
{
for (uint32_t i = 0; i < mTileCeil / BLOCK_SIZE; i++) {
AscendC::LoadData(l0Tensor[i * mDstStride * FRACTAL_SIZE],
l1Tensor[i * mSrcStride * FRACTAL_SIZE],
AscendC::LoadData2dParams(0,
static_cast<uint16_t>(kPartCeil / BLOCK_NUM_PER_FRACTAL),
kSrcStride,
0,
kDstStride - 1,
true,
0));
}
};
};
template <ArchType ArchTag, typename DataType>
struct l1_to_l0_a<ArchTag, DataType, false, DataFormat::NZ, DataFormat::ZZ> {
using HardwareParams = HardwareInfo<ArchTag>;
// 16 * 32
static constexpr uint32_t ROW_BLOCK_SIZE = 16;
static constexpr uint32_t COL_BLOCK_SIZE = 32 / sizeof(DataType);
static constexpr uint32_t FRACTAL_SIZE = HardwareParams::fractalSize / sizeof(DataType);
static constexpr uint32_t BLOCK_NUM_PER_FRACTAL = HardwareParams::fractalSize / HardwareParams::l1l0BlockSize;
__aicore__ l1_to_l0_a(AscendC::LocalTensor<DataType> l0Tensor,
AscendC::LocalTensor<DataType> l1Tensor,
uint32_t mTileCeil,
uint32_t kPartCeil,
uint32_t mSrcStride,
uint32_t kSrcStride,
uint32_t mDstStride,
uint32_t kDstStride)
{
for (uint32_t i = 0; i < mTileCeil / ROW_BLOCK_SIZE; i++) {
AscendC::LoadData(l0Tensor[i * ROW_BLOCK_SIZE * kPartCeil],
l1Tensor[i * FRACTAL_SIZE],
AscendC::LoadData2dParams(0,
static_cast<uint16_t>(kPartCeil / COL_BLOCK_SIZE),
mTileCeil / ROW_BLOCK_SIZE,
0,
0,
false,
0));
}
};
};
template <>
struct l1_to_l0_a<ArchType::ASCEND_V220, int8_t, true, DataFormat::ZN, DataFormat::ZZ> {
using HardwareParams = HardwareInfo<ArchType::ASCEND_V220>;
static constexpr uint32_t BLOCK_SIZE = HardwareParams::l1l0BlockSize / sizeof(int8_t); // 32
static constexpr uint32_t FRACTAL_SIZE = HardwareParams::fractalSize / sizeof(int8_t); // 512
static constexpr uint32_t BLOCK_NUM_PER_FRACTAL = HardwareParams::fractalSize / HardwareParams::l1l0BlockSize; // 16
static constexpr uint32_t NUM_FRACTAL_PER_ITER = 2;
__aicore__ l1_to_l0_a(AscendC::LocalTensor<int8_t> l0Tensor,
AscendC::LocalTensor<int8_t> l1Tensor,
uint32_t mTileCeil,
uint32_t kPartCeil,
uint32_t mSrcStride,
uint32_t kSrcStride,
uint32_t mDstStride,
uint32_t kDstStride)
{
for (uint64_t i = 0; i < mTileCeil / (BLOCK_NUM_PER_FRACTAL * NUM_FRACTAL_PER_ITER); ++i) {
AscendC::LoadDataWithTranspose(
l0Tensor[i * mDstStride * FRACTAL_SIZE * NUM_FRACTAL_PER_ITER], // dstLocalTensor
l1Tensor[i * mSrcStride * FRACTAL_SIZE], // srcLocalTensor
AscendC::LoadData2dTransposeParams(0, // baseIdx
static_cast<uint16_t>(CeilDiv<BLOCK_SIZE>(kPartCeil)), // repeat
kSrcStride, // srcStride
0, // dstGap
mDstStride - 1)); // dstFracGap
}
}
};
/////////////////////////////////////////////////////
// l1_to_l0_b
/////////////////////////////////////////////////////
// Partial specialization for vector
template <ArchType ArchTag, typename DataType, bool IsTransPose>
struct l1_to_l0_b<ArchTag, DataType, IsTransPose, DataFormat::VECTOR, DataFormat::VECTOR> {
using HardwareParams = HardwareInfo<ArchTag>;
static constexpr uint32_t FRACTAL_SIZE = HardwareParams::fractalSize / sizeof(DataType);
__aicore__ l1_to_l0_b(AscendC::LocalTensor<DataType> l0Tensor,
AscendC::LocalTensor<DataType> l1Tensor,
uint32_t nTileCeil,
uint32_t kPartCeil,
uint32_t nSrcStride,
uint32_t kSrcStride,
uint32_t nDstStride,
uint32_t kDstStride)
{
AscendC::LoadData(
l0Tensor, l1Tensor, AscendC::LoadData2dParams(0, kPartCeil, kSrcStride, 0, kDstStride, IsTransPose, 0));
};
};
template <ArchType ArchTag>
struct l1_to_l0_b<ArchTag, int8_t, true, DataFormat::NZ, DataFormat::ZN> {
using HardwareParams = HardwareInfo<ArchTag>;
using DataType = int8_t;
static constexpr uint32_t BLOCK_SIZE = HardwareParams::l1l0BlockSize / sizeof(DataType);
__aicore__ l1_to_l0_b(AscendC::LocalTensor<DataType> l0Tensor,
AscendC::LocalTensor<DataType> l1Tensor,
uint32_t nTileCeil,
uint32_t kPartCeil,
uint32_t nSrcStride,
uint32_t kSrcStride,
uint32_t nDstStride,
uint32_t kDstStride)
{
for (uint32_t i = 0; i < nTileCeil / BLOCK_SIZE; i++) {
AscendC::LoadDataWithTranspose(l0Tensor[i * kPartCeil * BLOCK_SIZE],
l1Tensor[i * BLOCK_SIZE * BLOCK_SIZE],
AscendC::LoadData2dTransposeParams(0, // startIndexIn
kPartCeil / BLOCK_SIZE, // repeatTimesIn
nTileCeil / BLOCK_SIZE, // srcStrideIn
1, // dstGapIn
0, // dstfracGapIn
0) // addrModeIn
);
}
};
};
// Partial specialization for no transpose, not vector
template <ArchType ArchTag, typename DataType>
struct l1_to_l0_b<ArchTag, DataType, false, DataFormat::ZN, DataFormat::NZ> {
using HardwareParams = HardwareInfo<ArchTag>;
static constexpr uint32_t BLOCK_SIZE = HardwareParams::l1l0BlockSize / sizeof(DataType);
static constexpr uint32_t FRACTAL_SIZE = HardwareParams::fractalSize / sizeof(DataType);
static constexpr uint32_t BLOCK_NUM_PER_FRACTAL = HardwareParams::fractalSize / HardwareParams::l1l0BlockSize;
__aicore__ l1_to_l0_b(AscendC::LocalTensor<DataType> l0Tensor,
AscendC::LocalTensor<DataType> l1Tensor,
uint32_t nTileCeil,
uint32_t kPartCeil,
uint32_t nSrcStride,
uint32_t kSrcStride,
uint32_t nDstStride,
uint32_t kDstStride)
{
for (uint32_t i = 0; i < kPartCeil / BLOCK_NUM_PER_FRACTAL; i++) {
AscendC::LoadData(l0Tensor[i * kDstStride * FRACTAL_SIZE],
l1Tensor[i * kSrcStride * FRACTAL_SIZE],
AscendC::LoadData2dParams(0, // baseIdx
static_cast<uint16_t>(nTileCeil / BLOCK_SIZE), // repeat
nSrcStride, // srcStride
0, // sid
nDstStride - 1, // dstStride
true, // transpose
0)); // addrCalMode
}
};
};
// Partial specialization for transpose, not vector
template <ArchType ArchTag, typename DataType>
struct l1_to_l0_b<ArchTag, DataType, true, DataFormat::ZN, DataFormat::NZ> {
using HardwareParams = HardwareInfo<ArchTag>;
static constexpr uint32_t BLOCK_SIZE = HardwareParams::l1l0BlockSize / sizeof(DataType);
static constexpr uint32_t FRACTAL_SIZE = HardwareParams::fractalSize / sizeof(DataType);
static constexpr uint32_t BLOCK_NUM_PER_FRACTAL = HardwareParams::fractalSize / HardwareParams::l1l0BlockSize;
__aicore__ l1_to_l0_b(AscendC::LocalTensor<DataType> l0Tensor,
AscendC::LocalTensor<DataType> l1Tensor,
uint32_t nTileCeil,
uint32_t kPartCeil,
uint32_t nSrcStride,
uint32_t kSrcStride,
uint32_t nDstStride,
uint32_t kDstStride)
{
AscendC::LoadData(
l0Tensor,
l1Tensor,
AscendC::LoadData2dParams(0, // baseIdx
static_cast<uint16_t>(kPartCeil * nTileCeil / FRACTAL_SIZE), // repeat
1, // srcStride
0, // sid
0, // dstStride
false, // transpose
0)); // addr_cal_mode_t
};
};
template <>
struct l1_to_l0_b<ArchType::ASCEND_V220, int8_t, false, DataFormat::ZN, DataFormat::NZ> {
using HardwareParams = HardwareInfo<ArchType::ASCEND_V220>;
static constexpr uint32_t BLOCK_SIZE = HardwareParams::l1l0BlockSize / sizeof(int8_t); // 32
static constexpr uint32_t FRACTAL_SIZE = HardwareParams::fractalSize / sizeof(int8_t); // 512
static constexpr uint32_t BLOCK_NUM_PER_FRACTAL = HardwareParams::fractalSize / HardwareParams::l1l0BlockSize;
static constexpr uint32_t NUM_FRACTAL_PER_ITER = 2;
__aicore__ l1_to_l0_b(AscendC::LocalTensor<int8_t> l0Tensor,
AscendC::LocalTensor<int8_t> l1Tensor,
uint32_t nTileCeil,
uint32_t kPartCeil,
uint32_t nSrcStride,
uint32_t kSrcStride,
uint32_t nDstStride,
uint32_t kDstStride)
{
for (uint64_t i = 0; i < kPartCeil / (BLOCK_NUM_PER_FRACTAL * NUM_FRACTAL_PER_ITER); ++i) {
AscendC::LoadDataWithTranspose(
l0Tensor[i * kDstStride * FRACTAL_SIZE], // dstLocalTensor
l1Tensor[i * kSrcStride * FRACTAL_SIZE * NUM_FRACTAL_PER_ITER], // srcLocalTensor
AscendC::LoadData2dTransposeParams(0, // baseIdx
static_cast<uint16_t>(CeilDiv<BLOCK_SIZE>(nTileCeil)), // repeat
nSrcStride / NUM_FRACTAL_PER_ITER, // srcStride
1, // dstGap
0)); // dstFracGap
}
};
};
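
Not part of the original file: a hedged sketch showing how the A- and B-side iterators above are typically paired to stage one matmul tile; the unit strides and the wrapper name are illustrative assumptions.
```
// Hedged sketch: stage a ZN-formatted A tile into L0A (ZZ) and a ZN-formatted B tile
// into L0B (NZ) ahead of an Mmad call. Strides of one fractal assume densely packed,
// single-tile staging buffers in L1.
template <typename DType>
__aicore__ inline void StageMatmulOperands(AscendC::LocalTensor<DType> l0a,
                                           AscendC::LocalTensor<DType> l1a,
                                           AscendC::LocalTensor<DType> l0b,
                                           AscendC::LocalTensor<DType> l1b,
                                           uint32_t mTileCeil, uint32_t kPartCeil,
                                           uint32_t nTileCeil)
{
    l1_to_l0_a<ArchType::ASCEND_V220, DType, false, DataFormat::ZN, DataFormat::ZZ>(
        l0a, l1a, mTileCeil, kPartCeil, 1, 1, 1, 1);
    l1_to_l0_b<ArchType::ASCEND_V220, DType, false, DataFormat::ZN, DataFormat::NZ>(
        l0b, l1b, nTileCeil, kPartCeil, 1, 1, 1, 1);
}
```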


@ -0,0 +1,44 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#include "../iterator.h"
/////////////////////////////////////////////////////
// l1_to_ub
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DataType>
struct l1_to_ub {
__aicore__ l1_to_ub(AscendC::LocalTensor<DataType> ubTensor,
AscendC::LocalTensor<DataType> l1Tensor,
uint16_t nBurst,
uint16_t lenBurst,
uint16_t srcStride,
uint16_t dstStride)
{
AscendC::DataCopy(ubTensor, l1Tensor, AscendC::DataCopyParams(nBurst, lenBurst, srcStride, dstStride));
};
};
/////////////////////////////////////////////////////
// ub_to_l1
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DataType>
struct ub_to_l1 {
__aicore__ ub_to_l1(AscendC::LocalTensor<DataType> l1Tensor,
AscendC::LocalTensor<DataType> ubTensor,
uint16_t nBurst,
uint16_t lenBurst,
uint16_t srcStride,
uint16_t dstStride)
{
AscendC::DataCopy(l1Tensor, ubTensor, AscendC::DataCopyParams(nBurst, lenBurst, srcStride, dstStride));
};
};
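
Not part of the original file: a hedged round-trip sketch; it assumes the target architecture supports the direct L1-to-UB copy path exposed by the generic templates above, and the wrapper name is hypothetical.
```
// Hedged sketch: pull `lenBurst` 32-byte blocks of a tile from L1 into UB for vector
// post-processing, then write the result back to L1 (single contiguous burst).
template <typename DType>
__aicore__ inline void RoundTripL1Ub(AscendC::LocalTensor<DType> ubTile,
                                     AscendC::LocalTensor<DType> l1Tile,
                                     uint16_t lenBurst)
{
    l1_to_ub<ArchType::ASCEND_V220, DType>(ubTile, l1Tile, /*nBurst=*/1, lenBurst, 0, 0);
    // ... vector compute on ubTile would go here ...
    ub_to_l1<ArchType::ASCEND_V220, DType>(l1Tile, ubTile, /*nBurst=*/1, lenBurst, 0, 0);
}
```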


@ -0,0 +1,395 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#ifndef ASCEND_OPS_UTILS_COMMON_KERNEL_KERNEL_UTILS_H
#define ASCEND_OPS_UTILS_COMMON_KERNEL_KERNEL_UTILS_H
#include "kernel_operator.h"
using AscendC::HardEvent;
__aicore__ inline uint32_t CeilDiv(uint32_t x, uint32_t y)
{
return y == 0 ? 0 : ((x + y - 1) / y);
}
__aicore__ inline uint32_t RoundUp(uint32_t x, uint32_t y = 16)
{
return (x + y - 1) / y * y;
}
__aicore__ inline uint32_t Min(uint32_t x, uint32_t y)
{
return x < y ? x : y;
}
__aicore__ inline uint32_t Max(uint32_t x, uint32_t y)
{
return x > y ? x : y;
}
template <typename T, typename Q>
__aicore__ inline void CopyIn(const AscendC::GlobalTensor<T> &gm, Q &queue, uint64_t offset, uint32_t count)
{
AscendC::LocalTensor<T> local = queue.template AllocTensor<T>();
DataCopy(local, gm[offset], count);
queue.EnQue(local);
}
template <typename T, typename Q>
__aicore__ inline void CopyOut(const AscendC::GlobalTensor<T> &gm, Q &queue, uint64_t offset, uint32_t count)
{
AscendC::LocalTensor<T> local = queue.template DeQue<T>();
DataCopy(gm[offset], local, count);
queue.FreeTensor(local);
}
template <typename T>
__aicore__ inline void CastFrom16To32(const AscendC::LocalTensor<float> &out, const AscendC::LocalTensor<T> &in,
uint32_t count)
{
Cast(out, in, AscendC::RoundMode::CAST_NONE, count);
AscendC::PipeBarrier<PIPE_V>();
}
template <typename T>
__aicore__ inline void CastFrom32To16(const AscendC::LocalTensor<T> &out, const AscendC::LocalTensor<float> &in,
uint32_t count)
{
if constexpr (AscendC::IsSameType<T, half>::value) {
Cast(out, in, AscendC::RoundMode::CAST_NONE,
count); // On 310P, the fp32->half cast only supports CAST_NONE; use it here to keep 310P and 910B behavior aligned
} else { // bf16
Cast(out, in, AscendC::RoundMode::CAST_RINT, count);
}
AscendC::PipeBarrier<PIPE_V>();
}
__aicore__ inline void CastFromF16ToI8(const AscendC::LocalTensor<int8_t> &out, const AscendC::LocalTensor<half> &in,
half quantMin, uint32_t count)
{
Maxs(in, in, quantMin, count);
AscendC::PipeBarrier<PIPE_V>();
Mins(in, in, (half)127, count); // 127: limit
AscendC::PipeBarrier<PIPE_V>();
#if defined(__CCE_KT_TEST__) || (__CCE_AICORE__ == 220)
Cast(out, in, AscendC::RoundMode::CAST_RINT, count);
#else
Cast(out, in, AscendC::RoundMode::CAST_NONE, count);
#endif
AscendC::PipeBarrier<PIPE_V>();
}
template <typename T, typename Q>
__aicore__ inline void CopyInAndCastF32(const AscendC::LocalTensor<float> &out, const AscendC::GlobalTensor<T> &gm,
Q &queue, uint64_t offset, uint32_t count)
{
CopyIn(gm, queue, offset, count);
AscendC::LocalTensor<T> local = queue.template DeQue<T>();
Cast(out, local, AscendC::RoundMode::CAST_NONE, count);
queue.FreeTensor(local);
AscendC::PipeBarrier<PIPE_V>();
}
template <typename T, typename Q>
__aicore__ inline void Cast16AndCopyOut(const AscendC::LocalTensor<float> &in, const AscendC::GlobalTensor<T> &gm,
Q &queue, uint64_t offset, uint32_t count)
{
AscendC::LocalTensor<T> local = queue.template AllocTensor<T>();
CastFrom32To16(local, in, count);
queue.EnQue(local);
CopyOut(gm, queue, offset, count);
AscendC::PipeBarrier<PIPE_V>();
}
template <typename T>
__aicore__ inline T ComputeSum(const AscendC::LocalTensor<T> &in, const AscendC::LocalTensor<T> &tmp,
const AscendC::LocalTensor<T> &workLocal, uint32_t count)
{
#if __CCE_AICORE__ == 100
float sum = 0;
int64_t elementNumPerRep = AscendC::ONE_REPEAT_BYTE_SIZE / sizeof(T);
AscendC::LocalTensor<T> src = in;
while (count > elementNumPerRep) {
int64_t repeatTimes = count / elementNumPerRep;
int64_t tailCount = count % elementNumPerRep;
int64_t bodyCount = repeatTimes * elementNumPerRep;
if (repeatTimes > 0) {
AscendC::AscendCUtils::SetMask<T>(elementNumPerRep);
vcadd((__ubuf__ T *)tmp.GetPhyAddr(), (__ubuf__ T *)src.GetPhyAddr(), repeatTimes, 1, 1, 8);
AscendC::SetFlag<HardEvent::V_S>(EVENT_ID0); // PipeBarrier(PIPE_V)?
AscendC::WaitFlag<HardEvent::V_S>(EVENT_ID0);
}
if (tailCount != 0) {
AscendC::AscendCUtils::SetMask<T>(tailCount);
vcadd((__ubuf__ T *)tmp[bodyCount].GetPhyAddr(), (__ubuf__ T *)src[bodyCount].GetPhyAddr(), 1, 1, 1, 8);
AscendC::SetFlag<HardEvent::V_S>(EVENT_ID0);
AscendC::WaitFlag<HardEvent::V_S>(EVENT_ID0);
sum += tmp.GetValue(bodyCount);
}
count = repeatTimes;
src = tmp;
}
if (count > 1) {
AscendC::AscendCUtils::SetMask<T>(count);
vcadd((__ubuf__ T *)tmp.GetPhyAddr(), (__ubuf__ T *)tmp.GetPhyAddr(), 1, 1, 1, 8);
AscendC::SetFlag<HardEvent::V_S>(EVENT_ID0);
AscendC::WaitFlag<HardEvent::V_S>(EVENT_ID0);
}
sum += tmp.GetValue(0);
return sum;
#else
ReduceSum(tmp, in, workLocal, count);
AscendC::SetFlag<HardEvent::V_S>(EVENT_ID0);
AscendC::WaitFlag<HardEvent::V_S>(EVENT_ID0);
return tmp.GetValue(0);
#endif
}
__aicore__ inline float ComputeSliceSquareSum(const AscendC::LocalTensor<float> &in,
const AscendC::LocalTensor<float> &tmp,
const AscendC::LocalTensor<float> &workLocal, uint32_t count)
{
Mul(tmp, in, in, count);
AscendC::PipeBarrier<PIPE_V>();
return ComputeSum(tmp, tmp, workLocal, count);
}
template <typename T>
__aicore__ inline void ComputeRmsNorm(const AscendC::LocalTensor<T> &out, const AscendC::LocalTensor<float> &in,
float rms, const AscendC::LocalTensor<T> &gamma, uint32_t count,
uint32_t precisionMode, uint32_t gemmaMode,
const AscendC::LocalTensor<float> &tmp)
{
float value = 1.0;
Duplicate(tmp, rms, count);
AscendC::PipeBarrier<PIPE_V>();
Div(tmp, in, tmp, count);
AscendC::PipeBarrier<PIPE_V>();
if (precisionMode == 0) {
CastFrom16To32(in, gamma, count);
AscendC::PipeBarrier<PIPE_V>();
if (gemmaMode == 1) {
Adds(in, in, value, count);
AscendC::PipeBarrier<PIPE_V>();
}
Mul(in, in, tmp, count);
AscendC::PipeBarrier<PIPE_V>();
CastFrom32To16(out, in, count);
return;
}
if constexpr (std::is_same<T, half>::value) {
CastFrom32To16(out, tmp, count);
Mul(out, out, gamma, count);
AscendC::PipeBarrier<PIPE_V>();
}
}
template <typename T, uint32_t gemmaMode>
__aicore__ inline void CastGAndIsGemmaMode(const AscendC::LocalTensor<float> &out, const AscendC::LocalTensor<T> &gamma,
uint32_t count)
{
Cast(out, gamma, AscendC::RoundMode::CAST_NONE, count);
AscendC::PipeBarrier<PIPE_V>();
float value = 1.0;
if constexpr (gemmaMode == 1) {
Adds(out, out, value, count);
AscendC::PipeBarrier<PIPE_V>();
}
}
template <typename T, uint32_t precisionMode>
__aicore__ inline void ComputeRmsNormFast(const AscendC::LocalTensor<T> &out, const AscendC::LocalTensor<float> &in,
float rms, const AscendC::LocalTensor<T> &gamma, uint32_t count,
const AscendC::LocalTensor<float> &tmp,
const AscendC::LocalTensor<float> &fp32_g)
{
float value = 1.0;
Duplicate(tmp, rms, count);
AscendC::PipeBarrier<PIPE_V>();
Div(tmp, in, tmp, count);
AscendC::PipeBarrier<PIPE_V>();
if constexpr (precisionMode == 0) {
Mul(in, fp32_g, tmp, count);
AscendC::PipeBarrier<PIPE_V>();
CastFrom32To16(out, in, count);
return;
}
if constexpr (std::is_same<T, half>::value) {
CastFrom32To16(out, tmp, count);
Mul(out, out, gamma, count);
AscendC::PipeBarrier<PIPE_V>();
}
}
template <bool WITH_BETA = true>
__aicore__ inline void ComputeRmsNorm(const AscendC::LocalTensor<float> &out, const AscendC::LocalTensor<float> &in,
float rms, const AscendC::LocalTensor<half> &gamma,
const AscendC::LocalTensor<half> &beta, const AscendC::LocalTensor<float> &tmp,
uint32_t count)
{
Duplicate(tmp, rms, count);
AscendC::PipeBarrier<PIPE_V>();
Div(out, in, tmp, count);
AscendC::PipeBarrier<PIPE_V>();
CastFrom16To32(tmp, gamma, count);
Mul(out, out, tmp, count);
AscendC::PipeBarrier<PIPE_V>();
if constexpr (WITH_BETA) {
CastFrom16To32(tmp, beta, count);
Add(out, out, tmp, count);
AscendC::PipeBarrier<PIPE_V>();
}
}
template <typename T>
__aicore__ inline void ComputeRmsNorm(const AscendC::LocalTensor<float> &out, const AscendC::LocalTensor<float> &in,
float reciprocal_of_rms, const AscendC::LocalTensor<T> &gamma,
const AscendC::LocalTensor<float> &tmp, const AscendC::LocalTensor<T> &res_out,
uint32_t count)
{
Duplicate(tmp, reciprocal_of_rms, count);
AscendC::PipeBarrier<PIPE_V>();
Mul(out, in, tmp, count);
AscendC::PipeBarrier<PIPE_V>();
CastFrom16To32(tmp, gamma, count);
Mul(out, out, tmp, count);
AscendC::PipeBarrier<PIPE_V>();
CastFrom32To16(res_out, out, count);
}
template <typename T>
__aicore__ inline void ComputeResidualAdd(const AscendC::LocalTensor<T> &out, const AscendC::LocalTensor<T> &in,
const AscendC::LocalTensor<T> &resIn, uint32_t count)
{
Add(out, in, resIn, count);
AscendC::PipeBarrier<PIPE_V>();
}
template <typename T>
__aicore__ inline void ComputeMean(const AscendC::LocalTensor<T> &out, const AscendC::LocalTensor<T> &in, T aveNum,
uint32_t count)
{
Duplicate(out, aveNum, count);
AscendC::PipeBarrier<PIPE_V>();
Mul(out, in, out, count);
AscendC::PipeBarrier<PIPE_V>();
T sum = ComputeSum(out, out, out, count);
AscendC::SetFlag<HardEvent::S_V>(EVENT_ID0);
AscendC::WaitFlag<HardEvent::S_V>(EVENT_ID0);
Duplicate(out, sum, count);
AscendC::PipeBarrier<PIPE_V>();
}
template <typename T>
__aicore__ inline void ComputeLayerNorm(const AscendC::LocalTensor<float> &out, const AscendC::LocalTensor<float> &in,
const AscendC::LocalTensor<float> &mean, float eps, float aveNum,
const AscendC::LocalTensor<T> &gamma, const AscendC::LocalTensor<T> &beta,
uint32_t count)
{
Sub(in, in, mean, count);
AscendC::PipeBarrier<PIPE_V>();
Mul(out, in, in, count);
AscendC::PipeBarrier<PIPE_V>();
Muls(out, out, aveNum, count);
AscendC::PipeBarrier<PIPE_V>();
ReduceSum(out, out, out, count);
AscendC::SetFlag<HardEvent::V_S>(EVENT_ID0);
AscendC::WaitFlag<HardEvent::V_S>(EVENT_ID0);
float var = out.GetValue(0);
AscendC::SetFlag<HardEvent::S_V>(EVENT_ID0);
AscendC::WaitFlag<HardEvent::S_V>(EVENT_ID0);
Duplicate(out, var, count);
AscendC::PipeBarrier<PIPE_V>();
Adds(out, out, eps, count);
AscendC::PipeBarrier<PIPE_V>();
Sqrt(out, out, count);
AscendC::PipeBarrier<PIPE_V>();
Div(out, in, out, count);
AscendC::PipeBarrier<PIPE_V>();
Cast(in, gamma, AscendC::RoundMode::CAST_NONE, count);
AscendC::PipeBarrier<PIPE_V>();
Mul(out, out, in, count);
AscendC::PipeBarrier<PIPE_V>();
Cast(in, beta, AscendC::RoundMode::CAST_NONE, count);
AscendC::PipeBarrier<PIPE_V>();
Add(out, out, in, count);
AscendC::PipeBarrier<PIPE_V>();
}
__aicore__ inline void ComputeFp16ToI8Quant(const AscendC::LocalTensor<int8_t> &out,
const AscendC::LocalTensor<half> &in, const AscendC::LocalTensor<half> &tmp,
half scale, half offset, half quantMin, uint32_t count)
{
Muls(tmp, in, scale, count);
AscendC::PipeBarrier<PIPE_V>();
Adds(tmp, tmp, offset, count);
AscendC::PipeBarrier<PIPE_V>();
CastFromF16ToI8(out, tmp, quantMin, count);
}
__aicore__ inline void ComputeFp32ToI8Quant(const AscendC::LocalTensor<int8_t> &out,
const AscendC::LocalTensor<float> &in,
const AscendC::LocalTensor<half> &tmp, half scale, half offset,
half quantMin, uint32_t count)
{
CastFrom32To16(tmp, in, count);
AscendC::PipeBarrier<PIPE_V>();
ComputeFp16ToI8Quant(out, tmp, tmp, scale, offset, quantMin, count);
}
__aicore__ inline void ComputeHighPrecisionFp32ToI8Quant(const AscendC::LocalTensor<int8_t> &out,
const AscendC::LocalTensor<float> &in,
const AscendC::LocalTensor<half> &tmp, float scale,
float offset, half quantMin, uint32_t count)
{
Muls(in, in, scale, count);
AscendC::PipeBarrier<PIPE_V>();
Adds(in, in, offset, count);
AscendC::PipeBarrier<PIPE_V>();
CastFrom32To16(tmp, in, count);
CastFromF16ToI8(out, tmp, quantMin, count);
}
__aicore__ inline void CopyGmTilingToUb(__ubuf__ uint8_t *&tilingInUb, const __gm__ uint8_t *tilingInGm,
size_t tilingSize, AscendC::TPipe *pipe)
{
uint32_t roundTilingSize = RoundUp(tilingSize, 32);
AscendC::TBuf<AscendC::TPosition::VECCALC> tilingBuf;
AscendC::GlobalTensor<uint8_t> tilingGm;
tilingGm.SetGlobalBuffer((__gm__ uint8_t *)tilingInGm);
pipe->InitBuffer(tilingBuf, roundTilingSize);
AscendC::LocalTensor<uint8_t> tilingUb = tilingBuf.Get<uint8_t>();
AscendC::DataCopy(tilingUb, tilingGm, roundTilingSize);
tilingInUb = (__ubuf__ uint8_t *)tilingUb.GetPhyAddr();
}
template <typename T>
__aicore__ inline uint32_t GetReduceSumWorkLocalSize(uint32_t sliceSize)
{
uint32_t elementsPerBlock = 32 / sizeof(T);
uint32_t elementsPerRepeat = 256 / sizeof(T);
uint32_t firstMaxRepeat = sliceSize < elementsPerRepeat ? 1u : (sliceSize / elementsPerRepeat);
uint32_t iter1OutputCount = firstMaxRepeat;
uint32_t iter1AlignEnd = RoundUp(iter1OutputCount, elementsPerBlock);
return iter1AlignEnd;
}
#endif
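
Not part of the original file: a hedged example of the quantization helpers above; the wrapper name and the quantMin value of -128 are assumptions (the helpers themselves clamp to +127 internally).
```
// Hedged sketch: per-tensor fp32 -> int8 quantization of `count` elements already resident
// in UB, reusing ComputeFp32ToI8Quant. scale/offset are assumed to come from tiling data.
__aicore__ inline void QuantizeSliceToI8(const AscendC::LocalTensor<int8_t> &out,
                                         const AscendC::LocalTensor<float> &in,
                                         const AscendC::LocalTensor<half> &tmp,
                                         half scale, half offset, uint32_t count)
{
    ComputeFp32ToI8Quant(out, in, tmp, scale, offset, /*quantMin=*/(half)(-128), count);
}
```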


@ -0,0 +1,18 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#ifndef INCLUDE_LAYOUT_H
#define INCLUDE_LAYOUT_H
enum class DataFormat { ND = 0, NZ, ZN, ZZ, NN, VECTOR };
#endif


@ -0,0 +1,82 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#ifndef INCLUDE_MEM_H
#define INCLUDE_MEM_H
#include "hardware.h"
#include "kernel_event.h"
#include "kernel_tensor.h"
enum class BufferType { ASCEND_UB, ASCEND_CB, ASCEND_L0A, ASCEND_L0B, ASCEND_L0C, ASCEND_MAX };
template <BufferType BufferType_>
__aicore__ constexpr AscendC::TPosition GetPosition()
{
if constexpr (BufferType_ == BufferType::ASCEND_UB) {
return AscendC::TPosition::VECIN;
} else if constexpr (BufferType_ == BufferType::ASCEND_CB) {
return AscendC::TPosition::A1;
} else if constexpr (BufferType_ == BufferType::ASCEND_L0A) {
return AscendC::TPosition::A2;
} else if constexpr (BufferType_ == BufferType::ASCEND_L0B) {
return AscendC::TPosition::B2;
} else if constexpr (BufferType_ == BufferType::ASCEND_L0C) {
return AscendC::TPosition::CO1;
}
return AscendC::TPosition::GM;
}
template <ArchType ArchTag>
struct AsdopsBuffer {
public:
__aicore__ AsdopsBuffer()
{
constexpr uint32_t bufferSize[(uint32_t)BufferType::ASCEND_MAX] = {
HardwareInfo<ArchTag>::ubSize, HardwareInfo<ArchTag>::l1Size, HardwareInfo<ArchTag>::l0ASize,
HardwareInfo<ArchTag>::l0BSize, HardwareInfo<ArchTag>::l0CSize};
#ifdef __DAV_C220_VEC__
tensor[(uint32_t)BufferType::ASCEND_UB].InitBuffer(0, bufferSize[(uint32_t)BufferType::ASCEND_UB]);
tensor[(uint32_t)BufferType::ASCEND_UB].address_.logicPos = static_cast<uint8_t>(AscendC::TPosition::VECIN);
#elif defined(__DAV_C220_CUBE__)
tensor[(uint32_t)BufferType::ASCEND_CB].InitBuffer(0, bufferSize[(uint32_t)BufferType::ASCEND_CB]);
tensor[(uint32_t)BufferType::ASCEND_CB].address_.logicPos = static_cast<uint8_t>(AscendC::TPosition::A1);
tensor[(uint32_t)BufferType::ASCEND_L0A].InitBuffer(0, bufferSize[(uint32_t)BufferType::ASCEND_L0A]);
tensor[(uint32_t)BufferType::ASCEND_L0A].address_.logicPos = static_cast<uint8_t>(AscendC::TPosition::A2);
tensor[(uint32_t)BufferType::ASCEND_L0B].InitBuffer(0, bufferSize[(uint32_t)BufferType::ASCEND_L0B]);
tensor[(uint32_t)BufferType::ASCEND_L0B].address_.logicPos = static_cast<uint8_t>(AscendC::TPosition::B2);
tensor[(uint32_t)BufferType::ASCEND_L0C].InitBuffer(0, bufferSize[(uint32_t)BufferType::ASCEND_L0C]);
tensor[(uint32_t)BufferType::ASCEND_L0C].address_.logicPos = static_cast<uint8_t>(AscendC::TPosition::CO1);
#else
tensor[(uint32_t)BufferType::ASCEND_UB].InitBuffer(0, bufferSize[(uint32_t)BufferType::ASCEND_UB]);
tensor[(uint32_t)BufferType::ASCEND_UB].address_.logicPos = static_cast<uint8_t>(AscendC::TPosition::VECIN);
tensor[(uint32_t)BufferType::ASCEND_CB].InitBuffer(0, bufferSize[(uint32_t)BufferType::ASCEND_CB]);
tensor[(uint32_t)BufferType::ASCEND_CB].address_.logicPos = static_cast<uint8_t>(AscendC::TPosition::A1);
tensor[(uint32_t)BufferType::ASCEND_L0A].InitBuffer(0, bufferSize[(uint32_t)BufferType::ASCEND_L0A]);
tensor[(uint32_t)BufferType::ASCEND_L0A].address_.logicPos = static_cast<uint8_t>(AscendC::TPosition::A2);
tensor[(uint32_t)BufferType::ASCEND_L0B].InitBuffer(0, bufferSize[(uint32_t)BufferType::ASCEND_L0B]);
tensor[(uint32_t)BufferType::ASCEND_L0B].address_.logicPos = static_cast<uint8_t>(AscendC::TPosition::B2);
tensor[(uint32_t)BufferType::ASCEND_L0C].InitBuffer(0, bufferSize[(uint32_t)BufferType::ASCEND_L0C]);
tensor[(uint32_t)BufferType::ASCEND_L0C].address_.logicPos = static_cast<uint8_t>(AscendC::TPosition::CO1);
#endif
};
template <BufferType BufferType_, typename DstDataType = half>
__aicore__ AscendC::LocalTensor<DstDataType> GetBuffer(const uint32_t offset) const
{
return tensor[(uint32_t)BufferType_][offset].template ReinterpretCast<DstDataType>();
}
public:
AscendC::LocalTensor<uint8_t> tensor[(uint32_t)BufferType::ASCEND_MAX];
};
#endif
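
Not part of the original file: a hedged sketch of how AsdopsBuffer is typically used on a cube core; the zero offsets are illustrative and real offsets must respect the per-buffer sizes from HardwareInfo.
```
// Hedged sketch: carve typed views of L1 (A1), L0A, L0B and L0C out of the static buffer
// map. Offsets index the underlying uint8_t tensors, i.e. they are byte offsets.
__aicore__ inline void ExampleCubeBuffers()
{
    AsdopsBuffer<ArchType::ASCEND_V220> buf;
    AscendC::LocalTensor<half>  l1A = buf.GetBuffer<BufferType::ASCEND_CB, half>(0);
    AscendC::LocalTensor<half>  l0a = buf.GetBuffer<BufferType::ASCEND_L0A, half>(0);
    AscendC::LocalTensor<half>  l0b = buf.GetBuffer<BufferType::ASCEND_L0B, half>(0);
    AscendC::LocalTensor<float> l0c = buf.GetBuffer<BufferType::ASCEND_L0C, float>(0);
    (void)l1A; (void)l0a; (void)l0b; (void)l0c;
}
```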


@ -0,0 +1,67 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#ifndef INCLUDE_MMA_H
#define INCLUDE_MMA_H
#include "hardware.h"
#include "kernel_tensor.h"
template <ArchType ArchTag, typename ElementA, typename ElementB, typename AccDTypeC, bool IsTransposeA>
struct mmad {
__aicore__ mmad(AscendC::LocalTensor<AccDTypeC> l0cTensor, AscendC::LocalTensor<ElementA> l0aTensor,
AscendC::LocalTensor<ElementB> l0bTensor, uint32_t mTileActual, uint32_t nTileActual,
uint32_t kPartActual, bool initC, uint8_t unitFlag = 0) {};
__aicore__ mmad(AscendC::LocalTensor<AccDTypeC> l0cTensor, AscendC::LocalTensor<ElementA> l0aTensor,
AscendC::LocalTensor<ElementB> l0bTensor, uint64_t biasBt, uint32_t mTileActual,
uint32_t nTileActual, uint32_t kPartActual, bool initC, uint8_t unitFlag = 0) {};
};
// Partial specialization for non-transposed A
template <ArchType ArchTag, typename AccDTypeC, typename ElementA, typename ElementB>
struct mmad<ArchTag, ElementA, ElementB, AccDTypeC, false> {
__aicore__ mmad(AscendC::LocalTensor<AccDTypeC> l0cTensor, AscendC::LocalTensor<ElementA> l0aTensor,
AscendC::LocalTensor<ElementB> l0bTensor, uint32_t mTileActual, uint32_t nTileActual,
uint32_t kPartActual, bool initC, uint8_t unitFlag = 0)
{
AscendC::Mmad(l0cTensor, // C
l0aTensor, // A
l0bTensor, // B
AscendC::MmadParams(mTileActual, // m
nTileActual, // n
kPartActual, // k
unitFlag, // unitFlag
false, // cmatrixSource
initC)); // cmatrixInitVal
};
__aicore__ mmad(AscendC::LocalTensor<AccDTypeC> l0cTensor, AscendC::LocalTensor<ElementA> l0aTensor,
AscendC::LocalTensor<ElementB> l0bTensor, uint64_t biasBt, uint32_t mTileActual,
uint32_t nTileActual, uint32_t kPartActual, bool initC, uint8_t unitFlag = 0)
{
AscendC::LocalTensor<AccDTypeC> biasTensor;
biasTensor.InitBuffer(biasBt, nTileActual);
biasTensor.address_.logicPos = static_cast<uint8_t>(AscendC::TPosition::C2);
AscendC::Mmad(l0cTensor, // C
l0aTensor, // A
l0bTensor, // B
biasTensor, // bt
AscendC::MmadParams(mTileActual, // m
nTileActual, // n
kPartActual, // k
unitFlag, // unitFlag
true, // cmatrixSource
false)); // cmatrixInitVal
};
};
#endif
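
Not part of the original file: a hedged sketch tying the mmad wrapper to the staging iterators earlier in this commit; the 16x16x32 tile shape and wrapper name are illustrative.
```
// Hedged sketch: multiply an m x k A tile (L0A) by a k x n B tile (L0B) and start a fresh
// fp32 accumulation in L0C. The result could then be written out with l0c_to_gm.
__aicore__ inline void TileMatmulFp16(AscendC::LocalTensor<float> l0c,
                                      AscendC::LocalTensor<half> l0a,
                                      AscendC::LocalTensor<half> l0b)
{
    constexpr uint32_t m = 16, n = 16, k = 32;
    mmad<ArchType::ASCEND_V220, half, half, float, false>(l0c, l0a, l0b, m, n, k,
                                                          /*initC=*/true);
}
```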


@ -0,0 +1,38 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#ifndef INCLUDE_SET_FPC_H
#define INCLUDE_SET_FPC_H
#include "hardware.h"
#include "kernel_tensor.h"
/////////////////////////////////////////////////////
// SetQuantPreAddr
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DataType>
struct SetQuantPreAddr {
__aicore__ SetQuantPreAddr(AscendC::LocalTensor<DataType> quantPreTensor) {};
};
template <typename DataType>
struct SetQuantPreAddr<ArchType::ASCEND_V220, DataType> {
static constexpr uint32_t QUANT_PRE_ADDR_MASK = 0xffff;
static constexpr uint32_t USELESS_BIT_NUM = 7;
static constexpr uint32_t QUANT_PRE_BIT_POS_IN_FPC = 8;
__aicore__ SetQuantPreAddr(AscendC::LocalTensor<DataType> quantPreTensor)
{
uint64_t quantPreAddr = (uint64_t)(__fbuf__ uint64_t *)quantPreTensor.GetPhyAddr();
AscendC::SetFixPipeConfigImpl(quantPreTensor);
};
};
#endif
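
Not part of the original file: a hedged sketch combining l1_to_fb (earlier in this commit) with SetQuantPreAddr; it assumes `fbTensor` is bound to the fixpipe (C2PIPE2GM) position and that the packed per-channel dequant words are already staged in L1.
```
// Hedged sketch: stage packed per-channel dequant parameters into the fixpipe buffer and
// program the FPC quant-pre address before issuing a VDEQF16 fixpipe copy.
__aicore__ inline void ProgramDeqScale(AscendC::LocalTensor<uint64_t> &fbTensor,
                                       AscendC::LocalTensor<uint64_t> &l1DeqWords,
                                       uint16_t burstLen)
{
    l1_to_fb<ArchType::ASCEND_V220, uint64_t>(fbTensor, l1DeqWords, /*burstNum=*/1, burstLen,
                                              /*srcGap=*/0, /*dstGap=*/0);
    SetQuantPreAddr<ArchType::ASCEND_V220, uint64_t>(fbTensor);
}
```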


@ -0,0 +1,274 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#ifndef INCLUDE_SIMD_H
#define INCLUDE_SIMD_H
#include "hardware.h"
#include "kernel_operator.h"
/////////////////////////////////////////////////////
// vcgadd
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void cgadd_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src, const int32_t repeat,
const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride)
{
AscendC::BlockReduceSum<DType, false>(dst, src, repeat, 0, dstRepStride, srcBlkStride, srcRepStride);
}
/////////////////////////////////////////////////////
// vadd
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void add_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src0,
AscendC::LocalTensor<DType> src1, uint8_t repeat, uint8_t dstBlockStride,
uint8_t src0BlockStride, uint8_t src1BlockStride, uint8_t dstRepeatStride,
uint8_t src0RepeatStride, uint8_t src1RepeatStride)
{
AscendC::Add<DType, false>(dst, src0, src1, (uint64_t)0, repeat,
AscendC::BinaryRepeatParams(dstBlockStride, src0BlockStride, src1BlockStride,
dstRepeatStride, src0RepeatStride, src1RepeatStride));
}
/////////////////////////////////////////////////////
// vadds
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void adds_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src, DType scalarValue,
uint8_t repeat, uint8_t dstBlockStride, uint8_t srcBlockStride, uint8_t dstRepeatStride,
uint8_t srcRepeatStride)
{
AscendC::Adds<DType, false>(
dst, src, scalarValue, (uint64_t)0, repeat,
AscendC::UnaryRepeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride));
}
/////////////////////////////////////////////////////
// vcadd
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void cadd_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src, uint8_t repeat,
uint16_t dstRepeatStride, uint16_t srcBlockStride, uint16_t srcRepeatStride)
{
AscendC::RepeatReduceSum<DType, false>(dst, src, repeat, 0, 0, srcBlockStride, dstRepeatStride, srcRepeatStride);
}
/////////////////////////////////////////////////////
// vbrcb
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void brcb_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src, uint16_t dstBlockStride,
uint16_t dstRepeatStride, uint8_t repeat)
{
AscendC::Brcb(dst, src, repeat, AscendC::BrcbRepeatParams(dstBlockStride, dstRepeatStride));
}
/////////////////////////////////////////////////////
// vcmax
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType, AscendC::ReduceOrder OrderType>
__aicore__ inline void cmax_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src, uint8_t repeat,
uint16_t dstRepeatStride, uint16_t srcBlockStride, uint16_t srcRepeatStride)
{
#if defined(__DAV_C220_VEC__)
AscendC::WholeReduceMax<DType, false>(dst, src, (int32_t)0, repeat, dstRepeatStride, srcBlockStride,
srcRepeatStride, OrderType);
#else
AscendC::WholeReduceMax<DType, false>(dst, src, (int32_t)0, repeat, dstRepeatStride, srcBlockStride,
srcRepeatStride);
#endif
}
/////////////////////////////////////////////////////
// vconv
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DTypeIn, typename DTypeOut>
__aicore__ inline void conv_v(AscendC::LocalTensor<DTypeOut> dst, AscendC::LocalTensor<DTypeIn> src, uint8_t repeat,
uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride,
uint16_t srcRepeatStride)
{
if constexpr (std::is_same<DTypeIn, float>::value && std::is_same<DTypeOut, __bf16>::value) {
AscendC::Cast<DTypeOut, DTypeIn, false>(
dst, src, AscendC::RoundMode::CAST_RINT, (uint64_t)0, repeat,
AscendC::UnaryRepeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride));
} else {
AscendC::Cast<DTypeOut, DTypeIn, false>(
dst, src, AscendC::RoundMode::CAST_NONE, (uint64_t)0, repeat,
AscendC::UnaryRepeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride));
}
}
/////////////////////////////////////////////////////
// vconv_f322bf16r
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DTypeIn, typename DTypeOut>
__aicore__ inline void convr_v(AscendC::LocalTensor<DTypeOut> dst, AscendC::LocalTensor<DTypeIn> src, uint8_t repeat,
uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride,
uint16_t srcRepeatStride)
{
AscendC::Cast<DTypeOut, DTypeIn, false>(
dst, src, AscendC::RoundMode::CAST_RINT, (uint64_t)0, repeat,
AscendC::UnaryRepeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride));
}
/////////////////////////////////////////////////////
// vdiv
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void div_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src0,
AscendC::LocalTensor<DType> src1, uint8_t repeat, uint8_t dstBlockStride,
uint8_t src0BlockStride, uint8_t src1BlockStride, uint8_t dstRepeatStride,
uint8_t src0RepeatStride, uint8_t src1RepeatStride)
{
AscendC::Div<DType, false>(dst, src0, src1, (uint64_t)0, repeat,
AscendC::BinaryRepeatParams(dstBlockStride, src0BlockStride, src1BlockStride,
dstRepeatStride, src0RepeatStride, src1RepeatStride));
}
/////////////////////////////////////////////////////
// vexp
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void exp_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src, uint8_t repeat,
uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride,
uint16_t srcRepeatStride)
{
AscendC::Exp<DType, false>(
dst, src, (uint64_t)0, repeat,
AscendC::UnaryRepeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride));
}
/////////////////////////////////////////////////////
// vmax
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void max_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src0,
AscendC::LocalTensor<DType> src1, uint8_t repeat, uint8_t dstBlockStride,
uint8_t src0BlockStride, uint8_t src1BlockStride, uint8_t dstRepeatStride,
uint8_t src0RepeatStride, uint8_t src1RepeatStride)
{
AscendC::Max<DType, false>(dst, src0, src1, (uint64_t)0, repeat,
AscendC::BinaryRepeatParams(dstBlockStride, src0BlockStride, src1BlockStride,
dstRepeatStride, src0RepeatStride, src1RepeatStride));
}
/////////////////////////////////////////////////////
// vmul
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void mul_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src0,
AscendC::LocalTensor<DType> src1, uint8_t repeat, uint8_t dstBlockStride,
uint8_t src0BlockStride, uint8_t src1BlockStride, uint8_t dstRepeatStride,
uint8_t src0RepeatStride, uint8_t src1RepeatStride)
{
AscendC::Mul<DType, false>(dst, src0, src1, (uint64_t)0, repeat,
AscendC::BinaryRepeatParams(dstBlockStride, src0BlockStride, src1BlockStride,
dstRepeatStride, src0RepeatStride, src1RepeatStride));
}
/////////////////////////////////////////////////////
// vmuls
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void muls_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src0, DType src1,
uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride,
uint16_t dstRepeatStride, uint16_t srcRepeatStride)
{
AscendC::Muls<DType, false>(
dst, src0, src1, (uint64_t)0, repeat,
AscendC::UnaryRepeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride));
}
/////////////////////////////////////////////////////
// vsub
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void sub_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src0,
AscendC::LocalTensor<DType> src1, uint8_t repeat, uint8_t dstBlockStride,
uint8_t src0BlockStride, uint8_t src1BlockStride, uint8_t dstRepeatStride,
uint8_t src0RepeatStride, uint8_t src1RepeatStride)
{
AscendC::Sub<DType, false>(dst, src0, src1, (uint64_t)0, repeat,
AscendC::BinaryRepeatParams(dstBlockStride, src0BlockStride, src1BlockStride,
dstRepeatStride, src0RepeatStride, src1RepeatStride));
}
/////////////////////////////////////////////////////
// vmaxs
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void maxs_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src0, DType src1,
uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride,
uint16_t dstRepeatStride, uint16_t srcRepeatStride)
{
AscendC::Maxs<DType, false>(
dst, src0, src1, (uint64_t)0, repeat,
AscendC::UnaryRepeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride));
}
/////////////////////////////////////////////////////
// vmins
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void mins_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src0, DType src1,
uint8_t repeat, uint16_t dstBlockStride, uint16_t srcBlockStride,
uint16_t dstRepeatStride, uint16_t srcRepeatStride)
{
AscendC::Mins<DType, false>(
dst, src0, src1, (uint64_t)0, repeat,
AscendC::UnaryRepeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride));
}
/////////////////////////////////////////////////////
// vsqrt
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void sqrt_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src, uint8_t repeat,
uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride,
uint16_t srcRepeatStride)
{
AscendC::Sqrt<DType, false>(
dst, src, (uint64_t)0, repeat,
AscendC::UnaryRepeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride));
}
/////////////////////////////////////////////////////
// vln
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void ln_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src, uint8_t repeat,
uint16_t dstBlockStride, uint16_t srcBlockStride, uint16_t dstRepeatStride,
uint16_t srcRepeatStride)
{
AscendC::Ln<DType, false>(
dst, src, (uint64_t)0, repeat,
AscendC::UnaryRepeatParams(dstBlockStride, srcBlockStride, dstRepeatStride, srcRepeatStride));
}
/////////////////////////////////////////////////////
// vtranspose
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void tranpose_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src)
{
AscendC::Transpose(dst, src);
}
/////////////////////////////////////////////////////
// vcgmax
/////////////////////////////////////////////////////
template <ArchType ArchTag, typename DType>
__aicore__ inline void cgmax_v(AscendC::LocalTensor<DType> dst, AscendC::LocalTensor<DType> src, const int32_t repeat,
const int32_t dstRepStride, const int32_t srcBlkStride, const int32_t srcRepStride)
{
AscendC::BlockReduceMax<DType, false>(dst, src, repeat, 0, dstRepStride, srcBlkStride, srcRepStride);
}
#endif
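A minimal sketch of how these wrappers compose, assuming xLocal/yLocal are contiguous float tensors already resident in UB and that the vector mask has been programmed beforehand (the wrappers instantiate the AscendC intrinsics with isSetMask = false); the function name and stride choices are illustrative, not from the original file.
// Illustrative only: compute y = scale * exp(x) over contiguous float data.
// Block stride 1 and repeat stride 8 (8 blocks, i.e. 64 float elements per repeat)
// assume a densely packed layout; the caller is assumed to have set the vector mask.
template <ArchType ArchTag>
__aicore__ inline void scaled_exp_example(AscendC::LocalTensor<float> yLocal,
                                          AscendC::LocalTensor<float> xLocal,
                                          float scale, uint8_t repeat)
{
    exp_v<ArchTag, float>(yLocal, xLocal, repeat, 1, 1, 8, 8);
    muls_v<ArchTag, float>(yLocal, yLocal, scale, repeat, 1, 1, 8, 8);
}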


@ -0,0 +1,69 @@
/* Adapted from
* https://gitee.com/ascend/ascend-transformer-boost.git
*
* Copyright (c) 2024 Huawei Technologies Co., Ltd.
* This file is a part of the CANN Open Software.
* Licensed under CANN Open Software License Agreement Version 1.0 (the "License").
* Please refer to the License for details. You may not use this file except in compliance with the License.
* THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
* INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
* See LICENSE in the root of the software repository for the full text of the License.
*/
#ifndef INCLUDE_UTILS_H
#define INCLUDE_UTILS_H
template <typename IN_DTYPE>
__aicore__ inline void CreateCaMatrix(const AscendC::LocalTensor<IN_DTYPE> &dst, const uint16_t repeats,
const uint16_t blockNum, const uint16_t dstGap, const IN_DTYPE initValue)
{
AscendC::InitConstValue<IN_DTYPE>(dst,
AscendC::InitConstValueParams<IN_DTYPE>(repeats, blockNum, dstGap, initValue));
}
__aicore__ inline void SetFftsBaseAddr(uint64_t config)
{
AscendC::SetSyncBaseAddr(config);
}
template <typename IN_DTYPE>
__aicore__ inline void SetPadding(IN_DTYPE padValue)
{
AscendC::SetLoadDataPaddingValue<IN_DTYPE>(padValue);
}
__aicore__ inline void SetAtomicnone()
{
AscendC::SetAtomicNone();
}
__aicore__ inline void SetMasknorm()
{
#if __CCE_AICORE__ == 100
return;
#endif
AscendC::SetMaskNorm();
}
__aicore__ inline void SetNdpara(uint16_t ndNum, uint16_t srcNdStride, uint16_t dstNdStride)
{
AscendC::SetFixpipeNz2ndFlag(ndNum, srcNdStride, dstNdStride);
}
template <typename IN_DTYPE>
__aicore__ inline void SetVectorMask(const uint64_t maskHigh, const uint64_t maskLow)
{
AscendC::SetVectorMask<IN_DTYPE>(maskHigh, maskLow);
}
__aicore__ inline int64_t GetSubBlockidx()
{
return AscendC::GetSubBlockIdx();
}
__aicore__ inline void WaitFlagDev(uint16_t flagId)
{
AscendC::WaitEvent(flagId);
}
template <pipe_t pipe, uint8_t mode>
__aicore__ inline void FftsCrossCoreSync(uint16_t flagId)
{
AscendC::CrossCoreSetFlag<mode, pipe>(flagId);
}
template <typename IN_DTYPE, bool setRelu = false>
__aicore__ inline void SetFpc(const AscendC::LocalTensor<IN_DTYPE> &preTensor, bool isUnitFlag = false)
{
AscendC::SetFixPipeConfig<IN_DTYPE, setRelu>(preTensor, isUnitFlag);
}
#endif
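A small sketch of a typical kernel prologue built from the scalar helpers above; the particular combination of helpers and the full-mask value are assumptions for illustration only.
// Illustrative prologue for a vector kernel: clear any pending atomic mode, select
// normal-mode masking, and open the full 128-element mask for 16-bit data.
__aicore__ inline void vec_kernel_prologue()
{
    SetAtomicnone();
    SetMasknorm();
    SetVectorMask<half>((uint64_t)-1, (uint64_t)-1);
}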

Some files were not shown because too many files have changed in this diff.