diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..38c0b58e8 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,28 @@ + +### What this PR does / why we need it? + + +### Does this PR introduce _any_ user-facing change? + + +### How was this patch tested? + + diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml new file mode 100644 index 000000000..1161a6e21 --- /dev/null +++ b/.github/workflows/actionlint.yml @@ -0,0 +1,57 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: Lint GitHub Actions workflows +on: + push: + branches: + - "main" + paths: + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' + pull_request: + branches: + - "main" + paths: + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + actionlint: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Run actionlint" + run: | + echo "::add-matcher::.github/workflows/matchers/actionlint.json" + tools/actionlint.sh -color diff --git a/.github/workflows/matchers/actionlint.json b/.github/workflows/matchers/actionlint.json new file mode 100644 index 000000000..4613e1617 --- /dev/null +++ b/.github/workflows/matchers/actionlint.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "actionlint", + "pattern": [ + { + "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json new file mode 100644 index 000000000..f048fce52 --- /dev/null +++ b/.github/workflows/matchers/mypy.json @@ -0,0 +1,16 @@ +{ + "problemMatcher": [ + { + "owner": "mypy", + "pattern": [ + { + "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", + "file": 1, + "line": 2, + "severity": 3, + "message": 4 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json new file mode 100644 index 000000000..f6d4479ee --- /dev/null +++ b/.github/workflows/matchers/ruff.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "ruff", + "pattern": [ + { + "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ] + } + ] + } diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml new file mode 100644 index 000000000..ec9c2e6f5 --- /dev/null +++ b/.github/workflows/mypy.yaml @@ -0,0 
+1,74 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: mypy + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - "main" + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + pull_request: + branches: + - "main" + # This workflow is only relevant when one of the following files changes. + # However, we have github configured to expect and require this workflow + # to run and pass before github with auto-merge a pull request. Until github + # allows more flexible auto-merge policy, we can just run this on every PR. + # It doesn't take that long to run, anyway. + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + +jobs: + mypy: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install -r requirements-dev.txt + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + path: vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: vllm-empty + run: | + pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + VLLM_TARGET_DEVICE=empty pip install . + + - name: Mypy + run: | + echo "::add-matcher::.github/workflows/matchers/mypy.json" + tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml new file mode 100644 index 000000000..11573a84a --- /dev/null +++ b/.github/workflows/ruff.yml @@ -0,0 +1,57 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: ruff + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - "main" + paths: + - "**/*.py" + - requirements-lint.txt + - .github/workflows/matchers/ruff.json + - .github/workflows/ruff.yml + pull_request: + branches: + - "main" + +jobs: + ruff: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Analysing the code with ruff + run: | + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . + - name: Run isort + run: | + isort . --check-only diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml new file mode 100644 index 000000000..6a8ff7a28 --- /dev/null +++ b/.github/workflows/shellcheck.yml @@ -0,0 +1,54 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: Lint shell scripts +on: + push: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + pull_request: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + shellcheck: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Check shell scripts" + run: | + tools/shellcheck.sh diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml new file mode 100644 index 000000000..bea98471a --- /dev/null +++ b/.github/workflows/vllm_ascend_test.yaml @@ -0,0 +1,106 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: 'e2e test' + +on: + push: + branches: + - "main" + paths: + - '*.txt' + - '**/*.py' + - '.github/workflows/vllm_ascend_test.yaml' + pull_request: + branches: + - "main" + paths: + - '*.txt' + - '**/*.py' + - '.github/workflows/vllm_ascend_test.yaml' + +# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly +# declared as "shell: bash -el {0}" on steps that need to be properly activated. +# It's used to activate ascend-toolkit environment variables. +defaults: + run: + shell: bash -el {0} + +jobs: + test: + name: vLLM Ascend test (self-host) + runs-on: ascend-arm64 # actionlint-ignore: runner-label + + container: + image: quay.io/ascend/cann:8.0.rc3.beta1-910b-ubuntu22.04-py3.10 + volumes: + - /usr/local/dcmi:/usr/local/dcmi + - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi + - /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ + # Use self-host cache speed up pip and model download + - /home/action/actions-runner/_work/cache:/github/home/.cache/ + options: >- + --device /dev/davinci6 + --device /dev/davinci_manager + --device /dev/devmm_svm + --device /dev/hisi_hdc + env: + HF_ENDPOINT: https://hf-mirror.com + steps: + - name: Check npu driver + run: | + npu-smi info + + - name: Config mirrors + run: | + sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + + - name: Checkout vllm-project/vllm-ascend repo + uses: actions/checkout@v4 + + - name: Install system dependencies + run: | + apt-get update -y + apt-get -y install `cat packages.txt` + + - name: Install dependencies + run: | + pip install -r requirements-dev.txt + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + path: ./vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: ./vllm-empty + run: | + VLLM_TARGET_DEVICE=empty pip install -e . + + - name: Install vllm-project/vllm-ascend + run: | + pip install -e . + + - name: Run vllm-project/vllm-ascend test + run: | + pytest -sv tests + + - name: Run vllm-project/vllm test + run: | + bash tools/npu-vllm-test.sh diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml new file mode 100644 index 000000000..14a3ae925 --- /dev/null +++ b/.github/workflows/yapf.yml @@ -0,0 +1,54 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: yapf + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - "main" + paths: + - "**/*.py" + - .github/workflows/yapf.yml + pull_request: + branches: + - "main" + paths: + - "**/*.py" + - .github/workflows/yapf.yml + +jobs: + yapf: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install yapf==0.32.0 + - name: Running yapf + run: | + yapf --diff --recursive . diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..3991ac8f0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,194 @@ +## vLLM Ascend Ignore +# VSCode +.vscode/ + +# egg-info +vllm_ascend.egg-info/ + +# DS Store +.DS_Store + +# Linting +actionlint +shellcheck*/ + + +# Python gitignore +## Adapted from: +## https://github.com/github/gitignore/blob/main/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
+#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..f801b5f8f --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ + +# vLLM Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socioeconomic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. 
+ +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official email address, +posting via an official social media account, or acting as an appointed +representative at an online or offline/IRL event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement in the #code-of-conduct +channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g). +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), +version 2.1, available at +[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html). + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion). + +For answers to common questions about this code of conduct, see the +[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at +[Contributor Covenant translations](https://www.contributor-covenant.org/translations). 
+
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 000000000..c7d45f682
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,107 @@
+# Contributing to vLLM Ascend plugin
+
+## Building and testing
+It's recommended to set up a local development environment to build and test
+before you submit a PR.
+
+### Prepare environment and build
+
+Theoretically, the vllm-ascend build is only supported on Linux because
+the `vllm-ascend` dependency `torch_npu` only supports Linux.
+
+However, you can still set up a development environment on Linux/Windows/macOS for linting and basic
+tests with the following commands:
+
+```bash
+# Choose a base dir (~/vllm-project/) and set up venv
+cd ~/vllm-project/
+python3 -m venv .venv
+source ./.venv/bin/activate
+
+# Clone vllm code and install
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+pip install -r requirements-build.txt
+VLLM_TARGET_DEVICE="empty" pip install .
+cd ..
+
+# Clone vllm-ascend and install
+git clone https://github.com/vllm-project/vllm-ascend.git
+cd vllm-ascend
+pip install -r requirements-dev.txt
+
+# Then you can run lint and mypy checks
+bash format.sh
+
+# Build:
+# - only supported on Linux (torch_npu available)
+# pip install -e .
+# - build without deps for debugging on other OSes
+# pip install -e . --no-deps
+
+# Commit changed files using `-s`
+git commit -sm "your commit info"
+```
+
+### Testing
+
+Although the vllm-ascend CI provides integration tests on [Ascend](.github/workflows/vllm_ascend_test.yaml), you can also run them
+locally. The simplest way to run these integration tests locally is through a container:
+
+```bash
+# Under Ascend NPU environment
+git clone https://github.com/vllm-project/vllm-ascend.git
+cd vllm-ascend
+
+IMAGE=vllm-ascend-dev-image
+CONTAINER_NAME=vllm-ascend-dev
+DEVICE=/dev/davinci1
+
+# The first build will take about 10 mins (10MB/s) to download the base image and packages
+docker build -t $IMAGE -f ./Dockerfile .
+# You can also specify a mirror repo via VLLM_REPO to speed up the build
+# docker build -t $IMAGE -f ./Dockerfile . --build-arg VLLM_REPO=https://gitee.com/mirrors/vllm
+
+docker run --name $CONTAINER_NAME --network host --device $DEVICE \
+    --device /dev/davinci_manager --device /dev/devmm_svm \
+    --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi \
+    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+    -ti --rm $IMAGE bash
+
+cd vllm-ascend
+pip install -r requirements-dev.txt
+
+pytest tests/
+```
+
+## DCO and Signed-off-by
+
+When contributing changes to this project, you must agree to the DCO. Commits must include a `Signed-off-by:` header which certifies agreement with the terms of the DCO.
+
+Using `-s` with `git commit` will automatically add this header.
+
+## PR Title and Classification
+
+Only specific types of PRs will be reviewed. The PR title should be prefixed appropriately to indicate the type of change. Please use one of the following:
+
+- `[Attention]` for new features or optimization in attention.
+- `[Communicator]` for new features or optimization in communicators.
+- `[ModelRunner]` for new features or optimization in the model runner.
+- `[Platform]` for new features or optimization in the platform.
+- `[Worker]` for new features or optimization in the worker.
+- `[Core]` for new features or optimization in the core vllm-ascend logic (such as platform, attention, communicators, model runner).
+- `[Kernel]` for changes affecting compute kernels and ops.
+- `[Bugfix]` for bug fixes.
+- `[Doc]` for documentation fixes and improvements. +- `[Test]` for tests (such as unit tests). +- `[CI]` for build or continuous integration improvements. +- `[Misc]` for PRs that do not fit the above categories. Please use this sparingly. + +> [!NOTE] +> If the PR spans more than one category, please include all relevant prefixes. + +## Others + +You may find more information about contributing to vLLM Ascend backend plugin on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html). +If you find any problem when contributing, you can feel free to submit a PR to improve the doc to help other developers. diff --git a/DCO b/DCO new file mode 100644 index 000000000..49b8cb054 --- /dev/null +++ b/DCO @@ -0,0 +1,34 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..63c8bd685 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,40 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +FROM quay.io/ascend/cann:8.0.rc3.beta1-910b-ubuntu22.04-py3.10 + +# Define environments +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update -y && \ + apt-get install -y python3-pip git vim + +WORKDIR /workspace + +COPY . 
/workspace/vllm-ascend/ + +RUN pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + +# Install vLLM main +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +RUN git clone --depth 1 $VLLM_REPO /workspace/vllm +RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/ + +# Install vllm-ascend main +RUN python3 -m pip install /workspace/vllm-ascend/ + +CMD ["/bin/bash"] diff --git a/README.md b/README.md new file mode 100644 index 000000000..c16e8372d --- /dev/null +++ b/README.md @@ -0,0 +1,102 @@ +
+<p align="center">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="./docs/logos/vllm-ascend-logo-text-dark.png">
+    <img alt="vllm-ascend" src="./docs/logos/vllm-ascend-logo-text-light.png">
+  </picture>
+</p>
+
+<h3 align="center">
+vLLM Ascend Plugin
+</h3>
+
+<p align="center">
+| About Ascend | Developer Slack (#sig-ascend) |
+</p>
+
+---
+*Latest News* πŸ”₯
+
+- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
+---
+## Overview
+
+vLLM Ascend plugin (`vllm-ascend`) is a backend plugin for running vLLM on the Ascend NPU.
+
+This plugin is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU from vLLM.
+
+With the vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Experts, Embedding, and Multi-modal LLMs, can run seamlessly on the Ascend NPU.
+
+## Prerequisites
+### Supported Devices
+- Atlas A2 Training series (Atlas 800T A2, Atlas 900 A2 PoD, Atlas 200T A2 Box16, Atlas 300T A2)
+- Atlas 800I A2 Inference series (Atlas 800I A2)
+
+### Dependencies
+| Requirement | Supported version | Recommended version | Note |
+|-------------|-------------------| ----------- |------------------------------------------|
+| vLLM | main | main | Required for vllm-ascend |
+| Python | >= 3.9 | [3.10](https://www.python.org/downloads/) | Required for vllm |
+| CANN | >= 8.0.RC2 | [8.0.RC3](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.0.beta1) | Required for vllm-ascend and torch-npu |
+| torch-npu | >= 2.4.0 | [2.5.1rc1](https://gitee.com/ascend/pytorch/releases/tag/v6.0.0.alpha001-pytorch2.5.1) | Required for vllm-ascend |
+| torch | >= 2.4.0 | [2.5.1](https://github.com/pytorch/pytorch/releases/tag/v2.5.1) | Required by torch-npu and vllm |
+
+Find more about how to set up your environment [here](docs/environment.md).
+
+## Getting Started
+
+> [!NOTE]
+> Currently, we are actively collaborating with the vLLM community to support the Ascend backend plugin. Once it is supported, you will be able to complete the installation with the one-line command `pip install vllm vllm-ascend`.
+
+Installation from source code:
+```bash
+# Install vllm main branch according to:
+# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html#build-wheel-from-source
+git clone --depth 1 https://github.com/vllm-project/vllm.git
+cd vllm
+pip install -r requirements-build.txt
+VLLM_TARGET_DEVICE=empty pip install .
+
+# Install vllm-ascend main branch
+git clone https://github.com/vllm-project/vllm-ascend.git
+cd vllm-ascend
+pip install -e .
+```
+
+Run the following command to start the vLLM server with the [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) model:
+
+```bash
+# export VLLM_USE_MODELSCOPE=true to speed up download
+vllm serve Qwen/Qwen2.5-0.5B-Instruct
+curl http://localhost:8000/v1/models
+```
+
+Please refer to [vLLM Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html) for more details.
+
+## Building
+
+#### Build Python package from source
+
+```bash
+git clone https://github.com/vllm-project/vllm-ascend.git
+cd vllm-ascend
+pip install -e .
+```
+
+#### Build container image from source
+```bash
+git clone https://github.com/vllm-project/vllm-ascend.git
+cd vllm-ascend
+docker build -t vllm-ascend-dev-image -f ./Dockerfile .
+```
+
+See [Building and Testing](./CONTRIBUTING.md) for more details; it is a step-by-step guide to help you set up a development environment, build, and test.
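+
+To sanity-check an installation built from source, you can also run a short offline-inference script. The snippet below is a minimal sketch based on `examples/offline_inference_npu.py` shipped in this repository:
+
+```python
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "Hello, my name is",
+    "The future of AI is",
+]
+
+# Greedy decoding, up to 100 new tokens per prompt.
+sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+
+# vllm-ascend is discovered automatically via the vllm.platform_plugins entry point.
+llm = LLM(model="facebook/opt-125m")
+
+for output in llm.generate(prompts, sampling_params):
+    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
+```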
+
+## Contributing
+We welcome and value any contributions and collaborations:
+- Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues).
+- Please see the guidance on how to contribute in [CONTRIBUTING.md](./CONTRIBUTING.md).
+
+## License
+
+Apache License 2.0, as found in the [LICENSE](./LICENSE) file.
diff --git a/docs/environment.md b/docs/environment.md
new file mode 100644
index 000000000..5dd70b29a
--- /dev/null
+++ b/docs/environment.md
@@ -0,0 +1,38 @@
+### Prepare Ascend NPU environment
+
+### Dependencies
+| Requirement | Supported version | Recommended version | Note |
+| ------------ | ------- | ----------- | ----------- |
+| Python | >= 3.9 | [3.10](https://www.python.org/downloads/) | Required for vllm |
+| CANN | >= 8.0.RC2 | [8.0.RC3](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.0.beta1) | Required for vllm-ascend and torch-npu |
+| torch-npu | >= 2.4.0 | [2.5.1rc1](https://gitee.com/ascend/pytorch/releases/tag/v6.0.0.alpha001-pytorch2.5.1) | Required for vllm-ascend |
+| torch | >= 2.4.0 | [2.5.1](https://github.com/pytorch/pytorch/releases/tag/v2.5.1) | Required by torch-npu and vllm |
+
+
+Below is a quick guide to installing the recommended versions:
+
+#### Containerized installation
+
+You can use the [container image](https://hub.docker.com/r/ascendai/cann) directly with a one-line command:
+
+```bash
+docker run \
+    --name vllm-ascend-env \
+    --device /dev/davinci1 \
+    --device /dev/davinci_manager \
+    --device /dev/devmm_svm \
+    --device /dev/hisi_hdc \
+    -v /usr/local/dcmi:/usr/local/dcmi \
+    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+    -v /etc/ascend_install.info:/etc/ascend_install.info \
+    -it quay.io/ascend/cann:8.0.rc3.beta1-910b-ubuntu22.04-py3.10 bash
+```
+
+You do not need to install `torch` and `torch_npu` manually; they will be installed automatically as `vllm-ascend` dependencies.
+
+#### Manual installation
+
+Alternatively, follow the instructions provided in the [Ascend Installation Guide](https://ascend.github.io/docs/sources/ascend/quick_install.html) to set up the environment.
+
diff --git a/docs/logos/vllm-ascend-logo-text-dark.png b/docs/logos/vllm-ascend-logo-text-dark.png
new file mode 100644
index 000000000..f534d09ee
Binary files /dev/null and b/docs/logos/vllm-ascend-logo-text-dark.png differ
diff --git a/docs/logos/vllm-ascend-logo-text-light.png b/docs/logos/vllm-ascend-logo-text-light.png
new file mode 100644
index 000000000..b71b49267
Binary files /dev/null and b/docs/logos/vllm-ascend-logo-text-light.png differ
diff --git a/examples/offline_distributed_inference_npu.py b/examples/offline_distributed_inference_npu.py
new file mode 100644
index 000000000..f8d5489a5
--- /dev/null
+++ b/examples/offline_distributed_inference_npu.py
@@ -0,0 +1,45 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from vllm import LLM, SamplingParams + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +# Create a sampling params object. +sampling_params = SamplingParams(max_tokens=100, temperature=0.0) +# Create an LLM. +# TODO (cmq): ray is not supported currently, need some fixes +llm = LLM( + model="facebook/opt-125m", + tensor_parallel_size=2, + distributed_executor_backend="mp", + trust_remote_code=True, +) + +# Generate texts from the prompts. +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py new file mode 100644 index 000000000..785492c7d --- /dev/null +++ b/examples/offline_inference_audio_language.py @@ -0,0 +1,153 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/examples/offline_inference/audio_language.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on audio language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" + +from transformers import AutoTokenizer +from vllm import LLM, SamplingParams +from vllm.assets.audio import AudioAsset +from vllm.utils import FlexibleArgumentParser + +audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] +question_per_audio_count = { + 0: "What is 1+1?", + 1: "What is recited in the audio?", + 2: "What sport and what nursery rhyme are referenced?" +} + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. 
+ + +# Ultravox 0.3 +def run_ultravox(question: str, audio_count: int): + model_name = "fixie-ai/ultravox-v0_3" + + tokenizer = AutoTokenizer.from_pretrained(model_name) + messages = [{ + 'role': 'user', + 'content': "<|audio|>\n" * audio_count + question + }] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + llm = LLM(model=model_name, + max_model_len=4096, + max_num_seqs=5, + trust_remote_code=True, + limit_mm_per_prompt={"audio": audio_count}) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# Qwen2-Audio +def run_qwen2_audio(question: str, audio_count: int): + model_name = "Qwen/Qwen2-Audio-7B-Instruct" + + llm = LLM(model=model_name, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}) + + audio_in_prompt = "".join([ + f"Audio {idx+1}: " + f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count) + ]) + + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# TODO (cmq): test ultravox +model_example_map = { + # "ultravox": run_ultravox, + "qwen2_audio": run_qwen2_audio +} + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + + audio_count = args.num_audios + llm, prompt, stop_token_ids = model_example_map[model]( + question_per_audio_count[audio_count], audio_count) + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. + sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=stop_token_ids) + + mm_data = {} + if audio_count > 0: + mm_data = { + "audio": [ + asset.audio_and_sample_rate + for asset in audio_assets[:audio_count] + ] + } + + assert args.num_prompts > 0 + inputs = {"prompt": prompt, "multi_modal_data": mm_data} + if args.num_prompts > 1: + # Batch inference + inputs = [inputs] * args.num_prompts + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'audio language models') + parser.add_argument('--model-type', + '-m', + type=str, + default="qwen2_audio", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument("--num-audios", + type=int, + default=1, + choices=[0, 1, 2], + help="Number of audio items per prompt.") + + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference_npu.py b/examples/offline_inference_npu.py new file mode 100644 index 000000000..10c2c6e40 --- /dev/null +++ b/examples/offline_inference_npu.py @@ -0,0 +1,39 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/examples/offline_inference/basic.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from vllm import LLM, SamplingParams + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +# Create a sampling params object. +sampling_params = SamplingParams(max_tokens=100, temperature=0.0) +# Create an LLM. +llm = LLM(model="facebook/opt-125m") + +# Generate texts from the prompts. +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/format.sh b/format.sh new file mode 100755 index 000000000..9ea7495c2 --- /dev/null +++ b/format.sh @@ -0,0 +1,341 @@ +#!/usr/bin/env bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# YAPF formatter, adapted from ray and skypilot. +# +# Usage: +# # Do work and commit your work. + +# # Format files that differ from origin/main. +# bash format.sh + +# # Commit changed files with message 'Run yapf and ruff' +# +# +# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. +# You are encouraged to run this locally before pushing changes for review. + +# Cause the script to exit if a single command fails +set -eo pipefail + +# this stops git rev-parse from failing if we run this from the .git directory +builtin cd "$(dirname "${BASH_SOURCE:-$0}")" +ROOT="$(git rev-parse --show-toplevel)" +builtin cd "$ROOT" || exit 1 + +check_command() { + if ! command -v "$1" &> /dev/null; then + echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`" + exit 1 + fi +} + +check_command yapf +check_command ruff +check_command mypy +check_command codespell +check_command isort +check_command clang-format + +YAPF_VERSION=$(yapf --version | awk '{print $2}') +RUFF_VERSION=$(ruff --version | awk '{print $2}') +MYPY_VERSION=$(mypy --version | awk '{print $2}') +CODESPELL_VERSION=$(codespell --version) +ISORT_VERSION=$(isort --vn) +CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') +SPHINX_LINT_VERSION=$(sphinx-lint --version | awk '{print $2}') + +# params: tool name, tool version, required version +tool_version_check() { + expected=$(grep "$1" requirements-lint.txt | cut -d'=' -f3) + if [[ "$2" != "$expected" ]]; then + echo "❓❓Wrong $1 version installed: $expected is required, not $2." 
+ exit 1 + fi +} + +tool_version_check "yapf" "$YAPF_VERSION" +tool_version_check "ruff" "$RUFF_VERSION" +tool_version_check "mypy" "$MYPY_VERSION" +tool_version_check "isort" "$ISORT_VERSION" +tool_version_check "codespell" "$CODESPELL_VERSION" +tool_version_check "clang-format" "$CLANGFORMAT_VERSION" +tool_version_check "sphinx-lint" "$SPHINX_LINT_VERSION" + +YAPF_FLAGS=( + '--recursive' + '--parallel' +) + +YAPF_EXCLUDES=( + '--exclude' 'build/**' +) + +# Format specified files +format() { + yapf --in-place "${YAPF_FLAGS[@]}" "$@" +} + +# Format files that differ from main branch. Ignores dirs that are not slated +# for autoformat yet. +format_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause yapf to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}" + fi + +} + +# Format all files +format_all() { + yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" . +} + +## This flag formats individual files. --files *must* be the first command line +## arg to use this option. +if [[ "$1" == '--files' ]]; then + format "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is formatted. +elif [[ "$1" == '--all' ]]; then + format_all +else + # Format only the files that changed in last commit. + format_changed +fi +echo 'vLLM yapf: Done' + +# Run mypy +echo 'vLLM mypy:' +tools/mypy.sh +echo 'vLLM mypy: Done' + + +# If git diff returns a file that is in the skip list, the file may be checked anyway: +# https://github.com/codespell-project/codespell/issues/1915 +# Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem +CODESPELL_EXCLUDES=( + '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**' +) + +# check spelling of specified files +spell_check() { + codespell "$@" +} + +spell_check_all(){ + codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" +} + +# Spelling check of files that differ from main branch. +spell_check_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause ruff to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + codespell "${CODESPELL_EXCLUDES[@]}" + fi +} + +# Run Codespell +## This flag runs spell check of individual files. --files *must* be the first command line +## arg to use this option. +if [[ "$1" == '--files' ]]; then + spell_check "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is linted. +elif [[ "$1" == '--all' ]]; then + spell_check_all +else + # Check spelling only of the files that changed in last commit. 
+ spell_check_changed +fi +echo 'vLLM codespell: Done' + + +# Lint specified files +lint() { + ruff check "$@" +} + +# Lint files that differ from main branch. Ignores dirs that are not slated +# for autolint yet. +lint_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause ruff to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + ruff check + fi + +} + +# Run Ruff +### This flag lints individual files. --files *must* be the first command line +### arg to use this option. +if [[ "$1" == '--files' ]]; then + lint "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is linted. +elif [[ "$1" == '--all' ]]; then + lint vllm tests +else + # Format only the files that changed in last commit. + lint_changed +fi +echo 'vLLM ruff: Done' + +# check spelling of specified files +isort_check() { + isort "$@" +} + +isort_check_all(){ + isort . +} + +# Spelling check of files that differ from main branch. +isort_check_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause ruff to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + isort + fi +} + +# Run Isort +# This flag runs spell check of individual files. --files *must* be the first command line +# arg to use this option. +if [[ "$1" == '--files' ]]; then + isort_check "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is linted. +elif [[ "$1" == '--all' ]]; then + isort_check_all +else + # Check spelling only of the files that changed in last commit. + isort_check_changed +fi +echo 'vLLM isort: Done' + +# Clang-format section +# Exclude some files for formatting because they are vendored +# NOTE: Keep up to date with .github/workflows/clang-format.yml +CLANG_FORMAT_EXCLUDES=( + 'csrc/moe/topk_softmax_kernels.cu' + 'csrc/quantization/gguf/ggml-common.h' + 'csrc/quantization/gguf/dequantize.cuh' + 'csrc/quantization/gguf/vecdotq.cuh' + 'csrc/quantization/gguf/mmq.cuh' + 'csrc/quantization/gguf/mmvq.cuh' +) + +# Format specified files with clang-format +clang_format() { + clang-format -i "$@" +} + +# Format files that differ from main branch with clang-format. +clang_format_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause clang-format to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that + # exist on both branches. 
+ MERGEBASE="$(git merge-base origin/main HEAD)" + + # Get the list of changed files, excluding the specified ones + changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | (grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") || echo -e)) + if [ -n "$changed_files" ]; then + echo "$changed_files" | xargs -P 5 clang-format -i + fi +} + +# Format all files with clang-format +clang_format_all() { + find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ + | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") \ + | xargs clang-format -i +} + +# Run clang-format +if [[ "$1" == '--files' ]]; then + clang_format "${@:2}" +elif [[ "$1" == '--all' ]]; then + clang_format_all +else + clang_format_changed +fi +echo 'vLLM clang-format: Done' + +echo 'vLLM actionlint:' +tools/actionlint.sh -color +echo 'vLLM actionlint: Done' + +echo 'vLLM shellcheck:' +tools/shellcheck.sh +echo 'vLLM shellcheck: Done' + +echo 'excalidraw png check:' +tools/png-lint.sh +echo 'excalidraw png check: Done' + +if ! git diff --quiet &>/dev/null; then + echo + echo "πŸ”πŸ”There are files changed by the format checker or by you that are not added and committed:" + git --no-pager diff --name-only + echo "πŸ”πŸ”Format checker passed, but please add, commit and push all the files above to include changes made by the format checker." + + exit 1 +else + echo "βœ¨πŸŽ‰ Format check passed! Congratulations! πŸŽ‰βœ¨" +fi + +# echo 'vLLM sphinx-lint:' +# tools/sphinx-lint.sh +# echo 'vLLM sphinx-lint: Done' diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 000000000..b627e7f51 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,14 @@ +[mypy] +; warn_return_any = True +warn_unused_configs = True + +; Suppress all missing import errors from torch_npu for mypy. +[mypy-torch_npu.*] +ignore_missing_imports = True + +[mypy-transformers.*] +ignore_missing_imports = True + +; Remove this after https://github.com/vllm-project/vllm/pull/11324 merged +[mypy-vllm.distributed.device_communicators.base_communicator] +ignore_missing_imports = True diff --git a/packages.txt b/packages.txt new file mode 100644 index 000000000..c6490115b --- /dev/null +++ b/packages.txt @@ -0,0 +1,3 @@ +git +vim + diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 000000000..60a78830d --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +-r requirements-lint.txt +modelscope +pytest diff --git a/requirements-lint.txt b/requirements-lint.txt new file mode 100644 index 000000000..711bb50a0 --- /dev/null +++ b/requirements-lint.txt @@ -0,0 +1,15 @@ +# formatting +yapf==0.32.0 +toml==0.10.2 +tomli==2.0.2 +ruff==0.6.5 +codespell==2.3.0 +isort==5.13.2 +clang-format==18.1.5 +sphinx-lint==1.0.0 + +# type checking +mypy==1.11.1 +types-PyYAML +types-requests +types-setuptools diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..51cb33f2b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +decorator +pyyaml +scipy +setuptools +torch_npu == 2.5.1rc1 diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..4aaab9907 --- /dev/null +++ b/setup.py @@ -0,0 +1,95 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/blob/main/setup.py +# Copyright 2023 The vLLM team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +from typing import List + +from setuptools import setup + +ROOT_DIR = os.path.dirname(__file__) + + +def get_path(*filepath) -> str: + return os.path.join(ROOT_DIR, *filepath) + + +def read_readme() -> str: + """Read the README file if present.""" + p = get_path("README.md") + if os.path.isfile(p): + with open(get_path("README.md"), encoding="utf-8") as f: + return f.read() + else: + return "" + + +def get_requirements() -> List[str]: + """Get Python package dependencies from requirements.txt.""" + + def _read_requirements(filename: str) -> List[str]: + with open(get_path(filename)) as f: + requirements = f.read().strip().split("\n") + resolved_requirements = [] + for line in requirements: + if line.startswith("-r "): + resolved_requirements += _read_requirements(line.split()[1]) + elif line.startswith("--"): + continue + else: + resolved_requirements.append(line) + return resolved_requirements + + try: + requirements = _read_requirements("requirements.txt") + except ValueError: + print("Failed to read requirements.txt in vllm_ascend.") + return requirements + + +setup( + name='vllm_ascend', + # Follow: + # https://packaging.python.org/en/latest/specifications/version-specifiers + version='0.1.0a1', + author="vLLM-Ascend team", + license="Apache 2.0", + description=("vLLM Ascend backend plugin"), + long_description=read_readme(), + long_description_content_type="text/markdown", + url="https://github.com/vllm-project/vllm-ascend", + project_urls={ + "Homepage": "https://github.com/vllm-project/vllm-ascend", + }, + classifiers=[ + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "License :: OSI Approved :: Apache Software License", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Scientific/Engineering :: Information Analysis", + ], + packages=['vllm_ascend'], + python_requires=">=3.9", + install_requires=get_requirements(), + extras_require={}, + entry_points={'vllm.platform_plugins': ["ascend = vllm_ascend:register"]}) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..3a593e45e --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,331 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/blob/main/tests/conftest.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import List, Optional, Tuple, TypeVar, Union + +import numpy as np +import pytest +from PIL import Image +from vllm import LLM, SamplingParams +from vllm.config import TaskOption +from vllm.distributed import cleanup_dist_env_and_memory +from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt +from vllm.logger import init_logger +from vllm.outputs import RequestOutput +from vllm.sampling_params import BeamSearchParams +from vllm.utils import is_list_of + +from tests.model_utils import (TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs) + +logger = init_logger(__name__) + +_M = TypeVar("_M") +_PromptMultiModalInput = Union[List[_M], List[List[_M]]] + +PromptImageInput = _PromptMultiModalInput[Image.Image] +PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]] +PromptVideoInput = _PromptMultiModalInput[np.ndarray] + + +class VllmRunner: + + def __init__( + self, + model_name: str, + task: TaskOption = "auto", + tokenizer_name: Optional[str] = None, + tokenizer_mode: str = "auto", + # Use smaller max model length, otherwise bigger model cannot run due + # to kv cache size limit. + max_model_len: int = 1024, + dtype: str = "half", + disable_log_stats: bool = True, + tensor_parallel_size: int = 1, + block_size: int = 16, + enable_chunked_prefill: bool = False, + swap_space: int = 4, + enforce_eager: Optional[bool] = False, + **kwargs, + ) -> None: + self.model = LLM( + model=model_name, + task=task, + tokenizer=tokenizer_name, + tokenizer_mode=tokenizer_mode, + trust_remote_code=True, + dtype=dtype, + swap_space=swap_space, + enforce_eager=enforce_eager, + disable_log_stats=disable_log_stats, + tensor_parallel_size=tensor_parallel_size, + max_model_len=max_model_len, + block_size=block_size, + enable_chunked_prefill=enable_chunked_prefill, + **kwargs, + ) + + def get_inputs( + self, + prompts: List[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[TextPrompt]: + if images is not None: + assert len(prompts) == len(images) + + if videos is not None: + assert len(prompts) == len(videos) + + if audios is not None: + assert len(prompts) == len(audios) + + inputs = [TextPrompt(prompt=prompt) for prompt in prompts] + if images is not None: + for i, image in enumerate(images): + if image is not None: + inputs[i]["multi_modal_data"] = {"image": image} + + if videos is not None: + for i, video in enumerate(videos): + if video is not None: + inputs[i]["multi_modal_data"] = {"video": video} + + if audios is not None: + for i, audio in enumerate(audios): + if audio is not None: + inputs[i]["multi_modal_data"] = {"audio": audio} + + return inputs + + def generate( + self, + prompts: List[str], + sampling_params: SamplingParams, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[Tuple[List[List[int]], List[str]]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.generate(inputs, + 
sampling_params=sampling_params) + + outputs: List[Tuple[List[List[int]], List[str]]] = [] + for req_output in req_outputs: + prompt_str = req_output.prompt + prompt_ids = req_output.prompt_token_ids + req_sample_output_ids: List[List[int]] = [] + req_sample_output_strs: List[str] = [] + for sample in req_output.outputs: + output_str = sample.text + output_ids = list(sample.token_ids) + req_sample_output_ids.append(prompt_ids + output_ids) + req_sample_output_strs.append(prompt_str + output_str) + outputs.append((req_sample_output_ids, req_sample_output_strs)) + return outputs + + @staticmethod + def _final_steps_generate_w_logprobs( + req_outputs: List[RequestOutput], + ) -> List[TokensTextLogprobsPromptLogprobs]: + outputs: List[TokensTextLogprobsPromptLogprobs] = [] + for req_output in req_outputs: + assert len(req_output.outputs) > 0 + for sample in req_output.outputs: + output_str = sample.text + output_ids = list(sample.token_ids) + output_logprobs = sample.logprobs + outputs.append((output_ids, output_str, output_logprobs, + req_output.prompt_logprobs)) + return outputs + + def generate_w_logprobs( + self, + prompts: List[str], + sampling_params: SamplingParams, + images: Optional[PromptImageInput] = None, + audios: Optional[PromptAudioInput] = None, + videos: Optional[PromptVideoInput] = None, + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.generate(inputs, + sampling_params=sampling_params) + + toks_str_logsprobs_prompt_logprobs = ( + self._final_steps_generate_w_logprobs(req_outputs)) + # Omit prompt logprobs if not required by sampling params + return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] + if sampling_params.prompt_logprobs is None else + toks_str_logsprobs_prompt_logprobs) + + def generate_encoder_decoder_w_logprobs( + self, + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + sampling_params: SamplingParams, + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: + ''' + Logprobs generation for vLLM encoder/decoder models + ''' + + assert sampling_params.logprobs is not None + req_outputs = self.model.generate(encoder_decoder_prompts, + sampling_params=sampling_params) + toks_str_logsprobs_prompt_logprobs = ( + self._final_steps_generate_w_logprobs(req_outputs)) + # Omit prompt logprobs if not required by sampling params + return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] + if sampling_params.prompt_logprobs is None else + toks_str_logsprobs_prompt_logprobs) + + def generate_greedy( + self, + prompts: List[str], + max_tokens: int, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[Tuple[List[int], str]]: + greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) + outputs = self.generate(prompts, + greedy_params, + images=images, + videos=videos, + audios=audios) + return [(output_ids[0], output_str[0]) + for output_ids, output_str in outputs] + + def generate_greedy_logprobs( + self, + prompts: List[str], + max_tokens: int, + num_logprobs: int, + num_prompt_logprobs: Optional[int] = None, + images: Optional[PromptImageInput] = None, + audios: Optional[PromptAudioInput] = None, + videos: Optional[PromptVideoInput] = None, + stop_token_ids: Optional[List[int]] = None, + stop: Optional[List[str]] = None, + ) -> Union[List[TokensTextLogprobs], + 
List[TokensTextLogprobsPromptLogprobs]]: + greedy_logprobs_params = SamplingParams( + temperature=0.0, + max_tokens=max_tokens, + logprobs=num_logprobs, + prompt_logprobs=num_prompt_logprobs, + stop_token_ids=stop_token_ids, + stop=stop) + + return self.generate_w_logprobs(prompts, + greedy_logprobs_params, + images=images, + audios=audios, + videos=videos) + + def generate_encoder_decoder_greedy_logprobs( + self, + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + max_tokens: int, + num_logprobs: int, + num_prompt_logprobs: Optional[int] = None, + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: + greedy_logprobs_params = SamplingParams( + temperature=0.0, + max_tokens=max_tokens, + logprobs=num_logprobs, + prompt_logprobs=(num_prompt_logprobs), + ) + ''' + Greedy logprobs generation for vLLM encoder/decoder models + ''' + + return self.generate_encoder_decoder_w_logprobs( + encoder_decoder_prompts, greedy_logprobs_params) + + def generate_beam_search( + self, + prompts: Union[List[str], List[List[int]]], + beam_width: int, + max_tokens: int, + ) -> List[Tuple[List[List[int]], List[str]]]: + if is_list_of(prompts, str, check="all"): + prompts = [TextPrompt(prompt=prompt) for prompt in prompts] + else: + prompts = [ + TokensPrompt(prompt_token_ids=tokens) for tokens in prompts + ] + outputs = self.model.beam_search( + prompts, + BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) + returned_outputs = [] + for output in outputs: + token_ids = [x.tokens for x in output.sequences] + texts = [x.text for x in output.sequences] + returned_outputs.append((token_ids, texts)) + return returned_outputs + + def classify(self, prompts: List[str]) -> List[List[float]]: + req_outputs = self.model.classify(prompts) + return [req_output.outputs.probs for req_output in req_outputs] + + def encode( + self, + prompts: List[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[List[float]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.embed(inputs) + return [req_output.outputs.embedding for req_output in req_outputs] + + def score( + self, + text_1: Union[str, List[str]], + text_2: Union[str, List[str]], + ) -> List[float]: + req_outputs = self.model.score(text_1, text_2) + return [req_output.outputs.score for req_output in req_outputs] + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + del self.model + cleanup_dist_env_and_memory() + + +@pytest.fixture(scope="session") +def vllm_runner(): + return VllmRunner diff --git a/tests/model_utils.py b/tests/model_utils.py new file mode 100644 index 000000000..1b9eadccd --- /dev/null +++ b/tests/model_utils.py @@ -0,0 +1,303 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/blob/main/tests/models/utils.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import warnings +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import torch +from vllm.config import ModelConfig, TaskOption +from vllm.inputs import InputContext +from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs + +TokensText = Tuple[List[int], str] + + +def check_outputs_equal( + *, + outputs_0_lst: Sequence[TokensText], + outputs_1_lst: Sequence[TokensText], + name_0: str, + name_1: str, +): + """ + Compare the two sequences generated by different models, + which should be equal. + """ + assert len(outputs_0_lst) == len(outputs_1_lst) + + for prompt_idx, (outputs_0, + outputs_1) in enumerate(zip(outputs_0_lst, + outputs_1_lst)): + output_ids_0, output_str_0 = outputs_0 + output_ids_1, output_str_1 = outputs_1 + + # The text and token outputs should exactly match + fail_msg = (f"Test{prompt_idx}:" + f"\n{name_0}:\t{output_str_0!r}" + f"\n{name_1}:\t{output_str_1!r}") + + assert output_str_0 == output_str_1, fail_msg + assert output_ids_0 == output_ids_1, fail_msg + + +# Representation of generated sequence as a tuple of +# * Token ID list +# * String +# * List of top sample logprobs for each sampled token +# +# Assumes prompt logprobs were not requested. +TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int, + float]], + SampleLogprobs]]] + +# Allow for tokens to be represented as str's rather than IDs; +# tuple of +# * Token string representations list +# * String +# * Optional list of top sample logprobs for each sampled token +# +# Assumes prompt logprobs were not requested. +TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]], + List[Dict[str, + Logprob]]]]] + +# Representation of generated sequence as a tuple of +# * Token ID list +# * String +# * Optional list of top sample logprobs for each sampled token +# * Optional list of top prompt logprobs for each prompt token +# +# Allows prompt logprobs to be requested. +TokensTextLogprobsPromptLogprobs = Tuple[ + List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]], + Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]] + + +def check_logprobs_close( + *, + outputs_0_lst: Sequence[Union[TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs, + TextTextLogprobs]], + outputs_1_lst: Sequence[Union[TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs, + TextTextLogprobs]], + name_0: str, + name_1: str, + num_outputs_0_skip_tokens: int = 0, + warn_on_mismatch: bool = True, + always_check_logprobs: bool = False, +) -> None: + """Compare the logprobs of two sequences generated by different models, + which should be similar but not necessarily equal. + + How sample logprobs are compared: + * `always_check_logprobs == True`: set of highest-logprob token ids + must match between seq0 and seq1 at all sampled token offsets + * `always_check_logprobs == False`: highest-logprob token ids are + only compared at sampled token offsets for which generated token + ids don't match + + Prompt logprobs must be provided either for both input sequences, or + for neither. If prompt logprobs are provided, then highest-logprob + prompt token ids must match between seq0 and seq1 at all prompt token + offsets. 
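+
+    Illustrative example for the sampled-token check: if seq0 samples token
+    id 42 at an offset where seq1 samples token id 17, the comparison at that
+    offset passes only if 42 appears among seq1's top logprob token ids and
+    17 appears among seq0's top logprob token ids.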
+ + Args: + outputs_0_lst: First sequence to compare + outputs_0_lst: Second sequence to compare + name_0: sequence #0 name + name_1: sequence #1 name + num_outputs_0_skip_tokens: If > 0, specifies the number of initial + sequence #0 tokens & logprobs to discard + before comparison, i.e. all + of sequence #1 will be compared to + sequence #0 beginning at index + num_outputs_0_skip_tokens + warn_on_mismatch: Issue a warning if there is token-wise or text-wise + mismatch between the two sequences + always_check_logprobs: If true, check logprobs even when tokens match + """ + assert len(outputs_0_lst) == len(outputs_1_lst) + + # Loop through responses to each prompt. + for prompt_idx, (outputs_0, + outputs_1) in enumerate(zip(outputs_0_lst, + outputs_1_lst)): + assert len(outputs_0) == len(outputs_1) + if len(outputs_0) == 3: + assert len(outputs_1) == 3 + # Break out tokens, text & sample logprobs + # (prompt logprobs were not provided) + output_ids_0, output_str_0, logprobs_0 = outputs_0 + output_ids_1, output_str_1, logprobs_1 = outputs_1 + elif len(outputs_0) == 4: + assert len(outputs_1) == 4 + # Break out tokens, text, sample logprobs & prompt logprobs + ( + output_ids_0, + output_str_0, + logprobs_0, + prompt_logprobs_0, + ) = outputs_0 + ( + output_ids_1, + output_str_1, + logprobs_1, + prompt_logprobs_1, + ) = outputs_1 + + # Test prompt logprobs closeness + if (prompt_logprobs_0 is not None + and prompt_logprobs_1 is not None): + # Both sequences' prompt logprobs lists are not `None`` + # (although individual list elements may be `None`); + # for each token's logprobs: + for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate( + zip(prompt_logprobs_0, prompt_logprobs_1)): + fail_msg = ( + f"Prompt logprobs test:" + f"\n{name_0}:\tPrompt index {idx}\t{logprobs_elem_0}" + f"\n{name_1}:\tPrompt index {idx}\t{logprobs_elem_1}") + + if logprobs_elem_0 is None: + # If the seq 0 token's logprobs are `None`, + # the seq 1 token's logprobs must be `None` + assert logprobs_elem_1 is None, fail_msg + else: + # If the seq 0 token's logprobs are not `None`, + # the seq 1 token's logprobs must not be `None` + assert logprobs_elem_1 is not None, fail_msg + # Logprobs check: top-k token choices must be the same + assert (set(logprobs_elem_0.keys()) == set( + logprobs_elem_1.keys())), fail_msg + else: + # Both sequence logprobs lists must be `None` + fail_msg = (f"Prompt logprobs test:" + f"\n{name_0}:\tlogprobs\t{prompt_logprobs_0}" + f"\n{name_1}:\tlogprobs\t{prompt_logprobs_1}") + + assert (prompt_logprobs_0 is None + and prompt_logprobs_1 is None), fail_msg + else: + raise ValueError(f"Outputs tuple must have 3 or 4 elements but " + f"{len(outputs_0)} elements were provided: " + f"{outputs_0}") + + if logprobs_0 is None: + logprobs_0 = [None] * len(output_ids_0) + if logprobs_1 is None: + logprobs_1 = [None] * len(output_ids_1) + + # Skip specified number of initial sequence #0 tokens + # & logprobs, leaving output text as-is for simplicity + # (text mismatches may generate warnings but do not + # cause the test to fail.) + if num_outputs_0_skip_tokens < 0: + raise ValueError("num_outputs_0_skip_tokens must be non-negative") + output_ids_0 = output_ids_0[num_outputs_0_skip_tokens:] + logprobs_0 = logprobs_0[num_outputs_0_skip_tokens:] + + # Loop through generated tokens. 
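+        # Illustrative example (assumed values): with
+        # num_outputs_0_skip_tokens = 2 and output_ids_0 = [5, 7, 11, 13],
+        # the slice above leaves [11, 13], so seq #1's first generated token
+        # is compared against what was originally seq #0's third token.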
+ for idx, (output_id_0, + output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): + + is_tok_mismatch = output_id_0 != output_id_1 + + # If generated tokens don't match + # or it is desired to always check logprobs, + # then + if is_tok_mismatch or always_check_logprobs: + logprobs_elem_0 = logprobs_0[idx] + logprobs_elem_1 = logprobs_1[idx] + + # Each predicted token must be in top N logprobs of the other + fail_msg = ( + f"Test{prompt_idx}:" + f"\nMatched tokens:\t{output_ids_0[:idx]}" + f"\n{name_0}:\t{output_str_0!r}\t{logprobs_elem_0}" + f"\n{name_1}:\t{output_str_1!r}\t{logprobs_elem_1}") + + assert logprobs_elem_0 is not None, fail_msg + assert logprobs_elem_1 is not None, fail_msg + assert output_id_0 in logprobs_elem_1, fail_msg + assert output_id_1 in logprobs_elem_0, fail_msg + + if warn_on_mismatch and is_tok_mismatch: + with warnings.catch_warnings(): + # This ensures that repeated warnings are shown + # in the output, not just the first occurrence + warnings.simplefilter("always") + + warnings.warn(fail_msg, stacklevel=2) + + # Break out since sequences will now diverge. + break + else: + if output_str_0 != output_str_1 and warn_on_mismatch: + # The token outputs exactly match, + # so the text outputs should exactly match as well + fail_msg = (f"Test{prompt_idx}:" + f"\n{name_0}:\t{output_str_0!r}" + f"\n{name_1}:\t{output_str_1!r}") + + with warnings.catch_warnings(): + # This ensures that repeated warnings are shown + # in the output, not just the first occurrence + warnings.simplefilter("always") + + warnings.warn(fail_msg, stacklevel=2) + + +def build_model_context(model_name: str, + task: TaskOption = "auto", + tokenizer_name: Optional[str] = None, + trust_remote_code: bool = False, + dtype: Optional[Union[str, torch.dtype]] = None, + mm_processor_kwargs: Optional[Dict] = None, + limit_mm_per_prompt: Optional[Dict] = None): + """Creates an InputContext for a given model. + + Args: + model_name: Name of the model being considered. + tokenizer_name: Name of the tokenizer being considered. + trust_remote_code: Whether or not to allow loading remote code. + mm_processor_kwargs: optional processor kwargs for to be leveraged + in the input processor, mapper, dummy data creation, etc. + limit_mm_per_prompt: Multimodal limits. + + Returns: + InputContext for the model being considered. + """ + if tokenizer_name is None: + tokenizer_name = model_name + if dtype is None: + dtype = "half" + + model_config = ModelConfig( + model_name, + task=task, + tokenizer=tokenizer_name, + tokenizer_mode="auto", + trust_remote_code=trust_remote_code, + dtype=dtype, + seed=0, + mm_processor_kwargs=mm_processor_kwargs, + limit_mm_per_prompt=limit_mm_per_prompt, + ) + return InputContext(model_config) diff --git a/tests/test_offline_inference.py b/tests/test_offline_inference.py new file mode 100644 index 000000000..484bce63c --- /dev/null +++ b/tests/test_offline_inference.py @@ -0,0 +1,61 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Run short greedy-sampling checks of vLLM on the Ascend backend.
+
+Run `pytest tests/test_offline_inference.py`.
+"""
+import os
+
+import pytest
+import vllm  # noqa: F401
+from conftest import VllmRunner
+
+import vllm_ascend  # noqa: F401
+
+MODELS = [
+    "Qwen/Qwen2.5-0.5B-Instruct",
+]
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+
+TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half", "float16"])
+@pytest.mark.parametrize("max_tokens", [5])
+def test_models(
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    os.environ["VLLM_ATTENTION_BACKEND"] = "ASCEND"
+
+    # Build a long prompt (a comma-separated run of 1024 numbers) so the test
+    # exercises prefill across many KV-cache blocks within the 8192-token
+    # context configured below.
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]
+
+    with VllmRunner(model,
+                    max_model_len=8192,
+                    dtype=dtype,
+                    enforce_eager=False,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tools/actionlint.sh b/tools/actionlint.sh
new file mode 100755
index 000000000..72a10b18f
--- /dev/null
+++ b/tools/actionlint.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/tools
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+if command -v actionlint &> /dev/null; then
+    # NOTE: skip .github/workflows/vllm_ascend_test.yaml because the self-hosted runner `npu-arm64` is unknown to actionlint
+    actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
+    exit 0
+elif [ -x ./actionlint ]; then
+    ./actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
+    exit 0
+fi
+
+# download a binary to the current directory - v1.7.3
+bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
+./actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
diff --git a/tools/check_repo.sh b/tools/check_repo.sh
new file mode 100644
index 000000000..e86d0f110
--- /dev/null
+++ b/tools/check_repo.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/tools
+# Copyright 2023 The vLLM team.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time) + +if ! git diff --quiet; then + echo "Repo is dirty" >&2 + + exit 1 +fi + +if ! git describe --tags; then + echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2 + + exit 1 +fi diff --git a/tools/mypy.sh b/tools/mypy.sh new file mode 100755 index 000000000..fcb7c0e4b --- /dev/null +++ b/tools/mypy.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +CI=${1:-0} +PYTHON_VERSION=${2:-3.9} + +if [ "$CI" -eq 1 ]; then + set -e +fi + +run_mypy() { + echo "Running mypy on $1" + if [ "$CI" -eq 1 ] && [ -z "$1" ]; then + mypy --python-version "${PYTHON_VERSION}" "$@" + return + fi + mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" +} + +run_mypy vllm_ascend +run_mypy examples +run_mypy tests diff --git a/tools/npu-vllm-test.sh b/tools/npu-vllm-test.sh new file mode 100644 index 000000000..17c7a4d43 --- /dev/null +++ b/tools/npu-vllm-test.sh @@ -0,0 +1,422 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +set -o pipefail + +TEST_DIR="./vllm-empty/tests" +TEST_FILES=( + test_sequence.py + # test_utils.py + # test_config.py + test_cache_block_hashing.py + # test_scalartype.py + # test_embedded_commit.py + # test_inputs.py + # test_sharded_state_loader.py + test_sampling_params.py + # test_logger.py + # test_logits_processor.py + # test_regression.py + # prefix_caching/test_prefix_caching.py + # prefix_caching/test_disable_sliding_window.py + # weight_loading/test_weight_loading.py + # samplers/test_beam_search.py + # samplers/test_typical_acceptance_sampler.py + # samplers/test_no_bad_words.py + # samplers/test_rejection_sampler.py + # samplers/test_ignore_eos.py + # samplers/test_ranks.py + # samplers/test_logits_processor.py + # samplers/test_sampler.py + # samplers/test_seeded_generate.py + # samplers/test_logprobs.py + # kernels/test_encoder_decoder_attn.py + # kernels/test_rotary_embedding.py + # kernels/test_prefix_prefill.py + # kernels/test_flashinfer.py + # kernels/utils.py + # kernels/test_machete_mm.py + # kernels/test_flash_attn.py + # kernels/test_awq.py + # kernels/test_blocksparse_attention.py + # kernels/test_utils.py + # kernels/test_aqlm.py + # kernels/test_cutlass.py + # kernels/test_causal_conv1d.py + # kernels/test_marlin_gemm.py + # kernels/test_layernorm.py + # kernels/test_pos_encoding.py + # kernels/test_moe.py + # kernels/test_awq_marlin.py + # kernels/test_int8_quant.py + # kernels/test_gptq.py + # kernels/test_attention.py + # kernels/test_activation.py + # kernels/quant_utils.py + # kernels/test_permute_cols.py + # kernels/test_triton_scaled_mm.py + # kernels/test_gguf.py + # kernels/test_awq_triton.py + # kernels/test_attention_selector.py + # kernels/test_ggml.py + # kernels/test_mamba_ssm.py + # kernels/test_fused_quant_layernorm.py + # kernels/test_fp8_quant.py + # kernels/test_cascade_flash_attn.py + # kernels/conftest.py + # kernels/allclose_default.py + # kernels/test_block_fp8.py + # kernels/test_cache.py + # kernels/test_semi_structured.py + # quantization/test_quark.py + # quantization/test_compressed_tensors.py + # quantization/utils.py + # quantization/test_experts_int8.py + # quantization/test_lm_head.py + # quantization/test_ipex_quant.py + # quantization/test_bitsandbytes.py + # quantization/test_cpu_offload.py + # quantization/test_fp8.py + # quantization/test_configs.py + # tool_use/test_tool_calls.py + # tool_use/utils.py + # tool_use/test_chat_completions.py + # tool_use/test_jamba_tool_parser.py + # tool_use/test_chat_completion_request_validations.py + # tool_use/conftest.py + # tool_use/test_parallel_tool_calls.py + # runai_model_streamer/test_runai_model_streamer_loader.py + # runai_model_streamer/test_weight_utils.py + # kv_transfer/test_lookup_buffer.sh + # kv_transfer/test_send_recv.py + # kv_transfer/test_send_recv.sh + # kv_transfer/test_lookup_buffer.py + # kv_transfer/module_test.py + # kv_transfer/disagg_test.py + # plugins/vllm_add_dummy_platform/setup.py + # plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py + # plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py + # plugins/vllm_add_dummy_model/setup.py + # plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py + # plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py + # plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py + # prompt_adapter/test_multi_adapter_inference.py + # prompt_adapter/test_pa_lora.py + # prompt_adapter/test_bloom.py + # compile/test_pass_manager.py + # 
compile/utils.py + # compile/test_wrapper.py + # compile/test_fusion.py + # compile/backend.py + # compile/test_full_graph.py + # compile/test_basic_correctness.py + # compile/test_functionalization.py + # compile/piecewise/test_simple.py + # compile/piecewise/test_toy_llama.py + # lora/test_punica_ops_variation.py + # lora/test_quant_model.py + # lora/test_lora_checkpoints.py + # lora/test_mixtral.py + # lora/test_qwen2vl.py + # lora/test_baichuan.py + # lora/utils.py + # lora/test_phi.py + # lora/test_utils.py + # lora/test_minicpmv_tp.py + # lora/test_layers.py + # lora/test_worker.py + # lora/test_jamba.py + # lora/test_tokenizer_group.py + # lora/test_lora_bias_e2e.py + # lora/test_chatglm3_tp.py + # lora/test_punica_ops_sizes.py + # lora/test_lora_manager.py + # lora/test_llama_tp.py + # lora/test_lora_huggingface.py + # lora/test_long_context.py + # lora/test_gemma.py + # lora/conftest.py + # lora/data/long_context_test_data.py + # models/registry.py + # models/utils.py + # models/test_registry.py + # models/test_initialization.py + # models/test_oot_registration.py + # models/multimodal/processing/test_internvl.py + # models/multimodal/processing/test_llava_next.py + # models/multimodal/processing/test_idefics3.py + # models/multimodal/processing/test_qwen2_vl.py + # models/multimodal/processing/test_phi3v.py + # models/multimodal/processing/test_common.py + # models/multimodal/processing/test_qwen.py + # models/multimodal/processing/test_llava_onevision.py + # models/encoder_decoder/language/test_bart.py + # models/encoder_decoder/audio_language/test_whisper.py + # models/encoder_decoder/vision_language/test_broadcast.py + # models/encoder_decoder/vision_language/test_florence2.py + # models/encoder_decoder/vision_language/test_mllama.py + # models/decoder_only/language/test_models.py + # models/decoder_only/language/test_gptq_marlin.py + # models/decoder_only/language/test_granite.py + # models/decoder_only/language/test_modelopt.py + # models/decoder_only/language/test_phimoe.py + # models/decoder_only/language/test_aqlm.py + # models/decoder_only/language/test_mistral.py + # models/decoder_only/language/test_jamba.py + # models/decoder_only/language/test_gptq_marlin_24.py + # models/decoder_only/language/test_mamba.py + # models/decoder_only/language/test_gguf.py + # models/decoder_only/language/test_fp8.py + # models/decoder_only/audio_language/test_ultravox.py + # models/decoder_only/vision_language/test_models.py + # models/decoder_only/vision_language/test_awq.py + # models/decoder_only/vision_language/test_intern_vit.py + # models/decoder_only/vision_language/test_qwen2_vl.py + # models/decoder_only/vision_language/test_pixtral.py + # models/decoder_only/vision_language/test_phi3v.py + # models/decoder_only/vision_language/test_h2ovl.py + # models/decoder_only/vision_language/vlm_utils/types.py + # models/decoder_only/vision_language/vlm_utils/model_utils.py + # models/decoder_only/vision_language/vlm_utils/runners.py + # models/decoder_only/vision_language/vlm_utils/core.py + # models/decoder_only/vision_language/vlm_utils/custom_inputs.py + # models/decoder_only/vision_language/vlm_utils/case_filtering.py + # models/decoder_only/vision_language/vlm_utils/builders.py + # models/embedding/utils.py + # models/embedding/language/test_scoring.py + # models/embedding/language/test_gritlm.py + # models/embedding/language/test_cls_models.py + # models/embedding/language/test_embedding.py + # models/embedding/vision_language/test_llava_next.py + # 
models/embedding/vision_language/test_dse_qwen2_vl.py + # models/embedding/vision_language/test_phi3v.py + # multimodal/utils.py + # multimodal/test_processor_kwargs.py + # multimodal/test_utils.py + # multimodal/test_inputs.py + # multimodal/test_processing.py + # standalone_tests/python_only_compile.sh + # standalone_tests/lazy_torch_compile.py + # async_engine/test_async_llm_engine.py + # async_engine/api_server_async_engine.py + # async_engine/test_api_server.py + # async_engine/test_request_tracker.py + # mq_llm_engine/utils.py + # mq_llm_engine/test_load.py + # mq_llm_engine/test_abort.py + # mq_llm_engine/test_error_handling.py + # tokenization/test_tokenizer.py + # tokenization/test_tokenizer_group.py + # tokenization/test_get_eos.py + # tokenization/test_cached_tokenizer.py + # tokenization/test_detokenize.py + # core/utils.py + # core/test_chunked_prefill_scheduler.py + # core/test_serialization.py + # core/test_num_computed_tokens_update.py + # core/test_scheduler_encoder_decoder.py + # core/test_scheduler.py + # core/block/test_cpu_gpu_block_allocator.py + # core/block/test_prefix_caching_block.py + # core/block/test_common.py + # core/block/test_block_table.py + # core/block/test_block_manager.py + # core/block/conftest.py + # core/block/test_naive_block.py + # core/block/e2e/test_correctness.py + # core/block/e2e/test_correctness_sliding_window.py + # core/block/e2e/conftest.py + # tracing/test_tracing.py + # engine/test_arg_utils.py + # engine/test_detokenization.py + # engine/test_short_mm_context.py + # engine/test_custom_executor.py + # engine/test_multiproc_workers.py + # engine/test_computed_prefix_blocks.py + # engine/test_stop_reason.py + # engine/test_skip_tokenizer_init.py + # engine/test_stop_strings.py + # engine/output_processor/test_stop_checker.py + # engine/output_processor/test_multi_step.py + # tensorizer_loader/test_tensorizer.py + # tensorizer_loader/conftest.py + # entrypoints/test_chat_utils.py + # entrypoints/conftest.py + # entrypoints/llm/test_lazy_outlines.py + # entrypoints/llm/test_generate_multiple_loras.py + # entrypoints/llm/test_encode.py + # entrypoints/llm/test_init.py + # entrypoints/llm/test_guided_generate.py + # entrypoints/llm/test_gpu_utilization.py + # entrypoints/llm/test_chat.py + # entrypoints/llm/test_accuracy.py + # entrypoints/llm/test_prompt_validation.py + # entrypoints/llm/test_generate.py + # entrypoints/offline_mode/test_offline_mode.py + # entrypoints/openai/test_completion.py + # entrypoints/openai/test_models.py + # entrypoints/openai/test_chat_echo.py + # entrypoints/openai/test_score.py + # entrypoints/openai/test_tokenization.py + # entrypoints/openai/test_cli_args.py + # entrypoints/openai/test_chunked_prompt.py + # entrypoints/openai/test_encoder_decoder.py + # entrypoints/openai/test_chat_template.py + # entrypoints/openai/test_oot_registration.py + # entrypoints/openai/test_run_batch.py + # entrypoints/openai/test_metrics.py + # entrypoints/openai/test_vision_embedding.py + # entrypoints/openai/test_embedding.py + # entrypoints/openai/test_lora_adapters.py + # entrypoints/openai/test_video.py + # entrypoints/openai/test_serving_models.py + # entrypoints/openai/test_chat.py + # entrypoints/openai/test_pooling.py + # entrypoints/openai/test_basic.py + # entrypoints/openai/test_accuracy.py + # entrypoints/openai/test_prompt_validation.py + # entrypoints/openai/test_vision.py + # entrypoints/openai/test_audio.py + # entrypoints/openai/test_async_tokenization.py + # entrypoints/openai/test_return_tokens_as_ids.py + # 
entrypoints/openai/test_serving_chat.py + # entrypoints/openai/test_shutdown.py + # entrypoints/openai/test_root_path.py + # entrypoints/openai/tool_parsers/utils.py + # entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py + # model_executor/weight_utils.py + # model_executor/test_enabled_custom_ops.py + # model_executor/test_guided_processors.py + # model_executor/test_model_load_with_params.py + # model_executor/conftest.py + # metrics/test_metrics.py + # system_messages/sonnet3.5_nov2024.txt + # encoder_decoder/test_e2e_correctness.py + # v1/core/test_kv_cache_utils.py + # v1/core/test_prefix_caching.py + # v1/sample/test_sampler.py + # v1/engine/test_engine_core.py + # v1/engine/test_async_llm.py + # v1/engine/test_output_processor.py + # v1/engine/test_engine_args.py + # v1/engine/test_engine_core_client.py + # v1/e2e/test_cascade_attention.py + # v1/worker/test_gpu_input_batch.py + # spec_decode/utils.py + # spec_decode/test_utils.py + # spec_decode/test_ngram_worker.py + # spec_decode/test_metrics.py + # spec_decode/test_batch_expansion.py + # spec_decode/test_multi_step_worker.py + # spec_decode/test_scorer.py + # spec_decode/test_spec_decode_worker.py + # spec_decode/test_dynamic_spec_decode.py + # spec_decode/e2e/test_mlp_correctness.py + # spec_decode/e2e/test_ngram_correctness.py + # spec_decode/e2e/test_seed.py + # spec_decode/e2e/test_integration.py + # spec_decode/e2e/test_medusa_correctness.py + # spec_decode/e2e/test_integration_dist_tp4.py + # spec_decode/e2e/test_eagle_correctness.py + # spec_decode/e2e/test_compatibility.py + # spec_decode/e2e/test_multistep_correctness.py + # spec_decode/e2e/test_integration_dist_tp2.py + # spec_decode/e2e/conftest.py + # spec_decode/e2e/test_logprobs.py + # multi_step/test_correctness_async_llm.py + # multi_step/test_correctness_llm.py + # vllm_test_utils/setup.py + # vllm_test_utils/vllm_test_utils/blame.py + # vllm_test_utils/vllm_test_utils/monitor.py + # plugins_tests/test_platform_plugins.py + # tpu/test_compilation.py + # tpu/test_quantization_accuracy.py + # tpu/test_custom_dispatcher.py + # distributed/test_custom_all_reduce.py + # distributed/test_distributed_oot.py + # distributed/test_pipeline_parallel.py + # distributed/test_pynccl.py + # distributed/test_pipeline_partition.py + # distributed/test_utils.py + # distributed/test_pp_cudagraph.py + # distributed/test_ca_buffer_sharing.py + # distributed/test_multi_node_assignment.py + # distributed/test_same_node.py + # distributed/test_shm_broadcast.py + # distributed/test_comm_ops.py + # basic_correctness/test_chunked_prefill.py + # basic_correctness/test_preemption.py + # basic_correctness/test_cpu_offload.py + # basic_correctness/test_basic_correctness.py + # worker/test_model_runner.py + # worker/test_encoder_decoder_model_runner.py + # worker/test_swap.py + # worker/test_profile.py + # worker/test_model_input.py +) + +# print usage +usage() { + echo "Usage: $0 -t -t ..." + echo "Example: $0 -t test_inputs.py -t test_regression.py" + exit 1 +} + +# parse command line args +while getopts ":t:" opt; do + case ${opt} in + t) + TEST_FILES+=("${OPTARG}") + ;; + *) + usage + ;; + esac +done + +echo "------ Test vllm_ascend on vLLM native ut ------" + + +# check if the test scripts are specified +if [ ${#TEST_FILES[@]} -eq 0 ]; then + echo "Error: No test scripts specified." + usage +fi + + +# test all the specified ut +for test_file in "${TEST_FILES[@]}"; do + full_path="$TEST_DIR/$test_file" + if [ -f "$full_path" ]; then + echo "Running $test_file..." 
+ # Check if pytest ran successfully + if ! pytest -sv "$full_path" + then + echo "Error: $test_file failed." + exit 1 + fi + echo "Completed $test_file." + else + echo "Error: $test_file not found in $TEST_DIR." + exit 1 + fi +done + +echo "------ All specified tests completed -------" diff --git a/tools/png-lint.sh b/tools/png-lint.sh new file mode 100755 index 000000000..5eeb11eda --- /dev/null +++ b/tools/png-lint.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Ensure that *.excalidraw.png files have the excalidraw metadata +# embedded in them. This ensures they can be loaded back into +# the tool and edited in the future. + +find . -iname '*.excalidraw.png' | while read -r file; do + if git check-ignore -q "$file"; then + continue + fi + if ! grep -q "excalidraw+json" "$file"; then + echo "$file was not exported from excalidraw with 'Embed Scene' enabled." + exit 1 + fi +done diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh new file mode 100755 index 000000000..d782af70a --- /dev/null +++ b/tools/shellcheck.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -e + +scversion="stable" + +if [ -d "shellcheck-${scversion}" ]; then + PATH="$PATH:$(pwd)/shellcheck-${scversion}" + export PATH +fi + +if ! [ -x "$(command -v shellcheck)" ]; then + if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then + echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing" + exit 1 + fi + + # automatic local install if linux x86_64 + wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv + PATH="$PATH:$(pwd)/shellcheck-${scversion}" + export PATH +fi + +# TODO - fix warnings in .buildkite/run-amd-test.sh +find . 
-name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"' diff --git a/tools/sphinx-lint.sh b/tools/sphinx-lint.sh new file mode 100755 index 000000000..806408013 --- /dev/null +++ b/tools/sphinx-lint.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +sphinx-lint --disable trailing-whitespace,missing-final-newline docs diff --git a/vllm_ascend/__init__.py b/vllm_ascend/__init__.py new file mode 100644 index 000000000..80af5a525 --- /dev/null +++ b/vllm_ascend/__init__.py @@ -0,0 +1,21 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +def register(): + """Register the NPU platform.""" + return "vllm_ascend.platform.NPUPlatform" diff --git a/vllm_ascend/attention.py b/vllm_ascend/attention.py new file mode 100644 index 000000000..0693d44e3 --- /dev/null +++ b/vllm_ascend/attention.py @@ -0,0 +1,678 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/vllm/attention/backends +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import math +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type + +import torch + +try: + import torch_npu # noqa: F401 +except ImportError: + print("Failed to import torch_npu.") + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, + AttentionMetadata, AttentionType) +from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState, + CommonMetadataBuilder, + compute_slot_mapping_start_idx, + is_block_tables_empty) +from vllm.attention.ops.paged_attn import (PagedAttention, + PagedAttentionMetadata) + +if TYPE_CHECKING: + from vllm_ascend.model_runner import ModelInputForNPUBuilder + +SHARE_MASK_TRIL_PREFIX_CACHE = None +SHARE_MASK_TRIL = None + + +class AscendAttentionBackend(AttentionBackend): + + @staticmethod + def get_name() -> str: + return "ASCEND" + + @staticmethod + def get_impl_cls() -> Type["AscendAttentionBackendImpl"]: + return AscendAttentionBackendImpl + + @staticmethod + def get_metadata_cls() -> Type["AscendMetadata"]: + return AscendMetadata + + @staticmethod + def get_state_cls() -> Type["CommonAttentionState"]: + return CommonAttentionState + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (2, num_blocks, block_size, num_kv_heads * head_size) + + @staticmethod + def swap_blocks( + src_kv_cache: List[torch.Tensor], + dst_kv_cache: List[torch.Tensor], + src_to_dst: torch.Tensor, + ) -> None: + src_key_cache, src_value_cache = src_kv_cache[0], src_kv_cache[1] + dst_key_cache, dst_value_cache = dst_kv_cache[0], dst_kv_cache[1] + src_indices = src_to_dst[:, 0] + dst_indices = src_to_dst[:, 1] + + dst_key_cache[dst_indices] = src_key_cache[src_indices].to( + dst_key_cache.device) + dst_value_cache[dst_indices] = src_value_cache[src_indices].to( + dst_key_cache.device) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: torch.Tensor, + ) -> None: + src_indices = src_to_dists[:, 0] + dst_indices = src_to_dists[:, 1] + + for kv_cache in kv_caches: + key_caches = kv_cache[0] + value_caches = kv_cache[1] + key_caches[dst_indices] = key_caches[src_indices] + value_caches[dst_indices] = value_caches[src_indices] + + @staticmethod + def get_builder_cls() -> Type["AscendMetadataBuilder"]: + return AscendMetadataBuilder + + @classmethod + def make_metadata_builder(cls, *args, **kwargs) -> "AscendMetadataBuilder": + return cls.get_builder_cls()(*args, **kwargs) + + +class AscendPagedAttention(PagedAttention): + + @staticmethod + def write_to_paged_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_indices: torch.Tensor, + ) -> None: + torch_npu.npu_scatter_nd_update_(key_cache, slot_indices, key) + torch_npu.npu_scatter_nd_update_(value_cache, slot_indices, value) + + +@dataclass +class AscendMetadata(AttentionMetadata, PagedAttentionMetadata): + """Metadata for Ascendbackend. + * modified from XFormersbackend + NOTE: Any python object stored here is not updated when it is + cuda-graph replayed. If you have values that need to be changed + dynamically, it should be stored in tensor. The tensor has to be + updated from `CUDAGraphRunner.forward` API. 
+ """ + + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ----------------------| + # |-- query_len ---| + + # seq_lens stored as a tensor. + seq_lens_tensor: Optional[torch.Tensor] + + # FIXME: It is for flash attn. + # Maximum sequence length among prefill batch. 0 if there are decoding + # requests only. + max_prefill_seq_len: int + # Maximum sequence length among decode batch. 0 if there are prefill + # requests only. + max_decode_seq_len: int + + # Whether or not if cuda graph is enabled. + # Cuda-graph is currently enabled for decoding only. + # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. + use_cuda_graph: bool + + # (batch_size,). The sequence length per sequence. Sequence length means + # the computed tokens + new tokens None if it is a decoding. + seq_lens: Optional[List[int]] = None + + # FIXME: It is for flash attn. + # (batch_size + 1,). The cumulative sequence lengths of the sequences in + # the batch, used to index into sequence. E.g., if the sequence length is + # [4, 6], it is [0, 4, 10]. + seq_start_loc: Optional[torch.Tensor] = None + + # (batch_size,) A tensor of context lengths (tokens that are computed + # so far). + context_lens_tensor: Optional[torch.Tensor] = None + + # Maximum query length in the batch. None for decoding. + max_query_len: Optional[int] = None + + # (batch_size + 1,). The cumulative subquery lengths of the sequences in + # the batch, used to index into subquery. E.g., if the subquery length + # is [4, 6], it is [0, 4, 10]. + query_start_loc: Optional[torch.Tensor] = None + + # Self-attention prefill/decode metadata cache + _cached_prefill_metadata: Optional["AscendMetadata"] = None + _cached_decode_metadata: Optional["AscendMetadata"] = None + + # Begin encoder attn & enc/dec cross-attn fields... 
+ + # Encoder sequence lengths representation + encoder_seq_lens: Optional[List[int]] = None + encoder_seq_lens_tensor: Optional[torch.Tensor] = None + + # Maximum sequence length among encoder sequences + max_encoder_seq_len: Optional[int] = None + + # Number of tokens input to encoder + num_encoder_tokens: Optional[int] = None + + attn_mask: Optional[torch.Tensor] = None + pse_shift: Optional[torch.Tensor] = None + sparse_mode: int = 0 + + # Cross-attention memory-mapping data structures: slot mapping + # and block tables + cross_slot_mapping: Optional[torch.Tensor] = None + cross_block_tables: Optional[torch.Tensor] = None + + # slot_mapping: Optional[torch.Tensor] = None + + @property + def prefill_metadata(self) -> Optional["AscendMetadata"]: + if self.num_prefills == 0: + return None + + if self._cached_prefill_metadata is not None: + # Recover cached prefill-phase attention + # metadata structure + return self._cached_prefill_metadata + + assert ((self.seq_lens is not None) + or (self.encoder_seq_lens is not None)) + assert ((self.seq_lens_tensor is not None) + or (self.encoder_seq_lens_tensor is not None)) + + # Compute some attn_metadata fields which default to None + query_start_loc = (None if self.query_start_loc is None else + self.query_start_loc[:self.num_prefills + 1]) + slot_mapping = (None if self.slot_mapping is None else + self.slot_mapping[:self.num_prefill_tokens]) + seq_lens = (None if self.seq_lens is None else + self.seq_lens[:self.num_prefills]) + seq_lens_tensor = (None if self.seq_lens_tensor is None else + self.seq_lens_tensor[:self.num_prefills]) + seq_start_loc = (None if self.seq_start_loc is None else + self.seq_start_loc[:self.num_prefills + 1]) + context_lens_tensor = (None if self.context_lens_tensor is None else + self.context_lens_tensor[:self.num_prefills]) + block_tables = (None if self.block_tables is None else + self.block_tables[:self.num_prefills]) + + # Construct & cache prefill-phase attention metadata structure + self._cached_prefill_metadata = AscendMetadata( + num_prefills=self.num_prefills, + num_prefill_tokens=self.num_prefill_tokens, + num_decode_tokens=0, + slot_mapping=slot_mapping, + seq_lens=seq_lens, + seq_lens_tensor=seq_lens_tensor, + max_query_len=self.max_query_len, + max_prefill_seq_len=self.max_prefill_seq_len, + max_decode_seq_len=0, + query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, + context_lens_tensor=context_lens_tensor, + block_tables=block_tables, + use_cuda_graph=False, + # Begin encoder & cross attn fields below... + encoder_seq_lens=self.encoder_seq_lens, + encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, + max_encoder_seq_len=self.max_encoder_seq_len, + multi_modal_placeholder_index_maps=self. 
+            multi_modal_placeholder_index_maps,
+            cross_slot_mapping=self.cross_slot_mapping,
+            cross_block_tables=self.cross_block_tables,
+            enable_kv_scales_calculation=False)
+        return self._cached_prefill_metadata
+
+    @property
+    def decode_metadata(self) -> Optional["AscendMetadata"]:
+        if self.num_decode_tokens == 0:
+            return None
+
+        if self._cached_decode_metadata is not None:
+            # Recover cached decode-phase attention
+            # metadata structure
+            return self._cached_decode_metadata
+        assert ((self.seq_lens_tensor is not None)
+                or (self.encoder_seq_lens_tensor is not None))
+
+        # Compute some attn_metadata fields which default to None
+        slot_mapping = (None if self.slot_mapping is None else
+                        self.slot_mapping[self.num_prefill_tokens:])
+        seq_lens_tensor = (None if self.seq_lens_tensor is None else
+                           self.seq_lens_tensor[self.num_prefills:])
+        block_tables = (None if self.block_tables is None else
+                        self.block_tables[self.num_prefills:])
+
+        # Construct & cache decode-phase attention metadata structure
+        self._cached_decode_metadata = AscendMetadata(
+            num_prefills=0,
+            num_prefill_tokens=0,
+            num_decode_tokens=self.num_decode_tokens,
+            slot_mapping=slot_mapping,
+            seq_lens_tensor=seq_lens_tensor,
+            max_prefill_seq_len=0,
+            max_decode_seq_len=self.max_decode_seq_len,
+            # Batch may be composed of prefill|decodes, adjust query start
+            # indices to refer to the start of decodes. E.g.
+            # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6].
+            query_start_loc=(self.query_start_loc[self.num_prefills:] -
+                             self.query_start_loc[self.num_prefills])
+            if self.query_start_loc is not None else None,
+            seq_start_loc=self.seq_start_loc[self.num_prefills:]
+            if self.seq_start_loc is not None else None,
+            context_lens_tensor=None,
+            block_tables=block_tables,
+            use_cuda_graph=self.use_cuda_graph,
+            # Begin encoder & cross attn fields below...
+            encoder_seq_lens=self.encoder_seq_lens,
+            encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
+            max_encoder_seq_len=self.max_encoder_seq_len,
+            multi_modal_placeholder_index_maps=self.
+            multi_modal_placeholder_index_maps,
+            cross_slot_mapping=self.cross_slot_mapping,
+            cross_block_tables=self.cross_block_tables,
+            enable_kv_scales_calculation=False)
+        return self._cached_decode_metadata
+
+
+class AscendMetadataBuilder(CommonMetadataBuilder[AscendMetadata]):
+
+    _metadata_cls = AscendMetadata
+
+    def compute_npu_slot_indices(self, is_profile_run, slot_indices, seq_id,
+                                 seq_len, context_len, start_idx, block_size,
+                                 block_tables, max_query_len):
+        """
+        Compute slot indices.
+
+        In other vLLM backends, the slot mapping stores flat slot indices,
+        computed as `block_number * block_size + block_offset`. In the Ascend
+        backend, the slot mapping stores [block_number, block_offset] pairs
+        instead; to distinguish the two, this function uses the name
+        `slot_indices`.
+        """
+        if is_profile_run:
+            # During memory profiling, the block tables are not
+            # initialized yet. In this case, we just use a dummy
+            # slot mapping.
+            # In embeddings, the block tables are {seq_id: None}.
+            slot_indices.extend([[PAD_SLOT_ID, 0]] * seq_len)
+            return
+        # Mask the [0, start_idx) tokens of the prompt with
+        # [PAD_SLOT_ID, 0], where start_idx is max(0, seq_len -
+        # sliding_window). For example, if the prompt len is 10,
+        # sliding window is 8, and block size is 4, the first two
+        # tokens are masked and the slot mapping will be
+        # [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
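+        # Illustrative example (hypothetical values): with block_size = 4 and
+        # a block_table of [7, 1, 3], token index 5 lives in physical block
+        # block_table[5 // 4] = 1 at offset 5 % 4 = 1, so other backends would
+        # store the flat slot 1 * 4 + 1 = 5, while this function appends the
+        # pair [1, 1].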
+ padding_mask_len = max(0, start_idx - context_len) + slot_indices.extend([[PAD_SLOT_ID, 0]] * padding_mask_len) + + range_start = max(start_idx, context_len) + range_end = seq_len + numel = range_end - range_start + block_table = block_tables[seq_id] + + for i in range(range_start, range_end): + block_number = block_table[i // block_size] + block_offset = i % block_size + slot_indices.append([block_number, block_offset]) + slot_indices.extend([[PAD_SLOT_ID, 0]] * (max_query_len - numel)) + + def _add_seq_group( + self, inter_data: "ModelInputForNPUBuilder.InterDataForSeqGroup", + chunked_prefill_enabled: bool): + """Add a sequence group to the metadata. Specifically update/append + 1. context length. + 2. block table. + 3. slot mapping. + """ + is_prompt = inter_data.is_prompt + block_tables = inter_data.block_tables + max_query_len = max( + max(data.query_lens) + for data in self.input_builder.inter_data_list) + + is_prompt = inter_data.is_prompt + block_tables = inter_data.block_tables + + for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, + curr_sliding_window_block) in zip( + inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], + inter_data.orig_seq_lens, inter_data.seq_lens, + inter_data.query_lens, inter_data.context_lens, + inter_data.curr_sliding_window_blocks): + self.context_lens.append(context_len) + if is_prompt: + self.num_prefills += 1 + self.num_prefill_tokens += token_len + self.prefill_seq_lens.append(seq_len) + else: + assert query_len == 1, ( + "seq_len: {}, context_len: {}, query_len: {}".format( + seq_len, context_len, query_len)) + self.num_decode_tokens += query_len + self.curr_seq_lens.append(curr_seq_len) + + # Compute block table. + # TODO(sang): Combine chunked prefill and prefix caching by + # only allowing multiple of block_size chunk size. + # NOTE: This only works for oooooooxxx style attention. + block_table: List[int] = [] + prefix_cache_hit = any([ + inter_data.prefix_cache_hit + for inter_data in self.input_builder.inter_data_list + ]) + if prefix_cache_hit: + # NOTE(woosuk): For flash-attn, the block table should + # include the entries for the incoming prefill tokens. + if block_tables is not None: + block_table = block_tables[seq_id] + elif ((chunked_prefill_enabled or not is_prompt) + and block_tables is not None): + if curr_sliding_window_block == 0: + block_table = block_tables[seq_id] + else: + block_table = block_tables[seq_id][ + -curr_sliding_window_block:] + self.block_tables.append(block_table) + + # Compute slot mapping. 
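+            # (Descriptive note) is_profile_run is True during the memory
+            # profiling pass, where block tables are empty and dummy slots are
+            # used. start_idx masks prompt tokens that fall outside the
+            # sliding window, and the NPU-specific helper below then emits
+            # [block_number, block_offset] pairs padded up to max_query_len.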
+ is_profile_run = is_block_tables_empty(block_tables) + start_idx = compute_slot_mapping_start_idx(is_prompt, query_len, + context_len, + self.sliding_window) + + self.compute_npu_slot_indices(is_profile_run, self.slot_mapping, + seq_id, seq_len, context_len, + start_idx, self.block_size, + inter_data.block_tables, + max_query_len) + + +class AscendAttentionBackendImpl(AttentionImpl): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.kv_cache_dtype = kv_cache_dtype + self.sliding_window = sliding_window + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, + dtype=torch.float32, + device="npu") + self.alibi_slopes = alibi_slopes + self.attn_type = attn_type + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + def forward( + self, + layer: AttentionLayer, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: List[torch.Tensor], + attn_metadata: AscendMetadata, + attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass with Ascend attention. + Args: + query: shape = [num_tokens, num_heads * head_size] + num_tokens = batch_size * seq_len + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache: shape = [2, num_blocks, block_size, + num_kv_heads * head_size] + key_cache = [num_blocks, block_size, + num_kv_heads * head_size] + value_cache = [num_blocks, block_size, + num_kv_heads * head_size] + attn_metadata: Metadata for attention. 
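+            attn_type: ignored here; the attention type configured on the
+                layer (`self.attn_type`) is used instead (added doc note).
+            output: optional pre-allocated output tensor; the current
+                implementation allocates its own output and does not use this
+                argument (added doc note).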
+        Returns:
+            shape = [batch_size, seq_len * num_heads * head_size]
+        """
+        assert layer._k_scale == 1.0 and layer._v_scale == 1.0
+        attn_type = self.attn_type
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "AscendAttentionBackendImpl")
+        # view q k v to BSH
+        num_tokens = query.shape[0]
+
+        if kv_cache is not None and len(kv_cache) >= 2:
+            slot_indices = attn_metadata.slot_mapping
+            key_cache, value_cache = kv_cache[0], kv_cache[1]
+            AscendPagedAttention.write_to_paged_cache(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                slot_indices,
+            )
+
+        if attn_metadata.num_prefills > 0:
+            if attn_metadata.attn_mask is None:
+                if num_tokens > 16384:
+                    attn_metadata.sparse_mode = 2
+                attention_mask = gen_input_mask(
+                    attn_metadata.max_prefill_seq_len, self.sliding_window,
+                    num_tokens)
+                attn_metadata.attn_mask = attention_mask
+
+            if (self.alibi_slopes is not None
+                    and attn_metadata.pse_shift is None):
+                attn_metadata.pse_shift = _make_alibi_bias(
+                    self.alibi_slopes,
+                    self.num_kv_heads,
+                    dtype=query.dtype,
+                    seq_len=attn_metadata.max_prefill_seq_len,
+                    batch_size=num_tokens,
+                )
+
+            if (len(kv_cache) == 0 or attn_metadata.block_tables is None
+                    or attn_metadata.block_tables.numel() == 0):
+                max_seq_len = attn_metadata.max_prefill_seq_len
+
+                # shape of q/k/v [B,S*H] --> [B,S,N,D]
+                query = query.view(-1, max_seq_len, self.num_heads,
+                                   self.head_size).transpose(1, 2)
+                key = key.view(-1, max_seq_len, self.num_kv_heads,
+                               self.head_size).transpose(1, 2)
+                value = value.view(-1, max_seq_len, self.num_kv_heads,
+                                   self.head_size).transpose(1, 2)
+                # FA for prefill phase
+                output = torch_npu.npu_prompt_flash_attention(
+                    query,
+                    key,
+                    value,
+                    pse_shift=attn_metadata.pse_shift,
+                    atten_mask=attn_metadata.attn_mask,
+                    num_heads=self.num_heads,
+                    scale_value=1 / math.sqrt(self.head_size),
+                    input_layout="BNSD",
+                    num_key_value_heads=self.num_kv_heads,
+                    pre_tokens=65535,
+                    next_tokens=0,
+                    sparse_mode=attn_metadata.sparse_mode,
+                )
+                # reshape to [B,H]
+                output = output.transpose(1, 2).reshape(
+                    num_tokens, self.num_heads * self.head_size)
+            else:
+                # prefix-enabled attention
+                assert attn_type == AttentionType.DECODER, (
+                    "Only decoder-only models support prefix caching")
+                assert attn_metadata.seq_lens is not None
+                assert kv_cache is not None
+                query = query.view(query.shape[0], -1,
+                                   self.num_heads * self.head_size)
+                output = torch.zeros(query.shape,
+                                     device="npu",
+                                     dtype=query.dtype)
+                # TODO (Mengqing Cao): torch_npu.npu_incre_flash_attention is
+                # supported only when `S == 1`; OPTIMIZE ME when prefix caching
+                # is supported in torch-npu ops.
+                for i in range(query.shape[0]):
+                    # Incremental FA, token by token, for the prefix-caching
+                    # prefill path
+                    output[i] = torch_npu.npu_incre_flash_attention(
+                        query[i].unsqueeze(0),
+                        key_cache,
+                        value_cache,
+                        num_heads=self.num_heads,
+                        num_key_value_heads=self.num_kv_heads,
+                        scale_value=self.scale,
+                        input_layout="BSH",
+                        block_table=attn_metadata.block_tables,
+                        block_size=key_cache.
+ shape[1], # max val of block_size == 512 + actual_seq_lengths=attn_metadata.seq_lens, + ) + # [B,S,H] --> [B,H] + output = output.squeeze(1) + + elif attn_metadata.decode_metadata: + # FA for decoding phase + assert kv_cache is not None + # shape of query [B,S*H] --> [B,S,H] + query = query.view( + -1, + 1, + self.head_size * self.num_heads, + ) + output = torch_npu.npu_incre_flash_attention( + query, + key_cache, + value_cache, + num_heads=self.num_heads, + num_key_value_heads=self.num_kv_heads, + scale_value=self.scale, + input_layout="BSH", + block_table=attn_metadata.block_tables, + block_size=key_cache.shape[1], # max val of block_size == 512 + actual_seq_lengths=attn_metadata.seq_lens, + ) + + # [B,S,H] --> [B,H] + output = output.squeeze(1) + return output + + +def gen_input_mask(seq_len, sliding_window, len): + """ + Generating lower triangular matrix + """ + if len > 16384: + # improve computing performance on NPU when input tokens are huge + global SHARE_MASK_TRIL_PREFIX_CACHE + if SHARE_MASK_TRIL_PREFIX_CACHE is None: + SHARE_MASK_TRIL_PREFIX_CACHE = torch.triu( + torch.ones(1, 1, 2048, 2048, dtype=bool, device="npu"), + diagonal=1, + ) + attention_mask = SHARE_MASK_TRIL_PREFIX_CACHE + else: + global SHARE_MASK_TRIL + if SHARE_MASK_TRIL is None or SHARE_MASK_TRIL.shape[0] < seq_len: + SHARE_MASK_TRIL = ~torch.tril( + torch.ones(seq_len, seq_len, dtype=bool, device="npu")) + + attention_mask = SHARE_MASK_TRIL + if sliding_window is not None: + attention_mask = ~attention_mask + attention_mask = torch.triu(attention_mask, + diagonal=1 - sliding_window) + attention_mask = ~attention_mask + + return attention_mask + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + num_kv_heads: int, + dtype: torch.dtype, + seq_len: int, + batch_size: int, +): + bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(seq_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + # Calculate a matrix where each element represents ith element- jth + # element. + bias = bias[None, :] - bias[:, None] + + padded_len = (seq_len + 7) // 8 * 8 + num_heads = alibi_slopes.shape[0] + bias = torch.empty( + 1, + num_heads, + seq_len, + padded_len, + device=alibi_slopes.device, + dtype=dtype, + )[:, :, :, :seq_len].copy_(bias) + bias.mul_(alibi_slopes[:, None, None]) + if num_heads != num_kv_heads: + bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) + + return bias diff --git a/vllm_ascend/communicator.py b/vllm_ascend/communicator.py new file mode 100644 index 000000000..efef46e92 --- /dev/null +++ b/vllm_ascend/communicator.py @@ -0,0 +1,28 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import torch +import torch.distributed as dist +from vllm.distributed.device_communicators.base_communicator import \ + CommunicatorBase + + +class NPUCommunicator(CommunicatorBase): + + def all_reduce(self, x: torch.Tensor) -> torch.Tensor: + dist.all_reduce(x, group=self.group) + return x diff --git a/vllm_ascend/model_runner.py b/vllm_ascend/model_runner.py new file mode 100644 index 000000000..96ef1914b --- /dev/null +++ b/vllm_ascend/model_runner.py @@ -0,0 +1,620 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/vllm/worker/model_runner.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import dataclasses +from typing import Any, Dict, List, Optional, Set, Type + +import torch +import torch.distributed +from torch import nn +from vllm.distributed import get_pp_group +from vllm.logger import init_logger +from vllm.lora.layers import LoRAMapping +from vllm.lora.request import LoRARequest +from vllm.model_executor import SamplingMetadata +from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderMap +from vllm.platforms import current_platform +from vllm.prompt_adapter.layers import PromptAdapterMapping +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sampling_params import SamplingParams +from vllm.sequence import SequenceGroupMetadata +from vllm.utils import flatten_2d_lists, make_tensor_with_pad +from vllm.worker.model_runner import (ModelInputForGPU, + ModelInputForGPUBuilder, + ModelInputForGPUWithSamplingMetadata, + ModelRunner) + +logger = init_logger(__name__) + +LORA_WARMUP_RANK = 8 + + +class ModelInputForNPUBuilder(ModelInputForGPUBuilder): + """Build ModelInputForGPU from SequenceGroupMetadata.""" + + # Note: ideally we would be using a dataclass(kw_only=True) + # here, so that this can be subclassed easily, + # but kw_only is not supported in python<3.10. + def build(self) -> ModelInputForGPU: + """Finalize the builder intermediate data and + create on-device tensors. + """ + # Combine and flatten intermediate data. + input_tokens = [ + flatten_2d_lists(inter_data.input_tokens) + for inter_data in self.inter_data_list + ] + if not input_tokens: + # This may happen when all prefill requests hit + # prefix caching and there is no decode request. 
+ return self.model_input_cls() + + mrope_input_positions: Optional[List[List[int]]] = None + if any(inter_data.mrope_input_positions is not None + for inter_data in self.inter_data_list): + mrope_input_positions = [[] for _ in range(3)] + # calculate max position length for padding + input_position_lens = [ + len(inter_data.input_positions[0]) + for inter_data in self.inter_data_list + ] + max_pos_len = max(input_position_lens) + + for idx in range(3): + for inter_data in self.inter_data_list: + msections = inter_data.mrope_input_positions + if msections is None: + for _seq_input_positions in inter_data.input_positions: + # zero pad + _seq_input_positions.extend( + [0] * + (max_pos_len - len(_seq_input_positions))) + mrope_input_positions[idx].extend( + _seq_input_positions) + else: + for _seq_mrope_input_positions in msections: + # zero pad + _seq_mrope_input_positions[idx].extend( + [0] * (max_pos_len - + len(_seq_mrope_input_positions[idx]))) + mrope_input_positions[idx].extend( + _seq_mrope_input_positions[idx]) + input_positions = None + else: + input_positions = [ + flatten_2d_lists(inter_data.input_positions) + for inter_data in self.inter_data_list + ] + + seq_lens = [] + max_decode_seq_len = 0 + for inter_data in self.inter_data_list: + seq_lens.extend(inter_data.seq_lens) + if not inter_data.is_prompt: + max_decode_seq_len = max(max_decode_seq_len, + max(inter_data.seq_lens)) + query_lens = flatten_2d_lists( + [inter_data.query_lens for inter_data in self.inter_data_list]) + # Mapping from request IDs to sequence IDs. Used for Jamba models + # that manages the cache by itself. + request_ids_to_seq_ids = { + data.request_id: data.seq_ids + for data in self.inter_data_list + } + + batch_size = len(input_tokens) + + # If cuda graph can be used, pad tensors accordingly. + # See `capture_model` API for more details. + # vLLM uses cuda graph only for decoding requests. + cuda_graph_pad_size = -1 + + if self.inter_data_list[0].is_prompt: + input_tokens_tensor = make_tensor_with_pad( + input_tokens, 0, dtype=torch.int, device=self.runner.device) + input_tokens_tensor = torch.flatten(input_tokens_tensor) + if mrope_input_positions is not None: + mrope_input_positions_tensor = make_tensor_with_pad( + mrope_input_positions, + 0, + dtype=torch.int, + device=self.runner.device) + input_positions_tensor = torch.tensor( + mrope_input_positions_tensor, + dtype=torch.long, + device=self.runner.device) + else: + input_positions_tensor = make_tensor_with_pad( + input_positions, + 0, + dtype=torch.int, + device=self.runner.device) + input_positions_tensor = torch.flatten(input_positions_tensor) + + max_seq_len = max(seq_lens) + seq_lens = len(seq_lens) * [max_seq_len] + else: + input_tokens_tensor = torch.tensor(flatten_2d_lists(input_tokens), + dtype=torch.long, + device=self.runner.device) + if mrope_input_positions is not None: + input_positions_tensor = torch.tensor( + mrope_input_positions, + dtype=torch.long, + device=self.runner.device) + else: + input_positions_tensor = torch.tensor( + flatten_2d_lists(input_positions), + dtype=torch.long, + device=self.runner.device) + + # Sequence and query lengths. + seq_lens.extend([1] * cuda_graph_pad_size) + + # Attention metadata. + attn_metadata = self.attn_metadata_builder.build( + seq_lens, query_lens, cuda_graph_pad_size, batch_size) + + # LoRA data. 
+ lora_requests = set() + lora_mapping = None + if self.enable_lora: + lora_requests = set(r for data in self.inter_data_list + for r in data.lora_requests) + lora_index_mapping = flatten_2d_lists([ + flatten_2d_lists(inter_data.lora_index_mapping) + for inter_data in self.inter_data_list + ]) + lora_index_mapping.extend([0] * cuda_graph_pad_size) + lora_prompt_mapping = flatten_2d_lists([ + flatten_2d_lists(inter_data.lora_prompt_mapping) + for inter_data in self.inter_data_list + ]) + lora_mapping = LoRAMapping( + **dict(index_mapping=lora_index_mapping, + prompt_mapping=lora_prompt_mapping, + is_prefill=not self.decode_only)) + + # Prompt adapter data. + prompt_adapter_requests: Set[PromptAdapterRequest] = set() + prompt_adapter_mapping = None + if self.enable_prompt_adapter: + prompt_adapter_requests = set( + data.prompt_adapter_request for data in self.inter_data_list + if data.prompt_adapter_request is not None) + prompt_adapter_index_mapping = flatten_2d_lists([ + inter_data.prompt_adapter_index_mapping + for inter_data in self.inter_data_list + ]) + prompt_adapter_index_mapping.extend([0] * cuda_graph_pad_size) + prompt_adapter_prompt_mapping = flatten_2d_lists([ + inter_data.prompt_adapter_prompt_mapping + for inter_data in self.inter_data_list + ]) + prompt_adapter_mapping = PromptAdapterMapping( + prompt_adapter_index_mapping, + prompt_adapter_prompt_mapping, + ) + + # Multi-modal data. + multi_modal_kwargs_list = [ + data.multi_modal_kwargs for data in self.inter_data_list + if data.multi_modal_kwargs is not None + ] + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) + + return self.model_input_cls( + input_tokens=input_tokens_tensor, + input_positions=input_positions_tensor, + attn_metadata=attn_metadata, + seq_lens=seq_lens, + query_lens=query_lens, + lora_mapping=lora_mapping, + lora_requests=lora_requests, + multi_modal_kwargs=multi_modal_kwargs, + request_ids_to_seq_ids=request_ids_to_seq_ids, + finished_requests_ids=self.finished_requests_ids, + prompt_adapter_mapping=prompt_adapter_mapping, + prompt_adapter_requests=prompt_adapter_requests) + + class InterDataForSeqGroup: + """Intermediate data for the current sequence group.""" + + def simple_reinit(self): + self.input_tokens[0].clear() # type: ignore + self.input_positions[0].clear() # type: ignore + self.token_types[0].clear() # type: ignore + self.mrope_input_positions = None # type: ignore + self.seq_lens[0] = 0 # type: ignore + self.orig_seq_lens[0] = 0 # type: ignore + self.query_lens[0] = 0 # type: ignore + self.context_lens[0] = 0 # type: ignore + self.curr_sliding_window_blocks[0] = 0 # type: ignore + self.lora_index_mapping.clear() # type: ignore + self.lora_prompt_mapping.clear() # type: ignore + self.lora_requests.clear() # type: ignore + self.prompt_adapter_index_mapping.clear() # type: ignore + self.prompt_adapter_prompt_mapping.clear() # type: ignore + + def __init__( + self, + *, + # From sequence group metadata. + request_id: str, + seq_ids: List[int], + is_prompt: bool, + block_tables: Optional[Dict[int, List[int]]], + computed_block_nums: List[int], + n_seqs: int = 0, + + # Input tokens and positions. + input_tokens: Optional[List[List[int]]] = None, + input_positions: Optional[List[List[int]]] = None, + token_types: Optional[List[List[int]]] = None, + mrope_input_positions: Optional[List[List[List[int]]]] = None, + + # The sequence length (may be capped to the sliding window). 
+ seq_lens: Optional[List[int]] = None, + # The original sequence length (before applying sliding window). + # This is used to compute slot mapping. + orig_seq_lens: Optional[List[int]] = None, + # The query length. + query_lens: Optional[List[int]] = None, + # The number of tokens that are already computed. + context_lens: Optional[List[int]] = None, + # The current sliding window block. + curr_sliding_window_blocks: Optional[List[int]] = None, + + # LoRA inputs. + lora_index_mapping: Optional[List[List[int]]] = None, + lora_prompt_mapping: Optional[List[List[int]]] = None, + lora_requests: Optional[Set[LoRARequest]] = None, + + # Prompt adapter inputs. + prompt_adapter_index_mapping: Optional[List[int]] = None, + prompt_adapter_prompt_mapping: Optional[List[int]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + + # Multi-modal inputs. + multi_modal_kwargs: Optional[MultiModalKwargs] = None, + multi_modal_placeholder_maps: Optional[Dict[ + str, MultiModalPlaceholderMap]] = None, + + # Whether the prefix cache is hit (prefill only). + prefix_cache_hit: bool = False, + reinit: bool = False, + reinit_use_defaults: bool = False, + encoder_seq_len: int = 0, + ): + if reinit: + assert len(self.seq_ids) == len(seq_ids) # type: ignore + for i, seq_id in enumerate(seq_ids): + self.seq_ids[i] = seq_id # type: ignore + else: + self.seq_ids = seq_ids + + self.request_id = request_id + self.is_prompt = is_prompt + self.block_tables = block_tables + self.computed_block_nums = computed_block_nums + self.n_seqs = n_seqs + self.encoder_seq_len = encoder_seq_len + + if reinit: + if len(self.seq_ids) == 1 and reinit_use_defaults: + self.simple_reinit() + else: + if input_tokens: + self.input_tokens = input_tokens + else: + for seq_id in range(len(self.seq_ids)): + self.input_tokens[seq_id].clear() + + if input_positions: + self.input_positions = input_positions + else: + for seq_id in range(len(self.seq_ids)): + self.input_positions[seq_id].clear() + + if token_types: + self.token_types = token_types + else: + for seq_id in range(len(self.seq_ids)): + self.token_types[seq_id].clear() + + self.mrope_input_positions = None + + if seq_lens: + self.seq_lens = seq_lens + else: + for seq_id in range(len(self.seq_ids)): + self.seq_lens[seq_id] = 0 + + if orig_seq_lens: + self.orig_seq_lens = orig_seq_lens + else: + for seq_id in range(len(self.seq_ids)): + self.orig_seq_lens[seq_id] = 0 + + if query_lens: + self.query_lens = query_lens + else: + for seq_id in range(len(self.seq_ids)): + self.query_lens[seq_id] = 0 + + if context_lens: + self.context_lens = context_lens + else: + for seq_id in range(len(self.seq_ids)): + self.context_lens[seq_id] = 0 + + if curr_sliding_window_blocks: + self.curr_sliding_window_blocks = \ + curr_sliding_window_blocks + else: + for seq_id in range(len(self.seq_ids)): + self.curr_sliding_window_blocks[seq_id] = 0 + + if lora_index_mapping: + self.lora_index_mapping = lora_index_mapping + else: + self.lora_index_mapping.clear() + + if lora_prompt_mapping: + self.lora_prompt_mapping = lora_prompt_mapping + else: + self.lora_prompt_mapping.clear() + + if lora_requests: + self.lora_requests = lora_requests + else: + self.lora_requests.clear() + + if prompt_adapter_index_mapping: + self.prompt_adapter_index_mapping = \ + prompt_adapter_index_mapping + else: + self.prompt_adapter_index_mapping.clear() + + if prompt_adapter_prompt_mapping: + self.prompt_adapter_prompt_mapping = \ + prompt_adapter_prompt_mapping + else: + self.prompt_adapter_prompt_mapping.clear() + 
+ else: + self.input_tokens = input_tokens or [] + self.input_positions = input_positions or [] + self.token_types = token_types or [] + self.mrope_input_positions = mrope_input_positions or None + self.seq_lens = seq_lens or [] + self.orig_seq_lens = orig_seq_lens or [] + self.query_lens = query_lens or [] + self.context_lens = context_lens or [] + self.curr_sliding_window_blocks = \ + curr_sliding_window_blocks or [] + + self.lora_index_mapping = lora_index_mapping or [] + self.lora_prompt_mapping = lora_prompt_mapping or [] + self.lora_requests = lora_requests or set() + + self.prompt_adapter_index_mapping = ( + prompt_adapter_index_mapping or []) + self.prompt_adapter_prompt_mapping = ( + prompt_adapter_prompt_mapping or []) + + self.prompt_adapter_request = prompt_adapter_request + self.multi_modal_kwargs = multi_modal_kwargs + self.multi_modal_placeholder_maps = multi_modal_placeholder_maps + self.prefix_cache_hit = prefix_cache_hit + + self.n_seqs = len(self.seq_ids) + + if not reinit: + self.__post_init__() + + def __post_init__(self): + self.n_seqs = len(self.seq_ids) + + self.input_tokens = [[] for _ in range(self.n_seqs)] + self.input_positions = [[] for _ in range(self.n_seqs)] + self.token_types = [[] for _ in range(self.n_seqs)] + self.mrope_input_positions = None + self.seq_lens = [0] * self.n_seqs + self.orig_seq_lens = [0] * self.n_seqs + self.query_lens = [0] * self.n_seqs + self.context_lens = [0] * self.n_seqs + self.curr_sliding_window_blocks = [0] * self.n_seqs + + self.lora_index_mapping = [] + self.lora_prompt_mapping = [] + + +class NPUModelRunner(ModelRunner): + """ + NPU model runner with sampling step. + """ + _model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = ( + ModelInputForGPUWithSamplingMetadata) + _builder_cls: Type[ModelInputForNPUBuilder] = ModelInputForNPUBuilder + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> ModelInputForGPUWithSamplingMetadata: + model_input = \ + ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + ) + return model_input + + @current_platform.inference_mode() + def profile_run(self) -> None: + # Enable top-k sampling to reflect the accurate memory usage. + sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) + max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + # This represents the maximum number of different requests + # that will have unique loras, an therefore the max amount of memory + # consumption create dummy lora request copies from the lora request + # passed in, which contains a lora from the lora warmup path. + dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] + if self.lora_config: + assert self.lora_manager is not None + with self.lora_manager.dummy_lora_cache(): + for idx in range(self.lora_config.max_loras): + lora_id = idx + 1 + dummy_lora_request = LoRARequest( + lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_path="/not/a/real/path", + ) + self.lora_manager.add_dummy_lora(dummy_lora_request, + rank=LORA_WARMUP_RANK) + dummy_lora_requests.append(dummy_lora_request) + dummy_lora_requests_per_seq = [ + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] + + # Profile memory usage with max_num_sequences sequences and the total + # number of tokens equal to max_num_batched_tokens. 
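+        # Worked example (hypothetical values, added for clarity): with
+        # max_num_batched_tokens = 8192 and max_num_seqs = 3, the loop below
+        # builds dummy sequences of lengths 2731, 2731 and 2730
+        # (8192 // 3 = 2730 with a remainder of 2, so the first two groups
+        # get one extra token each).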
+ seqs: List[SequenceGroupMetadata] = [] + # Additional GPU memory may be needed for multi-modal encoding, which + # needs to be accounted for when calculating the GPU blocks for + # vLLM blocker manager. + # To exercise the worst scenario for GPU memory consumption, + # the number of seqs (batch_size) is chosen to maximize the number + # of images processed. + + max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( + self.model_config) + if max_mm_tokens > 0: + max_num_seqs_orig = max_num_seqs + max_num_seqs = min(max_num_seqs, + max_num_batched_tokens // max_mm_tokens) + if max_num_seqs < 1: + expr = (f"min({max_num_seqs_orig}, " + f"{max_num_batched_tokens} // {max_mm_tokens})") + logger.warning( + "Computed max_num_seqs (%s) to be less than 1. " + "Setting it to the minimum value of 1.", expr) + max_num_seqs = 1 + + batch_size = 0 + for group_id in range(max_num_seqs): + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) + batch_size += seq_len + + dummy_data = self.input_registry \ + .dummy_data_for_profiling(self.model_config, + seq_len, + self.mm_registry) + + seq = SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=True, + seq_data={group_id: dummy_data.seq_data}, + sampling_params=sampling_params, + block_tables=None, + lora_request=dummy_lora_requests_per_seq[group_id] + if dummy_lora_requests_per_seq else None, + multi_modal_data=dummy_data.multi_modal_data, + multi_modal_placeholders=dummy_data.multi_modal_placeholders, + ) + seqs.append(seq) + + # Run the model with the dummy inputs. + num_layers = self.model_config.get_num_layers(self.parallel_config) + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value ``None``. + # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + # it is important to create tensors inside the loop, rather than + # multiplying the list, to avoid Dynamo from treating them as + # tensor aliasing. + kv_caches = [ + torch.tensor([], dtype=torch.float32, device=self.device) + for _ in range(num_layers) + ] + finished_requests_ids = [seq.request_id for seq in seqs] + model_input = self.prepare_model_input( + seqs, finished_requests_ids=finished_requests_ids) + intermediate_tensors = None + if not get_pp_group().is_first_rank: + intermediate_tensors = self.model.make_empty_intermediate_tensors( + batch_size=batch_size, + dtype=self.model_config.dtype, + device=self.device) + self.execute_model(model_input, kv_caches, intermediate_tensors) + current_platform.synchronize() + return + + @current_platform.inference_mode() + def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: + """NPU graph capture a model. + TODO: not support now + """ + pass + + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None, + ) -> ModelInputForGPUWithSamplingMetadata: + """Prepare the model input based on a given sequence group, including + metadata for the sampling step. + The API assumes seq_group_metadata_list is sorted by prefill -> decode. + The result tensors and data structure also batches input in prefill + -> decode order. For example, + - input_tokens[:num_prefill_tokens] contains prefill tokens. + - input_tokens[num_prefill_tokens:] contains decode tokens. + If cuda graph is required, this API automatically pads inputs. 
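+        (Added doc note: NPU graph capture is currently a no-op, see
+        `capture_model`, so no cuda-graph padding takes place on this
+        backend.)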
+ """ + model_input = self._prepare_model_input_tensors( + seq_group_metadata_list, finished_requests_ids) + if get_pp_group().is_last_rank: + # Sampling metadata is only required for the final pp group + generators = self.get_generators(finished_requests_ids) + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + model_input.seq_lens, + model_input.query_lens, + self.device, + self.pin_memory, + generators, + self.sampling_metadata_cache, + # TODO (cmq): enable this after supported in vllm + # pad_for_invariant_seq_len=True, + ) + else: + sampling_metadata = None + is_prompt = (seq_group_metadata_list[0].is_prompt + if seq_group_metadata_list else None) + return dataclasses.replace(model_input, + sampling_metadata=sampling_metadata, + is_prompt=is_prompt, + virtual_engine=virtual_engine) + + def get_model(self) -> nn.Module: + return self.model diff --git a/vllm_ascend/ops/__init__.py b/vllm_ascend/ops/__init__.py new file mode 100644 index 000000000..bdc40cd5f --- /dev/null +++ b/vllm_ascend/ops/__init__.py @@ -0,0 +1,18 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import vllm_ascend.ops.layernorm # noqa diff --git a/vllm_ascend/ops/layernorm.py b/vllm_ascend/ops/layernorm.py new file mode 100644 index 000000000..719aa977d --- /dev/null +++ b/vllm_ascend/ops/layernorm.py @@ -0,0 +1,40 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Optional, Tuple, Union + +import torch +from vllm.model_executor.layers.layernorm import RMSNorm + + +def forward_oot( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + import torch_npu + + if residual is not None: + x, _, residual = torch_npu.npu_add_rms_norm(x, residual, self.weight, + self.variance_epsilon) + return x, residual + + x, residual = torch_npu.npu_rms_norm(x, self.weight, self.variance_epsilon) + return x + + +RMSNorm.forward_oot = forward_oot diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py new file mode 100644 index 000000000..2b847de13 --- /dev/null +++ b/vllm_ascend/platform.py @@ -0,0 +1,115 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +from typing import Optional, Tuple + +import torch + +try: + import torch_npu # noqa: F401 +except ImportError: + print("Failed to import torch_npu.") + +from vllm.config import VllmConfig +from vllm.platforms import Platform, PlatformEnum + +os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1" + + +def _device_id_to_physical_device_id(device_id: int) -> int: + if "ASCEND_RT_VISIBLE_DEVICES" in os.environ: + device_ids = os.environ["ASCEND_RT_VISIBLE_DEVICES"].split(",") + if device_ids == [""]: + raise RuntimeError("ASCEND_RT_VISIBLE_DEVICES is set to empty" + "string, which means Ascend NPU support is" + "disabled.") + physical_device_id = device_ids[device_id] + return int(physical_device_id) + else: + return device_id + + +class NPUPlatform(Platform): + + _enum = PlatformEnum.OOT + device_name: str = "npu" + device_type: str = "npu" + simple_compile_backend: str = "npu" + ray_device_key: str = "NPU" + device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES" + + @classmethod + def get_device_capability(cls, device_id: int = 0): + return None + + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + physical_device_id = _device_id_to_physical_device_id(device_id) + return torch.npu.get_device_name(physical_device_id) + + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return True + + @classmethod + def inference_mode(cls): + return torch.inference_mode() + + @classmethod + def set_device(cls, device: torch.device): + torch.npu.set_device(device) + + @classmethod + def empty_cache(cls): + torch.npu.empty_cache() + + @classmethod + def synchronize(cls): + torch.npu.synchronize() + + @classmethod + def mem_get_info(cls) -> Tuple[int, int]: + return torch.npu.mem_get_info() + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + # Register ops when setup. + from vllm_ascend import ops # noqa: F401 + + parallel_config = vllm_config.parallel_config + if parallel_config.worker_cls == "auto": + parallel_config.worker_cls = "vllm_ascend.worker.NPUWorker" + cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 16 + + @classmethod + def get_attn_backend_cls(cls, selected_backend, head_size, dtype, + kv_cache_dtype, block_size, use_v1, use_mla): + return "vllm_ascend.attention.AscendAttentionBackend" + + @classmethod + def get_current_memory_usage(cls, + device: Optional[torch.types.Device] = None + ) -> float: + torch.npu.reset_peak_memory_stats(device) + return torch.npu.max_memory_allocated(device) + + @classmethod + def get_device_communicator_cls(cls) -> str: + return "vllm_ascend.communicator.NPUCommunicator" diff --git a/vllm_ascend/worker.py b/vllm_ascend/worker.py new file mode 100644 index 000000000..8ddd5302e --- /dev/null +++ b/vllm_ascend/worker.py @@ -0,0 +1,481 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 
+# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/vllm/worker/worker.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import gc +from typing import Dict, List, Optional, Set, Tuple, Type, Union + +import torch +import torch.distributed +import torch_npu +from torch import nn +from vllm import envs +from vllm.config import ParallelConfig, VllmConfig +from vllm.distributed import (ensure_model_parallel_initialized, + init_distributed_environment, + set_custom_all_reduce) +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.model_executor import set_random_seed +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig +from vllm.platforms import current_platform +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, + SequenceGroupMetadata, SequenceGroupMetadataDelta) +from vllm.utils import bind_kv_cache +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner +from vllm.worker.model_runner_base import ModelRunnerBase +from vllm.worker.pooling_model_runner import PoolingModelRunner +from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, + WorkerInput) + +from vllm_ascend.model_runner import NPUModelRunner + +logger = init_logger(__name__) + + +class NPUWorker(LocalOrDistributedWorkerBase): + """A worker class that executes (a partition of) the model on a NPU. + Each worker is associated with a single NPU. The worker is responsible for + maintaining the KV cache and executing the model on the NPU. In case of + distributed inference, each worker is assigned a partition of the model. + """ + + def __init__( + self, + vllm_config: VllmConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + is_driver_worker: bool = False, + model_runner_cls: Optional[Type[ModelRunnerBase]] = None, + ) -> None: + + WorkerBase.__init__(self, vllm_config=vllm_config) + # distribute related config + self.parallel_config.rank = rank + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.is_driver_worker = is_driver_worker + + if is_driver_worker: + assert rank % self.parallel_config.tensor_parallel_size == 0, \ + "Driver worker should be rank 0 of tensor parallel group." 
+ if self.model_config.trust_remote_code: + # note: lazy import to avoid importing torch before initializing + from vllm.utils import init_cached_hf_modules + init_cached_hf_modules() + + # Return hidden states from target model if the draft model is an + # mlp_speculator + speculative_config = self.speculative_config + model_config = self.model_config + speculative_args = {} if speculative_config is None \ + or (speculative_config.draft_model_config.model == + model_config.model) \ + or (speculative_config.draft_model_config.hf_config.model_type + not in ["medusa", "mlp_speculator", "eagle"]) \ + else {"return_hidden_states": True} + + ModelRunnerClass: Type[ModelRunnerBase] = NPUModelRunner + if model_config.runner_type == "pooling": + ModelRunnerClass = PoolingModelRunner + elif self.model_config.is_encoder_decoder: + ModelRunnerClass = EncoderDecoderModelRunner + self.model_runner: ModelRunnerBase = ModelRunnerClass( + vllm_config=self.vllm_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=is_driver_worker, + **speculative_args, + ) + if model_runner_cls is not None: + self.model_runner = model_runner_cls(self.model_runner) + + # Uninitialized cache engine. Will be initialized by + # initialize_cache. + self.cache_engine: List[CacheEngine] + # Initialize gpu_cache as embedding models don't initialize kv_caches + self.gpu_cache: Optional[List[List[torch.Tensor]]] = None + self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} + + # Torch profiler. Enabled and configured through env vars: + # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + if envs.VLLM_TORCH_PROFILER_DIR: + torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + logger.info("Profiling enabled. Traces will be saved to: %s", + torch_profiler_trace_dir) + + experimental_config = torch_npu.profiler._ExperimentalConfig( + export_type=torch_npu.profiler.ExportType.Text, + profiler_level=torch_npu.profiler.ProfilerLevel.Level0, + msprof_tx=False, + aic_metrics=torch_npu.profiler.AiCMetrics.AiCoreNone, + l2_cache=False, + op_attr=False, + data_simplification=False, + record_op_args=False, + gc_detect_threshold=None, + ) + + self.profiler = torch_npu.profiler.profile( + activities=[ + torch_npu.profiler.ProfilerActivity.CPU, + torch_npu.profiler.ProfilerActivity.NPU, + ], + with_stack=True, + profile_memory=True, + with_modules=True, + experimental_config=experimental_config, + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler( + torch_profiler_trace_dir)) + else: + self.profiler = None + + def start_profile(self): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + self.profiler.start() + + def stop_profile(self): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + self.profiler.stop() + + def init_device(self) -> None: + if self.device_config.device.type == "npu": + # # This env var set by Ray causes exceptions with graph building. + # os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) + self.device = torch.device(f"npu:{self.local_rank}") + current_platform.set_device(self.device) + + current_platform.empty_cache() + self.init_npu_memory = current_platform.mem_get_info()[0] + else: + raise RuntimeError( + f"Not support device type: {self.device_config.device}") + # Initialize the distributed environment. + init_worker_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method, + self.local_rank) + # Set random seed. 
+        set_random_seed(self.model_config.seed)
+
+    def load_model(self):
+        self.model_runner.load_model()
+
+    def save_sharded_state(
+        self,
+        path: str,
+        pattern: Optional[str] = None,
+        max_size: Optional[int] = None,
+    ) -> None:
+        self.model_runner.save_sharded_state(
+            path,
+            pattern=pattern,
+            max_size=max_size,
+        )
+
+    def save_tensorized_model(
+        self,
+        tensorizer_config: TensorizerConfig,
+    ) -> None:
+        self.model_runner.save_tensorized_model(
+            tensorizer_config=tensorizer_config, )
+
+    @current_platform.inference_mode()
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Profiles the peak memory usage of the model to determine how many
+        KV blocks may be allocated without OOMs.
+        The engine will first conduct a profiling of the existing memory usage.
+        Then, it calculates the maximum possible number of NPU and CPU blocks
+        that can be allocated with the remaining free memory.
+        .. tip::
+            You may limit the usage of NPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
+        """
+        # Profile the memory usage of the model and get the maximum number of
+        # cache blocks that can be allocated with the remaining free memory.
+        current_platform.empty_cache()
+
+        # Execute a forward pass with dummy inputs to profile the memory usage
+        # of the model.
+        self.model_runner.profile_run()
+
+        # Calculate the number of blocks that can be allocated with the
+        # profiled peak memory.
+        free_npu_memory, total_npu_memory = current_platform.mem_get_info()
+        # NOTE(woosuk): Here we assume that the other processes using the same
+        # NPU did not change their memory usage during the profiling.
+        peak_memory = self.init_npu_memory - free_npu_memory
+        assert peak_memory > 0, (
+            "Error in memory profiling. "
+            f"Initial free memory {self.init_npu_memory}, current free memory"
+            f" {free_npu_memory}. This happens when the NPU memory was "
+            "not properly cleaned up before initializing the vLLM instance.")
+
+        cache_block_size = self.get_cache_block_size_bytes()
+        num_npu_blocks = int(
+            (total_npu_memory * self.cache_config.gpu_memory_utilization -
+             peak_memory) // cache_block_size)
+        num_cpu_blocks = int(self.cache_config.swap_space_bytes //
+                             cache_block_size)
+        num_npu_blocks = max(num_npu_blocks, 0)
+        num_cpu_blocks = max(num_cpu_blocks, 0)
+        if self.model_runner.lora_manager:
+            self.model_runner.remove_all_loras()
+        gc.collect()
+        # TODO: this function will not need to be implemented here once
+        # empty_cache is unified in Worker.determine_num_available_blocks().
+        current_platform.empty_cache()
+        return num_npu_blocks, num_cpu_blocks
+
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """Allocate NPU and CPU KV cache with the specified number of blocks.
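+        This also creates the cache engines (one per pipeline-parallel
+        virtual engine) and warms up the model (added doc note; no graph
+        capture is performed on NPU).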
+ """ + raise_if_cache_size_invalid(num_gpu_blocks, + self.cache_config.block_size, + self.cache_config.is_attention_free, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + self._init_cache_engine() + self._warm_up_model() + + def _init_cache_engine(self): + assert self.cache_config.num_gpu_blocks is not None + self.cache_engine = [ + CacheEngine(self.cache_config, self.model_config, + self.parallel_config, self.device_config) + for _ in range(self.parallel_config.pipeline_parallel_size) + ] + self.gpu_cache = [ + self.cache_engine[ve].gpu_cache + for ve in range(self.parallel_config.pipeline_parallel_size) + ] + bind_kv_cache(self.compilation_config.static_forward_context, + self.gpu_cache) + + def _warm_up_model(self) -> None: + # model capture is not supported, thus we just set seed here. + # Reset the seed to ensure that the random state is not affected by + # the model initialization and profiling. + set_random_seed(self.model_config.seed) + + @property + def do_metadata_broadcast(self) -> bool: + return self.parallel_config.tensor_parallel_size > 1 + + @property + def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: + return self.gpu_cache + + @torch.inference_mode() + def prepare_worker_input( + self, execute_model_req: ExecuteModelRequest) -> WorkerInput: + virtual_engine = execute_model_req.virtual_engine + num_steps = execute_model_req.num_steps + num_seq_groups = len(execute_model_req.seq_group_metadata_list) + # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. + # they contain parameters to launch cudamemcpyasync. + blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, + device="cpu", + dtype=torch.int64).view(-1, 2) + blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, + device="cpu", + dtype=torch.int64).view(-1, 2) + # `blocks_to_copy` is a gpu tensor. The src and tgt of + # blocks to copy are in the same device, and `blocks_to_copy` + # can be used directly within cuda kernels. + blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, + device=self.device, + dtype=torch.int64).view(-1, 2) + + return WorkerInput( + num_seq_groups=num_seq_groups, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + virtual_engine=virtual_engine, + num_steps=num_steps, + ) + + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + + @torch.inference_mode() + def execute_worker(self, worker_input: WorkerInput) -> None: + virtual_engine = worker_input.virtual_engine + # Issue cache operations. + if (worker_input.blocks_to_swap_in is not None + and worker_input.blocks_to_swap_in.numel() > 0): + self.cache_engine[virtual_engine].swap_in( + worker_input.blocks_to_swap_in) + if (worker_input.blocks_to_swap_out is not None + and worker_input.blocks_to_swap_out.numel() > 0): + self.cache_engine[virtual_engine].swap_out( + worker_input.blocks_to_swap_out) + if (worker_input.blocks_to_copy is not None + and worker_input.blocks_to_copy.numel() > 0): + self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) + + def _get_cached_seq_group_metadata( + self, + seq_group_metadata_list: List[Union[SequenceGroupMetadata, + SequenceGroupMetadataDelta]], + finished_request_ids: List[str]) -> List[SequenceGroupMetadata]: + """Return a list of cached Sequence Group Metadata after updating its + state. 
+ + It is used because scheduler only sends delta to workers to reduce + the data payload size. The function also cleans up cache based on + a given `finished_request_ids`. + """ + new_seq_group_metadata_list = [] + for metadata_or_delta in seq_group_metadata_list: + request_id = metadata_or_delta.request_id + if request_id not in self._seq_group_metadata_cache: + # The first prefill. + assert isinstance(metadata_or_delta, SequenceGroupMetadata) + self._seq_group_metadata_cache[request_id] = metadata_or_delta + else: + # The first prefill is already cached. + if isinstance(metadata_or_delta, SequenceGroupMetadataDelta): + self._seq_group_metadata_cache[request_id].apply_delta( + metadata_or_delta) + else: + # If metadata snapshot is sent again, it is + # preempted. Reset the cache because we need to start + # from scratch. + assert isinstance(metadata_or_delta, SequenceGroupMetadata) + self._seq_group_metadata_cache[ + request_id] = metadata_or_delta + + new_seq_group_metadata_list.append( + self._seq_group_metadata_cache[request_id]) + + # Clean up finished ids + for finished_id in finished_request_ids: + del self._seq_group_metadata_cache[finished_id] + + return new_seq_group_metadata_list + + def _execute_model_spmd( + self, + execute_model_req: ExecuteModelRequest, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> Optional[List[SamplerOutput]]: + if execute_model_req is not None: + new_seq_group_metadata_list = self._get_cached_seq_group_metadata( + execute_model_req.seq_group_metadata_list, + execute_model_req.finished_requests_ids) + + execute_model_req.seq_group_metadata_list = ( + new_seq_group_metadata_list) + output = super()._execute_model_spmd(execute_model_req, + intermediate_tensors) + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError( + "LoRA is not implemented for NPU backend currently.") + + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError( + "LoRA is not implemented for NPU backend currently.") + + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError( + "LoRA is not implemented for NPU backend currently.") + + def list_loras(self) -> Set[int]: + raise NotImplementedError( + "LoRA is not implemented for NPU backend currently.") + + def add_prompt_adapter( + self, prompt_adapter_request: PromptAdapterRequest) -> bool: + raise NotImplementedError( + "Prompt Adapter is not implemented for NPU backend currently.") + + def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError( + "Prompt Adapter is not implemented for NPU backend currently.") + + def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError( + "Prompt Adapter is not implemented for NPU backend currently.") + + def list_prompt_adapters(self) -> Set[int]: + raise NotImplementedError( + "Prompt Adapter is not implemented for NPU backend currently.") + + @property + def max_model_len(self) -> int: + return self.model_config.max_model_len + + @property + def vocab_size(self) -> int: + return self.model_runner.vocab_size + + def get_cache_block_size_bytes(self) -> int: + """Get the size of the KV cache block size in bytes. 
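+        This simply delegates to CacheEngine.get_cache_block_size
+        (added doc note).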
+        """
+        return CacheEngine.get_cache_block_size(self.cache_config,
+                                                self.model_config,
+                                                self.parallel_config)
+
+
+def init_worker_distributed_environment(
+        parallel_config: ParallelConfig,
+        rank: int,
+        distributed_init_method: Optional[str] = None,
+        local_rank: int = -1,
+        backend: str = "hccl") -> None:
+    """Initialize the distributed environment."""
+    set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
+
+    init_distributed_environment(parallel_config.world_size, rank,
+                                 distributed_init_method, local_rank, backend)
+
+    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
+                                      parallel_config.pipeline_parallel_size)
+
+
+def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free,
+                                max_model_len) -> None:
+    if is_attention_free and num_gpu_blocks != 0:
+        raise ValueError("No memory should be allocated for the cache blocks "
+                         f"for an attention-free model, but {num_gpu_blocks} "
+                         "blocks are allocated.")
+    if not is_attention_free and num_gpu_blocks <= 0:
+        raise ValueError("No available memory for the cache blocks. "
+                         "Try increasing `gpu_memory_utilization` when "
+                         "initializing the engine.")
+    max_seq_len = block_size * num_gpu_blocks
+    if not is_attention_free and max_model_len > max_seq_len:
+        raise ValueError(
+            f"The model's max seq len ({max_model_len}) "
+            "is larger than the maximum number of tokens that can be "
+            f"stored in KV cache ({max_seq_len}). Try increasing "
+            "`gpu_memory_utilization` or decreasing `max_model_len` when "
+            "initializing the engine.")
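+
+
+# Illustrative sizing example (hypothetical numbers, added for clarity): with
+# block_size = 16 and num_gpu_blocks = 2048, the KV cache can hold at most
+# 16 * 2048 = 32768 tokens, so raise_if_cache_size_invalid() requires
+# max_model_len <= 32768 for attention-based models.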