diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..38c0b58e8 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,28 @@ + +### What this PR does / why we need it? + + +### Does this PR introduce _any_ user-facing change? + + +### How was this patch tested? + + diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml new file mode 100644 index 000000000..1161a6e21 --- /dev/null +++ b/.github/workflows/actionlint.yml @@ -0,0 +1,57 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: Lint GitHub Actions workflows +on: + push: + branches: + - "main" + paths: + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' + pull_request: + branches: + - "main" + paths: + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + actionlint: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Run actionlint" + run: | + echo "::add-matcher::.github/workflows/matchers/actionlint.json" + tools/actionlint.sh -color diff --git a/.github/workflows/matchers/actionlint.json b/.github/workflows/matchers/actionlint.json new file mode 100644 index 000000000..4613e1617 --- /dev/null +++ b/.github/workflows/matchers/actionlint.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "actionlint", + "pattern": [ + { + "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json new file mode 100644 index 000000000..f048fce52 --- /dev/null +++ b/.github/workflows/matchers/mypy.json @@ -0,0 +1,16 @@ +{ + "problemMatcher": [ + { + "owner": "mypy", + "pattern": [ + { + "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", + "file": 1, + "line": 2, + "severity": 3, + "message": 4 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json new file mode 100644 index 000000000..f6d4479ee --- /dev/null +++ b/.github/workflows/matchers/ruff.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "ruff", + "pattern": [ + { + "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ] + } + ] + } diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml new file mode 100644 index 000000000..ec9c2e6f5 --- /dev/null +++ b/.github/workflows/mypy.yaml @@ -0,0 
+1,74 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: mypy + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - "main" + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + pull_request: + branches: + - "main" + # This workflow is only relevant when one of the following files changes. + # However, we have github configured to expect and require this workflow + # to run and pass before github with auto-merge a pull request. Until github + # allows more flexible auto-merge policy, we can just run this on every PR. + # It doesn't take that long to run, anyway. + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + +jobs: + mypy: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install -r requirements-dev.txt + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + path: vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: vllm-empty + run: | + pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + VLLM_TARGET_DEVICE=empty pip install . + + - name: Mypy + run: | + echo "::add-matcher::.github/workflows/matchers/mypy.json" + tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml new file mode 100644 index 000000000..11573a84a --- /dev/null +++ b/.github/workflows/ruff.yml @@ -0,0 +1,57 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: ruff + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - "main" + paths: + - "**/*.py" + - requirements-lint.txt + - .github/workflows/matchers/ruff.json + - .github/workflows/ruff.yml + pull_request: + branches: + - "main" + +jobs: + ruff: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Analysing the code with ruff + run: | + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . + - name: Run isort + run: | + isort . --check-only diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml new file mode 100644 index 000000000..6a8ff7a28 --- /dev/null +++ b/.github/workflows/shellcheck.yml @@ -0,0 +1,54 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: Lint shell scripts +on: + push: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + pull_request: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + shellcheck: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Check shell scripts" + run: | + tools/shellcheck.sh diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml new file mode 100644 index 000000000..bea98471a --- /dev/null +++ b/.github/workflows/vllm_ascend_test.yaml @@ -0,0 +1,106 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: 'e2e test' + +on: + push: + branches: + - "main" + paths: + - '*.txt' + - '**/*.py' + - '.github/workflows/vllm_ascend_test.yaml' + pull_request: + branches: + - "main" + paths: + - '*.txt' + - '**/*.py' + - '.github/workflows/vllm_ascend_test.yaml' + +# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly +# declared as "shell: bash -el {0}" on steps that need to be properly activated. +# It's used to activate ascend-toolkit environment variables. +defaults: + run: + shell: bash -el {0} + +jobs: + test: + name: vLLM Ascend test (self-host) + runs-on: ascend-arm64 # actionlint-ignore: runner-label + + container: + image: quay.io/ascend/cann:8.0.rc3.beta1-910b-ubuntu22.04-py3.10 + volumes: + - /usr/local/dcmi:/usr/local/dcmi + - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi + - /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ + # Use self-host cache speed up pip and model download + - /home/action/actions-runner/_work/cache:/github/home/.cache/ + options: >- + --device /dev/davinci6 + --device /dev/davinci_manager + --device /dev/devmm_svm + --device /dev/hisi_hdc + env: + HF_ENDPOINT: https://hf-mirror.com + steps: + - name: Check npu driver + run: | + npu-smi info + + - name: Config mirrors + run: | + sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + + - name: Checkout vllm-project/vllm-ascend repo + uses: actions/checkout@v4 + + - name: Install system dependencies + run: | + apt-get update -y + apt-get -y install `cat packages.txt` + + - name: Install dependencies + run: | + pip install -r requirements-dev.txt + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + path: ./vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: ./vllm-empty + run: | + VLLM_TARGET_DEVICE=empty pip install -e . + + - name: Install vllm-project/vllm-ascend + run: | + pip install -e . + + - name: Run vllm-project/vllm-ascend test + run: | + pytest -sv tests + + - name: Run vllm-project/vllm test + run: | + bash tools/npu-vllm-test.sh diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml new file mode 100644 index 000000000..14a3ae925 --- /dev/null +++ b/.github/workflows/yapf.yml @@ -0,0 +1,54 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: yapf + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - "main" + paths: + - "**/*.py" + - .github/workflows/yapf.yml + pull_request: + branches: + - "main" + paths: + - "**/*.py" + - .github/workflows/yapf.yml + +jobs: + yapf: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install yapf==0.32.0 + - name: Running yapf + run: | + yapf --diff --recursive . diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..3991ac8f0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,194 @@ +## vLLM Ascend Ignore +# VSCode +.vscode/ + +# egg-info +vllm_ascend.egg-info/ + +# DS Store +.DS_Store + +# Linting +actionlint +shellcheck*/ + + +# Python gitignore +## Adapted from: +## https://github.com/github/gitignore/blob/main/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
+#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..f801b5f8f --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ + +# vLLM Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socioeconomic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. 
+ +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official email address, +posting via an official social media account, or acting as an appointed +representative at an online or offline/IRL event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement in the #code-of-conduct +channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g). +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), +version 2.1, available at +[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html). + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion). + +For answers to common questions about this code of conduct, see the +[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at +[Contributor Covenant translations](https://www.contributor-covenant.org/translations). 
+
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 000000000..c7d45f682
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,107 @@
+# Contributing to vLLM Ascend plugin
+
+## Building and testing
+It's recommended to set up a local development environment to build and test
+before you submit a PR.
+
+### Prepare environment and build
+
+Theoretically, the vllm-ascend build is only supported on Linux because
+the `vllm-ascend` dependency `torch_npu` only supports Linux.
+
+However, you can still set up a development environment on Linux/Windows/macOS for linting and basic
+tests with the following commands:
+
+```bash
+# Choose a base dir (~/vllm-project/) and set up venv
+cd ~/vllm-project/
+python3 -m venv .venv
+source ./.venv/bin/activate
+
+# Clone vllm code and install
+git clone https://github.com/vllm-project/vllm.git
+cd vllm
+pip install -r requirements-build.txt
+VLLM_TARGET_DEVICE="empty" pip install .
+cd ..
+
+# Clone vllm-ascend and install
+git clone https://github.com/vllm-project/vllm-ascend.git
+cd vllm-ascend
+pip install -r requirements-dev.txt
+
+# Then you can run lint and mypy checks
+bash format.sh
+
+# Build:
+# - only supported on Linux (torch_npu available)
+# pip install -e .
+# - build without deps for debugging on other OSes
+# pip install -e . --no-deps
+
+# Commit changed files using `-s`
+git commit -sm "your commit info"
+```
+
+### Testing
+
+Although the vllm-ascend CI provides integration tests on [Ascend](.github/workflows/vllm_ascend_test.yaml), you can also run them
+locally. The simplest way to run these integration tests locally is through a container:
+
+```bash
+# Under Ascend NPU environment
+git clone https://github.com/vllm-project/vllm-ascend.git
+cd vllm-ascend
+
+IMAGE=vllm-ascend-dev-image
+CONTAINER_NAME=vllm-ascend-dev
+DEVICE=/dev/davinci1
+
+# The first build will take about 10 mins (10MB/s) to download the base image and packages
+docker build -t $IMAGE -f ./Dockerfile .
+# You can also specify a mirror repo via VLLM_REPO to speed up the build
+# docker build -t $IMAGE -f ./Dockerfile . --build-arg VLLM_REPO=https://gitee.com/mirrors/vllm
+
+docker run --name $CONTAINER_NAME --network host --device $DEVICE \
+    --device /dev/davinci_manager --device /dev/devmm_svm \
+    --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi \
+    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+    -ti --rm $IMAGE bash
+
+cd vllm-ascend
+pip install -r requirements-dev.txt
+
+pytest tests/
+```
+
+## DCO and Signed-off-by
+
+When contributing changes to this project, you must agree to the DCO. Commits must include a `Signed-off-by:` header which certifies agreement with the terms of the DCO.
+
+Using `-s` with `git commit` will automatically add this header.
+
+## PR Title and Classification
+
+Only specific types of PRs will be reviewed. The PR title should be prefixed appropriately to indicate the type of change. Please use one of the following:
+
+- `[Attention]` for new features or optimization in attention.
+- `[Communicator]` for new features or optimization in communicators.
+- `[ModelRunner]` for new features or optimization in the model runner.
+- `[Platform]` for new features or optimization in the platform.
+- `[Worker]` for new features or optimization in the worker.
+- `[Core]` for new features or optimization in the core vllm-ascend logic (such as platform, attention, communicators, model runner).
+- `[Kernel]` for changes affecting compute kernels and ops.
+- `[Bugfix]` for bug fixes.
+- `[Doc]` for documentation fixes and improvements. +- `[Test]` for tests (such as unit tests). +- `[CI]` for build or continuous integration improvements. +- `[Misc]` for PRs that do not fit the above categories. Please use this sparingly. + +> [!NOTE] +> If the PR spans more than one category, please include all relevant prefixes. + +## Others + +You may find more information about contributing to vLLM Ascend backend plugin on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html). +If you find any problem when contributing, you can feel free to submit a PR to improve the doc to help other developers. diff --git a/DCO b/DCO new file mode 100644 index 000000000..49b8cb054 --- /dev/null +++ b/DCO @@ -0,0 +1,34 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..63c8bd685 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,40 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +FROM quay.io/ascend/cann:8.0.rc3.beta1-910b-ubuntu22.04-py3.10 + +# Define environments +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update -y && \ + apt-get install -y python3-pip git vim + +WORKDIR /workspace + +COPY . 
/workspace/vllm-ascend/ + +RUN pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + +# Install vLLM main +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +RUN git clone --depth 1 $VLLM_REPO /workspace/vllm +RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/ + +# Install vllm-ascend main +RUN python3 -m pip install /workspace/vllm-ascend/ + +CMD ["/bin/bash"] diff --git a/README.md b/README.md new file mode 100644 index 000000000..c16e8372d --- /dev/null +++ b/README.md @@ -0,0 +1,102 @@ +
+<p align="center">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="./docs/logos/vllm-ascend-logo-text-dark.png">
+    <img alt="vllm-ascend" src="./docs/logos/vllm-ascend-logo-text-light.png">
+  </picture>
+</p>
+
+<h3 align="center">
+vLLM Ascend Plugin
+</h3>
+
+<p align="center">
+| About Ascend | Developer Slack (#sig-ascend) |
+</p>
+
+---
+*Latest News* πŸ”₯
+
+- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162).
+---
+## Overview
+
+vLLM Ascend plugin (`vllm-ascend`) is a backend plugin for running vLLM on the Ascend NPU.
+
+This plugin is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU from vLLM.
+
+With the vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Experts, Embedding, and Multi-modal LLMs, can run seamlessly on the Ascend NPU.
+
+## Prerequisites
+### Supported Devices
+- Atlas A2 Training series (Atlas 800T A2, Atlas 900 A2 PoD, Atlas 200T A2 Box16, Atlas 300T A2)
+- Atlas 800I A2 Inference series (Atlas 800I A2)
+
+### Dependencies
+| Requirement | Supported version | Recommended version | Note |
+|-------------|-------------------| ----------- |------------------------------------------|
+| vLLM | main | main | Required for vllm-ascend |
+| Python | >= 3.9 | [3.10](https://www.python.org/downloads/) | Required for vllm |
+| CANN | >= 8.0.RC2 | [8.0.RC3](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.0.beta1) | Required for vllm-ascend and torch-npu |
+| torch-npu | >= 2.4.0 | [2.5.1rc1](https://gitee.com/ascend/pytorch/releases/tag/v6.0.0.alpha001-pytorch2.5.1) | Required for vllm-ascend |
+| torch | >= 2.4.0 | [2.5.1](https://github.com/pytorch/pytorch/releases/tag/v2.5.1) | Required by torch-npu and vllm |
+
+Find more about how to set up your environment [here](docs/environment.md).
+
+## Getting Started
+
+> [!NOTE]
+> Currently, we are actively collaborating with the vLLM community to support the Ascend backend plugin. Once it is supported, you will be able to complete the installation with the one-line command `pip install vllm vllm-ascend`.
+
+Installation from source code:
+```bash
+# Install vllm main branch according to:
+# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html#build-wheel-from-source
+git clone --depth 1 https://github.com/vllm-project/vllm.git
+cd vllm
+pip install -r requirements-build.txt
+VLLM_TARGET_DEVICE=empty pip install .
+
+# Install vllm-ascend main branch
+git clone https://github.com/vllm-project/vllm-ascend.git
+cd vllm-ascend
+pip install -e .
+```
+
+Run the following command to start the vLLM server with the [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) model:
+
+```bash
+# export VLLM_USE_MODELSCOPE=true to speed up download
+vllm serve Qwen/Qwen2.5-0.5B-Instruct
+curl http://localhost:8000/v1/models
+```
+
+Please refer to [vLLM Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html) for more details.
+
+## Building
+
+#### Build Python package from source
+
+```bash
+git clone https://github.com/vllm-project/vllm-ascend.git
+cd vllm-ascend
+pip install -e .
+```
+
+#### Build container image from source
+```bash
+git clone https://github.com/vllm-project/vllm-ascend.git
+cd vllm-ascend
+docker build -t vllm-ascend-dev-image -f ./Dockerfile .
+```
+
+See [Building and Testing](./CONTRIBUTING.md) for more details; it is a step-by-step guide to help you set up a development environment, build, and test.
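+
+To sanity-check an installation built from source, you can also run a short offline-inference script. The snippet below is a minimal sketch based on `examples/offline_inference_npu.py` shipped in this repository:
+
+```python
+from vllm import LLM, SamplingParams
+
+prompts = [
+    "Hello, my name is",
+    "The future of AI is",
+]
+
+# Greedy decoding, up to 100 new tokens per prompt.
+sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+
+# vllm-ascend is discovered automatically via the vllm.platform_plugins entry point.
+llm = LLM(model="facebook/opt-125m")
+
+for output in llm.generate(prompts, sampling_params):
+    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
+```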
+
+## Contributing
+We welcome and value any contributions and collaborations:
+- Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues).
+- Please see the guidance on how to contribute in [CONTRIBUTING.md](./CONTRIBUTING.md).
+
+## License
+
+Apache License 2.0, as found in the [LICENSE](./LICENSE) file.
diff --git a/docs/environment.md b/docs/environment.md
new file mode 100644
index 000000000..5dd70b29a
--- /dev/null
+++ b/docs/environment.md
@@ -0,0 +1,38 @@
+### Prepare Ascend NPU environment
+
+### Dependencies
+| Requirement | Supported version | Recommended version | Note |
+| ------------ | ------- | ----------- | ----------- |
+| Python | >= 3.9 | [3.10](https://www.python.org/downloads/) | Required for vllm |
+| CANN | >= 8.0.RC2 | [8.0.RC3](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.0.beta1) | Required for vllm-ascend and torch-npu |
+| torch-npu | >= 2.4.0 | [2.5.1rc1](https://gitee.com/ascend/pytorch/releases/tag/v6.0.0.alpha001-pytorch2.5.1) | Required for vllm-ascend |
+| torch | >= 2.4.0 | [2.5.1](https://github.com/pytorch/pytorch/releases/tag/v2.5.1) | Required by torch-npu and vllm |
+
+
+Below is a quick guide to installing the recommended versions:
+
+#### Containerized installation
+
+You can use the [container image](https://hub.docker.com/r/ascendai/cann) directly with a one-line command:
+
+```bash
+docker run \
+    --name vllm-ascend-env \
+    --device /dev/davinci1 \
+    --device /dev/davinci_manager \
+    --device /dev/devmm_svm \
+    --device /dev/hisi_hdc \
+    -v /usr/local/dcmi:/usr/local/dcmi \
+    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
+    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
+    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
+    -v /etc/ascend_install.info:/etc/ascend_install.info \
+    -it quay.io/ascend/cann:8.0.rc3.beta1-910b-ubuntu22.04-py3.10 bash
+```
+
+You do not need to install `torch` and `torch_npu` manually; they will be installed automatically as `vllm-ascend` dependencies.
+
+#### Manual installation
+
+Alternatively, follow the instructions provided in the [Ascend Installation Guide](https://ascend.github.io/docs/sources/ascend/quick_install.html) to set up the environment.
+
diff --git a/docs/logos/vllm-ascend-logo-text-dark.png b/docs/logos/vllm-ascend-logo-text-dark.png
new file mode 100644
index 000000000..f534d09ee
Binary files /dev/null and b/docs/logos/vllm-ascend-logo-text-dark.png differ
diff --git a/docs/logos/vllm-ascend-logo-text-light.png b/docs/logos/vllm-ascend-logo-text-light.png
new file mode 100644
index 000000000..b71b49267
Binary files /dev/null and b/docs/logos/vllm-ascend-logo-text-light.png differ
diff --git a/examples/offline_distributed_inference_npu.py b/examples/offline_distributed_inference_npu.py
new file mode 100644
index 000000000..f8d5489a5
--- /dev/null
+++ b/examples/offline_distributed_inference_npu.py
@@ -0,0 +1,45 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from vllm-project/vllm/examples/offline_inference/basic.py
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from vllm import LLM, SamplingParams + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +# Create a sampling params object. +sampling_params = SamplingParams(max_tokens=100, temperature=0.0) +# Create an LLM. +# TODO (cmq): ray is not supported currently, need some fixes +llm = LLM( + model="facebook/opt-125m", + tensor_parallel_size=2, + distributed_executor_backend="mp", + trust_remote_code=True, +) + +# Generate texts from the prompts. +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py new file mode 100644 index 000000000..785492c7d --- /dev/null +++ b/examples/offline_inference_audio_language.py @@ -0,0 +1,153 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/examples/offline_inference/audio_language.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on audio language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" + +from transformers import AutoTokenizer +from vllm import LLM, SamplingParams +from vllm.assets.audio import AudioAsset +from vllm.utils import FlexibleArgumentParser + +audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] +question_per_audio_count = { + 0: "What is 1+1?", + 1: "What is recited in the audio?", + 2: "What sport and what nursery rhyme are referenced?" +} + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. 
+ + +# Ultravox 0.3 +def run_ultravox(question: str, audio_count: int): + model_name = "fixie-ai/ultravox-v0_3" + + tokenizer = AutoTokenizer.from_pretrained(model_name) + messages = [{ + 'role': 'user', + 'content': "<|audio|>\n" * audio_count + question + }] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + llm = LLM(model=model_name, + max_model_len=4096, + max_num_seqs=5, + trust_remote_code=True, + limit_mm_per_prompt={"audio": audio_count}) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# Qwen2-Audio +def run_qwen2_audio(question: str, audio_count: int): + model_name = "Qwen/Qwen2-Audio-7B-Instruct" + + llm = LLM(model=model_name, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}) + + audio_in_prompt = "".join([ + f"Audio {idx+1}: " + f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count) + ]) + + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# TODO (cmq): test ultravox +model_example_map = { + # "ultravox": run_ultravox, + "qwen2_audio": run_qwen2_audio +} + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + + audio_count = args.num_audios + llm, prompt, stop_token_ids = model_example_map[model]( + question_per_audio_count[audio_count], audio_count) + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. + sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=stop_token_ids) + + mm_data = {} + if audio_count > 0: + mm_data = { + "audio": [ + asset.audio_and_sample_rate + for asset in audio_assets[:audio_count] + ] + } + + assert args.num_prompts > 0 + inputs = {"prompt": prompt, "multi_modal_data": mm_data} + if args.num_prompts > 1: + # Batch inference + inputs = [inputs] * args.num_prompts + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'audio language models') + parser.add_argument('--model-type', + '-m', + type=str, + default="qwen2_audio", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument("--num-audios", + type=int, + default=1, + choices=[0, 1, 2], + help="Number of audio items per prompt.") + + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference_npu.py b/examples/offline_inference_npu.py new file mode 100644 index 000000000..10c2c6e40 --- /dev/null +++ b/examples/offline_inference_npu.py @@ -0,0 +1,39 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/examples/offline_inference/basic.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from vllm import LLM, SamplingParams + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +# Create a sampling params object. +sampling_params = SamplingParams(max_tokens=100, temperature=0.0) +# Create an LLM. +llm = LLM(model="facebook/opt-125m") + +# Generate texts from the prompts. +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/format.sh b/format.sh new file mode 100755 index 000000000..9ea7495c2 --- /dev/null +++ b/format.sh @@ -0,0 +1,341 @@ +#!/usr/bin/env bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# YAPF formatter, adapted from ray and skypilot. +# +# Usage: +# # Do work and commit your work. + +# # Format files that differ from origin/main. +# bash format.sh + +# # Commit changed files with message 'Run yapf and ruff' +# +# +# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. +# You are encouraged to run this locally before pushing changes for review. + +# Cause the script to exit if a single command fails +set -eo pipefail + +# this stops git rev-parse from failing if we run this from the .git directory +builtin cd "$(dirname "${BASH_SOURCE:-$0}")" +ROOT="$(git rev-parse --show-toplevel)" +builtin cd "$ROOT" || exit 1 + +check_command() { + if ! command -v "$1" &> /dev/null; then + echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`" + exit 1 + fi +} + +check_command yapf +check_command ruff +check_command mypy +check_command codespell +check_command isort +check_command clang-format + +YAPF_VERSION=$(yapf --version | awk '{print $2}') +RUFF_VERSION=$(ruff --version | awk '{print $2}') +MYPY_VERSION=$(mypy --version | awk '{print $2}') +CODESPELL_VERSION=$(codespell --version) +ISORT_VERSION=$(isort --vn) +CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') +SPHINX_LINT_VERSION=$(sphinx-lint --version | awk '{print $2}') + +# params: tool name, tool version, required version +tool_version_check() { + expected=$(grep "$1" requirements-lint.txt | cut -d'=' -f3) + if [[ "$2" != "$expected" ]]; then + echo "❓❓Wrong $1 version installed: $expected is required, not $2." 
+ exit 1 + fi +} + +tool_version_check "yapf" "$YAPF_VERSION" +tool_version_check "ruff" "$RUFF_VERSION" +tool_version_check "mypy" "$MYPY_VERSION" +tool_version_check "isort" "$ISORT_VERSION" +tool_version_check "codespell" "$CODESPELL_VERSION" +tool_version_check "clang-format" "$CLANGFORMAT_VERSION" +tool_version_check "sphinx-lint" "$SPHINX_LINT_VERSION" + +YAPF_FLAGS=( + '--recursive' + '--parallel' +) + +YAPF_EXCLUDES=( + '--exclude' 'build/**' +) + +# Format specified files +format() { + yapf --in-place "${YAPF_FLAGS[@]}" "$@" +} + +# Format files that differ from main branch. Ignores dirs that are not slated +# for autoformat yet. +format_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause yapf to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}" + fi + +} + +# Format all files +format_all() { + yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" . +} + +## This flag formats individual files. --files *must* be the first command line +## arg to use this option. +if [[ "$1" == '--files' ]]; then + format "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is formatted. +elif [[ "$1" == '--all' ]]; then + format_all +else + # Format only the files that changed in last commit. + format_changed +fi +echo 'vLLM yapf: Done' + +# Run mypy +echo 'vLLM mypy:' +tools/mypy.sh +echo 'vLLM mypy: Done' + + +# If git diff returns a file that is in the skip list, the file may be checked anyway: +# https://github.com/codespell-project/codespell/issues/1915 +# Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem +CODESPELL_EXCLUDES=( + '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**' +) + +# check spelling of specified files +spell_check() { + codespell "$@" +} + +spell_check_all(){ + codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" +} + +# Spelling check of files that differ from main branch. +spell_check_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause ruff to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + codespell "${CODESPELL_EXCLUDES[@]}" + fi +} + +# Run Codespell +## This flag runs spell check of individual files. --files *must* be the first command line +## arg to use this option. +if [[ "$1" == '--files' ]]; then + spell_check "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is linted. +elif [[ "$1" == '--all' ]]; then + spell_check_all +else + # Check spelling only of the files that changed in last commit. 
+ spell_check_changed +fi +echo 'vLLM codespell: Done' + + +# Lint specified files +lint() { + ruff check "$@" +} + +# Lint files that differ from main branch. Ignores dirs that are not slated +# for autolint yet. +lint_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause ruff to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + ruff check + fi + +} + +# Run Ruff +### This flag lints individual files. --files *must* be the first command line +### arg to use this option. +if [[ "$1" == '--files' ]]; then + lint "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is linted. +elif [[ "$1" == '--all' ]]; then + lint vllm tests +else + # Format only the files that changed in last commit. + lint_changed +fi +echo 'vLLM ruff: Done' + +# check spelling of specified files +isort_check() { + isort "$@" +} + +isort_check_all(){ + isort . +} + +# Spelling check of files that differ from main branch. +isort_check_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause ruff to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + isort + fi +} + +# Run Isort +# This flag runs spell check of individual files. --files *must* be the first command line +# arg to use this option. +if [[ "$1" == '--files' ]]; then + isort_check "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is linted. +elif [[ "$1" == '--all' ]]; then + isort_check_all +else + # Check spelling only of the files that changed in last commit. + isort_check_changed +fi +echo 'vLLM isort: Done' + +# Clang-format section +# Exclude some files for formatting because they are vendored +# NOTE: Keep up to date with .github/workflows/clang-format.yml +CLANG_FORMAT_EXCLUDES=( + 'csrc/moe/topk_softmax_kernels.cu' + 'csrc/quantization/gguf/ggml-common.h' + 'csrc/quantization/gguf/dequantize.cuh' + 'csrc/quantization/gguf/vecdotq.cuh' + 'csrc/quantization/gguf/mmq.cuh' + 'csrc/quantization/gguf/mmvq.cuh' +) + +# Format specified files with clang-format +clang_format() { + clang-format -i "$@" +} + +# Format files that differ from main branch with clang-format. +clang_format_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause clang-format to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that + # exist on both branches. 
+ MERGEBASE="$(git merge-base origin/main HEAD)" + + # Get the list of changed files, excluding the specified ones + changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | (grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") || echo -e)) + if [ -n "$changed_files" ]; then + echo "$changed_files" | xargs -P 5 clang-format -i + fi +} + +# Format all files with clang-format +clang_format_all() { + find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ + | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") \ + | xargs clang-format -i +} + +# Run clang-format +if [[ "$1" == '--files' ]]; then + clang_format "${@:2}" +elif [[ "$1" == '--all' ]]; then + clang_format_all +else + clang_format_changed +fi +echo 'vLLM clang-format: Done' + +echo 'vLLM actionlint:' +tools/actionlint.sh -color +echo 'vLLM actionlint: Done' + +echo 'vLLM shellcheck:' +tools/shellcheck.sh +echo 'vLLM shellcheck: Done' + +echo 'excalidraw png check:' +tools/png-lint.sh +echo 'excalidraw png check: Done' + +if ! git diff --quiet &>/dev/null; then + echo + echo "πŸ”πŸ”There are files changed by the format checker or by you that are not added and committed:" + git --no-pager diff --name-only + echo "πŸ”πŸ”Format checker passed, but please add, commit and push all the files above to include changes made by the format checker." + + exit 1 +else + echo "βœ¨πŸŽ‰ Format check passed! Congratulations! πŸŽ‰βœ¨" +fi + +# echo 'vLLM sphinx-lint:' +# tools/sphinx-lint.sh +# echo 'vLLM sphinx-lint: Done' diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 000000000..b627e7f51 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,14 @@ +[mypy] +; warn_return_any = True +warn_unused_configs = True + +; Suppress all missing import errors from torch_npu for mypy. +[mypy-torch_npu.*] +ignore_missing_imports = True + +[mypy-transformers.*] +ignore_missing_imports = True + +; Remove this after https://github.com/vllm-project/vllm/pull/11324 merged +[mypy-vllm.distributed.device_communicators.base_communicator] +ignore_missing_imports = True diff --git a/packages.txt b/packages.txt new file mode 100644 index 000000000..c6490115b --- /dev/null +++ b/packages.txt @@ -0,0 +1,3 @@ +git +vim + diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 000000000..60a78830d --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +-r requirements-lint.txt +modelscope +pytest diff --git a/requirements-lint.txt b/requirements-lint.txt new file mode 100644 index 000000000..711bb50a0 --- /dev/null +++ b/requirements-lint.txt @@ -0,0 +1,15 @@ +# formatting +yapf==0.32.0 +toml==0.10.2 +tomli==2.0.2 +ruff==0.6.5 +codespell==2.3.0 +isort==5.13.2 +clang-format==18.1.5 +sphinx-lint==1.0.0 + +# type checking +mypy==1.11.1 +types-PyYAML +types-requests +types-setuptools diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..51cb33f2b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +decorator +pyyaml +scipy +setuptools +torch_npu == 2.5.1rc1 diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..4aaab9907 --- /dev/null +++ b/setup.py @@ -0,0 +1,95 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/blob/main/setup.py +# Copyright 2023 The vLLM team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +from typing import List + +from setuptools import setup + +ROOT_DIR = os.path.dirname(__file__) + + +def get_path(*filepath) -> str: + return os.path.join(ROOT_DIR, *filepath) + + +def read_readme() -> str: + """Read the README file if present.""" + p = get_path("README.md") + if os.path.isfile(p): + with open(get_path("README.md"), encoding="utf-8") as f: + return f.read() + else: + return "" + + +def get_requirements() -> List[str]: + """Get Python package dependencies from requirements.txt.""" + + def _read_requirements(filename: str) -> List[str]: + with open(get_path(filename)) as f: + requirements = f.read().strip().split("\n") + resolved_requirements = [] + for line in requirements: + if line.startswith("-r "): + resolved_requirements += _read_requirements(line.split()[1]) + elif line.startswith("--"): + continue + else: + resolved_requirements.append(line) + return resolved_requirements + + try: + requirements = _read_requirements("requirements.txt") + except ValueError: + print("Failed to read requirements.txt in vllm_ascend.") + return requirements + + +setup( + name='vllm_ascend', + # Follow: + # https://packaging.python.org/en/latest/specifications/version-specifiers + version='0.1.0a1', + author="vLLM-Ascend team", + license="Apache 2.0", + description=("vLLM Ascend backend plugin"), + long_description=read_readme(), + long_description_content_type="text/markdown", + url="https://github.com/vllm-project/vllm-ascend", + project_urls={ + "Homepage": "https://github.com/vllm-project/vllm-ascend", + }, + classifiers=[ + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "License :: OSI Approved :: Apache Software License", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Scientific/Engineering :: Information Analysis", + ], + packages=['vllm_ascend'], + python_requires=">=3.9", + install_requires=get_requirements(), + extras_require={}, + entry_points={'vllm.platform_plugins': ["ascend = vllm_ascend:register"]}) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..3a593e45e --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,331 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/blob/main/tests/conftest.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import List, Optional, Tuple, TypeVar, Union + +import numpy as np +import pytest +from PIL import Image +from vllm import LLM, SamplingParams +from vllm.config import TaskOption +from vllm.distributed import cleanup_dist_env_and_memory +from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt +from vllm.logger import init_logger +from vllm.outputs import RequestOutput +from vllm.sampling_params import BeamSearchParams +from vllm.utils import is_list_of + +from tests.model_utils import (TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs) + +logger = init_logger(__name__) + +_M = TypeVar("_M") +_PromptMultiModalInput = Union[List[_M], List[List[_M]]] + +PromptImageInput = _PromptMultiModalInput[Image.Image] +PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]] +PromptVideoInput = _PromptMultiModalInput[np.ndarray] + + +class VllmRunner: + + def __init__( + self, + model_name: str, + task: TaskOption = "auto", + tokenizer_name: Optional[str] = None, + tokenizer_mode: str = "auto", + # Use smaller max model length, otherwise bigger model cannot run due + # to kv cache size limit. + max_model_len: int = 1024, + dtype: str = "half", + disable_log_stats: bool = True, + tensor_parallel_size: int = 1, + block_size: int = 16, + enable_chunked_prefill: bool = False, + swap_space: int = 4, + enforce_eager: Optional[bool] = False, + **kwargs, + ) -> None: + self.model = LLM( + model=model_name, + task=task, + tokenizer=tokenizer_name, + tokenizer_mode=tokenizer_mode, + trust_remote_code=True, + dtype=dtype, + swap_space=swap_space, + enforce_eager=enforce_eager, + disable_log_stats=disable_log_stats, + tensor_parallel_size=tensor_parallel_size, + max_model_len=max_model_len, + block_size=block_size, + enable_chunked_prefill=enable_chunked_prefill, + **kwargs, + ) + + def get_inputs( + self, + prompts: List[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[TextPrompt]: + if images is not None: + assert len(prompts) == len(images) + + if videos is not None: + assert len(prompts) == len(videos) + + if audios is not None: + assert len(prompts) == len(audios) + + inputs = [TextPrompt(prompt=prompt) for prompt in prompts] + if images is not None: + for i, image in enumerate(images): + if image is not None: + inputs[i]["multi_modal_data"] = {"image": image} + + if videos is not None: + for i, video in enumerate(videos): + if video is not None: + inputs[i]["multi_modal_data"] = {"video": video} + + if audios is not None: + for i, audio in enumerate(audios): + if audio is not None: + inputs[i]["multi_modal_data"] = {"audio": audio} + + return inputs + + def generate( + self, + prompts: List[str], + sampling_params: SamplingParams, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[Tuple[List[List[int]], List[str]]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.generate(inputs, + 
sampling_params=sampling_params) + + outputs: List[Tuple[List[List[int]], List[str]]] = [] + for req_output in req_outputs: + prompt_str = req_output.prompt + prompt_ids = req_output.prompt_token_ids + req_sample_output_ids: List[List[int]] = [] + req_sample_output_strs: List[str] = [] + for sample in req_output.outputs: + output_str = sample.text + output_ids = list(sample.token_ids) + req_sample_output_ids.append(prompt_ids + output_ids) + req_sample_output_strs.append(prompt_str + output_str) + outputs.append((req_sample_output_ids, req_sample_output_strs)) + return outputs + + @staticmethod + def _final_steps_generate_w_logprobs( + req_outputs: List[RequestOutput], + ) -> List[TokensTextLogprobsPromptLogprobs]: + outputs: List[TokensTextLogprobsPromptLogprobs] = [] + for req_output in req_outputs: + assert len(req_output.outputs) > 0 + for sample in req_output.outputs: + output_str = sample.text + output_ids = list(sample.token_ids) + output_logprobs = sample.logprobs + outputs.append((output_ids, output_str, output_logprobs, + req_output.prompt_logprobs)) + return outputs + + def generate_w_logprobs( + self, + prompts: List[str], + sampling_params: SamplingParams, + images: Optional[PromptImageInput] = None, + audios: Optional[PromptAudioInput] = None, + videos: Optional[PromptVideoInput] = None, + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.generate(inputs, + sampling_params=sampling_params) + + toks_str_logsprobs_prompt_logprobs = ( + self._final_steps_generate_w_logprobs(req_outputs)) + # Omit prompt logprobs if not required by sampling params + return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] + if sampling_params.prompt_logprobs is None else + toks_str_logsprobs_prompt_logprobs) + + def generate_encoder_decoder_w_logprobs( + self, + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + sampling_params: SamplingParams, + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: + ''' + Logprobs generation for vLLM encoder/decoder models + ''' + + assert sampling_params.logprobs is not None + req_outputs = self.model.generate(encoder_decoder_prompts, + sampling_params=sampling_params) + toks_str_logsprobs_prompt_logprobs = ( + self._final_steps_generate_w_logprobs(req_outputs)) + # Omit prompt logprobs if not required by sampling params + return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] + if sampling_params.prompt_logprobs is None else + toks_str_logsprobs_prompt_logprobs) + + def generate_greedy( + self, + prompts: List[str], + max_tokens: int, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[Tuple[List[int], str]]: + greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) + outputs = self.generate(prompts, + greedy_params, + images=images, + videos=videos, + audios=audios) + return [(output_ids[0], output_str[0]) + for output_ids, output_str in outputs] + + def generate_greedy_logprobs( + self, + prompts: List[str], + max_tokens: int, + num_logprobs: int, + num_prompt_logprobs: Optional[int] = None, + images: Optional[PromptImageInput] = None, + audios: Optional[PromptAudioInput] = None, + videos: Optional[PromptVideoInput] = None, + stop_token_ids: Optional[List[int]] = None, + stop: Optional[List[str]] = None, + ) -> Union[List[TokensTextLogprobs], + 
List[TokensTextLogprobsPromptLogprobs]]: + greedy_logprobs_params = SamplingParams( + temperature=0.0, + max_tokens=max_tokens, + logprobs=num_logprobs, + prompt_logprobs=num_prompt_logprobs, + stop_token_ids=stop_token_ids, + stop=stop) + + return self.generate_w_logprobs(prompts, + greedy_logprobs_params, + images=images, + audios=audios, + videos=videos) + + def generate_encoder_decoder_greedy_logprobs( + self, + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + max_tokens: int, + num_logprobs: int, + num_prompt_logprobs: Optional[int] = None, + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: + greedy_logprobs_params = SamplingParams( + temperature=0.0, + max_tokens=max_tokens, + logprobs=num_logprobs, + prompt_logprobs=(num_prompt_logprobs), + ) + ''' + Greedy logprobs generation for vLLM encoder/decoder models + ''' + + return self.generate_encoder_decoder_w_logprobs( + encoder_decoder_prompts, greedy_logprobs_params) + + def generate_beam_search( + self, + prompts: Union[List[str], List[List[int]]], + beam_width: int, + max_tokens: int, + ) -> List[Tuple[List[List[int]], List[str]]]: + if is_list_of(prompts, str, check="all"): + prompts = [TextPrompt(prompt=prompt) for prompt in prompts] + else: + prompts = [ + TokensPrompt(prompt_token_ids=tokens) for tokens in prompts + ] + outputs = self.model.beam_search( + prompts, + BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) + returned_outputs = [] + for output in outputs: + token_ids = [x.tokens for x in output.sequences] + texts = [x.text for x in output.sequences] + returned_outputs.append((token_ids, texts)) + return returned_outputs + + def classify(self, prompts: List[str]) -> List[List[float]]: + req_outputs = self.model.classify(prompts) + return [req_output.outputs.probs for req_output in req_outputs] + + def encode( + self, + prompts: List[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[List[float]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.embed(inputs) + return [req_output.outputs.embedding for req_output in req_outputs] + + def score( + self, + text_1: Union[str, List[str]], + text_2: Union[str, List[str]], + ) -> List[float]: + req_outputs = self.model.score(text_1, text_2) + return [req_output.outputs.score for req_output in req_outputs] + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + del self.model + cleanup_dist_env_and_memory() + + +@pytest.fixture(scope="session") +def vllm_runner(): + return VllmRunner diff --git a/tests/model_utils.py b/tests/model_utils.py new file mode 100644 index 000000000..1b9eadccd --- /dev/null +++ b/tests/model_utils.py @@ -0,0 +1,303 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/blob/main/tests/models/utils.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import warnings +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import torch +from vllm.config import ModelConfig, TaskOption +from vllm.inputs import InputContext +from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs + +TokensText = Tuple[List[int], str] + + +def check_outputs_equal( + *, + outputs_0_lst: Sequence[TokensText], + outputs_1_lst: Sequence[TokensText], + name_0: str, + name_1: str, +): + """ + Compare the two sequences generated by different models, + which should be equal. + """ + assert len(outputs_0_lst) == len(outputs_1_lst) + + for prompt_idx, (outputs_0, + outputs_1) in enumerate(zip(outputs_0_lst, + outputs_1_lst)): + output_ids_0, output_str_0 = outputs_0 + output_ids_1, output_str_1 = outputs_1 + + # The text and token outputs should exactly match + fail_msg = (f"Test{prompt_idx}:" + f"\n{name_0}:\t{output_str_0!r}" + f"\n{name_1}:\t{output_str_1!r}") + + assert output_str_0 == output_str_1, fail_msg + assert output_ids_0 == output_ids_1, fail_msg + + +# Representation of generated sequence as a tuple of +# * Token ID list +# * String +# * List of top sample logprobs for each sampled token +# +# Assumes prompt logprobs were not requested. +TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int, + float]], + SampleLogprobs]]] + +# Allow for tokens to be represented as str's rather than IDs; +# tuple of +# * Token string representations list +# * String +# * Optional list of top sample logprobs for each sampled token +# +# Assumes prompt logprobs were not requested. +TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]], + List[Dict[str, + Logprob]]]]] + +# Representation of generated sequence as a tuple of +# * Token ID list +# * String +# * Optional list of top sample logprobs for each sampled token +# * Optional list of top prompt logprobs for each prompt token +# +# Allows prompt logprobs to be requested. +TokensTextLogprobsPromptLogprobs = Tuple[ + List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]], + Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]] + + +def check_logprobs_close( + *, + outputs_0_lst: Sequence[Union[TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs, + TextTextLogprobs]], + outputs_1_lst: Sequence[Union[TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs, + TextTextLogprobs]], + name_0: str, + name_1: str, + num_outputs_0_skip_tokens: int = 0, + warn_on_mismatch: bool = True, + always_check_logprobs: bool = False, +) -> None: + """Compare the logprobs of two sequences generated by different models, + which should be similar but not necessarily equal. + + How sample logprobs are compared: + * `always_check_logprobs == True`: set of highest-logprob token ids + must match between seq0 and seq1 at all sampled token offsets + * `always_check_logprobs == False`: highest-logprob token ids are + only compared at sampled token offsets for which generated token + ids don't match + + Prompt logprobs must be provided either for both input sequences, or + for neither. If prompt logprobs are provided, then highest-logprob + prompt token ids must match between seq0 and seq1 at all prompt token + offsets. 
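+
+    Illustrative example for the sampled-token check: if seq0 samples token
+    id 42 at an offset where seq1 samples token id 17, the comparison at that
+    offset passes only if 42 appears among seq1's top logprob token ids and
+    17 appears among seq0's top logprob token ids.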
+ + Args: + outputs_0_lst: First sequence to compare + outputs_0_lst: Second sequence to compare + name_0: sequence #0 name + name_1: sequence #1 name + num_outputs_0_skip_tokens: If > 0, specifies the number of initial + sequence #0 tokens & logprobs to discard + before comparison, i.e. all + of sequence #1 will be compared to + sequence #0 beginning at index + num_outputs_0_skip_tokens + warn_on_mismatch: Issue a warning if there is token-wise or text-wise + mismatch between the two sequences + always_check_logprobs: If true, check logprobs even when tokens match + """ + assert len(outputs_0_lst) == len(outputs_1_lst) + + # Loop through responses to each prompt. + for prompt_idx, (outputs_0, + outputs_1) in enumerate(zip(outputs_0_lst, + outputs_1_lst)): + assert len(outputs_0) == len(outputs_1) + if len(outputs_0) == 3: + assert len(outputs_1) == 3 + # Break out tokens, text & sample logprobs + # (prompt logprobs were not provided) + output_ids_0, output_str_0, logprobs_0 = outputs_0 + output_ids_1, output_str_1, logprobs_1 = outputs_1 + elif len(outputs_0) == 4: + assert len(outputs_1) == 4 + # Break out tokens, text, sample logprobs & prompt logprobs + ( + output_ids_0, + output_str_0, + logprobs_0, + prompt_logprobs_0, + ) = outputs_0 + ( + output_ids_1, + output_str_1, + logprobs_1, + prompt_logprobs_1, + ) = outputs_1 + + # Test prompt logprobs closeness + if (prompt_logprobs_0 is not None + and prompt_logprobs_1 is not None): + # Both sequences' prompt logprobs lists are not `None`` + # (although individual list elements may be `None`); + # for each token's logprobs: + for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate( + zip(prompt_logprobs_0, prompt_logprobs_1)): + fail_msg = ( + f"Prompt logprobs test:" + f"\n{name_0}:\tPrompt index {idx}\t{logprobs_elem_0}" + f"\n{name_1}:\tPrompt index {idx}\t{logprobs_elem_1}") + + if logprobs_elem_0 is None: + # If the seq 0 token's logprobs are `None`, + # the seq 1 token's logprobs must be `None` + assert logprobs_elem_1 is None, fail_msg + else: + # If the seq 0 token's logprobs are not `None`, + # the seq 1 token's logprobs must not be `None` + assert logprobs_elem_1 is not None, fail_msg + # Logprobs check: top-k token choices must be the same + assert (set(logprobs_elem_0.keys()) == set( + logprobs_elem_1.keys())), fail_msg + else: + # Both sequence logprobs lists must be `None` + fail_msg = (f"Prompt logprobs test:" + f"\n{name_0}:\tlogprobs\t{prompt_logprobs_0}" + f"\n{name_1}:\tlogprobs\t{prompt_logprobs_1}") + + assert (prompt_logprobs_0 is None + and prompt_logprobs_1 is None), fail_msg + else: + raise ValueError(f"Outputs tuple must have 3 or 4 elements but " + f"{len(outputs_0)} elements were provided: " + f"{outputs_0}") + + if logprobs_0 is None: + logprobs_0 = [None] * len(output_ids_0) + if logprobs_1 is None: + logprobs_1 = [None] * len(output_ids_1) + + # Skip specified number of initial sequence #0 tokens + # & logprobs, leaving output text as-is for simplicity + # (text mismatches may generate warnings but do not + # cause the test to fail.) + if num_outputs_0_skip_tokens < 0: + raise ValueError("num_outputs_0_skip_tokens must be non-negative") + output_ids_0 = output_ids_0[num_outputs_0_skip_tokens:] + logprobs_0 = logprobs_0[num_outputs_0_skip_tokens:] + + # Loop through generated tokens. 
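+        # Illustrative example (assumed values): with
+        # num_outputs_0_skip_tokens = 2 and output_ids_0 = [5, 7, 11, 13],
+        # the slice above leaves [11, 13], so seq #1's first generated token
+        # is compared against what was originally seq #0's third token.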
+ for idx, (output_id_0, + output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): + + is_tok_mismatch = output_id_0 != output_id_1 + + # If generated tokens don't match + # or it is desired to always check logprobs, + # then + if is_tok_mismatch or always_check_logprobs: + logprobs_elem_0 = logprobs_0[idx] + logprobs_elem_1 = logprobs_1[idx] + + # Each predicted token must be in top N logprobs of the other + fail_msg = ( + f"Test{prompt_idx}:" + f"\nMatched tokens:\t{output_ids_0[:idx]}" + f"\n{name_0}:\t{output_str_0!r}\t{logprobs_elem_0}" + f"\n{name_1}:\t{output_str_1!r}\t{logprobs_elem_1}") + + assert logprobs_elem_0 is not None, fail_msg + assert logprobs_elem_1 is not None, fail_msg + assert output_id_0 in logprobs_elem_1, fail_msg + assert output_id_1 in logprobs_elem_0, fail_msg + + if warn_on_mismatch and is_tok_mismatch: + with warnings.catch_warnings(): + # This ensures that repeated warnings are shown + # in the output, not just the first occurrence + warnings.simplefilter("always") + + warnings.warn(fail_msg, stacklevel=2) + + # Break out since sequences will now diverge. + break + else: + if output_str_0 != output_str_1 and warn_on_mismatch: + # The token outputs exactly match, + # so the text outputs should exactly match as well + fail_msg = (f"Test{prompt_idx}:" + f"\n{name_0}:\t{output_str_0!r}" + f"\n{name_1}:\t{output_str_1!r}") + + with warnings.catch_warnings(): + # This ensures that repeated warnings are shown + # in the output, not just the first occurrence + warnings.simplefilter("always") + + warnings.warn(fail_msg, stacklevel=2) + + +def build_model_context(model_name: str, + task: TaskOption = "auto", + tokenizer_name: Optional[str] = None, + trust_remote_code: bool = False, + dtype: Optional[Union[str, torch.dtype]] = None, + mm_processor_kwargs: Optional[Dict] = None, + limit_mm_per_prompt: Optional[Dict] = None): + """Creates an InputContext for a given model. + + Args: + model_name: Name of the model being considered. + tokenizer_name: Name of the tokenizer being considered. + trust_remote_code: Whether or not to allow loading remote code. + mm_processor_kwargs: optional processor kwargs for to be leveraged + in the input processor, mapper, dummy data creation, etc. + limit_mm_per_prompt: Multimodal limits. + + Returns: + InputContext for the model being considered. + """ + if tokenizer_name is None: + tokenizer_name = model_name + if dtype is None: + dtype = "half" + + model_config = ModelConfig( + model_name, + task=task, + tokenizer=tokenizer_name, + tokenizer_mode="auto", + trust_remote_code=trust_remote_code, + dtype=dtype, + seed=0, + mm_processor_kwargs=mm_processor_kwargs, + limit_mm_per_prompt=limit_mm_per_prompt, + ) + return InputContext(model_config) diff --git a/tests/test_offline_inference.py b/tests/test_offline_inference.py new file mode 100644 index 000000000..484bce63c --- /dev/null +++ b/tests/test_offline_inference.py @@ -0,0 +1,61 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""Run short greedy-sampling checks of vLLM on the Ascend backend.
+
+Run `pytest tests/test_offline_inference.py`.
+"""
+import os
+
+import pytest
+import vllm  # noqa: F401
+from conftest import VllmRunner
+
+import vllm_ascend  # noqa: F401
+
+MODELS = [
+    "Qwen/Qwen2.5-0.5B-Instruct",
+]
+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+
+TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4")
+
+
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("dtype", ["half", "float16"])
+@pytest.mark.parametrize("max_tokens", [5])
+def test_models(
+    model: str,
+    dtype: str,
+    max_tokens: int,
+) -> None:
+    os.environ["VLLM_ATTENTION_BACKEND"] = "ASCEND"
+
+    # Build a long prompt (a comma-separated run of 1024 numbers) so the test
+    # exercises prefill across many KV-cache blocks within the 8192-token
+    # context configured below.
+    prompt = "The following numbers of the sequence " + ", ".join(
+        str(i) for i in range(1024)) + " are:"
+    example_prompts = [prompt]
+
+    with VllmRunner(model,
+                    max_model_len=8192,
+                    dtype=dtype,
+                    enforce_eager=False,
+                    gpu_memory_utilization=0.7) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tools/actionlint.sh b/tools/actionlint.sh
new file mode 100755
index 000000000..72a10b18f
--- /dev/null
+++ b/tools/actionlint.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/tools
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+if command -v actionlint &> /dev/null; then
+    # NOTE: skip .github/workflows/vllm_ascend_test.yaml because the self-hosted runner `npu-arm64` is unknown to actionlint
+    actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
+    exit 0
+elif [ -x ./actionlint ]; then
+    ./actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
+    exit 0
+fi
+
+# download a binary to the current directory - v1.7.3
+bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash)
+./actionlint .github/workflows/*.yml .github/workflows/mypy.yaml
diff --git a/tools/check_repo.sh b/tools/check_repo.sh
new file mode 100644
index 000000000..e86d0f110
--- /dev/null
+++ b/tools/check_repo.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# This file is a part of the vllm-ascend project.
+# Adapted from https://github.com/vllm-project/vllm/tree/main/tools
+# Copyright 2023 The vLLM team.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time) + +if ! git diff --quiet; then + echo "Repo is dirty" >&2 + + exit 1 +fi + +if ! git describe --tags; then + echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2 + + exit 1 +fi diff --git a/tools/mypy.sh b/tools/mypy.sh new file mode 100755 index 000000000..fcb7c0e4b --- /dev/null +++ b/tools/mypy.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +CI=${1:-0} +PYTHON_VERSION=${2:-3.9} + +if [ "$CI" -eq 1 ]; then + set -e +fi + +run_mypy() { + echo "Running mypy on $1" + if [ "$CI" -eq 1 ] && [ -z "$1" ]; then + mypy --python-version "${PYTHON_VERSION}" "$@" + return + fi + mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" +} + +run_mypy vllm_ascend +run_mypy examples +run_mypy tests diff --git a/tools/npu-vllm-test.sh b/tools/npu-vllm-test.sh new file mode 100644 index 000000000..17c7a4d43 --- /dev/null +++ b/tools/npu-vllm-test.sh @@ -0,0 +1,422 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +set -o pipefail + +TEST_DIR="./vllm-empty/tests" +TEST_FILES=( + test_sequence.py + # test_utils.py + # test_config.py + test_cache_block_hashing.py + # test_scalartype.py + # test_embedded_commit.py + # test_inputs.py + # test_sharded_state_loader.py + test_sampling_params.py + # test_logger.py + # test_logits_processor.py + # test_regression.py + # prefix_caching/test_prefix_caching.py + # prefix_caching/test_disable_sliding_window.py + # weight_loading/test_weight_loading.py + # samplers/test_beam_search.py + # samplers/test_typical_acceptance_sampler.py + # samplers/test_no_bad_words.py + # samplers/test_rejection_sampler.py + # samplers/test_ignore_eos.py + # samplers/test_ranks.py + # samplers/test_logits_processor.py + # samplers/test_sampler.py + # samplers/test_seeded_generate.py + # samplers/test_logprobs.py + # kernels/test_encoder_decoder_attn.py + # kernels/test_rotary_embedding.py + # kernels/test_prefix_prefill.py + # kernels/test_flashinfer.py + # kernels/utils.py + # kernels/test_machete_mm.py + # kernels/test_flash_attn.py + # kernels/test_awq.py + # kernels/test_blocksparse_attention.py + # kernels/test_utils.py + # kernels/test_aqlm.py + # kernels/test_cutlass.py + # kernels/test_causal_conv1d.py + # kernels/test_marlin_gemm.py + # kernels/test_layernorm.py + # kernels/test_pos_encoding.py + # kernels/test_moe.py + # kernels/test_awq_marlin.py + # kernels/test_int8_quant.py + # kernels/test_gptq.py + # kernels/test_attention.py + # kernels/test_activation.py + # kernels/quant_utils.py + # kernels/test_permute_cols.py + # kernels/test_triton_scaled_mm.py + # kernels/test_gguf.py + # kernels/test_awq_triton.py + # kernels/test_attention_selector.py + # kernels/test_ggml.py + # kernels/test_mamba_ssm.py + # kernels/test_fused_quant_layernorm.py + # kernels/test_fp8_quant.py + # kernels/test_cascade_flash_attn.py + # kernels/conftest.py + # kernels/allclose_default.py + # kernels/test_block_fp8.py + # kernels/test_cache.py + # kernels/test_semi_structured.py + # quantization/test_quark.py + # quantization/test_compressed_tensors.py + # quantization/utils.py + # quantization/test_experts_int8.py + # quantization/test_lm_head.py + # quantization/test_ipex_quant.py + # quantization/test_bitsandbytes.py + # quantization/test_cpu_offload.py + # quantization/test_fp8.py + # quantization/test_configs.py + # tool_use/test_tool_calls.py + # tool_use/utils.py + # tool_use/test_chat_completions.py + # tool_use/test_jamba_tool_parser.py + # tool_use/test_chat_completion_request_validations.py + # tool_use/conftest.py + # tool_use/test_parallel_tool_calls.py + # runai_model_streamer/test_runai_model_streamer_loader.py + # runai_model_streamer/test_weight_utils.py + # kv_transfer/test_lookup_buffer.sh + # kv_transfer/test_send_recv.py + # kv_transfer/test_send_recv.sh + # kv_transfer/test_lookup_buffer.py + # kv_transfer/module_test.py + # kv_transfer/disagg_test.py + # plugins/vllm_add_dummy_platform/setup.py + # plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py + # plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py + # plugins/vllm_add_dummy_model/setup.py + # plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py + # plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py + # plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py + # prompt_adapter/test_multi_adapter_inference.py + # prompt_adapter/test_pa_lora.py + # prompt_adapter/test_bloom.py + # compile/test_pass_manager.py + # 
compile/utils.py + # compile/test_wrapper.py + # compile/test_fusion.py + # compile/backend.py + # compile/test_full_graph.py + # compile/test_basic_correctness.py + # compile/test_functionalization.py + # compile/piecewise/test_simple.py + # compile/piecewise/test_toy_llama.py + # lora/test_punica_ops_variation.py + # lora/test_quant_model.py + # lora/test_lora_checkpoints.py + # lora/test_mixtral.py + # lora/test_qwen2vl.py + # lora/test_baichuan.py + # lora/utils.py + # lora/test_phi.py + # lora/test_utils.py + # lora/test_minicpmv_tp.py + # lora/test_layers.py + # lora/test_worker.py + # lora/test_jamba.py + # lora/test_tokenizer_group.py + # lora/test_lora_bias_e2e.py + # lora/test_chatglm3_tp.py + # lora/test_punica_ops_sizes.py + # lora/test_lora_manager.py + # lora/test_llama_tp.py + # lora/test_lora_huggingface.py + # lora/test_long_context.py + # lora/test_gemma.py + # lora/conftest.py + # lora/data/long_context_test_data.py + # models/registry.py + # models/utils.py + # models/test_registry.py + # models/test_initialization.py + # models/test_oot_registration.py + # models/multimodal/processing/test_internvl.py + # models/multimodal/processing/test_llava_next.py + # models/multimodal/processing/test_idefics3.py + # models/multimodal/processing/test_qwen2_vl.py + # models/multimodal/processing/test_phi3v.py + # models/multimodal/processing/test_common.py + # models/multimodal/processing/test_qwen.py + # models/multimodal/processing/test_llava_onevision.py + # models/encoder_decoder/language/test_bart.py + # models/encoder_decoder/audio_language/test_whisper.py + # models/encoder_decoder/vision_language/test_broadcast.py + # models/encoder_decoder/vision_language/test_florence2.py + # models/encoder_decoder/vision_language/test_mllama.py + # models/decoder_only/language/test_models.py + # models/decoder_only/language/test_gptq_marlin.py + # models/decoder_only/language/test_granite.py + # models/decoder_only/language/test_modelopt.py + # models/decoder_only/language/test_phimoe.py + # models/decoder_only/language/test_aqlm.py + # models/decoder_only/language/test_mistral.py + # models/decoder_only/language/test_jamba.py + # models/decoder_only/language/test_gptq_marlin_24.py + # models/decoder_only/language/test_mamba.py + # models/decoder_only/language/test_gguf.py + # models/decoder_only/language/test_fp8.py + # models/decoder_only/audio_language/test_ultravox.py + # models/decoder_only/vision_language/test_models.py + # models/decoder_only/vision_language/test_awq.py + # models/decoder_only/vision_language/test_intern_vit.py + # models/decoder_only/vision_language/test_qwen2_vl.py + # models/decoder_only/vision_language/test_pixtral.py + # models/decoder_only/vision_language/test_phi3v.py + # models/decoder_only/vision_language/test_h2ovl.py + # models/decoder_only/vision_language/vlm_utils/types.py + # models/decoder_only/vision_language/vlm_utils/model_utils.py + # models/decoder_only/vision_language/vlm_utils/runners.py + # models/decoder_only/vision_language/vlm_utils/core.py + # models/decoder_only/vision_language/vlm_utils/custom_inputs.py + # models/decoder_only/vision_language/vlm_utils/case_filtering.py + # models/decoder_only/vision_language/vlm_utils/builders.py + # models/embedding/utils.py + # models/embedding/language/test_scoring.py + # models/embedding/language/test_gritlm.py + # models/embedding/language/test_cls_models.py + # models/embedding/language/test_embedding.py + # models/embedding/vision_language/test_llava_next.py + # 
models/embedding/vision_language/test_dse_qwen2_vl.py + # models/embedding/vision_language/test_phi3v.py + # multimodal/utils.py + # multimodal/test_processor_kwargs.py + # multimodal/test_utils.py + # multimodal/test_inputs.py + # multimodal/test_processing.py + # standalone_tests/python_only_compile.sh + # standalone_tests/lazy_torch_compile.py + # async_engine/test_async_llm_engine.py + # async_engine/api_server_async_engine.py + # async_engine/test_api_server.py + # async_engine/test_request_tracker.py + # mq_llm_engine/utils.py + # mq_llm_engine/test_load.py + # mq_llm_engine/test_abort.py + # mq_llm_engine/test_error_handling.py + # tokenization/test_tokenizer.py + # tokenization/test_tokenizer_group.py + # tokenization/test_get_eos.py + # tokenization/test_cached_tokenizer.py + # tokenization/test_detokenize.py + # core/utils.py + # core/test_chunked_prefill_scheduler.py + # core/test_serialization.py + # core/test_num_computed_tokens_update.py + # core/test_scheduler_encoder_decoder.py + # core/test_scheduler.py + # core/block/test_cpu_gpu_block_allocator.py + # core/block/test_prefix_caching_block.py + # core/block/test_common.py + # core/block/test_block_table.py + # core/block/test_block_manager.py + # core/block/conftest.py + # core/block/test_naive_block.py + # core/block/e2e/test_correctness.py + # core/block/e2e/test_correctness_sliding_window.py + # core/block/e2e/conftest.py + # tracing/test_tracing.py + # engine/test_arg_utils.py + # engine/test_detokenization.py + # engine/test_short_mm_context.py + # engine/test_custom_executor.py + # engine/test_multiproc_workers.py + # engine/test_computed_prefix_blocks.py + # engine/test_stop_reason.py + # engine/test_skip_tokenizer_init.py + # engine/test_stop_strings.py + # engine/output_processor/test_stop_checker.py + # engine/output_processor/test_multi_step.py + # tensorizer_loader/test_tensorizer.py + # tensorizer_loader/conftest.py + # entrypoints/test_chat_utils.py + # entrypoints/conftest.py + # entrypoints/llm/test_lazy_outlines.py + # entrypoints/llm/test_generate_multiple_loras.py + # entrypoints/llm/test_encode.py + # entrypoints/llm/test_init.py + # entrypoints/llm/test_guided_generate.py + # entrypoints/llm/test_gpu_utilization.py + # entrypoints/llm/test_chat.py + # entrypoints/llm/test_accuracy.py + # entrypoints/llm/test_prompt_validation.py + # entrypoints/llm/test_generate.py + # entrypoints/offline_mode/test_offline_mode.py + # entrypoints/openai/test_completion.py + # entrypoints/openai/test_models.py + # entrypoints/openai/test_chat_echo.py + # entrypoints/openai/test_score.py + # entrypoints/openai/test_tokenization.py + # entrypoints/openai/test_cli_args.py + # entrypoints/openai/test_chunked_prompt.py + # entrypoints/openai/test_encoder_decoder.py + # entrypoints/openai/test_chat_template.py + # entrypoints/openai/test_oot_registration.py + # entrypoints/openai/test_run_batch.py + # entrypoints/openai/test_metrics.py + # entrypoints/openai/test_vision_embedding.py + # entrypoints/openai/test_embedding.py + # entrypoints/openai/test_lora_adapters.py + # entrypoints/openai/test_video.py + # entrypoints/openai/test_serving_models.py + # entrypoints/openai/test_chat.py + # entrypoints/openai/test_pooling.py + # entrypoints/openai/test_basic.py + # entrypoints/openai/test_accuracy.py + # entrypoints/openai/test_prompt_validation.py + # entrypoints/openai/test_vision.py + # entrypoints/openai/test_audio.py + # entrypoints/openai/test_async_tokenization.py + # entrypoints/openai/test_return_tokens_as_ids.py + # 
entrypoints/openai/test_serving_chat.py + # entrypoints/openai/test_shutdown.py + # entrypoints/openai/test_root_path.py + # entrypoints/openai/tool_parsers/utils.py + # entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py + # model_executor/weight_utils.py + # model_executor/test_enabled_custom_ops.py + # model_executor/test_guided_processors.py + # model_executor/test_model_load_with_params.py + # model_executor/conftest.py + # metrics/test_metrics.py + # system_messages/sonnet3.5_nov2024.txt + # encoder_decoder/test_e2e_correctness.py + # v1/core/test_kv_cache_utils.py + # v1/core/test_prefix_caching.py + # v1/sample/test_sampler.py + # v1/engine/test_engine_core.py + # v1/engine/test_async_llm.py + # v1/engine/test_output_processor.py + # v1/engine/test_engine_args.py + # v1/engine/test_engine_core_client.py + # v1/e2e/test_cascade_attention.py + # v1/worker/test_gpu_input_batch.py + # spec_decode/utils.py + # spec_decode/test_utils.py + # spec_decode/test_ngram_worker.py + # spec_decode/test_metrics.py + # spec_decode/test_batch_expansion.py + # spec_decode/test_multi_step_worker.py + # spec_decode/test_scorer.py + # spec_decode/test_spec_decode_worker.py + # spec_decode/test_dynamic_spec_decode.py + # spec_decode/e2e/test_mlp_correctness.py + # spec_decode/e2e/test_ngram_correctness.py + # spec_decode/e2e/test_seed.py + # spec_decode/e2e/test_integration.py + # spec_decode/e2e/test_medusa_correctness.py + # spec_decode/e2e/test_integration_dist_tp4.py + # spec_decode/e2e/test_eagle_correctness.py + # spec_decode/e2e/test_compatibility.py + # spec_decode/e2e/test_multistep_correctness.py + # spec_decode/e2e/test_integration_dist_tp2.py + # spec_decode/e2e/conftest.py + # spec_decode/e2e/test_logprobs.py + # multi_step/test_correctness_async_llm.py + # multi_step/test_correctness_llm.py + # vllm_test_utils/setup.py + # vllm_test_utils/vllm_test_utils/blame.py + # vllm_test_utils/vllm_test_utils/monitor.py + # plugins_tests/test_platform_plugins.py + # tpu/test_compilation.py + # tpu/test_quantization_accuracy.py + # tpu/test_custom_dispatcher.py + # distributed/test_custom_all_reduce.py + # distributed/test_distributed_oot.py + # distributed/test_pipeline_parallel.py + # distributed/test_pynccl.py + # distributed/test_pipeline_partition.py + # distributed/test_utils.py + # distributed/test_pp_cudagraph.py + # distributed/test_ca_buffer_sharing.py + # distributed/test_multi_node_assignment.py + # distributed/test_same_node.py + # distributed/test_shm_broadcast.py + # distributed/test_comm_ops.py + # basic_correctness/test_chunked_prefill.py + # basic_correctness/test_preemption.py + # basic_correctness/test_cpu_offload.py + # basic_correctness/test_basic_correctness.py + # worker/test_model_runner.py + # worker/test_encoder_decoder_model_runner.py + # worker/test_swap.py + # worker/test_profile.py + # worker/test_model_input.py +) + +# print usage +usage() { + echo "Usage: $0 -t -t ..." + echo "Example: $0 -t test_inputs.py -t test_regression.py" + exit 1 +} + +# parse command line args +while getopts ":t:" opt; do + case ${opt} in + t) + TEST_FILES+=("${OPTARG}") + ;; + *) + usage + ;; + esac +done + +echo "------ Test vllm_ascend on vLLM native ut ------" + + +# check if the test scripts are specified +if [ ${#TEST_FILES[@]} -eq 0 ]; then + echo "Error: No test scripts specified." + usage +fi + + +# test all the specified ut +for test_file in "${TEST_FILES[@]}"; do + full_path="$TEST_DIR/$test_file" + if [ -f "$full_path" ]; then + echo "Running $test_file..." 
+ # Check if pytest ran successfully + if ! pytest -sv "$full_path" + then + echo "Error: $test_file failed." + exit 1 + fi + echo "Completed $test_file." + else + echo "Error: $test_file not found in $TEST_DIR." + exit 1 + fi +done + +echo "------ All specified tests completed -------" diff --git a/tools/png-lint.sh b/tools/png-lint.sh new file mode 100755 index 000000000..5eeb11eda --- /dev/null +++ b/tools/png-lint.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Ensure that *.excalidraw.png files have the excalidraw metadata +# embedded in them. This ensures they can be loaded back into +# the tool and edited in the future. + +find . -iname '*.excalidraw.png' | while read -r file; do + if git check-ignore -q "$file"; then + continue + fi + if ! grep -q "excalidraw+json" "$file"; then + echo "$file was not exported from excalidraw with 'Embed Scene' enabled." + exit 1 + fi +done diff --git a/tools/shellcheck.sh b/tools/shellcheck.sh new file mode 100755 index 000000000..d782af70a --- /dev/null +++ b/tools/shellcheck.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set -e + +scversion="stable" + +if [ -d "shellcheck-${scversion}" ]; then + PATH="$PATH:$(pwd)/shellcheck-${scversion}" + export PATH +fi + +if ! [ -x "$(command -v shellcheck)" ]; then + if [ "$(uname -s)" != "Linux" ] || [ "$(uname -m)" != "x86_64" ]; then + echo "Please install shellcheck: https://github.com/koalaman/shellcheck?tab=readme-ov-file#installing" + exit 1 + fi + + # automatic local install if linux x86_64 + wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv + PATH="$PATH:$(pwd)/shellcheck-${scversion}" + export PATH +fi + +# TODO - fix warnings in .buildkite/run-amd-test.sh +find . 
-name "*.sh" -not -path "./.buildkite/run-amd-test.sh" -print0 | xargs -0 -I {} sh -c 'git check-ignore -q "{}" || shellcheck "{}"' diff --git a/tools/sphinx-lint.sh b/tools/sphinx-lint.sh new file mode 100755 index 000000000..806408013 --- /dev/null +++ b/tools/sphinx-lint.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +sphinx-lint --disable trailing-whitespace,missing-final-newline docs diff --git a/vllm_ascend/__init__.py b/vllm_ascend/__init__.py new file mode 100644 index 000000000..80af5a525 --- /dev/null +++ b/vllm_ascend/__init__.py @@ -0,0 +1,21 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + + +def register(): + """Register the NPU platform.""" + return "vllm_ascend.platform.NPUPlatform" diff --git a/vllm_ascend/attention.py b/vllm_ascend/attention.py new file mode 100644 index 000000000..0693d44e3 --- /dev/null +++ b/vllm_ascend/attention.py @@ -0,0 +1,678 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/vllm/attention/backends +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import math +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type + +import torch + +try: + import torch_npu # noqa: F401 +except ImportError: + print("Failed to import torch_npu.") + +from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl, + AttentionLayer, + AttentionMetadata, AttentionType) +from vllm.attention.backends.utils import (PAD_SLOT_ID, CommonAttentionState, + CommonMetadataBuilder, + compute_slot_mapping_start_idx, + is_block_tables_empty) +from vllm.attention.ops.paged_attn import (PagedAttention, + PagedAttentionMetadata) + +if TYPE_CHECKING: + from vllm_ascend.model_runner import ModelInputForNPUBuilder + +SHARE_MASK_TRIL_PREFIX_CACHE = None +SHARE_MASK_TRIL = None + + +class AscendAttentionBackend(AttentionBackend): + + @staticmethod + def get_name() -> str: + return "ASCEND" + + @staticmethod + def get_impl_cls() -> Type["AscendAttentionBackendImpl"]: + return AscendAttentionBackendImpl + + @staticmethod + def get_metadata_cls() -> Type["AscendMetadata"]: + return AscendMetadata + + @staticmethod + def get_state_cls() -> Type["CommonAttentionState"]: + return CommonAttentionState + + @staticmethod + def get_kv_cache_shape( + num_blocks: int, + block_size: int, + num_kv_heads: int, + head_size: int, + ) -> Tuple[int, ...]: + return (2, num_blocks, block_size, num_kv_heads * head_size) + + @staticmethod + def swap_blocks( + src_kv_cache: List[torch.Tensor], + dst_kv_cache: List[torch.Tensor], + src_to_dst: torch.Tensor, + ) -> None: + src_key_cache, src_value_cache = src_kv_cache[0], src_kv_cache[1] + dst_key_cache, dst_value_cache = dst_kv_cache[0], dst_kv_cache[1] + src_indices = src_to_dst[:, 0] + dst_indices = src_to_dst[:, 1] + + dst_key_cache[dst_indices] = src_key_cache[src_indices].to( + dst_key_cache.device) + dst_value_cache[dst_indices] = src_value_cache[src_indices].to( + dst_key_cache.device) + + @staticmethod + def copy_blocks( + kv_caches: List[torch.Tensor], + src_to_dists: torch.Tensor, + ) -> None: + src_indices = src_to_dists[:, 0] + dst_indices = src_to_dists[:, 1] + + for kv_cache in kv_caches: + key_caches = kv_cache[0] + value_caches = kv_cache[1] + key_caches[dst_indices] = key_caches[src_indices] + value_caches[dst_indices] = value_caches[src_indices] + + @staticmethod + def get_builder_cls() -> Type["AscendMetadataBuilder"]: + return AscendMetadataBuilder + + @classmethod + def make_metadata_builder(cls, *args, **kwargs) -> "AscendMetadataBuilder": + return cls.get_builder_cls()(*args, **kwargs) + + +class AscendPagedAttention(PagedAttention): + + @staticmethod + def write_to_paged_cache( + key: torch.Tensor, + value: torch.Tensor, + key_cache: torch.Tensor, + value_cache: torch.Tensor, + slot_indices: torch.Tensor, + ) -> None: + torch_npu.npu_scatter_nd_update_(key_cache, slot_indices, key) + torch_npu.npu_scatter_nd_update_(value_cache, slot_indices, value) + + +@dataclass +class AscendMetadata(AttentionMetadata, PagedAttentionMetadata): + """Metadata for Ascendbackend. + * modified from XFormersbackend + NOTE: Any python object stored here is not updated when it is + cuda-graph replayed. If you have values that need to be changed + dynamically, it should be stored in tensor. The tensor has to be + updated from `CUDAGraphRunner.forward` API. 
+ """ + + # |---------- N-1 iteration --------| + # |---------------- N iteration ---------------------| + # |- tokenA -|......................|-- newTokens ---| + # |---------- context_len ----------| + # |-------------------- seq_len ----------------------| + # |-- query_len ---| + + # seq_lens stored as a tensor. + seq_lens_tensor: Optional[torch.Tensor] + + # FIXME: It is for flash attn. + # Maximum sequence length among prefill batch. 0 if there are decoding + # requests only. + max_prefill_seq_len: int + # Maximum sequence length among decode batch. 0 if there are prefill + # requests only. + max_decode_seq_len: int + + # Whether or not if cuda graph is enabled. + # Cuda-graph is currently enabled for decoding only. + # TODO(woosuk): Move `use_cuda_graph` out since it's unrelated to attention. + use_cuda_graph: bool + + # (batch_size,). The sequence length per sequence. Sequence length means + # the computed tokens + new tokens None if it is a decoding. + seq_lens: Optional[List[int]] = None + + # FIXME: It is for flash attn. + # (batch_size + 1,). The cumulative sequence lengths of the sequences in + # the batch, used to index into sequence. E.g., if the sequence length is + # [4, 6], it is [0, 4, 10]. + seq_start_loc: Optional[torch.Tensor] = None + + # (batch_size,) A tensor of context lengths (tokens that are computed + # so far). + context_lens_tensor: Optional[torch.Tensor] = None + + # Maximum query length in the batch. None for decoding. + max_query_len: Optional[int] = None + + # (batch_size + 1,). The cumulative subquery lengths of the sequences in + # the batch, used to index into subquery. E.g., if the subquery length + # is [4, 6], it is [0, 4, 10]. + query_start_loc: Optional[torch.Tensor] = None + + # Self-attention prefill/decode metadata cache + _cached_prefill_metadata: Optional["AscendMetadata"] = None + _cached_decode_metadata: Optional["AscendMetadata"] = None + + # Begin encoder attn & enc/dec cross-attn fields... 
+ + # Encoder sequence lengths representation + encoder_seq_lens: Optional[List[int]] = None + encoder_seq_lens_tensor: Optional[torch.Tensor] = None + + # Maximum sequence length among encoder sequences + max_encoder_seq_len: Optional[int] = None + + # Number of tokens input to encoder + num_encoder_tokens: Optional[int] = None + + attn_mask: Optional[torch.Tensor] = None + pse_shift: Optional[torch.Tensor] = None + sparse_mode: int = 0 + + # Cross-attention memory-mapping data structures: slot mapping + # and block tables + cross_slot_mapping: Optional[torch.Tensor] = None + cross_block_tables: Optional[torch.Tensor] = None + + # slot_mapping: Optional[torch.Tensor] = None + + @property + def prefill_metadata(self) -> Optional["AscendMetadata"]: + if self.num_prefills == 0: + return None + + if self._cached_prefill_metadata is not None: + # Recover cached prefill-phase attention + # metadata structure + return self._cached_prefill_metadata + + assert ((self.seq_lens is not None) + or (self.encoder_seq_lens is not None)) + assert ((self.seq_lens_tensor is not None) + or (self.encoder_seq_lens_tensor is not None)) + + # Compute some attn_metadata fields which default to None + query_start_loc = (None if self.query_start_loc is None else + self.query_start_loc[:self.num_prefills + 1]) + slot_mapping = (None if self.slot_mapping is None else + self.slot_mapping[:self.num_prefill_tokens]) + seq_lens = (None if self.seq_lens is None else + self.seq_lens[:self.num_prefills]) + seq_lens_tensor = (None if self.seq_lens_tensor is None else + self.seq_lens_tensor[:self.num_prefills]) + seq_start_loc = (None if self.seq_start_loc is None else + self.seq_start_loc[:self.num_prefills + 1]) + context_lens_tensor = (None if self.context_lens_tensor is None else + self.context_lens_tensor[:self.num_prefills]) + block_tables = (None if self.block_tables is None else + self.block_tables[:self.num_prefills]) + + # Construct & cache prefill-phase attention metadata structure + self._cached_prefill_metadata = AscendMetadata( + num_prefills=self.num_prefills, + num_prefill_tokens=self.num_prefill_tokens, + num_decode_tokens=0, + slot_mapping=slot_mapping, + seq_lens=seq_lens, + seq_lens_tensor=seq_lens_tensor, + max_query_len=self.max_query_len, + max_prefill_seq_len=self.max_prefill_seq_len, + max_decode_seq_len=0, + query_start_loc=query_start_loc, + seq_start_loc=seq_start_loc, + context_lens_tensor=context_lens_tensor, + block_tables=block_tables, + use_cuda_graph=False, + # Begin encoder & cross attn fields below... + encoder_seq_lens=self.encoder_seq_lens, + encoder_seq_lens_tensor=self.encoder_seq_lens_tensor, + max_encoder_seq_len=self.max_encoder_seq_len, + multi_modal_placeholder_index_maps=self. 
+            multi_modal_placeholder_index_maps,
+            cross_slot_mapping=self.cross_slot_mapping,
+            cross_block_tables=self.cross_block_tables,
+            enable_kv_scales_calculation=False)
+        return self._cached_prefill_metadata
+
+    @property
+    def decode_metadata(self) -> Optional["AscendMetadata"]:
+        if self.num_decode_tokens == 0:
+            return None
+
+        if self._cached_decode_metadata is not None:
+            # Recover cached decode-phase attention
+            # metadata structure
+            return self._cached_decode_metadata
+        assert ((self.seq_lens_tensor is not None)
+                or (self.encoder_seq_lens_tensor is not None))
+
+        # Compute some attn_metadata fields which default to None
+        slot_mapping = (None if self.slot_mapping is None else
+                        self.slot_mapping[self.num_prefill_tokens:])
+        seq_lens_tensor = (None if self.seq_lens_tensor is None else
+                           self.seq_lens_tensor[self.num_prefills:])
+        block_tables = (None if self.block_tables is None else
+                        self.block_tables[self.num_prefills:])
+
+        # Construct & cache decode-phase attention metadata structure
+        self._cached_decode_metadata = AscendMetadata(
+            num_prefills=0,
+            num_prefill_tokens=0,
+            num_decode_tokens=self.num_decode_tokens,
+            slot_mapping=slot_mapping,
+            seq_lens_tensor=seq_lens_tensor,
+            max_prefill_seq_len=0,
+            max_decode_seq_len=self.max_decode_seq_len,
+            # Batch may be composed of prefill|decodes, adjust query start
+            # indices to refer to the start of decodes. E.g.
+            # in tokens:[3 prefills|6 decodes], query_start_loc=[3,9] => [0,6].
+            query_start_loc=(self.query_start_loc[self.num_prefills:] -
+                             self.query_start_loc[self.num_prefills])
+            if self.query_start_loc is not None else None,
+            seq_start_loc=self.seq_start_loc[self.num_prefills:]
+            if self.seq_start_loc is not None else None,
+            context_lens_tensor=None,
+            block_tables=block_tables,
+            use_cuda_graph=self.use_cuda_graph,
+            # Begin encoder & cross attn fields below...
+            encoder_seq_lens=self.encoder_seq_lens,
+            encoder_seq_lens_tensor=self.encoder_seq_lens_tensor,
+            max_encoder_seq_len=self.max_encoder_seq_len,
+            multi_modal_placeholder_index_maps=self.
+            multi_modal_placeholder_index_maps,
+            cross_slot_mapping=self.cross_slot_mapping,
+            cross_block_tables=self.cross_block_tables,
+            enable_kv_scales_calculation=False)
+        return self._cached_decode_metadata
+
+
+class AscendMetadataBuilder(CommonMetadataBuilder[AscendMetadata]):
+
+    _metadata_cls = AscendMetadata
+
+    def compute_npu_slot_indices(self, is_profile_run, slot_indices, seq_id,
+                                 seq_len, context_len, start_idx, block_size,
+                                 block_tables, max_query_len):
+        """
+        Compute slot indices.
+
+        In other vLLM backends, the slot mapping stores flat slot indices,
+        computed as `block_number * block_size + block_offset`. In the Ascend
+        backend, the slot mapping stores [block_number, block_offset] pairs
+        instead; to distinguish the two, this function uses the name
+        `slot_indices`.
+        """
+        if is_profile_run:
+            # During memory profiling, the block tables are not
+            # initialized yet. In this case, we just use a dummy
+            # slot mapping.
+            # In embeddings, the block tables are {seq_id: None}.
+            slot_indices.extend([[PAD_SLOT_ID, 0]] * seq_len)
+            return
+        # Mask the [0, start_idx) tokens of the prompt with
+        # [PAD_SLOT_ID, 0], where start_idx is max(0, seq_len -
+        # sliding_window). For example, if the prompt len is 10,
+        # sliding window is 8, and block size is 4, the first two
+        # tokens are masked and the slot mapping will be
+        # [-1, -1, 2, 3, 4, 5, 6, 7, 0, 1].
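+        # Illustrative example (hypothetical values): with block_size = 4 and
+        # a block_table of [7, 1, 3], token index 5 lives in physical block
+        # block_table[5 // 4] = 1 at offset 5 % 4 = 1, so other backends would
+        # store the flat slot 1 * 4 + 1 = 5, while this function appends the
+        # pair [1, 1].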
+ padding_mask_len = max(0, start_idx - context_len) + slot_indices.extend([[PAD_SLOT_ID, 0]] * padding_mask_len) + + range_start = max(start_idx, context_len) + range_end = seq_len + numel = range_end - range_start + block_table = block_tables[seq_id] + + for i in range(range_start, range_end): + block_number = block_table[i // block_size] + block_offset = i % block_size + slot_indices.append([block_number, block_offset]) + slot_indices.extend([[PAD_SLOT_ID, 0]] * (max_query_len - numel)) + + def _add_seq_group( + self, inter_data: "ModelInputForNPUBuilder.InterDataForSeqGroup", + chunked_prefill_enabled: bool): + """Add a sequence group to the metadata. Specifically update/append + 1. context length. + 2. block table. + 3. slot mapping. + """ + is_prompt = inter_data.is_prompt + block_tables = inter_data.block_tables + max_query_len = max( + max(data.query_lens) + for data in self.input_builder.inter_data_list) + + is_prompt = inter_data.is_prompt + block_tables = inter_data.block_tables + + for (seq_id, token_len, seq_len, curr_seq_len, query_len, context_len, + curr_sliding_window_block) in zip( + inter_data.seq_ids, [len(t) for t in inter_data.input_tokens], + inter_data.orig_seq_lens, inter_data.seq_lens, + inter_data.query_lens, inter_data.context_lens, + inter_data.curr_sliding_window_blocks): + self.context_lens.append(context_len) + if is_prompt: + self.num_prefills += 1 + self.num_prefill_tokens += token_len + self.prefill_seq_lens.append(seq_len) + else: + assert query_len == 1, ( + "seq_len: {}, context_len: {}, query_len: {}".format( + seq_len, context_len, query_len)) + self.num_decode_tokens += query_len + self.curr_seq_lens.append(curr_seq_len) + + # Compute block table. + # TODO(sang): Combine chunked prefill and prefix caching by + # only allowing multiple of block_size chunk size. + # NOTE: This only works for oooooooxxx style attention. + block_table: List[int] = [] + prefix_cache_hit = any([ + inter_data.prefix_cache_hit + for inter_data in self.input_builder.inter_data_list + ]) + if prefix_cache_hit: + # NOTE(woosuk): For flash-attn, the block table should + # include the entries for the incoming prefill tokens. + if block_tables is not None: + block_table = block_tables[seq_id] + elif ((chunked_prefill_enabled or not is_prompt) + and block_tables is not None): + if curr_sliding_window_block == 0: + block_table = block_tables[seq_id] + else: + block_table = block_tables[seq_id][ + -curr_sliding_window_block:] + self.block_tables.append(block_table) + + # Compute slot mapping. 
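+            # (Descriptive note) is_profile_run is True during the memory
+            # profiling pass, where block tables are empty and dummy slots are
+            # used. start_idx masks prompt tokens that fall outside the
+            # sliding window, and the NPU-specific helper below then emits
+            # [block_number, block_offset] pairs padded up to max_query_len.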
+ is_profile_run = is_block_tables_empty(block_tables) + start_idx = compute_slot_mapping_start_idx(is_prompt, query_len, + context_len, + self.sliding_window) + + self.compute_npu_slot_indices(is_profile_run, self.slot_mapping, + seq_id, seq_len, context_len, + start_idx, self.block_size, + inter_data.block_tables, + max_query_len) + + +class AscendAttentionBackendImpl(AttentionImpl): + + def __init__( + self, + num_heads: int, + head_size: int, + scale: float, + num_kv_heads: int, + alibi_slopes: Optional[List[float]], + sliding_window: Optional[int], + kv_cache_dtype: str, + blocksparse_params: Optional[Dict[str, Any]] = None, + logits_soft_cap: Optional[float] = None, + attn_type: str = AttentionType.DECODER, + ) -> None: + self.num_heads = num_heads + self.head_size = head_size + self.scale = float(scale) + self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads + self.kv_cache_dtype = kv_cache_dtype + self.sliding_window = sliding_window + if alibi_slopes is not None: + alibi_slopes = torch.tensor(alibi_slopes, + dtype=torch.float32, + device="npu") + self.alibi_slopes = alibi_slopes + self.attn_type = attn_type + + assert self.num_heads % self.num_kv_heads == 0 + self.num_queries_per_kv = self.num_heads // self.num_kv_heads + + def forward( + self, + layer: AttentionLayer, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + kv_cache: List[torch.Tensor], + attn_metadata: AscendMetadata, + attn_type: str = AttentionType.DECODER, + output: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """Forward pass with Ascend attention. + Args: + query: shape = [num_tokens, num_heads * head_size] + num_tokens = batch_size * seq_len + key: shape = [num_tokens, num_kv_heads * head_size] + value: shape = [num_tokens, num_kv_heads * head_size] + kv_cache: shape = [2, num_blocks, block_size, + num_kv_heads * head_size] + key_cache = [num_blocks, block_size, + num_kv_heads * head_size] + value_cache = [num_blocks, block_size, + num_kv_heads * head_size] + attn_metadata: Metadata for attention. 
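+            attn_type: ignored here; the attention type configured on the
+                layer (`self.attn_type`) is used instead (added doc note).
+            output: optional pre-allocated output tensor; the current
+                implementation allocates its own output and does not use this
+                argument (added doc note).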
+        Returns:
+            shape = [batch_size, seq_len * num_heads * head_size]
+        """
+        assert layer._k_scale == 1.0 and layer._v_scale == 1.0
+        attn_type = self.attn_type
+        if attn_type != AttentionType.DECODER:
+            raise NotImplementedError("Encoder self-attention and "
+                                      "encoder/decoder cross-attention "
+                                      "are not implemented for "
+                                      "AscendAttentionBackendImpl")
+        # view q k v to BSH
+        num_tokens = query.shape[0]
+
+        if kv_cache is not None and len(kv_cache) >= 2:
+            slot_indices = attn_metadata.slot_mapping
+            key_cache, value_cache = kv_cache[0], kv_cache[1]
+            AscendPagedAttention.write_to_paged_cache(
+                key,
+                value,
+                key_cache,
+                value_cache,
+                slot_indices,
+            )
+
+        if attn_metadata.num_prefills > 0:
+            if attn_metadata.attn_mask is None:
+                if num_tokens > 16384:
+                    attn_metadata.sparse_mode = 2
+                attention_mask = gen_input_mask(
+                    attn_metadata.max_prefill_seq_len, self.sliding_window,
+                    num_tokens)
+                attn_metadata.attn_mask = attention_mask
+
+            if (self.alibi_slopes is not None
+                    and attn_metadata.pse_shift is None):
+                attn_metadata.pse_shift = _make_alibi_bias(
+                    self.alibi_slopes,
+                    self.num_kv_heads,
+                    dtype=query.dtype,
+                    seq_len=attn_metadata.max_prefill_seq_len,
+                    batch_size=num_tokens,
+                )
+
+            if (len(kv_cache) == 0 or attn_metadata.block_tables is None
+                    or attn_metadata.block_tables.numel() == 0):
+                max_seq_len = attn_metadata.max_prefill_seq_len
+
+                # shape of q/k/v [B,S*H] --> [B,S,N,D]
+                query = query.view(-1, max_seq_len, self.num_heads,
+                                   self.head_size).transpose(1, 2)
+                key = key.view(-1, max_seq_len, self.num_kv_heads,
+                               self.head_size).transpose(1, 2)
+                value = value.view(-1, max_seq_len, self.num_kv_heads,
+                                   self.head_size).transpose(1, 2)
+                # FA for prefill phase
+                output = torch_npu.npu_prompt_flash_attention(
+                    query,
+                    key,
+                    value,
+                    pse_shift=attn_metadata.pse_shift,
+                    atten_mask=attn_metadata.attn_mask,
+                    num_heads=self.num_heads,
+                    scale_value=1 / math.sqrt(self.head_size),
+                    input_layout="BNSD",
+                    num_key_value_heads=self.num_kv_heads,
+                    pre_tokens=65535,
+                    next_tokens=0,
+                    sparse_mode=attn_metadata.sparse_mode,
+                )
+                # reshape to [B,H]
+                output = output.transpose(1, 2).reshape(
+                    num_tokens, self.num_heads * self.head_size)
+            else:
+                # prefix-enabled attention
+                assert attn_type == AttentionType.DECODER, (
+                    "Only decoder-only models support prefix caching")
+                assert attn_metadata.seq_lens is not None
+                assert kv_cache is not None
+                query = query.view(query.shape[0], -1,
+                                   self.num_heads * self.head_size)
+                output = torch.zeros(query.shape,
+                                     device="npu",
+                                     dtype=query.dtype)
+                # TODO (Mengqing Cao): torch_npu.npu_incre_flash_attention is
+                # supported only when `S == 1`; OPTIMIZE ME when prefix caching
+                # is supported in torch-npu ops.
+                for i in range(query.shape[0]):
+                    # Incremental FA, token by token, for the prefix-caching
+                    # prefill path
+                    output[i] = torch_npu.npu_incre_flash_attention(
+                        query[i].unsqueeze(0),
+                        key_cache,
+                        value_cache,
+                        num_heads=self.num_heads,
+                        num_key_value_heads=self.num_kv_heads,
+                        scale_value=self.scale,
+                        input_layout="BSH",
+                        block_table=attn_metadata.block_tables,
+                        block_size=key_cache.
+ shape[1], # max val of block_size == 512 + actual_seq_lengths=attn_metadata.seq_lens, + ) + # [B,S,H] --> [B,H] + output = output.squeeze(1) + + elif attn_metadata.decode_metadata: + # FA for decoding phase + assert kv_cache is not None + # shape of query [B,S*H] --> [B,S,H] + query = query.view( + -1, + 1, + self.head_size * self.num_heads, + ) + output = torch_npu.npu_incre_flash_attention( + query, + key_cache, + value_cache, + num_heads=self.num_heads, + num_key_value_heads=self.num_kv_heads, + scale_value=self.scale, + input_layout="BSH", + block_table=attn_metadata.block_tables, + block_size=key_cache.shape[1], # max val of block_size == 512 + actual_seq_lengths=attn_metadata.seq_lens, + ) + + # [B,S,H] --> [B,H] + output = output.squeeze(1) + return output + + +def gen_input_mask(seq_len, sliding_window, len): + """ + Generating lower triangular matrix + """ + if len > 16384: + # improve computing performance on NPU when input tokens are huge + global SHARE_MASK_TRIL_PREFIX_CACHE + if SHARE_MASK_TRIL_PREFIX_CACHE is None: + SHARE_MASK_TRIL_PREFIX_CACHE = torch.triu( + torch.ones(1, 1, 2048, 2048, dtype=bool, device="npu"), + diagonal=1, + ) + attention_mask = SHARE_MASK_TRIL_PREFIX_CACHE + else: + global SHARE_MASK_TRIL + if SHARE_MASK_TRIL is None or SHARE_MASK_TRIL.shape[0] < seq_len: + SHARE_MASK_TRIL = ~torch.tril( + torch.ones(seq_len, seq_len, dtype=bool, device="npu")) + + attention_mask = SHARE_MASK_TRIL + if sliding_window is not None: + attention_mask = ~attention_mask + attention_mask = torch.triu(attention_mask, + diagonal=1 - sliding_window) + attention_mask = ~attention_mask + + return attention_mask + + +def _make_alibi_bias( + alibi_slopes: torch.Tensor, + num_kv_heads: int, + dtype: torch.dtype, + seq_len: int, + batch_size: int, +): + bias = torch.arange(seq_len, dtype=dtype, device=alibi_slopes.device) + # NOTE(zhuohan): HF uses + # `bias = bias[None, :].repeat(seq_len, 1)` + # here. We find that both biases give the same results, but + # the bias below more accurately follows the original ALiBi + # paper. + # Calculate a matrix where each element represents ith element- jth + # element. + bias = bias[None, :] - bias[:, None] + + padded_len = (seq_len + 7) // 8 * 8 + num_heads = alibi_slopes.shape[0] + bias = torch.empty( + 1, + num_heads, + seq_len, + padded_len, + device=alibi_slopes.device, + dtype=dtype, + )[:, :, :, :seq_len].copy_(bias) + bias.mul_(alibi_slopes[:, None, None]) + if num_heads != num_kv_heads: + bias = bias.unflatten(1, (num_kv_heads, num_heads // num_kv_heads)) + + return bias diff --git a/vllm_ascend/communicator.py b/vllm_ascend/communicator.py new file mode 100644 index 000000000..efef46e92 --- /dev/null +++ b/vllm_ascend/communicator.py @@ -0,0 +1,28 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import torch +import torch.distributed as dist +from vllm.distributed.device_communicators.base_communicator import \ + CommunicatorBase + + +class NPUCommunicator(CommunicatorBase): + + def all_reduce(self, x: torch.Tensor) -> torch.Tensor: + dist.all_reduce(x, group=self.group) + return x diff --git a/vllm_ascend/model_runner.py b/vllm_ascend/model_runner.py new file mode 100644 index 000000000..96ef1914b --- /dev/null +++ b/vllm_ascend/model_runner.py @@ -0,0 +1,620 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/vllm/worker/model_runner.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import dataclasses +from typing import Any, Dict, List, Optional, Set, Type + +import torch +import torch.distributed +from torch import nn +from vllm.distributed import get_pp_group +from vllm.logger import init_logger +from vllm.lora.layers import LoRAMapping +from vllm.lora.request import LoRARequest +from vllm.model_executor import SamplingMetadata +from vllm.multimodal import MultiModalKwargs, MultiModalPlaceholderMap +from vllm.platforms import current_platform +from vllm.prompt_adapter.layers import PromptAdapterMapping +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sampling_params import SamplingParams +from vllm.sequence import SequenceGroupMetadata +from vllm.utils import flatten_2d_lists, make_tensor_with_pad +from vllm.worker.model_runner import (ModelInputForGPU, + ModelInputForGPUBuilder, + ModelInputForGPUWithSamplingMetadata, + ModelRunner) + +logger = init_logger(__name__) + +LORA_WARMUP_RANK = 8 + + +class ModelInputForNPUBuilder(ModelInputForGPUBuilder): + """Build ModelInputForGPU from SequenceGroupMetadata.""" + + # Note: ideally we would be using a dataclass(kw_only=True) + # here, so that this can be subclassed easily, + # but kw_only is not supported in python<3.10. + def build(self) -> ModelInputForGPU: + """Finalize the builder intermediate data and + create on-device tensors. + """ + # Combine and flatten intermediate data. + input_tokens = [ + flatten_2d_lists(inter_data.input_tokens) + for inter_data in self.inter_data_list + ] + if not input_tokens: + # This may happen when all prefill requests hit + # prefix caching and there is no decode request. 
+ return self.model_input_cls() + + mrope_input_positions: Optional[List[List[int]]] = None + if any(inter_data.mrope_input_positions is not None + for inter_data in self.inter_data_list): + mrope_input_positions = [[] for _ in range(3)] + # calculate max position length for padding + input_position_lens = [ + len(inter_data.input_positions[0]) + for inter_data in self.inter_data_list + ] + max_pos_len = max(input_position_lens) + + for idx in range(3): + for inter_data in self.inter_data_list: + msections = inter_data.mrope_input_positions + if msections is None: + for _seq_input_positions in inter_data.input_positions: + # zero pad + _seq_input_positions.extend( + [0] * + (max_pos_len - len(_seq_input_positions))) + mrope_input_positions[idx].extend( + _seq_input_positions) + else: + for _seq_mrope_input_positions in msections: + # zero pad + _seq_mrope_input_positions[idx].extend( + [0] * (max_pos_len - + len(_seq_mrope_input_positions[idx]))) + mrope_input_positions[idx].extend( + _seq_mrope_input_positions[idx]) + input_positions = None + else: + input_positions = [ + flatten_2d_lists(inter_data.input_positions) + for inter_data in self.inter_data_list + ] + + seq_lens = [] + max_decode_seq_len = 0 + for inter_data in self.inter_data_list: + seq_lens.extend(inter_data.seq_lens) + if not inter_data.is_prompt: + max_decode_seq_len = max(max_decode_seq_len, + max(inter_data.seq_lens)) + query_lens = flatten_2d_lists( + [inter_data.query_lens for inter_data in self.inter_data_list]) + # Mapping from request IDs to sequence IDs. Used for Jamba models + # that manages the cache by itself. + request_ids_to_seq_ids = { + data.request_id: data.seq_ids + for data in self.inter_data_list + } + + batch_size = len(input_tokens) + + # If cuda graph can be used, pad tensors accordingly. + # See `capture_model` API for more details. + # vLLM uses cuda graph only for decoding requests. + cuda_graph_pad_size = -1 + + if self.inter_data_list[0].is_prompt: + input_tokens_tensor = make_tensor_with_pad( + input_tokens, 0, dtype=torch.int, device=self.runner.device) + input_tokens_tensor = torch.flatten(input_tokens_tensor) + if mrope_input_positions is not None: + mrope_input_positions_tensor = make_tensor_with_pad( + mrope_input_positions, + 0, + dtype=torch.int, + device=self.runner.device) + input_positions_tensor = torch.tensor( + mrope_input_positions_tensor, + dtype=torch.long, + device=self.runner.device) + else: + input_positions_tensor = make_tensor_with_pad( + input_positions, + 0, + dtype=torch.int, + device=self.runner.device) + input_positions_tensor = torch.flatten(input_positions_tensor) + + max_seq_len = max(seq_lens) + seq_lens = len(seq_lens) * [max_seq_len] + else: + input_tokens_tensor = torch.tensor(flatten_2d_lists(input_tokens), + dtype=torch.long, + device=self.runner.device) + if mrope_input_positions is not None: + input_positions_tensor = torch.tensor( + mrope_input_positions, + dtype=torch.long, + device=self.runner.device) + else: + input_positions_tensor = torch.tensor( + flatten_2d_lists(input_positions), + dtype=torch.long, + device=self.runner.device) + + # Sequence and query lengths. + seq_lens.extend([1] * cuda_graph_pad_size) + + # Attention metadata. + attn_metadata = self.attn_metadata_builder.build( + seq_lens, query_lens, cuda_graph_pad_size, batch_size) + + # LoRA data. 
+ lora_requests = set() + lora_mapping = None + if self.enable_lora: + lora_requests = set(r for data in self.inter_data_list + for r in data.lora_requests) + lora_index_mapping = flatten_2d_lists([ + flatten_2d_lists(inter_data.lora_index_mapping) + for inter_data in self.inter_data_list + ]) + lora_index_mapping.extend([0] * cuda_graph_pad_size) + lora_prompt_mapping = flatten_2d_lists([ + flatten_2d_lists(inter_data.lora_prompt_mapping) + for inter_data in self.inter_data_list + ]) + lora_mapping = LoRAMapping( + **dict(index_mapping=lora_index_mapping, + prompt_mapping=lora_prompt_mapping, + is_prefill=not self.decode_only)) + + # Prompt adapter data. + prompt_adapter_requests: Set[PromptAdapterRequest] = set() + prompt_adapter_mapping = None + if self.enable_prompt_adapter: + prompt_adapter_requests = set( + data.prompt_adapter_request for data in self.inter_data_list + if data.prompt_adapter_request is not None) + prompt_adapter_index_mapping = flatten_2d_lists([ + inter_data.prompt_adapter_index_mapping + for inter_data in self.inter_data_list + ]) + prompt_adapter_index_mapping.extend([0] * cuda_graph_pad_size) + prompt_adapter_prompt_mapping = flatten_2d_lists([ + inter_data.prompt_adapter_prompt_mapping + for inter_data in self.inter_data_list + ]) + prompt_adapter_mapping = PromptAdapterMapping( + prompt_adapter_index_mapping, + prompt_adapter_prompt_mapping, + ) + + # Multi-modal data. + multi_modal_kwargs_list = [ + data.multi_modal_kwargs for data in self.inter_data_list + if data.multi_modal_kwargs is not None + ] + multi_modal_kwargs = MultiModalKwargs.batch(multi_modal_kwargs_list) + + return self.model_input_cls( + input_tokens=input_tokens_tensor, + input_positions=input_positions_tensor, + attn_metadata=attn_metadata, + seq_lens=seq_lens, + query_lens=query_lens, + lora_mapping=lora_mapping, + lora_requests=lora_requests, + multi_modal_kwargs=multi_modal_kwargs, + request_ids_to_seq_ids=request_ids_to_seq_ids, + finished_requests_ids=self.finished_requests_ids, + prompt_adapter_mapping=prompt_adapter_mapping, + prompt_adapter_requests=prompt_adapter_requests) + + class InterDataForSeqGroup: + """Intermediate data for the current sequence group.""" + + def simple_reinit(self): + self.input_tokens[0].clear() # type: ignore + self.input_positions[0].clear() # type: ignore + self.token_types[0].clear() # type: ignore + self.mrope_input_positions = None # type: ignore + self.seq_lens[0] = 0 # type: ignore + self.orig_seq_lens[0] = 0 # type: ignore + self.query_lens[0] = 0 # type: ignore + self.context_lens[0] = 0 # type: ignore + self.curr_sliding_window_blocks[0] = 0 # type: ignore + self.lora_index_mapping.clear() # type: ignore + self.lora_prompt_mapping.clear() # type: ignore + self.lora_requests.clear() # type: ignore + self.prompt_adapter_index_mapping.clear() # type: ignore + self.prompt_adapter_prompt_mapping.clear() # type: ignore + + def __init__( + self, + *, + # From sequence group metadata. + request_id: str, + seq_ids: List[int], + is_prompt: bool, + block_tables: Optional[Dict[int, List[int]]], + computed_block_nums: List[int], + n_seqs: int = 0, + + # Input tokens and positions. + input_tokens: Optional[List[List[int]]] = None, + input_positions: Optional[List[List[int]]] = None, + token_types: Optional[List[List[int]]] = None, + mrope_input_positions: Optional[List[List[List[int]]]] = None, + + # The sequence length (may be capped to the sliding window). 
+ seq_lens: Optional[List[int]] = None, + # The original sequence length (before applying sliding window). + # This is used to compute slot mapping. + orig_seq_lens: Optional[List[int]] = None, + # The query length. + query_lens: Optional[List[int]] = None, + # The number of tokens that are already computed. + context_lens: Optional[List[int]] = None, + # The current sliding window block. + curr_sliding_window_blocks: Optional[List[int]] = None, + + # LoRA inputs. + lora_index_mapping: Optional[List[List[int]]] = None, + lora_prompt_mapping: Optional[List[List[int]]] = None, + lora_requests: Optional[Set[LoRARequest]] = None, + + # Prompt adapter inputs. + prompt_adapter_index_mapping: Optional[List[int]] = None, + prompt_adapter_prompt_mapping: Optional[List[int]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + + # Multi-modal inputs. + multi_modal_kwargs: Optional[MultiModalKwargs] = None, + multi_modal_placeholder_maps: Optional[Dict[ + str, MultiModalPlaceholderMap]] = None, + + # Whether the prefix cache is hit (prefill only). + prefix_cache_hit: bool = False, + reinit: bool = False, + reinit_use_defaults: bool = False, + encoder_seq_len: int = 0, + ): + if reinit: + assert len(self.seq_ids) == len(seq_ids) # type: ignore + for i, seq_id in enumerate(seq_ids): + self.seq_ids[i] = seq_id # type: ignore + else: + self.seq_ids = seq_ids + + self.request_id = request_id + self.is_prompt = is_prompt + self.block_tables = block_tables + self.computed_block_nums = computed_block_nums + self.n_seqs = n_seqs + self.encoder_seq_len = encoder_seq_len + + if reinit: + if len(self.seq_ids) == 1 and reinit_use_defaults: + self.simple_reinit() + else: + if input_tokens: + self.input_tokens = input_tokens + else: + for seq_id in range(len(self.seq_ids)): + self.input_tokens[seq_id].clear() + + if input_positions: + self.input_positions = input_positions + else: + for seq_id in range(len(self.seq_ids)): + self.input_positions[seq_id].clear() + + if token_types: + self.token_types = token_types + else: + for seq_id in range(len(self.seq_ids)): + self.token_types[seq_id].clear() + + self.mrope_input_positions = None + + if seq_lens: + self.seq_lens = seq_lens + else: + for seq_id in range(len(self.seq_ids)): + self.seq_lens[seq_id] = 0 + + if orig_seq_lens: + self.orig_seq_lens = orig_seq_lens + else: + for seq_id in range(len(self.seq_ids)): + self.orig_seq_lens[seq_id] = 0 + + if query_lens: + self.query_lens = query_lens + else: + for seq_id in range(len(self.seq_ids)): + self.query_lens[seq_id] = 0 + + if context_lens: + self.context_lens = context_lens + else: + for seq_id in range(len(self.seq_ids)): + self.context_lens[seq_id] = 0 + + if curr_sliding_window_blocks: + self.curr_sliding_window_blocks = \ + curr_sliding_window_blocks + else: + for seq_id in range(len(self.seq_ids)): + self.curr_sliding_window_blocks[seq_id] = 0 + + if lora_index_mapping: + self.lora_index_mapping = lora_index_mapping + else: + self.lora_index_mapping.clear() + + if lora_prompt_mapping: + self.lora_prompt_mapping = lora_prompt_mapping + else: + self.lora_prompt_mapping.clear() + + if lora_requests: + self.lora_requests = lora_requests + else: + self.lora_requests.clear() + + if prompt_adapter_index_mapping: + self.prompt_adapter_index_mapping = \ + prompt_adapter_index_mapping + else: + self.prompt_adapter_index_mapping.clear() + + if prompt_adapter_prompt_mapping: + self.prompt_adapter_prompt_mapping = \ + prompt_adapter_prompt_mapping + else: + self.prompt_adapter_prompt_mapping.clear() + 
+ else: + self.input_tokens = input_tokens or [] + self.input_positions = input_positions or [] + self.token_types = token_types or [] + self.mrope_input_positions = mrope_input_positions or None + self.seq_lens = seq_lens or [] + self.orig_seq_lens = orig_seq_lens or [] + self.query_lens = query_lens or [] + self.context_lens = context_lens or [] + self.curr_sliding_window_blocks = \ + curr_sliding_window_blocks or [] + + self.lora_index_mapping = lora_index_mapping or [] + self.lora_prompt_mapping = lora_prompt_mapping or [] + self.lora_requests = lora_requests or set() + + self.prompt_adapter_index_mapping = ( + prompt_adapter_index_mapping or []) + self.prompt_adapter_prompt_mapping = ( + prompt_adapter_prompt_mapping or []) + + self.prompt_adapter_request = prompt_adapter_request + self.multi_modal_kwargs = multi_modal_kwargs + self.multi_modal_placeholder_maps = multi_modal_placeholder_maps + self.prefix_cache_hit = prefix_cache_hit + + self.n_seqs = len(self.seq_ids) + + if not reinit: + self.__post_init__() + + def __post_init__(self): + self.n_seqs = len(self.seq_ids) + + self.input_tokens = [[] for _ in range(self.n_seqs)] + self.input_positions = [[] for _ in range(self.n_seqs)] + self.token_types = [[] for _ in range(self.n_seqs)] + self.mrope_input_positions = None + self.seq_lens = [0] * self.n_seqs + self.orig_seq_lens = [0] * self.n_seqs + self.query_lens = [0] * self.n_seqs + self.context_lens = [0] * self.n_seqs + self.curr_sliding_window_blocks = [0] * self.n_seqs + + self.lora_index_mapping = [] + self.lora_prompt_mapping = [] + + +class NPUModelRunner(ModelRunner): + """ + NPU model runner with sampling step. + """ + _model_input_cls: Type[ModelInputForGPUWithSamplingMetadata] = ( + ModelInputForGPUWithSamplingMetadata) + _builder_cls: Type[ModelInputForNPUBuilder] = ModelInputForNPUBuilder + + def make_model_input_from_broadcasted_tensor_dict( + self, + tensor_dict: Dict[str, Any], + ) -> ModelInputForGPUWithSamplingMetadata: + model_input = \ + ModelInputForGPUWithSamplingMetadata.from_broadcasted_tensor_dict( + tensor_dict, + attn_backend=self.attn_backend, + ) + return model_input + + @current_platform.inference_mode() + def profile_run(self) -> None: + # Enable top-k sampling to reflect the accurate memory usage. + sampling_params = SamplingParams(top_p=0.99, top_k=self.vocab_size - 1) + max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens + max_num_seqs = self.scheduler_config.max_num_seqs + # This represents the maximum number of different requests + # that will have unique loras, an therefore the max amount of memory + # consumption create dummy lora request copies from the lora request + # passed in, which contains a lora from the lora warmup path. + dummy_lora_requests: List[LoRARequest] = [] + dummy_lora_requests_per_seq: List[LoRARequest] = [] + if self.lora_config: + assert self.lora_manager is not None + with self.lora_manager.dummy_lora_cache(): + for idx in range(self.lora_config.max_loras): + lora_id = idx + 1 + dummy_lora_request = LoRARequest( + lora_name=f"warmup_{lora_id}", + lora_int_id=lora_id, + lora_path="/not/a/real/path", + ) + self.lora_manager.add_dummy_lora(dummy_lora_request, + rank=LORA_WARMUP_RANK) + dummy_lora_requests.append(dummy_lora_request) + dummy_lora_requests_per_seq = [ + dummy_lora_requests[idx % len(dummy_lora_requests)] + for idx in range(max_num_seqs) + ] + + # Profile memory usage with max_num_sequences sequences and the total + # number of tokens equal to max_num_batched_tokens. 
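+        # Worked example (hypothetical values, added for clarity): with
+        # max_num_batched_tokens = 8192 and max_num_seqs = 3, the loop below
+        # builds dummy sequences of lengths 2731, 2731 and 2730
+        # (8192 // 3 = 2730 with a remainder of 2, so the first two groups
+        # get one extra token each).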
+ seqs: List[SequenceGroupMetadata] = [] + # Additional GPU memory may be needed for multi-modal encoding, which + # needs to be accounted for when calculating the GPU blocks for + # vLLM blocker manager. + # To exercise the worst scenario for GPU memory consumption, + # the number of seqs (batch_size) is chosen to maximize the number + # of images processed. + + max_mm_tokens = self.mm_registry.get_max_multimodal_tokens( + self.model_config) + if max_mm_tokens > 0: + max_num_seqs_orig = max_num_seqs + max_num_seqs = min(max_num_seqs, + max_num_batched_tokens // max_mm_tokens) + if max_num_seqs < 1: + expr = (f"min({max_num_seqs_orig}, " + f"{max_num_batched_tokens} // {max_mm_tokens})") + logger.warning( + "Computed max_num_seqs (%s) to be less than 1. " + "Setting it to the minimum value of 1.", expr) + max_num_seqs = 1 + + batch_size = 0 + for group_id in range(max_num_seqs): + seq_len = (max_num_batched_tokens // max_num_seqs + + (group_id < max_num_batched_tokens % max_num_seqs)) + batch_size += seq_len + + dummy_data = self.input_registry \ + .dummy_data_for_profiling(self.model_config, + seq_len, + self.mm_registry) + + seq = SequenceGroupMetadata( + request_id=str(group_id), + is_prompt=True, + seq_data={group_id: dummy_data.seq_data}, + sampling_params=sampling_params, + block_tables=None, + lora_request=dummy_lora_requests_per_seq[group_id] + if dummy_lora_requests_per_seq else None, + multi_modal_data=dummy_data.multi_modal_data, + multi_modal_placeholders=dummy_data.multi_modal_placeholders, + ) + seqs.append(seq) + + # Run the model with the dummy inputs. + num_layers = self.model_config.get_num_layers(self.parallel_config) + # use an empty tensor instead of `None`` to force Dynamo to pass + # it by reference, rather by specializing on the value ``None``. + # the `dtype` argument does not matter, and we use `float32` as + # a placeholder (it has wide hardware support). + # it is important to create tensors inside the loop, rather than + # multiplying the list, to avoid Dynamo from treating them as + # tensor aliasing. + kv_caches = [ + torch.tensor([], dtype=torch.float32, device=self.device) + for _ in range(num_layers) + ] + finished_requests_ids = [seq.request_id for seq in seqs] + model_input = self.prepare_model_input( + seqs, finished_requests_ids=finished_requests_ids) + intermediate_tensors = None + if not get_pp_group().is_first_rank: + intermediate_tensors = self.model.make_empty_intermediate_tensors( + batch_size=batch_size, + dtype=self.model_config.dtype, + device=self.device) + self.execute_model(model_input, kv_caches, intermediate_tensors) + current_platform.synchronize() + return + + @current_platform.inference_mode() + def capture_model(self, kv_caches: List[List[torch.Tensor]]) -> None: + """NPU graph capture a model. + TODO: not support now + """ + pass + + def prepare_model_input( + self, + seq_group_metadata_list: List[SequenceGroupMetadata], + virtual_engine: int = 0, + finished_requests_ids: Optional[List[str]] = None, + ) -> ModelInputForGPUWithSamplingMetadata: + """Prepare the model input based on a given sequence group, including + metadata for the sampling step. + The API assumes seq_group_metadata_list is sorted by prefill -> decode. + The result tensors and data structure also batches input in prefill + -> decode order. For example, + - input_tokens[:num_prefill_tokens] contains prefill tokens. + - input_tokens[num_prefill_tokens:] contains decode tokens. + If cuda graph is required, this API automatically pads inputs. 
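+        (Added doc note: NPU graph capture is currently a no-op, see
+        `capture_model`, so no cuda-graph padding takes place on this
+        backend.)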
+ """ + model_input = self._prepare_model_input_tensors( + seq_group_metadata_list, finished_requests_ids) + if get_pp_group().is_last_rank: + # Sampling metadata is only required for the final pp group + generators = self.get_generators(finished_requests_ids) + sampling_metadata = SamplingMetadata.prepare( + seq_group_metadata_list, + model_input.seq_lens, + model_input.query_lens, + self.device, + self.pin_memory, + generators, + self.sampling_metadata_cache, + # TODO (cmq): enable this after supported in vllm + # pad_for_invariant_seq_len=True, + ) + else: + sampling_metadata = None + is_prompt = (seq_group_metadata_list[0].is_prompt + if seq_group_metadata_list else None) + return dataclasses.replace(model_input, + sampling_metadata=sampling_metadata, + is_prompt=is_prompt, + virtual_engine=virtual_engine) + + def get_model(self) -> nn.Module: + return self.model diff --git a/vllm_ascend/ops/__init__.py b/vllm_ascend/ops/__init__.py new file mode 100644 index 000000000..bdc40cd5f --- /dev/null +++ b/vllm_ascend/ops/__init__.py @@ -0,0 +1,18 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import vllm_ascend.ops.layernorm # noqa diff --git a/vllm_ascend/ops/layernorm.py b/vllm_ascend/ops/layernorm.py new file mode 100644 index 000000000..719aa977d --- /dev/null +++ b/vllm_ascend/ops/layernorm.py @@ -0,0 +1,40 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Optional, Tuple, Union + +import torch +from vllm.model_executor.layers.layernorm import RMSNorm + + +def forward_oot( + self, + x: torch.Tensor, + residual: Optional[torch.Tensor] = None, +) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + import torch_npu + + if residual is not None: + x, _, residual = torch_npu.npu_add_rms_norm(x, residual, self.weight, + self.variance_epsilon) + return x, residual + + x, residual = torch_npu.npu_rms_norm(x, self.weight, self.variance_epsilon) + return x + + +RMSNorm.forward_oot = forward_oot diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py new file mode 100644 index 000000000..2b847de13 --- /dev/null +++ b/vllm_ascend/platform.py @@ -0,0 +1,115 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +from typing import Optional, Tuple + +import torch + +try: + import torch_npu # noqa: F401 +except ImportError: + print("Failed to import torch_npu.") + +from vllm.config import VllmConfig +from vllm.platforms import Platform, PlatformEnum + +os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1" + + +def _device_id_to_physical_device_id(device_id: int) -> int: + if "ASCEND_RT_VISIBLE_DEVICES" in os.environ: + device_ids = os.environ["ASCEND_RT_VISIBLE_DEVICES"].split(",") + if device_ids == [""]: + raise RuntimeError("ASCEND_RT_VISIBLE_DEVICES is set to empty" + "string, which means Ascend NPU support is" + "disabled.") + physical_device_id = device_ids[device_id] + return int(physical_device_id) + else: + return device_id + + +class NPUPlatform(Platform): + + _enum = PlatformEnum.OOT + device_name: str = "npu" + device_type: str = "npu" + simple_compile_backend: str = "npu" + ray_device_key: str = "NPU" + device_control_env_var: str = "ASCEND_RT_VISIBLE_DEVICES" + + @classmethod + def get_device_capability(cls, device_id: int = 0): + return None + + @classmethod + def get_device_name(cls, device_id: int = 0) -> str: + physical_device_id = _device_id_to_physical_device_id(device_id) + return torch.npu.get_device_name(physical_device_id) + + @classmethod + def is_async_output_supported(cls, enforce_eager: Optional[bool]) -> bool: + return True + + @classmethod + def inference_mode(cls): + return torch.inference_mode() + + @classmethod + def set_device(cls, device: torch.device): + torch.npu.set_device(device) + + @classmethod + def empty_cache(cls): + torch.npu.empty_cache() + + @classmethod + def synchronize(cls): + torch.npu.synchronize() + + @classmethod + def mem_get_info(cls) -> Tuple[int, int]: + return torch.npu.mem_get_info() + + @classmethod + def check_and_update_config(cls, vllm_config: VllmConfig) -> None: + # Register ops when setup. + from vllm_ascend import ops # noqa: F401 + + parallel_config = vllm_config.parallel_config + if parallel_config.worker_cls == "auto": + parallel_config.worker_cls = "vllm_ascend.worker.NPUWorker" + cache_config = vllm_config.cache_config + if cache_config and cache_config.block_size is None: + cache_config.block_size = 16 + + @classmethod + def get_attn_backend_cls(cls, selected_backend, head_size, dtype, + kv_cache_dtype, block_size, use_v1, use_mla): + return "vllm_ascend.attention.AscendAttentionBackend" + + @classmethod + def get_current_memory_usage(cls, + device: Optional[torch.types.Device] = None + ) -> float: + torch.npu.reset_peak_memory_stats(device) + return torch.npu.max_memory_allocated(device) + + @classmethod + def get_device_communicator_cls(cls) -> str: + return "vllm_ascend.communicator.NPUCommunicator" diff --git a/vllm_ascend/worker.py b/vllm_ascend/worker.py new file mode 100644 index 000000000..8ddd5302e --- /dev/null +++ b/vllm_ascend/worker.py @@ -0,0 +1,481 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. 
+# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/vllm/worker/worker.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import gc +from typing import Dict, List, Optional, Set, Tuple, Type, Union + +import torch +import torch.distributed +import torch_npu +from torch import nn +from vllm import envs +from vllm.config import ParallelConfig, VllmConfig +from vllm.distributed import (ensure_model_parallel_initialized, + init_distributed_environment, + set_custom_all_reduce) +from vllm.logger import init_logger +from vllm.lora.request import LoRARequest +from vllm.model_executor import set_random_seed +from vllm.model_executor.layers.sampler import SamplerOutput +from vllm.model_executor.model_loader.tensorizer import TensorizerConfig +from vllm.platforms import current_platform +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sequence import (ExecuteModelRequest, IntermediateTensors, + SequenceGroupMetadata, SequenceGroupMetadataDelta) +from vllm.utils import bind_kv_cache +from vllm.worker.cache_engine import CacheEngine +from vllm.worker.enc_dec_model_runner import EncoderDecoderModelRunner +from vllm.worker.model_runner_base import ModelRunnerBase +from vllm.worker.pooling_model_runner import PoolingModelRunner +from vllm.worker.worker_base import (LocalOrDistributedWorkerBase, WorkerBase, + WorkerInput) + +from vllm_ascend.model_runner import NPUModelRunner + +logger = init_logger(__name__) + + +class NPUWorker(LocalOrDistributedWorkerBase): + """A worker class that executes (a partition of) the model on a NPU. + Each worker is associated with a single NPU. The worker is responsible for + maintaining the KV cache and executing the model on the NPU. In case of + distributed inference, each worker is assigned a partition of the model. + """ + + def __init__( + self, + vllm_config: VllmConfig, + local_rank: int, + rank: int, + distributed_init_method: str, + is_driver_worker: bool = False, + model_runner_cls: Optional[Type[ModelRunnerBase]] = None, + ) -> None: + + WorkerBase.__init__(self, vllm_config=vllm_config) + # distribute related config + self.parallel_config.rank = rank + self.local_rank = local_rank + self.rank = rank + self.distributed_init_method = distributed_init_method + self.is_driver_worker = is_driver_worker + + if is_driver_worker: + assert rank % self.parallel_config.tensor_parallel_size == 0, \ + "Driver worker should be rank 0 of tensor parallel group." 
+ if self.model_config.trust_remote_code: + # note: lazy import to avoid importing torch before initializing + from vllm.utils import init_cached_hf_modules + init_cached_hf_modules() + + # Return hidden states from target model if the draft model is an + # mlp_speculator + speculative_config = self.speculative_config + model_config = self.model_config + speculative_args = {} if speculative_config is None \ + or (speculative_config.draft_model_config.model == + model_config.model) \ + or (speculative_config.draft_model_config.hf_config.model_type + not in ["medusa", "mlp_speculator", "eagle"]) \ + else {"return_hidden_states": True} + + ModelRunnerClass: Type[ModelRunnerBase] = NPUModelRunner + if model_config.runner_type == "pooling": + ModelRunnerClass = PoolingModelRunner + elif self.model_config.is_encoder_decoder: + ModelRunnerClass = EncoderDecoderModelRunner + self.model_runner: ModelRunnerBase = ModelRunnerClass( + vllm_config=self.vllm_config, + kv_cache_dtype=self.cache_config.cache_dtype, + is_driver_worker=is_driver_worker, + **speculative_args, + ) + if model_runner_cls is not None: + self.model_runner = model_runner_cls(self.model_runner) + + # Uninitialized cache engine. Will be initialized by + # initialize_cache. + self.cache_engine: List[CacheEngine] + # Initialize gpu_cache as embedding models don't initialize kv_caches + self.gpu_cache: Optional[List[List[torch.Tensor]]] = None + self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {} + + # Torch profiler. Enabled and configured through env vars: + # VLLM_TORCH_PROFILER_DIR=/path/to/save/trace + if envs.VLLM_TORCH_PROFILER_DIR: + torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR + logger.info("Profiling enabled. Traces will be saved to: %s", + torch_profiler_trace_dir) + + experimental_config = torch_npu.profiler._ExperimentalConfig( + export_type=torch_npu.profiler.ExportType.Text, + profiler_level=torch_npu.profiler.ProfilerLevel.Level0, + msprof_tx=False, + aic_metrics=torch_npu.profiler.AiCMetrics.AiCoreNone, + l2_cache=False, + op_attr=False, + data_simplification=False, + record_op_args=False, + gc_detect_threshold=None, + ) + + self.profiler = torch_npu.profiler.profile( + activities=[ + torch_npu.profiler.ProfilerActivity.CPU, + torch_npu.profiler.ProfilerActivity.NPU, + ], + with_stack=True, + profile_memory=True, + with_modules=True, + experimental_config=experimental_config, + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler( + torch_profiler_trace_dir)) + else: + self.profiler = None + + def start_profile(self): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + self.profiler.start() + + def stop_profile(self): + if self.profiler is None: + raise RuntimeError("Profiler is not enabled.") + self.profiler.stop() + + def init_device(self) -> None: + if self.device_config.device.type == "npu": + # # This env var set by Ray causes exceptions with graph building. + # os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) + self.device = torch.device(f"npu:{self.local_rank}") + current_platform.set_device(self.device) + + current_platform.empty_cache() + self.init_npu_memory = current_platform.mem_get_info()[0] + else: + raise RuntimeError( + f"Not support device type: {self.device_config.device}") + # Initialize the distributed environment. + init_worker_distributed_environment(self.parallel_config, self.rank, + self.distributed_init_method, + self.local_rank) + # Set random seed. 
+        set_random_seed(self.model_config.seed)
+
+    def load_model(self):
+        self.model_runner.load_model()
+
+    def save_sharded_state(
+        self,
+        path: str,
+        pattern: Optional[str] = None,
+        max_size: Optional[int] = None,
+    ) -> None:
+        self.model_runner.save_sharded_state(
+            path,
+            pattern=pattern,
+            max_size=max_size,
+        )
+
+    def save_tensorized_model(
+        self,
+        tensorizer_config: TensorizerConfig,
+    ) -> None:
+        self.model_runner.save_tensorized_model(
+            tensorizer_config=tensorizer_config, )
+
+    @current_platform.inference_mode()
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """Profiles the peak memory usage of the model to determine how many
+        KV blocks may be allocated without OOMs.
+        The engine will first conduct a profiling of the existing memory usage.
+        Then, it calculates the maximum possible number of NPU and CPU blocks
+        that can be allocated with the remaining free memory.
+        .. tip::
+            You may limit the usage of NPU memory
+            by adjusting the `gpu_memory_utilization` parameter.
+        """
+        # Profile the memory usage of the model and get the maximum number of
+        # cache blocks that can be allocated with the remaining free memory.
+        current_platform.empty_cache()
+
+        # Execute a forward pass with dummy inputs to profile the memory usage
+        # of the model.
+        self.model_runner.profile_run()
+
+        # Calculate the number of blocks that can be allocated with the
+        # profiled peak memory.
+        free_npu_memory, total_npu_memory = current_platform.mem_get_info()
+        # NOTE(woosuk): Here we assume that the other processes using the same
+        # NPU did not change their memory usage during the profiling.
+        peak_memory = self.init_npu_memory - free_npu_memory
+        assert peak_memory > 0, (
+            "Error in memory profiling. "
+            f"Initial free memory {self.init_npu_memory}, current free memory"
+            f" {free_npu_memory}. This happens when the NPU memory was "
+            "not properly cleaned up before initializing the vLLM instance.")
+
+        cache_block_size = self.get_cache_block_size_bytes()
+        num_npu_blocks = int(
+            (total_npu_memory * self.cache_config.gpu_memory_utilization -
+             peak_memory) // cache_block_size)
+        num_cpu_blocks = int(self.cache_config.swap_space_bytes //
+                             cache_block_size)
+        num_npu_blocks = max(num_npu_blocks, 0)
+        num_cpu_blocks = max(num_cpu_blocks, 0)
+        if self.model_runner.lora_manager:
+            self.model_runner.remove_all_loras()
+        gc.collect()
+        # TODO: this function will not need to be implemented here once
+        # empty_cache is unified in Worker.determine_num_available_blocks().
+        current_platform.empty_cache()
+        return num_npu_blocks, num_cpu_blocks
+
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        """Allocate NPU and CPU KV cache with the specified number of blocks.
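+        This also creates the cache engines (one per pipeline-parallel
+        virtual engine) and warms up the model (added doc note; no graph
+        capture is performed on NPU).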
+ """ + raise_if_cache_size_invalid(num_gpu_blocks, + self.cache_config.block_size, + self.cache_config.is_attention_free, + self.model_config.max_model_len) + + self.cache_config.num_gpu_blocks = num_gpu_blocks + self.cache_config.num_cpu_blocks = num_cpu_blocks + + self._init_cache_engine() + self._warm_up_model() + + def _init_cache_engine(self): + assert self.cache_config.num_gpu_blocks is not None + self.cache_engine = [ + CacheEngine(self.cache_config, self.model_config, + self.parallel_config, self.device_config) + for _ in range(self.parallel_config.pipeline_parallel_size) + ] + self.gpu_cache = [ + self.cache_engine[ve].gpu_cache + for ve in range(self.parallel_config.pipeline_parallel_size) + ] + bind_kv_cache(self.compilation_config.static_forward_context, + self.gpu_cache) + + def _warm_up_model(self) -> None: + # model capture is not supported, thus we just set seed here. + # Reset the seed to ensure that the random state is not affected by + # the model initialization and profiling. + set_random_seed(self.model_config.seed) + + @property + def do_metadata_broadcast(self) -> bool: + return self.parallel_config.tensor_parallel_size > 1 + + @property + def kv_cache(self) -> Optional[List[List[torch.Tensor]]]: + return self.gpu_cache + + @torch.inference_mode() + def prepare_worker_input( + self, execute_model_req: ExecuteModelRequest) -> WorkerInput: + virtual_engine = execute_model_req.virtual_engine + num_steps = execute_model_req.num_steps + num_seq_groups = len(execute_model_req.seq_group_metadata_list) + # `blocks_to_swap_in` and `blocks_to_swap_out` are cpu tensors. + # they contain parameters to launch cudamemcpyasync. + blocks_to_swap_in = torch.tensor(execute_model_req.blocks_to_swap_in, + device="cpu", + dtype=torch.int64).view(-1, 2) + blocks_to_swap_out = torch.tensor(execute_model_req.blocks_to_swap_out, + device="cpu", + dtype=torch.int64).view(-1, 2) + # `blocks_to_copy` is a gpu tensor. The src and tgt of + # blocks to copy are in the same device, and `blocks_to_copy` + # can be used directly within cuda kernels. + blocks_to_copy = torch.tensor(execute_model_req.blocks_to_copy, + device=self.device, + dtype=torch.int64).view(-1, 2) + + return WorkerInput( + num_seq_groups=num_seq_groups, + blocks_to_swap_in=blocks_to_swap_in, + blocks_to_swap_out=blocks_to_swap_out, + blocks_to_copy=blocks_to_copy, + virtual_engine=virtual_engine, + num_steps=num_steps, + ) + + def get_model(self) -> nn.Module: + return self.model_runner.get_model() + + @torch.inference_mode() + def execute_worker(self, worker_input: WorkerInput) -> None: + virtual_engine = worker_input.virtual_engine + # Issue cache operations. + if (worker_input.blocks_to_swap_in is not None + and worker_input.blocks_to_swap_in.numel() > 0): + self.cache_engine[virtual_engine].swap_in( + worker_input.blocks_to_swap_in) + if (worker_input.blocks_to_swap_out is not None + and worker_input.blocks_to_swap_out.numel() > 0): + self.cache_engine[virtual_engine].swap_out( + worker_input.blocks_to_swap_out) + if (worker_input.blocks_to_copy is not None + and worker_input.blocks_to_copy.numel() > 0): + self.cache_engine[virtual_engine].copy(worker_input.blocks_to_copy) + + def _get_cached_seq_group_metadata( + self, + seq_group_metadata_list: List[Union[SequenceGroupMetadata, + SequenceGroupMetadataDelta]], + finished_request_ids: List[str]) -> List[SequenceGroupMetadata]: + """Return a list of cached Sequence Group Metadata after updating its + state. 
+ + It is used because scheduler only sends delta to workers to reduce + the data payload size. The function also cleans up cache based on + a given `finished_request_ids`. + """ + new_seq_group_metadata_list = [] + for metadata_or_delta in seq_group_metadata_list: + request_id = metadata_or_delta.request_id + if request_id not in self._seq_group_metadata_cache: + # The first prefill. + assert isinstance(metadata_or_delta, SequenceGroupMetadata) + self._seq_group_metadata_cache[request_id] = metadata_or_delta + else: + # The first prefill is already cached. + if isinstance(metadata_or_delta, SequenceGroupMetadataDelta): + self._seq_group_metadata_cache[request_id].apply_delta( + metadata_or_delta) + else: + # If metadata snapshot is sent again, it is + # preempted. Reset the cache because we need to start + # from scratch. + assert isinstance(metadata_or_delta, SequenceGroupMetadata) + self._seq_group_metadata_cache[ + request_id] = metadata_or_delta + + new_seq_group_metadata_list.append( + self._seq_group_metadata_cache[request_id]) + + # Clean up finished ids + for finished_id in finished_request_ids: + del self._seq_group_metadata_cache[finished_id] + + return new_seq_group_metadata_list + + def _execute_model_spmd( + self, + execute_model_req: ExecuteModelRequest, + intermediate_tensors: Optional[IntermediateTensors] = None, + ) -> Optional[List[SamplerOutput]]: + if execute_model_req is not None: + new_seq_group_metadata_list = self._get_cached_seq_group_metadata( + execute_model_req.seq_group_metadata_list, + execute_model_req.finished_requests_ids) + + execute_model_req.seq_group_metadata_list = ( + new_seq_group_metadata_list) + output = super()._execute_model_spmd(execute_model_req, + intermediate_tensors) + return output + + def add_lora(self, lora_request: LoRARequest) -> bool: + raise NotImplementedError( + "LoRA is not implemented for NPU backend currently.") + + def remove_lora(self, lora_id: int) -> bool: + raise NotImplementedError( + "LoRA is not implemented for NPU backend currently.") + + def pin_lora(self, lora_id: int) -> bool: + raise NotImplementedError( + "LoRA is not implemented for NPU backend currently.") + + def list_loras(self) -> Set[int]: + raise NotImplementedError( + "LoRA is not implemented for NPU backend currently.") + + def add_prompt_adapter( + self, prompt_adapter_request: PromptAdapterRequest) -> bool: + raise NotImplementedError( + "Prompt Adapter is not implemented for NPU backend currently.") + + def remove_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError( + "Prompt Adapter is not implemented for NPU backend currently.") + + def pin_prompt_adapter(self, prompt_adapter_id: int) -> bool: + raise NotImplementedError( + "Prompt Adapter is not implemented for NPU backend currently.") + + def list_prompt_adapters(self) -> Set[int]: + raise NotImplementedError( + "Prompt Adapter is not implemented for NPU backend currently.") + + @property + def max_model_len(self) -> int: + return self.model_config.max_model_len + + @property + def vocab_size(self) -> int: + return self.model_runner.vocab_size + + def get_cache_block_size_bytes(self) -> int: + """Get the size of the KV cache block size in bytes. 
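+        This simply delegates to CacheEngine.get_cache_block_size
+        (added doc note).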
+        """
+        return CacheEngine.get_cache_block_size(self.cache_config,
+                                                self.model_config,
+                                                self.parallel_config)
+
+
+def init_worker_distributed_environment(
+        parallel_config: ParallelConfig,
+        rank: int,
+        distributed_init_method: Optional[str] = None,
+        local_rank: int = -1,
+        backend: str = "hccl") -> None:
+    """Initialize the distributed environment."""
+    set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
+
+    init_distributed_environment(parallel_config.world_size, rank,
+                                 distributed_init_method, local_rank, backend)
+
+    ensure_model_parallel_initialized(parallel_config.tensor_parallel_size,
+                                      parallel_config.pipeline_parallel_size)
+
+
+def raise_if_cache_size_invalid(num_gpu_blocks, block_size, is_attention_free,
+                                max_model_len) -> None:
+    if is_attention_free and num_gpu_blocks != 0:
+        raise ValueError("No memory should be allocated for the cache blocks "
+                         f"for an attention-free model, but {num_gpu_blocks} "
+                         "blocks are allocated.")
+    if not is_attention_free and num_gpu_blocks <= 0:
+        raise ValueError("No available memory for the cache blocks. "
+                         "Try increasing `gpu_memory_utilization` when "
+                         "initializing the engine.")
+    max_seq_len = block_size * num_gpu_blocks
+    if not is_attention_free and max_model_len > max_seq_len:
+        raise ValueError(
+            f"The model's max seq len ({max_model_len}) "
+            "is larger than the maximum number of tokens that can be "
+            f"stored in KV cache ({max_seq_len}). Try increasing "
+            "`gpu_memory_utilization` or decreasing `max_model_len` when "
+            "initializing the engine.")
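+
+
+# Illustrative sizing example (hypothetical numbers, added for clarity): with
+# block_size = 16 and num_gpu_blocks = 2048, the KV cache can hold at most
+# 16 * 2048 = 32768 tokens, so raise_if_cache_size_invalid() requires
+# max_model_len <= 32768 for attention-based models.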