diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 000000000..38c0b58e8 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,28 @@ + +### What this PR does / why we need it? + + +### Does this PR introduce _any_ user-facing change? + + +### How was this patch tested? + + diff --git a/.github/workflows/actionlint.yml b/.github/workflows/actionlint.yml new file mode 100644 index 000000000..1161a6e21 --- /dev/null +++ b/.github/workflows/actionlint.yml @@ -0,0 +1,57 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: Lint GitHub Actions workflows +on: + push: + branches: + - "main" + paths: + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' + pull_request: + branches: + - "main" + paths: + - '.github/workflows/*.ya?ml' + - '.github/workflows/actionlint.*' + - '.github/workflows/matchers/actionlint.json' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + actionlint: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Run actionlint" + run: | + echo "::add-matcher::.github/workflows/matchers/actionlint.json" + tools/actionlint.sh -color diff --git a/.github/workflows/matchers/actionlint.json b/.github/workflows/matchers/actionlint.json new file mode 100644 index 000000000..4613e1617 --- /dev/null +++ b/.github/workflows/matchers/actionlint.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "actionlint", + "pattern": [ + { + "regexp": "^(?:\\x1b\\[\\d+m)?(.+?)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*:(?:\\x1b\\[\\d+m)*(\\d+)(?:\\x1b\\[\\d+m)*: (?:\\x1b\\[\\d+m)*(.+?)(?:\\x1b\\[\\d+m)* \\[(.+?)\\]$", + "file": 1, + "line": 2, + "column": 3, + "message": 4, + "code": 5 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/mypy.json b/.github/workflows/matchers/mypy.json new file mode 100644 index 000000000..f048fce52 --- /dev/null +++ b/.github/workflows/matchers/mypy.json @@ -0,0 +1,16 @@ +{ + "problemMatcher": [ + { + "owner": "mypy", + "pattern": [ + { + "regexp": "^(.+):(\\d+):\\s(error|warning):\\s(.+)$", + "file": 1, + "line": 2, + "severity": 3, + "message": 4 + } + ] + } + ] +} diff --git a/.github/workflows/matchers/ruff.json b/.github/workflows/matchers/ruff.json new file mode 100644 index 000000000..f6d4479ee --- /dev/null +++ b/.github/workflows/matchers/ruff.json @@ -0,0 +1,17 @@ +{ + "problemMatcher": [ + { + "owner": "ruff", + "pattern": [ + { + "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$", + "file": 1, + "line": 2, + "column": 3, + "code": 4, + "message": 5 + } + ] + } + ] + } diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml new file mode 100644 index 000000000..ec9c2e6f5 --- /dev/null +++ b/.github/workflows/mypy.yaml @@ -0,0 
+1,74 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: mypy + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - "main" + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + pull_request: + branches: + - "main" + # This workflow is only relevant when one of the following files changes. + # However, we have github configured to expect and require this workflow + # to run and pass before github will auto-merge a pull request. Until github + # allows a more flexible auto-merge policy, we can just run this on every PR. + # It doesn't take that long to run, anyway. + paths: + - '**/*.py' + - '.github/workflows/mypy.yaml' + - 'tools/mypy.sh' + +jobs: + mypy: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.9", "3.10", "3.11", "3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install -r requirements-dev.txt + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + path: vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: vllm-empty + run: | + pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu + VLLM_TARGET_DEVICE=empty pip install . + + - name: Mypy + run: | + echo "::add-matcher::.github/workflows/matchers/mypy.json" + tools/mypy.sh 1 ${{ matrix.python-version }} diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml new file mode 100644 index 000000000..11573a84a --- /dev/null +++ b/.github/workflows/ruff.yml @@ -0,0 +1,57 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+# + +name: ruff + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - "main" + paths: + - "**/*.py" + - requirements-lint.txt + - .github/workflows/matchers/ruff.json + - .github/workflows/ruff.yml + pull_request: + branches: + - "main" + +jobs: + ruff: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install -r requirements-lint.txt + - name: Analysing the code with ruff + run: | + echo "::add-matcher::.github/workflows/matchers/ruff.json" + ruff check --output-format github . + - name: Run isort + run: | + isort . --check-only diff --git a/.github/workflows/shellcheck.yml b/.github/workflows/shellcheck.yml new file mode 100644 index 000000000..6a8ff7a28 --- /dev/null +++ b/.github/workflows/shellcheck.yml @@ -0,0 +1,54 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +name: Lint shell scripts +on: + push: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + pull_request: + branches: + - "main" + paths: + - '**/*.sh' + - '.github/workflows/shellcheck.yml' + +env: + LC_ALL: en_US.UTF-8 + +defaults: + run: + shell: bash + +permissions: + contents: read + +jobs: + shellcheck: + runs-on: ubuntu-latest + steps: + - name: "Checkout" + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + fetch-depth: 0 + + - name: "Check shell scripts" + run: | + tools/shellcheck.sh diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml new file mode 100644 index 000000000..bea98471a --- /dev/null +++ b/.github/workflows/vllm_ascend_test.yaml @@ -0,0 +1,106 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: 'e2e test' + +on: + push: + branches: + - "main" + paths: + - '*.txt' + - '**/*.py' + - '.github/workflows/vllm_ascend_test.yaml' + pull_request: + branches: + - "main" + paths: + - '*.txt' + - '**/*.py' + - '.github/workflows/vllm_ascend_test.yaml' + +# Bash shells do not use ~/.profile or ~/.bashrc so these shells need to be explicitly +# declared as "shell: bash -el {0}" on steps that need to be properly activated. +# It's used to activate ascend-toolkit environment variables. +defaults: + run: + shell: bash -el {0} + +jobs: + test: + name: vLLM Ascend test (self-host) + runs-on: ascend-arm64 # actionlint-ignore: runner-label + + container: + image: quay.io/ascend/cann:8.0.rc3.beta1-910b-ubuntu22.04-py3.10 + volumes: + - /usr/local/dcmi:/usr/local/dcmi + - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi + - /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ + # Use self-host cache speed up pip and model download + - /home/action/actions-runner/_work/cache:/github/home/.cache/ + options: >- + --device /dev/davinci6 + --device /dev/davinci_manager + --device /dev/devmm_svm + --device /dev/hisi_hdc + env: + HF_ENDPOINT: https://hf-mirror.com + steps: + - name: Check npu driver + run: | + npu-smi info + + - name: Config mirrors + run: | + sed -i 's|ports.ubuntu.com|mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list + pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + + - name: Checkout vllm-project/vllm-ascend repo + uses: actions/checkout@v4 + + - name: Install system dependencies + run: | + apt-get update -y + apt-get -y install `cat packages.txt` + + - name: Install dependencies + run: | + pip install -r requirements-dev.txt + + - name: Checkout vllm-project/vllm repo + uses: actions/checkout@v4 + with: + repository: vllm-project/vllm + path: ./vllm-empty + + - name: Install vllm-project/vllm from source + working-directory: ./vllm-empty + run: | + VLLM_TARGET_DEVICE=empty pip install -e . + + - name: Install vllm-project/vllm-ascend + run: | + pip install -e . + + - name: Run vllm-project/vllm-ascend test + run: | + pytest -sv tests + + - name: Run vllm-project/vllm test + run: | + bash tools/npu-vllm-test.sh diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml new file mode 100644 index 000000000..14a3ae925 --- /dev/null +++ b/.github/workflows/yapf.yml @@ -0,0 +1,54 @@ +# +# Adapted from vllm-project/vllm/blob/main/.github +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +name: yapf + +on: + # Trigger the workflow on push or pull request, + # but only for the main branch + push: + branches: + - "main" + paths: + - "**/*.py" + - .github/workflows/yapf.yml + pull_request: + branches: + - "main" + paths: + - "**/*.py" + - .github/workflows/yapf.yml + +jobs: + yapf: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ["3.12"] + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install yapf==0.32.0 + - name: Running yapf + run: | + yapf --diff --recursive . diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..3991ac8f0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,194 @@ +## vLLM Ascend Ignore +# VSCode +.vscode/ + +# egg-info +vllm_ascend.egg-info/ + +# DS Store +.DS_Store + +# Linting +actionlint +shellcheck*/ + + +# Python gitignore +## Adapted from: +## https://github.com/github/gitignore/blob/main/Python.gitignore + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
+#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/latest/usage/project/#working-with-version-control +.pdm.toml +.pdm-python +.pdm-build/ + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +.idea/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000..f801b5f8f --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ + +# vLLM Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socioeconomic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. 
+ +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official email address, +posting via an official social media account, or acting as an appointed +representative at an online or offline/IRL event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement in the #code-of-conduct +channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g). +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), +version 2.1, available at +[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html). + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion). + +For answers to common questions about this code of conduct, see the +[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at +[Contributor Covenant translations](https://www.contributor-covenant.org/translations). 
+ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 000000000..c7d45f682 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,107 @@ +# Contributing to vLLM Ascend plugin + +## Building and testing +It's recommended to set up a local development environment to build and test +before you submit a PR. + +### Prepare environment and build + +Theoretically, the vllm-ascend build is only supported on Linux because the +`vllm-ascend` dependency `torch_npu` only supports Linux. + +But you can still set up a dev env on Linux/Windows/macOS for linting and basic +tests with the following commands: + +```bash +# Choose a base dir (~/vllm-project/) and set up venv +cd ~/vllm-project/ +python3 -m venv .venv +source ./.venv/bin/activate + +# Clone vllm code and install +git clone https://github.com/vllm-project/vllm.git +cd vllm +pip install -r requirements-build.txt +VLLM_TARGET_DEVICE="empty" pip install . +cd .. + +# Clone vllm-ascend and install +git clone https://github.com/vllm-project/vllm-ascend.git +cd vllm-ascend +pip install -r requirements-dev.txt + +# Then you can run lint and mypy tests +bash format.sh + +# Build: +# - only supported on Linux (torch_npu available) +# pip install -e . +# - build without deps for debugging on other OSes +# pip install -e . --no-deps + +# Commit changed files using `-s` +git commit -sm "your commit info" +``` + +### Testing + +Although the vllm-ascend CI provides integration tests on [Ascend](.github/workflows/vllm_ascend_test.yaml), you can also run them +locally. The simplest way to run these integration tests locally is through a container: + +```bash +# Under Ascend NPU environment +git clone https://github.com/vllm-project/vllm-ascend.git +cd vllm-ascend + +IMAGE=vllm-ascend-dev-image +CONTAINER_NAME=vllm-ascend-dev +DEVICE=/dev/davinci1 + +# The first build will take about 10 mins (10MB/s) to download the base image and packages +docker build -t $IMAGE -f ./Dockerfile . +# You can also specify a mirror repo by setting VLLM_REPO to speed this up +# docker build -t $IMAGE -f ./Dockerfile . --build-arg VLLM_REPO=https://gitee.com/mirrors/vllm + +docker run --name $CONTAINER_NAME --network host --device $DEVICE \ + --device /dev/davinci_manager --device /dev/devmm_svm \ + --device /dev/hisi_hdc -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ + -ti --rm $IMAGE bash + +cd vllm-ascend +pip install -r requirements-dev.txt + +pytest tests/ +``` + +## DCO and Signed-off-by + +When contributing changes to this project, you must agree to the DCO. Commits must include a `Signed-off-by:` header which certifies agreement with the terms of the DCO. + +Using `-s` with `git commit` will automatically add this header. + +## PR Title and Classification + +Only specific types of PRs will be reviewed. The PR title should be prefixed appropriately to indicate the type of change. Please use one of the following: + +- `[Attention]` for new features or optimizations in attention. +- `[Communicator]` for new features or optimizations in communicators. +- `[ModelRunner]` for new features or optimizations in the model runner. +- `[Platform]` for new features or optimizations in the platform. +- `[Worker]` for new features or optimizations in the worker. +- `[Core]` for new features or optimizations in the core vllm-ascend logic (such as platform, attention, communicators, model runner) +- `[Kernel]` for changes affecting compute kernels and ops. +- `[Bugfix]` for bug fixes.
+- `[Doc]` for documentation fixes and improvements. +- `[Test]` for tests (such as unit tests). +- `[CI]` for build or continuous integration improvements. +- `[Misc]` for PRs that do not fit the above categories. Please use this sparingly. + +> [!NOTE] +> If the PR spans more than one category, please include all relevant prefixes. + +## Others + +You may find more information about contributing to the vLLM Ascend backend plugin on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html). +If you find any problems while contributing, feel free to submit a PR that improves this doc to help other developers. diff --git a/DCO b/DCO new file mode 100644 index 000000000..49b8cb054 --- /dev/null +++ b/DCO @@ -0,0 +1,34 @@ +Developer Certificate of Origin +Version 1.1 + +Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + +Everyone is permitted to copy and distribute verbatim copies of this +license document, but changing it is not allowed. + + +Developer's Certificate of Origin 1.1 + +By making a contribution to this project, I certify that: + +(a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + +(b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + +(c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + +(d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..63c8bd685 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,40 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +FROM quay.io/ascend/cann:8.0.rc3.beta1-910b-ubuntu22.04-py3.10 + +# Define environments +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update -y && \ + apt-get install -y python3-pip git vim + +WORKDIR /workspace + +COPY . 
/workspace/vllm-ascend/ + +RUN pip config set global.index-url https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple + +# Install vLLM main +ARG VLLM_REPO=https://github.com/vllm-project/vllm.git +RUN git clone --depth 1 $VLLM_REPO /workspace/vllm +RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install /workspace/vllm/ + +# Install vllm-ascend main +RUN python3 -m pip install /workspace/vllm-ascend/ + +CMD ["/bin/bash"] diff --git a/README.md b/README.md new file mode 100644 index 000000000..c16e8372d --- /dev/null +++ b/README.md @@ -0,0 +1,102 @@ +
+
+
+| About Ascend | Developer Slack (#sig-ascend) | +
+ +--- +*Latest News* 🔥 + +- [2024/12] We are working with the vLLM community to support [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162). +--- +## Overview + +vLLM Ascend plugin (`vllm-ascend`) is a backend plugin for running vLLM on the Ascend NPU. + +This plugin is the recommended approach for supporting the Ascend backend within the vLLM community. It adheres to the principles outlined in the [[RFC]: Hardware pluggable](https://github.com/vllm-project/vllm/issues/11162), providing a hardware-pluggable interface that decouples the integration of the Ascend NPU from vLLM. + +With the vLLM Ascend plugin, popular open-source models, including Transformer-like, Mixture-of-Experts, embedding, and multi-modal LLMs, can run seamlessly on the Ascend NPU. + +## Prerequisites +### Supported Devices +- Atlas A2 Training series (Atlas 800T A2, Atlas 900 A2 PoD, Atlas 200T A2 Box16, Atlas 300T A2) +- Atlas 800I A2 Inference series (Atlas 800I A2) + +### Dependencies +| Requirement | Supported version | Recommended version | Note | +|-------------|-------------------| ----------- |------------------------------------------| +| vLLM | main | main | Required for vllm-ascend | +| Python | >= 3.9 | [3.10](https://www.python.org/downloads/) | Required for vllm | +| CANN | >= 8.0.RC2 | [8.0.RC3](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.0.beta1) | Required for vllm-ascend and torch-npu | +| torch-npu | >= 2.4.0 | [2.5.1rc1](https://gitee.com/ascend/pytorch/releases/tag/v6.0.0.alpha001-pytorch2.5.1) | Required for vllm-ascend | +| torch | >= 2.4.0 | [2.5.1](https://github.com/pytorch/pytorch/releases/tag/v2.5.1) | Required for torch-npu and vllm | + +Find out more about how to set up your environment [here](docs/environment.md). + +## Getting Started + +> [!NOTE] +> Currently, we are actively collaborating with the vLLM community to support the Ascend backend plugin. Once it is supported, you can use the one-line command `pip install vllm vllm-ascend` to complete the installation. + +Installation from source code: +```bash +# Install vllm main branch according to: +# https://docs.vllm.ai/en/latest/getting_started/installation/cpu/index.html#build-wheel-from-source +git clone --depth 1 https://github.com/vllm-project/vllm.git +cd vllm +pip install -r requirements-build.txt +VLLM_TARGET_DEVICE=empty pip install . + +# Install vllm-ascend main branch +git clone https://github.com/vllm-project/vllm-ascend.git +cd vllm-ascend +pip install -e . +``` + +Run the following command to start the vLLM server with the [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct) model: + +```bash +# export VLLM_USE_MODELSCOPE=true to speed up download +vllm serve Qwen/Qwen2.5-0.5B-Instruct +curl http://localhost:8000/v1/models +``` + +Please refer to [vLLM Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html) for more details. + +## Building + +#### Build Python package from source + +```bash +git clone https://github.com/vllm-project/vllm-ascend.git +cd vllm-ascend +pip install -e . +``` + +#### Build container image from source +```bash +git clone https://github.com/vllm-project/vllm-ascend.git +cd vllm-ascend +docker build -t vllm-ascend-dev-image -f ./Dockerfile . +``` + +See [Building and Testing](./CONTRIBUTING.md) for more details; it is a step-by-step guide to help you set up the development environment, build, and test.
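+
+As a quick sanity check after building from source, you can run a short offline inference script on the NPU. The following is a minimal sketch that mirrors `examples/offline_inference_npu.py` from this repo; `facebook/opt-125m` is only an example model and can be swapped for any model you have access to:
+
+```python
+from vllm import LLM, SamplingParams
+
+prompts = ["Hello, my name is", "The future of AI is"]
+
+# Greedy sampling, up to 100 new tokens per prompt.
+sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+
+# A small model, used here only to verify that the build works end to end.
+llm = LLM(model="facebook/opt-125m")
+
+outputs = llm.generate(prompts, sampling_params)
+for output in outputs:
+    print(f"Prompt: {output.prompt!r}, Generated text: {output.outputs[0].text!r}")
+```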
+ +## Contributing +We welcome and value any contributions and collaborations: +- Please let us know if you encounter a bug by [filing an issue](https://github.com/vllm-project/vllm-ascend/issues). +- Please see the guidance on how to contribute in [CONTRIBUTING.md](./CONTRIBUTING.md). + +## License + +Apache License 2.0, as found in the [LICENSE](./LICENSE) file. diff --git a/docs/environment.md b/docs/environment.md new file mode 100644 index 000000000..5dd70b29a --- /dev/null +++ b/docs/environment.md @@ -0,0 +1,38 @@ +### Prepare Ascend NPU environment + +### Dependencies +| Requirement | Supported version | Recommended version | Note | +| ------------ | ------- | ----------- | ----------- | +| Python | >= 3.9 | [3.10](https://www.python.org/downloads/) | Required for vllm | +| CANN | >= 8.0.RC2 | [8.0.RC3](https://www.hiascend.com/developer/download/community/result?module=cann&cann=8.0.0.beta1) | Required for vllm-ascend and torch-npu | +| torch-npu | >= 2.4.0 | [2.5.1rc1](https://gitee.com/ascend/pytorch/releases/tag/v6.0.0.alpha001-pytorch2.5.1) | Required for vllm-ascend | +| torch | >= 2.4.0 | [2.5.1](https://github.com/pytorch/pytorch/releases/tag/v2.5.1) | Required for torch-npu and vllm | + + +Below is a quick guide to installing the recommended software versions: + +#### Containerized installation + +You can use the [container image](https://hub.docker.com/r/ascendai/cann) directly with a one-line command: + +```bash +docker run \ + --name vllm-ascend-env \ + --device /dev/davinci1 \ + --device /dev/davinci_manager \ + --device /dev/devmm_svm \ + --device /dev/hisi_hdc \ + -v /usr/local/dcmi:/usr/local/dcmi \ + -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ + -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ + -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ + -v /etc/ascend_install.info:/etc/ascend_install.info \ + -it quay.io/ascend/cann:8.0.rc3.beta1-910b-ubuntu22.04-py3.10 bash +``` + +You do not need to install `torch` and `torch_npu` manually; they will be installed automatically as `vllm-ascend` dependencies. + +#### Manual installation + +Alternatively, follow the instructions provided in the [Ascend Installation Guide](https://ascend.github.io/docs/sources/ascend/quick_install.html) to set up the environment. + diff --git a/docs/logos/vllm-ascend-logo-text-dark.png b/docs/logos/vllm-ascend-logo-text-dark.png new file mode 100644 index 000000000..f534d09ee Binary files /dev/null and b/docs/logos/vllm-ascend-logo-text-dark.png differ diff --git a/docs/logos/vllm-ascend-logo-text-light.png b/docs/logos/vllm-ascend-logo-text-light.png new file mode 100644 index 000000000..b71b49267 Binary files /dev/null and b/docs/logos/vllm-ascend-logo-text-light.png differ diff --git a/examples/offline_distributed_inference_npu.py b/examples/offline_distributed_inference_npu.py new file mode 100644 index 000000000..f8d5489a5 --- /dev/null +++ b/examples/offline_distributed_inference_npu.py @@ -0,0 +1,45 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/examples/offline_inference/basic.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from vllm import LLM, SamplingParams + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +# Create a sampling params object. +sampling_params = SamplingParams(max_tokens=100, temperature=0.0) +# Create an LLM. +# TODO (cmq): ray is not supported currently, need some fixes +llm = LLM( + model="facebook/opt-125m", + tensor_parallel_size=2, + distributed_executor_backend="mp", + trust_remote_code=True, +) + +# Generate texts from the prompts. +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py new file mode 100644 index 000000000..785492c7d --- /dev/null +++ b/examples/offline_inference_audio_language.py @@ -0,0 +1,153 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/examples/offline_inference/audio_language.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +""" +This example shows how to use vLLM for running offline inference +with the correct prompt format on audio language models. + +For most models, the prompt format should follow corresponding examples +on HuggingFace model repository. +""" + +from transformers import AutoTokenizer +from vllm import LLM, SamplingParams +from vllm.assets.audio import AudioAsset +from vllm.utils import FlexibleArgumentParser + +audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] +question_per_audio_count = { + 0: "What is 1+1?", + 1: "What is recited in the audio?", + 2: "What sport and what nursery rhyme are referenced?" +} + +# NOTE: The default `max_num_seqs` and `max_model_len` may result in OOM on +# lower-end GPUs. +# Unless specified, these settings have been tested to work on a single L4. 
+ + +# Ultravox 0.3 +def run_ultravox(question: str, audio_count: int): + model_name = "fixie-ai/ultravox-v0_3" + + tokenizer = AutoTokenizer.from_pretrained(model_name) + messages = [{ + 'role': 'user', + 'content': "<|audio|>\n" * audio_count + question + }] + prompt = tokenizer.apply_chat_template(messages, + tokenize=False, + add_generation_prompt=True) + + llm = LLM(model=model_name, + max_model_len=4096, + max_num_seqs=5, + trust_remote_code=True, + limit_mm_per_prompt={"audio": audio_count}) + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# Qwen2-Audio +def run_qwen2_audio(question: str, audio_count: int): + model_name = "Qwen/Qwen2-Audio-7B-Instruct" + + llm = LLM(model=model_name, + max_model_len=4096, + max_num_seqs=5, + limit_mm_per_prompt={"audio": audio_count}) + + audio_in_prompt = "".join([ + f"Audio {idx+1}: " + f"<|audio_bos|><|AUDIO|><|audio_eos|>\n" for idx in range(audio_count) + ]) + + prompt = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "<|im_start|>user\n" + f"{audio_in_prompt}{question}<|im_end|>\n" + "<|im_start|>assistant\n") + stop_token_ids = None + return llm, prompt, stop_token_ids + + +# TODO (cmq): test ultravox +model_example_map = { + # "ultravox": run_ultravox, + "qwen2_audio": run_qwen2_audio +} + + +def main(args): + model = args.model_type + if model not in model_example_map: + raise ValueError(f"Model type {model} is not supported.") + + audio_count = args.num_audios + llm, prompt, stop_token_ids = model_example_map[model]( + question_per_audio_count[audio_count], audio_count) + + # We set temperature to 0.2 so that outputs can be different + # even when all prompts are identical when running batch inference. + sampling_params = SamplingParams(temperature=0.2, + max_tokens=64, + stop_token_ids=stop_token_ids) + + mm_data = {} + if audio_count > 0: + mm_data = { + "audio": [ + asset.audio_and_sample_rate + for asset in audio_assets[:audio_count] + ] + } + + assert args.num_prompts > 0 + inputs = {"prompt": prompt, "multi_modal_data": mm_data} + if args.num_prompts > 1: + # Batch inference + inputs = [inputs] * args.num_prompts + + outputs = llm.generate(inputs, sampling_params=sampling_params) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'audio language models') + parser.add_argument('--model-type', + '-m', + type=str, + default="qwen2_audio", + choices=model_example_map.keys(), + help='Huggingface "model_type".') + parser.add_argument('--num-prompts', + type=int, + default=1, + help='Number of prompts to run.') + parser.add_argument("--num-audios", + type=int, + default=1, + choices=[0, 1, 2], + help="Number of audio items per prompt.") + + args = parser.parse_args() + main(args) diff --git a/examples/offline_inference_npu.py b/examples/offline_inference_npu.py new file mode 100644 index 000000000..10c2c6e40 --- /dev/null +++ b/examples/offline_inference_npu.py @@ -0,0 +1,39 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/examples/offline_inference/basic.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from vllm import LLM, SamplingParams + +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] + +# Create a sampling params object. +sampling_params = SamplingParams(max_tokens=100, temperature=0.0) +# Create an LLM. +llm = LLM(model="facebook/opt-125m") + +# Generate texts from the prompts. +outputs = llm.generate(prompts, sampling_params) +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/format.sh b/format.sh new file mode 100755 index 000000000..9ea7495c2 --- /dev/null +++ b/format.sh @@ -0,0 +1,341 @@ +#!/usr/bin/env bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# YAPF formatter, adapted from ray and skypilot. +# +# Usage: +# # Do work and commit your work. + +# # Format files that differ from origin/main. +# bash format.sh + +# # Commit changed files with message 'Run yapf and ruff' +# +# +# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. +# You are encouraged to run this locally before pushing changes for review. + +# Cause the script to exit if a single command fails +set -eo pipefail + +# this stops git rev-parse from failing if we run this from the .git directory +builtin cd "$(dirname "${BASH_SOURCE:-$0}")" +ROOT="$(git rev-parse --show-toplevel)" +builtin cd "$ROOT" || exit 1 + +check_command() { + if ! command -v "$1" &> /dev/null; then + echo "❓❓$1 is not installed, please run \`pip install -r requirements-lint.txt\`" + exit 1 + fi +} + +check_command yapf +check_command ruff +check_command mypy +check_command codespell +check_command isort +check_command clang-format + +YAPF_VERSION=$(yapf --version | awk '{print $2}') +RUFF_VERSION=$(ruff --version | awk '{print $2}') +MYPY_VERSION=$(mypy --version | awk '{print $2}') +CODESPELL_VERSION=$(codespell --version) +ISORT_VERSION=$(isort --vn) +CLANGFORMAT_VERSION=$(clang-format --version | awk '{print $3}') +SPHINX_LINT_VERSION=$(sphinx-lint --version | awk '{print $2}') + +# params: tool name, tool version, required version +tool_version_check() { + expected=$(grep "$1" requirements-lint.txt | cut -d'=' -f3) + if [[ "$2" != "$expected" ]]; then + echo "❓❓Wrong $1 version installed: $expected is required, not $2."
+ exit 1 + fi +} + +tool_version_check "yapf" "$YAPF_VERSION" +tool_version_check "ruff" "$RUFF_VERSION" +tool_version_check "mypy" "$MYPY_VERSION" +tool_version_check "isort" "$ISORT_VERSION" +tool_version_check "codespell" "$CODESPELL_VERSION" +tool_version_check "clang-format" "$CLANGFORMAT_VERSION" +tool_version_check "sphinx-lint" "$SPHINX_LINT_VERSION" + +YAPF_FLAGS=( + '--recursive' + '--parallel' +) + +YAPF_EXCLUDES=( + '--exclude' 'build/**' +) + +# Format specified files +format() { + yapf --in-place "${YAPF_FLAGS[@]}" "$@" +} + +# Format files that differ from main branch. Ignores dirs that are not slated +# for autoformat yet. +format_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause yapf to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ + yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}" + fi + +} + +# Format all files +format_all() { + yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" . +} + +## This flag formats individual files. --files *must* be the first command line +## arg to use this option. +if [[ "$1" == '--files' ]]; then + format "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is formatted. +elif [[ "$1" == '--all' ]]; then + format_all +else + # Format only the files that changed in last commit. + format_changed +fi +echo 'vLLM yapf: Done' + +# Run mypy +echo 'vLLM mypy:' +tools/mypy.sh +echo 'vLLM mypy: Done' + + +# If git diff returns a file that is in the skip list, the file may be checked anyway: +# https://github.com/codespell-project/codespell/issues/1915 +# Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem +CODESPELL_EXCLUDES=( + '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**' +) + +# check spelling of specified files +spell_check() { + codespell "$@" +} + +spell_check_all(){ + codespell --toml pyproject.toml "${CODESPELL_EXCLUDES[@]}" +} + +# Spelling check of files that differ from main branch. +spell_check_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause ruff to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + codespell "${CODESPELL_EXCLUDES[@]}" + fi +} + +# Run Codespell +## This flag runs spell check of individual files. --files *must* be the first command line +## arg to use this option. +if [[ "$1" == '--files' ]]; then + spell_check "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is linted. +elif [[ "$1" == '--all' ]]; then + spell_check_all +else + # Check spelling only of the files that changed in last commit. 
+ spell_check_changed +fi +echo 'vLLM codespell: Done' + + +# Lint specified files +lint() { + ruff check "$@" +} + +# Lint files that differ from main branch. Ignores dirs that are not slated +# for autolint yet. +lint_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause ruff to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + ruff check + fi + +} + +# Run Ruff +### This flag lints individual files. --files *must* be the first command line +### arg to use this option. +if [[ "$1" == '--files' ]]; then + lint "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is linted. +elif [[ "$1" == '--all' ]]; then + lint vllm tests +else + # Lint only the files that changed in the last commit. + lint_changed +fi +echo 'vLLM ruff: Done' + +# Sort imports in specified files +isort_check() { + isort "$@" +} + +isort_check_all(){ + isort . +} + +# Sort imports in files that differ from main branch. +isort_check_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause isort to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that + # exist on both branches. + MERGEBASE="$(git merge-base origin/main HEAD)" + + if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then + git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ + isort + fi +} + +# Run Isort +# This flag runs isort on individual files. --files *must* be the first command line +# arg to use this option. +if [[ "$1" == '--files' ]]; then + isort_check "${@:2}" + # If `--all` is passed, then any further arguments are ignored and the + # entire python directory is linted. +elif [[ "$1" == '--all' ]]; then + isort_check_all +else + # Sort imports only in the files that changed in the last commit. + isort_check_changed +fi +echo 'vLLM isort: Done' + +# Clang-format section +# Exclude some files for formatting because they are vendored +# NOTE: Keep up to date with .github/workflows/clang-format.yml +CLANG_FORMAT_EXCLUDES=( + 'csrc/moe/topk_softmax_kernels.cu' + 'csrc/quantization/gguf/ggml-common.h' + 'csrc/quantization/gguf/dequantize.cuh' + 'csrc/quantization/gguf/vecdotq.cuh' + 'csrc/quantization/gguf/mmq.cuh' + 'csrc/quantization/gguf/mmvq.cuh' +) + +# Format specified files with clang-format +clang_format() { + clang-format -i "$@" +} + +# Format files that differ from main branch with clang-format. +clang_format_changed() { + # The `if` guard ensures that the list of filenames is not empty, which + # could cause clang-format to receive 0 positional arguments, making it hang + # waiting for STDIN. + # + # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that + # exist on both branches.
+ MERGEBASE="$(git merge-base origin/main HEAD)" + + # Get the list of changed files, excluding the specified ones + changed_files=$(git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.h' '*.cpp' '*.cu' '*.cuh' | (grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") || echo -e)) + if [ -n "$changed_files" ]; then + echo "$changed_files" | xargs -P 5 clang-format -i + fi +} + +# Format all files with clang-format +clang_format_all() { + find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \ + | grep -vFf <(printf "%s\n" "${CLANG_FORMAT_EXCLUDES[@]}") \ + | xargs clang-format -i +} + +# Run clang-format +if [[ "$1" == '--files' ]]; then + clang_format "${@:2}" +elif [[ "$1" == '--all' ]]; then + clang_format_all +else + clang_format_changed +fi +echo 'vLLM clang-format: Done' + +echo 'vLLM actionlint:' +tools/actionlint.sh -color +echo 'vLLM actionlint: Done' + +echo 'vLLM shellcheck:' +tools/shellcheck.sh +echo 'vLLM shellcheck: Done' + +echo 'excalidraw png check:' +tools/png-lint.sh +echo 'excalidraw png check: Done' + +if ! git diff --quiet &>/dev/null; then + echo + echo "🔍🔍There are files changed by the format checker or by you that are not added and committed:" + git --no-pager diff --name-only + echo "🔍🔍Format checker passed, but please add, commit and push all the files above to include changes made by the format checker." + + exit 1 +else + echo "✨🎉 Format check passed! Congratulations! 🎉✨" +fi + +# echo 'vLLM sphinx-lint:' +# tools/sphinx-lint.sh +# echo 'vLLM sphinx-lint: Done' diff --git a/mypy.ini b/mypy.ini new file mode 100644 index 000000000..b627e7f51 --- /dev/null +++ b/mypy.ini @@ -0,0 +1,14 @@ +[mypy] +; warn_return_any = True +warn_unused_configs = True + +; Suppress all missing import errors from torch_npu for mypy. +[mypy-torch_npu.*] +ignore_missing_imports = True + +[mypy-transformers.*] +ignore_missing_imports = True + +; Remove this after https://github.com/vllm-project/vllm/pull/11324 is merged +[mypy-vllm.distributed.device_communicators.base_communicator] +ignore_missing_imports = True diff --git a/packages.txt b/packages.txt new file mode 100644 index 000000000..c6490115b --- /dev/null +++ b/packages.txt @@ -0,0 +1,3 @@ +git +vim + diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 000000000..60a78830d --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +-r requirements-lint.txt +modelscope +pytest diff --git a/requirements-lint.txt b/requirements-lint.txt new file mode 100644 index 000000000..711bb50a0 --- /dev/null +++ b/requirements-lint.txt @@ -0,0 +1,15 @@ +# formatting +yapf==0.32.0 +toml==0.10.2 +tomli==2.0.2 +ruff==0.6.5 +codespell==2.3.0 +isort==5.13.2 +clang-format==18.1.5 +sphinx-lint==1.0.0 + +# type checking +mypy==1.11.1 +types-PyYAML +types-requests +types-setuptools diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 000000000..51cb33f2b --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +decorator +pyyaml +scipy +setuptools +torch_npu == 2.5.1rc1 diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..4aaab9907 --- /dev/null +++ b/setup.py @@ -0,0 +1,95 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/blob/main/setup.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +from typing import List + +from setuptools import setup + +ROOT_DIR = os.path.dirname(__file__) + + +def get_path(*filepath) -> str: + return os.path.join(ROOT_DIR, *filepath) + + +def read_readme() -> str: + """Read the README file if present.""" + p = get_path("README.md") + if os.path.isfile(p): + with open(get_path("README.md"), encoding="utf-8") as f: + return f.read() + else: + return "" + + +def get_requirements() -> List[str]: + """Get Python package dependencies from requirements.txt.""" + + def _read_requirements(filename: str) -> List[str]: + with open(get_path(filename)) as f: + requirements = f.read().strip().split("\n") + resolved_requirements = [] + for line in requirements: + if line.startswith("-r "): + resolved_requirements += _read_requirements(line.split()[1]) + elif line.startswith("--"): + continue + else: + resolved_requirements.append(line) + return resolved_requirements + + try: + requirements = _read_requirements("requirements.txt") + except ValueError: + print("Failed to read requirements.txt in vllm_ascend.") + # Fall back to an empty list so the return below cannot hit an + # undefined name. + requirements = [] + return requirements + + +setup( + name='vllm_ascend', + # Follow: + # https://packaging.python.org/en/latest/specifications/version-specifiers + version='0.1.0a1', + author="vLLM-Ascend team", + license="Apache 2.0", + description=("vLLM Ascend backend plugin"), + long_description=read_readme(), + long_description_content_type="text/markdown", + url="https://github.com/vllm-project/vllm-ascend", + project_urls={ + "Homepage": "https://github.com/vllm-project/vllm-ascend", + }, + classifiers=[ + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "License :: OSI Approved :: Apache Software License", + "Intended Audience :: Developers", + "Intended Audience :: Information Technology", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Topic :: Scientific/Engineering :: Information Analysis", + ], + packages=['vllm_ascend'], + python_requires=">=3.9", + install_requires=get_requirements(), + extras_require={}, + entry_points={'vllm.platform_plugins': ["ascend = vllm_ascend:register"]}) diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..3a593e45e --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,331 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/blob/main/tests/conftest.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
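The `entry_points={'vllm.platform_plugins': [...]}` declaration in the `setup()` call above is what makes the Ascend backend discoverable once the wheel is installed. As a rough illustration only (not part of this patch), the same entry point can be resolved with nothing but the standard library; the group name and the `ascend = vllm_ascend:register` target come from setup.py above, while the body and return value of `register()` are outside this diff:

# Illustrative sketch only: resolve the entry point that setup.py registers.
# Assumes an installed vllm_ascend wheel and Python >= 3.10 for the group=
# selector of importlib.metadata.entry_points().
from importlib.metadata import entry_points

for ep in entry_points(group="vllm.platform_plugins"):
    if ep.name == "ascend":
        register = ep.load()  # resolves to vllm_ascend:register
        print(register)       # register() itself is defined outside this patch

vLLM is expected to perform an equivalent lookup when it loads platform plugins, so no wiring beyond this `setup()` entry should be needed.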
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import List, Optional, Tuple, TypeVar, Union + +import numpy as np +import pytest +from PIL import Image +from vllm import LLM, SamplingParams +from vllm.config import TaskOption +from vllm.distributed import cleanup_dist_env_and_memory +from vllm.inputs import ExplicitEncoderDecoderPrompt, TextPrompt, TokensPrompt +from vllm.logger import init_logger +from vllm.outputs import RequestOutput +from vllm.sampling_params import BeamSearchParams +from vllm.utils import is_list_of + +from tests.model_utils import (TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs) + +logger = init_logger(__name__) + +_M = TypeVar("_M") +_PromptMultiModalInput = Union[List[_M], List[List[_M]]] + +PromptImageInput = _PromptMultiModalInput[Image.Image] +PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]] +PromptVideoInput = _PromptMultiModalInput[np.ndarray] + + +class VllmRunner: + + def __init__( + self, + model_name: str, + task: TaskOption = "auto", + tokenizer_name: Optional[str] = None, + tokenizer_mode: str = "auto", + # Use smaller max model length, otherwise bigger model cannot run due + # to kv cache size limit. + max_model_len: int = 1024, + dtype: str = "half", + disable_log_stats: bool = True, + tensor_parallel_size: int = 1, + block_size: int = 16, + enable_chunked_prefill: bool = False, + swap_space: int = 4, + enforce_eager: Optional[bool] = False, + **kwargs, + ) -> None: + self.model = LLM( + model=model_name, + task=task, + tokenizer=tokenizer_name, + tokenizer_mode=tokenizer_mode, + trust_remote_code=True, + dtype=dtype, + swap_space=swap_space, + enforce_eager=enforce_eager, + disable_log_stats=disable_log_stats, + tensor_parallel_size=tensor_parallel_size, + max_model_len=max_model_len, + block_size=block_size, + enable_chunked_prefill=enable_chunked_prefill, + **kwargs, + ) + + def get_inputs( + self, + prompts: List[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[TextPrompt]: + if images is not None: + assert len(prompts) == len(images) + + if videos is not None: + assert len(prompts) == len(videos) + + if audios is not None: + assert len(prompts) == len(audios) + + inputs = [TextPrompt(prompt=prompt) for prompt in prompts] + if images is not None: + for i, image in enumerate(images): + if image is not None: + inputs[i]["multi_modal_data"] = {"image": image} + + if videos is not None: + for i, video in enumerate(videos): + if video is not None: + inputs[i]["multi_modal_data"] = {"video": video} + + if audios is not None: + for i, audio in enumerate(audios): + if audio is not None: + inputs[i]["multi_modal_data"] = {"audio": audio} + + return inputs + + def generate( + self, + prompts: List[str], + sampling_params: SamplingParams, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[Tuple[List[List[int]], List[str]]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.generate(inputs, + sampling_params=sampling_params) + + outputs: List[Tuple[List[List[int]], List[str]]] = [] + for req_output in req_outputs: + prompt_str = req_output.prompt + prompt_ids = req_output.prompt_token_ids + req_sample_output_ids: List[List[int]] = [] + req_sample_output_strs: List[str] = [] + for sample in 
req_output.outputs: + output_str = sample.text + output_ids = list(sample.token_ids) + req_sample_output_ids.append(prompt_ids + output_ids) + req_sample_output_strs.append(prompt_str + output_str) + outputs.append((req_sample_output_ids, req_sample_output_strs)) + return outputs + + @staticmethod + def _final_steps_generate_w_logprobs( + req_outputs: List[RequestOutput], + ) -> List[TokensTextLogprobsPromptLogprobs]: + outputs: List[TokensTextLogprobsPromptLogprobs] = [] + for req_output in req_outputs: + assert len(req_output.outputs) > 0 + for sample in req_output.outputs: + output_str = sample.text + output_ids = list(sample.token_ids) + output_logprobs = sample.logprobs + outputs.append((output_ids, output_str, output_logprobs, + req_output.prompt_logprobs)) + return outputs + + def generate_w_logprobs( + self, + prompts: List[str], + sampling_params: SamplingParams, + images: Optional[PromptImageInput] = None, + audios: Optional[PromptAudioInput] = None, + videos: Optional[PromptVideoInput] = None, + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.generate(inputs, + sampling_params=sampling_params) + + toks_str_logsprobs_prompt_logprobs = ( + self._final_steps_generate_w_logprobs(req_outputs)) + # Omit prompt logprobs if not required by sampling params + return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] + if sampling_params.prompt_logprobs is None else + toks_str_logsprobs_prompt_logprobs) + + def generate_encoder_decoder_w_logprobs( + self, + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + sampling_params: SamplingParams, + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: + ''' + Logprobs generation for vLLM encoder/decoder models + ''' + + assert sampling_params.logprobs is not None + req_outputs = self.model.generate(encoder_decoder_prompts, + sampling_params=sampling_params) + toks_str_logsprobs_prompt_logprobs = ( + self._final_steps_generate_w_logprobs(req_outputs)) + # Omit prompt logprobs if not required by sampling params + return ([x[0:-1] for x in toks_str_logsprobs_prompt_logprobs] + if sampling_params.prompt_logprobs is None else + toks_str_logsprobs_prompt_logprobs) + + def generate_greedy( + self, + prompts: List[str], + max_tokens: int, + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[Tuple[List[int], str]]: + greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) + outputs = self.generate(prompts, + greedy_params, + images=images, + videos=videos, + audios=audios) + return [(output_ids[0], output_str[0]) + for output_ids, output_str in outputs] + + def generate_greedy_logprobs( + self, + prompts: List[str], + max_tokens: int, + num_logprobs: int, + num_prompt_logprobs: Optional[int] = None, + images: Optional[PromptImageInput] = None, + audios: Optional[PromptAudioInput] = None, + videos: Optional[PromptVideoInput] = None, + stop_token_ids: Optional[List[int]] = None, + stop: Optional[List[str]] = None, + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: + greedy_logprobs_params = SamplingParams( + temperature=0.0, + max_tokens=max_tokens, + logprobs=num_logprobs, + prompt_logprobs=num_prompt_logprobs, + stop_token_ids=stop_token_ids, + stop=stop) + + return self.generate_w_logprobs(prompts, + 
greedy_logprobs_params, + images=images, + audios=audios, + videos=videos) + + def generate_encoder_decoder_greedy_logprobs( + self, + encoder_decoder_prompts: List[ExplicitEncoderDecoderPrompt[str, str]], + max_tokens: int, + num_logprobs: int, + num_prompt_logprobs: Optional[int] = None, + ) -> Union[List[TokensTextLogprobs], + List[TokensTextLogprobsPromptLogprobs]]: + greedy_logprobs_params = SamplingParams( + temperature=0.0, + max_tokens=max_tokens, + logprobs=num_logprobs, + prompt_logprobs=(num_prompt_logprobs), + ) + ''' + Greedy logprobs generation for vLLM encoder/decoder models + ''' + + return self.generate_encoder_decoder_w_logprobs( + encoder_decoder_prompts, greedy_logprobs_params) + + def generate_beam_search( + self, + prompts: Union[List[str], List[List[int]]], + beam_width: int, + max_tokens: int, + ) -> List[Tuple[List[List[int]], List[str]]]: + if is_list_of(prompts, str, check="all"): + prompts = [TextPrompt(prompt=prompt) for prompt in prompts] + else: + prompts = [ + TokensPrompt(prompt_token_ids=tokens) for tokens in prompts + ] + outputs = self.model.beam_search( + prompts, + BeamSearchParams(beam_width=beam_width, max_tokens=max_tokens)) + returned_outputs = [] + for output in outputs: + token_ids = [x.tokens for x in output.sequences] + texts = [x.text for x in output.sequences] + returned_outputs.append((token_ids, texts)) + return returned_outputs + + def classify(self, prompts: List[str]) -> List[List[float]]: + req_outputs = self.model.classify(prompts) + return [req_output.outputs.probs for req_output in req_outputs] + + def encode( + self, + prompts: List[str], + images: Optional[PromptImageInput] = None, + videos: Optional[PromptVideoInput] = None, + audios: Optional[PromptAudioInput] = None, + ) -> List[List[float]]: + inputs = self.get_inputs(prompts, + images=images, + videos=videos, + audios=audios) + + req_outputs = self.model.embed(inputs) + return [req_output.outputs.embedding for req_output in req_outputs] + + def score( + self, + text_1: Union[str, List[str]], + text_2: Union[str, List[str]], + ) -> List[float]: + req_outputs = self.model.score(text_1, text_2) + return [req_output.outputs.score for req_output in req_outputs] + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + del self.model + cleanup_dist_env_and_memory() + + +@pytest.fixture(scope="session") +def vllm_runner(): + return VllmRunner diff --git a/tests/model_utils.py b/tests/model_utils.py new file mode 100644 index 000000000..1b9eadccd --- /dev/null +++ b/tests/model_utils.py @@ -0,0 +1,303 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm-project/vllm/blob/main/tests/models/utils.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
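As a usage sketch only (not part of the patch), a test would normally drive the `VllmRunner` defined above through its context-manager protocol so that `cleanup_dist_env_and_memory()` runs on exit; the model name and token counts below are placeholders:

# Hypothetical smoke test built on the VllmRunner above; values are placeholders.
from conftest import VllmRunner


def test_runner_smoke() -> None:
    prompts = ["Hello, my name is", "The capital of France is"]
    with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct", max_model_len=1024) as runner:
        outputs = runner.generate_greedy(prompts, max_tokens=8)
        # One (token_ids, text) pair per prompt; both include the prompt
        # followed by the greedy completion.
        assert len(outputs) == len(prompts)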
+# + +import warnings +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import torch +from vllm.config import ModelConfig, TaskOption +from vllm.inputs import InputContext +from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs + +TokensText = Tuple[List[int], str] + + +def check_outputs_equal( + *, + outputs_0_lst: Sequence[TokensText], + outputs_1_lst: Sequence[TokensText], + name_0: str, + name_1: str, +): + """ + Compare the two sequences generated by different models, + which should be equal. + """ + assert len(outputs_0_lst) == len(outputs_1_lst) + + for prompt_idx, (outputs_0, + outputs_1) in enumerate(zip(outputs_0_lst, + outputs_1_lst)): + output_ids_0, output_str_0 = outputs_0 + output_ids_1, output_str_1 = outputs_1 + + # The text and token outputs should exactly match + fail_msg = (f"Test{prompt_idx}:" + f"\n{name_0}:\t{output_str_0!r}" + f"\n{name_1}:\t{output_str_1!r}") + + assert output_str_0 == output_str_1, fail_msg + assert output_ids_0 == output_ids_1, fail_msg + + +# Representation of generated sequence as a tuple of +# * Token ID list +# * String +# * List of top sample logprobs for each sampled token +# +# Assumes prompt logprobs were not requested. +TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int, + float]], + SampleLogprobs]]] + +# Allow for tokens to be represented as str's rather than IDs; +# tuple of +# * Token string representations list +# * String +# * Optional list of top sample logprobs for each sampled token +# +# Assumes prompt logprobs were not requested. +TextTextLogprobs = Tuple[List[str], str, Optional[Union[List[Dict[str, float]], + List[Dict[str, + Logprob]]]]] + +# Representation of generated sequence as a tuple of +# * Token ID list +# * String +# * Optional list of top sample logprobs for each sampled token +# * Optional list of top prompt logprobs for each prompt token +# +# Allows prompt logprobs to be requested. +TokensTextLogprobsPromptLogprobs = Tuple[ + List[int], str, Optional[Union[List[Dict[int, float]], SampleLogprobs]], + Optional[Union[List[Optional[Dict[int, float]]], PromptLogprobs]]] + + +def check_logprobs_close( + *, + outputs_0_lst: Sequence[Union[TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs, + TextTextLogprobs]], + outputs_1_lst: Sequence[Union[TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs, + TextTextLogprobs]], + name_0: str, + name_1: str, + num_outputs_0_skip_tokens: int = 0, + warn_on_mismatch: bool = True, + always_check_logprobs: bool = False, +) -> None: + """Compare the logprobs of two sequences generated by different models, + which should be similar but not necessarily equal. + + How sample logprobs are compared: + * `always_check_logprobs == True`: set of highest-logprob token ids + must match between seq0 and seq1 at all sampled token offsets + * `always_check_logprobs == False`: highest-logprob token ids are + only compared at sampled token offsets for which generated token + ids don't match + + Prompt logprobs must be provided either for both input sequences, or + for neither. If prompt logprobs are provided, then highest-logprob + prompt token ids must match between seq0 and seq1 at all prompt token + offsets. + + Args: + outputs_0_lst: First sequence to compare + outputs_1_lst: Second sequence to compare + name_0: sequence #0 name + name_1: sequence #1 name + num_outputs_0_skip_tokens: If > 0, specifies the number of initial + sequence #0 tokens & logprobs to discard + before comparison, i.e.
all + of sequence #1 will be compared to + sequence #0 beginning at index + num_outputs_0_skip_tokens + warn_on_mismatch: Issue a warning if there is token-wise or text-wise + mismatch between the two sequences + always_check_logprobs: If true, check logprobs even when tokens match + """ + assert len(outputs_0_lst) == len(outputs_1_lst) + + # Loop through responses to each prompt. + for prompt_idx, (outputs_0, + outputs_1) in enumerate(zip(outputs_0_lst, + outputs_1_lst)): + assert len(outputs_0) == len(outputs_1) + if len(outputs_0) == 3: + assert len(outputs_1) == 3 + # Break out tokens, text & sample logprobs + # (prompt logprobs were not provided) + output_ids_0, output_str_0, logprobs_0 = outputs_0 + output_ids_1, output_str_1, logprobs_1 = outputs_1 + elif len(outputs_0) == 4: + assert len(outputs_1) == 4 + # Break out tokens, text, sample logprobs & prompt logprobs + ( + output_ids_0, + output_str_0, + logprobs_0, + prompt_logprobs_0, + ) = outputs_0 + ( + output_ids_1, + output_str_1, + logprobs_1, + prompt_logprobs_1, + ) = outputs_1 + + # Test prompt logprobs closeness + if (prompt_logprobs_0 is not None + and prompt_logprobs_1 is not None): + # Both sequences' prompt logprobs lists are not `None`` + # (although individual list elements may be `None`); + # for each token's logprobs: + for idx, (logprobs_elem_0, logprobs_elem_1) in enumerate( + zip(prompt_logprobs_0, prompt_logprobs_1)): + fail_msg = ( + f"Prompt logprobs test:" + f"\n{name_0}:\tPrompt index {idx}\t{logprobs_elem_0}" + f"\n{name_1}:\tPrompt index {idx}\t{logprobs_elem_1}") + + if logprobs_elem_0 is None: + # If the seq 0 token's logprobs are `None`, + # the seq 1 token's logprobs must be `None` + assert logprobs_elem_1 is None, fail_msg + else: + # If the seq 0 token's logprobs are not `None`, + # the seq 1 token's logprobs must not be `None` + assert logprobs_elem_1 is not None, fail_msg + # Logprobs check: top-k token choices must be the same + assert (set(logprobs_elem_0.keys()) == set( + logprobs_elem_1.keys())), fail_msg + else: + # Both sequence logprobs lists must be `None` + fail_msg = (f"Prompt logprobs test:" + f"\n{name_0}:\tlogprobs\t{prompt_logprobs_0}" + f"\n{name_1}:\tlogprobs\t{prompt_logprobs_1}") + + assert (prompt_logprobs_0 is None + and prompt_logprobs_1 is None), fail_msg + else: + raise ValueError(f"Outputs tuple must have 3 or 4 elements but " + f"{len(outputs_0)} elements were provided: " + f"{outputs_0}") + + if logprobs_0 is None: + logprobs_0 = [None] * len(output_ids_0) + if logprobs_1 is None: + logprobs_1 = [None] * len(output_ids_1) + + # Skip specified number of initial sequence #0 tokens + # & logprobs, leaving output text as-is for simplicity + # (text mismatches may generate warnings but do not + # cause the test to fail.) + if num_outputs_0_skip_tokens < 0: + raise ValueError("num_outputs_0_skip_tokens must be non-negative") + output_ids_0 = output_ids_0[num_outputs_0_skip_tokens:] + logprobs_0 = logprobs_0[num_outputs_0_skip_tokens:] + + # Loop through generated tokens. 
+ for idx, (output_id_0, + output_id_1) in enumerate(zip(output_ids_0, output_ids_1)): + + is_tok_mismatch = output_id_0 != output_id_1 + + # If generated tokens don't match + # or it is desired to always check logprobs, + # then + if is_tok_mismatch or always_check_logprobs: + logprobs_elem_0 = logprobs_0[idx] + logprobs_elem_1 = logprobs_1[idx] + + # Each predicted token must be in top N logprobs of the other + fail_msg = ( + f"Test{prompt_idx}:" + f"\nMatched tokens:\t{output_ids_0[:idx]}" + f"\n{name_0}:\t{output_str_0!r}\t{logprobs_elem_0}" + f"\n{name_1}:\t{output_str_1!r}\t{logprobs_elem_1}") + + assert logprobs_elem_0 is not None, fail_msg + assert logprobs_elem_1 is not None, fail_msg + assert output_id_0 in logprobs_elem_1, fail_msg + assert output_id_1 in logprobs_elem_0, fail_msg + + if warn_on_mismatch and is_tok_mismatch: + with warnings.catch_warnings(): + # This ensures that repeated warnings are shown + # in the output, not just the first occurrence + warnings.simplefilter("always") + + warnings.warn(fail_msg, stacklevel=2) + + # Break out since sequences will now diverge. + break + else: + if output_str_0 != output_str_1 and warn_on_mismatch: + # The token outputs exactly match, + # so the text outputs should exactly match as well + fail_msg = (f"Test{prompt_idx}:" + f"\n{name_0}:\t{output_str_0!r}" + f"\n{name_1}:\t{output_str_1!r}") + + with warnings.catch_warnings(): + # This ensures that repeated warnings are shown + # in the output, not just the first occurrence + warnings.simplefilter("always") + + warnings.warn(fail_msg, stacklevel=2) + + +def build_model_context(model_name: str, + task: TaskOption = "auto", + tokenizer_name: Optional[str] = None, + trust_remote_code: bool = False, + dtype: Optional[Union[str, torch.dtype]] = None, + mm_processor_kwargs: Optional[Dict] = None, + limit_mm_per_prompt: Optional[Dict] = None): + """Creates an InputContext for a given model. + + Args: + model_name: Name of the model being considered. + tokenizer_name: Name of the tokenizer being considered. + trust_remote_code: Whether or not to allow loading remote code. + mm_processor_kwargs: optional processor kwargs for to be leveraged + in the input processor, mapper, dummy data creation, etc. + limit_mm_per_prompt: Multimodal limits. + + Returns: + InputContext for the model being considered. + """ + if tokenizer_name is None: + tokenizer_name = model_name + if dtype is None: + dtype = "half" + + model_config = ModelConfig( + model_name, + task=task, + tokenizer=tokenizer_name, + tokenizer_mode="auto", + trust_remote_code=trust_remote_code, + dtype=dtype, + seed=0, + mm_processor_kwargs=mm_processor_kwargs, + limit_mm_per_prompt=limit_mm_per_prompt, + ) + return InputContext(model_config) diff --git a/tests/test_offline_inference.py b/tests/test_offline_inference.py new file mode 100644 index 000000000..484bce63c --- /dev/null +++ b/tests/test_offline_inference.py @@ -0,0 +1,61 @@ +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from vllm/tests/basic_correctness/test_basic_correctness.py +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
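To make the comparison helpers in tests/model_utils.py above concrete, here is a tiny, self-contained illustration (not part of the patch); the token ids and strings are invented:

# Toy data only: check_outputs_equal passes on identical generations and
# raises AssertionError (showing both sequences) on any mismatch.
from tests.model_utils import check_outputs_equal

ref = [([1, 2, 3], "a b c")]
check_outputs_equal(outputs_0_lst=ref, outputs_1_lst=[([1, 2, 3], "a b c")],
                    name_0="hf", name_1="vllm")  # passes silently

try:
    check_outputs_equal(outputs_0_lst=ref, outputs_1_lst=[([1, 2, 9], "a b z")],
                        name_0="hf", name_1="vllm")
except AssertionError as err:
    print(err)  # message lists the hf and vllm outputs side by side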
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +"""Run a short greedy-sampling generation with vLLM on the Ascend backend. + +Run `pytest tests/test_offline_inference.py`. +""" +import os + +import pytest +import vllm  # noqa: F401 +from conftest import VllmRunner + +import vllm_ascend  # noqa: F401 + +MODELS = [ + "Qwen/Qwen2.5-0.5B-Instruct", +] +os.environ["VLLM_USE_MODELSCOPE"] = "True" + +TARGET_TEST_SUITE = os.environ.get("TARGET_TEST_SUITE", "L4") + + +@pytest.mark.parametrize("model", MODELS) +@pytest.mark.parametrize("dtype", ["half", "float16"]) +@pytest.mark.parametrize("max_tokens", [5]) +def test_models( + model: str, + dtype: str, + max_tokens: int, +) -> None: + os.environ["VLLM_ATTENTION_BACKEND"] = "ASCEND" + + # Build a long prompt (roughly 5K tokens) so the test also exercises + # long-context handling within the 8192-token context window. + prompt = "The following numbers of the sequence " + ", ".join( + str(i) for i in range(1024)) + " are:" + example_prompts = [prompt] + + with VllmRunner(model, + max_model_len=8192, + dtype=dtype, + enforce_eager=False, + gpu_memory_utilization=0.7) as vllm_model: + vllm_model.generate_greedy(example_prompts, max_tokens) diff --git a/tools/actionlint.sh b/tools/actionlint.sh new file mode 100755 index 000000000..72a10b18f --- /dev/null +++ b/tools/actionlint.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +if command -v actionlint &> /dev/null; then + # NOTE: avoid checking .github/workflows/vllm_ascend_test.yaml because the self-hosted runner `npu-arm64` is unknown to actionlint + actionlint .github/workflows/*.yml .github/workflows/mypy.yaml + exit 0 +elif [ -x ./actionlint ]; then + ./actionlint .github/workflows/*.yml .github/workflows/mypy.yaml + exit 0 +fi + +# download a binary to the current directory - v1.7.3 +bash <(curl https://raw.githubusercontent.com/rhysd/actionlint/aa0a7be8e566b096e64a5df8ff290ec24fa58fbc/scripts/download-actionlint.bash) +./actionlint .github/workflows/*.yml .github/workflows/mypy.yaml diff --git a/tools/check_repo.sh b/tools/check_repo.sh new file mode 100644 index 000000000..e86d0f110 --- /dev/null +++ b/tools/check_repo.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# Copyright 2023 The vLLM team.
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Checks whether the repo is clean and whether tags are available (necessary to correctly produce vllm version at build time) + +if ! git diff --quiet; then + echo "Repo is dirty" >&2 + + exit 1 +fi + +if ! git describe --tags; then + echo "No tags are present. Is this a shallow clone? git fetch --unshallow --tags" >&2 + + exit 1 +fi diff --git a/tools/mypy.sh b/tools/mypy.sh new file mode 100755 index 000000000..fcb7c0e4b --- /dev/null +++ b/tools/mypy.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Adapted from https://github.com/vllm-project/vllm/tree/main/tools +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +CI=${1:-0} +PYTHON_VERSION=${2:-3.9} + +if [ "$CI" -eq 1 ]; then + set -e +fi + +run_mypy() { + echo "Running mypy on $1" + if [ "$CI" -eq 1 ] && [ -z "$1" ]; then + mypy --python-version "${PYTHON_VERSION}" "$@" + return + fi + mypy --follow-imports skip --python-version "${PYTHON_VERSION}" "$@" +} + +run_mypy vllm_ascend +run_mypy examples +run_mypy tests diff --git a/tools/npu-vllm-test.sh b/tools/npu-vllm-test.sh new file mode 100644 index 000000000..17c7a4d43 --- /dev/null +++ b/tools/npu-vllm-test.sh @@ -0,0 +1,422 @@ +#!/bin/bash + +# +# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved. +# This file is a part of the vllm-ascend project. +# Copyright 2023 The vLLM team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +set -o pipefail + +TEST_DIR="./vllm-empty/tests" +TEST_FILES=( + test_sequence.py + # test_utils.py + # test_config.py + test_cache_block_hashing.py + # test_scalartype.py + # test_embedded_commit.py + # test_inputs.py + # test_sharded_state_loader.py + test_sampling_params.py + # test_logger.py + # test_logits_processor.py + # test_regression.py + # prefix_caching/test_prefix_caching.py + # prefix_caching/test_disable_sliding_window.py + # weight_loading/test_weight_loading.py + # samplers/test_beam_search.py + # samplers/test_typical_acceptance_sampler.py + # samplers/test_no_bad_words.py + # samplers/test_rejection_sampler.py + # samplers/test_ignore_eos.py + # samplers/test_ranks.py + # samplers/test_logits_processor.py + # samplers/test_sampler.py + # samplers/test_seeded_generate.py + # samplers/test_logprobs.py + # kernels/test_encoder_decoder_attn.py + # kernels/test_rotary_embedding.py + # kernels/test_prefix_prefill.py + # kernels/test_flashinfer.py + # kernels/utils.py + # kernels/test_machete_mm.py + # kernels/test_flash_attn.py + # kernels/test_awq.py + # kernels/test_blocksparse_attention.py + # kernels/test_utils.py + # kernels/test_aqlm.py + # kernels/test_cutlass.py + # kernels/test_causal_conv1d.py + # kernels/test_marlin_gemm.py + # kernels/test_layernorm.py + # kernels/test_pos_encoding.py + # kernels/test_moe.py + # kernels/test_awq_marlin.py + # kernels/test_int8_quant.py + # kernels/test_gptq.py + # kernels/test_attention.py + # kernels/test_activation.py + # kernels/quant_utils.py + # kernels/test_permute_cols.py + # kernels/test_triton_scaled_mm.py + # kernels/test_gguf.py + # kernels/test_awq_triton.py + # kernels/test_attention_selector.py + # kernels/test_ggml.py + # kernels/test_mamba_ssm.py + # kernels/test_fused_quant_layernorm.py + # kernels/test_fp8_quant.py + # kernels/test_cascade_flash_attn.py + # kernels/conftest.py + # kernels/allclose_default.py + # kernels/test_block_fp8.py + # kernels/test_cache.py + # kernels/test_semi_structured.py + # quantization/test_quark.py + # quantization/test_compressed_tensors.py + # quantization/utils.py + # quantization/test_experts_int8.py + # quantization/test_lm_head.py + # quantization/test_ipex_quant.py + # quantization/test_bitsandbytes.py + # quantization/test_cpu_offload.py + # quantization/test_fp8.py + # quantization/test_configs.py + # tool_use/test_tool_calls.py + # tool_use/utils.py + # tool_use/test_chat_completions.py + # tool_use/test_jamba_tool_parser.py + # tool_use/test_chat_completion_request_validations.py + # tool_use/conftest.py + # tool_use/test_parallel_tool_calls.py + # runai_model_streamer/test_runai_model_streamer_loader.py + # runai_model_streamer/test_weight_utils.py + # kv_transfer/test_lookup_buffer.sh + # kv_transfer/test_send_recv.py + # kv_transfer/test_send_recv.sh + # kv_transfer/test_lookup_buffer.py + # kv_transfer/module_test.py + # kv_transfer/disagg_test.py + # plugins/vllm_add_dummy_platform/setup.py + # plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py + # plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_attention_backend.py + # plugins/vllm_add_dummy_model/setup.py + # plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_opt.py + # plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_gemma_embedding.py + # plugins/vllm_add_dummy_model/vllm_add_dummy_model/my_llava.py + # prompt_adapter/test_multi_adapter_inference.py + # prompt_adapter/test_pa_lora.py + # prompt_adapter/test_bloom.py + # compile/test_pass_manager.py + # 
compile/utils.py + # compile/test_wrapper.py + # compile/test_fusion.py + # compile/backend.py + # compile/test_full_graph.py + # compile/test_basic_correctness.py + # compile/test_functionalization.py + # compile/piecewise/test_simple.py + # compile/piecewise/test_toy_llama.py + # lora/test_punica_ops_variation.py + # lora/test_quant_model.py + # lora/test_lora_checkpoints.py + # lora/test_mixtral.py + # lora/test_qwen2vl.py + # lora/test_baichuan.py + # lora/utils.py + # lora/test_phi.py + # lora/test_utils.py + # lora/test_minicpmv_tp.py + # lora/test_layers.py + # lora/test_worker.py + # lora/test_jamba.py + # lora/test_tokenizer_group.py + # lora/test_lora_bias_e2e.py + # lora/test_chatglm3_tp.py + # lora/test_punica_ops_sizes.py + # lora/test_lora_manager.py + # lora/test_llama_tp.py + # lora/test_lora_huggingface.py + # lora/test_long_context.py + # lora/test_gemma.py + # lora/conftest.py + # lora/data/long_context_test_data.py + # models/registry.py + # models/utils.py + # models/test_registry.py + # models/test_initialization.py + # models/test_oot_registration.py + # models/multimodal/processing/test_internvl.py + # models/multimodal/processing/test_llava_next.py + # models/multimodal/processing/test_idefics3.py + # models/multimodal/processing/test_qwen2_vl.py + # models/multimodal/processing/test_phi3v.py + # models/multimodal/processing/test_common.py + # models/multimodal/processing/test_qwen.py + # models/multimodal/processing/test_llava_onevision.py + # models/encoder_decoder/language/test_bart.py + # models/encoder_decoder/audio_language/test_whisper.py + # models/encoder_decoder/vision_language/test_broadcast.py + # models/encoder_decoder/vision_language/test_florence2.py + # models/encoder_decoder/vision_language/test_mllama.py + # models/decoder_only/language/test_models.py + # models/decoder_only/language/test_gptq_marlin.py + # models/decoder_only/language/test_granite.py + # models/decoder_only/language/test_modelopt.py + # models/decoder_only/language/test_phimoe.py + # models/decoder_only/language/test_aqlm.py + # models/decoder_only/language/test_mistral.py + # models/decoder_only/language/test_jamba.py + # models/decoder_only/language/test_gptq_marlin_24.py + # models/decoder_only/language/test_mamba.py + # models/decoder_only/language/test_gguf.py + # models/decoder_only/language/test_fp8.py + # models/decoder_only/audio_language/test_ultravox.py + # models/decoder_only/vision_language/test_models.py + # models/decoder_only/vision_language/test_awq.py + # models/decoder_only/vision_language/test_intern_vit.py + # models/decoder_only/vision_language/test_qwen2_vl.py + # models/decoder_only/vision_language/test_pixtral.py + # models/decoder_only/vision_language/test_phi3v.py + # models/decoder_only/vision_language/test_h2ovl.py + # models/decoder_only/vision_language/vlm_utils/types.py + # models/decoder_only/vision_language/vlm_utils/model_utils.py + # models/decoder_only/vision_language/vlm_utils/runners.py + # models/decoder_only/vision_language/vlm_utils/core.py + # models/decoder_only/vision_language/vlm_utils/custom_inputs.py + # models/decoder_only/vision_language/vlm_utils/case_filtering.py + # models/decoder_only/vision_language/vlm_utils/builders.py + # models/embedding/utils.py + # models/embedding/language/test_scoring.py + # models/embedding/language/test_gritlm.py + # models/embedding/language/test_cls_models.py + # models/embedding/language/test_embedding.py + # models/embedding/vision_language/test_llava_next.py + # 
models/embedding/vision_language/test_dse_qwen2_vl.py + # models/embedding/vision_language/test_phi3v.py + # multimodal/utils.py + # multimodal/test_processor_kwargs.py + # multimodal/test_utils.py + # multimodal/test_inputs.py + # multimodal/test_processing.py + # standalone_tests/python_only_compile.sh + # standalone_tests/lazy_torch_compile.py + # async_engine/test_async_llm_engine.py + # async_engine/api_server_async_engine.py + # async_engine/test_api_server.py + # async_engine/test_request_tracker.py + # mq_llm_engine/utils.py + # mq_llm_engine/test_load.py + # mq_llm_engine/test_abort.py + # mq_llm_engine/test_error_handling.py + # tokenization/test_tokenizer.py + # tokenization/test_tokenizer_group.py + # tokenization/test_get_eos.py + # tokenization/test_cached_tokenizer.py + # tokenization/test_detokenize.py + # core/utils.py + # core/test_chunked_prefill_scheduler.py + # core/test_serialization.py + # core/test_num_computed_tokens_update.py + # core/test_scheduler_encoder_decoder.py + # core/test_scheduler.py + # core/block/test_cpu_gpu_block_allocator.py + # core/block/test_prefix_caching_block.py + # core/block/test_common.py + # core/block/test_block_table.py + # core/block/test_block_manager.py + # core/block/conftest.py + # core/block/test_naive_block.py + # core/block/e2e/test_correctness.py + # core/block/e2e/test_correctness_sliding_window.py + # core/block/e2e/conftest.py + # tracing/test_tracing.py + # engine/test_arg_utils.py + # engine/test_detokenization.py + # engine/test_short_mm_context.py + # engine/test_custom_executor.py + # engine/test_multiproc_workers.py + # engine/test_computed_prefix_blocks.py + # engine/test_stop_reason.py + # engine/test_skip_tokenizer_init.py + # engine/test_stop_strings.py + # engine/output_processor/test_stop_checker.py + # engine/output_processor/test_multi_step.py + # tensorizer_loader/test_tensorizer.py + # tensorizer_loader/conftest.py + # entrypoints/test_chat_utils.py + # entrypoints/conftest.py + # entrypoints/llm/test_lazy_outlines.py + # entrypoints/llm/test_generate_multiple_loras.py + # entrypoints/llm/test_encode.py + # entrypoints/llm/test_init.py + # entrypoints/llm/test_guided_generate.py + # entrypoints/llm/test_gpu_utilization.py + # entrypoints/llm/test_chat.py + # entrypoints/llm/test_accuracy.py + # entrypoints/llm/test_prompt_validation.py + # entrypoints/llm/test_generate.py + # entrypoints/offline_mode/test_offline_mode.py + # entrypoints/openai/test_completion.py + # entrypoints/openai/test_models.py + # entrypoints/openai/test_chat_echo.py + # entrypoints/openai/test_score.py + # entrypoints/openai/test_tokenization.py + # entrypoints/openai/test_cli_args.py + # entrypoints/openai/test_chunked_prompt.py + # entrypoints/openai/test_encoder_decoder.py + # entrypoints/openai/test_chat_template.py + # entrypoints/openai/test_oot_registration.py + # entrypoints/openai/test_run_batch.py + # entrypoints/openai/test_metrics.py + # entrypoints/openai/test_vision_embedding.py + # entrypoints/openai/test_embedding.py + # entrypoints/openai/test_lora_adapters.py + # entrypoints/openai/test_video.py + # entrypoints/openai/test_serving_models.py + # entrypoints/openai/test_chat.py + # entrypoints/openai/test_pooling.py + # entrypoints/openai/test_basic.py + # entrypoints/openai/test_accuracy.py + # entrypoints/openai/test_prompt_validation.py + # entrypoints/openai/test_vision.py + # entrypoints/openai/test_audio.py + # entrypoints/openai/test_async_tokenization.py + # entrypoints/openai/test_return_tokens_as_ids.py + # 
entrypoints/openai/test_serving_chat.py + # entrypoints/openai/test_shutdown.py + # entrypoints/openai/test_root_path.py + # entrypoints/openai/tool_parsers/utils.py + # entrypoints/openai/tool_parsers/test_pythonic_tool_parser.py + # model_executor/weight_utils.py + # model_executor/test_enabled_custom_ops.py + # model_executor/test_guided_processors.py + # model_executor/test_model_load_with_params.py + # model_executor/conftest.py + # metrics/test_metrics.py + # system_messages/sonnet3.5_nov2024.txt + # encoder_decoder/test_e2e_correctness.py + # v1/core/test_kv_cache_utils.py + # v1/core/test_prefix_caching.py + # v1/sample/test_sampler.py + # v1/engine/test_engine_core.py + # v1/engine/test_async_llm.py + # v1/engine/test_output_processor.py + # v1/engine/test_engine_args.py + # v1/engine/test_engine_core_client.py + # v1/e2e/test_cascade_attention.py + # v1/worker/test_gpu_input_batch.py + # spec_decode/utils.py + # spec_decode/test_utils.py + # spec_decode/test_ngram_worker.py + # spec_decode/test_metrics.py + # spec_decode/test_batch_expansion.py + # spec_decode/test_multi_step_worker.py + # spec_decode/test_scorer.py + # spec_decode/test_spec_decode_worker.py + # spec_decode/test_dynamic_spec_decode.py + # spec_decode/e2e/test_mlp_correctness.py + # spec_decode/e2e/test_ngram_correctness.py + # spec_decode/e2e/test_seed.py + # spec_decode/e2e/test_integration.py + # spec_decode/e2e/test_medusa_correctness.py + # spec_decode/e2e/test_integration_dist_tp4.py + # spec_decode/e2e/test_eagle_correctness.py + # spec_decode/e2e/test_compatibility.py + # spec_decode/e2e/test_multistep_correctness.py + # spec_decode/e2e/test_integration_dist_tp2.py + # spec_decode/e2e/conftest.py + # spec_decode/e2e/test_logprobs.py + # multi_step/test_correctness_async_llm.py + # multi_step/test_correctness_llm.py + # vllm_test_utils/setup.py + # vllm_test_utils/vllm_test_utils/blame.py + # vllm_test_utils/vllm_test_utils/monitor.py + # plugins_tests/test_platform_plugins.py + # tpu/test_compilation.py + # tpu/test_quantization_accuracy.py + # tpu/test_custom_dispatcher.py + # distributed/test_custom_all_reduce.py + # distributed/test_distributed_oot.py + # distributed/test_pipeline_parallel.py + # distributed/test_pynccl.py + # distributed/test_pipeline_partition.py + # distributed/test_utils.py + # distributed/test_pp_cudagraph.py + # distributed/test_ca_buffer_sharing.py + # distributed/test_multi_node_assignment.py + # distributed/test_same_node.py + # distributed/test_shm_broadcast.py + # distributed/test_comm_ops.py + # basic_correctness/test_chunked_prefill.py + # basic_correctness/test_preemption.py + # basic_correctness/test_cpu_offload.py + # basic_correctness/test_basic_correctness.py + # worker/test_model_runner.py + # worker/test_encoder_decoder_model_runner.py + # worker/test_swap.py + # worker/test_profile.py + # worker/test_model_input.py +) + +# print usage +usage() { + echo "Usage: $0 -t