Bump up the version to v0.1.4 (#846 )

Set replacement=True in torch.multinomial (#858 )
Clean up code (#844 )
2025-10-20 23:03:52 +08:00 · 2023-08-25 12:28:00 +09:00 · 2023-08-25 12:22:01 +09:00 · 2023-08-23 16:44:15 -07:00 · 2023-08-23 17:44:21 +09:00 · 2023-08-22 20:48:36 -07:00
96 changed files with 7052 additions and 1142 deletions
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@ -0,0 +1,101 @@
+# This workflow will upload a Python Package to Release asset
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Create Release
+
+on:
+  push:
+    tags:
+      - v*
+
+# Needed to create release and upload assets
+permissions:
+  contents: write
+
+jobs:
+  release:
+    # Retrieve tag and create release
+    name: Create Release
+    runs-on: ubuntu-latest
+    outputs:
+      upload_url: ${{ steps.create_release.outputs.upload_url }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Extract branch info
+        shell: bash
+        run: |
+          echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
+
+      - name: Create Release
+        id: create_release
+        uses: "actions/github-script@v6"
+        env:
+          RELEASE_TAG: ${{ env.release_tag }}
+        with:
+          github-token: "${{ secrets.GITHUB_TOKEN }}"
+          script: |
+            const script = require('.github/workflows/scripts/create_release.js')
+            await script(github, context, core)
+
+  wheel:
+    name: Build Wheel
+    runs-on: ${{ matrix.os }}
+    needs: release
+    
+    strategy:
+      fail-fast: false
+      matrix:
+          os: ['ubuntu-20.04']
+          python-version: ['3.8', '3.9', '3.10', '3.11']
+          cuda-version: ['11.8'] # Github runner can't build anything older than 11.8
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+
+      - name: Set up Linux Env
+        if: ${{ runner.os == 'Linux' }}
+        run: |
+          bash -x .github/workflows/scripts/env.sh
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+            python-version: ${{ matrix.python-version }}
+
+      - name: Install CUDA ${{ matrix.cuda-version }}
+        run: |
+          bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
+
+      - name: Install PyTorch-cu${{ matrix.cuda-version }}
+        run: |
+          bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+
+      - name: Build wheel
+        shell: bash
+        run: |
+          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
+          wheel_name=$(ls dist/*whl | xargs -n 1 basename)
+          asset_name=${wheel_name//"linux"/"manylinux1"}
+          echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
+          echo "asset_name=${asset_name}" >> $GITHUB_ENV
+      
+      - name: Upload Release Asset
+        uses: actions/upload-release-asset@v1
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          upload_url: ${{ needs.release.outputs.upload_url }}
+          asset_path: ./dist/${{ env.wheel_name }}
+          asset_name: ${{ env.asset_name }}
+          asset_content_type: application/*
+
+      # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
+      # - name: Publish package
+      #   uses: pypa/gh-action-pypi-publish@release/v1.8
+      #   with:
+      #     repository-url: https://test.pypi.org/legacy/
+      #     password: ${{ secrets.PYPI_API_TOKEN }}
+      #     skip-existing: true
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@ -0,0 +1,31 @@
+name: pylint
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  pylint:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install pylint==2.8.2
+    - name: Analysing the code with pylint
+      run: |
+        pylint vllm
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@ -0,0 +1,15 @@
+#!/bin/bash
+
+python_executable=python$1
+cuda_home=/usr/local/cuda-$2
+
+# Update paths
+PATH=${cuda_home}/bin:$PATH
+LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
+
+# Install requirements
+$python_executable -m pip install wheel packaging
+$python_executable -m pip install -r requirements.txt
+
+# Build
+$python_executable setup.py bdist_wheel --dist-dir=dist
--- a/.github/workflows/scripts/create_release.js
+++ b/.github/workflows/scripts/create_release.js
@ -0,0 +1,20 @@
+// Uses Github's API to create the release and wait for result.
+// We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
+
+module.exports = async (github, context, core) => {
+	try {
+		const response = await github.rest.repos.createRelease({
+			draft: false,
+			generate_release_notes: true,
+			name: process.env.RELEASE_TAG,
+			owner: context.repo.owner,
+			prerelease: false,
+			repo: context.repo.repo,
+			tag_name: process.env.RELEASE_TAG,
+		});
+
+		core.setOutput('upload_url', response.data.upload_url);
+	} catch (error) {
+		core.setFailed(error.message);
+	}
+}
--- a/.github/workflows/scripts/cuda-install.sh
+++ b/.github/workflows/scripts/cuda-install.sh
@ -0,0 +1,18 @@
+#!/bin/bash
+
+# Replace '.' with '-' ex: 11.8 -> 11-8
+cuda_version=$(echo $1 | tr "." "-")
+# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
+OS=$(echo $2 | tr -d ".\-")
+
+# Installs CUDA
+wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
+sudo dpkg -i cuda-keyring_1.1-1_all.deb
+rm cuda-keyring_1.1-1_all.deb
+sudo apt -qq update
+sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
+sudo apt clean
+
+# Test nvcc
+PATH=/usr/local/cuda-$1/bin:${PATH}
+nvcc --version
--- a/.github/workflows/scripts/env.sh
+++ b/.github/workflows/scripts/env.sh
@ -0,0 +1,56 @@
+#!/bin/bash
+
+# This file installs common linux environment tools
+
+export LANG C.UTF-8
+
+# python_version=$1
+
+sudo    apt-get update && \
+sudo    apt-get install -y --no-install-recommends \
+        software-properties-common \
+
+sudo    apt-get install -y --no-install-recommends \
+        build-essential \
+        apt-utils \
+        ca-certificates \
+        wget \
+        git \
+        vim \
+        libssl-dev \
+        curl \
+        unzip \
+        unrar \
+        cmake \
+        net-tools \
+        sudo \
+        autotools-dev \
+        rsync \
+        jq \
+        openssh-server \
+        tmux \
+        screen \
+        htop \
+        pdsh \
+        openssh-client \
+        lshw \
+        dmidecode \
+        util-linux \
+        automake \
+        autoconf \
+        libtool \
+        net-tools \
+        pciutils \
+        libpci-dev \
+        libaio-dev \
+        libcap2 \
+        libtinfo5 \
+        fakeroot \
+        devscripts \
+        debhelper \
+        nfs-common
+
+# Remove github bloat files to free up disk space
+sudo rm -rf "/usr/local/share/boost"
+sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+sudo rm -rf "/usr/share/dotnet"
--- a/.github/workflows/scripts/pytorch-install.sh
+++ b/.github/workflows/scripts/pytorch-install.sh
@ -0,0 +1,14 @@
+#!/bin/bash
+
+python_executable=python$1
+cuda_version=$2
+
+# Install torch
+$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
+$python_executable -m pip install torch -f https://download.pytorch.org/whl/cu${cuda_version//./}/torch_stable.html
+
+# Print version information
+$python_executable --version
+$python_executable -c "import torch; print('PyTorch:', torch.__version__)"
+$python_executable -c "import torch; print('CUDA:', torch.version.cuda)"
+$python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@ -0,0 +1,31 @@
+name: yapf
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+jobs:
+  yapf:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.10"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install yapf==0.32.0
+        pip install toml==0.10.2
+    - name: Running yapf
+      run: |
+        yapf --diff --recursive vllm --exclude 'vllm/model_executor/parallel_utils/**'
--- a/.gitignore
+++ b/.gitignore
@ -170,3 +170,6 @@ cython_debug/

 # Python pickle files
 *.pkl
+
+# Sphinx documentation
+_build/
--- a/.pylintrc
+++ b/.pylintrc
@ -0,0 +1,434 @@
+# This Pylint rcfile contains a best-effort configuration to uphold the
+# best-practices and style described in the Google Python style guide:
+#   https://google.github.io/styleguide/pyguide.html
+#
+# Its canonical open-source location is:
+#   https://google.github.io/styleguide/pylintrc
+
+[MASTER]
+
+# Files or directories to be skipped. They should be base names, not paths.
+ignore=docs,parallel_utils
+
+# Files or directories matching the regex patterns are skipped. The regex
+# matches against base names, not paths.
+ignore-patterns=
+
+# Pickle collected data for later comparisons.
+persistent=no
+
+# List of plugins (as comma separated values of python modules names) to load,
+# usually to register additional checkers.
+load-plugins=
+
+# Use multiple processes to speed up Pylint.
+jobs=4
+
+# Allow loading of arbitrary C extensions. Extensions are imported into the
+# active Python interpreter and may run arbitrary code.
+unsafe-load-any-extension=no
+
+
+[MESSAGES CONTROL]
+
+# Only show warnings with the listed confidence levels. Leave empty to show
+# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
+confidence=
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifier separated by comma (,) or put this option
+# multiple time (only on the command line, not in the configuration file where
+# it should appear only once). See also the "--disable" option for examples.
+#enable=
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once).You can also use "--disable=all" to
+# disable everything first and then reenable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use"--disable=all --enable=classes
+# --disable=W"
+disable=abstract-method,
+        apply-builtin,
+        arguments-differ,
+        attribute-defined-outside-init,
+        backtick,
+        bad-option-value,
+        basestring-builtin,
+        buffer-builtin,
+        c-extension-no-member,
+        consider-using-enumerate,
+        cmp-builtin,
+        cmp-method,
+        coerce-builtin,
+        coerce-method,
+        delslice-method,
+        div-method,
+        duplicate-code,
+        eq-without-hash,
+        execfile-builtin,
+        file-builtin,
+        filter-builtin-not-iterating,
+        fixme,
+        getslice-method,
+        global-statement,
+        hex-method,
+        idiv-method,
+        implicit-str-concat-in-sequence,
+        import-error,
+        import-self,
+        import-star-module-level,
+        inconsistent-return-statements,
+        input-builtin,
+        intern-builtin,
+        invalid-str-codec,
+        locally-disabled,
+        logging-fstring-interpolation,  # added by vLLM
+        logging-not-lazy,  # added by vLLM
+        long-builtin,
+        long-suffix,
+        map-builtin-not-iterating,
+        misplaced-comparison-constant,
+        missing-class-docstring,  # TODO (vLLM): enable
+        missing-function-docstring,
+        missing-module-docstring,  # TODO (vLLM): enable
+        metaclass-assignment,
+        next-method-called,
+        next-method-defined,
+        no-absolute-import,
+        no-else-break,
+        no-else-continue,
+        no-else-raise,
+        no-else-return,
+        no-init,  # added
+        no-member,
+        no-name-in-module,
+        no-self-use,
+        nonzero-method,
+        oct-method,
+        old-division,
+        old-ne-operator,
+        old-octal-literal,
+        old-raise-syntax,
+        parameter-unpacking,
+        print-statement,
+        raising-string,
+        range-builtin-not-iterating,
+        raw_input-builtin,
+        rdiv-method,
+        reduce-builtin,
+        relative-import,
+        reload-builtin,
+        round-builtin,
+        setslice-method,
+        signature-differs,
+        standarderror-builtin,
+        suppressed-message,
+        sys-max-int,
+        too-few-public-methods,
+        too-many-ancestors,
+        too-many-arguments,
+        too-many-boolean-expressions,
+        too-many-branches,
+        too-many-instance-attributes,
+        too-many-locals,
+        too-many-nested-blocks,
+        too-many-public-methods,
+        too-many-return-statements,
+        too-many-statements,
+        trailing-newlines,
+        unichr-builtin,
+        unicode-builtin,
+        unnecessary-pass,
+        unpacking-in-except,
+        unspecified-encoding,
+        useless-else-on-loop,
+        useless-object-inheritance,
+        useless-suppression,
+        using-cmp-argument,
+        wrong-import-order,
+        xrange-builtin,
+        zip-builtin-not-iterating,
+
+
+[REPORTS]
+
+# Set the output format. Available formats are text, parseable, colorized, msvs
+# (visual studio) and html. You can also give a reporter class, eg
+# mypackage.mymodule.MyReporterClass.
+output-format=text
+
+# Tells whether to display a full report or only the messages
+reports=no
+
+# Python expression which should return a note less than 10 (10 is the highest
+# note). You have access to the variables errors warning, statement which
+# respectively contain the number of errors / warnings messages and the total
+# number of statements analyzed. This is used by the global evaluation report
+# (RP0004).
+evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
+
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details
+#msg-template=
+
+
+[BASIC]
+
+# Good variable names which should always be accepted, separated by a comma
+good-names=main,_
+
+# Bad variable names which should always be refused, separated by a comma
+bad-names=
+
+# Colon-delimited sets of names that determine each other's naming style when
+# the name regexes allow several styles.
+name-group=
+
+# Include a hint for the correct naming format with invalid-name
+include-naming-hint=no
+
+# List of decorators that produce properties, such as abc.abstractproperty. Add
+# to this list to register other decorators that produce valid properties.
+property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl
+
+# Regular expression matching correct function names
+function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
+
+# Regular expression matching correct variable names
+variable-rgx=^[a-z][a-z0-9_]*$
+
+# Regular expression matching correct constant names
+const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
+
+# Regular expression matching correct attribute names
+attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
+
+# Regular expression matching correct argument names
+argument-rgx=^[a-z][a-z0-9_]*$
+
+# Regular expression matching correct class attribute names
+class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
+
+# Regular expression matching correct inline iteration names
+inlinevar-rgx=^[a-z][a-z0-9_]*$
+
+# Regular expression matching correct class names
+class-rgx=^_?[A-Z][a-zA-Z0-9]*$
+
+# Regular expression matching correct module names
+module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
+
+# Regular expression matching correct method names
+method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
+
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$
+
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=10
+
+
+[TYPECHECK]
+
+# List of decorators that produce context managers, such as
+# contextlib.contextmanager. Add to this list to register other decorators that
+# produce valid context managers.
+contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager
+
+# Tells whether missing members accessed in mixin class should be ignored. A
+# mixin class is detected if its name ends with "mixin" (case insensitive).
+ignore-mixin-members=yes
+
+# List of module names for which member attributes should not be checked
+# (useful for modules/projects where namespaces are manipulated during runtime
+# and thus existing member attributes cannot be deduced by static analysis. It
+# supports qualified module names, as well as Unix pattern matching.
+ignored-modules=
+
+# List of class names for which member attributes should not be checked (useful
+# for classes with dynamically set attributes). This supports the use of
+# qualified names.
+ignored-classes=optparse.Values,thread._local,_thread._local
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E1101 when accessed. Python regular
+# expressions are accepted.
+generated-members=
+
+
+[FORMAT]
+
+# Maximum number of characters on a single line.
+max-line-length=80
+
+# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt
+# lines made too long by directives to pytype.
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=(?x)(
+  ^\s*(\#\ )?<?https?://\S+>?$|
+  ^\s*(from\s+\S+\s+)?import\s+.+$)
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=yes
+
+# Maximum number of lines in a module
+max-module-lines=99999
+
+# String used as indentation unit.  The internal Google style guide mandates 2
+# spaces.  Google's externaly-published style guide says 4, consistent with
+# PEP 8.  Here, we use 2 spaces, for conformity with many open-sourced Google
+# projects (like TensorFlow).
+indent-string='    '
+
+# Number of spaces of indent required inside a hanging  or continued line.
+indent-after-paren=4
+
+# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
+expected-line-ending-format=
+
+
+[MISCELLANEOUS]
+
+# List of note tags to take in consideration, separated by a comma.
+notes=TODO
+
+
+[STRING]
+
+# This flag controls whether inconsistent-quotes generates a warning when the
+# character used as a quote delimiter is used inconsistently within a module.
+check-quote-consistency=yes
+
+
+[VARIABLES]
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# A regular expression matching the name of dummy variables (i.e. expectedly
+# not used).
+dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid to define new builtins when possible.
+additional-builtins=
+
+# List of strings which can identify a callback function by name. A callback
+# name must start or end with one of those strings.
+callbacks=cb_,_cb
+
+# List of qualified module names which can have objects that can redefine
+# builtins.
+redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
+
+
+[LOGGING]
+
+# Logging modules to check that the string format arguments are in logging
+# function parameter format
+logging-modules=logging,absl.logging,tensorflow.io.logging
+
+
+[SIMILARITIES]
+
+# Minimum lines number of a similarity.
+min-similarity-lines=4
+
+# Ignore comments when computing similarities.
+ignore-comments=yes
+
+# Ignore docstrings when computing similarities.
+ignore-docstrings=yes
+
+# Ignore imports when computing similarities.
+ignore-imports=no
+
+
+[SPELLING]
+
+# Spelling dictionary name. Available dictionaries: none. To make it working
+# install python-enchant package.
+spelling-dict=
+
+# List of comma separated words that should not be checked.
+spelling-ignore-words=
+
+# A path to a file that contains private dictionary; one word per line.
+spelling-private-dict-file=
+
+# Tells whether to store unknown words to indicated private dictionary in
+# --spelling-private-dict-file option instead of raising a message.
+spelling-store-unknown-words=no
+
+
+[IMPORTS]
+
+# Deprecated modules which should not be used, separated by a comma
+deprecated-modules=regsub,
+                   TERMIOS,
+                   Bastion,
+                   rexec,
+                   sets
+
+# Create a graph of every (i.e. internal and external) dependencies in the
+# given file (report RP0402 must not be disabled)
+import-graph=
+
+# Create a graph of external dependencies in the given file (report RP0402 must
+# not be disabled)
+ext-import-graph=
+
+# Create a graph of internal dependencies in the given file (report RP0402 must
+# not be disabled)
+int-import-graph=
+
+# Force import order to recognize a module as part of the standard
+# compatibility libraries.
+known-standard-library=
+
+# Force import order to recognize a module as part of a third party library.
+known-third-party=enchant, absl
+
+# Analyse import fallback blocks. This can be used to support both Python 2 and
+# 3 compatible code, which means that the block might have code that exists
+# only in one or another interpreter, leading to false positives when analysed.
+analyse-fallback-blocks=no
+
+
+[CLASSES]
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,
+                      __new__,
+                      setUp
+
+# List of member names, which should be excluded from the protected access
+# warning.
+exclude-protected=_asdict,
+                  _fields,
+                  _replace,
+                  _source,
+                  _make
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls,
+                            class_
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=mcs
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when being caught. Defaults to
+# "Exception"
+overgeneral-exceptions=StandardError,
+                       Exception,
+                       BaseException
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -49,12 +49,15 @@ If not, please file a new issue, providing as much relevant information as possi

 In general, we adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).

+We include a formatting script [`format.sh`](./format.sh) to format the code.
+
 ### Pull Requests

 When submitting a pull request:

 1. Make sure your code has been rebased on top of the latest commit on the main branch.
-2. Include a detailed description of the changes in the pull request.
+2. Ensure code is properly formatted by running [`format.sh`](./format.sh).
+3. Include a detailed description of the changes in the pull request.
 Explain why you made the changes you did.
 If your pull request fixes an open issue, please include a reference to it in the description.

--- a/README.md
+++ b/README.md
@ -17,8 +17,9 @@ Easy, fast, and cheap LLM serving for everyone
 ---

 *Latest News* 🔥
-
- [2023/06] We officially released vLLM! vLLM has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid April. Check out our [blog post](https://vllm.ai).
+- [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
+- [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
+- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).

 ---

@ -28,7 +29,7 @@ vLLM is fast with:

 - State-of-the-art serving throughput
 - Efficient management of attention key and value memory with **PagedAttention**
- Dynamic batching of incoming requests
+- Continuous batching of incoming requests
 - Optimized CUDA kernels

 vLLM is flexible and easy to use with:
@ -41,10 +42,19 @@ vLLM is flexible and easy to use with:

 vLLM seamlessly supports many Huggingface models, including the following architectures:

+- Aquila (`BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
+- Baichuan (`baichuan-inc/Baichuan-7B`, `baichuan-inc/Baichuan-13B-Chat`, etc.)
+- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
+- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
 - GPT-2 (`gpt2`, `gpt2-xl`, etc.)
- GPTNeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
- LLaMA (`lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
+- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
+- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
+- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
+- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
+- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
+- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
 - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
+- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)

 Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):

--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@ -17,9 +17,11 @@ def main(args: argparse.Namespace):
    # the engine will automatically process the request in multiple batches.
    llm = LLM(
        model=args.model,
+        tokenizer=args.tokenizer,
        tensor_parallel_size=args.tensor_parallel_size,
        max_num_seqs=args.batch_size,
        max_num_batched_tokens=args.batch_size * args.input_len,
+        trust_remote_code=args.trust_remote_code,
    )

    sampling_params = SamplingParams(
@ -63,6 +65,7 @@ if __name__ == '__main__':
        description='Benchmark the latency of processing a single batch of '
                    'requests till completion.')
    parser.add_argument('--model', type=str, default='facebook/opt-125m')
+    parser.add_argument('--tokenizer', type=str, default=None)
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--input-len', type=int, default=32)
    parser.add_argument('--output-len', type=int, default=128)
@ -72,5 +75,7 @@ if __name__ == '__main__':
    parser.add_argument('--use-beam-search', action='store_true')
    parser.add_argument('--num-iters', type=int, default=3,
                        help='Number of iterations to run.')
+    parser.add_argument('--trust-remote-code', action='store_true',
+                        help='trust remote code from huggingface')
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@ -24,20 +24,13 @@ from typing import AsyncGenerator, List, Tuple

 import aiohttp
 import numpy as np
-from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizerBase
+from transformers import PreTrainedTokenizerBase
+from vllm.transformers_utils.tokenizer import get_tokenizer

 # (prompt len, output len, latency)
 REQUEST_LATENCY: List[Tuple[int, int, float]] = []


-def get_tokenizer(model_name: str) -> PreTrainedTokenizerBase:
-    config = AutoConfig.from_pretrained(model_name)
-    if config.model_type == "llama":
-        # A workaround for potential protobuf errors.
-        model_name = "hf-internal-testing/llama-tokenizer"
-    return AutoTokenizer.from_pretrained(model_name)
-
-
 def sample_requests(
    dataset_path: str,
    num_requests: int,
@ -184,7 +177,7 @@ def main(args: argparse.Namespace):
    np.random.seed(args.seed)

    api_url = f"http://{args.host}:{args.port}/generate"
-    tokenizer = get_tokenizer(args.tokenizer)
+    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
    input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

    benchmark_start_time = time.time()
@ -217,7 +210,7 @@ if __name__ == "__main__":
    parser.add_argument("--backend", type=str, default="vllm",
                        choices=["vllm", "tgi"])
    parser.add_argument("--host", type=str, default="localhost")
-    parser.add_argument("--port", type=int, default=8001)
+    parser.add_argument("--port", type=int, default=8000)
    parser.add_argument("--dataset", type=str, required=True,
                        help="Path to the dataset.")
    parser.add_argument("--tokenizer", type=str, required=True,
@ -234,5 +227,7 @@ if __name__ == "__main__":
                             "Otherwise, we use Poisson process to synthesize "
                             "the request arrival times.")
    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument('--trust-remote-code', action='store_true',
+                        help='trust remote code from huggingface')
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -6,23 +6,11 @@ import time
 from typing import List, Tuple

 import torch
-from transformers import (AutoConfig, AutoTokenizer, AutoModelForCausalLM,
-                          PreTrainedTokenizerBase)
+from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase
 from tqdm import tqdm

 from vllm import LLM, SamplingParams
-
-
-def get_tokenizer(model_name: str) -> PreTrainedTokenizerBase:
-    config = AutoConfig.from_pretrained(model_name)
-    if config.model_type == "llama":
-        # A workaround for potential protobuf errors.
-        model_name = "hf-internal-testing/llama-tokenizer"
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        # To enable padding in the HF backend.
-        tokenizer.pad_token = tokenizer.eos_token
-        return tokenizer
-    return AutoTokenizer.from_pretrained(model_name)
+from vllm.transformers_utils.tokenizer import get_tokenizer


 def sample_requests(
@ -74,15 +62,19 @@ def sample_requests(
 def run_vllm(
    requests: List[Tuple[str, int, int]],
    model: str,
+    tokenizer: str,
    tensor_parallel_size: int,
    seed: int,
    n: int,
    use_beam_search: bool,
+    trust_remote_code: bool,
 ) -> float:
    llm = LLM(
        model=model,
+        tokenizer=tokenizer,
        tensor_parallel_size=tensor_parallel_size,
        seed=seed,
+        trust_remote_code=trust_remote_code,
    )

    # Add the requests to the engine.
@ -116,11 +108,14 @@ def run_hf(
    n: int,
    use_beam_search: bool,
    max_batch_size: int,
+    trust_remote_code: bool,
 ) -> float:
    assert not use_beam_search
-    tokenizer = get_tokenizer(model)
-    llm = AutoModelForCausalLM.from_pretrained(
-        model, torch_dtype=torch.float16)
+    llm = AutoModelForCausalLM.from_pretrained(model,
+        torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
+    if llm.config.model_type == "llama":
+        # To enable padding in the HF backend.
+        tokenizer.pad_token = tokenizer.eos_token
    llm = llm.cuda()

    pbar = tqdm(total=len(requests))
@ -170,17 +165,18 @@ def main(args: argparse.Namespace):
    random.seed(args.seed)

    # Sample the requests.
-    tokenizer = get_tokenizer(args.model)
+    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
    requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

    if args.backend == "vllm":
        elapsed_time = run_vllm(
-            requests, args.model, args.tensor_parallel_size, args.seed, args.n,
-            args.use_beam_search)
+            requests, args.model, args.tokenizer, args.tensor_parallel_size,
+            args.seed, args.n, args.use_beam_search, args.trust_remote_code)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
-        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.use_beam_search, args.hf_max_batch_size)
+        elapsed_time = run_hf(
+            requests, args.model, tokenizer, args.n, args.use_beam_search,
+            args.hf_max_batch_size, args.trust_remote_code)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
    total_num_tokens = sum(
@ -198,6 +194,7 @@ if __name__ == "__main__":
    parser.add_argument("--dataset", type=str, required=True,
                        help="Path to the dataset.")
    parser.add_argument("--model", type=str, default="facebook/opt-125m")
+    parser.add_argument("--tokenizer", type=str, default=None)
    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
    parser.add_argument("--n", type=int, default=1,
                        help="Number of generated sequences per prompt.")
@ -207,12 +204,18 @@ if __name__ == "__main__":
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--hf-max-batch-size", type=int, default=None,
                        help="Maximum batch size for HF backend.")
+    parser.add_argument('--trust-remote-code',
+                        action='store_true',
+                        help='trust remote code from huggingface')
    args = parser.parse_args()
+
    if args.backend == "vllm":
        if args.hf_max_batch_size is not None:
            raise ValueError("HF max batch size is only for HF backend.")
    elif args.backend == "hf":
        if args.hf_max_batch_size is None:
            raise ValueError("HF max batch size is required for HF backend.")
+    if args.tokenizer is None:
+        args.tokenizer = args.model

    main(args)
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
@ -1,6 +1,6 @@
 #!/bin/bash

-PORT=8001
+PORT=8000
 MODEL=$1
 TOKENS=$2

--- a/csrc/activation.cpp
+++ b/csrc/activation.cpp
@ -4,9 +4,25 @@ void silu_and_mul(
  torch::Tensor& out,
  torch::Tensor& input);

+void gelu_new(
+  torch::Tensor& out,
+  torch::Tensor& input);
+
+void gelu_fast(
+  torch::Tensor& out,
+  torch::Tensor& input);
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def(
    "silu_and_mul",
    &silu_and_mul,
    "Activation function used in SwiGLU.");
+  m.def(
+    "gelu_new",
+    &gelu_new,
+    "GELU implementation used in GPT-2.");
+  m.def(
+    "gelu_fast",
+    &gelu_fast,
+    "Approximate GELU implementation.");
 }
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@ -46,3 +46,71 @@ void silu_and_mul(
        d);
    });
 }
+
+namespace vllm {
+
+// Element-wise activation kernel template.
+template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
+__global__ void activation_kernel(
+  scalar_t* __restrict__ out,               // [num_tokens, d]
+  const scalar_t* __restrict__ input,       // [num_tokens, d]
+  const int d) {
+  const int token_idx = blockIdx.x;
+  for (int idx = threadIdx.x; idx < d; idx += blockDim.x) {
+    const scalar_t x = __ldg(&input[token_idx * d + idx]);
+    out[token_idx * d + idx] = ACT_FN(x);
+  }
+}
+
+} // namespace vllm
+
+// Launch element-wise activation kernel.
+#define LAUNCH_ACTIVATION_KERNEL(KERNEL)                                                  \
+  int num_tokens = input.size(0);                                                         \
+  int d = input.size(1);                                                                  \
+  dim3 grid(num_tokens);                                                                  \
+  dim3 block(std::min(d, 1024));                                                          \
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();                           \
+  AT_DISPATCH_FLOATING_TYPES_AND2(                                                        \
+    at::ScalarType::Half,                                                                 \
+    at::ScalarType::BFloat16,                                                             \
+    input.scalar_type(),                                                                  \
+    "activation_kernel",                                                                  \
+    [&] {                                                                                 \
+      vllm::activation_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>(    \
+        out.data_ptr<scalar_t>(),                                                         \
+        input.data_ptr<scalar_t>(),                                                       \
+        d);                                                                               \
+    });
+
+namespace vllm {
+
+template<typename T>
+__device__ __forceinline__ T gelu_new_kernel(const T& x) {
+  const float x3 = (float) (x * x * x);
+  const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3))));
+  return ((T) 0.5) * x * (((T) 1.0) + t);
+}
+
+template<typename T>
+__device__ __forceinline__ T gelu_fast_kernel(const T& x) {
+  const float f = (float) x;
+  const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x));
+  return ((T) 0.5) * x * (((T) 1.0) + t);
+}
+
+} // namespace vllm
+
+void gelu_new(
+  torch::Tensor& out,     // [num_tokens, d]
+  torch::Tensor& input)   // [num_tokens, d]
+{
+  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
+}
+
+void gelu_fast(
+  torch::Tensor& out,     // [num_tokens, d]
+  torch::Tensor& input)   // [num_tokens, d]
+{
+  LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
+}
--- a/csrc/attention.cpp
+++ b/csrc/attention.cpp
@ -1,15 +1,18 @@
 #include <torch/extension.h>
+#include <c10/util/Optional.h>

 void single_query_cached_kv_attention(
  torch::Tensor& out,
  torch::Tensor& query,
  torch::Tensor& key_cache,
  torch::Tensor& value_cache,
+  torch::Tensor& head_mapping,
  float scale,
  torch::Tensor& block_tables,
  torch::Tensor& context_lens,
  int block_size,
-  int max_context_len);
+  int max_context_len,
+  const c10::optional<torch::Tensor>& alibi_slopes);

 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def(
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@ -74,14 +74,20 @@ template<
 __global__ void single_query_cached_kv_attention_kernel(
  scalar_t* __restrict__ out,             // [num_seqs, num_heads, head_size]
  const scalar_t* __restrict__ q,         // [num_seqs, num_heads, head_size]
-  const scalar_t* __restrict__ k_cache,   // [num_blocks, num_heads, head_size/x, block_size, x]
-  const scalar_t* __restrict__ v_cache,   // [num_blocks, num_heads, head_size, block_size]
+  const scalar_t* __restrict__ k_cache,   // [num_blocks, num_kv_heads, head_size/x, block_size, x]
+  const scalar_t* __restrict__ v_cache,   // [num_blocks, num_kv_heads, head_size, block_size]
+  const int* __restrict__ head_mapping,   // [num_heads]
  const float scale,
  const int* __restrict__ block_tables,   // [num_seqs, max_num_blocks_per_seq]
  const int* __restrict__ context_lens,   // [num_seqs]
  const int max_num_blocks_per_seq,
-  const int q_stride) {
+  const float* __restrict__ alibi_slopes, // [num_heads]
+  const int q_stride,
+  const int kv_block_stride,
+  const int kv_head_stride) {
  constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
+  constexpr int NUM_THREAD_GROUPS = NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE divides NUM_THREADS
+  assert(NUM_THREADS % THREAD_GROUP_SIZE == 0);
  constexpr int NUM_TOKENS_PER_THREAD_GROUP = (BLOCK_SIZE + WARP_SIZE - 1) / WARP_SIZE;
  constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
  const int thread_idx = threadIdx.x;
@ -90,7 +96,9 @@ __global__ void single_query_cached_kv_attention_kernel(

  const int head_idx = blockIdx.x;
  const int num_heads = gridDim.x;
+  const int kv_head_idx = head_mapping[head_idx];
  const int seq_idx = blockIdx.y;
+  const float alibi_slope = alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];

  // A vector type to store a part of a key or a query.
  // The vector size is configured in such a way that the threads in a thread group
@ -114,12 +122,13 @@ __global__ void single_query_cached_kv_attention_kernel(
  // th vectors of the query, and so on.
  // NOTE(woosuk): Because q is split from a qkv tensor, it may not be contiguous.
  const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
-  Q_vec q_vecs[NUM_VECS_PER_THREAD];
+  __shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
 #pragma unroll
-  for (int i = 0; i < NUM_VECS_PER_THREAD; i++) {
+  for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; i += NUM_THREAD_GROUPS) {
    const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
-    q_vecs[i] = *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
+    q_vecs[thread_group_offset][i] = *reinterpret_cast<const Q_vec*>(q_ptr + vec_idx * VEC_SIZE);
  }
+  __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a memory wall right before we use q_vecs

  // Memory planning.
  extern __shared__ char shared_mem[];
@ -156,8 +165,8 @@ __global__ void single_query_cached_kv_attention_kernel(

 #pragma unroll
      for (int j = 0; j < NUM_VECS_PER_THREAD; j++) {
-        const scalar_t* k_ptr = k_cache + physical_block_number * num_heads * HEAD_SIZE * BLOCK_SIZE
-                                        + head_idx * HEAD_SIZE * BLOCK_SIZE
+        const scalar_t* k_ptr = k_cache + physical_block_number * kv_block_stride
+                                        + kv_head_idx * kv_head_stride
                                        + physical_block_offset * x;
        const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE;
        const int offset1 = (vec_idx * VEC_SIZE) / x;
@ -167,12 +176,14 @@ __global__ void single_query_cached_kv_attention_kernel(

      // Compute dot product.
      // This includes a reduction across the threads in the same thread group.
-      const float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs, k_vecs);
-      const bool mask = token_idx >= context_len;
+      float qk = scale * Qk_dot<scalar_t, THREAD_GROUP_SIZE>::dot(q_vecs[thread_group_offset], k_vecs);
+      // Add the ALiBi bias if slopes are given.
+      qk += (alibi_slope != 0) ? alibi_slope * (token_idx - context_len) : 0;

      if (thread_group_offset == 0) {
        // Store the partial reductions to shared memory.
        // NOTE(woosuk): It is required to zero out the masked logits.
+        const bool mask = token_idx >= context_len;
        logits[token_idx] = mask ? 0.f : qk;
        // Update the max value.
        qk_max = mask ? qk_max : fmaxf(qk_max, qk);
@ -242,8 +253,8 @@ __global__ void single_query_cached_kv_attention_kernel(
    L_vec logits_vec;
    from_float(logits_vec, *reinterpret_cast<Float_L_vec*>(logits + token_idx));

-    const scalar_t* v_ptr = v_cache + physical_block_number * num_heads * HEAD_SIZE * BLOCK_SIZE
-                                    + head_idx * HEAD_SIZE * BLOCK_SIZE;
+    const scalar_t* v_ptr = v_cache + physical_block_number * kv_block_stride
+                                    + kv_head_idx * kv_head_stride;
 #pragma unroll
    for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
      const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
@ -324,11 +335,15 @@ __global__ void single_query_cached_kv_attention_kernel(
    query_ptr,                                                                                \
    key_cache_ptr,                                                                            \
    value_cache_ptr,                                                                          \
+    head_mapping_ptr,                                                                         \
    scale,                                                                                    \
    block_tables_ptr,                                                                         \
    context_lens_ptr,                                                                         \
    max_num_blocks_per_seq,                                                                   \
-    query_stride);
+    alibi_slopes_ptr,                                                                         \
+    q_stride,                                                                                 \
+    kv_block_stride,                                                                          \
+    kv_head_stride);

 // TODO(woosuk): Tune NUM_THREADS.
 template<
@ -340,23 +355,33 @@ void single_query_cached_kv_attention_launcher(
  torch::Tensor& query,
  torch::Tensor& key_cache,
  torch::Tensor& value_cache,
+  torch::Tensor& head_mapping,
  float scale,
  torch::Tensor& block_tables,
  torch::Tensor& context_lens,
-  int max_context_len) {
+  int max_context_len,
+  const c10::optional<torch::Tensor>& alibi_slopes) {
  int num_seqs = query.size(0);
  int num_heads = query.size(1);
  int head_size = query.size(2);
  int max_num_blocks_per_seq = block_tables.size(1);
-  int query_stride = query.stride(0);
+  int q_stride = query.stride(0);
+  int kv_block_stride = key_cache.stride(0);
+  int kv_head_stride = key_cache.stride(1);

  int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
  assert(head_size % thread_group_size == 0);

+  // NOTE: alibi_slopes is optional.
+  const float* alibi_slopes_ptr = alibi_slopes ?
+    reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
+    : nullptr;
+
  T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
  T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
  T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
  T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
+  int* head_mapping_ptr = reinterpret_cast<int*>(head_mapping.data_ptr());
  int* block_tables_ptr = block_tables.data_ptr<int>();
  int* context_lens_ptr = context_lens.data_ptr<int>();

@ -371,7 +396,7 @@ void single_query_cached_kv_attention_launcher(
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  switch (head_size) {
    // NOTE(woosuk): To reduce the compilation time, we omitted head sizes
-    // 32, 160, 192, 256.
+    // 32, 160, 192.
    // case 32:
    //   LAUNCH_ATTENTION_KERNEL(T, 32, BLOCK_SIZE, NUM_THREADS);
    //   break;
@ -384,6 +409,9 @@ void single_query_cached_kv_attention_launcher(
    case 96:
      LAUNCH_ATTENTION_KERNEL(T, 96, BLOCK_SIZE, NUM_THREADS);
      break;
+    case 112:
+      LAUNCH_ATTENTION_KERNEL(T, 112, BLOCK_SIZE, NUM_THREADS);
+      break;
    case 128:
      LAUNCH_ATTENTION_KERNEL(T, 128, BLOCK_SIZE, NUM_THREADS);
      break;
@ -393,9 +421,9 @@ void single_query_cached_kv_attention_launcher(
    // case 192:
    //   LAUNCH_ATTENTION_KERNEL(T, 192, BLOCK_SIZE, NUM_THREADS);
    //   break;
-    // case 256:
-    //   LAUNCH_ATTENTION_KERNEL(T, 256, BLOCK_SIZE, NUM_THREADS);
-    //   break;
+    case 256:
+      LAUNCH_ATTENTION_KERNEL(T, 256, BLOCK_SIZE, NUM_THREADS);
+      break;
    default:
      TORCH_CHECK(false, "Unsupported head size: ", head_size);
      break;
@ -408,10 +436,12 @@ void single_query_cached_kv_attention_launcher(
    query,                                                          \
    key_cache,                                                      \
    value_cache,                                                    \
+    head_mapping,                                                   \
    scale,                                                          \
    block_tables,                                                   \
    context_lens,                                                   \
-    max_context_len);
+    max_context_len,                                                \
+    alibi_slopes);

 // NOTE(woosuk): To reduce the compilation time, we omitted block sizes
 // 1, 2, 4, 64, 128, 256.
@ -454,11 +484,13 @@ void single_query_cached_kv_attention(
  torch::Tensor& query,           // [num_seqs, num_heads, head_size]
  torch::Tensor& key_cache,       // [num_blocks, num_heads, head_size/x, block_size, x]
  torch::Tensor& value_cache,     // [num_blocks, num_heads, head_size, block_size]
+  torch::Tensor& head_mapping,    // [num_heads]
  float scale,
  torch::Tensor& block_tables,    // [num_seqs, max_num_blocks_per_seq]
  torch::Tensor& context_lens,    // [num_seqs]
  int block_size,
-  int max_context_len) {
+  int max_context_len,
+  const c10::optional<torch::Tensor>& alibi_slopes) {
  if (query.dtype() == at::ScalarType::Float) {
    CALL_KERNEL_LAUNCHER_BLOCK_SIZE(float);
  } else if (query.dtype() == at::ScalarType::Half) {
--- a/csrc/pos_encoding_kernels.cu
+++ b/csrc/pos_encoding_kernels.cu
@ -7,11 +7,13 @@ template<typename scalar_t>
 __global__ void rotary_embedding_neox_kernel(
  const int64_t* __restrict__ positions,        // [num_tokens]
  scalar_t* __restrict__ query,                 // [num_tokens, num_heads, head_size]
-  scalar_t* __restrict__ key,                   // [num_tokens, num_heads, head_size]
+  scalar_t* __restrict__ key,                   // [num_tokens, num_kv_heads, head_size]
  const scalar_t* __restrict__ cos_sin_cache,   // [max_position, 2, rot_dim // 2]
  const int rot_dim,
-  const int stride,
+  const int query_stride,
+  const int key_stride,
  const int num_heads,
+  const int num_kv_heads,
  const int head_size) {
  // Each thread block is responsible for one token.
  const int token_idx = blockIdx.x;
@ -19,17 +21,17 @@ __global__ void rotary_embedding_neox_kernel(
  const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;

  const int embed_dim = rot_dim / 2;
-  const int n = num_heads * embed_dim;
-  for (int i = threadIdx.x; i < n; i += blockDim.x) {
+  const int nq = num_heads * embed_dim;
+  for (int i = threadIdx.x; i < nq; i += blockDim.x) {
    const int head_idx = i / embed_dim;
-    const int token_head = token_idx * stride + head_idx * head_size;
+    const int token_head = token_idx * query_stride + head_idx * head_size;

    const int rot_offset = i % embed_dim;
    const int x_index = rot_offset;
    const int y_index = embed_dim + rot_offset;

-    const int out_x = token_idx * stride + head_idx * head_size + x_index;
-    const int out_y = token_idx * stride + head_idx * head_size + y_index;
+    const int out_x = token_idx * query_stride + head_idx * head_size + x_index;
+    const int out_y = token_idx * query_stride + head_idx * head_size + y_index;

    const scalar_t cos = __ldg(cache_ptr + x_index);
    const scalar_t sin = __ldg(cache_ptr + y_index);
@ -38,6 +40,22 @@ __global__ void rotary_embedding_neox_kernel(
    const scalar_t q_y = query[token_head + y_index];
    query[out_x] = q_x * cos - q_y * sin;
    query[out_y] = q_y * cos + q_x * sin;
+  }
+
+  const int nk = num_kv_heads * embed_dim;
+  for (int i = threadIdx.x; i < nk; i += blockDim.x) {
+    const int head_idx = i / embed_dim;
+    const int token_head = token_idx * key_stride + head_idx * head_size;
+
+    const int rot_offset = i % embed_dim;
+    const int x_index = rot_offset;
+    const int y_index = embed_dim + rot_offset;
+
+    const int out_x = token_idx * key_stride + head_idx * head_size + x_index;
+    const int out_y = token_idx * key_stride + head_idx * head_size + y_index;
+
+    const scalar_t cos = __ldg(cache_ptr + x_index);
+    const scalar_t sin = __ldg(cache_ptr + y_index);

    const scalar_t k_x = key[token_head + x_index];
    const scalar_t k_y = key[token_head + y_index];
@ -51,15 +69,16 @@ __global__ void rotary_embedding_neox_kernel(
 void rotary_embedding_neox(
  torch::Tensor& positions,         // [num_tokens]
  torch::Tensor& query,             // [num_tokens, num_heads * head_size]
-  torch::Tensor& key,               // [num_tokens, num_heads * head_size]
+  torch::Tensor& key,               // [num_tokens, num_kv_heads * head_size]
  int head_size,
  torch::Tensor& cos_sin_cache)     // [max_position, rot_dim]
 {
  int num_tokens = query.size(0);
  int rot_dim = cos_sin_cache.size(1);
  int num_heads = query.size(1) / head_size;
-  int stride = query.stride(0);
-  TORCH_CHECK(stride == key.stride(0));
+  int num_kv_heads = key.size(1) / head_size;
+  int query_stride = query.stride(0);
+  int key_stride = key.stride(0);

  dim3 grid(num_tokens);
  dim3 block(std::min(num_heads * rot_dim / 2, 512));
@ -76,8 +95,10 @@ void rotary_embedding_neox(
        key.data_ptr<scalar_t>(),
        cos_sin_cache.data_ptr<scalar_t>(),
        rot_dim,
-        stride,
+        query_stride,
+        key_stride,
        num_heads,
+        num_kv_heads,
        head_size);
    });
 }
--- a/docs/README.md
+++ b/docs/README.md
@ -4,14 +4,14 @@

 ```bash
 # Install dependencies.
-pip -r requirements-docs.txt
+pip install -r requirements-docs.txt

 # Build the docs.
 make clean
 make html
 ```

-## Open the docs with your brower
+## Open the docs with your browser

 ```bash
 python -m http.server -d build/html/
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -29,7 +29,7 @@ vLLM is fast with:

 * State-of-the-art serving throughput
 * Efficient management of attention key and value memory with **PagedAttention**
-* Dynamic batching of incoming requests
+* Continuous batching of incoming requests
 * Optimized CUDA kernels

 vLLM is flexible and easy to use with:
@ -40,7 +40,11 @@ vLLM is flexible and easy to use with:
 * Streaming outputs
 * OpenAI-compatible API server

-For more information, please refer to our `blog post <https://vllm.ai>`_.
+For more information, check out the following:
+
+* `vLLM announcing blog post <https://vllm.ai>`_ (intro to PagedAttention)
+* `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency <https://www.anyscale.com/blog/continuous-batching-llm-inference>`_ by Cade Daniel et al.
+


 Documentation
@ -53,6 +57,13 @@ Documentation
   getting_started/installation
   getting_started/quickstart

+.. toctree::
+   :maxdepth: 1
+   :caption: Serving
+
+   serving/distributed_serving
+   serving/run_on_sky
+
 .. toctree::
   :maxdepth: 1
   :caption: Models
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@ -14,18 +14,45 @@ Alongside each architecture, we include some popular models that use it.
  * - Architecture
    - Models
    - Example HuggingFace Models
+  * - :code:`AquilaForCausalLM`
+    - Aqualia
+    - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc.
+  * - :code:`BaiChuanForCausalLM`
+    - Baichuan
+    - :code:`baichuan-inc/Baichuan-7B`, :code:`baichuan-inc/Baichuan-13B-Chat`, etc.
+  * - :code:`BloomForCausalLM`
+    - BLOOM, BLOOMZ, BLOOMChat
+    - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc.
+  * - :code:`FalconForCausalLM`
+    - Falcon
+    - :code:`tiiuae/falcon-7b``, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc.
  * - :code:`GPT2LMHeadModel`
    - GPT-2
    - :code:`gpt2`, :code:`gpt2-xl`, etc.
+  * - :code:`GPTBigCodeForCausalLM`
+    - StarCoder, SantaCoder, WizardCoder
+    - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc.
+  * - :code:`GPTJForCausalLM`
+    - GPT-J
+    - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc.
  * - :code:`GPTNeoXForCausalLM`
    - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM
    - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc.
+  * - :code:`InternLMForCausalLM`
+    - InternLM
+    - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc.
  * - :code:`LlamaForCausalLM`
-    - LLaMA, Vicuna, Alpaca, Koala, Guanaco
-    - :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`young-geng/koala`, :code:`JosephusCheung/Guanaco`, etc.
+    - LLaMA, LLaMA-2, Vicuna, Alpaca, Koala, Guanaco
+    - :code:`meta-llama/Llama-2-13b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`young-geng/koala`, :code:`JosephusCheung/Guanaco`, etc.
+  * - :code:`MPTForCausalLM`
+    - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter
+    - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc.
  * - :code:`OPTForCausalLM`
    - OPT, OPT-IML
    - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc.
+  * - :code:`OPTForCausalLM`
+    - Qwen
+    - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.

 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
 Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` for instructions on how to implement support for your model.
--- a/docs/source/serving/distributed_serving.rst
+++ b/docs/source/serving/distributed_serving.rst
@ -0,0 +1,38 @@
+.. _distributed_serving:
+
+Distributed Inference and Serving
+=================================
+
+vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm <https://arxiv.org/pdf/1909.08053.pdf>`_. We manage the distributed runtime with `Ray <https://github.com/ray-project/ray>`_. To run distributed inference, install Ray with:
+
+.. code-block:: console
+
+    $ pip install ray
+
+To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
+
+.. code-block:: python
+
+    from vllm import LLM
+    llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
+    output = llm.generate("San Franciso is a")
+
+To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs:
+
+.. code-block:: console
+
+    $ python -m vllm.entrypoints.api_server \
+    $     --model facebook/opt-13b \
+    $     --tensor-parallel-size 4
+
+To scale vLLM beyond a single machine, start a `Ray runtime <https://docs.ray.io/en/latest/ray-core/starting-ray.html>`_ via CLI before running vLLM:
+
+.. code-block:: console
+
+    $ # On head node
+    $ ray start --head
+
+    $ # On worker nodes
+    $ ray start --address=<ray-head-address>
+
+After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines.
--- a/docs/source/serving/run_on_sky.rst
+++ b/docs/source/serving/run_on_sky.rst
@ -0,0 +1,69 @@
+.. _on_cloud:
+
+Running on clouds with SkyPilot
+===============================
+
+.. raw:: html
+
+    <p align="center">
+        <img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
+    </p>
+
+vLLM can be run on the cloud to scale to multiple GPUs with `SkyPilot <https://github.com/skypilot-org/skypilot>`__, an open-source framework for running LLMs on any cloud.
+
+To install SkyPilot and setup your cloud credentials, run:
+
+.. code-block:: console
+
+    $ pip install skypilot
+    $ sky check
+
+See the vLLM SkyPilot YAML for serving, `serving.yaml <https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml>`__.
+
+.. code-block:: yaml
+
+    resources:
+        accelerators: A100
+
+    envs:
+        MODEL_NAME: decapoda-research/llama-13b-hf
+        TOKENIZER: hf-internal-testing/llama-tokenizer
+
+    setup: |
+        conda create -n vllm python=3.9 -y
+        conda activate vllm
+        git clone https://github.com/vllm-project/vllm.git
+        cd vllm
+        pip install .
+        pip install gradio
+
+    run: |
+        conda activate vllm
+        echo 'Starting vllm api server...'
+        python -u -m vllm.entrypoints.api_server \
+                        --model $MODEL_NAME \
+                        --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
+                        --tokenizer $TOKENIZER 2>&1 | tee api_server.log &
+        echo 'Waiting for vllm api server to start...'
+        while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
+        echo 'Starting gradio server...'
+        python vllm/examples/gradio_webserver.py
+
+Start the serving the LLaMA-13B model on an A100 GPU:
+
+.. code-block:: console
+
+    $ sky launch serving.yaml
+
+Check the output of the command. There will be a sharable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion.
+
+.. code-block:: console
+
+    (task, pid=7431) Running on public URL: https://<gradio-hash>.gradio.live
+
+**Optional**: Serve the 65B model instead of the default 13B and use more GPU:
+
+.. code-block:: console
+
+    sky launch -c vllm-serve-new -s serve.yaml --gpus A100:8 --env MODEL_NAME=decapoda-research/llama-65b-hf
+
--- a/examples/api_client.py
+++ b/examples/api_client.py
@ -14,7 +14,9 @@ def clear_line(n: int = 1) -> None:
        print(LINE_UP, end=LINE_CLEAR, flush=True)


-def post_http_request(prompt: str, api_url: str, n: int = 1,
+def post_http_request(prompt: str,
+                      api_url: str,
+                      n: int = 1,
                      stream: bool = False) -> requests.Response:
    headers = {"User-Agent": "Test Client"}
    pload = {
@ -30,7 +32,8 @@ def post_http_request(prompt: str, api_url: str, n: int = 1,


 def get_streaming_response(response: requests.Response) -> Iterable[List[str]]:
-    for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False,
+    for chunk in response.iter_lines(chunk_size=8192,
+                                     decode_unicode=False,
                                     delimiter=b"\0"):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
--- a/examples/gradio_webserver.py
+++ b/examples/gradio_webserver.py
@ -12,9 +12,14 @@ def http_bot(prompt):
        "stream": True,
        "max_tokens": 128,
    }
-    response = requests.post(args.model_url, headers=headers, json=pload, stream=True)
+    response = requests.post(args.model_url,
+                             headers=headers,
+                             json=pload,
+                             stream=True)

-    for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
+    for chunk in response.iter_lines(chunk_size=8192,
+                                     decode_unicode=False,
+                                     delimiter=b"\0"):
        if chunk:
            data = json.loads(chunk.decode("utf-8"))
            output = data["text"][0]
@ -23,11 +28,11 @@ def http_bot(prompt):

 def build_demo():
    with gr.Blocks() as demo:
-        gr.Markdown(
-            "# vLLM text completion demo\n"
-        )
-        inputbox = gr.Textbox(label="Input", placeholder="Enter text and press ENTER")
-        outputbox = gr.Textbox(label="Output", placeholder="Generated result from the model")
+        gr.Markdown("# vLLM text completion demo\n")
+        inputbox = gr.Textbox(label="Input",
+                              placeholder="Enter text and press ENTER")
+        outputbox = gr.Textbox(label="Output",
+                               placeholder="Generated result from the model")
        inputbox.submit(http_bot, [inputbox], [outputbox])
    return demo

@ -36,7 +41,9 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=8001)
-    parser.add_argument("--model-url", type=str, default="http://localhost:8000/generate")
+    parser.add_argument("--model-url",
+                        type=str,
+                        default="http://localhost:8000/generate")
    args = parser.parse_args()

    demo = build_demo()
--- a/examples/llm_engine_example.py
+++ b/examples/llm_engine_example.py
@ -10,19 +10,25 @@ def main(args: argparse.Namespace):

    # Test the following prompts.
    test_prompts = [
-        ("A robot may not injure a human being", SamplingParams()),
+        ("A robot may not injure a human being",
+         SamplingParams(temperature=0.0)),
        ("To be or not to be,",
         SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)),
        ("What is the meaning of life?",
-         SamplingParams(n=2, best_of=5, temperature=0.8, top_p=0.95, frequency_penalty=0.1)),
+         SamplingParams(n=2,
+                        best_of=5,
+                        temperature=0.8,
+                        top_p=0.95,
+                        frequency_penalty=0.1)),
        ("It is only with the heart that one can see rightly",
-         SamplingParams(n=3, best_of=3, use_beam_search=True, temperature=0.0)),
+         SamplingParams(n=3, best_of=3, use_beam_search=True,
+                        temperature=0.0)),
    ]

    # Run the engine by calling `engine.step()` manually.
    request_id = 0
    while True:
-        # To test iteration-level scheduling, we add one request at each step.
+        # To test continuous batching, we add one request at each step.
        if test_prompts:
            prompt, sampling_params = test_prompts.pop(0)
            engine.add_request(str(request_id), prompt, sampling_params)
--- a/examples/offline_inference.py
+++ b/examples/offline_inference.py
@ -1,6 +1,5 @@
 from vllm import LLM, SamplingParams

-
 # Sample prompts.
 prompts = [
    "Hello, my name is",
--- a/examples/openai_chatcompletion_client.py
+++ b/examples/openai_chatcompletion_client.py
@ -0,0 +1,33 @@
+import openai
+
+# Modify OpenAI's API key and API base to use vLLM's API server.
+openai.api_key = "EMPTY"
+openai.api_base = "http://localhost:8000/v1"
+
+# List models API
+models = openai.Model.list()
+print("Models:", models)
+
+model = models["data"][0]["id"]
+
+# Chat completion API
+chat_completion = openai.ChatCompletion.create(
+    model=model,
+    messages=[{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role": "user",
+        "content": "Who won the world series in 2020?"
+    }, {
+        "role":
+        "assistant",
+        "content":
+        "The Los Angeles Dodgers won the World Series in 2020."
+    }, {
+        "role": "user",
+        "content": "Where was it played?"
+    }])
+
+print("Chat completion results:")
+print(chat_completion)
--- a/examples/openai_completion_client.py
+++ b/examples/openai_completion_client.py
@ -3,21 +3,26 @@ import openai
 # Modify OpenAI's API key and API base to use vLLM's API server.
 openai.api_key = "EMPTY"
 openai.api_base = "http://localhost:8000/v1"
-model = "facebook/opt-125m"

-# Test list models API
+# List models API
 models = openai.Model.list()
 print("Models:", models)

-# Test completion API
-stream = True
-completion = openai.Completion.create(
-    model=model, prompt="A robot may not injure a human being", echo=False, n=2,
-    best_of=3, stream=stream, logprobs=3)
+model = models["data"][0]["id"]

-# print the completion
+# Completion API
+stream = False
+completion = openai.Completion.create(
+    model=model,
+    prompt="A robot may not injure a human being",
+    echo=False,
+    n=2,
+    stream=stream,
+    logprobs=3)
+
+print("Completion results:")
 if stream:
    for c in completion:
        print(c)
 else:
-    print("Completion result:", completion)
+    print(completion)
--- a/format.sh
+++ b/format.sh
@ -0,0 +1,108 @@
+#!/usr/bin/env bash
+# YAPF formatter, adapted from ray and skypilot.
+#
+# Usage:
+#    # Do work and commit your work.
+
+#    # Format files that differ from origin/main.
+#    bash format.sh
+
+#    # Commit changed files with message 'Run yapf and pylint'
+#
+#
+# YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase.
+# You are encouraged to run this locally before pushing changes for review.
+
+# Cause the script to exit if a single command fails
+set -eo pipefail
+
+# this stops git rev-parse from failing if we run this from the .git directory
+builtin cd "$(dirname "${BASH_SOURCE:-$0}")"
+ROOT="$(git rev-parse --show-toplevel)"
+builtin cd "$ROOT" || exit 1
+
+YAPF_VERSION=$(yapf --version | awk '{print $2}')
+PYLINT_VERSION=$(pylint --version | head -n 1 | awk '{print $2}')
+MYPY_VERSION=$(mypy --version | awk '{print $2}')
+
+# # params: tool name, tool version, required version
+tool_version_check() {
+    if [[ $2 != $3 ]]; then
+        echo "Wrong $1 version installed: $3 is required, not $2."
+        exit 1
+    fi
+}
+
+tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
+tool_version_check "pylint" $PYLINT_VERSION "$(grep "pylint==" requirements-dev.txt | cut -d'=' -f3)"
+tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"
+
+YAPF_FLAGS=(
+    '--recursive'
+    '--parallel'
+)
+
+YAPF_EXCLUDES=(
+    '--exclude' 'build/**'
+    '--exclude' 'vllm/model_executor/parallel_utils/**'
+)
+
+# Format specified files
+format() {
+    yapf --in-place "${YAPF_FLAGS[@]}" "$@"
+}
+
+# Format files that differ from main branch. Ignores dirs that are not slated
+# for autoformat yet.
+format_changed() {
+    # The `if` guard ensures that the list of filenames is not empty, which
+    # could cause yapf to receive 0 positional arguments, making it hang
+    # waiting for STDIN.
+    #
+    # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that
+    # exist on both branches.
+    MERGEBASE="$(git merge-base origin/main HEAD)"
+
+    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
+        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \
+             yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}"
+    fi
+
+}
+
+# Format all files
+format_all() {
+    yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" vllm
+}
+
+## This flag formats individual files. --files *must* be the first command line
+## arg to use this option.
+if [[ "$1" == '--files' ]]; then
+   format "${@:2}"
+   # If `--all` is passed, then any further arguments are ignored and the
+   # entire python directory is formatted.
+elif [[ "$1" == '--all' ]]; then
+   format_all
+else
+   # Format only the files that changed in last commit.
+   format_changed
+fi
+echo 'vLLM yapf: Done'
+
+# Run mypy
+# TODO(zhuohan): Enable mypy
+# echo 'vLLM mypy:'
+# mypy
+
+# Run Pylint
+echo 'vLLM Pylint:'
+pylint vllm
+
+if ! git diff --quiet &>/dev/null; then
+    echo 'Reformatted files. Please review and stage the changes.'
+    echo 'Changes not staged for commit:'
+    echo
+    git --no-pager diff --name-only
+
+    exit 1
+fi
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -1,2 +1,12 @@
-mypy
+# formatting
+yapf==0.32.0
+pylint==2.8.2
+
+# type checking
+mypy==0.991
+types-PyYAML
+types-requests
+types-setuptools
+
+# testing
 pytest
--- a/requirements.txt
+++ b/requirements.txt
@ -1,11 +1,11 @@
 ninja  # For faster builds.
 psutil
-ray
+ray >= 2.5.1
 sentencepiece  # Required for LLaMA tokenizer.
 numpy
 torch >= 2.0.0
-transformers >= 4.28.0  # Required for LLaMA.
-xformers >= 0.0.19
+transformers >= 4.31.0  # Required for LLaMA-2.
+xformers >= 0.0.21
 fastapi
 uvicorn
-pydantic  # Required for OpenAI server.
+pydantic < 2  # Required for OpenAI server.
--- a/setup.py
+++ b/setup.py
@ -20,10 +20,9 @@ ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
 CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
 NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]

-if not torch.cuda.is_available():
+if CUDA_HOME is None:
    raise RuntimeError(
-        f"Cannot find CUDA at CUDA_HOME: {CUDA_HOME}. "
-        "CUDA must be available in order to build the package.")
+        f"Cannot find CUDA_HOME. CUDA must be available to build the package.")


 def get_nvcc_cuda_version(cuda_dir: str) -> Version:
@ -48,12 +47,6 @@ for i in range(device_count):
        raise RuntimeError(
            "GPUs with compute capability less than 7.0 are not supported.")
    compute_capabilities.add(major * 10 + minor)
-# If no GPU is available, add all supported compute capabilities.
-if not compute_capabilities:
-    compute_capabilities = {70, 75, 80, 86, 90}
-# Add target compute capabilities to NVCC flags.
-for capability in compute_capabilities:
-    NVCC_FLAGS += ["-gencode", f"arch=compute_{capability},code=sm_{capability}"]

 # Validate the NVCC CUDA version.
 nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
@ -62,10 +55,31 @@ if nvcc_cuda_version < Version("11.0"):
 if 86 in compute_capabilities and nvcc_cuda_version < Version("11.1"):
    raise RuntimeError(
        "CUDA 11.1 or higher is required for GPUs with compute capability 8.6.")
+if 89 in compute_capabilities and nvcc_cuda_version < Version("11.8"):
+    # CUDA 11.8 is required to generate the code targeting compute capability 8.9.
+    # However, GPUs with compute capability 8.9 can also run the code generated by
+    # the previous versions of CUDA 11 and targeting compute capability 8.0.
+    # Therefore, if CUDA 11.8 is not available, we target compute capability 8.0
+    # instead of 8.9.
+    compute_capabilities.remove(89)
+    compute_capabilities.add(80)
 if 90 in compute_capabilities and nvcc_cuda_version < Version("11.8"):
    raise RuntimeError(
        "CUDA 11.8 or higher is required for GPUs with compute capability 9.0.")

+# If no GPU is available, add all supported compute capabilities.
+if not compute_capabilities:
+    compute_capabilities = {70, 75, 80}
+    if nvcc_cuda_version >= Version("11.1"):
+        compute_capabilities.add(86)
+    if nvcc_cuda_version >= Version("11.8"):
+        compute_capabilities.add(89)
+        compute_capabilities.add(90)
+
+# Add target compute capabilities to NVCC flags.
+for capability in compute_capabilities:
+    NVCC_FLAGS += ["-gencode", f"arch=compute_{capability},code=sm_{capability}"]
+
 # Use NVCC threads to parallelize the build.
 if nvcc_cuda_version >= Version("11.2"):
    num_threads = min(os.cpu_count(), 8)
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@ -1,6 +1,6 @@
 import torch
 import torch.nn.functional as F
-
+from transformers.activations import get_activation
 from vllm import activation_ops


@ -28,3 +28,45 @@ def test_silu_and_mul() -> None:
            for d in [512, 4096, 5120, 13824]:
                print(f'Testing dtype={dtype}, num_tokens={num_tokens}, d={d}')
                run_silu_and_mul(num_tokens, d, dtype)
+
+
+@torch.inference_mode()
+def run_gelu_new(
+    num_tokens: int,
+    d: int,
+    dtype: torch.dtype,
+) -> None:
+    x = torch.randn(num_tokens, d, dtype=dtype, device='cuda')
+    out = torch.empty(num_tokens, d, dtype=dtype, device='cuda')
+    activation_ops.gelu_new(out, x)
+    ref_out = get_activation("gelu_new")(x)
+    assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)
+
+
+def test_gelu_new() -> None:
+    for dtype in [torch.half, torch.bfloat16, torch.float]:
+        for num_tokens in [7, 83, 2048]:
+            for d in [512, 4096, 5120, 13824]:
+                print(f'Testing dtype={dtype}, num_tokens={num_tokens}, d={d}')
+                run_gelu_new(num_tokens, d, dtype)
+
+
+@torch.inference_mode()
+def run_gelu_fast(
+    num_tokens: int,
+    d: int,
+    dtype: torch.dtype,
+) -> None:
+    x = torch.randn(num_tokens, d, dtype=dtype, device='cuda')
+    out = torch.empty(num_tokens, d, dtype=dtype, device='cuda')
+    activation_ops.gelu_fast(out, x)
+    ref_out = get_activation("gelu_fast")(x)
+    assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)
+
+
+def test_gelu_fast() -> None:
+    for dtype in [torch.half, torch.bfloat16, torch.float]:
+        for num_tokens in [7, 83, 2048]:
+            for d in [512, 4096, 5120, 13824]:
+                print(f'Testing dtype={dtype}, num_tokens={num_tokens}, d={d}')
+                run_gelu_fast(num_tokens, d, dtype)
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@ -60,7 +60,7 @@ def ref_single_query_cached_kv_attention(
        keys = torch.stack(keys, dim=0)
        values = torch.stack(values, dim=0)

-        scale = 1.0 / (head_size ** 0.5)
+        scale = 1.0 / (head_size**0.5)
        out = ref_masked_attention(q, keys, values, scale)
        out = out.view(num_heads, head_size)
        output[i].copy_(out, non_blocking=True)
@ -74,7 +74,7 @@ def ref_multi_query_kv_attention(
    dtype: torch.dtype,
 ) -> torch.Tensor:
    head_size = query.shape[-1]
-    scale = 1.0 / (head_size ** 0.5)
+    scale = 1.0 / (head_size**0.5)

    num_seqs = len(cu_seq_lens) - 1
    ref_outputs = []
@ -84,8 +84,8 @@ def ref_multi_query_kv_attention(
        seq_len = end_idx - start_idx

        # Create attention mask.
-        attn_mask = torch.triu(
-            torch.ones(seq_len, seq_len, dtype=dtype), diagonal=1)
+        attn_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=dtype),
+                               diagonal=1)
        attn_mask = attn_mask * torch.finfo(dtype).min
        attn_mask = attn_mask.to(dtype=dtype, device='cuda')

@ -113,7 +113,7 @@ def ref_multi_query_cached_kv_attention(
    num_heads = value_cache.shape[1]
    head_size = value_cache.shape[2]
    block_size = value_cache.shape[3]
-    scale = 1.0 / (head_size ** 0.5)
+    scale = 1.0 / (head_size**0.5)

    num_queries = len(cu_query_lens) - 1
    ref_outputs = []
@ -125,8 +125,8 @@ def ref_multi_query_cached_kv_attention(
        block_table = block_tables[i]

        # Create attention mask
-        attn_mask = torch.triu(
-            torch.ones(query_len, context_len), diagonal=context_len - query_len + 1) * -1e5
+        attn_mask = torch.triu(torch.ones(query_len, context_len),
+                               diagonal=context_len - query_len + 1) * -1e5
        attn_mask = attn_mask.to(dtype=dtype, device='cuda')

        keys = []
@ -164,20 +164,27 @@ def run_single_query_cached_kv_attention(
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
+    num_kv_heads: int = None,
 ) -> None:
-    qkv = torch.empty(
-        num_tokens, 3, num_heads, head_size, dtype=dtype, device='cuda')
+    qkv = torch.empty(num_tokens,
+                      3,
+                      num_heads,
+                      head_size,
+                      dtype=dtype,
+                      device='cuda')
    qkv.uniform_(-1e-3, 1e-3)
    query, _, _ = qkv.unbind(dim=1)

    x = 16 // torch.tensor([], dtype=dtype).element_size()
    key_block_shape = (num_heads, head_size // x, block_size, x)
-    key_cache = torch.empty(
-        size=(num_blocks, *key_block_shape), dtype=dtype, device='cuda')
+    key_cache = torch.empty(size=(num_blocks, *key_block_shape),
+                            dtype=dtype,
+                            device='cuda')
    key_cache.uniform_(-1e-3, 1e-3)
    value_block_shape = (num_heads, head_size, block_size)
-    value_cache = torch.empty(
-        size=(num_blocks, *value_block_shape), dtype=dtype, device='cuda')
+    value_cache = torch.empty(size=(num_blocks, *value_block_shape),
+                              dtype=dtype,
+                              device='cuda')
    value_cache.uniform_(-1e-3, 1e-3)

    context_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_tokens)]
@ -193,20 +200,34 @@ def run_single_query_cached_kv_attention(
        ]
        block_tables.append(block_table)
    block_tables = torch.tensor(block_tables, dtype=torch.int, device='cuda')
+    head_mapping = torch.arange(num_heads, dtype=torch.int32, device="cuda")

-    scale = float(1.0 / (head_size ** 0.5))
-    output = torch.empty(
-        num_tokens, num_heads, head_size, dtype=dtype, device='cuda')
+    scale = float(1.0 / (head_size**0.5))
+
+    num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
+    assert num_heads % num_kv_heads == 0
+    num_queries_per_kv = num_heads // num_kv_heads
+    head_mapping = torch.repeat_interleave(
+        torch.arange(num_kv_heads, dtype=torch.int32, device="cuda"),
+                     num_queries_per_kv)
+
+    output = torch.empty(num_tokens,
+                         num_heads,
+                         head_size,
+                         dtype=dtype,
+                         device='cuda')
    attention_ops.single_query_cached_kv_attention(
        output,
        query,
        key_cache,
        value_cache,
+        head_mapping,
        scale,
        block_tables,
        context_lens,
        block_size,
        max_context_len,
+        None,  # ALiBi slopes.
    )

    ref_output = torch.empty_like(query)
@ -235,9 +256,13 @@ def run_multi_query_kv_attention(
    seq_lens = random.sample(range(1, MAX_SEQ_LEN), num_seqs)
    num_tokens = sum(seq_lens)

-    scale = float(1.0 / (head_size ** 0.5))
-    qkv = torch.empty(
-        num_tokens, 3, num_heads, head_size, dtype=dtype, device='cuda')
+    scale = float(1.0 / (head_size**0.5))
+    qkv = torch.empty(num_tokens,
+                      3,
+                      num_heads,
+                      head_size,
+                      dtype=dtype,
+                      device='cuda')
    qkv.uniform_(-1e-3, 1e-3)
    query, key, value = qkv.unbind(dim=1)

@ -272,7 +297,7 @@ def test_single_query_cached_kv_attention() -> None:
    torch.cuda.manual_seed(TEST_SEED)
    for dtype in [torch.half, torch.bfloat16, torch.float]:
        for block_size in [8, 16, 32]:
-            for head_size in [64, 80, 96, 128]:
+            for head_size in [64, 80, 96, 112, 128, 256]:
                print(f'Testing single_query_cached_kv_attention with '
                      f'dtype={dtype}, block_size={block_size}, '
                      f'head_size={head_size}')
@ -290,7 +315,7 @@ def test_multi_query_kv_attention() -> None:
    torch.random.manual_seed(TEST_SEED)
    torch.cuda.manual_seed(TEST_SEED)
    for dtype in [torch.half, torch.bfloat16, torch.float]:
-        for head_size in [64, 80, 96, 128]:
+        for head_size in [64, 80, 96, 112, 128, 256]:
            print(f'Testing multi_query_kv_attention with dtype={dtype}, '
                  f'head_size={head_size}')
            run_multi_query_kv_attention(
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@ -26,8 +26,9 @@ def run_copy_blocks(
    key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
    key_caches = []
    for _ in range(num_layers):
-        key_cache = torch.randn(
-            size=key_cache_shape, dtype=dtype, device='cuda')
+        key_cache = torch.randn(size=key_cache_shape,
+                                dtype=dtype,
+                                device='cuda')
        key_caches.append(key_cache)
    cloned_key_caches = []
    for key_cache in key_caches:
@ -36,8 +37,9 @@ def run_copy_blocks(
    value_cache_shape = (num_blocks, num_heads, head_size, block_size)
    value_caches = []
    for _ in range(num_layers):
-        value_cache = torch.randn(
-            size=value_cache_shape, dtype=dtype, device='cuda')
+        value_cache = torch.randn(size=value_cache_shape,
+                                  dtype=dtype,
+                                  device='cuda')
        value_caches.append(value_cache)
    cloned_value_caches = []
    for value_cache in value_caches:
@ -49,15 +51,18 @@ def run_copy_blocks(
    # Reference implementation.
    for src, dsts in block_mapping.items():
        for dst in dsts:
-            for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
+            for key_cache, cloned_key_cache in zip(key_caches,
+                                                   cloned_key_caches):
                cloned_key_cache[dst] = cloned_key_cache[src]
-            for value_cache, cloned_value_cache in zip(value_caches, cloned_value_caches):
+            for value_cache, cloned_value_cache in zip(value_caches,
+                                                       cloned_value_caches):
                cloned_value_cache[dst] = cloned_value_cache[src]

    # Compare the results.
    for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches):
        assert torch.allclose(key_cache, cloned_key_cache)
-    for value_cache, cloned_value_cache in zip(value_caches, cloned_value_caches):
+    for value_cache, cloned_value_cache in zip(value_caches,
+                                               cloned_value_caches):
        assert torch.allclose(value_cache, cloned_value_cache)


@ -74,8 +79,12 @@ def run_reshape_and_cache(
    slot_mapping = random.sample(range(num_slots), num_tokens)
    slot_mapping = torch.tensor(slot_mapping, dtype=torch.int, device='cuda')

-    qkv = torch.randn(
-        num_tokens, 3, num_heads, head_size, dtype=dtype, device='cuda')
+    qkv = torch.randn(num_tokens,
+                      3,
+                      num_heads,
+                      head_size,
+                      dtype=dtype,
+                      device='cuda')
    _, key, value = qkv.unbind(dim=1)

    x = 16 // torch.tensor([], dtype=dtype).element_size()
@ -84,15 +93,19 @@ def run_reshape_and_cache(
    cloned_key_cache = key_cache.clone()

    value_cache_shape = (num_blocks, num_heads, head_size, block_size)
-    value_cache = torch.randn(
-        size=value_cache_shape, dtype=dtype, device='cuda')
+    value_cache = torch.randn(size=value_cache_shape,
+                              dtype=dtype,
+                              device='cuda')
    cloned_value_cache = value_cache.clone()

-    cache_ops.reshape_and_cache(key, value, key_cache, value_cache, slot_mapping)
+    cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
+                                slot_mapping)

    for i in range(num_tokens):
        reshaped_key = key.reshape(num_tokens, num_heads, head_size // x, x)
-        block_idx = torch.div(slot_mapping[i], block_size, rounding_mode='floor')
+        block_idx = torch.div(slot_mapping[i],
+                              block_size,
+                              rounding_mode='floor')
        block_offset = slot_mapping[i] % block_size
        cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i]
        cloned_value_cache[block_idx, :, :, block_offset] = value[i]
@ -114,8 +127,12 @@ def run_gather_cached_kv(
    slot_mapping = random.sample(range(num_slots), num_tokens)
    slot_mapping = torch.tensor(slot_mapping, dtype=torch.int, device='cuda')

-    qkv = torch.randn(
-        num_tokens, 3, num_heads, head_size, dtype=dtype, device='cuda')
+    qkv = torch.randn(num_tokens,
+                      3,
+                      num_heads,
+                      head_size,
+                      dtype=dtype,
+                      device='cuda')
    _, key, value = qkv.unbind(dim=1)

    qkv_clone = qkv.clone()
@ -126,15 +143,20 @@ def run_gather_cached_kv(
    key_cache = torch.randn(size=key_cache_shape, dtype=dtype, device='cuda')

    value_cache_shape = (num_blocks, num_heads, head_size, block_size)
-    value_cache = torch.randn(
-        size=value_cache_shape, dtype=dtype, device='cuda')
+    value_cache = torch.randn(size=value_cache_shape,
+                              dtype=dtype,
+                              device='cuda')

-    cache_ops.gather_cached_kv(key, value, key_cache, value_cache, slot_mapping)
+    cache_ops.gather_cached_kv(key, value, key_cache, value_cache,
+                               slot_mapping)

    # Reference implementation.
    for i in range(num_tokens):
-        reshaped_key = cloned_key.reshape(num_tokens, num_heads, head_size // x, x)
-        block_idx = torch.div(slot_mapping[i], block_size, rounding_mode='floor')
+        reshaped_key = cloned_key.reshape(num_tokens, num_heads,
+                                          head_size // x, x)
+        block_idx = torch.div(slot_mapping[i],
+                              block_size,
+                              rounding_mode='floor')
        block_offset = slot_mapping[i] % block_size
        reshaped_key[i] = key_cache[block_idx, :, :, block_offset, :]
        cloned_value[i] = value_cache[block_idx, :, :, block_offset]
@ -145,20 +167,30 @@ def run_gather_cached_kv(

 def test_copy_blocks() -> None:
    for dtype in [torch.half, torch.bfloat16, torch.float]:
-        run_copy_blocks(
-            num_mappings=23, num_layers=7, num_heads=17, head_size=16,
-            block_size=8, num_blocks=1024, dtype=dtype)
+        run_copy_blocks(num_mappings=23,
+                        num_layers=7,
+                        num_heads=17,
+                        head_size=16,
+                        block_size=8,
+                        num_blocks=1024,
+                        dtype=dtype)


 def test_reshape_and_cache() -> None:
    for dtype in [torch.half, torch.bfloat16, torch.float]:
-        run_reshape_and_cache(
-            num_tokens=3, num_heads=2, head_size=16, block_size=8, num_blocks=2,
-            dtype=dtype)
+        run_reshape_and_cache(num_tokens=3,
+                              num_heads=2,
+                              head_size=16,
+                              block_size=8,
+                              num_blocks=2,
+                              dtype=dtype)


 def test_gather_cached_kv() -> None:
    for dtype in [torch.half, torch.bfloat16, torch.float]:
-        run_gather_cached_kv(
-            num_tokens=3, num_heads=2, head_size=16, block_size=8, num_blocks=2,
-            dtype=dtype)
+        run_gather_cached_kv(num_tokens=3,
+                             num_heads=2,
+                             head_size=16,
+                             block_size=8,
+                             num_blocks=2,
+                             dtype=dtype)
--- a/tests/kernels/test_layernorm.py
+++ b/tests/kernels/test_layernorm.py
@ -14,8 +14,10 @@ class RefRMSNorm(nn.Module):
        self.variance_epsilon = eps

    def forward(self, hidden_states):
-        variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1,
+                                                               keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance +
+                                                    self.variance_epsilon)
        if self.weight.dtype in [torch.half, torch.float16, torch.bfloat16]:
            hidden_states = hidden_states.to(self.weight.dtype)
        return self.weight * hidden_states
--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
@ -8,8 +8,8 @@ from vllm import pos_encoding_ops


 def rotate_half(x: torch.Tensor) -> torch.Tensor:
-    x1 = x[..., : x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2 :]
+    x1 = x[..., :x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2:]
    return torch.cat((-x2, x1), dim=-1)


@ -38,7 +38,7 @@ class RefRotaryEmbeddingNeox(nn.Module):
        self.max_position_embeddings = max_position_embeddings

        # Create cos and sin embeddings.
-        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2) / dim))
+        inv_freq = 1.0 / (base**(torch.arange(0, dim, 2) / dim))
        t = torch.arange(max_position_embeddings).float()
        freqs = torch.einsum("i,j->ij", t, inv_freq.float())
        emb = torch.cat((freqs, freqs), dim=-1)
@ -49,16 +49,15 @@ class RefRotaryEmbeddingNeox(nn.Module):

    def forward(
        self,
-        positions: torch.Tensor,        # [num_tokens]
-        query: torch.Tensor,            # [num_tokens, num_heads, head_size]
-        key: torch.Tensor,              # [num_tokens, num_heads, head_size]
+        positions: torch.Tensor,  # [num_tokens]
+        query: torch.Tensor,  # [num_tokens, num_heads, head_size]
+        key: torch.Tensor,  # [num_tokens, num_heads, head_size]
    ) -> Tuple[torch.Tensor, torch.Tensor]:

-        query_rot = query[..., : self.rotary_dim]
-        query_pass = query[..., self.rotary_dim :]
-        key_rot = key[..., : self.rotary_dim]
-        key_pass = key[..., self.rotary_dim :]
-
+        query_rot = query[..., :self.rotary_dim]
+        query_pass = query[..., self.rotary_dim:]
+        key_rot = key[..., :self.rotary_dim]
+        key_pass = key[..., self.rotary_dim:]

        query_rot = query_rot.transpose(0, 1)
        key_rot = key_rot.transpose(0, 1)
@ -85,12 +84,18 @@ def run_rotary_embedding_neox(
    dtype: torch.dtype,
    base: int = 10000,
 ) -> None:
-    positions = torch.randint(0, max_position, (num_tokens,), device='cuda')
-    query = torch.randn(num_tokens, num_heads * head_size, dtype=dtype, device='cuda')
-    key = torch.randn(num_tokens, num_heads * head_size, dtype=dtype, device='cuda')
+    positions = torch.randint(0, max_position, (num_tokens, ), device='cuda')
+    query = torch.randn(num_tokens,
+                        num_heads * head_size,
+                        dtype=dtype,
+                        device='cuda')
+    key = torch.randn(num_tokens,
+                      num_heads * head_size,
+                      dtype=dtype,
+                      device='cuda')

    # Create the rotary embedding.
-    inv_freq = 1.0 / (base ** (torch.arange(0, rotary_dim, 2) / rotary_dim))
+    inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2) / rotary_dim))
    t = torch.arange(max_position).float()
    freqs = torch.einsum('i,j -> ij', t, inv_freq.float())
    cos = freqs.cos()
--- a/vllm/init.py
+++ b/vllm/init.py
@ -1,3 +1,5 @@
+"""vLLM: a high-throughput and memory-efficient inference engine for LLMs"""
+
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.engine.llm_engine import LLMEngine
@ -6,7 +8,7 @@ from vllm.entrypoints.llm import LLM
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.sampling_params import SamplingParams

-__version__ = "0.1.1"
+__version__ = "0.1.4"

 __all__ = [
    "LLM",
--- a/vllm/block.py
+++ b/vllm/block.py
@ -35,7 +35,8 @@ class LogicalTokenBlock:

    def append_tokens(self, token_ids: List[int]) -> None:
        assert len(token_ids) <= self.get_num_empty_slots()
-        self.token_ids[self.num_tokens:self.num_tokens + len(token_ids)] = token_ids
+        curr_idx = self.num_tokens
+        self.token_ids[curr_idx:curr_idx + len(token_ids)] = token_ids
        self.num_tokens += len(token_ids)

    def get_token_ids(self) -> List[int]:
--- a/vllm/config.py
+++ b/vllm/config.py
@ -1,14 +1,15 @@
 from typing import Optional

 import torch
-from transformers import AutoConfig, PretrainedConfig
+from transformers import PretrainedConfig

 from vllm.logger import init_logger
+from vllm.transformers_utils.config import get_config
 from vllm.utils import get_cpu_memory

 logger = init_logger(__name__)

-_GiB = 1 << 30
+_GB = 1 << 30


 class ModelConfig:
@ -16,6 +17,11 @@ class ModelConfig:

    Args:
        model: Name or path of the huggingface model to use.
+        tokenizer: Name or path of the huggingface tokenizer to use.
+        tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
+            available, and "slow" will always use the slow tokenizer.
+        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
+            downloading the model and tokenizer.
        download_dir: Directory to download and load the weights, default to the
            default cache directory of huggingface.
        use_np_weights: Save a numpy copy of model weights for faster loading.
@ -30,6 +36,9 @@ class ModelConfig:
    def __init__(
        self,
        model: str,
+        tokenizer: str,
+        tokenizer_mode: str,
+        trust_remote_code: bool,
        download_dir: Optional[str],
        use_np_weights: bool,
        use_dummy_weights: bool,
@ -37,13 +46,25 @@ class ModelConfig:
        seed: int,
    ) -> None:
        self.model = model
+        self.tokenizer = tokenizer
+        self.tokenizer_mode = tokenizer_mode
+        self.trust_remote_code = trust_remote_code
        self.download_dir = download_dir
        self.use_np_weights = use_np_weights
        self.use_dummy_weights = use_dummy_weights
        self.seed = seed

-        self.hf_config: PretrainedConfig = AutoConfig.from_pretrained(model)
+        self.hf_config = get_config(model, trust_remote_code)
        self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
+        self._verify_tokenizer_mode()
+
+    def _verify_tokenizer_mode(self) -> None:
+        tokenizer_mode = self.tokenizer_mode.lower()
+        if tokenizer_mode not in ["auto", "slow"]:
+            raise ValueError(
+                f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
+                "either 'auto' or 'slow'.")
+        self.tokenizer_mode = tokenizer_mode

    def verify_with_parallel_config(
        self,
@ -73,9 +94,48 @@ class ModelConfig:
        return self.hf_config.hidden_size // self.hf_config.num_attention_heads

    def get_num_heads(self, parallel_config: "ParallelConfig") -> int:
+        # For GPTBigCode & Falcon:
+        # Note: for falcon, when new_decoder_architecture is True, the
+        # multi_query flag is ignored and we use n_head_kv for the number of
+        # KV heads.
+        new_decoder_arch_falcon = (
+            self.hf_config.model_type == "falcon"
+            and getattr(self.hf_config, "new_decoder_architecture", False))
+        if not new_decoder_arch_falcon and getattr(self.hf_config,
+                                                   "multi_query", False):
+            # Multi-query attention, only one KV head.
+            return 1
+        # For Falcon:
+        if getattr(self.hf_config, "n_head_kv", None) is not None:
+            return (self.hf_config.n_head_kv //
+                    parallel_config.tensor_parallel_size)
+        # For LLaMA-2:
+        if getattr(self.hf_config, "num_key_value_heads", None) is not None:
+            return (self.hf_config.num_key_value_heads //
+                    parallel_config.tensor_parallel_size)
        total_num_attention_heads = self.hf_config.num_attention_heads
        return total_num_attention_heads // parallel_config.tensor_parallel_size

+    def get_max_model_len(self) -> int:
+        max_model_len = float("inf")
+        possible_keys = [
+            # OPT
+            "max_position_embeddings",
+            # GPT-2
+            "n_positions",
+            # MPT
+            "max_seq_len",
+            # Others
+            "max_sequence_length",
+            "max_seq_length",
+            "seq_len",
+        ]
+        for key in possible_keys:
+            max_len_key = getattr(self.hf_config, key, None)
+            if max_len_key is not None:
+                max_model_len = min(max_model_len, max_len_key)
+        return max_model_len
+
    def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
        total_num_hidden_layers = self.hf_config.num_hidden_layers
        return total_num_hidden_layers // parallel_config.pipeline_parallel_size
@ -90,6 +150,7 @@ class CacheConfig:
            vLLM execution.
        swap_space: Size of the CPU swap space per GPU (in GiB).
    """
+
    def __init__(
        self,
        block_size: int,
@ -98,7 +159,7 @@ class CacheConfig:
    ) -> None:
        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization
-        self.swap_space_bytes = swap_space * _GiB
+        self.swap_space_bytes = swap_space * _GB
        self._verify_args()

        # Will be set after profiling.
@ -121,14 +182,13 @@ class CacheConfig:
        num_gpus_per_node = parallel_config.tensor_parallel_size
        cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node

-        msg = (
-            f"{cpu_memory_usage / _GiB:.2f} GiB out of "
-            f"the {total_cpu_memory / _GiB:.2f} GiB total CPU memory is "
-            "allocated for the swap space.")
+        msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
+               f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
+               "allocated for the swap space.")
        if cpu_memory_usage > 0.7 * total_cpu_memory:
            raise ValueError("Too large swap space. " + msg)
        elif cpu_memory_usage > 0.4 * total_cpu_memory:
-            logger.warn("Possibly too large swap space. " + msg)
+            logger.warning("Possibly too large swap space. " + msg)


 class ParallelConfig:
@ -141,6 +201,7 @@ class ParallelConfig:
            True if either pipeline_parallel_size or tensor_parallel_size is
            greater than 1.
    """
+
    def __init__(
        self,
        pipeline_parallel_size: int,
@ -170,14 +231,15 @@ class SchedulerConfig:
            a single iteration.
        max_num_seqs: Maximum number of sequences to be processed in a single
            iteration.
+        max_model_len: Maximum length of a sequence (including prompt
+            and generated text).
    """
-    def __init__(
-        self,
-        max_num_batched_tokens: int,
-        max_num_seqs: int,
-    ) -> None:
+
+    def __init__(self, max_num_batched_tokens: int, max_num_seqs: int,
+                 max_model_len: int) -> None:
        self.max_num_batched_tokens = max_num_batched_tokens
        self.max_num_seqs = max_num_seqs
+        self.max_model_len = max_model_len


 _STR_DTYPE_TO_TORCH_DTYPE = {
@ -221,7 +283,7 @@ def _get_and_verify_dtype(
            pass
        else:
            # Casting between float16 and bfloat16 is allowed with a warning.
-            logger.warn(f"Casting {config_dtype} to {torch_dtype}.")
+            logger.warning(f"Casting {config_dtype} to {torch_dtype}.")

    # Check if the GPU supports the dtype.
    if torch_dtype == torch.bfloat16:
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@ -27,8 +27,9 @@ class BlockAllocator:
        # Initialize the free blocks.
        self.free_blocks: List[PhysicalTokenBlock] = []
        for i in range(num_blocks):
-            block = PhysicalTokenBlock(
-                device=device, block_number=i, block_size=block_size)
+            block = PhysicalTokenBlock(device=device,
+                                       block_number=i,
+                                       block_size=block_size)
            self.free_blocks.append(block)

    def allocate(self) -> PhysicalTokenBlock:
@ -84,10 +85,12 @@ class BlockSpaceManager:
        num_required_blocks = len(seq.logical_token_blocks)
        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
        # Use watermark to avoid frequent cache eviction.
-        return num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks
+        return (num_free_gpu_blocks - num_required_blocks >=
+                self.watermark_blocks)

    def allocate(self, seq_group: SequenceGroup) -> None:
-        # NOTE: Here we assume that all sequences in the group have the same prompt.
+        # NOTE: Here we assume that all sequences in the group have the same
+        # prompt.
        seq = seq_group.get_seqs()[0]

        # Allocate new physical token blocks that will store the prompt tokens.
@ -143,7 +146,8 @@ class BlockSpaceManager:
        for block in src_block_table:
            block.ref_count += 1

-    def _get_physical_blocks(self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]:
+    def _get_physical_blocks(
+            self, seq_group: SequenceGroup) -> List[PhysicalTokenBlock]:
        # NOTE: Here, we assume that the physical blocks are only shared by
        # the sequences in the same group.
        blocks: Set[PhysicalTokenBlock] = set()
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@ -12,8 +12,6 @@ from vllm.sequence import (Sequence, SequenceData, SequenceGroup,

 logger = init_logger(__name__)

-_LOGGING_INTERVAL_SEC = 5
-

 class PreemptionMode(enum.Enum):
    """Preemption modes.
@ -32,20 +30,28 @@ class SchedulerOutputs:

    def __init__(
        self,
+        scheduled_seq_groups: List[SequenceGroup],
+        prompt_run: bool,
+        num_batched_tokens: int,
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
        blocks_to_copy: Dict[int, List[int]],
+        ignored_seq_groups: List[SequenceGroup],
    ) -> None:
+        self.scheduled_seq_groups = scheduled_seq_groups
+        self.prompt_run = prompt_run
+        self.num_batched_tokens = num_batched_tokens
        self.blocks_to_swap_in = blocks_to_swap_in
        self.blocks_to_swap_out = blocks_to_swap_out
        self.blocks_to_copy = blocks_to_copy
        # Swap in and swap out should never happen at the same time.
        assert not (blocks_to_swap_in and blocks_to_swap_out)
+        self.ignored_seq_groups = ignored_seq_groups

    def is_empty(self) -> bool:
-        return (not self.blocks_to_swap_in
-                and not self.blocks_to_swap_out
-                and not self.blocks_to_copy)
+        # NOTE: We do not consider the ignored sequence groups.
+        return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
+                and not self.blocks_to_swap_out and not self.blocks_to_copy)


 class Scheduler:
@ -54,14 +60,12 @@ class Scheduler:
        self,
        scheduler_config: SchedulerConfig,
        cache_config: CacheConfig,
-        log_stats: bool,
    ) -> None:
        self.scheduler_config = scheduler_config
        self.cache_config = cache_config
-        self.log_stats = log_stats

        # Instantiate the scheduling policy.
-        self.policy = PolicyFactory.get_policy(policy_name='fcfs')
+        self.policy = PolicyFactory.get_policy(policy_name="fcfs")
        # Create the block space manager.
        self.block_manager = BlockSpaceManager(
            block_size=self.cache_config.block_size,
@ -76,10 +80,6 @@ class Scheduler:
        # Sequence groups in the SWAPPED state.
        self.swapped: List[SequenceGroup] = []

-        self.last_logging_time: float = 0.0
-        # List[timestamp, num_tokens]
-        self.num_input_tokens: List[Tuple[float, int]] = []
-
    def add_seq_group(self, seq_group: SequenceGroup) -> None:
        # Add sequence groups to the waiting queue.
        self.waiting.append(seq_group)
@ -102,7 +102,7 @@ class Scheduler:
    def get_num_unfinished_seq_groups(self) -> int:
        return len(self.waiting) + len(self.running) + len(self.swapped)

-    def _schedule(self) -> Tuple[SchedulerOutputs, List[str]]:
+    def _schedule(self) -> SchedulerOutputs:
        # Blocks that need to be swaped or copied before model execution.
        blocks_to_swap_in: Dict[int, int] = {}
        blocks_to_swap_out: Dict[int, int] = {}
@ -111,10 +111,71 @@ class Scheduler:
        # Fix the current time.
        now = time.time()

-        # NOTE(woosuk): We prioritize the sequence groups in the RUNNING state
-        # in order to minimize the preemption overheads.
-        # Preemption happens only when there is no available slot to keep all
-        # the sequence groups in the RUNNING state.
+        # Join waiting sequences if possible.
+        if not self.swapped:
+            ignored_seq_groups: List[SequenceGroup] = []
+            scheduled: List[SequenceGroup] = []
+            num_batched_tokens = 0
+            # Optimization: We do not sort the waiting queue since the preempted
+            # sequence groups are added to the front and the new sequence groups
+            # are added to the back.
+            while self.waiting:
+                seq_group = self.waiting[0]
+
+                num_prompt_tokens = seq_group.get_seqs()[0].get_len()
+                prompt_limit = min(
+                    self.scheduler_config.max_model_len,
+                    self.scheduler_config.max_num_batched_tokens)
+                if num_prompt_tokens > prompt_limit:
+                    logger.warning(
+                        f"Input prompt ({num_prompt_tokens} tokens) is too long"
+                        f" and exceeds limit of {prompt_limit}")
+                    for seq in seq_group.get_seqs():
+                        seq.status = SequenceStatus.FINISHED_IGNORED
+                    ignored_seq_groups.append(seq_group)
+                    self.waiting.pop(0)
+                    break
+
+                # If the sequence group cannot be allocated, stop.
+                if not self.block_manager.can_allocate(seq_group):
+                    break
+
+                # If the number of batched tokens exceeds the limit, stop.
+                if (num_batched_tokens + num_prompt_tokens >
+                        self.scheduler_config.max_num_batched_tokens):
+                    break
+
+                # The total number of sequences in the RUNNING state should not
+                # exceed the maximum number of sequences.
+                num_new_seqs = seq_group.num_seqs(
+                    status=SequenceStatus.WAITING)
+                num_curr_seqs = sum(
+                    seq_group.num_seqs(status=SequenceStatus.RUNNING)
+                    for seq_group in self.running)
+                if (num_curr_seqs + num_new_seqs >
+                        self.scheduler_config.max_num_seqs):
+                    break
+
+                seq_group = self.waiting.pop(0)
+                self._allocate(seq_group)
+                self.running.append(seq_group)
+                num_batched_tokens += num_prompt_tokens
+                scheduled.append(seq_group)
+
+            if scheduled:
+                scheduler_outputs = SchedulerOutputs(
+                    scheduled_seq_groups=scheduled,
+                    prompt_run=True,
+                    num_batched_tokens=num_batched_tokens,
+                    blocks_to_swap_in=blocks_to_swap_in,
+                    blocks_to_swap_out=blocks_to_swap_out,
+                    blocks_to_copy=blocks_to_copy,
+                    ignored_seq_groups=ignored_seq_groups,
+                )
+                return scheduler_outputs
+
+        # NOTE(woosuk): Preemption happens only when there is no available slot
+        # to keep all the sequence groups in the RUNNING state.
        # In this case, the policy is responsible for deciding which sequence
        # groups to preempt.
        self.running = self.policy.sort_by_priority(now, self.running)
@ -156,8 +217,11 @@ class Scheduler:
            # The total number of sequences in the RUNNING state should not
            # exceed the maximum number of sequences.
            num_new_seqs = seq_group.num_seqs(status=SequenceStatus.SWAPPED)
-            num_curr_seqs = len(self.running)
-            if num_curr_seqs + num_new_seqs > self.scheduler_config.max_num_seqs:
+            num_curr_seqs = sum(
+                seq_group.num_seqs(status=SequenceStatus.RUNNING)
+                for seq_group in self.running)
+            if (num_curr_seqs + num_new_seqs >
+                    self.scheduler_config.max_num_seqs):
                break

            seq_group = self.swapped.pop(0)
@ -167,106 +231,28 @@ class Scheduler:

        num_batched_tokens = sum(
            seq_group.num_seqs(status=SequenceStatus.RUNNING)
-            for seq_group in self.running
-        )
-
-        # Join waiting sequences if possible.
-        prompt_group_ids: List[str] = []
-        # NOTE(woosuk): The sequence groups in the SWAPPED state are strictly
-        # prioritized over the sequence groups in the WAITING state.
-        # This is because we want to bound the amount of CPU memory taken by
-        # the swapped sequence groups.
-        if not self.swapped:
-            # Optimization: We do not sort the waiting queue since the preempted
-            # sequence groups are added to the front and the new sequence groups
-            # are added to the back.
-            while self.waiting:
-                seq_group = self.waiting[0]
-                # If the sequence group has been preempted in this step, stop.
-                if seq_group in preempted:
-                    break
-                # If the sequence group cannot be allocated, stop.
-                if not self.block_manager.can_allocate(seq_group):
-                    break
-
-                # If the number of batched tokens exceeds the limit, stop.
-                num_prompt_tokens = seq_group.get_seqs()[0].get_len()
-                if (num_batched_tokens + num_prompt_tokens
-                    > self.scheduler_config.max_num_batched_tokens):
-                    break
-
-                # The total number of sequences in the RUNNING state should not
-                # exceed the maximum number of sequences.
-                num_new_seqs = seq_group.num_seqs(status=SequenceStatus.WAITING)
-                num_curr_seqs = len(self.running)
-                if num_curr_seqs + num_new_seqs > self.scheduler_config.max_num_seqs:
-                    break
-
-                seq_group = self.waiting.pop(0)
-                self._allocate(seq_group)
-                self.running.append(seq_group)
-                num_batched_tokens += num_prompt_tokens
-                prompt_group_ids.append(seq_group.request_id)
+            for seq_group in self.running)

        scheduler_outputs = SchedulerOutputs(
+            scheduled_seq_groups=self.running,
+            prompt_run=False,
+            num_batched_tokens=num_batched_tokens,
            blocks_to_swap_in=blocks_to_swap_in,
            blocks_to_swap_out=blocks_to_swap_out,
            blocks_to_copy=blocks_to_copy,
+            ignored_seq_groups=[],
        )
-        if not self.log_stats:
-            return scheduler_outputs, prompt_group_ids
-
-        # TODO(woosuk): Move the below code to the engine.
-        now = time.time()
-        if num_batched_tokens > 0:
-            self.num_input_tokens.append((now, num_batched_tokens))
-        elapsed_time = now - self.last_logging_time
-        if elapsed_time > _LOGGING_INTERVAL_SEC:
-            self.last_logging_time = now
-            self.num_input_tokens = [
-                (t, n) for t, n in self.num_input_tokens
-                if now - t < _LOGGING_INTERVAL_SEC
-            ]
-            if len(self.num_input_tokens) > 1:
-                total_num_tokens = sum(n for _, n in self.num_input_tokens[:-1])
-                window = now - self.num_input_tokens[0][0]
-                avg_throughput = total_num_tokens / window
-            else:
-                avg_throughput = 0.0
-
-            total_num_gpu_blocks = self.cache_config.num_gpu_blocks
-            num_free_gpu_blocks = self.block_manager.get_num_free_gpu_blocks()
-            num_used_gpu_blocks = total_num_gpu_blocks - num_free_gpu_blocks
-            gpu_cache_usage = num_used_gpu_blocks / total_num_gpu_blocks
-
-            total_num_cpu_blocks = self.cache_config.num_cpu_blocks
-            if total_num_cpu_blocks > 0:
-                num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks()
-                num_used_cpu_blocks = total_num_cpu_blocks - num_free_cpu_blocks
-                cpu_cache_usage = num_used_cpu_blocks / total_num_cpu_blocks
-            else:
-                cpu_cache_usage = 0.0
-
-            logger.info(
-                f"Throughput: {avg_throughput:.1f} tokens/s, "
-                f"Running: {len(self.running)} reqs, "
-                f"Swapped: {len(self.swapped)} reqs, "
-                f"Pending: {len(self.waiting)} reqs, "
-                f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, "
-                f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%")
-        return scheduler_outputs, prompt_group_ids
+        return scheduler_outputs

    def schedule(self) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs]:
        # Schedule sequence groups.
        # This function call changes the internal states of the scheduler
        # such as self.running, self.swapped, and self.waiting.
-        scheduler_outputs, prompt_group_ids = self._schedule()
+        scheduler_outputs = self._schedule()

        # Create input data structures.
        seq_group_metadata_list: List[SequenceGroupMetadata] = []
-        for seq_group in self.running:
-            is_prompt = seq_group.request_id in prompt_group_ids
-
+        for seq_group in scheduler_outputs.scheduled_seq_groups:
            seq_data: Dict[int, List[SequenceData]] = {}
            block_tables: Dict[int, List[int]] = {}
            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
@ -276,7 +262,7 @@ class Scheduler:

            seq_group_metadata = SequenceGroupMetadata(
                request_id=seq_group.request_id,
-                is_prompt=is_prompt,
+                is_prompt=scheduler_outputs.prompt_run,
                seq_data=seq_data,
                sampling_params=seq_group.sampling_params,
                block_tables=block_tables,
@ -288,14 +274,21 @@ class Scheduler:
        self,
        seq_outputs: Dict[int, SequenceOutputs],
    ) -> List[SequenceGroup]:
-        # Update the running sequences and free blocks.
+        scheduled: List[SequenceGroup] = []
        for seq_group in self.running:
+            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
+                if seq.seq_id in seq_outputs:
+                    scheduled.append(seq_group)
+                    break
+
+        # Update the scheduled sequences and free blocks.
+        for seq_group in scheduled:
            # Process beam search results before processing the new tokens.
            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
                output = seq_outputs[seq.seq_id]
                if seq.seq_id != output.parent_seq_id:
-                    # The sequence is a fork of the parent sequence (beam search).
-                    # Free the current sequence.
+                    # The sequence is a fork of the parent sequence (beam
+                    # search). Free the current sequence.
                    self.block_manager.free(seq)
                    # Fork the parent sequence.
                    parent_seq = seq_group.find(output.parent_seq_id)
@ -307,9 +300,7 @@ class Scheduler:
                # Append a new token to the sequence.
                output = seq_outputs[seq.seq_id]
                seq.append_token_id(output.output_token, output.logprobs)
-        # Return a shallow copy of the running queue to prevent the queue
-        # from being modified by the caller.
-        return self.running.copy()
+        return scheduled

    def free_seq(self, seq: Sequence, finish_status: SequenceStatus) -> None:
        seq.status = finish_status
@ -368,7 +359,7 @@ class Scheduler:
        elif preemption_mode == PreemptionMode.SWAP:
            self._preempt_by_swap(seq_group, blocks_to_swap_out)
        else:
-            assert False, 'Invalid preemption mode.'
+            assert False, "Invalid preemption mode."

    def _preempt_by_recompute(
        self,
@ -388,9 +379,6 @@ class Scheduler:
        seq_group: SequenceGroup,
        blocks_to_swap_out: Dict[int, int],
    ) -> None:
-        seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
-        for seq in seqs:
-            seq.status = SequenceStatus.SWAPPED
        self._swap_out(seq_group, blocks_to_swap_out)
        self.swapped.append(seq_group)

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@ -11,10 +11,13 @@ from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
 class EngineArgs:
    """Arguments for vLLM engine."""
    model: str
+    tokenizer: Optional[str] = None
+    tokenizer_mode: str = 'auto'
+    trust_remote_code: bool = False
    download_dir: Optional[str] = None
    use_np_weights: bool = False
    use_dummy_weights: bool = False
-    dtype: str = "auto"
+    dtype: str = 'auto'
    seed: int = 0
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
@ -27,72 +30,110 @@ class EngineArgs:
    disable_log_stats: bool = False

    def __post_init__(self):
+        if self.tokenizer is None:
+            self.tokenizer = self.model
        self.max_num_seqs = min(self.max_num_seqs, self.max_num_batched_tokens)

    @staticmethod
    def add_cli_args(
-        parser: argparse.ArgumentParser,
-    ) -> argparse.ArgumentParser:
+            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for vLLM engine."""
        # Model arguments
-        parser.add_argument('--model', type=str, default='facebook/opt-125m',
-                            help='name or path of the huggingface model to use')
-        parser.add_argument('--download-dir', type=str,
+        parser.add_argument(
+            '--model',
+            type=str,
+            default='facebook/opt-125m',
+            help='name or path of the huggingface model to use')
+        parser.add_argument(
+            '--tokenizer',
+            type=str,
+            default=EngineArgs.tokenizer,
+            help='name or path of the huggingface tokenizer to use')
+        parser.add_argument('--tokenizer-mode',
+                            type=str,
+                            default=EngineArgs.tokenizer_mode,
+                            choices=['auto', 'slow'],
+                            help='tokenizer mode. "auto" will use the fast '
+                            'tokenizer if available, and "slow" will '
+                            'always use the slow tokenizer.')
+        parser.add_argument('--trust-remote-code',
+                            action='store_true',
+                            help='trust remote code from huggingface')
+        parser.add_argument('--download-dir',
+                            type=str,
                            default=EngineArgs.download_dir,
                            help='directory to download and load the weights, '
-                                 'default to the default cache dir of '
-                                 'huggingface')
-        parser.add_argument('--use-np-weights', action='store_true',
+                            'default to the default cache dir of '
+                            'huggingface')
+        parser.add_argument('--use-np-weights',
+                            action='store_true',
                            help='save a numpy copy of model weights for '
-                                 'faster loading. This can increase the disk '
-                                 'usage by up to 2x.')
-        parser.add_argument('--use-dummy-weights', action='store_true',
+                            'faster loading. This can increase the disk '
+                            'usage by up to 2x.')
+        parser.add_argument('--use-dummy-weights',
+                            action='store_true',
                            help='use dummy values for model weights')
        # TODO(woosuk): Support FP32.
-        parser.add_argument('--dtype', type=str, default=EngineArgs.dtype,
-                            choices=['auto', 'half', 'bfloat16', 'float'],
-                            help='data type for model weights and activations. '
-                                 'The "auto" option will use FP16 precision '
-                                 'for FP32 and FP16 models, and BF16 precision '
-                                 'for BF16 models.')
+        parser.add_argument(
+            '--dtype',
+            type=str,
+            default=EngineArgs.dtype,
+            choices=['auto', 'half', 'bfloat16', 'float'],
+            help='data type for model weights and activations. '
+            'The "auto" option will use FP16 precision '
+            'for FP32 and FP16 models, and BF16 precision '
+            'for BF16 models.')
        # Parallel arguments
-        parser.add_argument('--worker-use-ray', action='store_true',
+        parser.add_argument('--worker-use-ray',
+                            action='store_true',
                            help='use Ray for distributed serving, will be '
-                                 'automatically set when using more than 1 GPU')
-        parser.add_argument('--pipeline-parallel-size', '-pp', type=int,
+                            'automatically set when using more than 1 GPU')
+        parser.add_argument('--pipeline-parallel-size',
+                            '-pp',
+                            type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='number of pipeline stages')
-        parser.add_argument('--tensor-parallel-size', '-tp', type=int,
+        parser.add_argument('--tensor-parallel-size',
+                            '-tp',
+                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
        # KV cache arguments
-        parser.add_argument('--block-size', type=int,
+        parser.add_argument('--block-size',
+                            type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='token block size')
        # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
-        parser.add_argument('--seed', type=int, default=EngineArgs.seed,
+        parser.add_argument('--seed',
+                            type=int,
+                            default=EngineArgs.seed,
                            help='random seed')
-        parser.add_argument('--swap-space', type=int,
+        parser.add_argument('--swap-space',
+                            type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU')
-        parser.add_argument('--gpu-memory-utilization', type=float,
+        parser.add_argument('--gpu-memory-utilization',
+                            type=float,
                            default=EngineArgs.gpu_memory_utilization,
                            help='the percentage of GPU memory to be used for'
-                                 'the model executor')
-        parser.add_argument('--max-num-batched-tokens', type=int,
+                            'the model executor')
+        parser.add_argument('--max-num-batched-tokens',
+                            type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='maximum number of batched tokens per '
-                                 'iteration')
-        parser.add_argument('--max-num-seqs', type=int,
+                            'iteration')
+        parser.add_argument('--max-num-seqs',
+                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
-        parser.add_argument('--disable-log-stats', action='store_true',
+        parser.add_argument('--disable-log-stats',
+                            action='store_true',
                            help='disable logging statistics')
        return parser

    @classmethod
-    def from_cli_args(cls, args: argparse.Namespace) -> "EngineArgs":
+    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
@ -103,16 +144,20 @@ class EngineArgs:
        self,
    ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
        # Initialize the configs.
-        model_config = ModelConfig(
-            self.model, self.download_dir, self.use_np_weights,
-            self.use_dummy_weights, self.dtype, self.seed)
-        cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization,
+        model_config = ModelConfig(self.model, self.tokenizer,
+                                   self.tokenizer_mode, self.trust_remote_code,
+                                   self.download_dir, self.use_np_weights,
+                                   self.use_dummy_weights, self.dtype,
+                                   self.seed)
+        cache_config = CacheConfig(self.block_size,
+                                   self.gpu_memory_utilization,
                                   self.swap_space)
        parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                         self.tensor_parallel_size,
                                         self.worker_use_ray)
        scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
-                                           self.max_num_seqs)
+                                           self.max_num_seqs,
+                                           model_config.get_max_model_len())
        return model_config, cache_config, parallel_config, scheduler_config


@ -124,12 +169,13 @@ class AsyncEngineArgs(EngineArgs):

    @staticmethod
    def add_cli_args(
-        parser: argparse.ArgumentParser,
-    ) -> argparse.ArgumentParser:
+            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        parser = EngineArgs.add_cli_args(parser)
-        parser.add_argument('--engine-use-ray', action='store_true',
+        parser.add_argument('--engine-use-ray',
+                            action='store_true',
                            help='use Ray to start the LLM engine in a '
-                                 'separate process as the server process.')
-        parser.add_argument('--disable-log-requests', action='store_true',
+                            'separate process as the server process.')
+        parser.add_argument('--disable-log-requests',
+                            action='store_true',
                            help='disable logging requests')
        return parser
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@ -2,6 +2,7 @@ import asyncio
 import time
 from typing import Dict, List, Optional

+from vllm.config import ModelConfig
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.llm_engine import LLMEngine
 from vllm.engine.ray_utils import initialize_cluster, ray
@ -11,7 +12,7 @@ from vllm.sampling_params import SamplingParams

 logger = init_logger(__name__)

-TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds
+TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds


 class AsyncLLMEngine:
@ -35,8 +36,13 @@ class AsyncLLMEngine:
        log_requests: Whether to log the requests.
        *args, *kwargs: Arguments for LLMEngine.
    """
-    def __init__(self, worker_use_ray: bool, engine_use_ray: bool,
-                 log_requests: bool = True, *args, **kwargs) -> None:
+
+    def __init__(self,
+                 worker_use_ray: bool,
+                 engine_use_ray: bool,
+                 *args,
+                 log_requests: bool = True,
+                 **kwargs) -> None:
        self.worker_use_ray = worker_use_ray
        self.engine_use_ray = engine_use_ray
        self.log_requests = log_requests
@ -76,12 +82,11 @@ class AsyncLLMEngine:
            self.request_events[request_id].set()

    async def generate(
-        self,
-        prompt: Optional[str],
-        sampling_params: SamplingParams,
-        request_id: str,
-        prompt_token_ids: Optional[List[int]] = None
-    ) -> RequestOutput:
+            self,
+            prompt: Optional[str],
+            sampling_params: SamplingParams,
+            request_id: str,
+            prompt_token_ids: Optional[List[int]] = None) -> RequestOutput:
        """Generate outputs for a request.

        Generate outputs for a request. This method is a coroutine. It adds the
@ -117,14 +122,17 @@ class AsyncLLMEngine:
        # Add the request into the vLLM engine's waiting queue.
        if self.engine_use_ray:
            await self.engine.add_request.remote(
-                request_id, prompt, sampling_params,
+                request_id,
+                prompt,
+                sampling_params,
                prompt_token_ids=prompt_token_ids,
                arrival_time=arrival_time)
        else:
-            self.engine.add_request(
-                request_id, prompt, sampling_params,
-                prompt_token_ids=prompt_token_ids,
-                arrival_time=arrival_time)
+            self.engine.add_request(request_id,
+                                    prompt,
+                                    sampling_params,
+                                    prompt_token_ids=prompt_token_ids,
+                                    arrival_time=arrival_time)

        # The vLLM engine does not have a background loop that keeps
        # processing incoming requests. Therefore, we need to keep kicking
@ -136,7 +144,11 @@ class AsyncLLMEngine:

            # Kick the engine if the engine is not running.
            if not self.is_engine_running:
-                await self.engine_step(request_id)
+                try:
+                    await self.engine_step(request_id)
+                except RuntimeError as e:
+                    await self.abort(request_id)
+                    raise e

            # Wait for new output. The group_event will be set in engine_step
            # when there is new output available for the sequence group.
@ -199,20 +211,29 @@ class AsyncLLMEngine:
            self.is_engine_running = False
            self.kicking_request_id = None

+    async def get_model_config(self) -> ModelConfig:
+        """Get the model configuration of the vLLM engine."""
+        if self.engine_use_ray:
+            return await self.engine.get_model_config.remote()
+        else:
+            return self.engine.get_model_config()
+
    @classmethod
-    def from_engine_args(cls, engine_args: AsyncEngineArgs) -> "AsyncLLMEngine":
+    def from_engine_args(cls,
+                         engine_args: AsyncEngineArgs) -> "AsyncLLMEngine":
        """Creates an async LLM engine from the engine arguments."""
        # Create the engine configs.
        engine_configs = engine_args.create_engine_configs()
        parallel_config = engine_configs[2]
        # Initialize the cluster.
-        distributed_init_method, devices = initialize_cluster(
+        distributed_init_method, placement_group = initialize_cluster(
            parallel_config, engine_args.engine_use_ray)
        # Create the async LLM engine.
        engine = cls(engine_args.worker_use_ray,
                     engine_args.engine_use_ray,
-                     not engine_args.disable_log_requests,
                     *engine_configs,
-                     distributed_init_method, devices,
+                     distributed_init_method,
+                     placement_group,
+                     log_requests=not engine_args.disable_log_requests,
                     log_stats=not engine_args.disable_log_stats)
        return engine
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@ -1,21 +1,32 @@
 import time
-from typing import Any, List, Optional
+import copy
+from functools import partial
+from typing import Any, List, Optional, Tuple, TYPE_CHECKING

 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig)
 from vllm.core.scheduler import Scheduler
 from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.ray_utils import DeviceID, initialize_cluster, ray
-from vllm.engine.tokenizer_utils import detokenize_incrementally, get_tokenizer
+from vllm.engine.ray_utils import initialize_cluster, ray, RayWorker
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
+from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
+                                               get_tokenizer)
 from vllm.utils import Counter
-from vllm.worker.worker import Worker
+
+if ray:
+    from ray.air.util.torch_dist import init_torch_dist_process_group
+    from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+
+if TYPE_CHECKING:
+    from ray.util.placement_group import PlacementGroup

 logger = init_logger(__name__)

+_LOGGING_INTERVAL_SEC = 5
+

 class LLMEngine:
    """An LLM engine that receives requests and generates texts.
@ -53,19 +64,21 @@ class LLMEngine:
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        distributed_init_method: str,
-        stage_devices: List[List[DeviceID]],
+        placement_group: Optional["PlacementGroup"],
        log_stats: bool,
    ) -> None:
        logger.info(
            "Initializing an LLM engine with config: "
            f"model={model_config.model!r}, "
+            f"tokenizer={model_config.tokenizer!r}, "
+            f"tokenizer_mode={model_config.tokenizer_mode}, "
+            f"trust_remote_code={model_config.trust_remote_code}, "
            f"dtype={model_config.dtype}, "
            f"use_dummy_weights={model_config.use_dummy_weights}, "
            f"download_dir={model_config.download_dir!r}, "
            f"use_np_weights={model_config.use_np_weights}, "
            f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
-            f"seed={model_config.seed})"
-        )
+            f"seed={model_config.seed})")
        # TODO(woosuk): Print more configs in debug mode.

        self.model_config = model_config
@ -75,34 +88,89 @@ class LLMEngine:
        self.log_stats = log_stats
        self._verify_args()

-        self.tokenizer = get_tokenizer(model_config.model)
+        self.tokenizer = get_tokenizer(
+            model_config.tokenizer,
+            tokenizer_mode=model_config.tokenizer_mode,
+            trust_remote_code=model_config.trust_remote_code)
        self.seq_counter = Counter()

        # Create the parallel GPU workers.
-        self.workers: List[Worker] = []
-        assert len(stage_devices) == 1, "Only support one stage for now."
-        for rank, node_resource, _ in stage_devices[0]:
-            worker_cls = Worker
-            if self.parallel_config.worker_use_ray:
-                worker_cls = ray.remote(
-                    num_cpus=0,
-                    num_gpus=1,
-                    resources={node_resource: 1e-3},
-                )(worker_cls).remote
+        if self.parallel_config.worker_use_ray:
+            self._init_workers_ray(placement_group)
+        else:
+            self._init_workers(distributed_init_method)

-            worker = worker_cls(
-                model_config,
-                parallel_config,
-                scheduler_config,
-                rank,
-                distributed_init_method,
-            )
-            self.workers.append(worker)
        # Profile the memory usage and initialize the cache.
        self._init_cache()

        # Create the scheduler.
-        self.scheduler = Scheduler(scheduler_config, cache_config, log_stats)
+        self.scheduler = Scheduler(scheduler_config, cache_config)
+
+        # Logging.
+        self.last_logging_time = 0.0
+        # List of (timestamp, num_tokens)
+        self.num_prompt_tokens: List[Tuple[float, int]] = []
+        # List of (timestamp, num_tokens)
+        self.num_generation_tokens: List[Tuple[float, int]] = []
+
+    def _init_workers(self, distributed_init_method: str):
+        # Lazy import the Worker to avoid importing torch.cuda/xformers
+        # before CUDA_VISIBLE_DEVICES is set in the Worker
+        from vllm.worker.worker import Worker  # pylint: disable=import-outside-toplevel
+
+        assert self.parallel_config.world_size == 1, (
+            "Ray is required if parallel_config.world_size > 1.")
+
+        self.workers: List[Worker] = []
+        worker = Worker(
+            self.model_config,
+            self.parallel_config,
+            self.scheduler_config,
+            0,
+            distributed_init_method,
+        )
+        self.workers.append(worker)
+        self._run_workers(
+            "init_model",
+            get_all_outputs=True,
+        )
+
+    def _init_workers_ray(self, placement_group: "PlacementGroup"):
+        # Lazy import the Worker to avoid importing torch.cuda/xformers
+        # before CUDA_VISIBLE_DEVICES is set in the Worker
+        from vllm.worker.worker import Worker  # pylint: disable=import-outside-toplevel
+
+        self.workers: List[Worker] = []
+        for bundle in placement_group.bundle_specs:
+            if not bundle.get("GPU", 0):
+                continue
+            worker = ray.remote(
+                num_cpus=0,
+                num_gpus=1,
+                scheduling_strategy=PlacementGroupSchedulingStrategy(
+                    placement_group=placement_group,
+                    placement_group_capture_child_tasks=True),
+            )(RayWorker).remote()
+            self.workers.append(worker)
+
+        # Initialize torch distributed process group for the workers.
+        init_torch_dist_process_group(self.workers, backend="nccl")
+        model_config = copy.deepcopy(self.model_config)
+        parallel_config = copy.deepcopy(self.parallel_config)
+        scheduler_config = copy.deepcopy(self.scheduler_config)
+        self._run_workers("init_worker",
+                          get_all_outputs=True,
+                          worker_init_fn=lambda: Worker(
+                              model_config,
+                              parallel_config,
+                              scheduler_config,
+                              None,
+                              None,
+                          ))
+        self._run_workers(
+            "init_model",
+            get_all_outputs=True,
+        )

    def _verify_args(self) -> None:
        self.model_config.verify_with_parallel_config(self.parallel_config)
@ -125,10 +193,10 @@ class LLMEngine:
        num_gpu_blocks = min(b[0] for b in num_blocks)
        num_cpu_blocks = min(b[1] for b in num_blocks)
        # FIXME(woosuk): Change to debug log.
-        logger.info(f'# GPU blocks: {num_gpu_blocks}, '
-                    f'# CPU blocks: {num_cpu_blocks}')
+        logger.info(f"# GPU blocks: {num_gpu_blocks}, "
+                    f"# CPU blocks: {num_cpu_blocks}")

-        if num_gpu_blocks <= 0 or num_cpu_blocks <= 0:
+        if num_gpu_blocks <= 0:
            raise ValueError("No available memory for the cache blocks. "
                             "Try increasing `gpu_memory_utilization` when "
                             "initializing the engine.")
@ -146,9 +214,12 @@ class LLMEngine:
        engine_configs = engine_args.create_engine_configs()
        parallel_config = engine_configs[2]
        # Initialize the cluster.
-        distributed_init_method, devices = initialize_cluster(parallel_config)
+        distributed_init_method, placement_group = initialize_cluster(
+            parallel_config)
        # Create the LLM engine.
-        engine = cls(*engine_configs, distributed_init_method, devices,
+        engine = cls(*engine_configs,
+                     distributed_init_method,
+                     placement_group,
                     log_stats=not engine_args.disable_log_stats)
        return engine

@ -205,6 +276,10 @@ class LLMEngine:
        """
        self.scheduler.abort_seq_group(request_id)

+    def get_model_config(self) -> ModelConfig:
+        """Gets the model configuration."""
+        return self.model_config
+
    def get_num_unfinished_requests(self) -> int:
        """Gets the number of unfinished requests."""
        return self.scheduler.get_num_unfinished_seq_groups()
@ -223,9 +298,16 @@ class LLMEngine:
        the sequences and returns the newly generated results.
        """
        seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
-        if (not seq_group_metadata_list) and scheduler_outputs.is_empty():
-            # Nothing to do.
-            return []
+        if scheduler_outputs.is_empty():
+            if not scheduler_outputs.ignored_seq_groups:
+                # Nothing to do.
+                return []
+            # If there are ignored seq groups, we need to return them as the
+            # request outputs.
+            return [
+                RequestOutput.from_seq_group(seq_group)
+                for seq_group in scheduler_outputs.ignored_seq_groups
+            ]

        # Execute the model.
        output = self._run_workers(
@ -247,11 +329,79 @@ class LLMEngine:

        # Create the outputs.
        request_outputs: List[RequestOutput] = []
-        for seq_group in seq_groups:
+        for seq_group in seq_groups + scheduler_outputs.ignored_seq_groups:
            request_output = RequestOutput.from_seq_group(seq_group)
            request_outputs.append(request_output)
+
+        if self.log_stats:
+            # Log the system stats.
+            self._log_system_stats(scheduler_outputs.prompt_run,
+                                   scheduler_outputs.num_batched_tokens)
        return request_outputs

+    def _log_system_stats(
+        self,
+        prompt_run: bool,
+        num_batched_tokens: int,
+    ) -> None:
+        now = time.time()
+        # Log the number of batched input tokens.
+        if prompt_run:
+            self.num_prompt_tokens.append((now, num_batched_tokens))
+        else:
+            self.num_generation_tokens.append((now, num_batched_tokens))
+
+        elapsed_time = now - self.last_logging_time
+        if elapsed_time < _LOGGING_INTERVAL_SEC:
+            return
+
+        # Discard the old stats.
+        self.num_prompt_tokens = [(t, n) for t, n in self.num_prompt_tokens
+                                  if now - t < _LOGGING_INTERVAL_SEC]
+        self.num_generation_tokens = [(t, n)
+                                      for t, n in self.num_generation_tokens
+                                      if now - t < _LOGGING_INTERVAL_SEC]
+
+        if len(self.num_prompt_tokens) > 1:
+            total_num_tokens = sum(n for _, n in self.num_prompt_tokens[:-1])
+            window = now - self.num_prompt_tokens[0][0]
+            avg_prompt_throughput = total_num_tokens / window
+        else:
+            avg_prompt_throughput = 0.0
+        if len(self.num_generation_tokens) > 1:
+            total_num_tokens = sum(n
+                                   for _, n in self.num_generation_tokens[:-1])
+            window = now - self.num_generation_tokens[0][0]
+            avg_generation_throughput = total_num_tokens / window
+        else:
+            avg_generation_throughput = 0.0
+
+        total_num_gpu_blocks = self.cache_config.num_gpu_blocks
+        num_free_gpu_blocks = (
+            self.scheduler.block_manager.get_num_free_gpu_blocks())
+        num_used_gpu_blocks = total_num_gpu_blocks - num_free_gpu_blocks
+        gpu_cache_usage = num_used_gpu_blocks / total_num_gpu_blocks
+
+        total_num_cpu_blocks = self.cache_config.num_cpu_blocks
+        if total_num_cpu_blocks > 0:
+            num_free_cpu_blocks = (
+                self.scheduler.block_manager.get_num_free_cpu_blocks())
+            num_used_cpu_blocks = total_num_cpu_blocks - num_free_cpu_blocks
+            cpu_cache_usage = num_used_cpu_blocks / total_num_cpu_blocks
+        else:
+            cpu_cache_usage = 0.0
+
+        logger.info("Avg prompt throughput: "
+                    f"{avg_prompt_throughput:.1f} tokens/s, "
+                    "Avg generation throughput: "
+                    f"{avg_generation_throughput:.1f} tokens/s, "
+                    f"Running: {len(self.scheduler.running)} reqs, "
+                    f"Swapped: {len(self.scheduler.swapped)} reqs, "
+                    f"Pending: {len(self.scheduler.waiting)} reqs, "
+                    f"GPU KV cache usage: {gpu_cache_usage * 100:.1f}%, "
+                    f"CPU KV cache usage: {cpu_cache_usage * 100:.1f}%")
+        self.last_logging_time = now
+
    def _decode_sequences(self, seq_groups: List[SequenceGroup]) -> None:
        """Decodes the sequence outputs."""
        for seq_group in seq_groups:
@ -262,8 +412,9 @@ class LLMEngine:
                    seq.get_last_token_id(),
                    skip_special_tokens=True,
                )
-                seq.output_tokens.append(new_token)
-                seq.output_text = new_output_text
+                if new_token is not None:
+                    seq.output_tokens.append(new_token)
+                    seq.output_text = new_output_text

    def _stop_sequences(self, seq_groups: List[SequenceGroup]) -> None:
        """Stop the finished sequences."""
@ -277,13 +428,18 @@ class LLMEngine:
                        # Truncate the output text so that the stop string is
                        # not included in the output.
                        seq.output_text = seq.output_text[:-len(stop_str)]
-                        self.scheduler.free_seq(seq,
-                                                SequenceStatus.FINISHED_STOPPED)
+                        self.scheduler.free_seq(
+                            seq, SequenceStatus.FINISHED_STOPPED)
                        stopped = True
                        break
                if stopped:
                    continue

+                # Check if the sequence has reached max_model_len.
+                if seq.get_len() > self.scheduler_config.max_model_len:
+                    self.scheduler.free_seq(
+                        seq, SequenceStatus.FINISHED_LENGTH_CAPPED)
+                    continue
                # Check if the sequence has reached max_tokens.
                if seq.get_output_len() == sampling_params.max_tokens:
                    self.scheduler.free_seq(
@ -292,23 +448,24 @@ class LLMEngine:
                # Check if the sequence has generated the EOS token.
                if not sampling_params.ignore_eos:
                    if seq.get_last_token_id() == self.tokenizer.eos_token_id:
-                        self.scheduler.free_seq(seq,
-                                                SequenceStatus.FINISHED_STOPPED)
+                        self.scheduler.free_seq(
+                            seq, SequenceStatus.FINISHED_STOPPED)
                        continue

    def _run_workers(
        self,
        method: str,
-        get_all_outputs: bool = False,
        *args,
+        get_all_outputs: bool = False,
        **kwargs,
    ) -> Any:
        """Runs the given method on all workers."""
        all_outputs = []
        for worker in self.workers:
-            executor = getattr(worker, method)
            if self.parallel_config.worker_use_ray:
-                executor = executor.remote
+                executor = partial(worker.execute_method.remote, method)
+            else:
+                executor = getattr(worker, method)

            output = executor(*args, **kwargs)
            all_outputs.append(output)
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
@ -1,21 +1,49 @@
-import random
-from typing import List, Optional, Tuple
-
-try:
-    import ray
-except ImportError:
-    ray = None
+import socket
+from typing import Optional, Tuple, TYPE_CHECKING

 from vllm.config import ParallelConfig

-DeviceID = Tuple[int, Optional[str], int]  # rank, node resource (node IP), device id
+try:
+    import ray
+    from ray.air.util.torch_dist import TorchDistributedWorker
+
+    class RayWorker(TorchDistributedWorker):
+        """Ray wrapper for vllm.worker.Worker, allowing Worker to be
+        lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES."""
+
+        def __init__(self) -> None:
+            self.worker = None
+
+        def init_worker(self, worker_init_fn):
+            self.worker = worker_init_fn()
+
+        def __getattr__(self, name):
+            return getattr(self.worker, name)
+
+        def execute_method(self, method, *args, **kwargs):
+            executor = getattr(self, method)
+            return executor(*args, **kwargs)
+
+except ImportError:
+    ray = None
+    TorchDistributedWorker = None
+    RayWorker = None  # pylint: disable=invalid-name
+
+if TYPE_CHECKING:
+    from ray.util.placement_group import PlacementGroup
+
+
+def get_open_port():
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("", 0))
+        return s.getsockname()[1]


 def initialize_cluster(
    parallel_config: ParallelConfig,
    engine_use_ray: bool = False,
    ray_address: Optional[str] = None,
-) -> Tuple[str, List[List[DeviceID]]]:
+) -> Tuple[str, Optional["PlacementGroup"]]:
    """Initialize the distributed cluster probably with Ray.

    Args:
@ -37,71 +65,46 @@ def initialize_cluster(
                "Ray is not installed. Please install Ray to use distributed "
                "serving.")
        # Connect to a ray cluster.
-        ray.init(address=ray_address)
+        ray.init(address=ray_address, ignore_reinit_error=True)

    if not parallel_config.worker_use_ray:
        # Initialize cluster locally.
-        port = random.randint(10000, 20000)
+        port = get_open_port()
        # We need to setup the distributed init method to make sure
        # the distributed megatron code (e.g., get world size) works correctly.
        distributed_init_method = f"tcp://localhost:{port}"
-        all_stage_devices = [[(0, None, 0)]]
-        return distributed_init_method, all_stage_devices
+        return distributed_init_method, None

-    # Assume we have a uniform cluster that each node has the same number of
-    # GPUs for now.
-    valid_node_resources = []
-    num_devices_per_node = None
-    for node in ray.nodes():
-        if (not node['Alive']) or node['Resources']['GPU'] <= 0:
-            continue
-        if num_devices_per_node is None:
-            num_devices_per_node = node['Resources']['GPU']
-        else:
-            assert num_devices_per_node == node['Resources']['GPU'], (
-                "The number of GPUs per node is not uniform.")
-        for key in node['Resources']:
-            if key.startswith('node:'):
-                valid_node_resources.append(key)
-
-    # Verify the parallel config.
-    num_nodes = len(valid_node_resources)
-    if parallel_config.world_size > num_nodes * num_devices_per_node:
-        raise ValueError(
-            "The number of required GPUs exceeds the total number of "
-            "available GPUs.")
-    if parallel_config.tensor_parallel_size >= num_devices_per_node:
-        if parallel_config.tensor_parallel_size % num_devices_per_node != 0:
+    current_placement_group = ray.util.get_current_placement_group()
+    if current_placement_group:
+        # We are in a placement group
+        bundles = current_placement_group.bundle_specs
+        # Verify that we can use the placement group.
+        gpu_bundles = 0
+        for bundle in bundles:
+            bundle_gpus = bundle.get("GPU", 0)
+            if bundle_gpus > 1:
+                raise ValueError(
+                    "Placement group bundle cannot have more than 1 GPU.")
+            if bundle_gpus:
+                gpu_bundles += 1
+        if parallel_config.world_size > gpu_bundles:
            raise ValueError(
-                "The number of tensor parallelism is not divisible by the "
-                "number of GPUs per node.")
+                "The number of required GPUs exceeds the total number of "
+                "available GPUs in the placement group.")
    else:
-        if num_devices_per_node % parallel_config.tensor_parallel_size != 0:
+        num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0)
+        if parallel_config.world_size > num_gpus_in_cluster:
            raise ValueError(
-                "The number of GPUs per node is not divisible by the number "
-                "of tensor parallelism.")
+                "The number of required GPUs exceeds the total number of "
+                "available GPUs in the cluster.")
+        # Create a new placement group
+        current_placement_group = ray.util.placement_group([{
+            "GPU": 1
+        }] * parallel_config.world_size)
+        # Wait until PG is ready - this will block until all
+        # requested resources are available, and will timeout
+        # if they cannot be provisioned.
+        ray.get(current_placement_group.ready(), timeout=1800)

-    # Assign GPUs to pipeline stages.
-    rank = 0
-    current_node_id = 0
-    current_device_id = 0
-    distributed_init_method = None
-    all_stage_devices = []
-
-    for _ in range(parallel_config.pipeline_parallel_size):
-        stage_devices = []
-        for _ in range(parallel_config.tensor_parallel_size):
-            node_resource = valid_node_resources[current_node_id]
-            stage_devices.append((rank, node_resource, current_device_id))
-            if distributed_init_method is None:
-                ip = node_resource.split("node:")[-1]
-                port = random.randint(10000, 20000)
-                distributed_init_method = f"tcp://{ip}:{port}"
-            rank += 1
-            current_device_id += 1
-            if current_device_id >= num_devices_per_node:
-                current_node_id += 1
-                current_device_id = 0
-        all_stage_devices.append(stage_devices)
-
-    return distributed_init_method, all_stage_devices
+    return None, current_placement_group
--- a/vllm/engine/tokenizer_utils.py
+++ b/vllm/engine/tokenizer_utils.py
@ -1,92 +0,0 @@
-from typing import List, Tuple, Union
-
-from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer,
-                          PreTrainedTokenizerFast)
-
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
-_MODEL_TYPES_WITH_SLOW_TOKENIZER = []
-
-
-def get_tokenizer(
-    model_name: str,
-    *args,
-    **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
-    """Gets a tokenizer for the given model name via Huggingface."""
-    config = AutoConfig.from_pretrained(model_name)
-    if "open_llama" in model_name:
-        kwargs["use_fast"] = False
-        logger.info(
-            "OpenLLaMA models do not support the fast tokenizer. "
-            "Using the slow tokenizer instead.")
-    elif config.model_type == "llama" and getattr(kwargs, "use_fast", True):
-        # LLaMA fast tokenizer causes protobuf errors in some environments.
-        # However, we found that the below LLaMA fast tokenizer works well in
-        # most environments.
-        model_name = "hf-internal-testing/llama-tokenizer"
-        logger.info(
-            f"Using the LLaMA fast tokenizer in '{model_name}' to avoid "
-            "potential protobuf errors.")
-    elif config.model_type in _MODEL_TYPES_WITH_SLOW_TOKENIZER:
-        if getattr(kwargs, "use_fast", False) == True:
-            raise ValueError(
-                f"Cannot use the fast tokenizer for {config.model_type} due to "
-                "bugs in the fast tokenizer.")
-        logger.info(
-            f"Using the slow tokenizer for {config.model_type} due to bugs in "
-            "the fast tokenizer. This could potentially lead to performance "
-            "degradation.")
-        kwargs["use_fast"] = False
-    return AutoTokenizer.from_pretrained(model_name, *args, **kwargs)
-
-
-def detokenize_incrementally(
-    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
-    prev_output_tokens: List[str],
-    new_token_id: int,
-    skip_special_tokens: bool,
-) -> Tuple[str, str]:
-    """Detokenizes the new token in conjuction with the previous output tokens.
-
-    NOTE: This function does not update prev_output_tokens.
-
-    Returns:
-        new_token: The new token as a string.
-        output_text: The new output text as a string.
-    """
-    new_token = tokenizer.convert_ids_to_tokens(
-        new_token_id, skip_special_tokens=skip_special_tokens)
-    output_tokens = prev_output_tokens + [new_token]
-
-    # Convert the tokens to a string.
-    # Optimization: If the tokenizer does not have `added_tokens_encoder`,
-    # then we can directly use `convert_tokens_to_string`.
-    if not getattr(tokenizer, "added_tokens_encoder", {}):
-        output_text = tokenizer.convert_tokens_to_string(output_tokens)
-        return new_token, output_text
-
-    # Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
-    # NOTE(woosuk): The following code is slow because it runs a for loop over
-    # the output_tokens. In Python, running a for loop over a list can be slow
-    # even when the loop body is very simple.
-    sub_texts = []
-    current_sub_text = []
-    for token in output_tokens:
-        if skip_special_tokens and token in tokenizer.all_special_ids:
-            continue
-        if token in tokenizer.added_tokens_encoder:
-            if current_sub_text:
-                sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
-                sub_texts.append(sub_text)
-                current_sub_text = []
-            sub_texts.append(token)
-        else:
-            current_sub_text.append(token)
-    if current_sub_text:
-        sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
-        sub_texts.append(sub_text)
-    output_text = " ".join(sub_texts)
-    return new_token, output_text
--- a/vllm/entrypoints/api_server.py
+++ b/vllm/entrypoints/api_server.py
@ -3,7 +3,7 @@ import json
 from typing import AsyncGenerator

 from fastapi import BackgroundTasks, FastAPI, Request
-from fastapi.responses import Response, StreamingResponse
+from fastapi.responses import JSONResponse, Response, StreamingResponse
 import uvicorn

 from vllm.engine.arg_utils import AsyncEngineArgs
@ -11,8 +11,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.sampling_params import SamplingParams
 from vllm.utils import random_uuid

-TIMEOUT_KEEP_ALIVE = 5 # seconds.
-TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds
+TIMEOUT_KEEP_ALIVE = 5  # seconds.
+TIMEOUT_TO_PREVENT_DEADLOCK = 1  # seconds.
 app = FastAPI()


@ -37,8 +37,7 @@ async def generate(request: Request) -> Response:
        async for request_output in results_generator:
            prompt = request_output.prompt
            text_outputs = [
-                prompt + output.text
-                for output in request_output.outputs
+                prompt + output.text for output in request_output.outputs
            ]
            ret = {"text": text_outputs}
            yield (json.dumps(ret) + "\0").encode("utf-8")
@ -63,12 +62,9 @@ async def generate(request: Request) -> Response:

    assert final_output is not None
    prompt = final_output.prompt
-    text_outputs = [
-        prompt + output.text
-        for output in final_output.outputs
-    ]
+    text_outputs = [prompt + output.text for output in final_output.outputs]
    ret = {"text": text_outputs}
-    return Response(content=json.dumps(ret))
+    return JSONResponse(ret)


 if __name__ == "__main__":
@ -81,5 +77,8 @@ if __name__ == "__main__":
    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngine.from_engine_args(engine_args)

-    uvicorn.run(app, host=args.host, port=args.port, log_level="debug",
+    uvicorn.run(app,
+                host=args.host,
+                port=args.port,
+                log_level="debug",
                timeout_keep_alive=TIMEOUT_KEEP_ALIVE)
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@ -25,6 +25,11 @@ class LLM:

    Args:
        model: The name or path of a HuggingFace Transformers model.
+        tokenizer: The name or path of a HuggingFace Transformers tokenizer.
+        tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
+            if available, and "slow" will always use the slow tokenizer.
+        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
+            downloading the model and tokenizer.
        tensor_parallel_size: The number of GPUs to use for distributed
            execution with tensor parallelism.
        dtype: The data type for the model weights and activations. Currently,
@ -38,6 +43,9 @@ class LLM:
    def __init__(
        self,
        model: str,
+        tokenizer: Optional[str] = None,
+        tokenizer_mode: str = "auto",
+        trust_remote_code: bool = False,
        tensor_parallel_size: int = 1,
        dtype: str = "auto",
        seed: int = 0,
@ -47,6 +55,9 @@ class LLM:
            kwargs["disable_log_stats"] = True
        engine_args = EngineArgs(
            model=model,
+            tokenizer=tokenizer,
+            tokenizer_mode=tokenizer_mode,
+            trust_remote_code=trust_remote_code,
            tensor_parallel_size=tensor_parallel_size,
            dtype=dtype,
            seed=seed,
@ -56,10 +67,15 @@ class LLM:
        self.request_counter = Counter()

    def get_tokenizer(
-        self,
-    ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+            self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
        return self.llm_engine.tokenizer

+    def set_tokenizer(
+        self,
+        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    ) -> None:
+        self.llm_engine.tokenizer = tokenizer
+
    def generate(
        self,
        prompts: Optional[Union[str, List[str]]] = None,
@ -139,4 +155,8 @@ class LLM:
                        pbar.update(1)
        if use_tqdm:
            pbar.close()
+        # Sort the outputs by request ID.
+        # This is necessary because some requests may be finished earlier than
+        # its previous requests.
+        outputs = sorted(outputs, key=lambda x: int(x.request_id))
        return outputs
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@ -1,31 +1,45 @@
-# Adapted from https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/serve/openai_api_server.py
+# Adapted from
+# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/serve/openai_api_server.py

 import argparse
-from http import HTTPStatus
+import asyncio
 import json
 import time
-from typing import AsyncGenerator, Dict, List, Optional
+from http import HTTPStatus
+from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union

 import fastapi
+import uvicorn
 from fastapi import BackgroundTasks, Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse
-import uvicorn
+from packaging import version

 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
-from vllm.engine.tokenizer_utils import get_tokenizer
 from vllm.entrypoints.openai.protocol import (
    CompletionRequest, CompletionResponse, CompletionResponseChoice,
-    CompletionResponseStreamChoice, CompletionStreamResponse, ErrorResponse,
+    CompletionResponseStreamChoice, CompletionStreamResponse,
+    ChatCompletionRequest, ChatCompletionResponse,
+    ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
+    ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse,
    LogProbs, ModelCard, ModelList, ModelPermission, UsageInfo)
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
+from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.utils import random_uuid

-TIMEOUT_KEEP_ALIVE = 5 # seconds
+try:
+    import fastchat
+    from fastchat.conversation import Conversation, SeparatorStyle
+    from fastchat.model.model_adapter import get_conversation_template
+    _fastchat_available = True
+except ImportError:
+    _fastchat_available = False
+
+TIMEOUT_KEEP_ALIVE = 5  # seconds

 logger = init_logger(__name__)
 served_model = None
@ -34,14 +48,13 @@ app = fastapi.FastAPI()

 def create_error_response(status_code: HTTPStatus,
                          message: str) -> JSONResponse:
-    return JSONResponse(
-        ErrorResponse(message=message, type="invalid_request_error").dict(),
-        status_code=status_code.value
-    )
+    return JSONResponse(ErrorResponse(message=message,
+                                      type="invalid_request_error").dict(),
+                        status_code=status_code.value)


@app.exception_handler(RequestValidationError)
-async def validation_exception_handler(request, exc):
+async def validation_exception_handler(request, exc):  # pylint: disable=unused-argument
    return create_error_response(HTTPStatus.BAD_REQUEST, str(exc))


@ -55,11 +68,88 @@ async def check_model(request) -> Optional[JSONResponse]:
    return ret


+async def get_gen_prompt(request) -> str:
+    if not _fastchat_available:
+        raise ModuleNotFoundError(
+            "fastchat is not installed. Please install fastchat to use "
+            "the chat completion and conversation APIs: `$ pip install fschat`"
+        )
+    if version.parse(fastchat.__version__) < version.parse("0.2.23"):
+        raise ImportError(
+            f"fastchat version is low. Current version: {fastchat.__version__} "
+            "Please upgrade fastchat to use: `$ pip install -U fschat`")
+
+    conv = get_conversation_template(request.model)
+    conv = Conversation(
+        name=conv.name,
+        system_template=conv.system_template,
+        system_message=conv.system_message,
+        roles=conv.roles,
+        messages=list(conv.messages),  # prevent in-place modification
+        offset=conv.offset,
+        sep_style=SeparatorStyle(conv.sep_style),
+        sep=conv.sep,
+        sep2=conv.sep2,
+        stop_str=conv.stop_str,
+        stop_token_ids=conv.stop_token_ids,
+    )
+
+    if isinstance(request.messages, str):
+        prompt = request.messages
+    else:
+        for message in request.messages:
+            msg_role = message["role"]
+            if msg_role == "system":
+                conv.system_message = message["content"]
+            elif msg_role == "user":
+                conv.append_message(conv.roles[0], message["content"])
+            elif msg_role == "assistant":
+                conv.append_message(conv.roles[1], message["content"])
+            else:
+                raise ValueError(f"Unknown role: {msg_role}")
+
+        # Add a blank message for the assistant.
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+
+    return prompt
+
+
+async def check_length(
+    request: Union[ChatCompletionRequest, CompletionRequest],
+    prompt: Optional[str] = None,
+    prompt_ids: Optional[List[int]] = None
+) -> Tuple[List[int], Optional[JSONResponse]]:
+    assert (not (prompt is None and prompt_ids is None)
+            and not (prompt is not None and prompt_ids is not None)
+            ), "Either prompt or prompt_ids should be provided."
+    if prompt_ids is not None:
+        input_ids = prompt_ids
+    else:
+        input_ids = tokenizer(prompt).input_ids
+    token_num = len(input_ids)
+
+    if token_num + request.max_tokens > max_model_len:
+        return input_ids, create_error_response(
+            HTTPStatus.BAD_REQUEST,
+            f"This model's maximum context length is {max_model_len} tokens. "
+            f"However, you requested {request.max_tokens + token_num} tokens "
+            f"({token_num} in the messages, "
+            f"{request.max_tokens} in the completion). "
+            f"Please reduce the length of the messages or completion.",
+        )
+    else:
+        return input_ids, None
+
+
@app.get("/v1/models")
 async def show_available_models():
    """Show available models. Right now we only have one model."""
-    model_cards = [ModelCard(id=served_model, root=served_model,
-                             permission=[ModelPermission()])]
+    model_cards = [
+        ModelCard(id=served_model,
+                  root=served_model,
+                  permission=[ModelPermission()])
+    ]
    return ModelList(data=model_cards)


@ -76,15 +166,187 @@ def create_logprobs(token_ids: List[int],
        if len(logprobs.text_offset) == 0:
            logprobs.text_offset.append(initial_text_offset)
        else:
-            logprobs.text_offset.append(logprobs.text_offset[-1] + last_token_len)
+            logprobs.text_offset.append(logprobs.text_offset[-1] +
+                                        last_token_len)
        last_token_len = len(token)

-        logprobs.top_logprobs.append(
-            {tokenizer.convert_ids_to_tokens(i): p
-             for i, p in id_logprob.items()})
+        logprobs.top_logprobs.append({
+            tokenizer.convert_ids_to_tokens(i): p
+            for i, p in id_logprob.items()
+        })
    return logprobs


+@app.post("/v1/chat/completions")
+async def create_chat_completion(raw_request: Request):
+    """Completion API similar to OpenAI's API.
+
+    See  https://platform.openai.com/docs/api-reference/chat/create
+    for the API specification. This API mimics the OpenAI ChatCompletion API.
+
+    NOTE: Currently we do not support the following features:
+        - function_call (Users should implement this by themselves)
+        - logit_bias (to be supported by vLLM engine)
+    """
+    request = ChatCompletionRequest(**await raw_request.json())
+    logger.info(f"Received chat completion request: {request}")
+
+    error_check_ret = await check_model(request)
+    if error_check_ret is not None:
+        return error_check_ret
+
+    if request.logit_bias is not None:
+        # TODO: support logit_bias in vLLM engine.
+        return create_error_response(HTTPStatus.BAD_REQUEST,
+                                     "logit_bias is not currently supported")
+
+    prompt = await get_gen_prompt(request)
+    token_ids, error_check_ret = await check_length(request, prompt=prompt)
+    if error_check_ret is not None:
+        return error_check_ret
+
+    model_name = request.model
+    request_id = f"cmpl-{random_uuid()}"
+    created_time = int(time.time())
+    try:
+        sampling_params = SamplingParams(
+            n=request.n,
+            presence_penalty=request.presence_penalty,
+            frequency_penalty=request.frequency_penalty,
+            temperature=request.temperature,
+            top_p=request.top_p,
+            stop=request.stop,
+            max_tokens=request.max_tokens,
+            best_of=request.best_of,
+            top_k=request.top_k,
+            ignore_eos=request.ignore_eos,
+            use_beam_search=request.use_beam_search,
+        )
+    except ValueError as e:
+        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
+
+    result_generator = engine.generate(prompt, sampling_params, request_id,
+                                       token_ids)
+
+    async def abort_request() -> None:
+        await engine.abort(request_id)
+
+    def create_stream_response_json(
+        index: int,
+        text: str,
+        finish_reason: Optional[str] = None,
+    ) -> str:
+        choice_data = ChatCompletionResponseStreamChoice(
+            index=index,
+            delta=DeltaMessage(content=text),
+            finish_reason=finish_reason,
+        )
+        response = ChatCompletionStreamResponse(
+            id=request_id,
+            created=created_time,
+            model=model_name,
+            choices=[choice_data],
+        )
+        response_json = response.json(ensure_ascii=False)
+
+        return response_json
+
+    async def completion_stream_generator() -> AsyncGenerator[str, None]:
+        # First chunk with role
+        for i in range(request.n):
+            choice_data = ChatCompletionResponseStreamChoice(
+                index=i,
+                delta=DeltaMessage(role="assistant"),
+                finish_reason=None,
+            )
+            chunk = ChatCompletionStreamResponse(id=request_id,
+                                                 choices=[choice_data],
+                                                 model=model_name)
+            data = chunk.json(exclude_unset=True, ensure_ascii=False)
+            yield f"data: {data}\n\n"
+
+        previous_texts = [""] * request.n
+        previous_num_tokens = [0] * request.n
+        async for res in result_generator:
+            res: RequestOutput
+            for output in res.outputs:
+                i = output.index
+                delta_text = output.text[len(previous_texts[i]):]
+                previous_texts[i] = output.text
+                previous_num_tokens[i] = len(output.token_ids)
+                response_json = create_stream_response_json(
+                    index=i,
+                    text=delta_text,
+                )
+                yield f"data: {response_json}\n\n"
+                if output.finish_reason is not None:
+                    response_json = create_stream_response_json(
+                        index=i,
+                        text="",
+                        finish_reason=output.finish_reason,
+                    )
+                    yield f"data: {response_json}\n\n"
+        yield "data: [DONE]\n\n"
+
+    # Streaming response
+    if request.stream:
+        background_tasks = BackgroundTasks()
+        # Abort the request if the client disconnects.
+        background_tasks.add_task(abort_request)
+        return StreamingResponse(completion_stream_generator(),
+                                 media_type="text/event-stream",
+                                 background=background_tasks)
+
+    # Non-streaming response
+    final_res: RequestOutput = None
+    async for res in result_generator:
+        if await raw_request.is_disconnected():
+            # Abort the request if the client disconnects.
+            await abort_request()
+            return create_error_response(HTTPStatus.BAD_REQUEST,
+                                         "Client disconnected")
+        final_res = res
+    assert final_res is not None
+    choices = []
+    for output in final_res.outputs:
+        choice_data = ChatCompletionResponseChoice(
+            index=output.index,
+            message=ChatMessage(role="assistant", content=output.text),
+            finish_reason=output.finish_reason,
+        )
+        choices.append(choice_data)
+
+    num_prompt_tokens = len(final_res.prompt_token_ids)
+    num_generated_tokens = sum(
+        len(output.token_ids) for output in final_res.outputs)
+    usage = UsageInfo(
+        prompt_tokens=num_prompt_tokens,
+        completion_tokens=num_generated_tokens,
+        total_tokens=num_prompt_tokens + num_generated_tokens,
+    )
+    response = ChatCompletionResponse(
+        id=request_id,
+        created=created_time,
+        model=model_name,
+        choices=choices,
+        usage=usage,
+    )
+
+    if request.stream:
+        # When user requests streaming but we don't stream, we still need to
+        # return a streaming response with a single event.
+        response_json = response.json(ensure_ascii=False)
+
+        async def fake_stream_generator() -> AsyncGenerator[str, None]:
+            yield f"data: {response_json}\n\n"
+            yield "data: [DONE]\n\n"
+
+        return StreamingResponse(fake_stream_generator(),
+                                 media_type="text/event-stream")
+
+    return response
+
+
@app.post("/v1/completions")
 async def create_completion(raw_request: Request):
    """Completion API similar to OpenAI's API.
@ -115,7 +377,7 @@ async def create_completion(raw_request: Request):
    if request.suffix is not None:
        # The language models we currently support do not support suffix.
        return create_error_response(HTTPStatus.BAD_REQUEST,
-                                    "suffix is not currently supported")
+                                     "suffix is not currently supported")

    if request.logit_bias is not None:
        # TODO: support logit_bias in vLLM engine.
@ -124,7 +386,34 @@ async def create_completion(raw_request: Request):

    model_name = request.model
    request_id = f"cmpl-{random_uuid()}"
-    prompt = request.prompt
+
+    use_token_ids = False
+    if isinstance(request.prompt, list):
+        if len(request.prompt) == 0:
+            return create_error_response(HTTPStatus.BAD_REQUEST,
+                                         "please provide at least one prompt")
+        first_element = request.prompt[0]
+        if isinstance(first_element, int):
+            use_token_ids = True
+            prompt = request.prompt
+        elif isinstance(first_element, (str, list)):
+            # TODO: handles multiple prompt case in list[list[int]]
+            if len(request.prompt) > 1:
+                return create_error_response(
+                    HTTPStatus.BAD_REQUEST,
+                    "multiple prompts in a batch is not currently supported")
+            use_token_ids = not isinstance(first_element, str)
+            prompt = request.prompt[0]
+    else:
+        prompt = request.prompt
+
+    if use_token_ids:
+        _, error_check_ret = await check_length(request, prompt_ids=prompt)
+    else:
+        token_ids, error_check_ret = await check_length(request, prompt=prompt)
+    if error_check_ret is not None:
+        return error_check_ret
+
    created_time = int(time.time())
    try:
        sampling_params = SamplingParams(
@ -144,22 +433,30 @@ async def create_completion(raw_request: Request):
    except ValueError as e:
        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))

-    result_generator = engine.generate(prompt, sampling_params,
-                                       request_id)
+    if use_token_ids:
+        result_generator = engine.generate(None,
+                                           sampling_params,
+                                           request_id,
+                                           prompt_token_ids=prompt)
+    else:
+        result_generator = engine.generate(prompt, sampling_params, request_id,
+                                           token_ids)

    # Similar to the OpenAI API, when n != best_of, we do not stream the
    # results. In addition, we do not stream the results when use beam search.
-    stream = (request.stream and
-              (request.best_of is None or request.n == request.best_of) and
-              not request.use_beam_search)
+    stream = (request.stream
+              and (request.best_of is None or request.n == request.best_of)
+              and not request.use_beam_search)

    async def abort_request() -> None:
        await engine.abort(request_id)

-    def create_stream_response_json(index: int,
-                                    text: str,
-                                    logprobs: Optional[LogProbs] = None,
-                                    finish_reason: Optional[str] = None) -> str:
+    def create_stream_response_json(
+        index: int,
+        text: str,
+        logprobs: Optional[LogProbs] = None,
+        finish_reason: Optional[str] = None,
+    ) -> str:
        choice_data = CompletionResponseStreamChoice(
            index=index,
            text=text,
@ -200,7 +497,8 @@ async def create_completion(raw_request: Request):
                )
                yield f"data: {response_json}\n\n"
                if output.finish_reason is not None:
-                    logprobs = LogProbs() if request.logprobs is not None else None
+                    logprobs = (LogProbs()
+                                if request.logprobs is not None else None)
                    response_json = create_stream_response_json(
                        index=i,
                        text="",
@ -208,7 +506,7 @@ async def create_completion(raw_request: Request):
                        finish_reason=output.finish_reason,
                    )
                    yield f"data: {response_json}\n\n"
-            yield "data: [DONE]\n\n"
+        yield "data: [DONE]\n\n"

    # Streaming response
    if stream:
@ -244,8 +542,8 @@ async def create_completion(raw_request: Request):
        choices.append(choice_data)

    num_prompt_tokens = len(final_res.prompt_token_ids)
-    num_generated_tokens = sum(len(output.token_ids)
-                               for output in final_res.outputs)
+    num_generated_tokens = sum(
+        len(output.token_ids) for output in final_res.outputs)
    usage = UsageInfo(
        prompt_tokens=num_prompt_tokens,
        completion_tokens=num_generated_tokens,
@ -263,9 +561,11 @@ async def create_completion(raw_request: Request):
        # When user requests streaming but we don't stream, we still need to
        # return a streaming response with a single event.
        response_json = response.json(ensure_ascii=False)
+
        async def fake_stream_generator() -> AsyncGenerator[str, None]:
            yield f"data: {response_json}\n\n"
            yield "data: [DONE]\n\n"
+
        return StreamingResponse(fake_stream_generator(),
                                 media_type="text/event-stream")

@ -274,26 +574,34 @@ async def create_completion(raw_request: Request):

 if __name__ == "__main__":
    parser = argparse.ArgumentParser(
-        description="vLLM OpenAI-Compatible RESTful API server."
-    )
-    parser.add_argument("--host", type=str, default="localhost", help="host name")
+        description="vLLM OpenAI-Compatible RESTful API server.")
+    parser.add_argument("--host",
+                        type=str,
+                        default="localhost",
+                        help="host name")
    parser.add_argument("--port", type=int, default=8000, help="port number")
-    parser.add_argument(
-        "--allow-credentials", action="store_true", help="allow credentials"
-    )
-    parser.add_argument(
-        "--allowed-origins", type=json.loads, default=["*"], help="allowed origins"
-    )
-    parser.add_argument(
-        "--allowed-methods", type=json.loads, default=["*"], help="allowed methods"
-    )
-    parser.add_argument(
-        "--allowed-headers", type=json.loads, default=["*"], help="allowed headers"
-    )
-    parser.add_argument("--served-model-name", type=str, default=None,
-                        help="The model name used in the API. If not specified, "
-                             "the model name will be the same as the "
-                             "huggingface name.")
+    parser.add_argument("--allow-credentials",
+                        action="store_true",
+                        help="allow credentials")
+    parser.add_argument("--allowed-origins",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed origins")
+    parser.add_argument("--allowed-methods",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed methods")
+    parser.add_argument("--allowed-headers",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed headers")
+    parser.add_argument("--served-model-name",
+                        type=str,
+                        default=None,
+                        help="The model name used in the API. If not "
+                        "specified, the model name will be the same as "
+                        "the huggingface name.")
+
    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()

@ -307,13 +615,23 @@ if __name__ == "__main__":

    logger.info(f"args: {args}")

-    served_model = args.served_model_name or args.model
+    if args.served_model_name is not None:
+        served_model = args.served_model_name
+    else:
+        served_model = args.model

    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngine.from_engine_args(engine_args)
+    engine_model_config = asyncio.run(engine.get_model_config())
+    max_model_len = engine_model_config.get_max_model_len()

    # A separate tokenizer to map token IDs to strings.
-    tokenizer = get_tokenizer(args.model)
+    tokenizer = get_tokenizer(engine_args.tokenizer,
+                              tokenizer_mode=engine_args.tokenizer_mode,
+                              trust_remote_code=engine_args.trust_remote_code)

-    uvicorn.run(app, host=args.host, port=args.port, log_level="info",
+    uvicorn.run(app,
+                host=args.host,
+                port=args.port,
+                log_level="info",
                timeout_keep_alive=TIMEOUT_KEEP_ALIVE)
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@ -1,4 +1,5 @@
-# Adapted from https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
+# Adapted from
+# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
 import time
 from typing import Dict, List, Literal, Optional, Union

@ -53,21 +54,28 @@ class UsageInfo(BaseModel):

 class ChatCompletionRequest(BaseModel):
    model: str
-    messages: List[Dict[str, str]]
+    messages: Union[str, List[Dict[str, str]]]
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 1.0
    n: Optional[int] = 1
-    max_tokens: Optional[int] = None
-    stop: Optional[Union[str, List[str]]] = None
+    max_tokens: Optional[int] = 16
+    stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
    stream: Optional[bool] = False
    presence_penalty: Optional[float] = 0.0
    frequency_penalty: Optional[float] = 0.0
+    logit_bias: Optional[Dict[str, float]] = None
    user: Optional[str] = None
+    # Additional parameters supported by vLLM
+    best_of: Optional[int] = None
+    top_k: Optional[int] = -1
+    ignore_eos: Optional[bool] = False
+    use_beam_search: Optional[bool] = False


 class CompletionRequest(BaseModel):
    model: str
-    prompt: str
+    # a string, array of strings, array of tokens, or array of token arrays
+    prompt: Union[List[int], List[List[int]], str, List[str]]
    suffix: Optional[str] = None
    max_tokens: Optional[int] = 16
    temperature: Optional[float] = 1.0
@ -92,7 +100,8 @@ class LogProbs(BaseModel):
    text_offset: List[int] = Field(default_factory=list)
    token_logprobs: List[Optional[float]] = Field(default_factory=list)
    tokens: List[str] = Field(default_factory=list)
-    top_logprobs: List[Optional[Dict[str, float]]] = Field(default_factory=list)
+    top_logprobs: List[Optional[Dict[str,
+                                     float]]] = Field(default_factory=list)


 class CompletionResponseChoice(BaseModel):
@ -124,3 +133,42 @@ class CompletionStreamResponse(BaseModel):
    created: int = Field(default_factory=lambda: int(time.time()))
    model: str
    choices: List[CompletionResponseStreamChoice]
+
+
+class ChatMessage(BaseModel):
+    role: str
+    content: str
+
+
+class ChatCompletionResponseChoice(BaseModel):
+    index: int
+    message: ChatMessage
+    finish_reason: Optional[Literal["stop", "length"]] = None
+
+
+class ChatCompletionResponse(BaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: str = "chat.completion"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[ChatCompletionResponseChoice]
+    usage: UsageInfo
+
+
+class DeltaMessage(BaseModel):
+    role: Optional[str] = None
+    content: Optional[str] = None
+
+
+class ChatCompletionResponseStreamChoice(BaseModel):
+    index: int
+    delta: DeltaMessage
+    finish_reason: Optional[Literal["stop", "length"]] = None
+
+
+class ChatCompletionStreamResponse(BaseModel):
+    id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
+    object: str = "chat.completion.chunk"
+    created: int = Field(default_factory=lambda: int(time.time()))
+    model: str
+    choices: List[ChatCompletionResponseStreamChoice]
--- a/vllm/logger.py
+++ b/vllm/logger.py
@ -1,9 +1,9 @@
-# Adapted from https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
-
+# Adapted from
+# https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
+"""Logging configuration for vLLM."""
 import logging
 import sys

-
 _FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
 _DATE_FORMAT = "%m-%d %H:%M:%S"

--- a/vllm/model_executor/init.py
+++ b/vllm/model_executor/init.py
@ -2,7 +2,6 @@ from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.utils import set_random_seed

-
 __all__ = [
    "InputMetadata",
    "get_model",
--- a/vllm/model_executor/input_metadata.py
+++ b/vllm/model_executor/input_metadata.py
@ -1,18 +1,29 @@
 from typing import Dict, List, Tuple

 import torch
-from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask
+from xformers.ops import AttentionBias

 from vllm.sampling_params import SamplingParams
 from vllm.sequence import SequenceData


 class InputMetadata:
+    """Metadata for input sequences. Used for PagedAttention.
+
+    Args:
+        seq_groups: List of (seq_ids, sampling_params).
+        seq_data: Seq_id -> SequenceData.
+        prompt_lens: Lengths of prompts.
+        slot_mapping: The address to write the new KV to of each token.
+        context_lens: the length of attention context for each generation token.
+        max_context_len: The maximum context length.
+        block_tables: The block tables. (Seq id -> list of physical block)
+    """

    def __init__(
        self,
-        seq_groups: List[Tuple[List[int], SamplingParams]],     # List of (seq_ids, sampling_params).
-        seq_data: Dict[int, SequenceData],                      # Seq_id -> SequenceData.
+        seq_groups: List[Tuple[List[int], SamplingParams]],
+        seq_data: Dict[int, SequenceData],
        prompt_lens: List[int],
        slot_mapping: torch.Tensor,
        context_lens: torch.Tensor,
@ -27,7 +38,6 @@ class InputMetadata:
        self.max_context_len = max_context_len
        self.block_tables = block_tables

-        self.attn_bias = BlockDiagonalCausalMask.from_seqlens(prompt_lens)
        self.num_prompts = len(prompt_lens)
        self.num_prompt_tokens = sum(prompt_lens)
        self.num_generation_tokens = context_lens.shape[0]
@ -39,6 +49,9 @@ class InputMetadata:
        assert block_tables.shape[0] == self.num_generation_tokens
        assert context_lens.shape[0] == self.num_generation_tokens

+        # Set during the execution of the first attention op.
+        self.attn_bias: List[AttentionBias] = []
+
    def __repr__(self) -> str:
        # Print only useful metadata.
        return (f'InputMetadata('
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@ -4,10 +4,50 @@ import torch.nn as nn

 from vllm import activation_ops

+
+class SiluAndMul(nn.Module):
+    """An activation function for SwiGLU.
+
+    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[1] // 2.
+
+    Shapes:
+        x: (num_tokens, 2 * d)
+        return: (num_tokens, d)
+    """
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        num_tokens = x.shape[0]
+        d = x.shape[1] // 2
+        out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
+        activation_ops.silu_and_mul(out, x)
+        return out
+
+
+class NewGELU(nn.Module):
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        num_tokens = x.shape[0]
+        d = x.shape[1]
+        out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
+        activation_ops.gelu_new(out, x)
+        return out
+
+
+class FastGELU(nn.Module):
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        num_tokens = x.shape[0]
+        d = x.shape[1]
+        out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
+        activation_ops.gelu_fast(out, x)
+        return out
+
+
 _ACTIVATION_REGISTRY = {
    "gelu": nn.GELU(),
-    "gelu_new": nn.GELU(approximate="tanh"),   # NOTE: This may introduce small rounding errors.
-    "gelu_fast": nn.GELU(approximate="tanh"),  # NOTE: This may introduce small rounding errors.
+    "gelu_fast": FastGELU(),
+    "gelu_new": NewGELU(),
+    "gelu_pytorch_tanh": nn.GELU(approximate="tanh"),
    "relu": nn.ReLU(),
 }

@ -18,23 +58,3 @@ def get_act_fn(act_fn: str) -> nn.Module:
    if act_fn in _ACTIVATION_REGISTRY:
        return _ACTIVATION_REGISTRY[act_fn]
    raise ValueError(f"Activation function {act_fn!r} is not supported.")
-
-
-class SiluAndMul(nn.Module):
-    """An activation function for SwiGLU.
-
-    The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[1] // 2.
-    """
-
-    def __init__(self):
-        super().__init__()
-
-    def forward(
-        self,
-        x: torch.Tensor,        # (num_tokens, 2 * d)
-    ) -> torch.Tensor:          # (num_tokens, d)
-        num_tokens = x.shape[0]
-        d = x.shape[1] // 2
-        out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device)
-        activation_ops.silu_and_mul(out, x)
-        return out
--- a/vllm/model_executor/layers/attention.py
+++ b/vllm/model_executor/layers/attention.py
@ -1,28 +1,39 @@
 """Multi-head attention."""
-from typing import Optional
+from typing import List, Optional

 import torch
 import torch.nn as nn
 from xformers import ops as xops
+from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask,
+                                         LowerTriangularMaskWithTensorBias)

 from vllm import attention_ops
 from vllm import cache_ops
 from vllm import pos_encoding_ops
 from vllm.model_executor.input_metadata import InputMetadata

-_SUPPORTED_HEAD_SIZES = [64, 80, 96, 128]
+_SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256]


 class PagedAttention(nn.Module):
+    # pylint: disable=line-too-long
    """GPT-style multi-head PagedAttention.

    This class takes flattened 1D query, key, and value tensors as input. The
-    input 1D tensors can be split into three parts: the prompt tokens, the
-    generation tokens, and the paddings.
+    input 1D tensors can either contain prompt tokens or generation tokens, in
+    addition to paddings.

-    |<------------------------------------- num_valid_tokens ------------------------------------->|
-    |<--------------- num_prompt_tokens -------------->|<------- num_generation_tokens (M) ------->|
-    |<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1-->|<--generation_0-->|...|<--generation_M-1-->|<--padding-->|
+    If the input tensors contain prompt tokens, the layout is as follows:
+
+    |<---------------------- num_valid_tokens ---------------------->|
+    |<--------------- num_prompt_tokens -------------->|
+    |<--prompt_0-->|<--prompt_1-->|...|<--prompt_N-1-->|<--padding-->|
+
+    Otherwise, the layout is as follows:
+
+    |<------------------ num_valid_tokens ------------------->|
+    |<------- num_generation_tokens (M) ------->|
+    |<--generation_0-->|...|<--generation_M-1-->|<--padding-->|

    The prompts might have different lengths, while the generation tokens always
    have length 1. The paddings are appended to make the input length a multiple
@ -41,31 +52,67 @@ class PagedAttention(nn.Module):
    5. Output a flattened 1D tensor.
    """

-    def __init__(self, num_heads: int, head_size: int, scale: float) -> None:
+    def __init__(self,
+                 num_heads: int,
+                 head_size: int,
+                 scale: float,
+                 num_kv_heads: Optional[int] = None) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
        self.attn_op = xops.fmha.cutlass.FwOp()
+        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
+
+        assert self.num_heads % self.num_kv_heads == 0
+        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
+        self.head_mapping = torch.repeat_interleave(
+            torch.arange(self.num_kv_heads, dtype=torch.int32, device="cuda"),
+            self.num_queries_per_kv)

        if self.head_size not in _SUPPORTED_HEAD_SIZES:
            raise ValueError(f"head_size ({self.head_size}) is not supported. "
                             f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.")

+    def set_attn_bias(self, input_metadata: InputMetadata) -> None:
+        if input_metadata.attn_bias:
+            # Already set by a previous layer.
+            return
+        prompt_lens = input_metadata.prompt_lens
+        attn_bias = BlockDiagonalCausalMask.from_seqlens(prompt_lens)
+        input_metadata.attn_bias.append(attn_bias)
+
    def multi_query_kv_attention(
        self,
-        output: torch.Tensor,                   # [num_prompt_tokens, num_heads, head_size]
-        query: torch.Tensor,                    # [num_prompt_tokens, num_heads, head_size]
-        key: torch.Tensor,                      # [num_prompt_tokens, num_heads, head_size]
-        value: torch.Tensor,                    # [num_prompt_tokens, num_heads, head_size]
-        attn_bias: xops.AttentionBias,
+        output: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        input_metadata: InputMetadata,
    ) -> torch.Tensor:
+        """Normal attention for the prompt tokens.
+
+        Args:
+            output: shape = [num_prompt_tokens, num_heads, head_size]
+            query: shape = [num_prompt_tokens, num_heads, head_size]
+            key: shape = [num_prompt_tokens, num_kv_heads, head_size]
+            value: shape = [num_prompt_tokens, num_kv_heads, head_size]
+            input_metadata: metadata for paged attention.
+        """
+
+        if self.num_kv_heads != self.num_heads:
+            # Project the key and value tensors to the desired number of heads.
+            key = torch.repeat_interleave(key, self.num_queries_per_kv, dim=1)
+            value = torch.repeat_interleave(value,
+                                            self.num_queries_per_kv,
+                                            dim=1)
+
        # TODO(woosuk): The unsqueeze op may incur some CPU overhead. Optimize.
        out = xops.memory_efficient_attention_forward(
            query.unsqueeze(0),
            key.unsqueeze(0),
            value.unsqueeze(0),
-            attn_bias=attn_bias,
+            attn_bias=input_metadata.attn_bias[0],
            p=0.0,
            scale=self.scale,
            op=self.attn_op,
@ -76,42 +123,72 @@ class PagedAttention(nn.Module):

    def single_query_cached_kv_attention(
        self,
-        output: torch.Tensor,           # [num_generation_tokens, num_heads, head_size]
-        query: torch.Tensor,            # [num_generation_tokens, num_heads, head_size]
-        key_cache: torch.Tensor,        # [num_blocks, num_heads, head_size/x, block_size, x]
-        value_cache: torch.Tensor,      # [num_blocks, num_heads, head_size, block_size]
+        output: torch.Tensor,
+        query: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
        input_metadata: InputMetadata,
    ) -> None:
+        """PagedAttention for the generation tokens.
+
+        Args:
+            output: shape = [num_generation_tokens, num_heads, head_size]
+            query: shape = [num_generation_tokens, num_heads, head_size]
+            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
+                block_size, x]
+            value_cache: shape = [num_blocks, num_kv_heads, head_size,
+                block_size]
+            input_metadata: metadata for paged attention.
+        """
        block_size = value_cache.shape[3]
        attention_ops.single_query_cached_kv_attention(
            output,
            query,
            key_cache,
            value_cache,
+            self.head_mapping,
            self.scale,
            input_metadata.block_tables,
            input_metadata.context_lens,
            block_size,
            input_metadata.max_context_len,
+            None,  # alibi_slopes
        )

    def forward(
        self,
-        query: torch.Tensor,                    # [num_tokens, num_heads * head_size]
-        key: torch.Tensor,                      # [num_tokens, num_heads * head_size]
-        value: torch.Tensor,                    # [num_tokens, num_heads * head_size]
-        key_cache: Optional[torch.Tensor],      # [num_blocks, num_heads, head_size/x, block_size, x]
-        value_cache: Optional[torch.Tensor],    # [num_blocks, num_heads, head_size, block_size]
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: Optional[torch.Tensor],
+        value_cache: Optional[torch.Tensor],
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:                          # [num_tokens, num_heads * head_size]
-        # NOTE: The query, key, and value tensors must be sliced from a qkv
-        # tensor of shape [num_tokens, 3 * num_heads * head_size].
+    ) -> torch.Tensor:
+        """PagedAttention forward pass.
+
+        NOTE: The query, key, and value tensors must be sliced from a qkv
+        tensor of shape [num_tokens, 3 * num_heads * head_size].
+
+        Args:
+            query: shape = [num_tokens, num_heads * head_size]
+            key: shape = [num_tokens, num_kv_heads * head_size]
+            value: shape = [num_tokens, num_kv_heads * head_size]
+            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
+                block_size, x]
+            value_cache: shape = [num_blocks, num_kv_heads, head_size,
+                block_size]
+            input_metadata: metadata for paged attention.
+            cache_event: event to wait for the cache operations to finish.
+
+        Returns:
+            shape = [num_tokens, num_heads * head_size]
+        """

        # Reshape the query, key, and value tensors.
        query = query.view(-1, self.num_heads, self.head_size)
-        key = key.view(-1, self.num_heads, self.head_size)
-        value = value.view(-1, self.num_heads, self.head_size)
+        key = key.view(-1, self.num_kv_heads, self.head_size)
+        value = value.view(-1, self.num_kv_heads, self.head_size)

        # Pre-allocate the output tensor.
        output = torch.empty_like(query)
@ -119,12 +196,15 @@ class PagedAttention(nn.Module):
        # Compute the attention op for prompts.
        num_prompt_tokens = input_metadata.num_prompt_tokens
        if num_prompt_tokens > 0:
+            # Prompt run.
+            assert input_metadata.num_generation_tokens == 0
+            self.set_attn_bias(input_metadata)
            self.multi_query_kv_attention(
                output[:num_prompt_tokens],
                query[:num_prompt_tokens],
                key[:num_prompt_tokens],
                value[:num_prompt_tokens],
-                input_metadata.attn_bias,
+                input_metadata,
            )

        # Wait until the cache op is done.
@ -136,7 +216,7 @@ class PagedAttention(nn.Module):
        # and value vectors will not be cached.
        num_valid_tokens = input_metadata.num_valid_tokens
        if (num_valid_tokens > 0 and key_cache is not None
-            and value_cache is not None):
+                and value_cache is not None):
            # The stride is 3 because the key and value are sliced from qkv.
            cache_ops.reshape_and_cache(
                key[:num_valid_tokens],
@ -147,17 +227,16 @@ class PagedAttention(nn.Module):
            )

        if input_metadata.num_generation_tokens > 0:
+            # Decoding run.
+            assert input_metadata.num_prompt_tokens == 0
            assert key_cache is not None and value_cache is not None, (
                "key_cache and value_cache must be provided when "
-                "generating tokens."
-            )
+                "generating tokens.")
            # Compute the attention op for generation tokens.
            self.single_query_cached_kv_attention(
                output[num_prompt_tokens:num_valid_tokens],
-                query[num_prompt_tokens:num_valid_tokens],
-                key_cache,
-                value_cache,
-                input_metadata)
+                query[num_prompt_tokens:num_valid_tokens], key_cache,
+                value_cache, input_metadata)

        # Reshape the output tensor.
        # NOTE(woosuk): The output tensor may include paddings.
@ -175,19 +254,21 @@ class PagedAttentionWithRoPE(PagedAttention):
        rotary_dim: int,
        max_position: int = 8192,
        base: int = 10000,
+        num_kv_heads: Optional[int] = None,
    ) -> None:
-        super().__init__(num_heads, head_size, scale)
+        super().__init__(num_heads, head_size, scale, num_kv_heads)

        # Create the cos and sin cache.
-        inv_freq = 1.0 / (base ** (torch.arange(0, rotary_dim, 2) / rotary_dim))
+        inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2) / rotary_dim))
        t = torch.arange(max_position).float()
-        freqs = torch.einsum('i,j -> ij', t, inv_freq.float())
+        freqs = torch.einsum("i,j -> ij", t, inv_freq.float())
        cos = freqs.cos()
        sin = freqs.sin()
        cache = torch.cat((cos, sin), dim=-1)

        # FIXME(woosuk): This assumes that we configure the default dtype when
-        # initializing the model. Make it more robust.
+        # initializing the model.
+        # TODO(woosuk): Make it more robust.
        torch_dtype = torch.get_default_dtype()
        cache = cache.to(torch_dtype)
        # Embedding size: [max_position, rotary_dim]
@ -195,15 +276,33 @@ class PagedAttentionWithRoPE(PagedAttention):

    def forward(
        self,
-        positions: torch.Tensor,                # [num_tokens]
-        query: torch.Tensor,                    # [num_tokens, num_heads * head_size]
-        key: torch.Tensor,                      # [num_tokens, num_heads * head_size]
-        value: torch.Tensor,                    # [num_tokens, num_heads * head_size]
-        key_cache: torch.Tensor,                # [num_blocks, num_heads, head_size/x, block_size, x]
-        value_cache: torch.Tensor,              # [num_blocks, num_heads, head_size, block_size]
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
        input_metadata: InputMetadata,
        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:                          # [num_tokens, num_heads * head_size]
+    ) -> torch.Tensor:
+        """ PagedAttention forward pass with rotary embedding.
+
+        Args:
+            positions: shape = [num_tokens]
+                        query: shape = [num_tokens, num_heads * head_size]
+            key: shape = [num_tokens, num_kv_heads * head_size]
+            value: shape = [num_tokens, num_kv_heads * head_size]
+            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
+                block_size, x]
+            value_cache: shape = [num_blocks, num_kv_heads, head_size,
+                block_size]
+            input_metadata: metadata for paged attention.
+            cache_event: event to wait for the cache operations to finish.
+
+        Returns:
+            shape = [num_tokens, num_heads * head_size]
+        """
+
        # Apply rotary embedding to the query and key before passing them
        # to the attention op.
        pos_encoding_ops.rotary_embedding_neox(
@ -222,3 +321,126 @@ class PagedAttentionWithRoPE(PagedAttention):
            input_metadata,
            cache_event,
        )
+
+
+class PagedAttentionWithALiBi(PagedAttention):
+    """PagedAttention with ALiBi attention bias."""
+
+    def __init__(self,
+                 num_heads: int,
+                 head_size: int,
+                 scale: float,
+                 slopes: List[float],
+                 num_kv_heads: Optional[int] = None) -> None:
+        super().__init__(num_heads, head_size, scale, num_kv_heads)
+        assert len(slopes) == num_heads
+
+        slopes = torch.tensor(slopes, dtype=torch.float32)
+        self.register_buffer("alibi_slopes", slopes, persistent=False)
+
+    def set_attn_bias(self, input_metadata: InputMetadata) -> None:
+        if input_metadata.attn_bias:
+            # Already set by a previous layer.
+            return
+        # Generates ALiBi mask for each prompt.
+        for prompt_len in input_metadata.prompt_lens:
+            bias = torch.arange(prompt_len)
+            # Note(zhuohan): HF uses
+            #     `bias = bias[None, :].repeat(prompt_len, 1)`
+            # here. We find that both biases give the same results, but
+            # the bias below more accurately follows the original ALiBi
+            # paper.
+            bias = bias[None, :] - bias[:, None]
+            bias = bias.to(self.alibi_slopes.device)
+
+            # When using custom attention bias, xformers requires the bias to
+            # be sliced from a tensor whose length is a multiple of 8.
+            padded_len = (prompt_len + 7) // 8 * 8
+            bias = torch.empty(
+                1,  # batch_size
+                self.num_heads,
+                prompt_len,
+                padded_len,
+                device=self.alibi_slopes.device,
+            )[:, :, :, :prompt_len].copy_(bias)
+            bias.mul_(self.alibi_slopes[:, None, None])
+            attn_bias = LowerTriangularMaskWithTensorBias(bias)
+            input_metadata.attn_bias.append(attn_bias)
+
+    def multi_query_kv_attention(
+        self,
+        output: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        input_metadata: InputMetadata,
+    ) -> torch.Tensor:
+        """Attention with ALiBi bias for the prompt tokens.
+
+        Args:
+            output: shape = [num_prompt_tokens, num_heads, head_size]
+            query: shape = [num_prompt_tokens, num_heads, head_size]
+            key: shape = [num_prompt_tokens, num_kv_heads, head_size]
+            value: shape = [num_prompt_tokens, num_kv_heads, head_size]
+            input_metadata: metadata for paged attention.
+        """
+        if self.num_kv_heads != self.num_heads:
+            # Project the key and value tensors to the desired number of heads.
+            key = torch.repeat_interleave(key, self.num_queries_per_kv, dim=1)
+            value = torch.repeat_interleave(value,
+                                            self.num_queries_per_kv,
+                                            dim=1)
+
+        # FIXME(woosuk): Because xformers does not support dynamic sequence
+        # lengths with custom attention bias, we process each prompt one by
+        # one. This is inefficient, especially when we have many short prompts.
+        start = 0
+        for i, prompt_len in enumerate(input_metadata.prompt_lens):
+            end = start + prompt_len
+            out = xops.memory_efficient_attention_forward(
+                query[None, start:end],
+                key[None, start:end],
+                value[None, start:end],
+                attn_bias=input_metadata.attn_bias[i],
+                p=0.0,
+                scale=self.scale,
+                op=self.attn_op,
+            )
+            # TODO(woosuk): Unnecessary copy. Optimize.
+            output[start:end].copy_(out.squeeze(0))
+            start += prompt_len
+        return output
+
+    def single_query_cached_kv_attention(
+        self,
+        output: torch.Tensor,
+        query: torch.Tensor,
+        key_cache: torch.Tensor,
+        value_cache: torch.Tensor,
+        input_metadata: InputMetadata,
+    ) -> None:
+        """PagedAttention with ALiBi bias for the generation tokens.
+
+        Args:
+            output: shape = [num_generation_tokens, num_heads, head_size]
+            query: shape = [num_generation_tokens, num_heads, head_size]
+            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
+                block_size, x]
+            value_cache: shape = [num_blocks, num_kv_heads, head_size,
+                block_size]
+            input_metadata: metadata for paged attention.
+        """
+        block_size = value_cache.shape[3]
+        attention_ops.single_query_cached_kv_attention(
+            output,
+            query,
+            key_cache,
+            value_cache,
+            self.head_mapping,
+            self.scale,
+            input_metadata.block_tables,
+            input_metadata.context_lens,
+            block_size,
+            input_metadata.max_context_len,
+            self.alibi_slopes,
+        )
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@ -11,6 +11,8 @@ from vllm.model_executor.parallel_utils.tensor_parallel import (
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import SequenceOutputs

+_SAMPLING_EPS = 1e-5
+

 class Sampler(nn.Module):
    """Samples the next tokens from the model's outputs.
@ -36,12 +38,15 @@ class Sampler(nn.Module):
        embedding: torch.Tensor,
        hidden_states: torch.Tensor,
        input_metadata: InputMetadata,
+        embedding_bias: Optional[torch.Tensor] = None,
    ) -> Dict[int, SequenceOutputs]:
        # Get the hidden states that we use for sampling.
        hidden_states = _prune_hidden_states(hidden_states, input_metadata)

        # Get the logits for the next tokens.
        logits = torch.matmul(hidden_states, embedding.t())
+        if embedding_bias is not None:
+            logits += embedding_bias
        logits = gather_from_tensor_model_parallel_region(logits)
        # Remove paddings in vocab (if any).
        logits = logits[:, :self.vocab_size]
@ -49,34 +54,37 @@ class Sampler(nn.Module):
        # Apply presence and frequency penalties.
        output_tokens = _get_output_tokens(input_metadata)
        assert len(output_tokens) == logits.shape[0]
-        presence_penalties, frequency_penalties = _get_penalties(input_metadata)
+        presence_penalties, frequency_penalties = _get_penalties(
+            input_metadata)
        assert len(presence_penalties) == logits.shape[0]
        assert len(frequency_penalties) == logits.shape[0]
-        logits = _apply_penalties(
-            logits, output_tokens, presence_penalties, frequency_penalties,
-            self.vocab_size)
+        logits = _apply_penalties(logits, output_tokens, presence_penalties,
+                                  frequency_penalties, self.vocab_size)

        # Apply temperature scaling.
        temperatures = _get_temperatures(input_metadata)
        assert len(temperatures) == logits.shape[0]
        if any(t != 1.0 for t in temperatures):
-            t = torch.tensor(
-                temperatures, dtype=logits.dtype, device=logits.device)
+            t = torch.tensor(temperatures,
+                             dtype=logits.dtype,
+                             device=logits.device)
            # Use in-place division to avoid creating a new tensor.
            logits.div_(t.unsqueeze(dim=1))

+        # Apply top-p and top-k truncation.
+        top_ps, top_ks = _get_top_p_top_k(input_metadata, self.vocab_size)
+        assert len(top_ps) == len(top_ks) == logits.shape[0]
+        do_top_p = any(p < 1.0 - _SAMPLING_EPS for p in top_ps)
+        do_top_k = any(k != self.vocab_size for k in top_ks)
+        if do_top_p or do_top_k:
+            logits = _apply_top_p_top_k(logits, top_ps, top_ks)
+
        # We use float32 for probabilities and log probabilities.
        # Compute the probabilities.
        probs = torch.softmax(logits, dim=-1, dtype=torch.float)
        # Compute the log probabilities (before applying top-p and top-k).
        logprobs = torch.log(probs)

-        # Apply top-p and top-k truncation.
-        top_ps, top_ks = _get_top_p_top_k(input_metadata, self.vocab_size)
-        assert len(top_ps) == len(top_ks) == probs.shape[0]
-        if any(p < 1.0 for p in top_ps) or any(k != self.vocab_size for k in top_ks):
-            probs = _apply_top_p_top_k(probs, top_ps, top_ks)
-
        # Sample the next tokens.
        return _sample(probs, logprobs, input_metadata)

@ -96,8 +104,7 @@ def _prune_hidden_states(


 def _get_penalties(
-    input_metadata: InputMetadata,
-) -> Tuple[List[float], List[float]]:
+        input_metadata: InputMetadata) -> Tuple[List[float], List[float]]:
    # Collect the presence and frequency penalties.
    presence_penalties: List[float] = []
    frequency_penalties: List[float] = []
@ -116,9 +123,7 @@ def _get_penalties(
    return presence_penalties, frequency_penalties


-def _get_output_tokens(
-    input_metadata: InputMetadata,
-) -> List[List[int]]:
+def _get_output_tokens(input_metadata: InputMetadata) -> List[List[int]]:
    output_tokens: List[List[int]] = []
    for i, seq_group in enumerate(input_metadata.seq_groups):
        seq_ids, _ = seq_group
@ -152,7 +157,7 @@ def _apply_penalties(
            continue
        p = presence_penalties[i]
        f = frequency_penalties[i]
-        if p == 0.0 and f == 0.0:
+        if p < _SAMPLING_EPS and f < _SAMPLING_EPS:
            continue
        indices.append(i)

@ -168,11 +173,13 @@ def _apply_penalties(
                                                 device=logits.device)

    frequency_penalties = [frequency_penalties[i] for i in indices]
-    frequency_penalties = torch.tensor(
-        frequency_penalties, dtype=logits.dtype, device=logits.device)
+    frequency_penalties = torch.tensor(frequency_penalties,
+                                       dtype=logits.dtype,
+                                       device=logits.device)
    presence_penalties = [presence_penalties[i] for i in indices]
-    presence_penalties = torch.tensor(
-        presence_penalties, dtype=logits.dtype, device=logits.device)
+    presence_penalties = torch.tensor(presence_penalties,
+                                      dtype=logits.dtype,
+                                      device=logits.device)

    # We follow the definition in OpenAI API.
    # Refer to https://platform.openai.com/docs/api-reference/parameter-details
@ -182,15 +189,13 @@ def _apply_penalties(
    return logits


-def _get_temperatures(
-    input_metadata: InputMetadata,
-) -> List[float]:
+def _get_temperatures(input_metadata: InputMetadata) -> List[float]:
    # Collect the temperatures for the logits.
    temperatures: List[float] = []
    for i, seq_group in enumerate(input_metadata.seq_groups):
        seq_ids, sampling_params = seq_group
        temperature = sampling_params.temperature
-        if temperature == 0.0:
+        if temperature < _SAMPLING_EPS:
            # NOTE: Zero temperature means deterministic sampling
            # (i.e., greedy sampling or beam search).
            # Set the temperature to 1 to avoid division by zero.
@ -230,30 +235,32 @@ def _get_top_p_top_k(


 def _apply_top_p_top_k(
-    probs: torch.Tensor,
+    logits: torch.Tensor,
    top_ps: List[float],
    top_ks: List[int],
 ) -> torch.Tensor:
-    p = torch.tensor(top_ps, dtype=probs.dtype, device=probs.device)
-    k = torch.tensor(top_ks, dtype=torch.int, device=probs.device)
-    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
+    p = torch.tensor(top_ps, dtype=logits.dtype, device=logits.device)
+    k = torch.tensor(top_ks, dtype=torch.int, device=logits.device)
+    logits_sort, logits_idx = logits.sort(dim=-1, descending=True)

    # Apply top-p.
-    probs_sum = torch.cumsum(probs_sort, dim=-1)
+    probs_sort = logits_sort.softmax(dim=-1)
+    probs_sum = probs_sort.cumsum(dim=-1)
    top_p_mask = (probs_sum - probs_sort) > p.unsqueeze(dim=1)
-    probs_sort[top_p_mask] = 0.0
+    logits_sort[top_p_mask] = -float("inf")

    # Apply top-k.
    # Create a mask for the top-k elements.
-    top_k_mask = torch.arange(probs_idx.shape[-1], device=probs_idx.device)
-    top_k_mask = top_k_mask.expand(probs_idx.shape[0], -1)
+    top_k_mask = torch.arange(logits_idx.shape[-1], device=logits_idx.device)
+    top_k_mask = top_k_mask.expand(logits_idx.shape[0], -1)
    top_k_mask = top_k_mask >= k.unsqueeze(dim=1)
-    probs_sort[top_k_mask] = 0.0
+    logits_sort[top_k_mask] = -float("inf")

    # Re-sort the probabilities.
-    probs = torch.gather(
-        probs_sort, dim=-1, index=torch.argsort(probs_idx, dim=-1))
-    return probs
+    logits = torch.gather(logits_sort,
+                          dim=-1,
+                          index=torch.argsort(logits_idx, dim=-1))
+    return logits


 def _get_topk_logprobs(
@ -286,7 +293,7 @@ def _sample_from_prompt(
        beam_width = sampling_params.best_of
        _, next_token_ids = torch.topk(prob, beam_width)
        next_token_ids = next_token_ids.tolist()
-    elif sampling_params.temperature == 0.0:
+    elif sampling_params.temperature < _SAMPLING_EPS:
        # Greedy sampling.
        assert sampling_params.best_of == 1
        next_token_id = torch.argmax(prob)
@ -295,8 +302,9 @@ def _sample_from_prompt(
        # Random sampling.
        # Sample `best_of` tokens for the prompt.
        num_seqs = sampling_params.best_of
-        next_token_ids = torch.multinomial(
-            prob, num_samples=num_seqs, replacement=True)
+        next_token_ids = torch.multinomial(prob,
+                                           num_samples=num_seqs,
+                                           replacement=True)
        next_token_ids = next_token_ids.tolist()
    return next_token_ids

@ -314,8 +322,9 @@ def _sample_from_generation_tokens(
    if sampling_params.use_beam_search:
        # Beam search.
        # Add cumulative logprobs for the sequences in the group.
-        seq_logprobs = torch.tensor(
-            seq_logprobs, dtype=torch.float, device=logprobs.device)
+        seq_logprobs = torch.tensor(seq_logprobs,
+                                    dtype=torch.float,
+                                    device=logprobs.device)
        logprobs = logprobs + seq_logprobs.unsqueeze(dim=1)

        vocab_size = logprobs.size(-1)
@ -343,7 +352,7 @@ def _sample_from_generation_tokens(

        parent_seq_ids = [beam_outputs[seq_id][0] for seq_id in seq_ids]
        next_token_ids = [beam_outputs[seq_id][1] for seq_id in seq_ids]
-    elif sampling_params.temperature == 0.0:
+    elif sampling_params.temperature < _SAMPLING_EPS:
        # Greedy sampling.
        assert len(seq_ids) == 1
        next_token_id = torch.argmax(probs, dim=-1)
@ -352,8 +361,9 @@ def _sample_from_generation_tokens(
    else:
        # Random sampling.
        # Sample 1 token for each sequence in the group.
-        next_token_ids = torch.multinomial(
-            probs, num_samples=1, replacement=True)
+        next_token_ids = torch.multinomial(probs,
+                                           num_samples=1,
+                                           replacement=True)
        next_token_ids = next_token_ids.squeeze(dim=-1).tolist()
        parent_seq_ids = seq_ids
    return parent_seq_ids, next_token_ids
@ -380,15 +390,16 @@ def _sample(
            # Sample the next tokens.
            next_token_ids = _sample_from_prompt(prob, sampling_params)
            # Get top-k log probabilities for the next tokens.
-            next_logprobs = _get_topk_logprobs(
-                logprob, sampling_params.logprobs)
+            next_logprobs = _get_topk_logprobs(logprob,
+                                               sampling_params.logprobs)

            # Build the output.
            for seq_id, next_token_id in zip(seq_ids, next_token_ids):
                output_logprobs = next_logprobs.copy()
                output_logprobs[next_token_id] = logprob[next_token_id].item()
-                seq_outputs[seq_id] = SequenceOutputs(
-                    seq_id, seq_id, next_token_id, output_logprobs)
+                seq_outputs[seq_id] = SequenceOutputs(seq_id, seq_id,
+                                                      next_token_id,
+                                                      output_logprobs)
        else:
            # Generate the next tokens for generation tokens.
            prob = probs[idx:idx + len(seq_ids)]
@ -398,22 +409,24 @@ def _sample(
            # Sample the next tokens.
            seq_logprobs = [
                input_metadata.seq_data[seq_id].cumulative_logprob
-                for seq_id in seq_ids]
+                for seq_id in seq_ids
+            ]
            parent_seq_ids, next_token_ids = _sample_from_generation_tokens(
                seq_ids, prob, logprob, seq_logprobs, sampling_params)

            # Get top-k log probabilities for the next tokens.
            next_logprobs: Dict[int, Dict[int, float]] = {}
-            for i, seq_id in enumerate(seq_ids):
+            for j, seq_id in enumerate(seq_ids):
                next_logprobs[seq_id] = _get_topk_logprobs(
-                    logprob[i], sampling_params.logprobs)
+                    logprob[j], sampling_params.logprobs)

            # Build the output.
            for seq_id, parent_seq_id, next_token_id in zip(
-                seq_ids, parent_seq_ids, next_token_ids):
-                i = seq_ids.index(parent_seq_id)
+                    seq_ids, parent_seq_ids, next_token_ids):
+                j = seq_ids.index(parent_seq_id)
                output_logprobs = next_logprobs[parent_seq_id].copy()
-                output_logprobs[next_token_id] = logprob[i, next_token_id].item()
+                output_logprobs[next_token_id] = logprob[j,
+                                                         next_token_id].item()
                seq_outputs[seq_id] = SequenceOutputs(
                    seq_id,
                    parent_seq_id,
--- a/vllm/model_executor/model_loader.py
+++ b/vllm/model_executor/model_loader.py
@ -6,16 +6,27 @@ import torch.nn as nn
 from transformers import PretrainedConfig

 from vllm.config import ModelConfig
-from vllm.model_executor.models import (GPT2LMHeadModel, GPTNeoXForCausalLM,
-                                        LlamaForCausalLM, OPTForCausalLM)
+from vllm.model_executor.models import *  # pylint: disable=wildcard-import
 from vllm.model_executor.weight_utils import initialize_dummy_weights

 # TODO(woosuk): Lazy-load the model classes.
 _MODEL_REGISTRY = {
+    "AquilaModel": AquilaForCausalLM,
+    "BaiChuanForCausalLM": BaiChuanForCausalLM,  # baichuan-7b
+    "BaichuanForCausalLM": BaichuanForCausalLM,  # baichuan-13b
+    "BloomForCausalLM": BloomForCausalLM,
+    "FalconForCausalLM": FalconForCausalLM,
    "GPT2LMHeadModel": GPT2LMHeadModel,
+    "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM,
+    "GPTJForCausalLM": GPTJForCausalLM,
    "GPTNeoXForCausalLM": GPTNeoXForCausalLM,
+    "InternLMForCausalLM": InternLMForCausalLM,
    "LlamaForCausalLM": LlamaForCausalLM,
+    "LLaMAForCausalLM": LlamaForCausalLM,  # For decapoda-research/llama-*
+    "MPTForCausalLM": MPTForCausalLM,
    "OPTForCausalLM": OPTForCausalLM,
+    "QWenLMHeadModel": QWenLMHeadModel,
+    "RWForCausalLM": FalconForCausalLM,
 }


@ -26,8 +37,7 @@ def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
            return _MODEL_REGISTRY[arch]
    raise ValueError(
        f"Model architectures {architectures} are not supported for now. "
-        f"Supported architectures: {list(_MODEL_REGISTRY.keys())}"
-    )
+        f"Supported architectures: {list(_MODEL_REGISTRY.keys())}")


 def get_model(model_config: ModelConfig) -> nn.Module:
@ -44,8 +54,7 @@ def get_model(model_config: ModelConfig) -> nn.Module:
        initialize_dummy_weights(model)
    else:
        # Load the weights from the cached or downloaded files.
-        model.load_weights(
-            model_config.model, model_config.download_dir,
-            model_config.use_np_weights)
+        model.load_weights(model_config.model, model_config.download_dir,
+                           model_config.use_np_weights)
        model = model.cuda()
    return model.eval()
--- a/vllm/model_executor/models/init.py
+++ b/vllm/model_executor/models/init.py
@ -1,12 +1,31 @@
-from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM
+from vllm.model_executor.models.aquila import AquilaForCausalLM
+from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM,
+                                                 BaichuanForCausalLM)
+from vllm.model_executor.models.bloom import BloomForCausalLM
+from vllm.model_executor.models.falcon import FalconForCausalLM
 from vllm.model_executor.models.gpt2 import GPT2LMHeadModel
+from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM
+from vllm.model_executor.models.gpt_j import GPTJForCausalLM
+from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM
+from vllm.model_executor.models.internlm import InternLMForCausalLM
 from vllm.model_executor.models.llama import LlamaForCausalLM
+from vllm.model_executor.models.mpt import MPTForCausalLM
 from vllm.model_executor.models.opt import OPTForCausalLM
-
+from vllm.model_executor.models.qwen import QWenLMHeadModel

 __all__ = [
+    "AquilaForCausalLM",
+    "BaiChuanForCausalLM",
+    "BaichuanForCausalLM",
+    "BloomForCausalLM",
+    "FalconForCausalLM",
    "GPT2LMHeadModel",
+    "GPTBigCodeForCausalLM",
+    "GPTJForCausalLM",
    "GPTNeoXForCausalLM",
+    "InternLMForCausalLM",
    "LlamaForCausalLM",
+    "MPTForCausalLM",
    "OPTForCausalLM",
+    "QWenLMHeadModel",
 ]
--- a/vllm/model_executor/models/aquila.py
+++ b/vllm/model_executor/models/aquila.py
@ -0,0 +1,362 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only LLaMA model compatible with HuggingFace weights.
+
+The input of the model is flattened to a 1D tensor of tokens. The model uses
+InputMetadata to extract the original 2D shape of the input.
+"""
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
+                                              load_tensor_parallel_weights)
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
+from vllm.sequence import SequenceOutputs
+from vllm.transformers_utils.configs.aquila import AquilaConfig
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class AquilaMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+    ):
+        super().__init__()
+        self.gate_up_proj = ColumnParallelLinear(hidden_size,
+                                                 2 * intermediate_size,
+                                                 bias=False,
+                                                 gather_output=False,
+                                                 perform_initialization=False)
+        self.down_proj = RowParallelLinear(intermediate_size,
+                                           hidden_size,
+                                           bias=False,
+                                           input_is_parallel=True,
+                                           perform_initialization=False)
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. "
+                             "Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class AquilaRMSNorm(nn.Module):
+
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        AquilaRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        variance = hidden_states.to(torch.float32).pow(2).mean(-1,
+                                                               keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance +
+                                                    self.variance_epsilon)
+
+        return (self.weight * hidden_states).to(input_dtype)
+
+
+class AquilaAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        num_kv_heads: int,
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        assert self.total_num_kv_heads % tp_size == 0
+        self.num_kv_heads = self.total_num_kv_heads // tp_size
+        self.head_dim = hidden_size // self.total_num_heads
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5
+
+        self.qkv_proj = ColumnParallelLinear(
+            hidden_size,
+            (self.total_num_heads + 2 * self.total_num_kv_heads) *
+            self.head_dim,
+            bias=False,
+            gather_output=False,
+            perform_initialization=False,
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            perform_initialization=False,
+        )
+        self.attn = PagedAttentionWithRoPE(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            rotary_dim=self.head_dim,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
+                                input_metadata, cache_event)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class AquilaDecoderLayer(nn.Module):
+
+    def __init__(self, config: AquilaConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = AquilaAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_attention_heads,
+        )
+        self.mlp = AquilaMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+        )
+        self.input_layernorm = AquilaRMSNorm(config.hidden_size,
+                                             eps=config.rms_norm_eps)
+        self.post_attention_layernorm = AquilaRMSNorm(config.hidden_size,
+                                                      eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+            cache_event=cache_event,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class AquilaModel(nn.Module):
+
+    def __init__(self, config: AquilaConfig):
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        #vocab_size = ((config.vocab_size + 63) // 64) * 64
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            perform_initialization=False)
+        self.layers = nn.ModuleList([
+            AquilaDecoderLayer(config) for _ in range(config.num_hidden_layers)
+        ])
+        self.norm = AquilaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        for i in range(len(self.layers)):
+            if cache_events is None:
+                cache_event = None
+            else:
+                cache_event = cache_events[i]
+            layer = self.layers[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+                cache_event,
+            )
+        hidden_states = self.norm(hidden_states)
+
+        return hidden_states
+
+
+class AquilaForCausalLM(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.model = AquilaModel(config)
+        vocab_size = ((config.vocab_size + 63) // 64) * 64
+        self.lm_head = ColumnParallelLinear(config.hidden_size,
+                                            vocab_size,
+                                            bias=False,
+                                            gather_output=False,
+                                            perform_initialization=False)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> Dict[int, SequenceOutputs]:
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   input_metadata, cache_events)
+        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
+                                   input_metadata)
+        return next_tokens
+
+    _column_parallel_weights = [
+        "embed_tokens.weight", "lm_head.weight", "qkv_proj.weight",
+        "gate_proj.weight", "up_proj.weight"
+    ]
+    _row_parallel_weights = ["o_proj.weight", "down_proj.weight"]
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     use_np_cache: bool = False):
+        tp_size = get_tensor_model_parallel_world_size()
+        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
+        q_proj_shard_size = (self.config.hidden_size // tp_size)
+        kv_proj_shard_size = (self.config.hidden_size //
+                              self.config.num_attention_heads *
+                              self.config.num_attention_heads // tp_size)
+        attention_weight_specs = [
+            # (weight_name, shard_size, offset)
+            ("q_proj", q_proj_shard_size, 0),
+            ("k_proj", kv_proj_shard_size, q_proj_shard_size),
+            ("v_proj", kv_proj_shard_size,
+             q_proj_shard_size + kv_proj_shard_size),
+        ]
+        state_dict = self.state_dict()
+
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, use_np_cache):
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if "embed_tokens" in name or "lm_head" in name:
+                param = state_dict[name]
+                # Consider padding in the vocab size.
+                padded_vocab_size = (param.shape[0] * tp_size)
+                num_extra_rows = padded_vocab_size - self.config.vocab_size
+                extra_rows = torch.empty(num_extra_rows,
+                                         loaded_weight.shape[1])
+                extra_rows = extra_rows.to(loaded_weight)
+                loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)
+
+            is_attention_weight = False
+            for weight_name, shard_size, offset in attention_weight_specs:
+                if weight_name not in name:
+                    continue
+                param = state_dict[name.replace(weight_name, "qkv_proj")]
+
+                loaded_weight = loaded_weight[
+                    shard_size * tensor_model_parallel_rank:shard_size *
+                    (tensor_model_parallel_rank + 1)]
+                param_slice = param.data[offset:offset + shard_size]
+                assert param_slice.shape == loaded_weight.shape
+
+                param_slice.copy_(loaded_weight)
+                is_attention_weight = True
+                break
+            if is_attention_weight:
+                continue
+
+            is_gate_up_weight = False
+            for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
+                if weight_name not in name:
+                    continue
+                param = state_dict[name.replace(weight_name, "gate_up_proj")]
+                shard_size = param.shape[0] // 2
+                loaded_weight = loaded_weight[
+                    shard_size * tensor_model_parallel_rank:shard_size *
+                    (tensor_model_parallel_rank + 1)]
+                param_slice = param.data[shard_size * stride_id:shard_size *
+                                         (stride_id + 1)]
+                assert param_slice.shape == loaded_weight.shape
+                param_slice.copy_(loaded_weight)
+                is_gate_up_weight = True
+                break
+            if is_gate_up_weight:
+                continue
+
+            param = state_dict[name]
+            load_tensor_parallel_weights(param, loaded_weight, name,
+                                         self._column_parallel_weights,
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@ -0,0 +1,377 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only BaiChuan model compatible with HuggingFace weights.
+
+The input of the model is flattened to a 1D tensor of tokens. The model uses
+InputMetadata to extract the original 2D shape of the input.
+"""
+import math
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+
+from vllm.sequence import SequenceOutputs
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.attention import PagedAttentionWithRoPE, PagedAttentionWithALiBi
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
+                                              load_tensor_parallel_weights)
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
+from vllm.transformers_utils.configs.baichuan import BaiChuanConfig
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
+    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
+    base = torch.tensor(
+        2**(-(2**-(math.log2(closest_power_of_2) - 3))),
+        dtype=torch.float32,
+    )
+    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
+    slopes = torch.pow(base, powers)
+
+    if closest_power_of_2 != total_num_heads:
+        extra_base = torch.tensor(
+            2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
+            dtype=torch.float32,
+        )
+        num_remaining_heads = min(closest_power_of_2,
+                                  total_num_heads - closest_power_of_2)
+        extra_powers = torch.arange(start=1,
+                                    end=1 + 2 * num_remaining_heads,
+                                    step=2,
+                                    dtype=torch.int32)
+        slopes = torch.cat(
+            [slopes, torch.pow(extra_base, extra_powers)], dim=0)
+    return slopes
+
+
+class BaiChuanMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+    ):
+        super().__init__()
+        self.gate_up_proj = ColumnParallelLinear(hidden_size,
+                                                 2 * intermediate_size,
+                                                 bias=False,
+                                                 gather_output=False,
+                                                 perform_initialization=False)
+        self.down_proj = RowParallelLinear(intermediate_size,
+                                           hidden_size,
+                                           bias=False,
+                                           input_is_parallel=True,
+                                           perform_initialization=False)
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. "
+                             "Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class BaiChuanAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        position_embedding: str,
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size(
+        )
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = (self.total_num_heads //
+                          tensor_model_parallel_world_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.postion_embedding = position_embedding
+
+        # pylint: disable=invalid-name
+        self.W_pack = ColumnParallelLinear(
+            hidden_size,
+            3 * hidden_size,
+            bias=False,
+            gather_output=False,
+            perform_initialization=False,
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            perform_initialization=False,
+        )
+        # Create the alibi slopes and slice them.
+        if self.postion_embedding == "ALIBI":
+            tp_rank = get_tensor_model_parallel_rank()
+            head_start = tp_rank * self.num_heads
+            head_end = (tp_rank + 1) * self.num_heads
+            alibi_slopes = _get_alibi_slopes(self.total_num_heads)
+            alibi_slopes = alibi_slopes[head_start:head_end].tolist()
+
+            scaling = self.head_dim**-0.5
+            self.attn = PagedAttentionWithALiBi(self.num_heads, self.head_dim,
+                                                scaling, alibi_slopes)
+        else:
+            self.scaling = self.head_dim**-0.5
+            self.attn = PagedAttentionWithRoPE(self.num_heads,
+                                               self.head_dim,
+                                               self.scaling,
+                                               rotary_dim=self.head_dim)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        qkv, _ = self.W_pack(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        k_cache, v_cache = kv_cache
+        if self.postion_embedding == "ALIBI":
+            attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                    cache_event)
+        else:
+            attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
+                                    input_metadata, cache_event)
+
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class BaiChuanDecoderLayer(nn.Module):
+
+    def __init__(self, config: BaiChuanConfig, position_embedding: str):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = BaiChuanAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+            position_embedding=position_embedding,
+        )
+        self.mlp = BaiChuanMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+            cache_event=cache_event,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class BaiChuanModel(nn.Module):
+
+    def __init__(self, config: BaiChuanConfig, position_embedding: str):
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+            perform_initialization=False)
+        self.layers = nn.ModuleList([
+            BaiChuanDecoderLayer(config, position_embedding)
+            for _ in range(config.num_hidden_layers)
+        ])
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        for i in range(len(self.layers)):
+            if cache_events is None:
+                cache_event = None
+            else:
+                cache_event = cache_events[i]
+            layer = self.layers[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+                cache_event,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class BaiChuanBaseForCausalLM(nn.Module):
+
+    def __init__(self, config, position_embedding: str):
+        super().__init__()
+        self.config = config
+        self.model = BaiChuanModel(config, position_embedding)
+        self.lm_head = ColumnParallelLinear(config.hidden_size,
+                                            config.vocab_size,
+                                            bias=False,
+                                            gather_output=False,
+                                            perform_initialization=False)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> Dict[int, SequenceOutputs]:
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   input_metadata, cache_events)
+        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
+                                   input_metadata)
+        return next_tokens
+
+    _column_parallel_weights = [
+        "embed_tokens.weight",
+        "lm_head.weight",
+    ]
+    _row_parallel_weights = ["o_proj.weight", "down_proj.weight"]
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     use_np_cache: bool = False):
+        tp_world_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        state_dict = self.state_dict()
+
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, use_np_cache):
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if "embed_tokens" in name or "lm_head" in name:
+                # Consider padding in the vocab size.
+                param = state_dict[name]
+                padded_vocab_size = param.shape[0] * tp_world_size
+                num_extra_rows = padded_vocab_size - self.config.vocab_size
+                extra_rows = torch.empty(num_extra_rows,
+                                         loaded_weight.shape[1])
+                extra_rows = extra_rows.to(loaded_weight)
+                loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)
+
+            if "W_pack" in name:
+                total_num_heads = self.config.num_attention_heads
+                hidden_size = self.config.hidden_size
+                head_size = hidden_size // total_num_heads
+                num_heads = total_num_heads // tp_world_size
+                head_start = tp_rank * num_heads
+                head_end = (tp_rank + 1) * num_heads
+
+                loaded_weight = loaded_weight.view(3, total_num_heads,
+                                                   head_size, hidden_size)
+                loaded_weight = loaded_weight[:, head_start:head_end, :, :]
+                loaded_weight = loaded_weight.reshape(-1, hidden_size)
+
+            is_gate_up_weight = False
+            for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
+                if weight_name not in name:
+                    continue
+                param = state_dict[name.replace(weight_name, "gate_up_proj")]
+                shard_size = param.shape[0] // 2
+                loaded_weight = loaded_weight[shard_size * tp_rank:shard_size *
+                                              (tp_rank + 1)]
+                param_slice = param.data[shard_size * stride_id:shard_size *
+                                         (stride_id + 1)]
+                assert param_slice.shape == loaded_weight.shape
+                param_slice.copy_(loaded_weight)
+                is_gate_up_weight = True
+                break
+            if is_gate_up_weight:
+                continue
+
+            param = state_dict[name]
+            load_tensor_parallel_weights(
+                param,
+                loaded_weight,
+                name,
+                self._column_parallel_weights,
+                self._row_parallel_weights,
+                tp_rank,
+            )
+
+
+class BaichuanForCausalLM(BaiChuanBaseForCausalLM):  # baichuan 13b
+
+    def __init__(self, config):
+        super().__init__(config, "ALIBI")
+
+
+class BaiChuanForCausalLM(BaiChuanBaseForCausalLM):  # baichuan 7b
+
+    def __init__(self, config):
+        super().__init__(config, "ROPE")
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@ -0,0 +1,324 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/bloom/modeling_bloom.py
+# Copyright 2023 The CacheFlow team.
+# Copyright 2022 HuggingFace Inc. team and BigScience workshop.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only BLOOM model compatible with HuggingFace weights.
+
+The input of the model is flattened to a 1D tensor of tokens. The model uses
+InputMetadata to extract the original 2D shape of the input.
+"""
+import math
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import BloomConfig
+
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import PagedAttentionWithALiBi
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
+                                              load_tensor_parallel_weights)
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
+from vllm.sequence import SequenceOutputs
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
+    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
+    base = torch.tensor(
+        2**(-(2**-(math.log2(closest_power_of_2) - 3))),
+        dtype=torch.float32,
+    )
+    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
+    slopes = torch.pow(base, powers)
+
+    if closest_power_of_2 != total_num_heads:
+        extra_base = torch.tensor(
+            2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
+            dtype=torch.float32,
+        )
+        num_remaining_heads = min(closest_power_of_2,
+                                  total_num_heads - closest_power_of_2)
+        extra_powers = torch.arange(start=1,
+                                    end=1 + 2 * num_remaining_heads,
+                                    step=2,
+                                    dtype=torch.int32)
+        slopes = torch.cat(
+            [slopes, torch.pow(extra_base, extra_powers)], dim=0)
+    return slopes
+
+
+class BloomAttention(nn.Module):
+
+    def __init__(self, config: BloomConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.total_num_heads = config.n_head
+        self.head_dim = self.hidden_size // self.total_num_heads
+        assert self.head_dim * self.total_num_heads == self.hidden_size
+
+        tp_world_size = get_tensor_model_parallel_world_size()
+        assert self.total_num_heads % tp_world_size == 0
+        self.num_heads = self.total_num_heads // tp_world_size
+
+        self.query_key_value = ColumnParallelLinear(
+            self.hidden_size,
+            3 * self.hidden_size,
+            bias=True,
+            gather_output=False,
+            perform_initialization=False,
+        )
+        self.dense = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=True,
+            input_is_parallel=True,
+            perform_initialization=False,
+        )
+
+        # Create the alibi slopes and slice them.
+        tp_rank = get_tensor_model_parallel_rank()
+        head_start = tp_rank * self.num_heads
+        head_end = (tp_rank + 1) * self.num_heads
+        alibi_slopes = _get_alibi_slopes(self.total_num_heads)
+        alibi_slopes = alibi_slopes[head_start:head_end].tolist()
+
+        scaling = self.head_dim**-0.5
+        self.attn = PagedAttentionWithALiBi(self.num_heads, self.head_dim,
+                                            scaling, alibi_slopes)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        del position_ids  # Unused.
+        qkv, _ = self.query_key_value(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                cache_event)
+        output, _ = self.dense(attn_output)
+        return output
+
+
+class BloomMLP(nn.Module):
+
+    def __init__(self, config: BloomConfig):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.dense_h_to_4h = ColumnParallelLinear(hidden_size,
+                                                  4 * hidden_size,
+                                                  gather_output=False,
+                                                  perform_initialization=False)
+        self.act = get_act_fn("gelu")
+        self.dense_4h_to_h = RowParallelLinear(4 * hidden_size,
+                                               hidden_size,
+                                               input_is_parallel=True,
+                                               perform_initialization=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.dense_h_to_4h(x)
+        x = self.act(x)
+        x, _ = self.dense_4h_to_h(x)
+        return x
+
+
+class BloomBlock(nn.Module):
+
+    def __init__(self, config: BloomConfig):
+        super().__init__()
+        hidden_size = config.hidden_size
+
+        self.input_layernorm = nn.LayerNorm(hidden_size,
+                                            eps=config.layer_norm_epsilon)
+        self.self_attention = BloomAttention(config)
+        self.post_attention_layernorm = nn.LayerNorm(
+            hidden_size, eps=config.layer_norm_epsilon)
+        self.mlp = BloomMLP(config)
+        self.apply_residual_connection_post_layernorm = (
+            config.apply_residual_connection_post_layernorm)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        # Layer norm at the beginning of the transformer layer.
+        layernorm_output = self.input_layernorm(hidden_states)
+
+        # Layer norm post the self attention.
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = hidden_states
+
+        # Self attention.
+        attention_output = self.self_attention(
+            position_ids=position_ids,
+            hidden_states=layernorm_output,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+            cache_event=cache_event,
+        )
+        attention_output = attention_output + residual
+        layernorm_output = self.post_attention_layernorm(attention_output)
+
+        # Get residual
+        if self.apply_residual_connection_post_layernorm:
+            residual = layernorm_output
+        else:
+            residual = attention_output
+
+        # MLP.
+        output = self.mlp(layernorm_output) + residual
+        return output
+
+
+class BloomModel(nn.Module):
+
+    def __init__(self, config: BloomConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+
+        # Embedding + LN Embedding
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, self.embed_dim, perform_initialization=False)
+        self.word_embeddings_layernorm = nn.LayerNorm(
+            self.embed_dim, eps=config.layer_norm_epsilon)
+
+        # Transformer blocks
+        self.h = nn.ModuleList(
+            [BloomBlock(config) for _ in range(config.num_hidden_layers)])
+
+        # Final Layer Norm
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> torch.Tensor:
+        hidden_states = self.word_embeddings(input_ids)
+        hidden_states = self.word_embeddings_layernorm(hidden_states)
+        for i in range(len(self.h)):
+            if cache_events is None:
+                cache_event = None
+            else:
+                cache_event = cache_events[i]
+            layer = self.h[i]
+            hidden_states = layer(
+                position_ids,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+                cache_event,
+            )
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class BloomForCausalLM(nn.Module):
+
+    def __init__(self, config: BloomConfig):
+        super().__init__()
+        self.config = config
+        self.transformer = BloomModel(config)
+        # TODO(zhuohan): create a new weight after implementing pipeline
+        #                parallelism
+        self.lm_head_weight = self.transformer.word_embeddings.weight
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> Dict[int, SequenceOutputs]:
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         input_metadata, cache_events)
+        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
+                                   input_metadata)
+        return next_tokens
+
+    _column_parallel_weights = [
+        "word_embeddings.weight", "dense_h_to_4h.weight", "dense_h_to_4h.bias"
+    ]
+    _row_parallel_weights = ["dense.weight", "dense_4h_to_h.weight"]
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     use_np_cache: bool = False):
+        tp_rank = get_tensor_model_parallel_rank()
+        state_dict = self.state_dict()
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, use_np_cache):
+            if name == "lm_head.weight":
+                # Since hidden_states are parallelized, we need to
+                # load lm_head.weight in parallel.
+                self._column_parallel_weights.append(name)
+                # If lm_head is provided, use it instead.
+                param = self.lm_head_weight
+            else:
+                if not name.startswith("transformer."):
+                    name = "transformer." + name
+                param = state_dict[name]
+
+            if "query_key_value" in name:
+                # NOTE(woosuk): BLOOM's fused QKV has the shape of
+                # [num_heads * 3 * head_size, hidden_size], while the
+                # required shape is [3 * num_heads * head_size, hidden_size].
+                # Thus, we need weight conversion.
+                shard_size = param.shape[0]
+                start = shard_size * tp_rank
+                end = shard_size * (tp_rank + 1)
+                loaded_weight = loaded_weight[start:end]
+
+                num_heads = self.config.num_attention_heads
+                hidden_size = self.config.hidden_size
+                head_size = hidden_size // num_heads
+                if "query_key_value.weight" in name:
+                    loaded_weight = loaded_weight.view(-1, 3, head_size,
+                                                       hidden_size)
+                    loaded_weight = loaded_weight.transpose(0, 1)
+                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
+                elif "query_key_value.bias" in name:
+                    loaded_weight = loaded_weight.view(-1, 3, head_size)
+                    loaded_weight = loaded_weight.transpose(0, 1)
+                    loaded_weight = loaded_weight.reshape(-1)
+                else:
+                    raise ValueError(f"Unexpected weight name: {name}")
+            load_tensor_parallel_weights(param, loaded_weight, name,
+                                         self._column_parallel_weights,
+                                         self._row_parallel_weights, tp_rank)
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@ -0,0 +1,496 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/a5cc30d72ae2dc19af534e4b35c986cc28db1275/src/transformers/models/falcon/modeling_falcon.py
+# Copyright 2023 The vLLM team.
+# Copyright 2023 the Falcon authors and HuggingFace Inc. team.  All rights
+# reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Falcon model."""
+
+import math
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+from torch import nn
+from torch.nn import LayerNorm
+from transformers import FalconConfig as HF_FalconConfig
+
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.attention import (PagedAttention,
+                                                  PagedAttentionWithALiBi,
+                                                  PagedAttentionWithRoPE)
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
+                                              load_tensor_parallel_weights)
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear,
+    reduce_from_tensor_model_parallel_region)
+from vllm.sequence import SequenceOutputs
+from vllm.transformers_utils.configs import RWConfig
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+FalconConfig = Union[HF_FalconConfig, RWConfig]
+
+
+# NOTE(Hesslow): Unfortunately we did not fuse matmul and bias during
+# training, this means that there's one additional quantization to bfloat16
+# between the operations. In order not to degrade the quality of our HF-port,
+# we keep these characteristics in the final model.
+class FalconLinear(nn.Linear):
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        hidden_states = x @ self.weight.T
+        if self.bias is None:
+            return hidden_states
+        return hidden_states + self.bias
+
+
+def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
+    closest_power_of_2 = 2**math.floor(math.log2(total_num_heads))
+    base = torch.tensor(2**(-(2**-(math.log2(closest_power_of_2) - 3))),
+                        dtype=torch.float32)
+    powers = torch.arange(1, 1 + closest_power_of_2, dtype=torch.int32)
+    slopes = torch.pow(base, powers)
+
+    if closest_power_of_2 != total_num_heads:
+        extra_base = torch.tensor(
+            2**(-(2**-(math.log2(2 * closest_power_of_2) - 3))),
+            dtype=torch.float32)
+        num_remaining_heads = min(closest_power_of_2,
+                                  total_num_heads - closest_power_of_2)
+        extra_powers = torch.arange(1,
+                                    1 + 2 * num_remaining_heads,
+                                    2,
+                                    dtype=torch.int32)
+        slopes = torch.cat(
+            [slopes, torch.pow(extra_base, extra_powers)], dim=0)
+
+    return slopes
+
+
+class FalconAttention(nn.Module):
+
+    def __init__(self, config: FalconConfig):
+        super().__init__()
+
+        self.hidden_size = config.hidden_size
+        tp_size = get_tensor_model_parallel_world_size()
+
+        self.total_num_heads = config.num_attention_heads
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.head_dim = self.hidden_size // self.total_num_heads
+        assert self.head_dim * self.total_num_heads == self.hidden_size
+
+        self.new_decoder_architecture = config.new_decoder_architecture
+        self.multi_query = config.multi_query
+
+        if self.new_decoder_architecture:
+            self.total_num_kv_heads = config.num_kv_heads
+            assert self.total_num_heads % tp_size == 0
+            self.num_kv_heads = self.total_num_kv_heads // tp_size
+            self.query_key_value = ColumnParallelLinear(
+                self.hidden_size,
+                (self.total_num_heads + 2 * self.total_num_kv_heads) *
+                self.head_dim,
+                bias=config.bias,
+                gather_output=False,
+                perform_initialization=False,
+                skip_bias_add=True,
+            )
+        elif self.multi_query:
+            self.total_num_kv_heads = 1
+            self.num_kv_heads = 1
+            self.query = ColumnParallelLinear(
+                self.hidden_size,
+                self.total_num_heads * self.head_dim,
+                bias=config.bias,
+                gather_output=False,
+                perform_initialization=False,
+                skip_bias_add=True,
+            )
+            self.key_value = FalconLinear(self.hidden_size,
+                                          2 * self.head_dim,
+                                          bias=config.bias)
+        else:
+            self.total_num_kv_heads = self.total_num_heads
+            self.num_kv_heads = self.num_heads
+            self.query_key_value = ColumnParallelLinear(
+                self.hidden_size,
+                (self.total_num_heads + 2 * self.total_num_kv_heads) *
+                self.head_dim,
+                bias=config.bias,
+                gather_output=False,
+                perform_initialization=False,
+                skip_bias_add=True,
+            )
+
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+
+        # Layer-wise attention scaling
+        self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim)
+        self.reduce_row_parallel_results = not (config.new_decoder_architecture
+                                                or config.parallel_attn)
+        self.dense = RowParallelLinear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=config.bias,
+            input_is_parallel=True,
+            perform_initialization=False,
+            skip_bias_add=True,
+            reduce_results=self.reduce_row_parallel_results)
+
+        self.use_rotary = config.rotary
+        self.use_alibi = config.alibi
+        assert not (self.use_rotary and self.use_alibi), (
+            "Rotary and alibi are mutually exclusive.")
+
+        if self.use_rotary:
+            # TODO(zhuohan): Pass in correct `max_position``
+            self.attn = PagedAttentionWithRoPE(self.num_heads,
+                                               self.head_dim,
+                                               self.inv_norm_factor,
+                                               rotary_dim=self.head_dim,
+                                               num_kv_heads=self.num_kv_heads)
+        elif self.use_alibi:
+            tp_rank = get_tensor_model_parallel_rank()
+            head_start = tp_rank * self.num_heads
+            head_end = (tp_rank + 1) * self.num_heads
+            alibi_slopes = (_get_alibi_slopes(self.total_num_heads) *
+                            self.inv_norm_factor)
+            alibi_slopes = alibi_slopes[head_start:head_end].tolist()
+            self.attn = PagedAttentionWithALiBi(self.num_heads,
+                                                self.head_dim,
+                                                self.inv_norm_factor,
+                                                alibi_slopes,
+                                                num_kv_heads=self.num_kv_heads)
+        else:
+            self.attn = PagedAttention(self.num_heads,
+                                       self.head_dim,
+                                       scale=self.inv_norm_factor,
+                                       num_kv_heads=self.num_kv_heads)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        if not self.new_decoder_architecture and self.multi_query:
+            q, bias = self.query(hidden_states)
+            if bias is not None:
+                q += bias
+            kv = self.key_value(hidden_states)
+            k, v = kv.split([self.kv_size, self.kv_size], dim=-1)
+        else:
+            qkv, bias = self.query_key_value(hidden_states)
+            if bias is not None:
+                qkv += bias
+            q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size],
+                                dim=-1)
+        k_cache, v_cache = kv_cache
+        if self.use_rotary:
+            attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
+                                    input_metadata, cache_event)
+        else:
+            attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                    cache_event)
+        attn_output, bias = self.dense(attn_output)
+        return attn_output, bias
+
+
+class FalconMLP(nn.Module):
+
+    def __init__(self, config: FalconConfig):
+        super().__init__()
+        hidden_size = config.hidden_size
+
+        self.dense_h_to_4h = ColumnParallelLinear(hidden_size,
+                                                  4 * hidden_size,
+                                                  bias=config.bias,
+                                                  gather_output=False,
+                                                  perform_initialization=False,
+                                                  skip_bias_add=True)
+        self.act = nn.GELU()
+        self.reduce_row_parallel_results = not (config.new_decoder_architecture
+                                                or config.parallel_attn)
+        self.dense_4h_to_h = RowParallelLinear(
+            4 * hidden_size,
+            hidden_size,
+            bias=config.bias,
+            input_is_parallel=True,
+            perform_initialization=False,
+            skip_bias_add=True,
+            reduce_results=self.reduce_row_parallel_results)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # NOTE(zhuohan): Following huggingface, we do not fuse bias add here.
+        x, bias = self.dense_h_to_4h(x)
+        if bias is not None:
+            x += bias
+        x = self.act(x)
+        x, bias = self.dense_4h_to_h(x)
+        return x, bias
+
+
+class FalconDecoderLayer(nn.Module):
+
+    def __init__(self, config: FalconConfig):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.self_attention = FalconAttention(config)
+        self.mlp = FalconMLP(config)
+        self.config = config
+
+        if config.new_decoder_architecture:
+            # The layer norm before self-attention
+            self.ln_attn = LayerNorm(hidden_size,
+                                     eps=config.layer_norm_epsilon)
+            # The layer norm before the MLP
+            self.ln_mlp = LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        else:
+            self.input_layernorm = LayerNorm(hidden_size,
+                                             eps=config.layer_norm_epsilon)
+            if not config.parallel_attn:
+                self.post_attention_layernorm = LayerNorm(
+                    hidden_size, eps=config.layer_norm_epsilon)
+
+        self.reduce_row_parallel_results = not (config.new_decoder_architecture
+                                                or config.parallel_attn)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ):
+        residual = hidden_states
+
+        if self.config.new_decoder_architecture:
+            attention_layernorm_out = self.ln_attn(hidden_states)
+            mlp_layernorm_out = self.ln_mlp(hidden_states)
+        else:
+            attention_layernorm_out = self.input_layernorm(hidden_states)
+
+        # Self attention.
+        attention_output, attention_bias = self.self_attention(
+            positions=positions,
+            hidden_states=attention_layernorm_out,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+            cache_event=cache_event,
+        )
+        if self.reduce_row_parallel_results and attention_bias is not None:
+            attention_output += attention_bias
+
+        if not self.config.new_decoder_architecture:
+            if self.config.parallel_attn:
+                mlp_layernorm_out = attention_layernorm_out
+            else:
+                residual += attention_output
+                mlp_layernorm_out = self.post_attention_layernorm(residual)
+
+        # MLP.
+        mlp_output, mlp_bias = self.mlp(mlp_layernorm_out)
+        if self.reduce_row_parallel_results and mlp_bias is not None:
+            mlp_output += mlp_bias
+
+        if not self.reduce_row_parallel_results:
+            # When MLP and Attention layers are parallel, we can use
+            # only one all-reduce operator to reduce the results from
+            # both MLP and Attention layers.
+            mlp_output += attention_output
+            mlp_output = reduce_from_tensor_model_parallel_region(mlp_output)
+            if attention_bias is not None:
+                mlp_output += attention_bias
+            if mlp_bias is not None:
+                mlp_output += mlp_bias
+
+        output = mlp_output + residual
+
+        return output
+
+
+class FalconModel(nn.Module):
+
+    def __init__(self, config: FalconConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.use_alibi = config.alibi
+
+        # Embedding + LN Embedding
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, self.embed_dim, perform_initialization=False)
+
+        # Transformer blocks
+        self.h = nn.ModuleList([
+            FalconDecoderLayer(config) for _ in range(config.num_hidden_layers)
+        ])
+
+        # Final Layer Norm
+        self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> torch.Tensor:
+        hidden_states = self.word_embeddings(input_ids)
+        for i in range(len(self.h)):
+            if cache_events is None:
+                cache_event = None
+            else:
+                cache_event = cache_events[i]
+            layer = self.h[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+                cache_event,
+            )
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class FalconForCausalLM(nn.Module):
+
+    def __init__(self, config: FalconConfig):
+        super().__init__()
+        self.config = config
+        self.transformer = FalconModel(config)
+        self.lm_head = ColumnParallelLinear(config.hidden_size,
+                                            config.vocab_size,
+                                            bias=False,
+                                            gather_output=False,
+                                            perform_initialization=False)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> Dict[int, SequenceOutputs]:
+        hidden_states = self.transformer(
+            input_ids,
+            positions,
+            kv_caches,
+            input_metadata,
+            cache_events,
+        )
+        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
+                                   input_metadata)
+
+        return next_tokens
+
+    _column_parallel_weights = [
+        "word_embeddings.weight", "lm_head.weight", "dense_h_to_4h.weight",
+        "dense_h_to_4h.bias"
+    ]
+    _row_parallel_weights = ["dense.weight", "dense_4h_to_h.weight"]
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     use_np_cache: bool = False):
+        tp_size = (get_tensor_model_parallel_world_size())
+        tp_rank = get_tensor_model_parallel_rank()
+
+        hidden_size = self.config.hidden_size
+        total_num_heads = self.config.num_attention_heads
+        num_heads = total_num_heads // tp_size
+        head_size = hidden_size // total_num_heads
+        head_start = tp_rank * num_heads
+        head_end = (tp_rank + 1) * num_heads
+        if self.config.new_decoder_architecture:
+            total_num_kv_heads = self.config.num_kv_heads
+            num_kv_heads = total_num_kv_heads // tp_size
+            separated_q_kv = False
+            kv_head_start = tp_rank * num_kv_heads
+            kv_head_end = (tp_rank + 1) * num_kv_heads
+        elif self.config.multi_query:
+            total_num_kv_heads = 1
+            num_kv_heads = 1
+            separated_q_kv = True
+            kv_head_start = 0
+            kv_head_end = 1
+        else:
+            total_num_kv_heads = total_num_heads
+            num_kv_heads = total_num_kv_heads // tp_size
+            separated_q_kv = False
+            kv_head_start = tp_rank * num_kv_heads
+            kv_head_end = (tp_rank + 1) * num_kv_heads
+        num_query_heads_per_kv_head = total_num_heads // total_num_kv_heads
+        state_dict = self.state_dict()
+
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, use_np_cache):
+            if "query_key_value" in name:
+                loaded_weight_size = loaded_weight.size()
+                loaded_weight = loaded_weight.view(
+                    total_num_kv_heads, num_query_heads_per_kv_head + 2,
+                    head_size, *loaded_weight_size[1:])
+
+                wq = loaded_weight[:, :-2].reshape(-1, *loaded_weight_size[1:])
+                wk = loaded_weight[:, [-2]].reshape(-1,
+                                                    *loaded_weight_size[1:])
+                wv = loaded_weight[:, [-1]].reshape(-1,
+                                                    *loaded_weight_size[1:])
+
+                wq = wq[head_size * head_start:head_size * head_end]
+                wk = wk[head_size * kv_head_start:head_size * kv_head_end]
+                wv = wv[head_size * kv_head_start:head_size * kv_head_end]
+
+                if separated_q_kv:
+                    loaded_weight_q = wq
+                    loaded_weight_kv = torch.cat([wk, wv], dim=0)
+                    q_weight_name = name.replace("query_key_value", "query")
+                    kv_weight_name = name.replace("query_key_value",
+                                                  "key_value")
+                    load_tensor_parallel_weights(state_dict[q_weight_name],
+                                                 loaded_weight_q,
+                                                 q_weight_name,
+                                                 self._column_parallel_weights,
+                                                 self._row_parallel_weights,
+                                                 tp_rank)
+                    load_tensor_parallel_weights(state_dict[kv_weight_name],
+                                                 loaded_weight_kv,
+                                                 kv_weight_name,
+                                                 self._column_parallel_weights,
+                                                 self._row_parallel_weights,
+                                                 tp_rank)
+                    continue
+                else:
+                    loaded_weight = torch.cat([wq, wk, wv], dim=0)
+
+            param = state_dict[name]
+            load_tensor_parallel_weights(param, loaded_weight, name,
+                                         self._column_parallel_weights,
+                                         self._row_parallel_weights, tp_rank)
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@ -1,5 +1,6 @@
 # coding=utf-8
-# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
 # Copyright 2023 The vLLM team.
 # Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
@ -47,19 +48,25 @@ class GPT2Attention(nn.Module):
        super().__init__()
        self.hidden_size = config.hidden_size
        total_num_heads = config.num_attention_heads
-        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        tensor_model_parallel_world_size = (
+            get_tensor_model_parallel_world_size())
        assert total_num_heads % tensor_model_parallel_world_size == 0
        self.num_heads = total_num_heads // tensor_model_parallel_world_size
        self.head_dim = self.hidden_size // total_num_heads
-        self.scale = self.head_dim ** -0.5
+        self.scale = self.head_dim**-0.5

-        self.c_attn = ColumnParallelLinear(self.hidden_size, 3 * self.hidden_size,
-                                           bias=True, gather_output=False,
+        self.c_attn = ColumnParallelLinear(self.hidden_size,
+                                           3 * self.hidden_size,
+                                           bias=True,
+                                           gather_output=False,
                                           perform_initialization=False)
-        self.c_proj = RowParallelLinear(self.hidden_size, self.hidden_size,
-                                        bias=True, input_is_parallel=True,
+        self.c_proj = RowParallelLinear(self.hidden_size,
+                                        self.hidden_size,
+                                        bias=True,
+                                        input_is_parallel=True,
                                        perform_initialization=False)
-        self.attn = PagedAttention(self.num_heads, self.head_dim,
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
                                   scale=self.scale)

    def forward(
@ -72,8 +79,8 @@ class GPT2Attention(nn.Module):
        qkv, _ = self.c_attn(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        key_cache, value_cache = kv_cache
-        attn_output = self.attn(
-            q, k, v, key_cache, value_cache, input_metadata, cache_event)
+        attn_output = self.attn(q, k, v, key_cache, value_cache,
+                                input_metadata, cache_event)
        attn_output, _ = self.c_proj(attn_output)
        return attn_output

@ -87,11 +94,15 @@ class GPT2MLP(nn.Module):
    ):
        super().__init__()
        hidden_size = config.hidden_size
-        self.c_fc = ColumnParallelLinear(hidden_size, intermediate_size,
-                                         bias=True, gather_output=False,
+        self.c_fc = ColumnParallelLinear(hidden_size,
+                                         intermediate_size,
+                                         bias=True,
+                                         gather_output=False,
                                         perform_initialization=False)
-        self.c_proj = RowParallelLinear(intermediate_size, hidden_size,
-                                        bias=True, input_is_parallel=True,
+        self.c_proj = RowParallelLinear(intermediate_size,
+                                        hidden_size,
+                                        bias=True,
+                                        input_is_parallel=True,
                                        perform_initialization=False)
        self.act = get_act_fn(config.activation_function)

@ -107,7 +118,8 @@ class GPT2Block(nn.Module):
    def __init__(self, config: GPT2Config):
        super().__init__()
        hidden_size = config.hidden_size
-        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
+        inner_dim = (config.n_inner if config.n_inner is not None else 4 *
+                     hidden_size)

        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
        self.attn = GPT2Attention(config)
@ -145,9 +157,9 @@ class GPT2Model(nn.Module):
    def __init__(self, config: GPT2Config):
        super().__init__()
        self.config = config
-        assert config.add_cross_attention == False
-        assert config.scale_attn_by_inverse_layer_idx == False
-        assert config.reorder_and_upcast_attn == False
+        assert not config.add_cross_attention
+        assert not config.scale_attn_by_inverse_layer_idx
+        assert not config.reorder_and_upcast_attn
        self.embed_dim = config.hidden_size

        # Optimization: While the vocab size of GPT-2 is 50257, we extend it
@ -180,8 +192,8 @@ class GPT2Model(nn.Module):
            else:
                cache_event = cache_events[i]
            layer = self.h[i]
-            hidden_states = layer(
-                hidden_states, kv_caches[i], input_metadata, cache_event)
+            hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
+                                  cache_event)

        hidden_states = self.ln_f(hidden_states)
        return hidden_states
@ -206,33 +218,37 @@ class GPT2LMHeadModel(nn.Module):
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> Dict[int, SequenceOutputs]:
-        hidden_states = self.transformer(
-            input_ids, positions, kv_caches, input_metadata, cache_events)
-        next_tokens = self.sampler(
-            self.lm_head_weight, hidden_states, input_metadata)
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         input_metadata, cache_events)
+        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
+                                   input_metadata)
        return next_tokens

    _column_parallel_weights = ["wte.weight", "c_fc.weight", "c_fc.bias"]
    _row_parallel_weights = ["c_proj.weight"]

-    def load_weights(self, model_name_or_path: str,
+    def load_weights(self,
+                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     use_np_cache: bool = False):
-        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        tensor_model_parallel_world_size = (
+            get_tensor_model_parallel_world_size())
        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
        state_dict = self.state_dict()

        for name, loaded_weight in hf_model_weights_iterator(
-            model_name_or_path, cache_dir, use_np_cache):
+                model_name_or_path, cache_dir, use_np_cache):
            if "lm_head.weight" in name:
                # GPT-2 ties the weights of the embedding layer and the final
                # linear layer.
                continue
-            if ".attn.bias" in name:
+            if ".attn.bias" in name or ".attn.masked_bias" in name:
                # Skip attention mask.
                # NOTE: "c_attn.bias" should not be skipped.
                continue
-            name = "transformer." + name
+
+            if not name.startswith("transformer."):
+                name = "transformer." + name

            # The HF's GPT-2 implementation uses Conv1D instead of Linear.
            # Because of this, we need to transpose the weights.
@ -246,16 +262,20 @@ class GPT2LMHeadModel(nn.Module):

            if name == "transformer.wte.weight":
                # Consider padding in the vocab size.
-                padded_vocab_size = param.shape[0] * tensor_model_parallel_world_size
+                padded_vocab_size = (param.shape[0] *
+                                     tensor_model_parallel_world_size)
                num_extra_rows = padded_vocab_size - self.config.vocab_size
-                extra_rows = torch.empty(num_extra_rows, loaded_weight.shape[1])
+                extra_rows = torch.empty(num_extra_rows,
+                                         loaded_weight.shape[1])
                extra_rows = extra_rows.to(loaded_weight)
                loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)

            # For the fused QKV linear layer, manually shard the weights.
            if "c_attn" in name:
-                # GPT-2's fused QKV has the shape of [3 * num_heads * head_size, hidden_size].
-                # When tensor parallelism is used, we shard the weights along the head dimension.
+                # GPT-2's fused QKV has the shape of
+                # [3 * num_heads * head_size, hidden_size].
+                # When tensor parallelism is used, we shard the weights along
+                # the head dimension.
                total_num_heads = self.config.num_attention_heads
                hidden_size = self.config.hidden_size
                head_size = hidden_size // total_num_heads
@ -264,11 +284,13 @@ class GPT2LMHeadModel(nn.Module):
                head_end = (tensor_model_parallel_rank + 1) * num_heads

                if name.endswith(".weight"):
-                    loaded_weight = loaded_weight.view(3, total_num_heads, head_size, hidden_size)
+                    loaded_weight = loaded_weight.view(3, total_num_heads,
+                                                       head_size, hidden_size)
                    loaded_weight = loaded_weight[:, head_start:head_end, :, :]
                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
                elif name.endswith(".bias"):
-                    loaded_weight = loaded_weight.view(3, total_num_heads, head_size)
+                    loaded_weight = loaded_weight.view(3, total_num_heads,
+                                                       head_size)
                    loaded_weight = loaded_weight[:, head_start:head_end, :]
                    loaded_weight = loaded_weight.reshape(-1)
                else:
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@ -0,0 +1,343 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
+# Copyright 2023 The vLLM team.
+# Copyright 2023 CTranslate2, and Michael Feil
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only GPTBigCode model compatible with HuggingFace weights.
+
+The input of the model is flattened to a 1D tensor of tokens. The model uses
+InputMetadata to extract the original 2D shape of the input.
+"""
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import GPTBigCodeConfig
+
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import PagedAttention
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
+                                              load_tensor_parallel_weights)
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
+from vllm.sequence import SequenceOutputs
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class GPTBigCodeAttention(nn.Module):
+
+    def __init__(self, config: GPTBigCodeConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        total_num_heads = config.num_attention_heads
+        self.tensor_model_parallel_world_size = (
+            get_tensor_model_parallel_world_size())
+        assert total_num_heads % self.tensor_model_parallel_world_size == 0
+        self.num_heads = (total_num_heads //
+                          self.tensor_model_parallel_world_size)
+        self.head_dim = self.hidden_size // total_num_heads
+        self.scale = self.head_dim**-0.5
+
+        self.multi_query = config.multi_query
+        if self.multi_query:
+            self.num_kv_heads = 1
+            self.kv_dim = self.head_dim
+            self.c_attn_q = ColumnParallelLinear(self.hidden_size,
+                                                 self.hidden_size,
+                                                 bias=True,
+                                                 gather_output=False,
+                                                 perform_initialization=False)
+            self.c_attn_kv = nn.Linear(self.hidden_size,
+                                       2 * self.kv_dim,
+                                       bias=True)
+        else:
+            self.num_kv_heads = self.num_heads
+            self.kv_dim = self.num_kv_heads * self.head_dim
+            self.c_attn = ColumnParallelLinear(self.hidden_size,
+                                               self.hidden_size +
+                                               2 * self.kv_dim,
+                                               bias=True,
+                                               gather_output=False,
+                                               perform_initialization=False)
+
+        self.c_proj = RowParallelLinear(self.hidden_size,
+                                        self.hidden_size,
+                                        bias=True,
+                                        input_is_parallel=True,
+                                        perform_initialization=False)
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
+                                   scale=self.scale,
+                                   num_kv_heads=self.num_kv_heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        if self.multi_query:
+            q, _ = self.c_attn_q(hidden_states)
+            kv = self.c_attn_kv(hidden_states)
+            k, v = kv.split([self.kv_dim, self.kv_dim], dim=-1)
+        else:
+            qkv, _ = self.c_attn(hidden_states)
+            q, k, v = qkv.split([
+                self.hidden_size // self.tensor_model_parallel_world_size,
+                self.kv_dim, self.kv_dim
+            ],
+                                dim=-1)
+        key_cache, value_cache = kv_cache
+        attn_output = self.attn(q, k, v, key_cache, value_cache,
+                                input_metadata, cache_event)
+        attn_output, _ = self.c_proj(attn_output)
+        return attn_output
+
+
+class GPTBigMLP(nn.Module):
+
+    def __init__(
+        self,
+        intermediate_size: int,
+        config: GPTBigCodeConfig,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+        self.c_fc = ColumnParallelLinear(hidden_size,
+                                         intermediate_size,
+                                         bias=True,
+                                         gather_output=False,
+                                         perform_initialization=False)
+        self.c_proj = RowParallelLinear(intermediate_size,
+                                        hidden_size,
+                                        bias=True,
+                                        input_is_parallel=True,
+                                        perform_initialization=False)
+        self.act = get_act_fn(config.activation_function)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.c_proj(hidden_states)
+        return hidden_states
+
+
+class GPTBigCodeBlock(nn.Module):
+
+    def __init__(self, config: GPTBigCodeConfig):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = (config.n_inner if config.n_inner is not None else 4 *
+                     hidden_size)
+
+        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = GPTBigCodeAttention(config)
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.mlp = GPTBigMLP(inner_dim, config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_output = self.attn(
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+            cache_event=cache_event,
+        )
+        # residual connection
+        hidden_states = attn_output + residual
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+        return hidden_states
+
+
+class GPTBigCodeModel(nn.Module):
+
+    def __init__(self, config: GPTBigCodeConfig):
+        super().__init__()
+        self.config = config
+        assert not config.add_cross_attention
+
+        self.embed_dim = config.hidden_size
+
+        # Optimization: While the vocab size of GPT-2 is 50257, we extend it
+        # to 50304 in order to make it divisible by 64.
+        # This improves performance since GPUs are faster if the dimension
+        # is divisible by 64. In addition, it allows us to shard the embedding
+        # layer across 2, 4, 8, or more GPUs.
+        vocab_size = ((config.vocab_size + 63) // 64) * 64
+        self.wte = VocabParallelEmbedding(vocab_size, self.embed_dim)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+        self.h = nn.ModuleList(
+            [GPTBigCodeBlock(config) for _ in range(config.num_hidden_layers)])
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> torch.Tensor:
+        inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds
+
+        for i in range(len(self.h)):
+            if cache_events is None:
+                cache_event = None
+            else:
+                cache_event = cache_events[i]
+            layer = self.h[i]
+            hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
+                                  cache_event)
+
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class GPTBigCodeForCausalLM(nn.Module):
+
+    def __init__(self, config: GPTBigCodeConfig):
+        super().__init__()
+        self.config = config
+        self.transformer = GPTBigCodeModel(config)
+        # TODO(zhuohan): create a new weight after implementing pipeline
+        #                parallelism
+        self.lm_head_weight = self.transformer.wte.weight
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> Dict[int, SequenceOutputs]:
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         input_metadata, cache_events)
+        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
+                                   input_metadata)
+        return next_tokens
+
+    _column_parallel_weights = ["wte.weight", "c_fc.weight", "c_fc.bias"]
+    _row_parallel_weights = ["c_proj.weight"]
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     use_np_cache: bool = False):
+        tensor_model_parallel_world_size = (
+            get_tensor_model_parallel_world_size())
+        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
+        state_dict = self.state_dict()
+
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, use_np_cache):
+            if "lm_head.weight" in name:
+                # GPT-2 ties the weights of the embedding layer and the final
+                # linear layer.
+                continue
+            if ".attn.bias" in name:
+                # Skip attention mask.
+                # NOTE: "c_attn.bias" should not be skipped.
+                continue
+
+            if not name.startswith("transformer."):
+                name = "transformer." + name
+
+            # For the fused QKV linear layer, manually shard the weights.
+            if "c_attn" in name:
+                # GPT-2's fused QKV has the shape of
+                # [3 * num_heads * head_size, hidden_size].
+                # When tensor parallelism is used, we shard the weights along
+                # the head dimension.
+                total_num_heads = self.config.num_attention_heads
+                total_num_kv_heads = (1 if self.config.multi_query else
+                                      total_num_heads)
+                hidden_size = self.config.hidden_size
+                head_size = hidden_size // total_num_heads
+                total_kv_size = head_size * total_num_kv_heads
+                num_heads = total_num_heads // tensor_model_parallel_world_size
+                head_start = tensor_model_parallel_rank * num_heads
+                head_end = (tensor_model_parallel_rank + 1) * num_heads
+
+                wq, wk, wv = torch.split(
+                    loaded_weight, [hidden_size, total_kv_size, total_kv_size],
+                    dim=0)
+
+                wq = wq[head_size * head_start:head_size * head_end]
+                if not self.config.multi_query:
+                    # Split the heads when using normal multi-head attention
+                    wk = wk[head_size * head_start:head_size * head_end]
+                    wv = wv[head_size * head_start:head_size * head_end]
+                    loaded_weight = torch.cat([wq, wk, wv], dim=0)
+                else:
+                    # For multi-query attention, we split the query
+                    # but replicate the key and value.
+                    loaded_weight_q = wq
+                    loaded_weight_kv = torch.cat([wk, wv], dim=0)
+                    q_weight_name = name.replace("c_attn", "c_attn_q")
+                    kv_weight_name = name.replace("c_attn", "c_attn_kv")
+                    load_tensor_parallel_weights(state_dict[q_weight_name],
+                                                 loaded_weight_q,
+                                                 q_weight_name,
+                                                 self._column_parallel_weights,
+                                                 self._row_parallel_weights,
+                                                 tensor_model_parallel_rank)
+                    load_tensor_parallel_weights(state_dict[kv_weight_name],
+                                                 loaded_weight_kv,
+                                                 kv_weight_name,
+                                                 self._column_parallel_weights,
+                                                 self._row_parallel_weights,
+                                                 tensor_model_parallel_rank)
+                    continue
+
+            param = state_dict[name]
+
+            if name == "transformer.wte.weight":
+                # Consider padding in the vocab size.
+                padded_vocab_size = param.shape[
+                    0] * tensor_model_parallel_world_size
+                num_extra_rows = padded_vocab_size - self.config.vocab_size
+                extra_rows = torch.empty(num_extra_rows,
+                                         loaded_weight.shape[1])
+                extra_rows = extra_rows.to(loaded_weight)
+                loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)
+
+            load_tensor_parallel_weights(param, loaded_weight, name,
+                                         self._column_parallel_weights,
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@ -0,0 +1,251 @@
+# coding=utf-8
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gptj/modeling_gptj.py
+# Copyright 2023 The vLLM team.
+# Copyright 2021 The EleutherAI and HuggingFace Teams. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Inference-only GPT-J model compatible with HuggingFace weights.
+
+The input of the model is flattened to a 1D tensor of tokens. The model uses
+InputMetadata to extract the original 2D shape of the input.
+"""
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import GPTJConfig
+
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
+                                              load_tensor_parallel_weights)
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
+from vllm.sequence import SequenceOutputs
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class GPTJAttention(nn.Module):
+
+    def __init__(self, config: GPTJConfig):
+        super().__init__()
+        self.total_num_heads = config.num_attention_heads
+        self.hidden_size = config.hidden_size
+        self.head_size = self.hidden_size // self.total_num_heads
+
+        self.qkv_proj = ColumnParallelLinear(config.hidden_size,
+                                             3 * config.hidden_size,
+                                             bias=False,
+                                             gather_output=False,
+                                             perform_initialization=False)
+        self.out_proj = RowParallelLinear(config.hidden_size,
+                                          config.hidden_size,
+                                          bias=False,
+                                          input_is_parallel=True,
+                                          perform_initialization=False)
+
+        tp_world_size = get_tensor_model_parallel_world_size()
+        assert self.total_num_heads % tp_world_size == 0
+        self.num_heads = self.total_num_heads // tp_world_size
+
+        scaling = self.head_size**-0.5
+        assert getattr(config, "rotary", True)
+        assert config.rotary_dim % 2 == 0
+        self.attn = PagedAttentionWithRoPE(self.num_heads, self.head_size,
+                                           scaling, config.rotary_dim)
+        self.warmup = False
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(position_ids, q, k, v, k_cache, v_cache,
+                                input_metadata, cache_event)
+        attn_output, _ = self.out_proj(attn_output)
+        return attn_output
+
+
+class GPTJMLP(nn.Module):
+
+    def __init__(self, intermediate_size: int, config: GPTJConfig):
+        super().__init__()
+        hidden_size = config.n_embd
+        self.fc_in = ColumnParallelLinear(hidden_size,
+                                          intermediate_size,
+                                          gather_output=False,
+                                          perform_initialization=False)
+        self.fc_out = RowParallelLinear(intermediate_size,
+                                        hidden_size,
+                                        input_is_parallel=True,
+                                        perform_initialization=False)
+        self.act = get_act_fn(config.activation_function)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states, _ = self.fc_in(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states, _ = self.fc_out(hidden_states)
+        return hidden_states
+
+
+class GPTJBlock(nn.Module):
+
+    def __init__(self, config: GPTJConfig):
+        super().__init__()
+        if config.n_inner is None:
+            inner_dim = 4 * config.n_embd
+        else:
+            inner_dim = config.n_inner
+        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        self.attn = GPTJAttention(config)
+        self.mlp = GPTJMLP(inner_dim, config)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_output = self.attn(
+            position_ids=position_ids,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+            cache_event=cache_event,
+        )
+        mlp_output = self.mlp(hidden_states)
+        hidden_states = attn_output + mlp_output + residual
+        return hidden_states
+
+
+class GPTJModel(nn.Module):
+
+    def __init__(self, config: GPTJConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.n_embd
+        self.wte = VocabParallelEmbedding(config.vocab_size,
+                                          self.embed_dim,
+                                          perform_initialization=False)
+        self.h = nn.ModuleList(
+            [GPTJBlock(config) for _ in range(config.n_layer)])
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> torch.Tensor:
+        hidden_states = self.wte(input_ids)
+        for i in range(len(self.h)):
+            if cache_events is None:
+                cache_event = None
+            else:
+                cache_event = cache_events[i]
+            layer = self.h[i]
+            hidden_states = layer(
+                position_ids,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+                cache_event,
+            )
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class GPTJForCausalLM(nn.Module):
+
+    def __init__(self, config: GPTJConfig):
+        super().__init__()
+        self.config = config
+        assert not config.tie_word_embeddings
+        self.transformer = GPTJModel(config)
+        self.lm_head = ColumnParallelLinear(config.n_embd,
+                                            config.vocab_size,
+                                            gather_output=False,
+                                            perform_initialization=False)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> Dict[int, SequenceOutputs]:
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         input_metadata, cache_events)
+        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
+                                   input_metadata, self.lm_head.bias)
+        return next_tokens
+
+    _column_parallel_weights = [
+        "wte.weight", "fc_in.weight", "fc_in.bias", "lm_head.weight",
+        "lm_head.bias"
+    ]
+    _row_parallel_weights = ["out_proj.weight", "fc_out.weight"]
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     use_np_cache: bool = False):
+        tp_rank = get_tensor_model_parallel_rank()
+        state_dict = self.state_dict()
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, use_np_cache):
+            if "attn.bias" in name or "attn.masked_bias" in name:
+                continue
+
+            is_attention_weight = False
+            for stride_id, att_weight_name in enumerate(
+                ["q_proj", "k_proj", "v_proj"]):
+                if att_weight_name not in name:
+                    continue
+                param = state_dict[name.replace(att_weight_name, "qkv_proj")]
+                shard_size = param.shape[1]
+                loaded_weight = loaded_weight[shard_size * tp_rank:shard_size *
+                                              (tp_rank + 1)]
+                param_slice = param.data[shard_size * stride_id:shard_size *
+                                         (stride_id + 1)]
+                assert param_slice.shape == loaded_weight.shape
+                param_slice.copy_(loaded_weight)
+                is_attention_weight = True
+                break
+            if is_attention_weight:
+                continue
+
+            param = state_dict[name]
+            load_tensor_parallel_weights(param, loaded_weight, name,
+                                         self._column_parallel_weights,
+                                         self._row_parallel_weights, tp_rank)
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@ -1,5 +1,6 @@
 # coding=utf-8
-# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved.
 #
@ -48,19 +49,23 @@ class GPTNeoXAttention(nn.Module):
        self.hidden_size = config.hidden_size
        self.head_size = self.hidden_size // self.total_num_heads

-        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        tensor_model_parallel_world_size = (
+            get_tensor_model_parallel_world_size())
        assert self.total_num_heads % tensor_model_parallel_world_size == 0
-        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
+        self.num_heads = (self.total_num_heads //
+                          tensor_model_parallel_world_size)

-        self.query_key_value = ColumnParallelLinear(config.hidden_size,
-                                                    3 * config.hidden_size,
-                                                    gather_output=False,
-                                                    perform_initialization=False)
-        self.dense = RowParallelLinear(config.hidden_size, config.hidden_size,
+        self.query_key_value = ColumnParallelLinear(
+            config.hidden_size,
+            3 * config.hidden_size,
+            gather_output=False,
+            perform_initialization=False)
+        self.dense = RowParallelLinear(config.hidden_size,
+                                       config.hidden_size,
                                       input_is_parallel=True,
                                       perform_initialization=False)

-        scaling = self.head_size ** -0.5
+        scaling = self.head_size**-0.5
        rotary_dim = int(self.head_size * config.rotary_pct)
        assert rotary_dim % 2 == 0
        self.attn = PagedAttentionWithRoPE(self.num_heads, self.head_size,
@ -75,11 +80,10 @@ class GPTNeoXAttention(nn.Module):
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        qkv, _ = self.query_key_value(hidden_states)
-
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        k_cache, v_cache = kv_cache
-        attn_output = self.attn(
-            position_ids, q, k, v, k_cache, v_cache, input_metadata, cache_event)
+        attn_output = self.attn(position_ids, q, k, v, k_cache, v_cache,
+                                input_metadata, cache_event)
        output, _ = self.dense(attn_output)
        return output

@ -92,7 +96,8 @@ class GPTNeoXMLP(nn.Module):
                                                  config.intermediate_size,
                                                  gather_output=False,
                                                  perform_initialization=False)
-        self.dense_4h_to_h = RowParallelLinear(config.intermediate_size, config.hidden_size,
+        self.dense_4h_to_h = RowParallelLinear(config.intermediate_size,
+                                               config.hidden_size,
                                               input_is_parallel=True,
                                               perform_initialization=False)
        self.act = get_act_fn(config.hidden_act)
@ -109,8 +114,10 @@ class GPTNeoXLayer(nn.Module):
    def __init__(self, config: GPTNeoXConfig):
        super().__init__()
        self.use_parallel_residual = config.use_parallel_residual
-        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.input_layernorm = nn.LayerNorm(config.hidden_size,
+                                            eps=config.layer_norm_eps)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size,
+                                                     eps=config.layer_norm_eps)
        self.attention = GPTNeoXAttention(config)
        self.mlp = GPTNeoXMLP(config)

@ -154,10 +161,13 @@ class GPTNeoXModel(nn.Module):
        super().__init__()
        self.config = config

-        self.embed_in = VocabParallelEmbedding(config.vocab_size, config.hidden_size,
+        self.embed_in = VocabParallelEmbedding(config.vocab_size,
+                                               config.hidden_size,
                                               perform_initialization=False)
-        self.layers = nn.ModuleList([GPTNeoXLayer(config) for _ in range(config.num_hidden_layers)])
-        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layers = nn.ModuleList(
+            [GPTNeoXLayer(config) for _ in range(config.num_hidden_layers)])
+        self.final_layer_norm = nn.LayerNorm(config.hidden_size,
+                                             eps=config.layer_norm_eps)

    def forward(
        self,
@ -191,8 +201,10 @@ class GPTNeoXForCausalLM(nn.Module):
        super().__init__()
        self.config = config
        self.gpt_neox = GPTNeoXModel(config)
-        self.embed_out = ColumnParallelLinear(config.hidden_size, config.vocab_size,
-                                              bias=False, gather_output=False,
+        self.embed_out = ColumnParallelLinear(config.hidden_size,
+                                              config.vocab_size,
+                                              bias=False,
+                                              gather_output=False,
                                              perform_initialization=False)
        self.sampler = Sampler(config.vocab_size)

@ -204,24 +216,28 @@ class GPTNeoXForCausalLM(nn.Module):
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> Dict[int, SequenceOutputs]:
-        hidden_states = self.gpt_neox(
-            input_ids, positions, kv_caches, input_metadata, cache_events)
-        next_tokens = self.sampler(
-            self.embed_out.weight, hidden_states, input_metadata)
+        hidden_states = self.gpt_neox(input_ids, positions, kv_caches,
+                                      input_metadata, cache_events)
+        next_tokens = self.sampler(self.embed_out.weight, hidden_states,
+                                   input_metadata)
        return next_tokens

-    _column_parallel_weights = ["embed_in.weight", "embed_out.weight", "dense_h_to_4h.weight", "dense_h_to_4h.bias"]
+    _column_parallel_weights = [
+        "embed_in.weight", "embed_out.weight", "dense_h_to_4h.weight",
+        "dense_h_to_4h.bias"
+    ]
    _row_parallel_weights = ["dense.weight", "dense_4h_to_h.weight"]

-    def load_weights(self, model_name_or_path: str,
+    def load_weights(self,
+                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     use_np_cache: bool = False):
        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
        state_dict = self.state_dict()
        for name, loaded_weight in hf_model_weights_iterator(
-            model_name_or_path, cache_dir, use_np_cache):
+                model_name_or_path, cache_dir, use_np_cache):
            if ("attention.bias" in name or "attention.masked_bias" in name
-                or "rotary_emb.inv_freq" in name):
+                    or "rotary_emb.inv_freq" in name):
                continue
            param = state_dict[name]
            if "query_key_value" in name:
@ -230,17 +246,19 @@ class GPTNeoXForCausalLM(nn.Module):
                # required shape is [3 * num_heads * head_size, hidden_size].
                # Thus, we need weight conversion.
                shard_size = param.shape[0]
-                loaded_weight = loaded_weight[shard_size * tensor_model_parallel_rank
-                                              :shard_size * (tensor_model_parallel_rank + 1)]
+                loaded_weight = loaded_weight[
+                    shard_size * tensor_model_parallel_rank:shard_size *
+                    (tensor_model_parallel_rank + 1)]

                num_heads = self.config.num_attention_heads
                hidden_size = self.config.hidden_size
                head_size = hidden_size // num_heads
-                if 'query_key_value.weight' in name:
-                    loaded_weight = loaded_weight.view(-1, 3, head_size, hidden_size)
+                if "query_key_value.weight" in name:
+                    loaded_weight = loaded_weight.view(-1, 3, head_size,
+                                                       hidden_size)
                    loaded_weight = loaded_weight.transpose(0, 1)
                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
-                elif 'query_key_value.bias' in name:
+                elif "query_key_value.bias" in name:
                    loaded_weight = loaded_weight.view(-1, 3, head_size)
                    loaded_weight = loaded_weight.transpose(0, 1)
                    loaded_weight = loaded_weight.reshape(-1)
--- a/vllm/model_executor/models/internlm.py
+++ b/vllm/model_executor/models/internlm.py
@ -0,0 +1,299 @@
+# -*- coding: utf-8 -*-
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.parallel_utils.tensor_parallel import (
+    ColumnParallelLinear, RowParallelLinear, VocabParallelEmbedding)
+from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
+                                              load_tensor_parallel_weights)
+from vllm.sequence import SequenceOutputs
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class InternLMMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+    ):
+        super().__init__()
+        self.gate_up_proj = ColumnParallelLinear(hidden_size,
+                                                 2 * intermediate_size,
+                                                 bias=False,
+                                                 gather_output=False,
+                                                 perform_initialization=False)
+        self.down_proj = RowParallelLinear(intermediate_size,
+                                           hidden_size,
+                                           bias=False,
+                                           input_is_parallel=True,
+                                           perform_initialization=False)
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. "
+                             "Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class InternLMAttention(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+    ):
+        super().__init__()
+        self.hidden_size = hidden_size
+        tensor_model_parallel_world_size = (
+            get_tensor_model_parallel_world_size())
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = (self.total_num_heads //
+                          tensor_model_parallel_world_size)
+        self.head_dim = hidden_size // self.total_num_heads
+        self.scaling = self.head_dim**-0.5
+
+        self.qkv_proj = ColumnParallelLinear(
+            hidden_size,
+            3 * self.total_num_heads * self.head_dim,
+            bias=True,
+            gather_output=False,
+            perform_initialization=False,
+        )
+        self.o_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=True,
+            input_is_parallel=True,
+            perform_initialization=False,
+        )
+        self.attn = PagedAttentionWithRoPE(self.num_heads,
+                                           self.head_dim,
+                                           self.scaling,
+                                           rotary_dim=self.head_dim)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        qkv, _ = self.qkv_proj(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
+                                input_metadata, cache_event)
+        output, _ = self.o_proj(attn_output)
+        return output
+
+
+class InternLMDecoderLayer(nn.Module):
+
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = InternLMAttention(
+            hidden_size=self.hidden_size,
+            num_heads=config.num_attention_heads,
+        )
+        self.mlp = InternLMMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+        )
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = self.self_attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+            cache_event=cache_event,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class InternLMModel(nn.Module):
+
+    def __init__(self, config: LlamaConfig):
+        super().__init__()
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        vocab_size = ((config.vocab_size + 63) // 64) * 64
+        self.embed_tokens = VocabParallelEmbedding(
+            vocab_size, config.hidden_size, perform_initialization=False)
+        self.layers = nn.ModuleList([
+            InternLMDecoderLayer(config)
+            for _ in range(config.num_hidden_layers)
+        ])
+        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> torch.Tensor:
+        hidden_states = self.embed_tokens(input_ids)
+        for i in range(len(self.layers)):
+            if cache_events is None:
+                cache_event = None
+            else:
+                cache_event = cache_events[i]
+            layer = self.layers[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+                cache_event,
+            )
+        hidden_states = self.norm(hidden_states)
+        return hidden_states
+
+
+class InternLMForCausalLM(nn.Module):
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.model = InternLMModel(config)
+        vocab_size = ((config.vocab_size + 63) // 64) * 64
+        self.lm_head = ColumnParallelLinear(config.hidden_size,
+                                            vocab_size,
+                                            bias=False,
+                                            gather_output=False,
+                                            perform_initialization=False)
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> Dict[int, SequenceOutputs]:
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   input_metadata, cache_events)
+        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
+                                   input_metadata)
+        return next_tokens
+
+    _column_parallel_weights = [
+        "embed_tokens.weight", "lm_head.weight", "qkv_proj.weight",
+        "gate_proj.weight", "up_proj.weight"
+    ]
+    _row_parallel_weights = ["o_proj.weight", "down_proj.weight"]
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     use_np_cache: bool = False):
+        tensor_model_parallel_world_size = (
+            get_tensor_model_parallel_world_size())
+        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
+        state_dict = self.state_dict()
+
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, use_np_cache):
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if "embed_tokens" in name or "lm_head" in name:
+                param = state_dict[name]
+                # Consider padding in the vocab size.
+                padded_vocab_size = (param.shape[0] *
+                                     tensor_model_parallel_world_size)
+                num_extra_rows = padded_vocab_size - self.config.vocab_size
+                extra_rows = torch.empty(num_extra_rows,
+                                         loaded_weight.shape[1])
+                extra_rows = extra_rows.to(loaded_weight)
+                loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)
+
+            is_attention_weight = False
+            for stride_id, att_weight_name in enumerate(
+                ["q_proj", "k_proj", "v_proj"]):
+                if att_weight_name not in name:
+                    continue
+                param = state_dict[name.replace(att_weight_name, "qkv_proj")]
+                shard_size = param.shape[0] // 3
+                loaded_weight = loaded_weight[
+                    shard_size * tensor_model_parallel_rank:shard_size *
+                    (tensor_model_parallel_rank + 1)]
+                param_slice = param.data[shard_size * stride_id:shard_size *
+                                         (stride_id + 1)]
+                assert param_slice.shape == loaded_weight.shape
+                param_slice.copy_(loaded_weight)
+                is_attention_weight = True
+                break
+            if is_attention_weight:
+                continue
+
+            is_gate_up_weight = False
+            for stride_id, weight_name in enumerate(["gate_proj", "up_proj"]):
+                if weight_name not in name:
+                    continue
+                param = state_dict[name.replace(weight_name, "gate_up_proj")]
+                shard_size = param.shape[0] // 2
+                loaded_weight = loaded_weight[
+                    shard_size * tensor_model_parallel_rank:shard_size *
+                    (tensor_model_parallel_rank + 1)]
+                param_slice = param.data[shard_size * stride_id:shard_size *
+                                         (stride_id + 1)]
+                assert param_slice.shape == loaded_weight.shape
+                param_slice.copy_(loaded_weight)
+                is_gate_up_weight = True
+                break
+            if is_gate_up_weight:
+                continue
+
+            param = state_dict[name]
+            load_tensor_parallel_weights(param, loaded_weight, name,
+                                         self._column_parallel_weights,
+                                         self._row_parallel_weights,
+                                         tensor_model_parallel_rank)
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@ -1,5 +1,6 @@
 # coding=utf-8
-# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
 # Copyright 2023 The vLLM team.
 # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
 #
@ -30,7 +31,6 @@ import torch
 from torch import nn
 from transformers import LlamaConfig

-from vllm.sequence import SequenceOutputs
 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
@ -56,15 +56,19 @@ class LlamaMLP(nn.Module):
        hidden_act: str,
    ):
        super().__init__()
-        self.gate_up_proj = ColumnParallelLinear(hidden_size, 2 * intermediate_size,
-                                                 bias=False, gather_output=False,
+        self.gate_up_proj = ColumnParallelLinear(hidden_size,
+                                                 2 * intermediate_size,
+                                                 bias=False,
+                                                 gather_output=False,
                                                 perform_initialization=False)
-        self.down_proj = RowParallelLinear(intermediate_size, hidden_size,
-                                           bias=False, input_is_parallel=True,
+        self.down_proj = RowParallelLinear(intermediate_size,
+                                           hidden_size,
+                                           bias=False,
+                                           input_is_parallel=True,
                                           perform_initialization=False)
-        if hidden_act != 'silu':
-            raise ValueError(f'Unsupported activation: {hidden_act}. '
-                             'Only silu is supported for now.')
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. "
+                             "Only silu is supported for now.")
        self.act_fn = SiluAndMul()

    def forward(self, x):
@ -80,19 +84,26 @@ class LlamaAttention(nn.Module):
        self,
        hidden_size: int,
        num_heads: int,
+        num_kv_heads: int,
    ):
        super().__init__()
        self.hidden_size = hidden_size
-        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        tp_size = get_tensor_model_parallel_world_size()
        self.total_num_heads = num_heads
-        assert self.total_num_heads % tensor_model_parallel_world_size == 0
-        self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
+        assert self.total_num_heads % tp_size == 0
+        self.num_heads = self.total_num_heads // tp_size
+        self.total_num_kv_heads = num_kv_heads
+        assert self.total_num_kv_heads % tp_size == 0
+        self.num_kv_heads = self.total_num_kv_heads // tp_size
        self.head_dim = hidden_size // self.total_num_heads
-        self.scaling = self.head_dim ** -0.5
+        self.q_size = self.num_heads * self.head_dim
+        self.kv_size = self.num_kv_heads * self.head_dim
+        self.scaling = self.head_dim**-0.5

        self.qkv_proj = ColumnParallelLinear(
            hidden_size,
-            3 * self.total_num_heads * self.head_dim,
+            (self.total_num_heads + 2 * self.total_num_kv_heads) *
+            self.head_dim,
            bias=False,
            gather_output=False,
            perform_initialization=False,
@ -104,8 +115,11 @@ class LlamaAttention(nn.Module):
            input_is_parallel=True,
            perform_initialization=False,
        )
-        self.attn = PagedAttentionWithRoPE(self.num_heads, self.head_dim,
-                                           self.scaling, rotary_dim=self.head_dim)
+        self.attn = PagedAttentionWithRoPE(self.num_heads,
+                                           self.head_dim,
+                                           self.scaling,
+                                           rotary_dim=self.head_dim,
+                                           num_kv_heads=self.num_kv_heads)

    def forward(
        self,
@ -116,10 +130,10 @@ class LlamaAttention(nn.Module):
        cache_event: Optional[torch.cuda.Event],
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
-        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
        k_cache, v_cache = kv_cache
-        attn_output = self.attn(
-            positions, q, k, v, k_cache, v_cache, input_metadata, cache_event)
+        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
+                                input_metadata, cache_event)
        output, _ = self.o_proj(attn_output)
        return output

@ -132,14 +146,17 @@ class LlamaDecoderLayer(nn.Module):
        self.self_attn = LlamaAttention(
            hidden_size=self.hidden_size,
            num_heads=config.num_attention_heads,
+            num_kv_heads=config.num_key_value_heads,
        )
        self.mlp = LlamaMLP(
            hidden_size=self.hidden_size,
            intermediate_size=config.intermediate_size,
            hidden_act=config.hidden_act,
        )
-        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(config.hidden_size,
+                                                eps=config.rms_norm_eps)

    def forward(
        self,
@ -177,9 +194,12 @@ class LlamaModel(nn.Module):
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

-        self.embed_tokens = VocabParallelEmbedding(config.vocab_size, config.hidden_size,
-                                                   perform_initialization=False)
-        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        vocab_size = ((config.vocab_size + 63) // 64) * 64
+        self.embed_tokens = VocabParallelEmbedding(
+            vocab_size, config.hidden_size, perform_initialization=False)
+        self.layers = nn.ModuleList([
+            LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)
+        ])
        self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
@ -209,12 +229,14 @@ class LlamaModel(nn.Module):


 class LlamaForCausalLM(nn.Module):
+
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.model = LlamaModel(config)
+        vocab_size = ((config.vocab_size + 63) // 64) * 64
        self.lm_head = ColumnParallelLinear(config.hidden_size,
-                                            config.vocab_size,
+                                            vocab_size,
                                            bias=False,
                                            gather_output=False,
                                            perform_initialization=False)
@ -228,40 +250,64 @@ class LlamaForCausalLM(nn.Module):
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> Dict[int, SequenceOutputs]:
-        hidden_states = self.model(
-            input_ids, positions, kv_caches, input_metadata, cache_events)
-        next_tokens = self.sampler(
-            self.lm_head.weight, hidden_states, input_metadata)
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   input_metadata, cache_events)
+        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
+                                   input_metadata)
        return next_tokens

-    _column_parallel_weights = ["embed_tokens.weight", "lm_head.weight",
-                                "qkv_proj.weight", "gate_proj.weight",
-                                "up_proj.weight"]
+    _column_parallel_weights = [
+        "embed_tokens.weight", "lm_head.weight", "qkv_proj.weight",
+        "gate_proj.weight", "up_proj.weight"
+    ]
    _row_parallel_weights = ["o_proj.weight", "down_proj.weight"]

-    def load_weights(self, model_name_or_path: str,
+    def load_weights(self,
+                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     use_np_cache: bool = False):
+        tp_size = get_tensor_model_parallel_world_size()
        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
+        q_proj_shard_size = (self.config.hidden_size // tp_size)
+        kv_proj_shard_size = (self.config.hidden_size //
+                              self.config.num_attention_heads *
+                              self.config.num_key_value_heads // tp_size)
+        attention_weight_specs = [
+            # (weight_name, shard_size, offset)
+            ("q_proj", q_proj_shard_size, 0),
+            ("k_proj", kv_proj_shard_size, q_proj_shard_size),
+            ("v_proj", kv_proj_shard_size,
+             q_proj_shard_size + kv_proj_shard_size),
+        ]
        state_dict = self.state_dict()

        for name, loaded_weight in hf_model_weights_iterator(
-            model_name_or_path, cache_dir, use_np_cache):
+                model_name_or_path, cache_dir, use_np_cache):
            if "rotary_emb.inv_freq" in name:
                continue

+            if "embed_tokens" in name or "lm_head" in name:
+                param = state_dict[name]
+                # Consider padding in the vocab size.
+                padded_vocab_size = (param.shape[0] * tp_size)
+                num_extra_rows = padded_vocab_size - self.config.vocab_size
+                extra_rows = torch.empty(num_extra_rows,
+                                         loaded_weight.shape[1])
+                extra_rows = extra_rows.to(loaded_weight)
+                loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)
+
            is_attention_weight = False
-            for stride_id, att_weight_name in enumerate(["q_proj", "k_proj", "v_proj"]):
-                if att_weight_name not in name:
+            for weight_name, shard_size, offset in attention_weight_specs:
+                if weight_name not in name:
                    continue
-                param = state_dict[name.replace(att_weight_name, "qkv_proj")]
-                shard_size = param.shape[0] // 3
+                param = state_dict[name.replace(weight_name, "qkv_proj")]
+
                loaded_weight = loaded_weight[
-                    shard_size * tensor_model_parallel_rank
-                    :shard_size * (tensor_model_parallel_rank + 1)]
-                param_slice = param.data[shard_size * stride_id
-                                         :shard_size * (stride_id + 1)]
+                    shard_size * tensor_model_parallel_rank:shard_size *
+                    (tensor_model_parallel_rank + 1)]
+                param_slice = param.data[offset:offset + shard_size]
                assert param_slice.shape == loaded_weight.shape
+
                param_slice.copy_(loaded_weight)
                is_attention_weight = True
                break
@ -275,10 +321,10 @@ class LlamaForCausalLM(nn.Module):
                param = state_dict[name.replace(weight_name, "gate_up_proj")]
                shard_size = param.shape[0] // 2
                loaded_weight = loaded_weight[
-                    shard_size * tensor_model_parallel_rank
-                    :shard_size * (tensor_model_parallel_rank + 1)]
-                param_slice = param.data[shard_size * stride_id
-                                         :shard_size * (stride_id + 1)]
+                    shard_size * tensor_model_parallel_rank:shard_size *
+                    (tensor_model_parallel_rank + 1)]
+                param_slice = param.data[shard_size * stride_id:shard_size *
+                                         (stride_id + 1)]
                assert param_slice.shape == loaded_weight.shape
                param_slice.copy_(loaded_weight)
                is_gate_up_weight = True
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@ -0,0 +1,280 @@
+# coding=utf-8
+# Adapted from https://huggingface.co/mosaicml/mpt-7b/tree/main
+import math
+from typing import Dict, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.activation import get_act_fn
+from vllm.model_executor.layers.attention import PagedAttentionWithALiBi
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.weight_utils import (hf_model_weights_iterator,
+                                              load_tensor_parallel_weights)
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding, ColumnParallelLinear, RowParallelLinear)
+from vllm.sequence import SequenceOutputs
+from vllm.transformers_utils.configs.mpt import MPTConfig
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+def _get_alibi_slopes(
+    total_num_heads: int,
+    alibi_bias_max: int,
+) -> torch.Tensor:
+    next_power_of_2 = 2**math.ceil(math.log2(total_num_heads))
+    m = torch.arange(1, next_power_of_2 + 1, dtype=torch.float32)
+    m = m.mul(alibi_bias_max / next_power_of_2)
+    slopes = 1.0 / torch.pow(2, m)
+    if next_power_of_2 != total_num_heads:
+        slopes = torch.concat([slopes[1::2], slopes[::2]])[:total_num_heads]
+    return slopes
+
+
+class MPTAttention(nn.Module):
+
+    def __init__(self, config: MPTConfig):
+        super().__init__()
+        self.d_model = config.d_model
+        self.total_num_heads = config.n_heads
+        self.clip_qkv = config.attn_config["clip_qkv"]
+        self.qk_ln = config.attn_config["qk_ln"]
+        self.alibi_bias_max = config.attn_config["alibi_bias_max"]
+        assert not config.attn_config["prefix_lm"]
+        assert config.attn_config["alibi"]
+
+        self.qkv_proj = ColumnParallelLinear(
+            self.d_model,
+            3 * self.d_model,
+            bias=not config.no_bias,
+            gather_output=False,
+            perform_initialization=False,
+        )
+        if self.qk_ln:
+            self.q_ln = nn.LayerNorm(self.d_model)
+            self.k_ln = nn.LayerNorm(self.d_model)
+        self.out_proj = RowParallelLinear(
+            self.d_model,
+            self.d_model,
+            bias=not config.no_bias,
+            input_is_parallel=True,
+            perform_initialization=False,
+        )
+
+        tp_world_size = get_tensor_model_parallel_world_size()
+        assert self.total_num_heads % tp_world_size == 0
+        self.num_heads = self.total_num_heads // tp_world_size
+
+        # Create the alibi slopes and slice them.
+        tp_rank = get_tensor_model_parallel_rank()
+        head_start = tp_rank * self.num_heads
+        head_end = (tp_rank + 1) * self.num_heads
+        alibi_slopes = _get_alibi_slopes(self.total_num_heads,
+                                         self.alibi_bias_max)
+        alibi_slopes = alibi_slopes[head_start:head_end].tolist()
+
+        self.head_dim = self.d_model // self.total_num_heads
+        scaling = self.head_dim**-0.5
+        self.attn = PagedAttentionWithALiBi(self.num_heads, self.head_dim,
+                                            scaling, alibi_slopes)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        del position_ids  # unused.
+        qkv, _ = self.qkv_proj(hidden_states)
+        if self.clip_qkv is not None:
+            qkv.clamp_(min=-self.clip_qkv, max=self.clip_qkv)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        if self.qk_ln:
+            q = self.q_ln(q)
+            k = self.k_ln(k)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                cache_event)
+        output, _ = self.out_proj(attn_output)
+        return output
+
+
+class MPTMLP(nn.Module):
+
+    def __init__(self, config: MPTConfig):
+        super().__init__()
+        hidden_size = config.d_model
+        expansion_ratio = config.expansion_ratio
+        intermediate_size = expansion_ratio * hidden_size
+        self.up_proj = ColumnParallelLinear(hidden_size,
+                                            intermediate_size,
+                                            bias=not config.no_bias,
+                                            gather_output=False,
+                                            perform_initialization=False)
+        self.act = get_act_fn("gelu")
+        self.down_proj = RowParallelLinear(intermediate_size,
+                                           hidden_size,
+                                           bias=not config.no_bias,
+                                           input_is_parallel=True,
+                                           perform_initialization=False)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x, _ = self.up_proj(x)
+        x = self.act(x)
+        x, _ = self.down_proj(x)
+        return x
+
+
+class MPTBlock(nn.Module):
+
+    def __init__(self, config: MPTConfig):
+        super().__init__()
+        hidden_size = config.d_model
+        self.norm_1 = nn.LayerNorm(hidden_size)
+        self.attn = MPTAttention(config)
+        self.norm_2 = nn.LayerNorm(hidden_size)
+        self.ffn = MPTMLP(config)
+
+    def forward(
+        self,
+        position_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        x = self.norm_1(hidden_states)
+        x = self.attn(
+            position_ids=position_ids,
+            hidden_states=x,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+            cache_event=cache_event,
+        )
+        hidden_states = hidden_states + x
+        x = self.norm_2(hidden_states)
+        x = self.ffn(x)
+        hidden_states = hidden_states + x
+        return hidden_states
+
+
+class MPTModel(nn.Module):
+
+    def __init__(self, config: MPTConfig):
+        super().__init__()
+        assert config.embedding_fraction == 1.0
+        assert config.norm_type == "low_precision_layernorm"
+
+        self.wte = VocabParallelEmbedding(config.vocab_size,
+                                          config.d_model,
+                                          perform_initialization=False)
+        self.blocks = nn.ModuleList(
+            [MPTBlock(config) for _ in range(config.n_layers)])
+        self.norm_f = nn.LayerNorm(config.d_model)
+        if config.no_bias:
+            for module in self.modules():
+                if hasattr(module, "bias"):
+                    if isinstance(module.bias, nn.Parameter):
+                        # Remove the bias term in Linear and LayerNorm.
+                        module.register_parameter("bias", None)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> torch.Tensor:
+        hidden_states = self.wte(input_ids)
+        for i in range(len(self.blocks)):
+            if cache_events is None:
+                cache_event = None
+            else:
+                cache_event = cache_events[i]
+            block = self.blocks[i]
+            hidden_states = block(
+                position_ids,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+                cache_event,
+            )
+        hidden_states = self.norm_f(hidden_states)
+        return hidden_states
+
+
+class MPTForCausalLM(nn.Module):
+
+    def __init__(self, config: MPTConfig):
+        super().__init__()
+        self.config = config
+        assert config.tie_word_embeddings
+
+        self.transformer = MPTModel(config)
+        # TODO(zhuohan): create a new weight after implementing pipeline
+        #                parallelism
+        self.lm_head_weight = self.transformer.wte.weight
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> Dict[int, SequenceOutputs]:
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         input_metadata, cache_events)
+        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
+                                   input_metadata)
+        return next_tokens
+
+    _column_parallel_weights = ["wte.weight", "up_proj.weight", "up_proj.bias"]
+    _row_parallel_weights = ["out_proj.weight", "down_proj.weight"]
+
+    def load_weights(self,
+                     model_name_or_path: str,
+                     cache_dir: Optional[str] = None,
+                     use_np_cache: bool = False):
+        tp_world_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        state_dict = self.state_dict()
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, use_np_cache):
+            if "Wqkv" in name:
+                # NOTE(woosuk): MPT's fused QKV has the shape of
+                # [3 * num_heads * head_size, hidden_size].
+                # When tensor model parallelism is used, we need to shard
+                # the weight along the hidden dimension.
+                total_num_heads = self.config.num_attention_heads
+                hidden_size = self.config.hidden_size
+                head_size = hidden_size // total_num_heads
+                num_heads = total_num_heads // tp_world_size
+                head_start = tp_rank * num_heads
+                head_end = (tp_rank + 1) * num_heads
+
+                if name.endswith(".weight"):
+                    loaded_weight = loaded_weight.view(3, total_num_heads,
+                                                       head_size, hidden_size)
+                    loaded_weight = loaded_weight[:, head_start:head_end, :, :]
+                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
+                elif name.endswith(".bias"):
+                    loaded_weight = loaded_weight.view(3, total_num_heads,
+                                                       head_size)
+                    loaded_weight = loaded_weight[:, head_start:head_end, :]
+                    loaded_weight = loaded_weight.reshape(-1)
+                else:
+                    raise ValueError(f"Unexpected parameter name {name}")
+                name = name.replace("Wqkv", "qkv_proj")
+            param = state_dict[name]
+            load_tensor_parallel_weights(param, loaded_weight, name,
+                                         self._column_parallel_weights,
+                                         self._row_parallel_weights, tp_rank)
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@ -1,7 +1,9 @@
 # coding=utf-8
-# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
+# Adapted from
+# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
 # Copyright 2023 The vLLM team.
-# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights
+# reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@ -43,8 +45,9 @@ KVCache = Tuple[torch.Tensor, torch.Tensor]
 class OPTLearnedPositionalEmbedding(nn.Embedding):

    def __init__(self, num_embeddings: int, embedding_dim: int):
-        # OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
-        # and adjust num_embeddings appropriately. Other models don't have this hack
+        # OPT is set up so that if padding_idx is specified then offset the
+        # embedding ids by 2 and adjust num_embeddings appropriately. Other
+        # models don't have this hack
        self.offset = 2
        super().__init__(num_embeddings + self.offset, embedding_dim)

@ -62,20 +65,26 @@ class OPTAttention(nn.Module):
    ) -> None:
        super().__init__()
        self.embed_dim = embed_dim
-        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size()
+        tensor_model_parallel_world_size = (
+            get_tensor_model_parallel_world_size())
        total_num_heads = num_heads
        assert num_heads % tensor_model_parallel_world_size == 0
        self.num_heads = total_num_heads // tensor_model_parallel_world_size
        self.head_dim = embed_dim // total_num_heads
-        self.scaling = self.head_dim ** -0.5
+        self.scaling = self.head_dim**-0.5

-        self.qkv_proj = ColumnParallelLinear(embed_dim, 3 * embed_dim, bias=bias,
+        self.qkv_proj = ColumnParallelLinear(embed_dim,
+                                             3 * embed_dim,
+                                             bias=bias,
                                             gather_output=False,
                                             perform_initialization=False)
-        self.out_proj = RowParallelLinear(embed_dim, embed_dim, bias=bias,
+        self.out_proj = RowParallelLinear(embed_dim,
+                                          embed_dim,
+                                          bias=bias,
                                          input_is_parallel=True,
                                          perform_initialization=False)
-        self.attn = PagedAttention(self.num_heads, self.head_dim,
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
                                   scale=self.scaling)

    def forward(
@ -88,8 +97,8 @@ class OPTAttention(nn.Module):
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
        key_cache, value_cache = kv_cache
-        attn_output = self.attn(
-            q, k, v, key_cache, value_cache, input_metadata, cache_event)
+        attn_output = self.attn(q, k, v, key_cache, value_cache,
+                                input_metadata, cache_event)
        output, _ = self.out_proj(attn_output)
        return output

@ -109,17 +118,21 @@ class OPTDecoderLayer(nn.Module):
        self.activation_fn = get_act_fn(config.activation_function)

        self.self_attn_layer_norm = nn.LayerNorm(
-            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)
-        self.fc1 = ColumnParallelLinear(self.embed_dim, config.ffn_dim,
+            self.embed_dim,
+            elementwise_affine=config.layer_norm_elementwise_affine)
+        self.fc1 = ColumnParallelLinear(self.embed_dim,
+                                        config.ffn_dim,
                                        bias=config.enable_bias,
                                        gather_output=False,
                                        perform_initialization=False)
-        self.fc2 = RowParallelLinear(config.ffn_dim, self.embed_dim,
+        self.fc2 = RowParallelLinear(config.ffn_dim,
+                                     self.embed_dim,
                                     bias=config.enable_bias,
                                     input_is_parallel=True,
                                     perform_initialization=False)
        self.final_layer_norm = nn.LayerNorm(
-            self.embed_dim, elementwise_affine=config.layer_norm_elementwise_affine)
+            self.embed_dim,
+            elementwise_affine=config.layer_norm_elementwise_affine)

    def forward(
        self,
@ -133,11 +146,10 @@ class OPTDecoderLayer(nn.Module):
        # 125m, 1.7B, ..., 175B applies layer norm BEFORE attention
        if self.do_layer_norm_before:
            hidden_states = self.self_attn_layer_norm(hidden_states)
-        hidden_states = self.self_attn(
-            hidden_states=hidden_states,
-            kv_cache=kv_cache,
-            input_metadata=input_metadata,
-            cache_event=cache_event)
+        hidden_states = self.self_attn(hidden_states=hidden_states,
+                                       kv_cache=kv_cache,
+                                       input_metadata=input_metadata,
+                                       cache_event=cache_event)
        hidden_states = residual + hidden_states
        # 350m applies layer norm AFTER attention
        if not self.do_layer_norm_before:
@ -167,35 +179,42 @@ class OPTDecoder(nn.Module):
        self.max_target_positions = config.max_position_embeddings
        self.vocab_size = config.vocab_size

-        self.embed_tokens = VocabParallelEmbedding(config.vocab_size,
-                                                   config.word_embed_proj_dim,
-                                                   perform_initialization=False)
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.word_embed_proj_dim,
+            perform_initialization=False)
        # Positional embeddings are replicated (not sharded).
        self.embed_positions = OPTLearnedPositionalEmbedding(
            config.max_position_embeddings, config.hidden_size)

        # Project out & in will be replicated if they exist.
        if config.word_embed_proj_dim != config.hidden_size:
-            self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias=False)
+            self.project_out = nn.Linear(config.hidden_size,
+                                         config.word_embed_proj_dim,
+                                         bias=False)
        else:
            self.project_out = None

        if config.word_embed_proj_dim != config.hidden_size:
-            self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias=False)
+            self.project_in = nn.Linear(config.word_embed_proj_dim,
+                                        config.hidden_size,
+                                        bias=False)
        else:
            self.project_in = None

-        # Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility
-        # with checkpoints that have been fine-tuned before transformers v4.20.1
+        # Note that the only purpose of `config._remove_final_layer_norm` is to
+        # keep backward compatibility with checkpoints that have been fine-tuned
+        # before transformers v4.20.1
        # see https://github.com/facebookresearch/metaseq/pull/164
        if config.do_layer_norm_before and not config._remove_final_layer_norm:
            self.final_layer_norm = nn.LayerNorm(
-                config.hidden_size, elementwise_affine=config.layer_norm_elementwise_affine
-            )
+                config.hidden_size,
+                elementwise_affine=config.layer_norm_elementwise_affine)
        else:
            self.final_layer_norm = None

-        self.layers = nn.ModuleList([OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.layers = nn.ModuleList(
+            [OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(
        self,
@ -217,8 +236,8 @@ class OPTDecoder(nn.Module):
            else:
                cache_event = cache_events[i]
            layer = self.layers[i]
-            hidden_states = layer(
-                hidden_states, kv_caches[i], input_metadata, cache_event)
+            hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
+                                  cache_event)

        if self.final_layer_norm is not None:
            hidden_states = self.final_layer_norm(hidden_states)
@ -241,8 +260,8 @@ class OPTModel(nn.Module):
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
-        return self.decoder(
-            input_ids, positions, kv_caches, input_metadata, cache_events)
+        return self.decoder(input_ids, positions, kv_caches, input_metadata,
+                            cache_events)


 class OPTForCausalLM(nn.Module):
@ -264,23 +283,26 @@ class OPTForCausalLM(nn.Module):
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> Dict[int, SequenceOutputs]:
-        hidden_states = self.model(
-            input_ids, positions, kv_caches, input_metadata, cache_events)
-        next_tokens = self.sampler(
-            self.lm_head_weight, hidden_states, input_metadata)
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   input_metadata, cache_events)
+        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
+                                   input_metadata)
        return next_tokens

-    _column_parallel_weights = ["embed_tokens.weight", "fc1.weight", "fc1.bias"]
+    _column_parallel_weights = [
+        "embed_tokens.weight", "fc1.weight", "fc1.bias"
+    ]
    _row_parallel_weights = ["out_proj.weight", "fc2.weight"]

-    def load_weights(self, model_name_or_path: str,
+    def load_weights(self,
+                     model_name_or_path: str,
                     cache_dir: Optional[str] = None,
                     use_np_cache: bool = False):
        tensor_model_parallel_rank = get_tensor_model_parallel_rank()
        state_dict = self.state_dict()

        for name, loaded_weight in hf_model_weights_iterator(
-            model_name_or_path, cache_dir, use_np_cache):
+                model_name_or_path, cache_dir, use_np_cache):
            if "lm_head.weight" in name:
                continue

@ -288,16 +310,17 @@ class OPTForCausalLM(nn.Module):
                name = "model." + name

            is_attention_weight = False
-            for stride_id, att_weight_name in enumerate(["q_proj", "k_proj", "v_proj"]):
+            for stride_id, att_weight_name in enumerate(
+                ["q_proj", "k_proj", "v_proj"]):
                if att_weight_name not in name:
                    continue
                param = state_dict[name.replace(att_weight_name, "qkv_proj")]
                shard_size = param.shape[0] // 3
                loaded_weight = loaded_weight[
-                    shard_size * tensor_model_parallel_rank
-                    :shard_size * (tensor_model_parallel_rank + 1)]
-                param_slice = param.data[shard_size * stride_id
-                                         :shard_size * (stride_id + 1)]
+                    shard_size * tensor_model_parallel_rank:shard_size *
+                    (tensor_model_parallel_rank + 1)]
+                param_slice = param.data[shard_size * stride_id:shard_size *
+                                         (stride_id + 1)]
                assert param_slice.shape == loaded_weight.shape
                param_slice.copy_(loaded_weight)
                is_attention_weight = True
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@ -0,0 +1,316 @@
+# coding=utf-8
+# Adapted from
+# https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
+# Copyright (c) Alibaba Cloud.
+# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
+"""Inference-only QWen model compatible with HuggingFace weights.
+
+The input of the model is flattened to a 1D tensor of tokens. The model uses
+InputMetadata to extract the original 2D shape of the input.
+"""
+from typing import Dict, List, Optional, Tuple
+
+import torch
+from torch import nn
+
+from vllm.model_executor.input_metadata import InputMetadata
+from vllm.model_executor.layers.activation import SiluAndMul
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.weight_utils import (
+    hf_model_weights_iterator,
+    load_tensor_parallel_weights,
+)
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.model_executor.parallel_utils.tensor_parallel import (
+    VocabParallelEmbedding,
+    ColumnParallelLinear,
+    RowParallelLinear,
+)
+from vllm.sequence import SequenceOutputs
+from vllm.transformers_utils.configs.qwen import QWenConfig
+
+KVCache = Tuple[torch.Tensor, torch.Tensor]
+
+
+class QWenMLP(nn.Module):
+
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str = "silu",
+    ):
+        super().__init__()
+        self.gate_up_proj = ColumnParallelLinear(
+            hidden_size,
+            2 * intermediate_size,
+            bias=False,
+            gather_output=False,
+            perform_initialization=False,
+        )
+        self.c_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            perform_initialization=False,
+        )
+        if hidden_act != "silu":
+            raise ValueError(f"Unsupported activation: {hidden_act}. "
+                             "Only silu is supported for now.")
+        self.act_fn = SiluAndMul()
+
+    def forward(self, x):
+        gate_up, _ = self.gate_up_proj(x)
+        x = self.act_fn(gate_up)
+        x, _ = self.c_proj(x)
+        return x
+
+
+class QWenAttention(nn.Module):
+
+    def __init__(self, hidden_size: int, num_heads: int,
+                 max_position_embeddings: int):
+        super().__init__()
+        self.hidden_size = hidden_size
+        tensor_model_parallel_world_size = get_tensor_model_parallel_world_size(
+        )
+        self.total_num_heads = num_heads
+        assert self.total_num_heads % tensor_model_parallel_world_size == 0
+        self.num_heads = (self.total_num_heads //
+                          tensor_model_parallel_world_size)
+        self.head_dim = hidden_size // self.total_num_heads
+
+        # pylint: disable=invalid-name
+        self.c_attn = ColumnParallelLinear(
+            hidden_size,
+            3 * hidden_size,
+            bias=True,
+            gather_output=False,
+            perform_initialization=False,
+        )
+        self.c_proj = RowParallelLinear(
+            self.total_num_heads * self.head_dim,
+            hidden_size,
+            bias=False,
+            input_is_parallel=True,
+            perform_initialization=False,
+        )
+        self.scaling = self.head_dim**-0.5
+        self.attn = PagedAttentionWithRoPE(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            rotary_dim=self.head_dim,
+            max_position=max_position_embeddings,
+        )
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        qkv, _ = self.c_attn(hidden_states)
+        q, k, v = qkv.chunk(chunks=3, dim=-1)
+
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
+                                input_metadata, cache_event)
+
+        output, _ = self.c_proj(attn_output)
+        return output
+
+
+class QWenBlock(nn.Module):
+
+    def __init__(self, config: QWenConfig):
+        super().__init__()
+        self.ln_1 = RMSNorm(config.n_embd, eps=config.layer_norm_epsilon)
+
+        self.attn = QWenAttention(config.n_embd, config.num_attention_heads,
+                                  config.max_position_embeddings)
+
+        self.ln_2 = RMSNorm(config.n_embd, eps=config.layer_norm_epsilon)
+
+        self.mlp = QWenMLP(config.n_embd, config.ffn_hidden_size // 2)
+
+    def forward(
+        self,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        kv_cache: KVCache,
+        input_metadata: InputMetadata,
+        cache_event: Optional[torch.cuda.Event],
+    ) -> torch.Tensor:
+        # Self Attention
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        hidden_states = self.attn(
+            positions=positions,
+            hidden_states=hidden_states,
+            kv_cache=kv_cache,
+            input_metadata=input_metadata,
+            cache_event=cache_event,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class QWenModel(nn.Module):
+
+    def __init__(self, config: QWenConfig):
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+
+        vocab_size = ((config.vocab_size + 63) // 64) * 64
+        self.wte = VocabParallelEmbedding(vocab_size,
+                                          config.n_embd,
+                                          perform_initialization=False)
+        self.h = nn.ModuleList(
+            [QWenBlock(config) for _ in range(config.num_hidden_layers)])
+        self.ln_f = RMSNorm(config.n_embd, eps=config.layer_norm_epsilon)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> torch.Tensor:
+        hidden_states = self.wte(input_ids)
+        for i in range(len(self.h)):
+            if cache_events is None:
+                cache_event = None
+            else:
+                cache_event = cache_events[i]
+            layer = self.h[i]
+            hidden_states = layer(
+                positions,
+                hidden_states,
+                kv_caches[i],
+                input_metadata,
+                cache_event,
+            )
+        hidden_states = self.ln_f(hidden_states)
+        return hidden_states
+
+
+class QWenLMHeadModel(nn.Module):
+
+    def __init__(self, config: QWenConfig):
+        super().__init__()
+        self.config = config
+        self.transformer = QWenModel(config)
+        vocab_size = ((config.vocab_size + 63) // 64) * 64
+        self.lm_head = ColumnParallelLinear(
+            config.n_embd,
+            vocab_size,
+            bias=False,
+            gather_output=False,
+            perform_initialization=False,
+        )
+        self.sampler = Sampler(config.vocab_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[KVCache],
+        input_metadata: InputMetadata,
+        cache_events: Optional[List[torch.cuda.Event]],
+    ) -> Dict[int, SequenceOutputs]:
+        hidden_states = self.transformer(input_ids, positions, kv_caches,
+                                         input_metadata, cache_events)
+        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
+                                   input_metadata)
+        return next_tokens
+
+    _column_parallel_weights = ["wte.weight", "lm_head.weight"]
+    _row_parallel_weights = ["c_proj.weight"]
+
+    def load_weights(
+        self,
+        model_name_or_path: str,
+        cache_dir: Optional[str] = None,
+        use_np_cache: bool = False,
+    ):
+        tp_world_size = get_tensor_model_parallel_world_size()
+        tp_rank = get_tensor_model_parallel_rank()
+        state_dict = self.state_dict()
+
+        for name, loaded_weight in hf_model_weights_iterator(
+                model_name_or_path, cache_dir, use_np_cache):
+            if "rotary_emb.inv_freq" in name:
+                continue
+
+            if "wte" in name or "lm_head" in name:
+                # Consider padding in the vocab size.
+                param = state_dict[name]
+                padded_vocab_size = param.shape[0] * tp_world_size
+                num_extra_rows = padded_vocab_size - self.config.vocab_size
+                extra_rows = torch.empty(num_extra_rows,
+                                         loaded_weight.shape[1])
+                extra_rows = extra_rows.to(loaded_weight)
+                loaded_weight = torch.cat([loaded_weight, extra_rows], dim=0)
+
+            if "c_attn" in name:
+                total_num_heads = self.config.num_attention_heads
+                hidden_size = self.config.hidden_size
+                head_size = hidden_size // total_num_heads
+                num_heads = total_num_heads // tp_world_size
+                head_start = tp_rank * num_heads
+                head_end = (tp_rank + 1) * num_heads
+
+                if "weight" in name:
+                    loaded_weight = loaded_weight.view(3, total_num_heads,
+                                                       head_size, hidden_size)
+                    loaded_weight = loaded_weight[:, head_start:head_end, :, :]
+                    loaded_weight = loaded_weight.reshape(-1, hidden_size)
+                elif "bias" in name:
+                    loaded_weight = loaded_weight.view(3, total_num_heads,
+                                                       head_size)
+                    loaded_weight = loaded_weight[:, head_start:head_end, :]
+                    loaded_weight = loaded_weight.reshape(-1)
+
+            is_gate_up_weight = False
+            for stride_id, weight_name in enumerate(["w2", "w1"]):
+                if weight_name not in name:
+                    continue
+                param = state_dict[name.replace(weight_name, "gate_up_proj")]
+                shard_size = param.shape[0] // 2
+                loaded_weight = loaded_weight[shard_size * tp_rank:shard_size *
+                                              (tp_rank + 1)]
+                param_slice = param.data[shard_size * stride_id:shard_size *
+                                         (stride_id + 1)]
+                assert param_slice.shape == loaded_weight.shape
+                param_slice.copy_(loaded_weight)
+                is_gate_up_weight = True
+                break
+            if is_gate_up_weight:
+                continue
+
+            param = state_dict[name]
+            load_tensor_parallel_weights(
+                param,
+                loaded_weight,
+                name,
+                self._column_parallel_weights,
+                self._row_parallel_weights,
+                tp_rank,
+            )
--- a/vllm/model_executor/parallel_utils/init.py
+++ b/vllm/model_executor/parallel_utils/init.py
@ -1,9 +1,6 @@
 import vllm.model_executor.parallel_utils.parallel_state
 import vllm.model_executor.parallel_utils.tensor_parallel

-# Alias parallel_state as mpu, its legacy name
-mpu = parallel_state
-
 __all__ = [
    "parallel_state",
    "tensor_parallel",
--- a/vllm/model_executor/parallel_utils/parallel_state.py
+++ b/vllm/model_executor/parallel_utils/parallel_state.py
@ -44,7 +44,6 @@ _PIPELINE_GLOBAL_RANKS = None
 # rank when broadcasting weights from src to all other data parallel ranks
 _DATA_PARALLEL_GLOBAL_RANKS = None

-_ALL_REDUCE_LAUNCHER: Optional['GraphAllReduce'] = None

 def initialize_model_parallel(
    tensor_model_parallel_size: int = 1,
@ -196,20 +195,6 @@ def initialize_model_parallel(
        if rank in ranks:
            _POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks

-def initialize_all_reduce_launcher(
-    max_num_tokens: int,
-    hidden_size: int,
-    dtype: torch.dtype,
-    disable_graph: bool = False,
-) -> None:
-    global _ALL_REDUCE_LAUNCHER
-    _ALL_REDUCE_LAUNCHER = GraphAllReduce(
-        max_num_tokens=max_num_tokens,
-        hidden_size=hidden_size,
-        dtype=dtype,
-        disable_graph=disable_graph,
-    )
-
 def model_parallel_is_initialized():
    """Check if model and data parallel groups are initialized."""
    if _TENSOR_MODEL_PARALLEL_GROUP is None or \
@ -458,6 +443,7 @@ def get_pipeline_model_parallel_last_rank():
    last_rank_local = get_pipeline_model_parallel_world_size() - 1
    return _PIPELINE_GLOBAL_RANKS[last_rank_local]

+
 def get_pipeline_model_parallel_next_rank():
    """Return the global rank that follows the caller in the pipeline"""
    assert _PIPELINE_GLOBAL_RANKS is not None, \
@ -485,10 +471,6 @@ def get_data_parallel_rank():
    """Return my rank for the data parallel group."""
    return torch.distributed.get_rank(group=get_data_parallel_group())

-def get_all_reduce_launcher() -> 'GraphAllReduce':
-    assert _ALL_REDUCE_LAUNCHER is not None, 'all reduce launcher is not initialized'
-    return _ALL_REDUCE_LAUNCHER
-
 def destroy_model_parallel():
    """Set the groups to none."""
    global _MODEL_PARALLEL_GROUP
@ -515,56 +497,3 @@ def destroy_model_parallel():
    _MPU_TENSOR_MODEL_PARALLEL_RANK = None
    global _MPU_PIPELINE_MODEL_PARALLEL_RANK
    _MPU_PIPELINE_MODEL_PARALLEL_RANK = None
-
-
-class GraphAllReduce:
-
-    def __init__(
-        self,
-        max_num_tokens: int,
-        hidden_size: int,
-        dtype: torch.dtype,
-        disable_graph: bool = False,
-    ) -> None:
-        self.max_num_tokens = max_num_tokens
-        self.hidden_size = hidden_size
-        self.disable_graph = disable_graph
-
-        tp_world_size = get_tensor_model_parallel_world_size()
-        if tp_world_size == 1:
-            return
-
-        self.group = get_tensor_model_parallel_group()
-        self.buffer = torch.empty(
-            size=(max_num_tokens, hidden_size),
-            dtype=dtype,
-            device='cuda',
-        )
-
-        # Build graphs for different number of tokens.
-        if not self.disable_graph:
-            self.graphs = {}
-            for num_tokens in range(8, max_num_tokens + 1, 8):
-                self.graphs[num_tokens] = self._build_graph(num_tokens)
-
-    def _build_graph(self, num_tokens: int) -> torch.cuda.CUDAGraph:
-        # Warm up.
-        torch.distributed.all_reduce(self.buffer[:num_tokens], group=self.group)
-        torch.cuda.synchronize()
-
-        # Build graph.
-        graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(graph):
-            torch.distributed.all_reduce(
-                self.buffer[:num_tokens], group=self.group)
-        torch.cuda.synchronize()
-        return graph
-
-    def launch(self, x: torch.Tensor) -> torch.Tensor:
-        # NOTE: x must be a slice of self.buffer.
-        num_tokens = x.shape[0]
-        if self.disable_graph:
-            torch.distributed.all_reduce(x, group=self.group)
-        else:
-            self.graphs[num_tokens].replay()
-        return x
--- a/vllm/model_executor/parallel_utils/tensor_parallel/init.py
+++ b/vllm/model_executor/parallel_utils/tensor_parallel/init.py
@ -12,6 +12,7 @@ from .mappings import (
    copy_to_tensor_model_parallel_region,
    gather_from_tensor_model_parallel_region,
    gather_from_sequence_parallel_region,
+    reduce_from_tensor_model_parallel_region,
    scatter_to_tensor_model_parallel_region,
    scatter_to_sequence_parallel_region,
 )
@ -38,7 +39,7 @@ __all__ = [
    "copy_to_tensor_model_parallel_region",
    "gather_from_tensor_model_parallel_region",
    "gather_from_sequence_parallel_region",
-#    "reduce_from_tensor_model_parallel_region",
+    "reduce_from_tensor_model_parallel_region",
    "scatter_to_tensor_model_parallel_region",
    "scatter_to_sequence_parallel_region",
    # random.py
--- a/vllm/model_executor/parallel_utils/tensor_parallel/layers.py
+++ b/vllm/model_executor/parallel_utils/tensor_parallel/layers.py
@ -14,7 +14,6 @@ from torch.nn.parameter import Parameter
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank,
    get_tensor_model_parallel_world_size,
-    get_all_reduce_launcher,
 )
 from .mappings import (
    copy_to_tensor_model_parallel_region,
@ -248,8 +247,8 @@ class ColumnParallelLinear(torch.nn.Module):
        self.output_size = output_size
        self.gather_output = gather_output
        # Divide the weight matrix along the last dimension.
-        world_size = get_tensor_model_parallel_world_size()
-        self.output_size_per_partition = divide(output_size, world_size)
+        self.world_size = get_tensor_model_parallel_world_size()
+        self.output_size_per_partition = divide(output_size, self.world_size)
        self.skip_bias_add = skip_bias_add

        if params_dtype is None:
@ -350,6 +349,7 @@ class RowParallelLinear(torch.nn.Module):
        params_dtype:
        use_cpu_initialization:
        perform_initialization:
+        reduce_results:
    """

    def __init__(self, input_size, output_size, *,
@ -360,6 +360,7 @@ class RowParallelLinear(torch.nn.Module):
                 params_dtype=None,
                 use_cpu_initialization=False,
                 perform_initialization=True,
+                 reduce_results=True,
                 ):
        super(RowParallelLinear, self).__init__()

@ -367,14 +368,19 @@ class RowParallelLinear(torch.nn.Module):
        self.input_size = input_size
        self.output_size = output_size
        self.input_is_parallel = input_is_parallel
+        self.reduce_results = reduce_results
        if params_dtype is None:
            params_dtype = torch.get_default_dtype()

        # Divide the weight matrix along the last dimension.
-        world_size = get_tensor_model_parallel_world_size()
-        self.input_size_per_partition = divide(input_size, world_size)
+        self.world_size = get_tensor_model_parallel_world_size()
+        self.input_size_per_partition = divide(input_size, self.world_size)
        self.skip_bias_add = skip_bias_add

+        if not reduce_results and (bias and not skip_bias_add):
+            raise ValueError("When not reduce the results, adding bias to the "
+                             "results can lead to incorrect results")
+
        # Parameters.
        # Note: torch.nn.functional.linear performs XA^T + b and as a result
        # we allocate the transpose.
@ -427,17 +433,12 @@ class RowParallelLinear(torch.nn.Module):
            input_parallel = input_
        else:
            input_parallel = scatter_to_tensor_model_parallel_region(input_)
-        if get_tensor_model_parallel_world_size() == 1:
-            # Matrix multiply.
-            output_ = F.linear(input_parallel, self.weight)
+        # Matrix multiply.
+        output_parallel = F.linear(input_parallel, self.weight)
+        if self.reduce_results and self.world_size > 1:
+            output_ = reduce_from_tensor_model_parallel_region(output_parallel)
        else:
-            # Matrix multiply.
-            all_reduce_launcher = get_all_reduce_launcher()
-            num_tokens = input_parallel.shape[0]
-            output_buffer = all_reduce_launcher.buffer[:num_tokens]
-            torch.matmul(input_parallel, self.weight_t, out=output_buffer)
-            # All-reduce across all the partitions.
-            output_ = all_reduce_launcher.launch(output_buffer)
+            output_ = output_parallel

        if not self.skip_bias_add:
            output = output_ + self.bias if self.bias is not None else output_
--- a/vllm/model_executor/weight_utils.py
+++ b/vllm/model_executor/weight_utils.py
@ -39,14 +39,17 @@ def hf_model_weights_iterator(
    else:
        hf_folder = model_name_or_path

-    hf_bin_files = glob.glob(os.path.join(hf_folder, "*.bin"))
+    hf_bin_files = [
+        x for x in glob.glob(os.path.join(hf_folder, "*.bin"))
+        if not x.endswith("training_args.bin")
+    ]

    if use_np_cache:
        # Convert the model weights from torch tensors to numpy arrays for
        # faster loading.
-        np_folder = os.path.join(hf_folder, 'np')
+        np_folder = os.path.join(hf_folder, "np")
        os.makedirs(np_folder, exist_ok=True)
-        weight_names_file = os.path.join(np_folder, 'weight_names.json')
+        weight_names_file = os.path.join(np_folder, "weight_names.json")
        with lock:
            if not os.path.exists(weight_names_file):
                weight_names = []
@ -57,10 +60,10 @@ def hf_model_weights_iterator(
                        with open(param_path, "wb") as f:
                            np.save(f, param.cpu().detach().numpy())
                        weight_names.append(name)
-                with open(weight_names_file, 'w') as f:
+                with open(weight_names_file, "w") as f:
                    json.dump(weight_names, f)

-        with open(weight_names_file, 'r') as f:
+        with open(weight_names_file, "r") as f:
            weight_names = json.load(f)

        for name in weight_names:
@ -73,6 +76,8 @@ def hf_model_weights_iterator(
            state = torch.load(bin_file, map_location="cpu")
            for name, param in state.items():
                yield name, param
+            del state
+            torch.cuda.empty_cache()


 def load_tensor_parallel_weights(
@ -86,19 +91,20 @@ def load_tensor_parallel_weights(
    for p in column_parallel_weight_names:
        if p in param_name:
            shard_size = param.shape[0]
-            loaded_weight = loaded_weight[
-                shard_size * tensor_model_parallel_rank
-                :shard_size * (tensor_model_parallel_rank + 1)]
+            start_idx = tensor_model_parallel_rank * shard_size
+            end_idx = (tensor_model_parallel_rank + 1) * shard_size
+            loaded_weight = loaded_weight[start_idx:end_idx]
            break
    for p in row_parallel_weight_names:
        if p in param_name:
            shard_size = param.shape[1]
-            loaded_weight = loaded_weight[
-                :,
-                shard_size * tensor_model_parallel_rank
-                :shard_size * (tensor_model_parallel_rank + 1)]
+            start_idx = tensor_model_parallel_rank * shard_size
+            end_idx = (tensor_model_parallel_rank + 1) * shard_size
+            loaded_weight = loaded_weight[:, start_idx:end_idx]
            break
-    assert param.shape == loaded_weight.shape
+    assert param.shape == loaded_weight.shape, (
+        f"{param_name} shape mismatch between model and checkpoint: "
+        f"{param.shape} != {loaded_weight.shape}")
    param.data.copy_(loaded_weight)


--- a/vllm/outputs.py
+++ b/vllm/outputs.py
@ -55,6 +55,7 @@ class RequestOutput:
        outputs: The output sequences of the request.
        finished: Whether the whole request is finished.
    """
+
    def __init__(
        self,
        request_id: str,
@ -75,8 +76,9 @@ class RequestOutput:
        n = seq_group.sampling_params.n
        seqs = seq_group.get_seqs()
        assert n <= len(seqs)
-        sorted_seqs = sorted(
-            seqs, key=lambda seq: seq.get_cumulative_logprob(), reverse=True)
+        sorted_seqs = sorted(seqs,
+                             key=lambda seq: seq.get_cumulative_logprob(),
+                             reverse=True)
        top_n_seqs = sorted_seqs[:n]

        # Create the outputs.
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@ -1,6 +1,8 @@
 """Sampling parameters for text generation."""
 from typing import List, Optional, Union

+_SAMPLING_EPS = 1e-5
+

 class SamplingParams:
    """Sampling parameters for text generation.
@ -50,7 +52,7 @@ class SamplingParams:
        top_p: float = 1.0,
        top_k: int = -1,
        use_beam_search: bool = False,
-        stop: Union[str, List[str]] = [],
+        stop: Union[None, str, List[str]] = None,
        ignore_eos: bool = False,
        max_tokens: int = 16,
        logprobs: Optional[int] = None,
@ -63,15 +65,20 @@ class SamplingParams:
        self.top_p = top_p
        self.top_k = top_k
        self.use_beam_search = use_beam_search
-        self.stop = [stop] if isinstance(stop, str) else list(stop)
+        if stop is None:
+            self.stop = []
+        elif isinstance(stop, str):
+            self.stop = [stop]
+        else:
+            self.stop = list(stop)
        self.ignore_eos = ignore_eos
        self.max_tokens = max_tokens
        self.logprobs = logprobs

        self._verify_args()
        if self.use_beam_search:
-            self._verity_beam_search()
-        elif self.temperature == 0.0:
+            self._verify_beam_search()
+        elif self.temperature < _SAMPLING_EPS:
            # Zero temperature means greedy sampling.
            self._verify_greedy_sampling()

@ -102,13 +109,13 @@ class SamplingParams:
            raise ValueError(
                f"logprobs must be non-negative, got {self.logprobs}.")

-    def _verity_beam_search(self) -> None:
+    def _verify_beam_search(self) -> None:
        if self.best_of == 1:
            raise ValueError("best_of must be greater than 1 when using beam "
                             f"search. Got {self.best_of}.")
-        if self.temperature > 0.0:
+        if self.temperature > _SAMPLING_EPS:
            raise ValueError("temperature must be 0 when using beam search.")
-        if self.top_p < 1.0:
+        if self.top_p < 1.0 - _SAMPLING_EPS:
            raise ValueError("top_p must be 1 when using beam search.")
        if self.top_k != -1:
            raise ValueError("top_k must be -1 when using beam search.")
@ -117,7 +124,7 @@ class SamplingParams:
        if self.best_of > 1:
            raise ValueError("best_of must be 1 when using greedy sampling."
                             f"Got {self.best_of}.")
-        if self.top_p < 1.0:
+        if self.top_p < 1.0 - _SAMPLING_EPS:
            raise ValueError("top_p must be 1 when using greedy sampling.")
        if self.top_k != -1:
            raise ValueError("top_k must be -1 when using greedy sampling.")
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@ -1,3 +1,4 @@
+"""Sequence and its related classes."""
 import copy
 import enum
 from typing import Dict, List, Optional, Union
@ -7,12 +8,14 @@ from vllm.sampling_params import SamplingParams


 class SequenceStatus(enum.Enum):
+    """Status of a sequence."""
    WAITING = enum.auto()
    RUNNING = enum.auto()
    SWAPPED = enum.auto()
    FINISHED_STOPPED = enum.auto()
    FINISHED_LENGTH_CAPPED = enum.auto()
    FINISHED_ABORTED = enum.auto()
+    FINISHED_IGNORED = enum.auto()

    @staticmethod
    def is_finished(status: "SequenceStatus") -> bool:
@ -20,6 +23,7 @@ class SequenceStatus(enum.Enum):
            SequenceStatus.FINISHED_STOPPED,
            SequenceStatus.FINISHED_LENGTH_CAPPED,
            SequenceStatus.FINISHED_ABORTED,
+            SequenceStatus.FINISHED_IGNORED,
        ]

    @staticmethod
@ -30,12 +34,25 @@ class SequenceStatus(enum.Enum):
            finish_reason = "length"
        elif status == SequenceStatus.FINISHED_ABORTED:
            finish_reason = "abort"
+        elif status == SequenceStatus.FINISHED_IGNORED:
+            finish_reason = "length"
        else:
            finish_reason = None
        return finish_reason


 class SequenceData:
+    """Data associated with a sequence.
+
+
+    Args:
+        prompt_token_ids: The token IDs of the prompt.
+
+    Attributes:
+        prompt_token_ids: The token IDs of the prompt.
+        output_token_ids: The token IDs of the output.
+        cumulative_logprob: The cumulative log probability of the output.
+    """

    def __init__(
        self,
@ -71,6 +88,15 @@ class SequenceData:


 class Sequence:
+    """Stores the data, status, and block information of a sequence.
+
+    Args:
+        seq_id: The ID of the sequence.
+        prompt: The prompt of the sequence.
+        prompt_token_ids: The token IDs of the prompt.
+        block_size: The block size of the sequence. Should be the same as the
+            block size used by the block manager and cache engine.
+    """

    def __init__(
        self,
@ -101,7 +127,8 @@ class Sequence:
        self.logical_token_blocks.append(block)

    def _append_tokens_to_blocks(self, token_ids: List[int]) -> None:
-        while token_ids:
+        cursor = 0
+        while cursor < len(token_ids):
            if not self.logical_token_blocks:
                self._append_logical_block()

@ -111,8 +138,9 @@ class Sequence:
                last_block = self.logical_token_blocks[-1]

            num_empty_slots = last_block.get_num_empty_slots()
-            last_block.append_tokens(token_ids[:num_empty_slots])
-            token_ids = token_ids[num_empty_slots:]
+            last_block.append_tokens(token_ids[cursor:cursor +
+                                               num_empty_slots])
+            cursor += num_empty_slots

    def append_token_id(
        self,
@ -145,19 +173,27 @@ class Sequence:
    def is_finished(self) -> bool:
        return SequenceStatus.is_finished(self.status)

-    def fork(self, child_seq: 'Sequence') -> None:
-        child_seq.logical_token_blocks = copy.deepcopy(self.logical_token_blocks)
+    def fork(self, child_seq: "Sequence") -> None:
+        child_seq.logical_token_blocks = copy.deepcopy(
+            self.logical_token_blocks)
        child_seq.output_logprobs = copy.deepcopy(self.output_logprobs)
        child_seq.data = copy.deepcopy(self.data)
-        return None

    def __repr__(self) -> str:
-        return (f'Sequence(seq_id={self.seq_id}, '
-                f'status={self.status.name}, '
-                f'num_blocks={len(self.logical_token_blocks)})')
+        return (f"Sequence(seq_id={self.seq_id}, "
+                f"status={self.status.name}, "
+                f"num_blocks={len(self.logical_token_blocks)})")


 class SequenceGroup:
+    """A group of sequences that are generated from the same prompt.
+
+    Args:
+        request_id: The ID of the request.
+        seqs: The list of sequences.
+        sampling_params: The sampling parameters used to generate the outputs.
+        arrival_time: The arrival time of the request.
+    """

    def __init__(
        self,
@ -187,7 +223,7 @@ class SequenceGroup:
        for seq in self.seqs:
            if seq.seq_id == seq_id:
                return seq
-        raise ValueError(f'Sequence {seq_id} not found.')
+        raise ValueError(f"Sequence {seq_id} not found.")

    def is_finished(self) -> bool:
        return all(seq.is_finished() for seq in self.seqs)
@ -199,14 +235,25 @@ class SequenceGroup:


 class SequenceGroupMetadata:
+    """Metadata for a sequence group. Used to create `InputMetadata`.
+
+
+    Args:
+        request_id: The ID of the request.
+        is_prompt: Whether the request is at prompt stage.
+        seq_data: The sequence data. (Seq id -> sequence data)
+        sampling_params: The sampling parameters used to generate the outputs.
+        block_tables: The block tables. (Seq id -> list of physical block
+            numbers)
+    """

    def __init__(
        self,
        request_id: str,
        is_prompt: bool,
-        seq_data: Dict[int, SequenceData],      # Seq id -> sequence data.
+        seq_data: Dict[int, SequenceData],
        sampling_params: SamplingParams,
-        block_tables: Dict[int, List[int]],     # Seq id -> list of physical block numbers.
+        block_tables: Dict[int, List[int]],
    ) -> None:
        self.request_id = request_id
        self.is_prompt = is_prompt
@ -216,13 +263,23 @@ class SequenceGroupMetadata:


 class SequenceOutputs:
+    """The model output associated with a sequence.
+
+    Args:
+        seq_id: The ID of the sequence.
+        parent_seq_id: The ID of the parent sequence (for forking in beam
+            search).
+        output_token: The output token ID.
+        logprobs: The logprobs of the output token.
+            (Token id -> logP(x_i+1 | x_0, ..., x_i))
+    """

    def __init__(
        self,
        seq_id: int,
        parent_seq_id: int,
        output_token: int,
-        logprobs: Dict[int, float],         # Token id -> logP(x_i+1 | x_0, ..., x_i).
+        logprobs: Dict[int, float],
    ) -> None:
        self.seq_id = seq_id
        self.parent_seq_id = parent_seq_id
@ -230,15 +287,15 @@ class SequenceOutputs:
        self.logprobs = logprobs

    def __repr__(self) -> str:
-        return (f'SequenceOutputs(seq_id={self.seq_id}, '
-                f'parent_seq_id={self.parent_seq_id}, '
-                f'output_token={self.output_token}), '
-                f'logprobs={self.logprobs}')
+        return (f"SequenceOutputs(seq_id={self.seq_id}, "
+                f"parent_seq_id={self.parent_seq_id}, "
+                f"output_token={self.output_token}), "
+                f"logprobs={self.logprobs}")

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, SequenceOutputs):
            return NotImplemented
-        return (self.seq_id == other.seq_id and
-                self.parent_seq_id == other.parent_seq_id and
-                self.output_token == other.output_token and
-                self.logprobs == other.logprobs)
+        return (self.seq_id == other.seq_id
+                and self.parent_seq_id == other.parent_seq_id
+                and self.output_token == other.output_token
+                and self.logprobs == other.logprobs)
--- a/vllm/transformers_utils/init.py
+++ b/vllm/transformers_utils/init.py
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@ -0,0 +1,33 @@
+from transformers import AutoConfig, PretrainedConfig
+
+from vllm.transformers_utils.configs import *  # pylint: disable=wildcard-import
+
+_CONFIG_REGISTRY = {
+    "mpt": MPTConfig,
+    "baichuan": BaiChuanConfig,
+    "aquila": AquilaConfig,
+    "qwen": QWenConfig,
+    "RefinedWeb": RWConfig,  # For tiiuae/falcon-40b(-instruct)
+    "RefinedWebModel": RWConfig,  # For tiiuae/falcon-7b(-instruct)
+}
+
+
+def get_config(model: str, trust_remote_code: bool) -> PretrainedConfig:
+    try:
+        config = AutoConfig.from_pretrained(
+            model, trust_remote_code=trust_remote_code)
+    except ValueError as e:
+        if (not trust_remote_code and
+                "requires you to execute the configuration file" in str(e)):
+            err_msg = (
+                "Failed to load the model config. If the model is a custom "
+                "model not yet available in the HuggingFace transformers "
+                "library, consider setting `trust_remote_code=True` in LLM "
+                "or using the `--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+    if config.model_type in _CONFIG_REGISTRY:
+        config_class = _CONFIG_REGISTRY[config.model_type]
+        config = config_class.from_pretrained(model)
+    return config
--- a/vllm/transformers_utils/configs/init.py
+++ b/vllm/transformers_utils/configs/init.py
@ -0,0 +1,16 @@
+from vllm.transformers_utils.configs.mpt import MPTConfig
+from vllm.transformers_utils.configs.baichuan import BaiChuanConfig
+from vllm.transformers_utils.configs.aquila import AquilaConfig
+from vllm.transformers_utils.configs.qwen import QWenConfig
+# RWConfig is for the original tiiuae/falcon-40b(-instruct) and
+# tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the
+# `FalconConfig` class from the official HuggingFace transformers library.
+from vllm.transformers_utils.configs.falcon import RWConfig
+
+__all__ = [
+    "MPTConfig",
+    "BaiChuanConfig",
+    "AquilaConfig",
+    "QWenConfig",
+    "RWConfig",
+]
--- a/vllm/transformers_utils/configs/aquila.py
+++ b/vllm/transformers_utils/configs/aquila.py
@ -0,0 +1,63 @@
+# coding=utf-8
+# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Aquila model configuration"""
+
+from transformers import PretrainedConfig
+
+
+class AquilaConfig(PretrainedConfig):
+    model_type = "aquila"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=100008,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.006,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
--- a/vllm/transformers_utils/configs/baichuan.py
+++ b/vllm/transformers_utils/configs/baichuan.py
@ -0,0 +1,62 @@
+# coding=utf-8
+# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class BaiChuanConfig(PretrainedConfig):
+    model_type = "baichuan"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=64000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=4096,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
--- a/vllm/transformers_utils/configs/falcon.py
+++ b/vllm/transformers_utils/configs/falcon.py
@ -0,0 +1,87 @@
+# Adapted from
+# https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py
+# Copyright 2023 The vLLM team.
+# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Falcon configuration"""
+from transformers.configuration_utils import PretrainedConfig
+
+
+class RWConfig(PretrainedConfig):
+    model_type = "falcon"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "num_hidden_layers": "n_layer",
+        "num_attention_heads": "n_head",
+        "num_kv_heads": "n_head_kv",
+    }
+
+    def __init__(
+        self,
+        vocab_size=250880,
+        hidden_size=64,
+        n_layer=2,
+        n_head=8,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=1,
+        eos_token_id=2,
+        hidden_dropout=0.0,
+        attention_dropout=0.0,
+        multi_query=True,
+        n_head_kv=None,
+        alibi=False,
+        bias=False,
+        parallel_attn=False,
+        new_decoder_architecture=False,
+        **kwargs,
+    ) -> None:
+        self.vocab_size = vocab_size
+        # Backward compatibility with n_embed kwarg
+        n_embed = kwargs.pop("n_embed", None)
+        self.hidden_size = hidden_size if n_embed is None else n_embed
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.multi_query = multi_query
+        self.n_head_kv = 1 if n_head_kv is None else n_head_kv
+        self.alibi = alibi
+        self.bias = bias
+        self.parallel_attn = parallel_attn
+        self.new_decoder_architecture = new_decoder_architecture
+
+        if self.hidden_size == 8192:
+            # Hack for falcon-40b
+            self.new_decoder_architecture = True
+
+        super().__init__(bos_token_id=bos_token_id,
+                         eos_token_id=eos_token_id,
+                         **kwargs)
+
+    @property
+    def head_dim(self):
+        return self.hidden_size // self.n_head
+
+    @property
+    def rotary(self):
+        return not self.alibi
--- a/vllm/transformers_utils/configs/mpt.py
+++ b/vllm/transformers_utils/configs/mpt.py
@ -0,0 +1,74 @@
+# Adapted from
+# https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py
+from typing import Any, Dict, Optional, Union
+
+from transformers import PretrainedConfig
+
+_ATTN_CONFIG_DEFAULTS = {
+    "attn_type": "multihead_attention",
+    "attn_pdrop": 0.0,
+    "attn_impl": "triton",
+    "qk_ln": False,
+    "clip_qkv": None,
+    "softmax_scale": None,
+    "prefix_lm": False,
+    "attn_uses_sequence_id": False,
+    "alibi": False,
+    "alibi_bias_max": 8,
+}
+
+
+class MPTConfig(PretrainedConfig):
+    model_type = "mpt"
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "n_heads",
+        "num_hidden_layers": "n_layers",
+    }
+
+    def __init__(
+        self,
+        d_model: int = 2048,
+        n_heads: int = 16,
+        n_layers: int = 24,
+        expansion_ratio: int = 4,
+        max_seq_len: int = 2048,
+        vocab_size: int = 50368,
+        resid_pdrop: float = 0.0,
+        emb_pdrop: float = 0.0,
+        learned_pos_emb: bool = True,
+        attn_config: Optional[Dict[str, Any]] = None,
+        init_device: str = "cpu",
+        logit_scale: Optional[Union[float, str]] = None,
+        no_bias: bool = False,
+        verbose: int = 0,
+        embedding_fraction: float = 1.0,
+        norm_type: str = "low_precision_layernorm",
+        use_cache: bool = False,
+        **kwargs,
+    ) -> None:
+        self.d_model = d_model
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.expansion_ratio = expansion_ratio
+        self.max_seq_len = max_seq_len
+        self.vocab_size = vocab_size
+        self.resid_pdrop = resid_pdrop
+        self.emb_pdrop = emb_pdrop
+        self.learned_pos_emb = learned_pos_emb
+        if attn_config is None:
+            self.attn_config = _ATTN_CONFIG_DEFAULTS
+        else:
+            self.attn_config = attn_config
+        self.init_device = init_device
+        self.logit_scale = logit_scale
+        self.no_bias = no_bias
+        self.verbose = verbose
+        self.embedding_fraction = embedding_fraction
+        self.norm_type = norm_type
+        self.use_cache = use_cache
+        if "name" in kwargs:
+            del kwargs["name"]
+        if "loss_fn" in kwargs:
+            del kwargs["loss_fn"]
+        super().__init__(**kwargs)
--- a/vllm/transformers_utils/configs/qwen.py
+++ b/vllm/transformers_utils/configs/qwen.py
@ -0,0 +1,71 @@
+# Copyright (c) Alibaba Cloud.
+# LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
+
+from transformers import PretrainedConfig
+
+
+class QWenConfig(PretrainedConfig):
+    model_type = "qwen"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "hidden_size": "n_embd",
+        "num_attention_heads": "n_head",
+        "max_position_embeddings": "n_positions",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size=151851,
+        n_embd=4096,
+        n_layer=32,
+        n_head=32,
+        n_inner=None,
+        embd_pdrop=0.0,
+        attn_pdrop=0.0,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        scale_attn_weights=True,
+        use_cache=True,
+        eos_token_id=151643,
+        apply_residual_connection_post_layernorm=False,
+        bf16=True,
+        kv_channels=128,
+        rotary_pct=1.0,
+        rotary_emb_base=10000,
+        use_dynamic_ntk=False,
+        use_logn_attn=False,
+        use_flash_attn=True,
+        ffn_hidden_size=22016,
+        no_bias=True,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        self.eos_token_id = eos_token_id
+        super().__init__(eos_token_id=eos_token_id,
+                         tie_word_embeddings=tie_word_embeddings,
+                         **kwargs)
+
+        self.vocab_size = vocab_size
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.scale_attn_weights = scale_attn_weights
+        self.use_cache = use_cache
+        self.apply_residual_connection_post_layernorm = (
+            apply_residual_connection_post_layernorm)
+        self.bf16 = bf16
+        self.kv_channels = kv_channels
+        self.rotary_pct = rotary_pct
+        self.rotary_emb_base = rotary_emb_base
+        self.use_dynamic_ntk = use_dynamic_ntk
+        self.use_logn_attn = use_logn_attn
+        self.use_flash_attn = use_flash_attn
+        self.ffn_hidden_size = ffn_hidden_size
+        self.no_bias = no_bias
+        self.tie_word_embeddings = tie_word_embeddings
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@ -0,0 +1,118 @@
+from typing import List, Tuple, Union
+
+from transformers import (AutoTokenizer, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+# A fast LLaMA tokenizer with the pre-processed `tokenizer.json` file.
+_FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer"
+
+
+def get_tokenizer(
+    tokenizer_name: str,
+    *args,
+    tokenizer_mode: str = "auto",
+    trust_remote_code: bool = False,
+    **kwargs,
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    """Gets a tokenizer for the given model name via Huggingface."""
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError(
+                "Cannot use the fast tokenizer in slow tokenizer mode.")
+        kwargs["use_fast"] = False
+
+    if "llama" in tokenizer_name.lower() and kwargs.get("use_fast", True):
+        logger.info(
+            "For some LLaMA-based models, initializing the fast tokenizer may "
+            "take a long time. To eliminate the initialization time, consider "
+            f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original "
+            "tokenizer.")
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_name,
+            *args,
+            trust_remote_code=trust_remote_code,
+            **kwargs)
+    except TypeError as e:
+        # The LLaMA tokenizer causes a protobuf error in some environments.
+        err_msg = (
+            "Failed to load the tokenizer. If you are using a LLaMA-based "
+            f"model, use '{_FAST_LLAMA_TOKENIZER}' instead of the original "
+            "tokenizer.")
+        raise RuntimeError(err_msg) from e
+    except ValueError as e:
+        # If the error pertains to the tokenizer class not existing or not
+        # currently being imported, suggest using the --trust-remote-code flag.
+        if (not trust_remote_code and
+            ("does not exist or is not currently imported." in str(e)
+             or "requires you to execute the tokenizer file" in str(e))):
+            err_msg = (
+                "Failed to load the tokenizer. If the tokenizer is a custom "
+                "tokenizer not yet available in the HuggingFace transformers "
+                "library, consider setting `trust_remote_code=True` in LLM "
+                "or using the `--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+
+    if not isinstance(tokenizer, PreTrainedTokenizerFast):
+        logger.warning(
+            "Using a slow tokenizer. This might cause a significant "
+            "slowdown. Consider using a fast tokenizer instead.")
+    return tokenizer
+
+
+def detokenize_incrementally(
+    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    prev_output_tokens: List[str],
+    new_token_id: int,
+    skip_special_tokens: bool,
+) -> Tuple[str, str]:
+    """Detokenizes the new token in conjunction with the previous output tokens.
+
+    NOTE: This function does not update prev_output_tokens.
+
+    Returns:
+        new_token: The new token as a string.
+        output_text: The new output text as a string.
+    """
+    if skip_special_tokens and (new_token_id in tokenizer.all_special_ids):
+        return None, prev_output_tokens
+    new_token = tokenizer.convert_ids_to_tokens(
+        new_token_id, skip_special_tokens=skip_special_tokens)
+    output_tokens = prev_output_tokens + [new_token]
+
+    # Convert the tokens to a string.
+    # Optimization: If the tokenizer does not have `added_tokens_encoder`,
+    # then we can directly use `convert_tokens_to_string`.
+    if not getattr(tokenizer, "added_tokens_encoder", {}):
+        output_text = tokenizer.convert_tokens_to_string(output_tokens)
+        return new_token, output_text
+
+    # Adapted from
+    # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
+    # NOTE(woosuk): The following code is slow because it runs a for loop over
+    # the output_tokens. In Python, running a for loop over a list can be slow
+    # even when the loop body is very simple.
+    sub_texts = []
+    current_sub_text = []
+    for token in output_tokens:
+        if skip_special_tokens and token in tokenizer.all_special_tokens:
+            continue
+        if token in tokenizer.added_tokens_encoder:
+            if current_sub_text:
+                sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
+                sub_texts.append(sub_text)
+                current_sub_text = []
+            sub_texts.append(token)
+        else:
+            current_sub_text.append(token)
+    if current_sub_text:
+        sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
+        sub_texts.append(sub_text)
+    output_text = " ".join(sub_texts)
+    return new_token, output_text
--- a/vllm/utils.py
+++ b/vllm/utils.py
@ -1,4 +1,5 @@
 import enum
+from platform import uname
 import uuid

 import psutil
@ -16,9 +17,9 @@ class Counter:
        self.counter = start

    def __next__(self) -> int:
-        id = self.counter
+        i = self.counter
        self.counter += 1
-        return id
+        return i

    def reset(self) -> None:
        self.counter = 0
@ -36,3 +37,8 @@ def get_cpu_memory() -> int:

 def random_uuid() -> str:
    return str(uuid.uuid4().hex)
+
+
+def in_wsl() -> bool:
+    # Reference: https://github.com/microsoft/WSL/issues/4071
+    return "microsoft" in " ".join(uname()).lower()
--- a/vllm/worker/cache_engine.py
+++ b/vllm/worker/cache_engine.py
@ -5,6 +5,10 @@ import torch

 from vllm import cache_ops
 from vllm.config import CacheConfig, ModelConfig, ParallelConfig
+from vllm.logger import init_logger
+from vllm.utils import in_wsl
+
+logger = init_logger(__name__)

 KVCache = Tuple[torch.Tensor, torch.Tensor]

@ -85,16 +89,22 @@ class CacheEngine:
        cpu_cache: List[KVCache] = []
        key_block_shape = self.get_key_block_shape()
        value_block_shape = self.get_value_block_shape()
+        pin_memory = not in_wsl()
+        if not pin_memory:
+            # Pinning memory in WSL is not supported.
+            # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
+            logger.warning("Using 'pin_memory=False' as WSL is detected. "
+                           "This may slow down the performance.")
        for _ in range(self.num_layers):
            key_blocks = torch.empty(
                size=(self.num_cpu_blocks, *key_block_shape),
                dtype=self.dtype,
-                pin_memory=True,
+                pin_memory=pin_memory,
            )
            value_blocks = torch.empty(
                size=(self.num_cpu_blocks, *value_block_shape),
                dtype=self.dtype,
-                pin_memory=True,
+                pin_memory=pin_memory,
            )
            cpu_cache.append((key_blocks, value_blocks))
        return cpu_cache
@ -110,11 +120,10 @@ class CacheEngine:
                src_key_cache, src_value_cache = src[i]
                dst_key_cache, dst_value_cache = dst[i]
                # Copy the key blocks.
-                cache_ops.swap_blocks(
-                    src_key_cache, dst_key_cache, src_to_dst)
+                cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst)
                # Copy the value blocks.
-                cache_ops.swap_blocks(
-                    src_value_cache, dst_value_cache, src_to_dst)
+                cache_ops.swap_blocks(src_value_cache, dst_value_cache,
+                                      src_to_dst)
                event = self.events[i]
                event.record(stream=self.cache_stream)

--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@ -1,13 +1,15 @@
 """A GPU worker class."""
-from typing import Dict, List, Tuple
+import os
+from typing import Dict, List, Tuple, Optional

 import torch
+import torch.distributed

 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig)
 from vllm.model_executor import get_model, InputMetadata, set_random_seed
 from vllm.model_executor.parallel_utils.parallel_state import (
-    initialize_model_parallel, initialize_all_reduce_launcher)
+    initialize_model_parallel)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import SequenceData, SequenceGroupMetadata, SequenceOutputs
 from vllm.worker.cache_engine import CacheEngine
@ -27,8 +29,8 @@ class Worker:
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
-        rank: int,
-        distributed_init_method: str,
+        rank: Optional[int] = None,
+        distributed_init_method: Optional[str] = None,
    ) -> None:
        self.model_config = model_config
        self.parallel_config = parallel_config
@ -36,19 +38,6 @@ class Worker:
        self.rank = rank
        self.distributed_init_method = distributed_init_method

-        # Initialize the distributed environment.
-        _init_distributed_environment(parallel_config, rank,
-                                      distributed_init_method)
-
-        # Initialize the model.
-        set_random_seed(self.model_config.seed)
-        self.model = get_model(model_config)
-        initialize_all_reduce_launcher(
-            self.scheduler_config.max_num_batched_tokens,
-            self.model_config.get_hidden_size(),
-            self.model_config.dtype,
-        )
-
        # Uninitialized cache engine. Will be initialized by
        # self.init_cache_engine().
        self.cache_config = None
@ -57,6 +46,26 @@ class Worker:
        self.cache_events = None
        self.gpu_cache = None

+    def init_model(self):
+        # This env var set by Ray causes exceptions with graph building.
+        os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None)
+        # Env vars will be set by Ray.
+        self.rank = self.rank if self.rank is not None else int(
+            os.getenv("RANK", "-1"))
+        local_rank = int(os.getenv("LOCAL_RANK", "0"))
+        self.device = torch.device(f"cuda:{local_rank}")
+        if self.rank < 0:
+            raise ValueError("Invalid or unspecified rank.")
+        torch.cuda.set_device(self.device)
+
+        # Initialize the distributed environment.
+        _init_distributed_environment(self.parallel_config, self.rank,
+                                      self.distributed_init_method)
+
+        # Initialize the model.
+        set_random_seed(self.model_config.seed)
+        self.model = get_model(self.model_config)
+
    @torch.inference_mode()
    def profile_num_available_blocks(
        self,
@ -73,8 +82,8 @@ class Worker:
        # number of tokens equal to max_num_batched_tokens.

        # Enable top-k sampling to reflect the accurate memory usage.
-        sampling_params = SamplingParams(top_p=0.99,
-                                         top_k=self.model.config.vocab_size - 1)
+        vocab_size = self.model.config.vocab_size
+        sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1)
        max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
        max_num_seqs = self.scheduler_config.max_num_seqs
        seqs = []
@ -91,7 +100,8 @@ class Worker:
            )
            seqs.append(seq)

-        input_tokens, input_positions, input_metadata = self._prepare_inputs(seqs)
+        input_tokens, input_positions, input_metadata = self._prepare_inputs(
+            seqs)

        # Execute the model.
        num_layers = self.model_config.get_num_layers(self.parallel_config)
@ -110,9 +120,12 @@ class Worker:
        total_gpu_memory = get_gpu_memory()
        cache_block_size = CacheEngine.get_cache_block_size(
            block_size, self.model_config, self.parallel_config)
-        num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization
-                              - peak_memory) // cache_block_size)
+        num_gpu_blocks = int(
+            (total_gpu_memory * gpu_memory_utilization - peak_memory) //
+            cache_block_size)
        num_cpu_blocks = int(cpu_swap_space // cache_block_size)
+        num_gpu_blocks = max(num_gpu_blocks, 0)
+        num_cpu_blocks = max(num_cpu_blocks, 0)
        torch.cuda.empty_cache()

        # Reset the seed to ensure that the random state is not affected by
@ -123,8 +136,8 @@ class Worker:
    def init_cache_engine(self, cache_config: CacheConfig) -> None:
        self.cache_config = cache_config
        self.block_size = cache_config.block_size
-        self.cache_engine = CacheEngine(
-            self.cache_config, self.model_config, self.parallel_config)
+        self.cache_engine = CacheEngine(self.cache_config, self.model_config,
+                                        self.parallel_config)
        self.cache_events = self.cache_engine.events
        self.gpu_cache = self.cache_engine.gpu_cache

@ -200,8 +213,8 @@ class Worker:
                generation_block_tables.append(block_table)

                max_context_len = max(max_context_len, context_len)
-                max_num_blocks_per_seq = max(
-                    max_num_blocks_per_seq, len(block_table))
+                max_num_blocks_per_seq = max(max_num_blocks_per_seq,
+                                             len(block_table))
                context_lens.append(context_len)

                block_number = block_table[position // self.block_size]
@ -221,7 +234,8 @@ class Worker:
        context_lens_tensor = torch.cuda.IntTensor(context_lens)
        padded_block_tables = [
            _pad_to_max(block_table, max_num_blocks_per_seq)
-            for block_table in generation_block_tables]
+            for block_table in generation_block_tables
+        ]
        block_tables_tensor = torch.cuda.IntTensor(padded_block_tables)

        seq_data: Dict[int, SequenceData] = {}
@ -289,15 +303,28 @@ class Worker:
 def _init_distributed_environment(
    parallel_config: ParallelConfig,
    rank: int,
-    distributed_init_method: str,
+    distributed_init_method: Optional[str] = None,
 ) -> None:
    """Initialize the distributed environment."""
-    torch.distributed.init_process_group(
-        backend="nccl",
-        world_size=parallel_config.world_size,
-        rank=rank,
-        init_method=distributed_init_method,
-    )
+    if torch.distributed.is_initialized():
+        torch_world_size = torch.distributed.get_world_size()
+        if torch_world_size != parallel_config.world_size:
+            raise RuntimeError(
+                "torch.distributed is already initialized but the torch world "
+                "size does not match parallel_config.world_size "
+                f"({torch_world_size} vs. {parallel_config.world_size}).")
+    elif not distributed_init_method:
+        raise ValueError(
+            "distributed_init_method must be set if torch.distributed "
+            "is not already initialized")
+    else:
+        torch.distributed.init_process_group(
+            backend="nccl",
+            world_size=parallel_config.world_size,
+            rank=rank,
+            init_method=distributed_init_method,
+        )
+
    # A small all_reduce for warmup.
    torch.distributed.all_reduce(torch.zeros(1).cuda())
    initialize_model_parallel(parallel_config.tensor_parallel_size,
Author	SHA1	Message	Date
Woosuk Kwon	791d79de32	Bump up the version to v0.1.4 (#846 )	2023-08-25 12:28:00 +09:00
Woosuk Kwon	94d2f59895	Set replacement=True in torch.multinomial (#858 )	2023-08-25 12:22:01 +09:00
wenjun93	75c0ca9d43	Clean up code (#844 )	2023-08-23 16:44:15 -07:00
Woosuk Kwon	2a4ec90854	Fix for breaking changes in xformers 0.0.21 (#834 )	2023-08-23 17:44:21 +09:00
ldwang	85ebcda94d	Fix typo of Aquila in README.md (#836 )	2023-08-22 20:48:36 -07:00
Woosuk Kwon	d64bf1646c	Implement approximate GELU kernels (#828 )	2023-08-23 07:43:21 +09:00
Woosuk Kwon	a41c20435e	Add compute capability 8.9 to default targets (#829 )	2023-08-23 07:28:38 +09:00
Wen Sun	eedac9dba0	fix: revert code to avoid no attribute problem (#827 )	2023-08-22 11:55:16 -07:00
Zhuohan Li	14f9c72bfd	Update Supported Model List (#825 )	2023-08-22 11:51:44 -07:00
shunxing1234	ad5f2fe34c	Add support for aquila (#663 ) * add aquila Signed-off-by: ftgreat <ftgreat@163.com> * fix some bug Signed-off-by: shunxing1234 <xw747777271@gmail.com> * delete pdb Signed-off-by: shunxing1234 <xw747777271@gmail.com> * fix bugs Signed-off-by: shunxing1234 <xw747777271@gmail.com> * fix bugs Signed-off-by: shunxing1234 <xw747777271@gmail.com> * delete whitespace Signed-off-by: shunxing1234 <xw747777271@gmail.com> * format * fix order --------- Signed-off-by: ftgreat <ftgreat@163.com> Signed-off-by: shunxing1234 <xw747777271@gmail.com> Co-authored-by: ftgreat <ftgreat@163.com>	2023-08-22 00:13:36 -07:00
zhaoyang-star	4f8584756d	Fix mqa is false case in gpt_bigcode (#806 )	2023-08-21 22:22:06 -07:00
Xudong Zhang	65fc1c3127	set default coompute capability according to cuda version (#773 )	2023-08-21 16:05:44 -07:00
Daniel	c393af6cd7	[Feature \| CI] Added a github action to build wheels (#746 )	2023-08-21 16:59:15 +09:00
wangcx18	0c04ce3234	Fix typo in sampling_params.py (#788 )	2023-08-18 10:12:46 +09:00
Xinyu Yang	73b3de79ea	explicitly del state (#784 )	2023-08-17 12:56:04 -07:00
Abraham-Xu	d1744376ae	Align with huggingface Top K sampling (#753 )	2023-08-15 16:44:33 -07:00
Ikko Eltociear Ashimine	805de738f6	Fix typo in tokenizer.py (#750 ) conjuction -> conjunction	2023-08-14 22:26:36 -07:00
Uranus	1b151ed181	Fix baichuan doc style (#748 )	2023-08-13 20:57:31 -07:00
WanMok	e06f504a76	Supports tokens and arrays of tokens as inputs to the OpenAI completion API (#715 )	2023-08-11 12:14:34 -07:00
WRH	462ae5220a	[Fix] unwantted bias in InternLM Model (#740 )	2023-08-11 11:40:37 -07:00
Nicolas Basile	66c54aa9c3	Check the max prompt length for the OpenAI completions API (#472 )	2023-08-08 17:43:49 -07:00
Jia Guoqing	735ecfff61	add internlm model (#528 )	2023-08-08 16:35:06 -07:00
Qing	a57d13cc96	add QWen-7b (#685 ) Co-authored-by: wq.chu <wq.chu@tianrang-inc.com>	2023-08-08 13:50:38 -07:00
Dean Leitersdorf	79af7e96a0	[OPTIMIZATION] Optimizes the single_query_cached_kv_attention kernel (#420 )	2023-08-04 10:57:29 -07:00
Wen Sun	621980bdc0	fix: incorrect bigcode attention heads num (#676 )	2023-08-04 10:35:22 -07:00
Zhuohan Li	aa84c92ef6	Bump up version to 0.1.3 (#657 )	2023-08-02 16:46:53 -07:00
Zhuohan Li	f7389f4763	[Doc] Add Baichuan 13B to supported models (#656 )	2023-08-02 16:45:12 -07:00
Woosuk Kwon	55fe8a81ec	Refactor scheduler (#658 )	2023-08-02 16:42:01 -07:00
YHPeter	e8ddc08ec8	[BUG FIX] upgrade fschat version to 0.2.23 (#650 ) Co-authored-by: hao.yu <hao.yu@cn-c017.server.mila.quebec>	2023-08-02 14:05:59 -07:00
Zhuohan Li	1b0bd0fe8a	Add Falcon support (new) (#592 )	2023-08-02 14:04:39 -07:00
Lily Liu	20044cab7a	Fix log message in scheduler (#652 )	2023-08-02 13:35:10 -07:00
Song	64f23c2900	fix baichuan for different position embedding for 7b and 13b models (#643 )	2023-08-01 22:22:51 -07:00
Qing	d4c7755ca8	fix biachuan-7b tp (#598 ) Co-authored-by: wq.chu <wq.chu@tianrang-inc.com>	2023-08-01 15:41:36 -07:00
Chaofan Lin	aa39e42c5a	fix doc (#622 )	2023-07-31 13:11:57 -07:00
Fang li	953f28cf9a	fix ModuleNotFoundError (#599 ) Co-authored-by: fangli <fangli@tencent.com>	2023-07-29 20:52:41 -07:00
Xudong Zhang	c0d00f5be6	[Fix] fix import error of RayWorker (#604 ) (#605 )	2023-07-27 23:37:40 -07:00
Zhuohan Li	58a072be15	[Fix] Add model sequence length into model config (#575 )	2023-07-25 23:46:30 -07:00
Zhuohan Li	82ad323dee	[Fix] Add chat completion Example and simplify dependencies (#576 )	2023-07-25 23:45:48 -07:00
Zhuohan Li	df5dd3c68e	Add Baichuan-7B to README (#494 )	2023-07-25 15:25:12 -07:00
MoeedDar	2d867b55fa	fixed tensor parallel is not defined (#564 )	2023-07-25 14:16:51 -07:00
Tao Peng	d7a1c6d614	Fix paged attention testing. (#495 ) Signed-off-by: Tao Peng <jiankeng.pt@alibaba-inc.com>	2023-07-24 21:01:56 -07:00
Zhuohan Li	7d5a155e4a	[Fix] Fix GPTBigcoder for distributed execution (#503 )	2023-07-24 18:36:33 -07:00
leegohi04517	1dde34e0f8	GPTJConfig has no attribute rotary. (#532 )	2023-07-24 11:29:30 -07:00
Zhuohan Li	6fc2a38b11	Add support for LLaMA-2 (#505 )	2023-07-20 11:38:27 -07:00
Antoni Baum	c487a221ee	Fix bad assert in initialize_cluster if PG already exists (#526 )	2023-07-19 23:17:12 -07:00
Antoni Baum	9925c17940	Ray placement group support (#397 )	2023-07-19 22:49:31 -07:00
Ricardo Lu	8c4b2592fb	fix: enable trust-remote-code in api server & benchmark. (#509 )	2023-07-19 17:06:15 -07:00
WRH	cf21a9bd5c	support trust_remote_code in benchmark (#518 )	2023-07-19 17:02:40 -07:00
Massimiliano Pronesti	16c3e295a8	fix(ray_utils): ignore re-init error (#465 )	2023-07-19 17:01:19 -07:00
Song	bda41c70dd	hotfix attn alibi wo head mapping (#496 ) Co-authored-by: oliveryuan <oliveryuan@basemind.com>	2023-07-18 11:31:48 -07:00
Lily Liu	453bafb96f	Merge pull request #498 from MoeedDar/main Fixed old name reference for max_seq_len	2023-07-18 09:22:56 -07:00
MoeedDar	328d231c17	Fixed old name reference for max_seq_len	2023-07-18 16:47:59 +01:00
Lily Liu	b4b195b360	fix max seq len (#489 )	2023-07-17 23:20:20 -07:00
codethazine	20b0d88d16	Add support for baichuan (#365 )	2023-07-17 13:50:55 -07:00
Zhuohan Li	2bdea7ac11	[Fix] Fix the condition of max_seq_len (#477 )	2023-07-17 00:33:48 -04:00
Zhanghao Wu	58df2883cb	[Doc] Add doc for running vLLM on the cloud (#426 ) Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>	2023-07-16 13:37:14 -07:00
Zhangir Azerbayev	6d7d95a70a	Offload port selection to OS (#467 )	2023-07-15 23:11:02 -07:00
Zhuohan Li	96853af5a8	Optimize MQA Kernel (#452 )	2023-07-14 20:06:40 -04:00
Wen Sun	dbed69058c	Fix the `KeyError` when loading bloom-based models (#441 )	2023-07-13 21:58:09 -07:00
panda	7b6ae94059	add vocab padding for LLama(Support WizardLM) (#411 )	2023-07-13 23:56:22 -04:00
xcnick	c6dfc3cdbe	Fix handling of special tokens in decoding. (#418 )	2023-07-12 11:14:56 -04:00
Keming	51be365143	fix: freeze pydantic to v1 (#429 )	2023-07-12 11:10:55 -04:00
Andre Slavescu	c894836108	[Model] Add support for GPT-J (#226 ) Co-authored-by: woWoosuk Kwon <woosuk.kwon@berkeley.edu>	2023-07-08 17:55:16 -07:00
Fazlul Shahriar	75beba29b5	Don't try to load training_args.bin (#373 )	2023-07-08 15:26:28 -07:00
Woosuk Kwon	ddfdf470ae	Add trust_remote_code arg to get_config (#405 )	2023-07-08 15:24:17 -07:00
Woosuk Kwon	b6fbb9a565	Sort the outputs before return (#402 )	2023-07-08 14:48:18 -07:00
Lily Liu	2179e4f4c5	avoid python list copy in sequence initialization (#401 )	2023-07-08 12:42:08 -07:00
codethazine	a945fcc2ae	Add trust-remote-code flag to handle remote tokenizers (#364 )	2023-07-07 11:04:58 -07:00
Nicolas Frenay	be54f8e5c4	[Fix] Change /generate response-type to json for non-streaming (#374 )	2023-07-06 18:15:17 -07:00
Ricardo Lu	b396cb4998	fix: only response [DONE] once when streaming response. (#378 )	2023-07-06 18:08:40 -07:00
Woosuk Kwon	1c395b4eaa	Bump up the version (#300 )	2023-07-04 21:41:53 -07:00
akxxsb	3d64cf019e	[Server] use fastchat.model.model_adapter.get_conversation_template method to get model template (#357 )	2023-07-04 21:39:59 -07:00
Zhuohan Li	98fe8cb542	[Server] Add option to specify chat template for chat endpoint (#345 )	2023-07-03 23:01:56 -07:00
Woosuk Kwon	ffa6d2f9f9	[Docs] Fix typo (#346 )	2023-07-03 16:51:47 -07:00
Woosuk Kwon	404422f42e	[Model] Add support for MPT (#334 )	2023-07-03 16:47:53 -07:00
coolcloudcol	7717d0838b	Fix an endless loop issue when engine_step throws a RuntimeError (#339 )	2023-07-03 15:22:28 -07:00
Zhuohan Li	42e0c1df78	[Quality] Add CI for formatting (#343 )	2023-07-03 14:50:56 -07:00
Woosuk Kwon	e41f06702c	Add support for BLOOM (#331 )	2023-07-03 13:12:35 -07:00
Zhuohan Li	d6fa1be3a8	[Quality] Add code formatter and linter (#326 )	2023-07-03 11:31:55 -07:00
Zhuohan Li	0ffded812a	[Fix] Better error message for batched prompts (#342 )	2023-07-03 09:27:31 -07:00
Michele Catalano	0bd2a573a5	Allow send list of str for the Prompt on openai demo endpoint /v1/completions (#323 ) * allow str or List[str] for prompt * Update vllm/entrypoints/openai/api_server.py Co-authored-by: Zhuohan Li <zhuohan123@gmail.com> --------- Co-authored-by: Zhuohan Li <zhuohan123@gmail.com>	2023-07-03 09:17:50 -07:00
Ricardo Lu	49b26e2cec	feat: add ChatCompletion endpoint in OpenAI demo server. (#330 )	2023-07-02 22:54:33 -07:00
Lily Liu	dafd924c1f	Raise error for long prompt (#273 )	2023-06-30 18:48:49 -07:00
Zhuohan Li	598dc4b79a	[Fix] Weight loading for GPTBigCode (#313 )	2023-06-29 22:14:17 -07:00
Zhuohan Li	85de093472	[Fix] Do not pin memory when in WSL (#312 )	2023-06-29 15:00:21 -07:00
Zhanghao Wu	f72297562f	Add news for the vllm+skypilot example (#314 )	2023-06-29 12:32:37 -07:00
Bayang	9d27b09d12	Update README.md (#306 )	2023-06-29 06:52:15 -07:00
Woosuk Kwon	998d9d1509	[Tokenizer] Add tokenizer mode (#298 )	2023-06-28 14:19:22 -07:00
Lily Liu	425040d4c1	remove floats == 0 comparison (#285 )	2023-06-28 14:11:51 -07:00
Woosuk Kwon	4338cc4750	[Tokenizer] Add an option to specify tokenizer (#284 )	2023-06-28 09:46:58 -07:00
Jishnu Ray Chowdhury	bdd6b4c8bc	Add LLM.set_tokenizer (#283 )	2023-06-28 00:28:29 -07:00
Cody Yu	2b7d3aca2e	Update setup.py (#282 ) Co-authored-by: neubig <neubig@gmail.com>	2023-06-27 14:34:23 -07:00
twaka	4026a049d3	expand coverage of gpt2 model loading (#271 )	2023-06-27 06:27:41 -07:00
Zhuohan Li	43710e8d09	[Fix] Fix default port number in benchmark scripts (#265 )	2023-06-26 13:15:35 -07:00
Woosuk Kwon	526df28fb2	[BugFix] Fix a bug in counting running sequences (#266 )	2023-06-26 13:09:02 -07:00
Zhuohan Li	2cf1a333b6	[Doc] Documentation for distributed inference (#261 )	2023-06-26 11:34:23 -07:00
Zhuohan Li	0b7db411b5	[Bug] Fix the OOM condition for CPU cache (#260 )	2023-06-26 11:16:13 -07:00
BasicCoder	471a7a4566	Compatible with Decapoda Research llama hf version (#251 )	2023-06-26 09:23:57 -07:00
Lianmin Zheng	6214dd6ce9	Update README.md (#236 )	2023-06-25 16:58:06 -07:00
metacryptom	0603379863	fix wrong using getattr to get dict value (#232 )	2023-06-24 22:00:24 -07:00
Woosuk Kwon	665c48963b	[Docs] Add GPTBigCode to supported models (#213 )	2023-06-22 15:05:11 -07:00
Michael Feil	298695b766	GPTBigCode (StarCoder, SantaCoder Support) (#209 )	2023-06-23 01:49:27 +08:00