Bump up to v0.2.3 (#1903 )

Fix num_gpus when TP > 1 (#1852 )
chore(examples-docs): upgrade to OpenAI V1 (#1785 )
2025-10-21 15:43:52 +08:00 · 2023-12-03 12:27:47 -08:00 · 2023-12-03 12:24:30 -08:00 · 2023-12-03 01:11:22 -08:00 · 2023-12-02 22:17:33 -08:00 · 2023-12-02 21:18:40 -08:00
103 changed files with 2973 additions and 2419 deletions
--- a/.github/workflows/pylint.yml
+++ b/.github/workflows/pylint.yml
@ -1,4 +1,4 @@
-name: pylint
+name: ruff

 on:
  # Trigger the workflow on push or pull request,
@ -11,7 +11,7 @@ on:
      - main

 jobs:
-  pylint:
+  ruff:
    runs-on: ubuntu-latest
    strategy:
      matrix:
@ -25,7 +25,7 @@ jobs:
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
-        pip install pylint==2.8.2
-    - name: Analysing the code with pylint
+        pip install ruff==0.1.5
+    - name: Analysing the code with ruff
      run: |
-        pylint vllm tests
+        ruff vllm tests
--- a/.pylintrc
+++ b/.pylintrc
@ -1,434 +0,0 @@
-# This Pylint rcfile contains a best-effort configuration to uphold the
-# best-practices and style described in the Google Python style guide:
-#   https://google.github.io/styleguide/pyguide.html
-#
-# Its canonical open-source location is:
-#   https://google.github.io/styleguide/pylintrc
-
-[MASTER]
-
-# Files or directories to be skipped. They should be base names, not paths.
-ignore=docs
-
-# Files or directories matching the regex patterns are skipped. The regex
-# matches against base names, not paths.
-ignore-patterns=
-
-# Pickle collected data for later comparisons.
-persistent=no
-
-# List of plugins (as comma separated values of python modules names) to load,
-# usually to register additional checkers.
-load-plugins=
-
-# Use multiple processes to speed up Pylint.
-jobs=4
-
-# Allow loading of arbitrary C extensions. Extensions are imported into the
-# active Python interpreter and may run arbitrary code.
-unsafe-load-any-extension=no
-
-
-[MESSAGES CONTROL]
-
-# Only show warnings with the listed confidence levels. Leave empty to show
-# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
-confidence=
-
-# Enable the message, report, category or checker with the given id(s). You can
-# either give multiple identifier separated by comma (,) or put this option
-# multiple time (only on the command line, not in the configuration file where
-# it should appear only once). See also the "--disable" option for examples.
-#enable=
-
-# Disable the message, report, category or checker with the given id(s). You
-# can either give multiple identifiers separated by comma (,) or put this
-# option multiple times (only on the command line, not in the configuration
-# file where it should appear only once).You can also use "--disable=all" to
-# disable everything first and then reenable specific checks. For example, if
-# you want to run only the similarities checker, you can use "--disable=all
-# --enable=similarities". If you want to run only the classes checker, but have
-# no Warning level messages displayed, use"--disable=all --enable=classes
-# --disable=W"
-disable=abstract-method,
-        apply-builtin,
-        arguments-differ,
-        attribute-defined-outside-init,
-        backtick,
-        bad-option-value,
-        basestring-builtin,
-        buffer-builtin,
-        c-extension-no-member,
-        consider-using-enumerate,
-        cmp-builtin,
-        cmp-method,
-        coerce-builtin,
-        coerce-method,
-        delslice-method,
-        div-method,
-        duplicate-code,
-        eq-without-hash,
-        execfile-builtin,
-        file-builtin,
-        filter-builtin-not-iterating,
-        fixme,
-        getslice-method,
-        global-statement,
-        hex-method,
-        idiv-method,
-        implicit-str-concat-in-sequence,
-        import-error,
-        import-self,
-        import-star-module-level,
-        inconsistent-return-statements,
-        input-builtin,
-        intern-builtin,
-        invalid-str-codec,
-        locally-disabled,
-        logging-fstring-interpolation,  # added by vLLM
-        logging-not-lazy,  # added by vLLM
-        long-builtin,
-        long-suffix,
-        map-builtin-not-iterating,
-        misplaced-comparison-constant,
-        missing-class-docstring,  # TODO (vLLM): enable
-        missing-function-docstring,
-        missing-module-docstring,  # TODO (vLLM): enable
-        metaclass-assignment,
-        next-method-called,
-        next-method-defined,
-        no-absolute-import,
-        no-else-break,
-        no-else-continue,
-        no-else-raise,
-        no-else-return,
-        no-init,  # added
-        no-member,
-        no-name-in-module,
-        no-self-use,
-        nonzero-method,
-        oct-method,
-        old-division,
-        old-ne-operator,
-        old-octal-literal,
-        old-raise-syntax,
-        parameter-unpacking,
-        print-statement,
-        raising-string,
-        range-builtin-not-iterating,
-        raw_input-builtin,
-        rdiv-method,
-        reduce-builtin,
-        relative-import,
-        reload-builtin,
-        round-builtin,
-        setslice-method,
-        signature-differs,
-        standarderror-builtin,
-        suppressed-message,
-        sys-max-int,
-        too-few-public-methods,
-        too-many-ancestors,
-        too-many-arguments,
-        too-many-boolean-expressions,
-        too-many-branches,
-        too-many-instance-attributes,
-        too-many-locals,
-        too-many-nested-blocks,
-        too-many-public-methods,
-        too-many-return-statements,
-        too-many-statements,
-        trailing-newlines,
-        unichr-builtin,
-        unicode-builtin,
-        unnecessary-pass,
-        unpacking-in-except,
-        unspecified-encoding,
-        useless-else-on-loop,
-        useless-object-inheritance,
-        useless-suppression,
-        using-cmp-argument,
-        wrong-import-order,
-        xrange-builtin,
-        zip-builtin-not-iterating,
-
-
-[REPORTS]
-
-# Set the output format. Available formats are text, parseable, colorized, msvs
-# (visual studio) and html. You can also give a reporter class, eg
-# mypackage.mymodule.MyReporterClass.
-output-format=text
-
-# Tells whether to display a full report or only the messages
-reports=no
-
-# Python expression which should return a note less than 10 (10 is the highest
-# note). You have access to the variables errors warning, statement which
-# respectively contain the number of errors / warnings messages and the total
-# number of statements analyzed. This is used by the global evaluation report
-# (RP0004).
-evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
-
-# Template used to display messages. This is a python new-style format string
-# used to format the message information. See doc for all details
-#msg-template=
-
-
-[BASIC]
-
-# Good variable names which should always be accepted, separated by a comma
-good-names=main,_
-
-# Bad variable names which should always be refused, separated by a comma
-bad-names=
-
-# Colon-delimited sets of names that determine each other's naming style when
-# the name regexes allow several styles.
-name-group=
-
-# Include a hint for the correct naming format with invalid-name
-include-naming-hint=no
-
-# List of decorators that produce properties, such as abc.abstractproperty. Add
-# to this list to register other decorators that produce valid properties.
-property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl
-
-# Regular expression matching correct function names
-function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$
-
-# Regular expression matching correct variable names
-variable-rgx=^[a-z][a-z0-9_]*$
-
-# Regular expression matching correct constant names
-const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
-
-# Regular expression matching correct attribute names
-attr-rgx=^_{0,2}[a-z][a-z0-9_]*$
-
-# Regular expression matching correct argument names
-argument-rgx=^[a-z][a-z0-9_]*$
-
-# Regular expression matching correct class attribute names
-class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$
-
-# Regular expression matching correct inline iteration names
-inlinevar-rgx=^[a-z][a-z0-9_]*$
-
-# Regular expression matching correct class names
-class-rgx=^_?[A-Z][a-zA-Z0-9]*$
-
-# Regular expression matching correct module names
-module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$
-
-# Regular expression matching correct method names
-method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$
-
-# Regular expression which should only match function or class names that do
-# not require a docstring.
-no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$
-
-# Minimum line length for functions/classes that require docstrings, shorter
-# ones are exempt.
-docstring-min-length=10
-
-
-[TYPECHECK]
-
-# List of decorators that produce context managers, such as
-# contextlib.contextmanager. Add to this list to register other decorators that
-# produce valid context managers.
-contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager
-
-# Tells whether missing members accessed in mixin class should be ignored. A
-# mixin class is detected if its name ends with "mixin" (case insensitive).
-ignore-mixin-members=yes
-
-# List of module names for which member attributes should not be checked
-# (useful for modules/projects where namespaces are manipulated during runtime
-# and thus existing member attributes cannot be deduced by static analysis. It
-# supports qualified module names, as well as Unix pattern matching.
-ignored-modules=
-
-# List of class names for which member attributes should not be checked (useful
-# for classes with dynamically set attributes). This supports the use of
-# qualified names.
-ignored-classes=optparse.Values,thread._local,_thread._local
-
-# List of members which are set dynamically and missed by pylint inference
-# system, and so shouldn't trigger E1101 when accessed. Python regular
-# expressions are accepted.
-generated-members=
-
-
-[FORMAT]
-
-# Maximum number of characters on a single line.
-max-line-length=80
-
-# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt
-# lines made too long by directives to pytype.
-
-# Regexp for a line that is allowed to be longer than the limit.
-ignore-long-lines=(?x)(
-  ^\s*(\#\ )?<?https?://\S+>?$|
-  ^\s*(from\s+\S+\s+)?import\s+.+$)
-
-# Allow the body of an if to be on the same line as the test if there is no
-# else.
-single-line-if-stmt=yes
-
-# Maximum number of lines in a module
-max-module-lines=99999
-
-# String used as indentation unit.  The internal Google style guide mandates 2
-# spaces.  Google's externaly-published style guide says 4, consistent with
-# PEP 8.  Here, we use 2 spaces, for conformity with many open-sourced Google
-# projects (like TensorFlow).
-indent-string='    '
-
-# Number of spaces of indent required inside a hanging  or continued line.
-indent-after-paren=4
-
-# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
-expected-line-ending-format=
-
-
-[MISCELLANEOUS]
-
-# List of note tags to take in consideration, separated by a comma.
-notes=TODO
-
-
-[STRING]
-
-# This flag controls whether inconsistent-quotes generates a warning when the
-# character used as a quote delimiter is used inconsistently within a module.
-check-quote-consistency=yes
-
-
-[VARIABLES]
-
-# Tells whether we should check for unused import in __init__ files.
-init-import=no
-
-# A regular expression matching the name of dummy variables (i.e. expectedly
-# not used).
-dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)
-
-# List of additional names supposed to be defined in builtins. Remember that
-# you should avoid to define new builtins when possible.
-additional-builtins=
-
-# List of strings which can identify a callback function by name. A callback
-# name must start or end with one of those strings.
-callbacks=cb_,_cb
-
-# List of qualified module names which can have objects that can redefine
-# builtins.
-redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools
-
-
-[LOGGING]
-
-# Logging modules to check that the string format arguments are in logging
-# function parameter format
-logging-modules=logging,absl.logging,tensorflow.io.logging
-
-
-[SIMILARITIES]
-
-# Minimum lines number of a similarity.
-min-similarity-lines=4
-
-# Ignore comments when computing similarities.
-ignore-comments=yes
-
-# Ignore docstrings when computing similarities.
-ignore-docstrings=yes
-
-# Ignore imports when computing similarities.
-ignore-imports=no
-
-
-[SPELLING]
-
-# Spelling dictionary name. Available dictionaries: none. To make it working
-# install python-enchant package.
-spelling-dict=
-
-# List of comma separated words that should not be checked.
-spelling-ignore-words=
-
-# A path to a file that contains private dictionary; one word per line.
-spelling-private-dict-file=
-
-# Tells whether to store unknown words to indicated private dictionary in
-# --spelling-private-dict-file option instead of raising a message.
-spelling-store-unknown-words=no
-
-
-[IMPORTS]
-
-# Deprecated modules which should not be used, separated by a comma
-deprecated-modules=regsub,
-                   TERMIOS,
-                   Bastion,
-                   rexec,
-                   sets
-
-# Create a graph of every (i.e. internal and external) dependencies in the
-# given file (report RP0402 must not be disabled)
-import-graph=
-
-# Create a graph of external dependencies in the given file (report RP0402 must
-# not be disabled)
-ext-import-graph=
-
-# Create a graph of internal dependencies in the given file (report RP0402 must
-# not be disabled)
-int-import-graph=
-
-# Force import order to recognize a module as part of the standard
-# compatibility libraries.
-known-standard-library=
-
-# Force import order to recognize a module as part of a third party library.
-known-third-party=enchant, absl
-
-# Analyse import fallback blocks. This can be used to support both Python 2 and
-# 3 compatible code, which means that the block might have code that exists
-# only in one or another interpreter, leading to false positives when analysed.
-analyse-fallback-blocks=no
-
-
-[CLASSES]
-
-# List of method names used to declare (i.e. assign) instance attributes.
-defining-attr-methods=__init__,
-                      __new__,
-                      setUp
-
-# List of member names, which should be excluded from the protected access
-# warning.
-exclude-protected=_asdict,
-                  _fields,
-                  _replace,
-                  _source,
-                  _make
-
-# List of valid names for the first argument in a class method.
-valid-classmethod-first-arg=cls,
-                            class_
-
-# List of valid names for the first argument in a metaclass class method.
-valid-metaclass-classmethod-first-arg=mcs
-
-
-[EXCEPTIONS]
-
-# Exceptions that will emit a warning when being caught. Defaults to
-# "Exception"
-overgeneral-exceptions=StandardError,
-                       Exception,
-                       BaseException
--- a/5
+++ b/5
@ -18,6 +18,11 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # image to build pytorch extensions
 FROM dev AS build

+# install build dependencies
+COPY requirements-build.txt requirements-build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-build.txt
+
 # copy input files
 COPY csrc csrc
 COPY setup.py setup.py
--- a/README.md
+++ b/README.md
@ -10,7 +10,7 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>

 <p align="center">
-| <a href="https://vllm.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |

 </p>

@ -47,7 +47,7 @@ vLLM is flexible and easy to use with:
 vLLM seamlessly supports many Hugging Face models, including the following architectures:

 - Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
- Baichuan (`baichuan-inc/Baichuan-7B`, `baichuan-inc/Baichuan-13B-Chat`, etc.)
+- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
 - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
 - ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
 - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@ -12,7 +12,6 @@ from vllm import LLM, SamplingParams
 def main(args: argparse.Namespace):
    print(args)

-    # Process all the requests in a single batch if possible.
    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(
@ -20,8 +19,6 @@ def main(args: argparse.Namespace):
        tokenizer=args.tokenizer,
        quantization=args.quantization,
        tensor_parallel_size=args.tensor_parallel_size,
-        max_num_seqs=args.batch_size,
-        max_num_batched_tokens=args.batch_size * args.input_len,
        trust_remote_code=args.trust_remote_code,
        dtype=args.dtype,
    )
@ -39,22 +36,31 @@ def main(args: argparse.Namespace):

    def run_to_completion(profile: bool = False):
        if profile:
-            torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.perf_counter()
-
-        llm.generate(prompt_token_ids=dummy_prompt_token_ids,
-                     sampling_params=sampling_params,
-                     use_tqdm=False)
-
-        end_time = time.perf_counter()
-        latency = end_time - start_time
-        if profile:
-            torch.cuda.cudart().cudaProfilerStop()
-        return latency
+            with torch.profiler.profile(activities=[
+                    torch.profiler.ProfilerActivity.CPU,
+                    torch.profiler.ProfilerActivity.CUDA,
+            ]) as p:
+                llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+                             sampling_params=sampling_params,
+                             use_tqdm=False)
+            print(p.key_averages())
+        else:
+            start_time = time.perf_counter()
+            llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+                         sampling_params=sampling_params,
+                         use_tqdm=False)
+            end_time = time.perf_counter()
+            latency = end_time - start_time
+            return latency

    print("Warming up...")
    run_to_completion(profile=False)

+    if args.profile:
+        print("Profiling...")
+        run_to_completion(profile=True)
+        return
+
    # Benchmark.
    latencies = []
    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
@ -97,5 +103,9 @@ if __name__ == '__main__':
        'The "auto" option will use FP16 precision '
        'for FP32 and FP16 models, and BF16 precision '
        'for BF16 models.')
+    parser.add_argument(
+        '--profile',
+        action='store_true',
+        help='profile the generation process of a single batch')
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@ -17,9 +17,8 @@ def sample_requests(
    tokenizer: PreTrainedTokenizerBase,
    fixed_output_len: Optional[int],
 ) -> List[Tuple[str, int, int]]:
-    if fixed_output_len is not None:
-        if fixed_output_len < 4:
-            raise ValueError("output_len too small")
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")

    # Load the dataset.
    with open(dataset_path) as f:
@ -70,6 +69,7 @@ def run_vllm(
    use_beam_search: bool,
    trust_remote_code: bool,
    dtype: str,
+    max_model_len: Optional[int] = None,
 ) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(
@ -80,6 +80,7 @@ def run_vllm(
        seed=seed,
        trust_remote_code=trust_remote_code,
        dtype=dtype,
+        max_model_len=max_model_len,
    )

    # Add the requests to the engine.
@ -202,7 +203,8 @@ def main(args: argparse.Namespace):
        elapsed_time = run_vllm(requests, args.model, args.tokenizer,
                                args.quantization, args.tensor_parallel_size,
                                args.seed, args.n, args.use_beam_search,
-                                args.trust_remote_code, args.dtype)
+                                args.trust_remote_code, args.dtype,
+                                args.max_model_len)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@ -262,6 +264,12 @@ if __name__ == "__main__":
    parser.add_argument('--trust-remote-code',
                        action='store_true',
                        help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
    parser.add_argument(
        '--dtype',
        type=str,
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@ -4,7 +4,7 @@ import time

 import torch

-from vllm import attention_ops
+from vllm._C import ops

 NUM_BLOCKS = 1024
 PARTITION_SIZE = 512
@ -98,7 +98,7 @@ def main(

        for _ in range(num_iters):
            if version == "v1":
-                attention_ops.paged_attention_v1(
+                ops.paged_attention_v1(
                    output,
                    query,
                    key_cache,
@ -112,7 +112,7 @@ def main(
                    alibi_slopes,
                )
            elif version == "v2":
-                attention_ops.paged_attention_v2(
+                ops.paged_attention_v2(
                    output,
                    exp_sums,
                    max_logits,
--- a/csrc/activation.cpp
+++ b/csrc/activation.cpp
@ -1,28 +0,0 @@
-#include <torch/extension.h>
-
-void silu_and_mul(
-  torch::Tensor& out,
-  torch::Tensor& input);
-
-void gelu_new(
-  torch::Tensor& out,
-  torch::Tensor& input);
-
-void gelu_fast(
-  torch::Tensor& out,
-  torch::Tensor& input);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "silu_and_mul",
-    &silu_and_mul,
-    "Activation function used in SwiGLU.");
-  m.def(
-    "gelu_new",
-    &gelu_new,
-    "GELU implementation used in GPT-2.");
-  m.def(
-    "gelu_fast",
-    &gelu_fast,
-    "Approximate GELU implementation.");
-}
--- a/csrc/attention.cpp
+++ b/csrc/attention.cpp
@ -1,42 +0,0 @@
-#include <torch/extension.h>
-#include <c10/util/Optional.h>
-
-void paged_attention_v1(
-  torch::Tensor& out,
-  torch::Tensor& query,
-  torch::Tensor& key_cache,
-  torch::Tensor& value_cache,
-  torch::Tensor& head_mapping,
-  float scale,
-  torch::Tensor& block_tables,
-  torch::Tensor& context_lens,
-  int block_size,
-  int max_context_len,
-  const c10::optional<torch::Tensor>& alibi_slopes);
-
-void paged_attention_v2(
-  torch::Tensor& out,
-  torch::Tensor& exp_sums,
-  torch::Tensor& max_logits,
-  torch::Tensor& tmp_out,
-  torch::Tensor& query,
-  torch::Tensor& key_cache,
-  torch::Tensor& value_cache,
-  torch::Tensor& head_mapping,
-  float scale,
-  torch::Tensor& block_tables,
-  torch::Tensor& context_lens,
-  int block_size,
-  int max_context_len,
-  const c10::optional<torch::Tensor>& alibi_slopes);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "paged_attention_v1",
-    &paged_attention_v1,
-    "Compute the attention between an input query and the cached keys/values using PagedAttention.");
-  m.def(
-    "paged_attention_v2",
-    &paged_attention_v2,
-    "PagedAttention V2.");
-}
--- a/csrc/cache.cpp
+++ b/csrc/cache.cpp
@ -26,22 +26,3 @@ void gather_cached_kv(
  torch::Tensor& key_cache,
  torch::Tensor& value_cache,
  torch::Tensor& slot_mapping);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "swap_blocks",
-    &swap_blocks,
-    "Swap in (out) the cache blocks from src to dst");
-  m.def(
-    "copy_blocks",
-    &copy_blocks,
-    "Copy the cache blocks from src to dst");
-  m.def(
-    "reshape_and_cache",
-    &reshape_and_cache,
-    "Reshape the key and value tensors and cache them");
-  m.def(
-    "gather_cached_kv",
-    &gather_cached_kv,
-    "Gather key and value from the cache into contiguous QKV tensors");
-}
--- a/csrc/cuda_utils.cpp
+++ b/csrc/cuda_utils.cpp
@ -1,13 +0,0 @@
-#include <torch/extension.h>
-
-int get_device_attribute(
-    int attribute,
-    int device_id);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "get_device_attribute",
-    &get_device_attribute,
-    "Gets the specified device attribute.");
-}
-
--- a/csrc/cuda_utils.h
+++ b/csrc/cuda_utils.h
@ -0,0 +1,5 @@
+#include <torch/extension.h>
+
+int get_device_attribute(
+    int attribute,
+    int device_id);
--- a/csrc/layernorm.cpp
+++ b/csrc/layernorm.cpp
@ -1,24 +0,0 @@
-#include <torch/extension.h>
-
-void rms_norm(
-  torch::Tensor& out,
-  torch::Tensor& input,
-  torch::Tensor& weight,
-  float epsilon);
-
-void fused_add_rms_norm(
-  torch::Tensor& input,
-  torch::Tensor& residual,
-  torch::Tensor& weight,
-  float epsilon);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "rms_norm",
-    &rms_norm,
-    "Apply Root Mean Square (RMS) Normalization to the input tensor.");
-  m.def(
-    "fused_add_rms_norm",
-    &fused_add_rms_norm,
-    "In-place fused Add and RMS Normalization");
-}
--- a/csrc/ops.h
+++ b/csrc/ops.h
@ -0,0 +1,75 @@
+#include <torch/extension.h>
+
+void paged_attention_v1(
+  torch::Tensor& out,
+  torch::Tensor& query,
+  torch::Tensor& key_cache,
+  torch::Tensor& value_cache,
+  torch::Tensor& head_mapping,
+  float scale,
+  torch::Tensor& block_tables,
+  torch::Tensor& context_lens,
+  int block_size,
+  int max_context_len,
+  const c10::optional<torch::Tensor>& alibi_slopes);
+
+void paged_attention_v2(
+  torch::Tensor& out,
+  torch::Tensor& exp_sums,
+  torch::Tensor& max_logits,
+  torch::Tensor& tmp_out,
+  torch::Tensor& query,
+  torch::Tensor& key_cache,
+  torch::Tensor& value_cache,
+  torch::Tensor& head_mapping,
+  float scale,
+  torch::Tensor& block_tables,
+  torch::Tensor& context_lens,
+  int block_size,
+  int max_context_len,
+  const c10::optional<torch::Tensor>& alibi_slopes);
+
+void rms_norm(
+  torch::Tensor& out,
+  torch::Tensor& input,
+  torch::Tensor& weight,
+  float epsilon);
+
+void fused_add_rms_norm(
+  torch::Tensor& input,
+  torch::Tensor& residual,
+  torch::Tensor& weight,
+  float epsilon);
+
+void rotary_embedding(
+  torch::Tensor& positions,
+  torch::Tensor& query,
+  torch::Tensor& key,
+  int head_size,
+  torch::Tensor& cos_sin_cache,
+  bool is_neox);
+
+void silu_and_mul(
+  torch::Tensor& out,
+  torch::Tensor& input);
+
+void gelu_new(
+  torch::Tensor& out,
+  torch::Tensor& input);
+
+void gelu_fast(
+  torch::Tensor& out,
+  torch::Tensor& input);
+
+torch::Tensor awq_gemm(
+  torch::Tensor _in_feats,
+  torch::Tensor _kernel,
+  torch::Tensor _scaling_factors,
+  torch::Tensor _zeros,
+  int split_k_iters);
+
+void squeezellm_gemm(
+  torch::Tensor vec,
+  torch::Tensor mat,
+  torch::Tensor mul,
+  torch::Tensor lookup_table);
--- a/csrc/pos_encoding.cpp
+++ b/csrc/pos_encoding.cpp
@ -1,16 +0,0 @@
-#include <torch/extension.h>
-
-void rotary_embedding(
-  torch::Tensor& positions,
-  torch::Tensor& query,
-  torch::Tensor& key,
-  int head_size,
-  torch::Tensor& cos_sin_cache,
-  bool is_neox);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "rotary_embedding",
-    &rotary_embedding,
-    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
-}
--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@ -0,0 +1,80 @@
+#include "cache.h"
+#include "cuda_utils.h"
+#include "ops.h"
+#include <torch/extension.h>
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  // vLLM custom ops
+  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
+
+  // Attention ops
+  ops.def(
+    "paged_attention_v1",
+    &paged_attention_v1,
+    "Compute the attention between an input query and the cached keys/values using PagedAttention.");
+  ops.def(
+    "paged_attention_v2",
+    &paged_attention_v2,
+    "PagedAttention V2.");
+
+  // Activation ops
+  ops.def(
+    "silu_and_mul",
+    &silu_and_mul,
+    "Activation function used in SwiGLU.");
+  ops.def(
+    "gelu_new",
+    &gelu_new,
+    "GELU implementation used in GPT-2.");
+  ops.def(
+    "gelu_fast",
+    &gelu_fast,
+    "Approximate GELU implementation.");
+
+  // Layernorm
+  ops.def(
+    "rms_norm",
+    &rms_norm,
+    "Apply Root Mean Square (RMS) Normalization to the input tensor.");
+
+  ops.def(
+    "fused_add_rms_norm",
+    &fused_add_rms_norm,
+    "In-place fused Add and RMS Normalization");
+
+  // Rotary embedding
+  ops.def(
+    "rotary_embedding",
+    &rotary_embedding,
+    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
+
+  // Quantization ops
+  ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
+  ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
+
+  // Cache ops
+  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
+  cache_ops.def(
+    "swap_blocks",
+    &swap_blocks,
+    "Swap in (out) the cache blocks from src to dst");
+  cache_ops.def(
+    "copy_blocks",
+    &copy_blocks,
+    "Copy the cache blocks from src to dst");
+  cache_ops.def(
+    "reshape_and_cache",
+    &reshape_and_cache,
+    "Reshape the key and value tensors and cache them");
+  cache_ops.def(
+    "gather_cached_kv",
+    &gather_cached_kv,
+    "Gather key and value from the cache into contiguous QKV tensors");
+
+  // Cuda utils
+  pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils");
+  cuda_utils.def(
+    "get_device_attribute",
+    &get_device_attribute,
+    "Gets the specified device attribute.");
+}
--- a/csrc/quantization.cpp
+++ b/csrc/quantization.cpp
@ -1,19 +0,0 @@
-#include <torch/extension.h>
-
-torch::Tensor awq_gemm(
-  torch::Tensor _in_feats,
-  torch::Tensor _kernel,
-  torch::Tensor _scaling_factors,
-  torch::Tensor _zeros,
-  int split_k_iters);
-
-void squeezellm_gemm(
-  torch::Tensor vec,
-  torch::Tensor mat,
-  torch::Tensor mul,
-  torch::Tensor lookup_table);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
-  m.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
-}
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation.rst
@ -3,14 +3,14 @@
 Installation
 ============

-vLLM is a Python library that also contains pre-compiled C++ and CUDA (11.8) binaries.
+vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.

 Requirements
 ------------

 * OS: Linux
 * Python: 3.8 -- 3.11
-* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, etc.)
+* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)

 Install with pip
 ----------------
@ -23,9 +23,24 @@ You can install vLLM using pip:
    $ conda create -n myenv python=3.8 -y
    $ conda activate myenv

-    $ # Install vLLM.
+    $ # Install vLLM with CUDA 12.1.
    $ pip install vllm

+.. note::
+
+    As of now, vLLM's binaries are compiled on CUDA 12.1 by default.
+    However, you can install vLLM with CUDA 11.8 by running:
+
+    .. code-block:: console
+
+        $ # Install vLLM with CUDA 11.8.
+        $ # Replace `cp310` with your Python version (e.g., `cp38`, `cp39`, `cp311`).
+        $ pip install https://github.com/vllm-project/vllm/releases/download/v0.2.2/vllm-0.2.2+cu118-cp310-cp310-manylinux1_x86_64.whl
+
+        $ # Re-install PyTorch with CUDA 11.8.
+        $ pip uninstall torch -y
+        $ pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118
+

 .. _build_from_source:

@ -45,6 +60,5 @@ You can also build and install vLLM from source:

    .. code-block:: console

-        $ # Pull the Docker image with CUDA 11.8.
        $ # Use `--ipc=host` to make sure the shared memory is large enough.
-        $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:22.12-py3
+        $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
--- a/docs/source/getting_started/quickstart.rst
+++ b/docs/source/getting_started/quickstart.rst
@ -107,6 +107,7 @@ OpenAI-Compatible Server
 ------------------------

 vLLM can be deployed as a server that mimics the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API.
+By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_, `create chat completion <https://platform.openai.com/docs/api-reference/chat/completions/create>`_, and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.

 Start the server:

@ -122,7 +123,13 @@ Use model from www.modelscope.cn
    $ VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.openai.api_server \
    $     --model="qwen/Qwen-7B-Chat" --revision="v1.1.8" --trust-remote-code

-By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models <https://platform.openai.com/docs/api-reference/models/list>`_ and `create completion <https://platform.openai.com/docs/api-reference/completions/create>`_ endpoints. We are actively adding support for more endpoints.
+By default, the server uses a predefined chat template stored in the tokenizer. You can override this template by using the ``--chat-template`` argument:
+
+.. code-block:: console
+
+   $ python -m vllm.entrypoints.openai.api_server \
+   $     --model facebook/opt-125m \
+   $     --chat-template ./examples/template_chatml.json

 This server can be queried in the same format as OpenAI API. For example, list the models:

@ -130,6 +137,9 @@ This server can be queried in the same format as OpenAI API. For example, list t

    $ curl http://localhost:8000/v1/models

+Using OpenAI Completions API with vLLM
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 Query the model with input prompts:

 .. code-block:: console
@ -147,12 +157,65 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep

 .. code-block:: python

-    import openai
+    from openai import OpenAI
+
    # Modify OpenAI's API key and API base to use vLLM's API server.
-    openai.api_key = "EMPTY"
-    openai.api_base = "http://localhost:8000/v1"
-    completion = openai.Completion.create(model="facebook/opt-125m",
+    openai_api_key = "EMPTY"
+    openai_api_base = "http://localhost:8000/v1"
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+    completion = client.completions.create(model="facebook/opt-125m",
                                          prompt="San Francisco is a")
    print("Completion result:", completion)

 For a more detailed client example, refer to `examples/openai_completion_client.py <https://github.com/vllm-project/vllm/blob/main/examples/openai_completion_client.py>`_.
+
+Using OpenAI Chat API with vLLM
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The vLLM server is designed to support the OpenAI Chat API, allowing you to engage in dynamic conversations with the model. The chat interface is a more interactive way to communicate with the model, allowing back-and-forth exchanges that can be stored in the chat history. This is useful for tasks that require context or more detailed explanations.
+
+Querying the model using OpenAI Chat API:
+
+You can use the `create chat completion <https://platform.openai.com/docs/api-reference/chat/completions/create>`_ endpoint to communicate with the model in a chat-like interface:
+
+.. code-block:: console
+
+    $ curl http://localhost:8000/v1/chat/completions \
+    $     -H "Content-Type: application/json" \
+    $     -d '{
+    $         "model": "facebook/opt-125m",
+    $         "messages": [
+    $             {"role": "system", "content": "You are a helpful assistant."},
+    $             {"role": "user", "content": "Who won the world series in 2020?"}
+    $         ]
+    $     }'
+
+Python Client Example:
+
+Using the `openai` python package, you can also communicate with the model in a chat-like manner:
+
+.. code-block:: python
+
+    from openai import OpenAI
+    # Set OpenAI's API key and API base to use vLLM's API server.
+    openai_api_key = "EMPTY"
+    openai_api_base = "http://localhost:8000/v1"
+
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+
+    chat_response = client.chat.completions.create(
+        model="facebook/opt-125m",
+        messages=[
+            {"role": "system", "content": "You are a helpful assistant."},
+            {"role": "user", "content": "Tell me a joke."},
+        ]
+    )
+    print("Chat response:", chat_response)
+
+For more in-depth examples and advanced features of the chat API, you can refer to the official OpenAI documentation.
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@ -66,6 +66,8 @@ Documentation
   serving/run_on_sky
   serving/deploying_with_triton
   serving/deploying_with_docker
+   serving/serving_with_langchain
+   serving/metrics

 .. toctree::
   :maxdepth: 1
@ -73,6 +75,7 @@ Documentation

   models/supported_models
   models/adding_model
+   models/engine_args

 .. toctree::
   :maxdepth: 1
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@ -18,7 +18,7 @@ This document provides a high-level guide on integrating a `HuggingFace Transfor
 0. Fork the vLLM repository
 --------------------------------

-Start by forking our `GitHub <https://github.com/vllm-project/vllm/>`_ repository and then :ref:`build it from source <build_from_source>`.
+Start by forking our `GitHub`_ repository and then :ref:`build it from source <build_from_source>`.
 This gives you the ability to modify the codebase and test your model.


--- a/docs/source/models/engine_args.rst
+++ b/docs/source/models/engine_args.rst
@ -0,0 +1,114 @@
+.. _engine_args:
+
+Engine Arguments
+================
+
+Below, you can find an explanation of every engine argument for vLLM:
+
+.. option:: --model <model_name_or_path>
+
+    Name or path of the huggingface model to use.
+
+.. option:: --tokenizer <tokenizer_name_or_path>
+
+    Name or path of the huggingface tokenizer to use.
+
+.. option:: --revision <revision>
+
+    The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
+
+.. option:: --tokenizer-revision <revision>
+
+    The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version.
+
+.. option:: --tokenizer-mode {auto,slow}
+
+    The tokenizer mode.
+    
+    * "auto" will use the fast tokenizer if available.
+    * "slow" will always use the slow tokenizer.
+
+.. option:: --trust-remote-code
+
+    Trust remote code from huggingface.
+
+.. option:: --download-dir <directory>
+
+    Directory to download and load the weights, default to the default cache dir of huggingface.
+
+.. option:: --load-format {auto,pt,safetensors,npcache,dummy}
+
+    The format of the model weights to load.
+
+    * "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available.
+    * "pt" will load the weights in the pytorch bin format.
+    * "safetensors" will load the weights in the safetensors format.
+    * "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading.
+    * "dummy" will initialize the weights with random values, mainly for profiling.
+
+.. option:: --dtype {auto,half,float16,bfloat16,float,float32}
+
+    Data type for model weights and activations.
+
+    * "auto" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models.
+    * "half" for FP16. Recommended for AWQ quantization.
+    * "float16" is the same as "half".
+    * "bfloat16" for a balance between precision and range.
+    * "float" is shorthand for FP32 precision.
+    * "float32" for FP32 precision.
+
+.. option:: --max-model-len <length>
+
+    Model context length. If unspecified, will be automatically derived from the model config.
+
+.. option:: --worker-use-ray
+
+    Use Ray for distributed serving, will be automatically set when using more than 1 GPU.
+
+.. option:: --pipeline-parallel-size (-pp) <size>
+
+    Number of pipeline stages.
+
+.. option:: --tensor-parallel-size (-tp) <size>
+
+    Number of tensor parallel replicas.
+
+.. option:: --max-parallel-loading-workers <workers>
+
+    Load model sequentially in multiple batches, to avoid RAM OOM when using tensor parallel and large models.
+
+.. option:: --block-size {8,16,32}
+
+    Token block size for contiguous chunks of tokens.
+
+.. option:: --seed <seed>
+
+    Random seed for operations.
+
+.. option:: --swap-space <size>
+
+    CPU swap space size (GiB) per GPU.
+
+.. option:: --gpu-memory-utilization <percentage>
+
+    The percentage of GPU memory to be used for the model executor.
+
+.. option:: --max-num-batched-tokens <tokens>
+
+    Maximum number of batched tokens per iteration.
+
+.. option:: --max-num-seqs <sequences>
+
+    Maximum number of sequences per iteration.
+
+.. option:: --max-paddings <paddings>
+
+    Maximum number of paddings in a batch.
+
+.. option:: --disable-log-stats
+
+    Disable logging statistics.
+
+.. option:: --quantization (-q) {awq,squeezellm,None}
+
+    Method used to quantize the weights.
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@ -19,7 +19,7 @@ Alongside each architecture, we include some popular models that use it.
    - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc.
  * - :code:`BaiChuanForCausalLM`
    - Baichuan
-    - :code:`baichuan-inc/Baichuan-7B`, :code:`baichuan-inc/Baichuan-13B-Chat`, etc.
+    - :code:`baichuan-inc/Baichuan2-13B-Chat`, :code:`baichuan-inc/Baichuan-7B`, etc.
  * - :code:`ChatGLMModel`
    - ChatGLM
    - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc.
--- a/docs/source/quantization/auto_awq.rst
+++ b/docs/source/quantization/auto_awq.rst
@ -3,6 +3,12 @@
 AutoAWQ
 ==================

+.. warning::
+
+   Please note that AWQ support in vLLM is under-optimized at the moment. We would recommend using the unquantized version of the model for better
+   accuracy and higher throughput. Currently, you can use AWQ as a way to reduce memory footprint. As of now, it is more suitable for low latency
+   inference with small number of concurrent requests. vLLM's AWQ implementation have lower throughput than unquantized version.
+
 To create a new 4-bit quantized model, you can leverage `AutoAWQ <https://github.com/casper-hansen/AutoAWQ>`_. 
 Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%.
 The main benefits are lower latency and memory usage.
--- a/docs/source/serving/deploying_with_docker.rst
+++ b/docs/source/serving/deploying_with_docker.rst
@ -3,11 +3,33 @@
 Deploying with Docker
 ============================

+vLLM offers official docker image for deployment.
+The image can be used to run OpenAI compatible server.
+The image is available on Docker Hub as `vllm/vllm-openai <https://hub.docker.com/r/vllm/vllm-openai/tags>`_.
+
+.. code-block:: console
+
+    $ docker run --runtime nvidia --gpus all \
+        -v ~/.cache/huggingface:/root/.cache/huggingface \
+        --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
+        -p 8000:8000 \
+        --ipc=host \
+        vllm/vllm-openai:latest \
+        --model mistralai/Mistral-7B-v0.1
+
+
+.. note::
+
+        You can either use the ``ipc=host`` flag or ``--shm-size`` flag to allow the
+        container to access the host's shared memory. vLLM uses PyTorch, which uses shared
+        memory to share data between processes under the hood, particularly for tensor parallel inference.
+
+
 You can build and run vLLM from source via the provided dockerfile. To build vLLM:

 .. code-block:: console

-    $ DOCKER_BUILDKIT=1 docker build . --target vllm --tag vllm --build-arg max_jobs=8
+    $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --build-arg max_jobs=8

 To run vLLM:

@ -17,5 +39,5 @@ To run vLLM:
        -v ~/.cache/huggingface:/root/.cache/huggingface \
        -p 8000:8000 \
        --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
-        vllm <args...>
+        vllm/vllm-openai <args...>

--- a/docs/source/serving/metrics.rst
+++ b/docs/source/serving/metrics.rst
@ -0,0 +1,13 @@
+Production Metrics
+==================
+
+vLLM exposes a number of metrics that can be used to monitor the health of the
+system. These metrics are exposed via the `/metrics` endpoint on the vLLM
+OpenAI compatible API server.
+
+The following metrics are exposed:
+
+.. literalinclude:: ../../../vllm/engine/metrics.py
+    :language: python
+    :start-after: begin-metrics-definitions
+    :end-before: end-metrics-definitions
--- a/docs/source/serving/serving_with_langchain.rst
+++ b/docs/source/serving/serving_with_langchain.rst
@ -0,0 +1,31 @@
+.. _run_on_langchain:
+
+Serving with Langchain
+============================
+
+vLLM is also available via `Langchain <https://github.com/langchain-ai/langchain>`_ .
+
+To install langchain, run
+
+.. code-block:: console
+
+    $ pip install langchain -q
+
+To run inference on a single or multiple GPUs, use ``VLLM`` class from ``langchain``.
+
+.. code-block:: python
+
+    from langchain.llms import VLLM
+
+    llm = VLLM(model="mosaicml/mpt-7b",
+               trust_remote_code=True,  # mandatory for hf models
+               max_new_tokens=128,
+               top_k=10,
+               top_p=0.95,
+               temperature=0.8,
+               # tensor_parallel_size=... # for distributed inference
+    )
+
+    print(llm("What is the capital of France ?"))
+
+Please refer to this `Tutorial <https://github.com/langchain-ai/langchain/blob/master/docs/extras/integrations/llms/vllm.ipynb>`_ for more details.
--- a/examples/openai_chatcompletion_client.py
+++ b/examples/openai_chatcompletion_client.py
@ -1,18 +1,19 @@
-import openai
+from openai import OpenAI

 # Modify OpenAI's API key and API base to use vLLM's API server.
-openai.api_key = "EMPTY"
-openai.api_base = "http://localhost:8000/v1"
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"

-# List models API
-models = openai.Model.list()
-print("Models:", models)
+client = OpenAI(
+    # defaults to os.environ.get("OPENAI_API_KEY")
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)

-model = models["data"][0]["id"]
+models = client.models.list()
+model = models.data[0].id

-# Chat completion API
-chat_completion = openai.ChatCompletion.create(
-    model=model,
+chat_completion = client.chat.completions.create(
    messages=[{
        "role": "system",
        "content": "You are a helpful assistant."
@ -27,7 +28,10 @@ chat_completion = openai.ChatCompletion.create(
    }, {
        "role": "user",
        "content": "Where was it played?"
-    }])
+    }],
+    model=model,
+)
+

 print("Chat completion results:")
 print(chat_completion)
--- a/examples/openai_completion_client.py
+++ b/examples/openai_completion_client.py
@ -1,24 +1,28 @@
-import openai
+from openai import OpenAI

 # Modify OpenAI's API key and API base to use vLLM's API server.
-openai.api_key = "EMPTY"
-openai.api_base = "http://localhost:8000/v1"
+openai_api_key = "EMPTY"
+openai_api_base = "http://localhost:8000/v1"

-# List models API
-models = openai.Model.list()
-print("Models:", models)
+client = OpenAI(
+    # defaults to os.environ.get("OPENAI_API_KEY")
+    api_key=openai_api_key,
+    base_url=openai_api_base,
+)

-model = models["data"][0]["id"]
+models = client.models.list()
+model = models.data[0].id

 # Completion API
 stream = False
-completion = openai.Completion.create(
+completion = client.completions.create(
    model=model,
    prompt="A robot may not injure a human being",
    echo=False,
    n=2,
    stream=stream,
-    logprobs=3)
+    logprobs=3
+)

 print("Completion results:")
 if stream:
--- a/examples/template_alpaca.jinja
+++ b/examples/template_alpaca.jinja
@ -0,0 +1,29 @@
+{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
+
+{% for message in messages %}
+{% if message['role'] == 'user' %}
+### Instruction:
+{{ message['content']|trim -}}
+{% if not loop.last %}
+
+
+{% endif %}
+{% elif message['role'] == 'assistant' %}
+### Response:
+{{ message['content']|trim -}}
+{% if not loop.last %}
+
+
+{% endif %}
+{% elif message['role'] == 'user_context' %}
+### Input:
+{{ message['content']|trim -}}
+{% if not loop.last %}
+
+
+{% endif %}
+{% endif %}
+{% endfor %}
+{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
+### Response:
+{% endif %}
--- a/examples/template_chatml.jinja
+++ b/examples/template_chatml.jinja
@ -0,0 +1,2 @@
+{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\n'}}{% endif %}{% endfor %}
+{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\n' }}{% endif %}
--- a/examples/template_inkbot.jinja
+++ b/examples/template_inkbot.jinja
@ -0,0 +1,30 @@
+<#meta#>
+- Date: {{ (messages|selectattr('role', 'equalto', 'meta-current_date')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-current_date')|list) else '' }}
+- Task: {{ (messages|selectattr('role', 'equalto', 'meta-task_name')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'meta-task_name')|list) else '' }}
+<#system#>
+{{ (messages|selectattr('role', 'equalto', 'system')|list|last).content|trim if (messages|selectattr('role', 'equalto', 'system')|list) else '' }}
+<#chat#>
+{% for message in messages %}
+{% if message['role'] == 'user' %}
+<#user#>
+{{ message['content']|trim -}}
+{% if not loop.last %}
+
+{% endif %}
+{% elif message['role'] == 'assistant' %}
+<#bot#>
+{{ message['content']|trim -}}
+{% if not loop.last %}
+
+{% endif %}
+{% elif message['role'] == 'user_context' %}
+<#user_context#>
+{{ message['content']|trim -}}
+{% if not loop.last %}
+
+{% endif %}
+{% endif %}
+{% endfor %}
+{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}
+<#bot#>
+{% endif %}
--- a/format.sh
+++ b/format.sh
@ -7,7 +7,7 @@
 #    # Format files that differ from origin/main.
 #    bash format.sh

-#    # Commit changed files with message 'Run yapf and pylint'
+#    # Commit changed files with message 'Run yapf and ruff'
 #
 #
 # YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase.
@ -22,7 +22,7 @@ ROOT="$(git rev-parse --show-toplevel)"
 builtin cd "$ROOT" || exit 1

 YAPF_VERSION=$(yapf --version | awk '{print $2}')
-PYLINT_VERSION=$(pylint --version | head -n 1 | awk '{print $2}')
+RUFF_VERSION=$(ruff --version | awk '{print $2}')
 MYPY_VERSION=$(mypy --version | awk '{print $2}')

 # # params: tool name, tool version, required version
@ -34,7 +34,7 @@ tool_version_check() {
 }

 tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)"
-tool_version_check "pylint" $PYLINT_VERSION "$(grep "pylint==" requirements-dev.txt | cut -d'=' -f3)"
+tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)"
 tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)"

 YAPF_FLAGS=(
@ -95,14 +95,14 @@ echo 'vLLM yapf: Done'

 # Lint specified files
 lint() {
-    pylint "$@"
+    ruff "$@"
 }

 # Lint files that differ from main branch. Ignores dirs that are not slated
 # for autolint yet.
 lint_changed() {
    # The `if` guard ensures that the list of filenames is not empty, which
-    # could cause pylint to receive 0 positional arguments, making it hang
+    # could cause ruff to receive 0 positional arguments, making it hang
    # waiting for STDIN.
    #
    # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that
@ -111,13 +111,13 @@ lint_changed() {

    if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then
        git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \
-             pylint
+             ruff
    fi

 }

-# Run Pylint
-echo 'vLLM Pylint:'
+# Run Ruff
+echo 'vLLM Ruff:'
 ## This flag lints individual files. --files *must* be the first command line
 ## arg to use this option.
 if [[ "$1" == '--files' ]]; then
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,9 +1,34 @@
 [build-system]
+# Should be mirrored in requirements-build.txt
 requires = [
    "ninja",
    "packaging",
-    "setuptools",
+    "setuptools >= 49.4.0",
    "torch >= 2.1.0",
    "wheel",
 ]
 build-backend = "setuptools.build_meta"
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    # "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    # "I",
+]
+ignore = [
+    # star imports
+    "F405", "F403",
+    # lambda expression assignment
+    "E731",
+    # line too long, handled by black formatting
+    "E501",
+]
--- a/requirements-build.txt
+++ b/requirements-build.txt
@ -0,0 +1,6 @@
+# Should be mirrored in pyproject.toml
+ninja
+packaging
+setuptools>=49.4.0
+torch>=2.1.0
+wheel
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@ -1,6 +1,6 @@
 # formatting
 yapf==0.32.0
-pylint==2.8.2
+ruff==0.1.5

 # type checking
 mypy==0.991
--- a/requirements.txt
+++ b/requirements.txt
@ -12,3 +12,4 @@ xformers >= 0.0.22.post7  # Required for CUDA 12.1.
 fastapi
 uvicorn[standard]
 pydantic == 1.10.13  # Required for OpenAI server.
+aioprometheus[starlette]
--- a/setup.py
+++ b/setup.py
@ -75,7 +75,8 @@ def get_torch_arch_list() -> Set[str]:
            f"Unsupported CUDA architectures ({invalid_arch_list}) are "
            "excluded from the `TORCH_CUDA_ARCH_LIST` env variable "
            f"({env_arch_list}). Supported CUDA architectures are: "
-            f"{valid_archs}.")
+            f"{valid_archs}.",
+            stacklevel=2)
    return arch_list


@ -106,10 +107,10 @@ if not compute_capabilities:
 # Validate the NVCC CUDA version.
 if nvcc_cuda_version < Version("11.0"):
    raise RuntimeError("CUDA 11.0 or higher is required to build the package.")
-if nvcc_cuda_version < Version("11.1"):
-    if any(cc.startswith("8.6") for cc in compute_capabilities):
-        raise RuntimeError(
-            "CUDA 11.1 or higher is required for compute capability 8.6.")
+if (nvcc_cuda_version < Version("11.1")
+        and any(cc.startswith("8.6") for cc in compute_capabilities)):
+    raise RuntimeError(
+        "CUDA 11.1 or higher is required for compute capability 8.6.")
 if nvcc_cuda_version < Version("11.8"):
    if any(cc.startswith("8.9") for cc in compute_capabilities):
        # CUDA 11.8 is required to generate the code targeting compute capability 8.9.
@ -119,7 +120,8 @@ if nvcc_cuda_version < Version("11.8"):
        # instead of 8.9.
        warnings.warn(
            "CUDA 11.8 or higher is required for compute capability 8.9. "
-            "Targeting compute capability 8.0 instead.")
+            "Targeting compute capability 8.0 instead.",
+            stacklevel=2)
        compute_capabilities = set(cc for cc in compute_capabilities
                                   if not cc.startswith("8.9"))
        compute_capabilities.add("8.0+PTX")
@ -140,87 +142,25 @@ if nvcc_cuda_version >= Version("11.2"):
    NVCC_FLAGS += ["--threads", str(num_threads)]

 ext_modules = []
-
-# Cache operations.
-cache_extension = CUDAExtension(
-    name="vllm.cache_ops",
-    sources=["csrc/cache.cpp", "csrc/cache_kernels.cu"],
-    extra_compile_args={
-        "cxx": CXX_FLAGS,
-        "nvcc": NVCC_FLAGS,
-    },
-)
-ext_modules.append(cache_extension)
-
-# Attention kernels.
-attention_extension = CUDAExtension(
-    name="vllm.attention_ops",
-    sources=["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"],
-    extra_compile_args={
-        "cxx": CXX_FLAGS,
-        "nvcc": NVCC_FLAGS,
-    },
-)
-ext_modules.append(attention_extension)
-
-# Positional encoding kernels.
-positional_encoding_extension = CUDAExtension(
-    name="vllm.pos_encoding_ops",
-    sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"],
-    extra_compile_args={
-        "cxx": CXX_FLAGS,
-        "nvcc": NVCC_FLAGS,
-    },
-)
-ext_modules.append(positional_encoding_extension)
-
-# Layer normalization kernels.
-layernorm_extension = CUDAExtension(
-    name="vllm.layernorm_ops",
-    sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"],
-    extra_compile_args={
-        "cxx": CXX_FLAGS,
-        "nvcc": NVCC_FLAGS,
-    },
-)
-ext_modules.append(layernorm_extension)
-
-# Activation kernels.
-activation_extension = CUDAExtension(
-    name="vllm.activation_ops",
-    sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"],
-    extra_compile_args={
-        "cxx": CXX_FLAGS,
-        "nvcc": NVCC_FLAGS,
-    },
-)
-ext_modules.append(activation_extension)
-
-# Quantization kernels.
-quantization_extension = CUDAExtension(
-    name="vllm.quantization_ops",
+vllm_extension = CUDAExtension(
+    name="vllm._C",
    sources=[
-        "csrc/quantization.cpp",
+        "csrc/cache_kernels.cu",
+        "csrc/attention/attention_kernels.cu",
+        "csrc/pos_encoding_kernels.cu",
+        "csrc/activation_kernels.cu",
+        "csrc/layernorm_kernels.cu",
        "csrc/quantization/awq/gemm_kernels.cu",
        "csrc/quantization/squeezellm/quant_cuda_kernel.cu",
+        "csrc/cuda_utils_kernels.cu",
+        "csrc/pybind.cpp",
    ],
    extra_compile_args={
        "cxx": CXX_FLAGS,
        "nvcc": NVCC_FLAGS,
    },
 )
-ext_modules.append(quantization_extension)
-
-# Misc. CUDA utils.
-cuda_utils_extension = CUDAExtension(
-    name="vllm.cuda_utils",
-    sources=["csrc/cuda_utils.cpp", "csrc/cuda_utils_kernels.cu"],
-    extra_compile_args={
-        "cxx": CXX_FLAGS,
-        "nvcc": NVCC_FLAGS,
-    },
-)
-ext_modules.append(cuda_utils_extension)
+ext_modules.append(vllm_extension)


 def get_path(*filepath) -> str:
--- a/tests/async_engine/api_server_async_engine.py
+++ b/tests/async_engine/api_server_async_engine.py
@ -14,7 +14,6 @@ app = vllm.entrypoints.api_server.app

 class AsyncLLMEngineWithStats(AsyncLLMEngine):

-    # pylint: disable=redefined-outer-name
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._num_aborts = 0
--- a/tests/async_engine/test_api_server.py
+++ b/tests/async_engine/test_api_server.py
@ -24,7 +24,6 @@ def _query_server(prompt: str) -> dict:
 def api_server():
    script_path = Path(__file__).parent.joinpath(
        "api_server_async_engine.py").absolute()
-    # pylint: disable=consider-using-with
    uvicorn_process = subprocess.Popen([
        sys.executable, "-u",
        str(script_path), "--model", "facebook/opt-125m"
@ -33,7 +32,6 @@ def api_server():
    uvicorn_process.terminate()


-# pylint: disable=redefined-outer-name, unused-argument
 def test_api_server(api_server):
    """
    Run the API server and test it.
@ -49,11 +47,10 @@ def test_api_server(api_server):
        prompts = ["Hello world"] * 1
        result = None
        while not result:
-            # pylint: disable=bare-except
            try:
-                for result in pool.map(_query_server, prompts):
+                for _ in pool.map(_query_server, prompts):
                    break
-            except:
+            except Exception:
                time.sleep(1)

        # Actual tests start here
--- a/tests/async_engine/test_openai_server.py
+++ b/tests/async_engine/test_openai_server.py
@ -0,0 +1,119 @@
+from argparse import Namespace
+from dataclasses import dataclass
+
+import pytest
+from fastapi.testclient import TestClient
+
+from vllm.entrypoints.openai.api_server import *
+
+# Define models, templates, and their corresponding expected outputs
+MODEL_TEMPLATE_GENERATON_OUTPUT = [
+    ("facebook/opt-125m", None, True,
+     "Hello</s>Hi there!</s>What is the capital of</s>"),
+    ("facebook/opt-125m", None, False,
+     "Hello</s>Hi there!</s>What is the capital of</s>"),
+    ("facebook/opt-125m", "../../examples/template_chatml.jinja", True,
+     """<|im_start|>user
+Hello<|im_end|>
+<|im_start|>assistant
+Hi there!<|im_end|>
+<|im_start|>user
+What is the capital of<|im_end|>
+<|im_start|>assistant
+"""),
+    ("facebook/opt-125m", "../../examples/template_chatml.jinja", False,
+     """<|im_start|>user
+Hello<|im_end|>
+<|im_start|>assistant
+Hi there!<|im_end|>
+<|im_start|>user
+What is the capital of""")
+]
+
+TEST_MESSAGES = [
+    {
+        'role': 'user',
+        'content': 'Hello'
+    },
+    {
+        'role': 'assistant',
+        'content': 'Hi there!'
+    },
+    {
+        'role': 'user',
+        'content': 'What is the capital of'
+    },
+]
+client = TestClient(app)
+
+
+@dataclass
+class MockTokenizer:
+    chat_template = None
+
+
+def test_load_chat_template():
+    # Testing chatml template
+    template = "../../examples/template_chatml.jinja"
+    mock_args = Namespace(chat_template=template)
+    tokenizer = MockTokenizer()
+
+    # Call the function with the mocked args
+    load_chat_template(mock_args, tokenizer)
+
+    template_content = tokenizer.chat_template
+
+    # Test assertions
+    assert template_content is not None
+    # Hard coded value for template_chatml.jinja
+    assert template_content == """{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content']}}{% if (loop.last and add_generation_prompt) or not loop.last %}{{ '<|im_end|>' + '\\n'}}{% endif %}{% endfor %}
+{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{ '<|im_start|>assistant\\n' }}{% endif %}"""
+
+
+def test_no_load_chat_template():
+    # Testing chatml template
+    template = "../../examples/does_not_exist"
+    mock_args = Namespace(chat_template=template)
+    tokenizer = MockTokenizer()
+
+    # Call the function with the mocked args
+    load_chat_template(mock_args, tokenizer=tokenizer)
+    template_content = tokenizer.chat_template
+
+    # Test assertions
+    assert template_content is not None
+    # Hard coded value for template_chatml.jinja
+    assert template_content == """../../examples/does_not_exist"""
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model,template,add_generation_prompt,expected_output",
+    MODEL_TEMPLATE_GENERATON_OUTPUT)
+async def test_get_gen_prompt(model, template, add_generation_prompt,
+                              expected_output):
+    # Initialize the tokenizer
+    tokenizer = get_tokenizer(tokenizer_name=model)
+
+    mock_args = Namespace(chat_template=template)
+    load_chat_template(mock_args, tokenizer)
+
+    # Create a mock request object using keyword arguments
+    mock_request = ChatCompletionRequest(
+        model=model,
+        messages=TEST_MESSAGES,
+        add_generation_prompt=add_generation_prompt)
+
+    # Call the function and get the result
+    result = tokenizer.apply_chat_template(
+        conversation=mock_request.messages,
+        tokenize=False,
+        add_generation_prompt=mock_request.add_generation_prompt)
+
+    # Test assertion
+    assert result == expected_output, f"The generated prompt does not match the expected output for model {model} and template {template}"
+
+
+def test_health_endpoint():
+    response = client.get("/health")
+    assert response.status_code == 200
--- a/tests/conftest.py
+++ b/tests/conftest.py
@ -8,7 +8,6 @@ from vllm import LLM, SamplingParams
 from vllm.transformers_utils.tokenizer import get_tokenizer

 _TEST_PROMPTS = [
-    # pylint: disable=line-too-long
    "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
    "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
    "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
--- a/tests/engine/test_detokenize.py
+++ b/tests/engine/test_detokenize.py
@ -5,10 +5,9 @@ from transformers import AutoTokenizer
 from vllm.transformers_utils.tokenizer import detokenize_incrementally

 TRUTH = [
-    # pylint: disable=line-too-long
-    "Hello here, this is a simple test",
-    "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",
-    "我很感谢你的热情"
+    "Hello here, this is a simple test",  # noqa: E501
+    "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. It is designed to be used in production environments, where inference and serving",  # noqa: E501
+    "我很感谢你的热情"  # noqa: E501
 ]
 TOKENIZERS = [
    "facebook/opt-125m",
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@ -1,9 +1,7 @@
 import pytest
 import torch
-import torch.nn.functional as F
-from transformers.activations import get_activation

-from vllm import activation_ops
+from vllm.model_executor.layers.activation import FastGELU, NewGELU, SiluAndMul

 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [7, 83, 2048]  # Arbitrary values for testing
@ -11,11 +9,6 @@ D = [512, 4096, 5120, 13824]  # Arbitrary values for testing
 SEEDS = [0]


-def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor:
-    x1, x2 = x.chunk(chunks=2, dim=1)
-    return F.silu(x1) * x2
-
-
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("d", D)
@pytest.mark.parametrize("dtype", DTYPES)
@ -30,9 +23,9 @@ def test_silu_and_mul(
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    x = torch.randn(num_tokens, 2 * d, dtype=dtype, device="cuda")
-    out = torch.empty(num_tokens, d, dtype=dtype, device="cuda")
-    activation_ops.silu_and_mul(out, x)
-    ref_out = ref_silu_and_mul(x)
+    layer = SiluAndMul()
+    out = layer(x)
+    ref_out = layer._forward(x)
    assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)


@ -50,9 +43,9 @@ def test_gelu_new(
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    x = torch.randn(num_tokens, d, dtype=dtype, device="cuda")
-    out = torch.empty(num_tokens, d, dtype=dtype, device="cuda")
-    activation_ops.gelu_new(out, x)
-    ref_out = get_activation("gelu_new")(x)
+    layer = NewGELU()
+    out = layer(x)
+    ref_out = layer._forward(x)
    assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)


@ -69,7 +62,7 @@ def test_gelu_fast(
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    x = torch.randn(num_tokens, d, dtype=dtype, device="cuda")
-    out = torch.empty(num_tokens, d, dtype=dtype, device="cuda")
-    activation_ops.gelu_fast(out, x)
-    ref_out = get_activation("gelu_fast")(x)
+    layer = FastGELU()
+    out = layer(x)
+    ref_out = layer._forward(x)
    assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5)
--- a/tests/kernels/test_attention.py
+++ b/tests/kernels/test_attention.py
@ -6,7 +6,7 @@ import torch
 from xformers import ops as xops
 from xformers.ops.fmha.attn_bias import BlockDiagonalCausalMask

-from vllm import attention_ops
+from vllm._C import ops
 from vllm.utils import get_max_shared_memory_bytes

 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
@ -165,7 +165,7 @@ def test_paged_attention(
    # Call the paged attention kernel.
    output = torch.empty_like(query)
    if version == "v1":
-        attention_ops.paged_attention_v1(
+        ops.paged_attention_v1(
            output,
            query,
            key_cache,
@ -194,7 +194,7 @@ def test_paged_attention(
            device=output.device,
        )
        max_logits = torch.empty_like(exp_sums)
-        attention_ops.paged_attention_v2(
+        ops.paged_attention_v2(
            output,
            exp_sums,
            max_logits,
@ -211,7 +211,7 @@ def test_paged_attention(
            alibi_slopes,
        )
    else:
-        assert False, f"Unknown version: {version}"
+        raise AssertionError(f"Unknown version: {version}")

    # Run the reference implementation.
    ref_output = torch.empty_like(query)
--- a/tests/kernels/test_cache.py
+++ b/tests/kernels/test_cache.py
@ -3,7 +3,7 @@ import random
 import pytest
 import torch

-from vllm import cache_ops
+from vllm._C import cache_ops

 DTYPES = [torch.half, torch.bfloat16, torch.float]
 NUM_TOKENS = [83]  # Arbitrary values for testing
--- a/tests/kernels/test_layernorm.py
+++ b/tests/kernels/test_layernorm.py
@ -1,58 +1,47 @@
 import pytest
 import torch
-import torch.nn as nn

-from vllm import layernorm_ops
+from vllm.model_executor.layers.layernorm import RMSNorm

 DTYPES = [torch.half, torch.bfloat16, torch.float]
-HIDDEN_SIZES = [67, 768, 2048, 5120, 8192]  # Arbitrary values for testing
 NUM_TOKENS = [7, 83, 4096]  # Arbitrary values for testing
+HIDDEN_SIZES = [768, 5120, 8192]  # Arbitrary values for testing
+ADD_RESIDUAL = [False, True]
 SEEDS = [0]


-class RefRMSNorm(nn.Module):
-
-    def __init__(self, hidden_size, eps=1e-6):
-        super().__init__()
-        weight = torch.empty(hidden_size)
-        weight.normal_(mean=1.0, std=0.1)
-        self.weight = nn.Parameter(weight)
-        self.variance_epsilon = eps
-
-    def forward(self, hidden_states):
-        input_dtype = hidden_states.dtype
-        hidden_states = hidden_states.to(torch.float32)
-        variance = hidden_states.pow(2).mean(-1, keepdim=True)
-        hidden_states = hidden_states * torch.rsqrt(variance +
-                                                    self.variance_epsilon)
-        return self.weight * hidden_states.to(input_dtype)
-
-
@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("add_residual", ADD_RESIDUAL)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@torch.inference_mode()
 def test_rms_norm(
    num_tokens: int,
    hidden_size: int,
+    add_residual: bool,
    dtype: torch.dtype,
    seed: int,
 ) -> None:
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

-    scale = float(hidden_size**-0.5)
-    x = torch.empty(num_tokens, hidden_size, dtype=dtype, device="cuda")
-    x.uniform_(-scale, scale)
-    ref = RefRMSNorm(hidden_size).to(dtype).cuda()
+    layer = RMSNorm(hidden_size).to(dtype).cuda()
+    layer.weight.data.normal_(mean=1.0, std=0.1)
+    scale = 1 / (2 * hidden_size)
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype, device="cuda")
+    x *= scale
+    residual = torch.randn_like(x) * scale if add_residual else None

-    out = torch.empty_like(x)
-    layernorm_ops.rms_norm(
-        out,
-        x,
-        ref.weight.data,
-        ref.variance_epsilon,
-    )
-    ref_out = ref(x)
-    assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-5)
+    # NOTE(woosuk): The reference implementation should be executed first
+    # because the custom kernel is in-place.
+    ref_out = layer._forward(x, residual)
+    out = layer(x, residual)
+    # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
+    # numerical errors than other operators because they involve reductions.
+    # Therefore, we use a larger tolerance.
+    if add_residual:
+        assert torch.allclose(out[0], ref_out[0], atol=1e-2, rtol=1e-2)
+        assert torch.allclose(out[1], ref_out[1], atol=1e-2, rtol=1e-2)
+    else:
+        assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-2)
--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
@ -1,105 +1,23 @@
-from typing import Optional, Tuple
+from typing import Optional

 import pytest
 import torch
-import torch.nn as nn
-import torch.nn.functional as F

-from vllm import pos_encoding_ops
+from vllm.model_executor.layers.rotary_embedding import get_rope

 IS_NEOX_STYLE = [True, False]
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 HEAD_SIZES = [64, 80, 96, 112, 128, 256]
 ROTARY_DIMS = [None, 32]  # None means rotary dim == head size
-NUM_HEADS = [7, 12, 40, 52]  # Arbitrary values for testing
-NUM_TOKENS = [11, 83, 2048]  # Arbitrary values for testing
+NUM_HEADS = [7, 17]  # Arbitrary values for testing
+BATCH_SIZES = [1, 5]  # Arbitrary values for testing
+SEQ_LENS = [11, 8192]  # Arbitrary values for testing
 SEEDS = [0]


-def rotate_neox(x: torch.Tensor) -> torch.Tensor:
-    x1 = x[..., :x.shape[-1] // 2]
-    x2 = x[..., x.shape[-1] // 2:]
-    return torch.cat((-x2, x1), dim=-1)
-
-
-def rotate_gptj(x: torch.Tensor) -> torch.Tensor:
-    x1 = x[..., ::2]
-    x2 = x[..., 1::2]
-    x = torch.stack((-x2, x1), dim=-1)
-    return x.flatten(-2)
-
-
-def apply_rope(
-    q: torch.Tensor,
-    k: torch.Tensor,
-    cos: torch.Tensor,
-    sin: torch.Tensor,
-    is_neox_style: bool,
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    rotate_fn = rotate_neox if is_neox_style else rotate_gptj
-    q_embed = (q * cos) + (rotate_fn(q) * sin)
-    k_embed = (k * cos) + (rotate_fn(k) * sin)
-    return q_embed, k_embed
-
-
-class RefRotaryEmbedding(nn.Module):
-    """Reference implementation of rotary embedding."""
-
-    def __init__(
-        self,
-        dim: int,
-        is_neox_style: bool,
-        max_position_embeddings: int = 8192,
-        base: int = 10000,
-    ) -> None:
-        super().__init__()
-        self.rotary_dim = dim
-        self.is_neox_style = is_neox_style
-        self.max_position_embeddings = max_position_embeddings
-
-        # Create cos and sin embeddings.
-        inv_freq = 1.0 / (base**(torch.arange(0, dim, 2) / dim))
-        t = torch.arange(max_position_embeddings).float()
-        freqs = torch.einsum("i,j->ij", t, inv_freq.float())
-        if is_neox_style:
-            emb = torch.cat((freqs, freqs), dim=-1)
-        else:
-            emb = torch.repeat_interleave(freqs, 2, -1)
-        cos = emb.cos().to(dtype=inv_freq.dtype)
-        sin = emb.sin().to(dtype=inv_freq.dtype)
-        self.register_buffer("cos_cached", cos, persistent=False)
-        self.register_buffer("sin_cached", sin, persistent=False)
-
-    def forward(
-        self,
-        positions: torch.Tensor,  # [num_tokens]
-        query: torch.Tensor,  # [num_tokens, num_heads, head_size]
-        key: torch.Tensor,  # [num_tokens, num_heads, head_size]
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        query_rot = query[..., :self.rotary_dim]
-        query_pass = query[..., self.rotary_dim:]
-        key_rot = key[..., :self.rotary_dim]
-        key_pass = key[..., self.rotary_dim:]
-
-        query_rot = query_rot.transpose(0, 1)
-        key_rot = key_rot.transpose(0, 1)
-        cos = F.embedding(positions, self.cos_cached)
-        sin = F.embedding(positions, self.sin_cached)
-
-        query_rot, key_rot = apply_rope(query_rot, key_rot, cos, sin,
-                                        self.is_neox_style)
-        query_rot = query_rot.transpose(0, 1).contiguous()
-        key_rot = key_rot.transpose(0, 1).contiguous()
-
-        query = torch.cat((query_rot, query_pass), dim=-1)
-        key = torch.cat((key_rot, key_pass), dim=-1)
-
-        # Output query/key shape: [num_tokens, num_tokens, head_size]
-        return query, key
-
-
@pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE)
-@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("seq_len", SEQ_LENS)
@pytest.mark.parametrize("num_heads", NUM_HEADS)
@pytest.mark.parametrize("head_size", HEAD_SIZES)
@pytest.mark.parametrize("rotary_dim", ROTARY_DIMS)
@ -108,7 +26,8 @@ class RefRotaryEmbedding(nn.Module):
@torch.inference_mode()
 def test_rotary_embedding(
    is_neox_style: bool,
-    num_tokens: int,
+    batch_size: int,
+    seq_len: int,
    num_heads: int,
    head_size: int,
    rotary_dim: Optional[int],
@ -122,53 +41,25 @@ def test_rotary_embedding(
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

-    positions = torch.randint(0, max_position, (num_tokens, ), device="cuda")
-    query = torch.randn(num_tokens,
+    if rotary_dim is None:
+        rotary_dim = head_size
+    rope = get_rope(head_size, rotary_dim, max_position, base, is_neox_style)
+    rope = rope.to(dtype).cuda()
+
+    positions = torch.randint(0,
+                              max_position, (batch_size, seq_len),
+                              device="cuda")
+    query = torch.randn(batch_size,
+                        seq_len,
                        num_heads * head_size,
                        dtype=dtype,
                        device="cuda")
-    key = torch.randn(num_tokens,
-                      num_heads * head_size,
-                      dtype=dtype,
-                      device="cuda")
-
-    # Create the rotary embedding.
-    inv_freq = 1.0 / (base**(
-        torch.arange(0, rotary_dim, 2, dtype=torch.float) / rotary_dim))
-    t = torch.arange(max_position).float()
-    freqs = torch.einsum("i,j -> ij", t, inv_freq)
-    cos = freqs.cos()
-    sin = freqs.sin()
-    cos_sin_cache = torch.cat((cos, sin), dim=-1)
-    cos_sin_cache = cos_sin_cache.to(dtype=dtype, device="cuda")
-
-    # Run the kernel. The kernel is in-place, so we need to clone the inputs.
-    out_query = query.clone()
-    out_key = key.clone()
-    pos_encoding_ops.rotary_embedding(
-        positions,
-        out_query,
-        out_key,
-        head_size,
-        cos_sin_cache,
-        is_neox_style,
-    )
-
-    # Run the reference implementation.
-    ref_rotary_embedding = RefRotaryEmbedding(
-        dim=rotary_dim,
-        is_neox_style=is_neox_style,
-        max_position_embeddings=max_position,
-        base=base,
-    ).to(dtype=dtype, device="cuda")
-    ref_query, ref_key = ref_rotary_embedding(
-        positions,
-        query.view(num_tokens, num_heads, head_size),
-        key.view(num_tokens, num_heads, head_size),
-    )
-    ref_query = ref_query.view(num_tokens, num_heads * head_size)
-    ref_key = ref_key.view(num_tokens, num_heads * head_size)
+    key = torch.randn_like(query)

+    # NOTE(woosuk): The reference implementation should be executed first
+    # because the custom kernel is in-place.
+    ref_query, ref_key = rope._forward(positions, query, key)
+    out_query, out_key = rope.forward(positions, query, key)
    # Compare the results.
    assert torch.allclose(out_query, ref_query, atol=1e-5, rtol=1e-5)
    assert torch.allclose(out_key, ref_key, atol=1e-5, rtol=1e-5)
--- a/tests/samplers/test_sampler.py
+++ b/tests/samplers/test_sampler.py
@ -1,4 +1,3 @@
-# pylint: disable=protected-access
 import random
 from typing import Tuple
 from unittest.mock import patch
@ -9,7 +8,7 @@ import torch
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.utils import set_random_seed
 from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
-from vllm.worker.worker import Worker
+from vllm.worker.model_runner import ModelRunner


 class MockLogitsSampler(Sampler):
@ -20,15 +19,15 @@ class MockLogitsSampler(Sampler):

    def forward(self, *args, **kwargs):
        with patch("vllm.model_executor.layers.sampler._prune_hidden_states",
-                   lambda x, y: x):
-            with patch("vllm.model_executor.layers.sampler._get_logits",
+                   lambda x, y: x), patch(
+                       "vllm.model_executor.layers.sampler._get_logits",
                       lambda *args, **kwargs: self.fake_logits):
-                return super().forward(*args, **kwargs)
+            return super().forward(*args, **kwargs)


 def _prepare_test(
    batch_size: int
-) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, Worker]:
+) -> Tuple[torch.Tensor, torch.Tensor, MockLogitsSampler, ModelRunner]:
    vocab_size = 32000
    input_tensor = torch.rand((batch_size, 1024),
                              device="cuda",
@ -38,9 +37,8 @@ def _prepare_test(
                             device=input_tensor.device,
                             dtype=input_tensor.dtype)
    sampler = MockLogitsSampler(32000, fake_logits)
-    worker = Worker(None, None, None)
-    worker.block_size = 16
-    return input_tensor, fake_logits, sampler, worker
+    model_runner = ModelRunner(None, None, None)
+    return input_tensor, fake_logits, sampler, model_runner


 RANDOM_SEEDS = list(range(128))
@ -50,9 +48,11 @@ RANDOM_SEEDS = list(range(128))
 def test_sampler_all_greedy(seed: int):
    set_random_seed(seed)
    batch_size = random.randint(1, 256)
-    input_tensor, fake_logits, sampler, worker = _prepare_test(batch_size)
+    input_tensor, fake_logits, sampler, model_runner = _prepare_test(
+        batch_size)

    seq_group_metadata_list = []
+    prompt_lens = []
    for i in range(batch_size):
        seq_group_metadata_list.append(
            SequenceGroupMetadata(
@ -62,11 +62,13 @@ def test_sampler_all_greedy(seed: int):
                sampling_params=SamplingParams(temperature=0, ),
                block_tables={0: [1]},
            ))
+        prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())

-    _, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list)
+    sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
+                                                     prompt_lens)
    sampler_output = sampler(embedding=None,
                             hidden_states=input_tensor,
-                             input_metadata=input_metadata)
+                             sampling_metadata=sampling_metadata)
    expected = torch.argmax(fake_logits, dim=-1)
    for i, sequence_output in enumerate(sampler_output):
        for nth_output in sequence_output.samples:
@ -77,12 +79,14 @@ def test_sampler_all_greedy(seed: int):
 def test_sampler_all_random(seed: int):
    set_random_seed(seed)
    batch_size = random.randint(1, 256)
-    input_tensor, fake_logits, sampler, worker = _prepare_test(batch_size)
+    input_tensor, fake_logits, sampler, model_runner = _prepare_test(
+        batch_size)

    for i in range(batch_size):
        fake_logits[i, i] = 1e2

    seq_group_metadata_list = []
+    prompt_lens = []
    for i in range(batch_size):
        seq_group_metadata_list.append(
            SequenceGroupMetadata(
@ -95,11 +99,13 @@ def test_sampler_all_random(seed: int):
                ),
                block_tables={0: [1]},
            ))
+        prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())

-    _, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list)
+    sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
+                                                     prompt_lens)
    sampler_output = sampler(embedding=None,
                             hidden_states=input_tensor,
-                             input_metadata=input_metadata)
+                             sampling_metadata=sampling_metadata)
    for i, sequence_output in enumerate(sampler_output):
        for nth_output in sequence_output.samples:
            assert nth_output.output_token == i
@ -109,9 +115,10 @@ def test_sampler_all_random(seed: int):
 def test_sampler_all_beam(seed: int):
    set_random_seed(seed)
    batch_size = random.randint(1, 256)
-    input_tensor, _, sampler, worker = _prepare_test(batch_size)
+    input_tensor, _, sampler, model_runner = _prepare_test(batch_size)

    seq_group_metadata_list = []
+    prompt_lens = []
    for i in range(batch_size):
        seq_group_metadata_list.append(
            SequenceGroupMetadata(
@ -125,11 +132,13 @@ def test_sampler_all_beam(seed: int):
                ),
                block_tables={0: [1]},
            ))
+        prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())

-    _, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list)
+    sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
+                                                     prompt_lens)
    sampler(embedding=None,
            hidden_states=input_tensor,
-            input_metadata=input_metadata)
+            sampling_metadata=sampling_metadata)
    # no assertion here as I am not sure how to determine whether
    # the outputs are expected - in other words, this just tests
    # whether there are no exceptions in the sampler
@ -140,10 +149,12 @@ def test_sampler_all_beam(seed: int):
 def test_sampler_mixed(seed: int):
    set_random_seed(seed)
    batch_size = random.randint(1, 256)
-    input_tensor, fake_logits, sampler, worker = _prepare_test(batch_size)
+    input_tensor, fake_logits, sampler, model_runner = _prepare_test(
+        batch_size)

    seq_group_metadata_list = []
    expected_tokens = []
+    prompt_lens = []
    for i in range(batch_size):
        n = 1
        sampling_type = random.randint(0, 2)
@ -173,11 +184,13 @@ def test_sampler_mixed(seed: int):
                sampling_params=sampling_params,
                block_tables={0: [1]},
            ))
+        prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())

-    _, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list)
+    sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
+                                                     prompt_lens)
    sampler_output = sampler(embedding=None,
                             hidden_states=input_tensor,
-                             input_metadata=input_metadata)
+                             sampling_metadata=sampling_metadata)
    for i, sequence_output in enumerate(sampler_output):
        if seq_group_metadata_list[i].sampling_params.use_beam_search:
            continue
@ -189,7 +202,7 @@ def test_sampler_mixed(seed: int):
 def test_sampler_logits_processors(seed: int):
    set_random_seed(seed)
    batch_size = random.randint(1, 256)
-    input_tensor, _, sampler, worker = _prepare_test(batch_size)
+    input_tensor, _, sampler, model_runner = _prepare_test(batch_size)

    # This sample logits processor gives infinite score to the i-th token,
    # where i is the length of the input sequence.
@ -199,6 +212,7 @@ def test_sampler_logits_processors(seed: int):
        return logits

    seq_group_metadata_list = []
+    prompt_lens = []
    for i in range(batch_size):
        seq_group_metadata_list.append(
            SequenceGroupMetadata(
@ -209,11 +223,13 @@ def test_sampler_logits_processors(seed: int):
                                               logits_processors=[pick_ith]),
                block_tables={0: [1]},
            ))
+        prompt_lens.append(seq_group_metadata_list[-1].seq_data[0].get_len())

-    _, _, input_metadata = worker._prepare_inputs(seq_group_metadata_list)
+    sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
+                                                     prompt_lens)
    sampler_output = sampler(embedding=None,
                             hidden_states=input_tensor,
-                             input_metadata=input_metadata)
-    for i, sequence_output in enumerate(sampler_output):
+                             sampling_metadata=sampling_metadata)
+    for _, sequence_output in enumerate(sampler_output):
        for idx, nth_output in enumerate(sequence_output.samples):
            assert nth_output.output_token == idx
--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@ -1,20 +1,20 @@
-# pylint: disable=protected-access
 import random
 import torch

 from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata
-from vllm.worker.worker import Worker
+from vllm.worker.model_runner import ModelRunner


-def test_worker_prepare_inputs_for_prompt():
-    worker = Worker(None, None, None)
-    worker.block_size = 16
+def test_prepare_prompt():
+    model_runner = ModelRunner(None, None, None)
+    model_runner.set_block_size(16)
+
    batch_size = random.randint(1, 256)
    prompt_lens = []
    seq_group_metadata_list = []
    for i in range(batch_size):
        # make sure all tokens fit into one block
-        prompt_len = i % (worker.block_size - 1) + 1
+        prompt_len = i % (model_runner.block_size - 1) + 1
        prompt_lens.append(prompt_len)
        seq_data = list(range(prompt_len))
        seq_group_metadata_list.append(
@ -25,6 +25,7 @@ def test_worker_prepare_inputs_for_prompt():
                sampling_params=SamplingParams(temperature=0),
                block_tables={0: [1]},
            ))
+
    expected_selected_token_indices = []
    selected_token_start_idx = 0
    max_seq_len = max(prompt_lens)
@ -32,12 +33,15 @@ def test_worker_prepare_inputs_for_prompt():
        expected_selected_token_indices.append(selected_token_start_idx +
                                               prompt_len - 1)
        selected_token_start_idx += max_seq_len
-    input_tokens, input_positions, input_metadata = worker._prepare_inputs(
+    input_tokens, input_positions, _ = model_runner._prepare_prompt(
        seq_group_metadata_list)
-    assert input_tokens.shape == input_positions.shape == (batch_size,
-                                                           max_seq_len)
+    sampling_metadata = model_runner._prepare_sample(seq_group_metadata_list,
+                                                     prompt_lens)
+    assert input_tokens.shape == (batch_size, max_seq_len)
+    assert input_positions.shape == (batch_size, max_seq_len)
    torch.testing.assert_close(input_tokens, input_positions)
-    actual = input_metadata.selected_token_indices
+
+    actual = sampling_metadata.selected_token_indices
    expected = torch.tensor(expected_selected_token_indices,
                            device=actual.device,
                            dtype=actual.dtype)
--- a/vllm/init.py
+++ b/vllm/init.py
@ -8,7 +8,7 @@ from vllm.entrypoints.llm import LLM
 from vllm.outputs import CompletionOutput, RequestOutput
 from vllm.sampling_params import SamplingParams

-__version__ = "0.2.2"
+__version__ = "0.2.3"

 __all__ = [
    "LLM",
--- a/vllm/config.py
+++ b/vllm/config.py
@ -161,6 +161,12 @@ class ModelConfig:
                "must be divisible by pipeline parallel size "
                f"({pipeline_parallel_size}).")

+    def get_sliding_window(self) -> Optional[int]:
+        return getattr(self.hf_config, "sliding_window", None)
+
+    def get_vocab_size(self) -> int:
+        return self.hf_config.vocab_size
+
    def get_hidden_size(self) -> int:
        return self.hf_config.hidden_size

@ -285,10 +291,12 @@ class ParallelConfig:
        pipeline_parallel_size: int,
        tensor_parallel_size: int,
        worker_use_ray: bool,
+        max_parallel_loading_workers: Optional[int] = None,
    ) -> None:
        self.pipeline_parallel_size = pipeline_parallel_size
        self.tensor_parallel_size = tensor_parallel_size
        self.worker_use_ray = worker_use_ray
+        self.max_parallel_loading_workers = max_parallel_loading_workers

        self.world_size = pipeline_parallel_size * tensor_parallel_size
        if self.world_size > 1:
--- a/vllm/core/block_manager.py
+++ b/vllm/core/block_manager.py
@ -1,10 +1,14 @@
 """A block manager that manages token blocks."""
+import enum
 from typing import Dict, List, Optional, Set, Tuple

 from vllm.block import PhysicalTokenBlock
 from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
 from vllm.utils import Device

+# Mapping: logical block number -> physical block.
+BlockTable = List[PhysicalTokenBlock]
+

 class BlockAllocator:
    """Manages free physical token blocks for a device.
@ -25,7 +29,7 @@ class BlockAllocator:
        self.num_blocks = num_blocks

        # Initialize the free blocks.
-        self.free_blocks: List[PhysicalTokenBlock] = []
+        self.free_blocks: BlockTable = []
        for i in range(num_blocks):
            block = PhysicalTokenBlock(device=device,
                                       block_number=i,
@ -50,8 +54,18 @@ class BlockAllocator:
        return len(self.free_blocks)


-# Mapping: logical block number -> physical block.
-BlockTable = List[PhysicalTokenBlock]
+class AllocStatus(enum.Enum):
+    """Result for BlockSpaceManager.can_allocate
+
+    1. Ok: seq_group can be allocated now.
+    2. Later: seq_group cannot be allocated.
+      The capacity of allocator is larger than seq_group required.
+    3. Never: seq_group can never be allocated.
+      The seq_group is too large to allocated in GPU.
+    """
+    OK = enum.auto()
+    LATER = enum.auto()
+    NEVER = enum.auto()


 class BlockSpaceManager:
@ -86,7 +100,7 @@ class BlockSpaceManager:
        # Mapping: seq_id -> BlockTable.
        self.block_tables: Dict[int, BlockTable] = {}

-    def can_allocate(self, seq_group: SequenceGroup) -> bool:
+    def can_allocate(self, seq_group: SequenceGroup) -> AllocStatus:
        # FIXME(woosuk): Here we assume that all sequences in the group share
        # the same prompt. This may not be true for preempted sequences.
        seq = seq_group.get_seqs()[0]
@ -95,9 +109,15 @@ class BlockSpaceManager:
            num_required_blocks = min(num_required_blocks,
                                      self.block_sliding_window)
        num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
+
        # Use watermark to avoid frequent cache eviction.
-        return (num_free_gpu_blocks - num_required_blocks >=
-                self.watermark_blocks)
+        if (self.num_total_gpu_blocks - num_required_blocks <
+                self.watermark_blocks):
+            return AllocStatus.NEVER
+        if num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks:
+            return AllocStatus.OK
+        else:
+            return AllocStatus.LATER

    def allocate(self, seq_group: SequenceGroup) -> None:
        # NOTE: Here we assume that all sequences in the group have the same
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@ -3,7 +3,7 @@ import time
 from typing import Dict, Iterable, List, Optional, Tuple, Union

 from vllm.config import CacheConfig, SchedulerConfig
-from vllm.core.block_manager import BlockSpaceManager
+from vllm.core.block_manager import AllocStatus, BlockSpaceManager
 from vllm.core.policy import PolicyFactory
 from vllm.logger import init_logger
 from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
@ -154,8 +154,18 @@ class Scheduler:
                    continue

                # If the sequence group cannot be allocated, stop.
-                if not self.block_manager.can_allocate(seq_group):
+                can_allocate = self.block_manager.can_allocate(seq_group)
+                if can_allocate == AllocStatus.LATER:
                    break
+                elif can_allocate == AllocStatus.NEVER:
+                    logger.warning(
+                        f"Input prompt ({num_prompt_tokens} tokens) is too long"
+                        f" and exceeds the capacity of block_manager")
+                    for seq in seq_group.get_seqs():
+                        seq.status = SequenceStatus.FINISHED_IGNORED
+                    ignored_seq_groups.append(seq_group)
+                    self.waiting.pop(0)
+                    continue

                # If the number of batched tokens exceeds the limit, stop.
                new_seq_lens = seq_lens + [num_prompt_tokens]
@ -186,7 +196,8 @@ class Scheduler:
                scheduler_outputs = SchedulerOutputs(
                    scheduled_seq_groups=scheduled,
                    prompt_run=True,
-                    num_batched_tokens=len(seq_lens) * max(seq_lens),
+                    num_batched_tokens=len(seq_lens) *
+                    max(seq_lens) if seq_lens else 0,
                    blocks_to_swap_in=blocks_to_swap_in,
                    blocks_to_swap_out=blocks_to_swap_out,
                    blocks_to_copy=blocks_to_copy,
@ -350,7 +361,7 @@ class Scheduler:
        elif preemption_mode == PreemptionMode.SWAP:
            self._preempt_by_swap(seq_group, blocks_to_swap_out)
        else:
-            assert False, "Invalid preemption mode."
+            raise AssertionError("Invalid preemption mode.")

    def _preempt_by_recompute(
        self,
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@ -22,6 +22,7 @@ class EngineArgs:
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
+    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
@ -41,6 +42,10 @@ class EngineArgs:
    def add_cli_args(
            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for vLLM engine."""
+
+        # NOTE: If you update any of the arguments below, please also
+        # make sure to update docs/source/models/engine_args.rst
+
        # Model arguments
        parser.add_argument(
            '--model',
@ -128,6 +133,12 @@ class EngineArgs:
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
+        parser.add_argument(
+            '--max-parallel-loading-workers',
+            type=int,
+            help='load model sequentially in multiple batches, '
+            'to avoid RAM OOM when using tensor '
+            'parallel and large models')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
@ -190,12 +201,14 @@ class EngineArgs:
                                   self.dtype, self.seed, self.revision,
                                   self.tokenizer_revision, self.max_model_len,
                                   self.quantization)
-        cache_config = CacheConfig(
-            self.block_size, self.gpu_memory_utilization, self.swap_space,
-            getattr(model_config.hf_config, 'sliding_window', None))
+        cache_config = CacheConfig(self.block_size,
+                                   self.gpu_memory_utilization,
+                                   self.swap_space,
+                                   model_config.get_sliding_window())
        parallel_config = ParallelConfig(self.pipeline_parallel_size,
                                         self.tensor_parallel_size,
-                                         self.worker_use_ray)
+                                         self.worker_use_ray,
+                                         self.max_parallel_loading_workers)
        scheduler_config = SchedulerConfig(self.max_num_batched_tokens,
                                           self.max_num_seqs,
                                           model_config.max_model_len,
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@ -301,7 +301,16 @@ class AsyncLLMEngine:
        elif self.worker_use_ray:
            engine_class = ray.remote(num_cpus=0)(self._engine_class).remote
        else:
-            engine_class = ray.remote(num_gpus=1)(self._engine_class).remote
+            # FIXME(woosuk): This is a bit hacky. Be careful when changing the
+            # order of the arguments.
+            cache_config = args[1]
+            parallel_config = args[2]
+            if parallel_config.tensor_parallel_size == 1:
+                num_gpus = cache_config.gpu_memory_utilization
+            else:
+                num_gpus = 1
+            engine_class = ray.remote(num_gpus=num_gpus)(
+                self._engine_class).remote
        return engine_class(*args, **kwargs)

    async def engine_step(self) -> bool:
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@ -7,13 +7,14 @@ from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                         SchedulerConfig)
 from vllm.core.scheduler import Scheduler, SchedulerOutputs
 from vllm.engine.arg_utils import EngineArgs
-from vllm.engine.ray_utils import RayWorker, initialize_cluster, ray
+from vllm.engine.metrics import record_metrics
+from vllm.engine.ray_utils import RayWorkerVllm, initialize_cluster, ray
 from vllm.logger import init_logger
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup,
-                           SequenceGroupMetadata, SequenceGroupOutputs,
-                           SequenceOutputs, SequenceStatus)
+                           SequenceGroupMetadata, SequenceGroupOutput,
+                           SequenceOutput, SequenceStatus)
 from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
                                               get_tokenizer)
 from vllm.utils import Counter
@ -88,8 +89,6 @@ class LLMEngine:

        self.model_config = model_config
        self.cache_config = cache_config
-        assert self.cache_config.sliding_window == getattr(
-            self.model_config.hf_config, "sliding_window", None)
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.log_stats = log_stats
@ -125,7 +124,7 @@ class LLMEngine:
    def _init_workers(self, distributed_init_method: str):
        # Lazy import the Worker to avoid importing torch.cuda/xformers
        # before CUDA_VISIBLE_DEVICES is set in the Worker
-        from vllm.worker.worker import Worker  # pylint: disable=import-outside-toplevel
+        from vllm.worker.worker import Worker

        assert self.parallel_config.world_size == 1, (
            "Ray is required if parallel_config.world_size > 1.")
@ -143,25 +142,35 @@ class LLMEngine:
            "init_model",
            get_all_outputs=True,
        )
+        self._run_workers(
+            "load_model",
+            get_all_outputs=True,
+            max_concurrent_workers=self.parallel_config.
+            max_parallel_loading_workers,
+        )

    def _init_workers_ray(self, placement_group: "PlacementGroup",
                          **ray_remote_kwargs):
        # Lazy import the Worker to avoid importing torch.cuda/xformers
        # before CUDA_VISIBLE_DEVICES is set in the Worker
-        from vllm.worker.worker import Worker  # pylint: disable=import-outside-toplevel
+        from vllm.worker.worker import Worker

        self.workers: List[Worker] = []
        for bundle in placement_group.bundle_specs:
            if not bundle.get("GPU", 0):
                continue
+            if self.parallel_config.tensor_parallel_size == 1:
+                num_gpus = self.cache_config.gpu_memory_utilization
+            else:
+                num_gpus = 1
            worker = ray.remote(
                num_cpus=0,
-                num_gpus=1,
+                num_gpus=num_gpus,
                scheduling_strategy=PlacementGroupSchedulingStrategy(
                    placement_group=placement_group,
                    placement_group_capture_child_tasks=True),
                **ray_remote_kwargs,
-            )(RayWorker).remote(self.model_config.trust_remote_code)
+            )(RayWorkerVllm).remote(self.model_config.trust_remote_code)
            self.workers.append(worker)

        # Initialize torch distributed process group for the workers.
@ -182,6 +191,12 @@ class LLMEngine:
            "init_model",
            get_all_outputs=True,
        )
+        self._run_workers(
+            "load_model",
+            get_all_outputs=True,
+            max_concurrent_workers=self.parallel_config.
+            max_parallel_loading_workers,
+        )

    def _verify_args(self) -> None:
        self.model_config.verify_with_parallel_config(self.parallel_config)
@ -351,7 +366,7 @@ class LLMEngine:
        return current_worst_score >= highest_attainable_score

    def _process_sequence_group_outputs(self, seq_group: SequenceGroup,
-                                        outputs: SequenceGroupOutputs) -> None:
+                                        outputs: SequenceGroupOutput) -> None:
        # Process prompt logprobs
        prompt_logprobs = outputs.prompt_logprobs
        if prompt_logprobs is not None:
@ -372,7 +387,7 @@ class LLMEngine:

        # Process the child samples for each parent sequence
        for parent in parent_seqs:
-            child_samples: List[SequenceOutputs] = parent_child_dict[
+            child_samples: List[SequenceOutput] = parent_child_dict[
                parent.seq_id]
            if len(child_samples) == 0:
                # This parent sequence has no children samples. Remove
@ -581,8 +596,8 @@ class LLMEngine:
        else:
            self.num_generation_tokens.append((now, num_batched_tokens))

-        elapsed_time = now - self.last_logging_time
-        if elapsed_time < _LOGGING_INTERVAL_SEC:
+        should_log = now - self.last_logging_time >= _LOGGING_INTERVAL_SEC
+        if not should_log:
            return

        # Discard the old stats.
@ -621,6 +636,16 @@ class LLMEngine:
        else:
            cpu_cache_usage = 0.0

+        record_metrics(
+            avg_prompt_throughput=avg_prompt_throughput,
+            avg_generation_throughput=avg_generation_throughput,
+            scheduler_running=len(self.scheduler.running),
+            scheduler_swapped=len(self.scheduler.swapped),
+            scheduler_waiting=len(self.scheduler.waiting),
+            gpu_cache_usage=gpu_cache_usage,
+            cpu_cache_usage=cpu_cache_usage,
+        )
+
        logger.info("Avg prompt throughput: "
                    f"{avg_prompt_throughput:.1f} tokens/s, "
                    "Avg generation throughput: "
@ -682,16 +707,15 @@ class LLMEngine:
            seq.status = SequenceStatus.FINISHED_STOPPED
            return

-    def _run_workers(
+    def _run_workers_in_batch(
        self,
+        workers,
        method: str,
        *args,
-        get_all_outputs: bool = False,
        **kwargs,
-    ) -> Any:
-        """Runs the given method on all workers."""
+    ):
        all_outputs = []
-        for worker in self.workers:
+        for worker in workers:
            if self.parallel_config.worker_use_ray:
                executor = partial(worker.execute_method.remote, method)
            else:
@ -699,9 +723,31 @@ class LLMEngine:

            output = executor(*args, **kwargs)
            all_outputs.append(output)
-
        if self.parallel_config.worker_use_ray:
            all_outputs = ray.get(all_outputs)
+        return all_outputs
+
+    def _run_workers(
+        self,
+        method: str,
+        *args,
+        get_all_outputs: bool = False,
+        max_concurrent_workers: Optional[int] = None,
+        **kwargs,
+    ) -> Any:
+        """Runs the given method on all workers."""
+        all_outputs = []
+        if max_concurrent_workers:
+            work_groups = [
+                self.workers[i:i + max_concurrent_workers]
+                for i in range(0, len(self.workers), max_concurrent_workers)
+            ]
+        else:
+            work_groups = [self.workers]
+
+        for workers in work_groups:
+            all_outputs.extend(
+                self._run_workers_in_batch(workers, method, *args, **kwargs))

        if get_all_outputs:
            return all_outputs
--- a/vllm/engine/metrics.py
+++ b/vllm/engine/metrics.py
@ -0,0 +1,51 @@
+from aioprometheus import Gauge
+
+# The begin-* and end* here are used by the documentation generator
+# to extract the metrics definitions.
+
+# begin-metrics-definitions
+gauge_avg_prompt_throughput = Gauge("vllm:avg_prompt_throughput_toks_per_s",
+                                    "Average prefill throughput in tokens/s.")
+gauge_avg_generation_throughput = Gauge(
+    "vllm:avg_generation_throughput_toks_per_s",
+    "Average generation throughput in tokens/s.")
+
+gauge_scheduler_running = Gauge(
+    "vllm:num_requests_running",
+    "Number of requests that is currently running for inference.")
+gauge_scheduler_swapped = Gauge("vllm:num_requests_swapped",
+                                "Number requests swapped to CPU.")
+gauge_scheduler_waiting = Gauge("vllm:num_requests_waiting",
+                                "Number of requests waiting to be processed.")
+
+gauge_gpu_cache_usage = Gauge(
+    "vllm:gpu_cache_usage_perc",
+    "GPU KV-cache usage. 1 means 100 percent usage.")
+gauge_cpu_cache_usage = Gauge(
+    "vllm:cpu_cache_usage_perc",
+    "CPU KV-cache usage. 1 means 100 percent usage.")
+# end-metrics-definitions
+
+labels = {}
+
+
+def add_global_metrics_labels(**kwargs):
+    labels.update(kwargs)
+
+
+def record_metrics(
+    avg_prompt_throughput: float,
+    avg_generation_throughput: float,
+    scheduler_running: int,
+    scheduler_swapped: int,
+    scheduler_waiting: int,
+    gpu_cache_usage: float,
+    cpu_cache_usage: float,
+):
+    gauge_avg_prompt_throughput.set(labels, avg_prompt_throughput)
+    gauge_avg_generation_throughput.set(labels, avg_generation_throughput)
+    gauge_scheduler_running.set(labels, scheduler_running)
+    gauge_scheduler_swapped.set(labels, scheduler_swapped)
+    gauge_scheduler_waiting.set(labels, scheduler_waiting)
+    gauge_gpu_cache_usage.set(labels, gpu_cache_usage)
+    gauge_cpu_cache_usage.set(labels, cpu_cache_usage)
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
@ -10,13 +10,12 @@ try:
    import ray
    from ray.air.util.torch_dist import TorchDistributedWorker

-    class RayWorker(TorchDistributedWorker):
+    class RayWorkerVllm(TorchDistributedWorker):
        """Ray wrapper for vllm.worker.Worker, allowing Worker to be
        lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES."""

        def __init__(self, init_cached_hf_modules=False) -> None:
            if init_cached_hf_modules:
-                # pylint: disable=import-outside-toplevel
                from transformers.dynamic_module_utils import init_hf_modules
                init_hf_modules()
            self.worker = None
@ -37,7 +36,7 @@ except ImportError as e:
                   "`pip install ray pandas pyarrow`.")
    ray = None
    TorchDistributedWorker = None
-    RayWorker = None  # pylint: disable=invalid-name
+    RayWorkerVllm = None

 if TYPE_CHECKING:
    from ray.util.placement_group import PlacementGroup
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@ -134,25 +134,21 @@ class LLM:
        if isinstance(prompts, str):
            # Convert a single prompt to a list.
            prompts = [prompts]
-        if prompts is not None and prompt_token_ids is not None:
-            if len(prompts) != len(prompt_token_ids):
-                raise ValueError("The lengths of prompts and prompt_token_ids "
-                                 "must be the same.")
+        if (prompts is not None and prompt_token_ids is not None
+                and len(prompts) != len(prompt_token_ids)):
+            raise ValueError("The lengths of prompts and prompt_token_ids "
+                             "must be the same.")
        if sampling_params is None:
            # Use default sampling params.
            sampling_params = SamplingParams()

        # Add requests to the engine.
-        if prompts is not None:
-            num_requests = len(prompts)
-        else:
-            num_requests = len(prompt_token_ids)
+        num_requests = len(prompts) if prompts is not None else len(
+            prompt_token_ids)
        for i in range(num_requests):
            prompt = prompts[i] if prompts is not None else None
-            if prompt_token_ids is None:
-                token_ids = None
-            else:
-                token_ids = prompt_token_ids[i]
+            token_ids = None if prompt_token_ids is None else prompt_token_ids[
+                i]
            self._add_request(prompt, sampling_params, token_ids)
        return self._run_engine(use_tqdm)

--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@ -3,21 +3,24 @@

 import argparse
 import asyncio
+import codecs
 import json
 import time
 from http import HTTPStatus
 from typing import AsyncGenerator, Dict, List, Optional, Tuple, Union

+from aioprometheus import MetricsMiddleware
+from aioprometheus.asgi.starlette import metrics
 import fastapi
 import uvicorn
 from fastapi import Request
 from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse, StreamingResponse, Response
-from packaging import version

 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.engine.async_llm_engine import AsyncLLMEngine
+from vllm.engine.metrics import add_global_metrics_labels
 from vllm.entrypoints.openai.protocol import (
    CompletionRequest, CompletionResponse, CompletionResponseChoice,
    CompletionResponseStreamChoice, CompletionStreamResponse,
@ -31,20 +34,59 @@ from vllm.sampling_params import SamplingParams
 from vllm.transformers_utils.tokenizer import get_tokenizer
 from vllm.utils import random_uuid

-try:
-    import fastchat
-    from fastchat.conversation import Conversation, SeparatorStyle
-    from fastchat.model.model_adapter import get_conversation_template
-    _fastchat_available = True
-except ImportError:
-    _fastchat_available = False
-
 TIMEOUT_KEEP_ALIVE = 5  # seconds

 logger = init_logger(__name__)
 served_model = None
 app = fastapi.FastAPI()
 engine = None
+response_role = None
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="vLLM OpenAI-Compatible RESTful API server.")
+    parser.add_argument("--host", type=str, default=None, help="host name")
+    parser.add_argument("--port", type=int, default=8000, help="port number")
+    parser.add_argument("--allow-credentials",
+                        action="store_true",
+                        help="allow credentials")
+    parser.add_argument("--allowed-origins",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed origins")
+    parser.add_argument("--allowed-methods",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed methods")
+    parser.add_argument("--allowed-headers",
+                        type=json.loads,
+                        default=["*"],
+                        help="allowed headers")
+    parser.add_argument("--served-model-name",
+                        type=str,
+                        default=None,
+                        help="The model name used in the API. If not "
+                        "specified, the model name will be the same as "
+                        "the huggingface name.")
+    parser.add_argument("--chat-template",
+                        type=str,
+                        default=None,
+                        help="The file path to the chat template, "
+                        "or the template in single-line form "
+                        "for the specified model")
+    parser.add_argument("--response-role",
+                        type=str,
+                        default="assistant",
+                        help="The role name to return if "
+                        "`request.add_generation_prompt=true`.")
+
+    parser = AsyncEngineArgs.add_cli_args(parser)
+    return parser.parse_args()
+
+
+app.add_middleware(MetricsMiddleware)  # Trace HTTP server metrics
+app.add_route("/metrics", metrics)  # Exposes HTTP metrics


 def create_error_response(status_code: HTTPStatus,
@ -54,8 +96,27 @@ def create_error_response(status_code: HTTPStatus,
                        status_code=status_code.value)


+def load_chat_template(args, tokenizer):
+    if args.chat_template is not None:
+        try:
+            with open(args.chat_template, "r") as f:
+                chat_template = f.read()
+        except OSError:
+            # If opening a file fails, set chat template to be args to
+            # ensure we decode so our escape are interpreted correctly
+            chat_template = codecs.decode(args.chat_template, "unicode_escape")
+
+        tokenizer.chat_template = chat_template
+        logger.info(
+            f"Using supplied chat template:\n{tokenizer.chat_template}")
+    elif tokenizer.chat_template is not None:
+        logger.info(f"Using default chat template:\n{tokenizer.chat_template}")
+    else:
+        logger.warning("No chat template provided. Chat API will not work.")
+
+
@app.exception_handler(RequestValidationError)
-async def validation_exception_handler(request, exc):  # pylint: disable=unused-argument
+async def validation_exception_handler(_, exc):
    return create_error_response(HTTPStatus.BAD_REQUEST, str(exc))


@ -69,53 +130,6 @@ async def check_model(request) -> Optional[JSONResponse]:
    return ret


-async def get_gen_prompt(request) -> str:
-    if not _fastchat_available:
-        raise ModuleNotFoundError(
-            "fastchat is not installed. Please install fastchat to use "
-            "the chat completion and conversation APIs: `$ pip install fschat`"
-        )
-    if version.parse(fastchat.__version__) < version.parse("0.2.23"):
-        raise ImportError(
-            f"fastchat version is low. Current version: {fastchat.__version__} "
-            "Please upgrade fastchat to use: `$ pip install -U fschat`")
-
-    conv = get_conversation_template(request.model)
-    conv = Conversation(
-        name=conv.name,
-        system_template=conv.system_template,
-        system_message=conv.system_message,
-        roles=conv.roles,
-        messages=list(conv.messages),  # prevent in-place modification
-        offset=conv.offset,
-        sep_style=SeparatorStyle(conv.sep_style),
-        sep=conv.sep,
-        sep2=conv.sep2,
-        stop_str=conv.stop_str,
-        stop_token_ids=conv.stop_token_ids,
-    )
-
-    if isinstance(request.messages, str):
-        prompt = request.messages
-    else:
-        for message in request.messages:
-            msg_role = message["role"]
-            if msg_role == "system":
-                conv.system_message = message["content"]
-            elif msg_role == "user":
-                conv.append_message(conv.roles[0], message["content"])
-            elif msg_role == "assistant":
-                conv.append_message(conv.roles[1], message["content"])
-            else:
-                raise ValueError(f"Unknown role: {msg_role}")
-
-        # Add a blank message for the assistant.
-        conv.append_message(conv.roles[1], None)
-        prompt = conv.get_prompt()
-
-    return prompt
-
-
 async def check_length(
    request: Union[ChatCompletionRequest, CompletionRequest],
    prompt: Optional[str] = None,
@ -124,10 +138,8 @@ async def check_length(
    assert (not (prompt is None and prompt_ids is None)
            and not (prompt is not None and prompt_ids is not None)
            ), "Either prompt or prompt_ids should be provided."
-    if prompt_ids is not None:
-        input_ids = prompt_ids
-    else:
-        input_ids = tokenizer(prompt).input_ids
+    input_ids = prompt_ids if prompt_ids is not None else tokenizer(
+        prompt).input_ids
    token_num = len(input_ids)

    if request.max_tokens is None:
@ -162,16 +174,26 @@ async def show_available_models():
    return ModelList(data=model_cards)


-def create_logprobs(token_ids: List[int],
-                    id_logprobs: List[Dict[int, float]],
-                    initial_text_offset: int = 0) -> LogProbs:
+def create_logprobs(
+    token_ids: List[int],
+    top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None,
+    num_output_top_logprobs: Optional[int] = None,
+    initial_text_offset: int = 0,
+) -> LogProbs:
    """Create OpenAI-style logprobs."""
    logprobs = LogProbs()
    last_token_len = 0
-    for token_id, id_logprob in zip(token_ids, id_logprobs):
+    if num_output_top_logprobs:
+        logprobs.top_logprobs = []
+    for i, token_id in enumerate(token_ids):
+        step_top_logprobs = top_logprobs[i]
+        if step_top_logprobs is not None:
+            token_logprob = step_top_logprobs[token_id]
+        else:
+            token_logprob = None
        token = tokenizer.convert_ids_to_tokens(token_id)
        logprobs.tokens.append(token)
-        logprobs.token_logprobs.append(id_logprob[token_id])
+        logprobs.token_logprobs.append(token_logprob)
        if len(logprobs.text_offset) == 0:
            logprobs.text_offset.append(initial_text_offset)
        else:
@ -179,10 +201,11 @@ def create_logprobs(token_ids: List[int],
                                        last_token_len)
        last_token_len = len(token)

-        logprobs.top_logprobs.append({
-            tokenizer.convert_ids_to_tokens(i): p
-            for i, p in id_logprob.items()
-        })
+        if num_output_top_logprobs:
+            logprobs.top_logprobs.append({
+                tokenizer.convert_ids_to_tokens(i): p
+                for i, p in step_top_logprobs.items()
+            } if step_top_logprobs else None)
    return logprobs


@ -198,8 +221,6 @@ async def create_chat_completion(request: ChatCompletionRequest,
        - function_call (Users should implement this by themselves)
        - logit_bias (to be supported by vLLM engine)
    """
-    logger.info(f"Received chat completion request: {request}")
-
    error_check_ret = await check_model(request)
    if error_check_ret is not None:
        return error_check_ret
@ -209,7 +230,15 @@ async def create_chat_completion(request: ChatCompletionRequest,
        return create_error_response(HTTPStatus.BAD_REQUEST,
                                     "logit_bias is not currently supported")

-    prompt = await get_gen_prompt(request)
+    try:
+        prompt = tokenizer.apply_chat_template(
+            conversation=request.messages,
+            tokenize=False,
+            add_generation_prompt=request.add_generation_prompt)
+    except Exception as e:
+        logger.error(f"Error in applying chat template from request: {str(e)}")
+        return create_error_response(HTTPStatus.BAD_REQUEST, str(e))
+
    token_ids, error_check_ret = await check_length(request, prompt=prompt)
    if error_check_ret is not None:
        return error_check_ret
@ -217,6 +246,7 @@ async def create_chat_completion(request: ChatCompletionRequest,
    model_name = request.model
    request_id = f"cmpl-{random_uuid()}"
    created_time = int(time.monotonic())
+    chunk_object_type = "chat.completion.chunk"
    try:
        spaces_between_special_tokens = request.spaces_between_special_tokens
        sampling_params = SamplingParams(
@ -241,128 +271,162 @@ async def create_chat_completion(request: ChatCompletionRequest,
    result_generator = engine.generate(prompt, sampling_params, request_id,
                                       token_ids)

-    def create_stream_response_json(
-        index: int,
-        text: str,
-        finish_reason: Optional[str] = None,
-        usage: Optional[UsageInfo] = None,
-    ) -> str:
-        choice_data = ChatCompletionResponseStreamChoice(
-            index=index,
-            delta=DeltaMessage(content=text),
-            finish_reason=finish_reason,
-        )
-        response = ChatCompletionStreamResponse(
-            id=request_id,
-            created=created_time,
-            model=model_name,
-            choices=[choice_data],
-        )
-        if usage is not None:
-            response.usage = usage
-        # exclude unset to leave details out of each sse
-        response_json = response.json(exclude_unset=True, ensure_ascii=False)
-
-        return response_json
+    def get_role() -> str:
+        if request.add_generation_prompt:
+            return response_role
+        else:
+            return request.messages[-1]["role"]

    async def completion_stream_generator() -> AsyncGenerator[str, None]:
-        # First chunk with role
+        # Send first response for each request.n (index) with the role
+        role = get_role()
        for i in range(request.n):
            choice_data = ChatCompletionResponseStreamChoice(
-                index=i,
-                delta=DeltaMessage(role="assistant"),
-                finish_reason=None,
-            )
+                index=i, delta=DeltaMessage(role=role), finish_reason=None)
            chunk = ChatCompletionStreamResponse(id=request_id,
+                                                 object=chunk_object_type,
+                                                 created=created_time,
                                                 choices=[choice_data],
                                                 model=model_name)
            data = chunk.json(exclude_unset=True, ensure_ascii=False)
            yield f"data: {data}\n\n"

+        # Send response to echo the input portion of the last message
+        if request.echo:
+            last_msg_content = ""
+            if request.messages and isinstance(
+                    request.messages, list) and request.messages[-1].get(
+                        "content") and request.messages[-1].get(
+                            "role") == role:
+                last_msg_content = request.messages[-1]["content"]
+            if last_msg_content:
+                for i in range(request.n):
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=i,
+                        delta=DeltaMessage(content=last_msg_content),
+                        finish_reason=None)
+                    chunk = ChatCompletionStreamResponse(
+                        id=request_id,
+                        object=chunk_object_type,
+                        created=created_time,
+                        choices=[choice_data],
+                        model=model_name)
+                    data = chunk.json(exclude_unset=True, ensure_ascii=False)
+                    yield f"data: {data}\n\n"
+
+        # Send response for each token for each request.n (index)
        previous_texts = [""] * request.n
        previous_num_tokens = [0] * request.n
+        finish_reason_sent = [False] * request.n
        async for res in result_generator:
            res: RequestOutput
            for output in res.outputs:
                i = output.index
-                delta_text = output.text[len(previous_texts[i]):]
-                previous_texts[i] = output.text
-                completion_tokens = len(output.token_ids)
-                previous_num_tokens[i] = completion_tokens
-                response_json = create_stream_response_json(
-                    index=i,
-                    text=delta_text,
-                )
-                yield f"data: {response_json}\n\n"
-                if output.finish_reason is not None:
+
+                if finish_reason_sent[i]:
+                    continue
+
+                if output.finish_reason is None:
+                    # Send token-by-token response for each request.n
+                    delta_text = output.text[len(previous_texts[i]):]
+                    previous_texts[i] = output.text
+                    completion_tokens = len(output.token_ids)
+                    previous_num_tokens[i] = completion_tokens
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=i,
+                        delta=DeltaMessage(content=delta_text),
+                        finish_reason=None)
+                    chunk = ChatCompletionStreamResponse(
+                        id=request_id,
+                        object=chunk_object_type,
+                        created=created_time,
+                        choices=[choice_data],
+                        model=model_name)
+                    data = chunk.json(exclude_unset=True, ensure_ascii=False)
+                    yield f"data: {data}\n\n"
+                else:
+                    # Send the finish response for each request.n only once
                    prompt_tokens = len(res.prompt_token_ids)
                    final_usage = UsageInfo(
                        prompt_tokens=prompt_tokens,
                        completion_tokens=completion_tokens,
                        total_tokens=prompt_tokens + completion_tokens,
                    )
-                    response_json = create_stream_response_json(
-                        index=i,
-                        text="",
-                        finish_reason=output.finish_reason,
-                        usage=final_usage,
-                    )
-                    yield f"data: {response_json}\n\n"
+                    choice_data = ChatCompletionResponseStreamChoice(
+                        index=i, delta=[], finish_reason=output.finish_reason)
+                    chunk = ChatCompletionStreamResponse(
+                        id=request_id,
+                        object=chunk_object_type,
+                        created=created_time,
+                        choices=[choice_data],
+                        model=model_name)
+                    if final_usage is not None:
+                        chunk.usage = final_usage
+                    data = chunk.json(exclude_unset=True,
+                                      exclude_none=True,
+                                      ensure_ascii=False)
+                    yield f"data: {data}\n\n"
+                    finish_reason_sent[i] = True
+        # Send the final done message after all response.n are finished
        yield "data: [DONE]\n\n"

+    async def completion_full_generator():
+        final_res: RequestOutput = None
+        async for res in result_generator:
+            if await raw_request.is_disconnected():
+                # Abort the request if the client disconnects.
+                await engine.abort(request_id)
+                return create_error_response(HTTPStatus.BAD_REQUEST,
+                                             "Client disconnected")
+            final_res = res
+        assert final_res is not None
+
+        choices = []
+        role = get_role()
+        for output in final_res.outputs:
+            choice_data = ChatCompletionResponseChoice(
+                index=output.index,
+                message=ChatMessage(role=role, content=output.text),
+                finish_reason=output.finish_reason,
+            )
+            choices.append(choice_data)
+
+        if request.echo:
+            last_msg_content = ""
+            if request.messages and isinstance(
+                    request.messages, list) and request.messages[-1].get(
+                        "content") and request.messages[-1].get(
+                            "role") == role:
+                last_msg_content = request.messages[-1]["content"]
+
+            for choice in choices:
+                full_message = last_msg_content + choice.message.content
+                choice.message.content = full_message
+
+        num_prompt_tokens = len(final_res.prompt_token_ids)
+        num_generated_tokens = sum(
+            len(output.token_ids) for output in final_res.outputs)
+        usage = UsageInfo(
+            prompt_tokens=num_prompt_tokens,
+            completion_tokens=num_generated_tokens,
+            total_tokens=num_prompt_tokens + num_generated_tokens,
+        )
+        response = ChatCompletionResponse(
+            id=request_id,
+            created=created_time,
+            model=model_name,
+            choices=choices,
+            usage=usage,
+        )
+
+        return response
+
    # Streaming response
    if request.stream:
        return StreamingResponse(completion_stream_generator(),
                                 media_type="text/event-stream")
-
-    # Non-streaming response
-    final_res: RequestOutput = None
-    async for res in result_generator:
-        if await raw_request.is_disconnected():
-            # Abort the request if the client disconnects.
-            await engine.abort(request_id)
-            return create_error_response(HTTPStatus.BAD_REQUEST,
-                                         "Client disconnected")
-        final_res = res
-    assert final_res is not None
-    choices = []
-    for output in final_res.outputs:
-        choice_data = ChatCompletionResponseChoice(
-            index=output.index,
-            message=ChatMessage(role="assistant", content=output.text),
-            finish_reason=output.finish_reason,
-        )
-        choices.append(choice_data)
-
-    num_prompt_tokens = len(final_res.prompt_token_ids)
-    num_generated_tokens = sum(
-        len(output.token_ids) for output in final_res.outputs)
-    usage = UsageInfo(
-        prompt_tokens=num_prompt_tokens,
-        completion_tokens=num_generated_tokens,
-        total_tokens=num_prompt_tokens + num_generated_tokens,
-    )
-    response = ChatCompletionResponse(
-        id=request_id,
-        created=created_time,
-        model=model_name,
-        choices=choices,
-        usage=usage,
-    )
-
-    if request.stream:
-        # When user requests streaming but we don't stream, we still need to
-        # return a streaming response with a single event.
-        response_json = response.json(ensure_ascii=False)
-
-        async def fake_stream_generator() -> AsyncGenerator[str, None]:
-            yield f"data: {response_json}\n\n"
-            yield "data: [DONE]\n\n"
-
-        return StreamingResponse(fake_stream_generator(),
-                                 media_type="text/event-stream")
-
-    return response
+    else:
+        return await completion_full_generator()


@app.post("/v1/completions")
@ -373,23 +437,17 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
    for the API specification. This API mimics the OpenAI Completion API.

    NOTE: Currently we do not support the following features:
-        - echo (since the vLLM engine does not currently support
-          getting the logprobs of prompt tokens)
        - suffix (the language models we currently support do not support
          suffix)
        - logit_bias (to be supported by vLLM engine)
    """
-    logger.info(f"Received completion request: {request}")

    error_check_ret = await check_model(request)
    if error_check_ret is not None:
        return error_check_ret

-    if request.echo:
-        # We do not support echo since the vLLM engine does not
-        # currently support getting the logprobs of prompt tokens.
-        return create_error_response(HTTPStatus.BAD_REQUEST,
-                                     "echo is not currently supported")
+    # OpenAI API supports echoing the prompt when max_tokens is 0.
+    echo_without_generation = request.echo and request.max_tokens == 0

    if request.suffix is not None:
        # The language models we currently support do not support suffix.
@ -445,9 +503,11 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
            stop=request.stop,
            stop_token_ids=request.stop_token_ids,
            ignore_eos=request.ignore_eos,
-            max_tokens=request.max_tokens,
+            max_tokens=request.max_tokens
+            if not echo_without_generation else 1,
            logprobs=request.logprobs,
            use_beam_search=request.use_beam_search,
+            prompt_logprobs=request.logprobs if request.echo else None,
            skip_special_tokens=request.skip_special_tokens,
            spaces_between_special_tokens=spaces_between_special_tokens,
        )
@ -497,24 +557,42 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
    async def completion_stream_generator() -> AsyncGenerator[str, None]:
        previous_texts = [""] * request.n
        previous_num_tokens = [0] * request.n
+        has_echoed = [False] * request.n
        async for res in result_generator:
            res: RequestOutput
            for output in res.outputs:
                i = output.index
                delta_text = output.text[len(previous_texts[i]):]
+                token_ids = output.token_ids[previous_num_tokens[i]:]
+                top_logprobs = output.logprobs[previous_num_tokens[i]:]
+                offsets = len(previous_texts[i])
+                if request.echo and not has_echoed[i]:
+                    if not echo_without_generation:
+                        delta_text = res.prompt + delta_text
+                        token_ids = res.prompt_token_ids + token_ids
+                        top_logprobs = res.prompt_logprobs + top_logprobs
+                    else:
+                        delta_text = res.prompt
+                        token_ids = res.prompt_token_ids
+                        top_logprobs = res.prompt_logprobs
+                    has_echoed[i] = True
                if request.logprobs is not None:
                    logprobs = create_logprobs(
-                        output.token_ids[previous_num_tokens[i]:],
-                        output.logprobs[previous_num_tokens[i]:],
-                        len(previous_texts[i]))
+                        token_ids=token_ids,
+                        top_logprobs=top_logprobs,
+                        num_output_top_logprobs=request.logprobs,
+                        initial_text_offset=offsets,
+                    )
                else:
                    logprobs = None
                previous_texts[i] = output.text
                previous_num_tokens[i] = len(output.token_ids)
+                finish_reason = output.finish_reason
                response_json = create_stream_response_json(
                    index=i,
                    text=delta_text,
                    logprobs=logprobs,
+                    finish_reason=finish_reason,
                )
                yield f"data: {response_json}\n\n"
                if output.finish_reason is not None:
@ -553,14 +631,36 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
        final_res = res
    assert final_res is not None
    choices = []
+    prompt_token_ids = final_res.prompt_token_ids
+    prompt_logprobs = final_res.prompt_logprobs
+    prompt_text = final_res.prompt
    for output in final_res.outputs:
        if request.logprobs is not None:
-            logprobs = create_logprobs(output.token_ids, output.logprobs)
+            if not echo_without_generation:
+                token_ids = output.token_ids
+                top_logprobs = output.logprobs
+                if request.echo:
+                    token_ids = prompt_token_ids + token_ids
+                    top_logprobs = prompt_logprobs + top_logprobs
+            else:
+                token_ids = prompt_token_ids
+                top_logprobs = prompt_logprobs
+            logprobs = create_logprobs(
+                token_ids=token_ids,
+                top_logprobs=top_logprobs,
+                num_output_top_logprobs=request.logprobs,
+            )
        else:
            logprobs = None
+        if not echo_without_generation:
+            output_text = output.text
+            if request.echo:
+                output_text = prompt_text + output_text
+        else:
+            output_text = prompt_text
        choice_data = CompletionResponseChoice(
            index=output.index,
-            text=output.text,
+            text=output_text,
            logprobs=logprobs,
            finish_reason=output.finish_reason,
        )
@ -598,34 +698,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):


 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="vLLM OpenAI-Compatible RESTful API server.")
-    parser.add_argument("--host", type=str, default=None, help="host name")
-    parser.add_argument("--port", type=int, default=8000, help="port number")
-    parser.add_argument("--allow-credentials",
-                        action="store_true",
-                        help="allow credentials")
-    parser.add_argument("--allowed-origins",
-                        type=json.loads,
-                        default=["*"],
-                        help="allowed origins")
-    parser.add_argument("--allowed-methods",
-                        type=json.loads,
-                        default=["*"],
-                        help="allowed methods")
-    parser.add_argument("--allowed-headers",
-                        type=json.loads,
-                        default=["*"],
-                        help="allowed headers")
-    parser.add_argument("--served-model-name",
-                        type=str,
-                        default=None,
-                        help="The model name used in the API. If not "
-                        "specified, the model name will be the same as "
-                        "the huggingface name.")
-
-    parser = AsyncEngineArgs.add_cli_args(parser)
-    args = parser.parse_args()
+    args = parse_args()

    app.add_middleware(
        CORSMiddleware,
@ -642,6 +715,8 @@ if __name__ == "__main__":
    else:
        served_model = args.model

+    response_role = args.response_role
+
    engine_args = AsyncEngineArgs.from_cli_args(args)
    engine = AsyncLLMEngine.from_engine_args(engine_args)
    engine_model_config = asyncio.run(engine.get_model_config())
@ -652,6 +727,10 @@ if __name__ == "__main__":
        engine_model_config.tokenizer,
        tokenizer_mode=engine_model_config.tokenizer_mode,
        trust_remote_code=engine_model_config.trust_remote_code)
+    load_chat_template(args, tokenizer)
+
+    # Register labels for metrics
+    add_global_metrics_labels(model_name=engine_args.model)

    uvicorn.run(app,
                host=args.host,
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@ -73,6 +73,8 @@ class ChatCompletionRequest(BaseModel):
    stop_token_ids: Optional[List[int]] = Field(default_factory=list)
    skip_special_tokens: Optional[bool] = True
    spaces_between_special_tokens: Optional[bool] = True
+    add_generation_prompt: Optional[bool] = True
+    echo: Optional[bool] = False


 class CompletionRequest(BaseModel):
@ -106,8 +108,7 @@ class LogProbs(BaseModel):
    text_offset: List[int] = Field(default_factory=list)
    token_logprobs: List[Optional[float]] = Field(default_factory=list)
    tokens: List[str] = Field(default_factory=list)
-    top_logprobs: List[Optional[Dict[str,
-                                     float]]] = Field(default_factory=list)
+    top_logprobs: Optional[List[Optional[Dict[int, float]]]] = None


 class CompletionResponseChoice(BaseModel):
--- a/vllm/model_executor/init.py
+++ b/vllm/model_executor/init.py
@ -1,9 +1,11 @@
 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.model_loader import get_model
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.utils import set_random_seed

 __all__ = [
    "InputMetadata",
    "get_model",
+    "SamplingMetadata",
    "set_random_seed",
 ]
--- a/vllm/model_executor/input_metadata.py
+++ b/vllm/model_executor/input_metadata.py
@ -1,91 +1,42 @@
-from typing import Dict, List, Optional, Tuple
+from typing import List, Optional

 import torch
-from xformers.ops import AttentionBias
-
-from vllm.sampling_params import SamplingParams, SamplingType
-from vllm.sequence import SequenceData


 class InputMetadata:
-    """Metadata for input sequences. Used for PagedAttention.
+    """Metadata for input sequences. Used in PagedAttention.

    Args:
-        seq_groups: List of (seq_ids, sampling_params).
-        seq_data: Seq_id -> SequenceData.
        prompt_lens: Lengths of prompts.
        slot_mapping: The address to write the new KV to of each token.
-        context_lens: the length of attention context for each generation token.
        max_context_len: The maximum context length.
+        context_lens: the length of attention context for each sequence.
        block_tables: The block tables. (Seq id -> list of physical block)
    """

    def __init__(
        self,
-        seq_groups: List[Tuple[List[int], SamplingParams]],
-        seq_data: Dict[int, SequenceData],
        prompt_lens: List[int],
        slot_mapping: torch.Tensor,
-        context_lens: torch.Tensor,
-        max_context_len: int,
-        block_tables: torch.Tensor,
-        selected_token_indices: torch.Tensor,
-        categorized_sample_indices: Dict[SamplingType, torch.Tensor],
-        sliding_window: Optional[int] = None,
+        max_context_len: Optional[int],
+        context_lens: Optional[torch.Tensor],
+        block_tables: Optional[torch.Tensor],
    ) -> None:
-        self.seq_groups = seq_groups
-        self.seq_data = seq_data
        self.prompt_lens = prompt_lens
+        self.max_context_len = max_context_len
        self.slot_mapping = slot_mapping
        self.context_lens = context_lens
-        self.max_context_len = max_context_len
        self.block_tables = block_tables
-        self.selected_token_indices = selected_token_indices
-        self.categorized_sample_indices = categorized_sample_indices
-
-        self.max_prompt_len = max(prompt_lens) if prompt_lens else 0
-        self.to_cache = None
-        if sliding_window is not None:
-            # We need to keep the positions of sliding windows within
-            # the key / value tables, this is helpful to know which
-            # elements we need to cache.
-            to_cache, start_idx = [], 0
-            for prompt_len in self.prompt_lens:
-                to_cache.extend(
-                    range(
-                        start_idx + max(0, prompt_len - sliding_window),
-                        start_idx + prompt_len,
-                    ))
-                start_idx += self.max_prompt_len
-            to_cache.extend(range(start_idx, slot_mapping.shape[0]))
-            self.to_cache = torch.tensor(to_cache,
-                                         dtype=torch.int32,
-                                         device=self.slot_mapping.device)
-
-        self.num_prompts = len(prompt_lens)
-        self.num_prompt_tokens = self.num_prompts * self.max_prompt_len
-        self.num_generation_tokens = context_lens.shape[0]
-        if block_tables.numel() > 0:
-            self.max_num_blocks_per_seq = block_tables.shape[1]
-        else:
-            self.max_num_blocks_per_seq = 0
-        assert block_tables.shape[0] == self.num_generation_tokens

+        self.is_prompt = len(prompt_lens) > 0
        # Set during the execution of the first attention op.
-        self.attn_bias: Optional[AttentionBias] = None
+        # FIXME(woosuk): This is a hack.
+        self.attn_bias = None

    def __repr__(self) -> str:
-        # Print only useful metadata.
-        return (
-            f'InputMetadata('
-            f'num_prompt_tokens={self.num_prompt_tokens}, '
-            f'num_prompts={self.num_prompts}, '
-            f'prompt_lens={self.prompt_lens}, '
-            f'num_generation_tokens={self.num_generation_tokens}, '
-            f'context_lens={self.context_lens}, '
-            f'max_context_len={self.max_context_len}), '
-            f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, '
-            f'block_tables={self.block_tables}, '
-            f'selected_token_indices={self.selected_token_indices}, '
-            f'categorized_sample_indices={self.categorized_sample_indices}, '
-            f'slot_mapping={self.slot_mapping})')
+        return ("InputMetadata("
+                f"prompt_lens={self.prompt_lens}, "
+                f"max_context_len={self.max_context_len}, "
+                f"slot_mapping={self.slot_mapping}, "
+                f"context_lens={self.context_lens}, "
+                f"block_tables={self.block_tables})")
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@ -1,11 +1,17 @@
 """Custom activation functions."""
+import math
 from typing import Optional

 import torch
 import torch.nn as nn
+import torch.nn.functional as F

-from vllm import activation_ops
+from vllm._C import ops
 from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.parallel_utils.parallel_state import (
+    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.parallel_utils.utils import divide
+from vllm.model_executor.utils import set_weight_attrs


 class SiluAndMul(nn.Module):
@ -18,27 +24,43 @@ class SiluAndMul(nn.Module):
        return: (batch_size, seq_len, d) or (num_tokens, d)
    """

+    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+        """PyTorch-native implementation equivalent to forward()."""
+        d = x.shape[-1] // 2
+        return F.silu(x[..., :d]) * x[..., d:]
+
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        d = x.shape[-1] // 2
        output_shape = (x.shape[:-1] + (d, ))
        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
-        activation_ops.silu_and_mul(out, x)
+        ops.silu_and_mul(out, x)
        return out


 class NewGELU(nn.Module):

+    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+        """PyTorch-native implementation equivalent to forward()."""
+        c = math.sqrt(2.0 / math.pi)
+        return 0.5 * x * (1.0 + torch.tanh(c *
+                                           (x + 0.044715 * torch.pow(x, 3.0))))
+
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
-        activation_ops.gelu_new(out, x)
+        ops.gelu_new(out, x)
        return out


 class FastGELU(nn.Module):

+    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+        """PyTorch-native implementation equivalent to forward()."""
+        return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
+                                           (1.0 + 0.044715 * x * x)))
+
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = torch.empty_like(x)
-        activation_ops.gelu_fast(out, x)
+        ops.gelu_fast(out, x)
        return out


@ -51,17 +73,40 @@ class ScaledActivation(nn.Module):
    def __init__(
        self,
        act_module: nn.Module,
-        hidden_size: int,
-        params_dtype: torch.dtype,
+        intermediate_size: int,
+        input_is_parallel: bool = True,
+        params_dtype: Optional[torch.dtype] = None,
    ):
        super().__init__()
        self.act = act_module
+        self.input_is_parallel = input_is_parallel
+        if input_is_parallel:
+            tp_size = get_tensor_model_parallel_world_size()
+            intermediate_size_per_partition = divide(intermediate_size,
+                                                     tp_size)
+        else:
+            intermediate_size_per_partition = intermediate_size
+        if params_dtype is None:
+            params_dtype = torch.get_default_dtype()
        self.scales = nn.Parameter(
-            torch.empty(hidden_size, dtype=params_dtype, device="cuda"))
+            torch.empty(intermediate_size_per_partition,
+                        dtype=params_dtype,
+                        device="cuda"))
+        set_weight_attrs(self.scales, {"weight_loader": self.weight_loader})

-    def forward(self, x: torch.Tensor):
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.act(x) / self.scales

+    def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor):
+        param_data = param.data
+        if self.input_is_parallel:
+            tp_rank = get_tensor_model_parallel_rank()
+            shard_size = param_data.shape[0]
+            start_idx = tp_rank * shard_size
+            loaded_weight = loaded_weight.narrow(0, start_idx, shard_size)
+        assert param_data.shape == loaded_weight.shape
+        param_data.copy_(loaded_weight)
+

 _ACTIVATION_REGISTRY = {
    "gelu": nn.GELU(),
@ -76,6 +121,8 @@ def get_act_fn(
    act_fn_name: str,
    quant_config: Optional[QuantizationConfig] = None,
    intermediate_size: Optional[int] = None,
+    input_is_parallel: bool = True,
+    params_dtype: Optional[torch.dtype] = None,
 ) -> nn.Module:
    """Get an activation function by name."""
    act_fn_name = act_fn_name.lower()
@ -84,15 +131,11 @@ def get_act_fn(
            f"Activation function {act_fn_name!r} is not supported.")

    act_fn = _ACTIVATION_REGISTRY[act_fn_name]
-    if quant_config is not None:
-        if act_fn_name in quant_config.get_scaled_act_names():
-            if intermediate_size is None:
-                raise ValueError(
-                    "intermediate_size must be specified for scaled "
-                    "activation functions.")
-            return ScaledActivation(
-                act_fn,
-                intermediate_size,
-                params_dtype=torch.get_default_dtype(),
-            )
+    if (quant_config is not None
+            and act_fn_name in quant_config.get_scaled_act_names()):
+        if intermediate_size is None:
+            raise ValueError("intermediate_size must be specified for scaled "
+                             "activation functions.")
+        return ScaledActivation(act_fn, intermediate_size, input_is_parallel,
+                                params_dtype)
    return act_fn
--- a/vllm/model_executor/layers/attention.py
+++ b/vllm/model_executor/layers/attention.py
@ -1,5 +1,5 @@
 """Multi-head attention."""
-from typing import Any, Dict, List, Optional
+from typing import List, Optional

 import torch
 import torch.nn as nn
@ -7,10 +7,9 @@ from xformers import ops as xops
 from xformers.ops.fmha.attn_bias import (BlockDiagonalCausalMask,
                                         LowerTriangularMaskWithTensorBias)

-from vllm import attention_ops
-from vllm import cache_ops
+from vllm._C import ops
+from vllm._C import cache_ops
 from vllm.model_executor.input_metadata import InputMetadata
-from vllm.model_executor.layers.rotary_embedding import get_rope

 _SUPPORTED_HEAD_SIZES = [64, 80, 96, 112, 128, 256]
 # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
@ -18,38 +17,39 @@ _PARTITION_SIZE = 512


 class PagedAttention(nn.Module):
-    # pylint: disable=line-too-long
-    """GPT-style multi-head PagedAttention.
+    """MHA/MQA/GQA layer with PagedAttention.

    This class takes query, key, and value tensors as input. The input tensors
-    can either contain prompt tokens or generation tokens, in addition to
-    paddings.
-
+    can either contain prompt tokens or generation tokens.
    The class does the following:
-    1. Perform multi_query_kv_attention for the prompts. This operation does
-        not use the KV cache.
-    2. Wait for the cache operations (e.g., swap, copy) to finish. The cache
+
+    1. Wait for the cache operations (e.g., swap, copy) to finish. The cache
        operations are issued by the cache engine before executing the forward
        pass of the model, and they are executed asynchronously.
-    3. Reshape and store the input key and value tensors in the KV cache.
-    4. Perform single_query_cached_kv_attention for the generation tokens.
-        This operation reads the previous key and value tensors from the KV
-        cache.
-    5. Return the output tensor.
+    2. Reshape and store the input key and value tensors in the KV cache.
+    3. Perform (multi-head/multi-query/grouped-query) attention using either
+        xformers or the PagedAttention custom op.
+    4. Return the output tensor.
    """

-    def __init__(self,
-                 num_heads: int,
-                 head_size: int,
-                 scale: float,
-                 num_kv_heads: Optional[int] = None,
-                 sliding_window: Optional[int] = None) -> None:
+    def __init__(
+        self,
+        num_heads: int,
+        head_size: int,
+        scale: float,
+        num_kv_heads: Optional[int] = None,
+        alibi_slopes: Optional[List[float]] = None,
+        sliding_window: Optional[int] = None,
+    ) -> None:
        super().__init__()
        self.num_heads = num_heads
        self.head_size = head_size
        self.scale = float(scale)
        self.num_kv_heads = num_heads if num_kv_heads is None else num_kv_heads
        self.sliding_window = sliding_window
+        if alibi_slopes is not None:
+            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
+        self.register_buffer("alibi_slopes", alibi_slopes, persistent=False)

        assert self.num_heads % self.num_kv_heads == 0
        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
@ -61,148 +61,6 @@ class PagedAttention(nn.Module):
            raise ValueError(f"head_size ({self.head_size}) is not supported. "
                             f"Supported head sizes: {_SUPPORTED_HEAD_SIZES}.")

-    def set_attn_bias(
-        self,
-        input_metadata: InputMetadata,
-        dtype: torch.dtype,
-    ) -> None:
-        del dtype  # Unused.
-        if input_metadata.attn_bias is not None:
-            # Already set by a previous layer.
-            return
-        prompt_lens = [input_metadata.max_prompt_len
-                       ] * input_metadata.num_prompts
-        attn_bias = BlockDiagonalCausalMask.from_seqlens(prompt_lens)
-        if self.sliding_window is not None:
-            attn_bias = attn_bias.make_local_attention(self.sliding_window)
-        input_metadata.attn_bias = attn_bias
-
-    def multi_query_kv_attention(
-        self,
-        output: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        input_metadata: InputMetadata,
-    ) -> torch.Tensor:
-        """Normal attention for the prompt tokens.
-
-        Args:
-            output: shape = [num_prompt_tokens, num_heads, head_size]
-            query: shape = [num_prompt_tokens, num_heads, head_size]
-            key: shape = [num_prompt_tokens, num_kv_heads, head_size]
-            value: shape = [num_prompt_tokens, num_kv_heads, head_size]
-            input_metadata: metadata for paged attention.
-        """
-        if self.num_kv_heads != self.num_heads:
-            # Project the key and value tensors to the desired number of heads.
-            key = torch.repeat_interleave(key, self.num_queries_per_kv, dim=1)
-            value = torch.repeat_interleave(value,
-                                            self.num_queries_per_kv,
-                                            dim=1)
-
-        # TODO(woosuk): The unsqueeze op may incur some CPU overhead. Optimize.
-        out = xops.memory_efficient_attention_forward(
-            query.unsqueeze(0),
-            key.unsqueeze(0),
-            value.unsqueeze(0),
-            attn_bias=input_metadata.attn_bias,
-            p=0.0,
-            scale=self.scale,
-        )
-        # TODO(woosuk): Unnecessary copy. Optimize.
-        output.copy_(out.squeeze(0))
-        return output
-
-    def get_alibi_slopes(self) -> Optional[torch.Tensor]:
-        """Returns the slopes for the alibi attention bias.
-
-        Returns:
-            slopes: shape = [num_heads]
-        """
-        return None
-
-    def single_query_cached_kv_attention(
-        self,
-        output: torch.Tensor,
-        query: torch.Tensor,
-        key_cache: torch.Tensor,
-        value_cache: torch.Tensor,
-        input_metadata: InputMetadata,
-        alibi_slopes: Optional[torch.Tensor],
-    ) -> None:
-        """PagedAttention for the generation tokens.
-
-        Args:
-            output: shape = [num_generation_tokens, num_heads, head_size]
-            query: shape = [num_generation_tokens, num_heads, head_size]
-            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
-                block_size, x]
-            value_cache: shape = [num_blocks, num_kv_heads, head_size,
-                block_size]
-            input_metadata: metadata for paged attention.
-            alibi_slopes: shape = [num_heads]
-        """
-        block_size = value_cache.shape[3]
-        num_seqs, num_heads, head_size = query.shape
-        max_num_partitions = (
-            (input_metadata.max_context_len + _PARTITION_SIZE - 1) //
-            _PARTITION_SIZE)
-        # NOTE(woosuk): We use a simple heuristic to decide whether to use
-        # PagedAttention V1 or V2. If the number of partitions is 1, we use
-        # V1 to avoid the overhead of reduction. Also, if the number of
-        # sequences or heads is large, we use V1 since there is enough work
-        # to parallelize.
-        # TODO(woosuk): Tune this heuristic.
-        # For context len > 8192, use V2 kernel to avoid shared memory shortage.
-        use_v1 = input_metadata.max_context_len <= 8192 and (
-            max_num_partitions == 1 or num_seqs * num_heads > 512)
-        if use_v1:
-            # Run PagedAttention V1.
-            attention_ops.paged_attention_v1(
-                output,
-                query,
-                key_cache,
-                value_cache,
-                self.head_mapping,
-                self.scale,
-                input_metadata.block_tables,
-                input_metadata.context_lens,
-                block_size,
-                input_metadata.max_context_len,
-                alibi_slopes,
-            )
-        else:
-            # Run PagedAttention V2.
-            assert _PARTITION_SIZE % block_size == 0
-            tmp_output = torch.empty(
-                size=(num_seqs, num_heads, max_num_partitions, head_size),
-                dtype=output.dtype,
-                device=output.device,
-            )
-            exp_sums = torch.empty(
-                size=(num_seqs, num_heads, max_num_partitions),
-                dtype=torch.float32,
-                device=output.device,
-            )
-            max_logits = torch.empty_like(exp_sums)
-            attention_ops.paged_attention_v2(
-                output,
-                exp_sums,
-                max_logits,
-                tmp_output,
-                query,
-                key_cache,
-                value_cache,
-                self.head_mapping,
-                self.scale,
-                input_metadata.block_tables,
-                input_metadata.context_lens,
-                block_size,
-                input_metadata.max_context_len,
-                alibi_slopes,
-            )
-
    def forward(
        self,
        query: torch.Tensor,
@ -215,9 +73,6 @@ class PagedAttention(nn.Module):
    ) -> torch.Tensor:
        """PagedAttention forward pass.

-        NOTE: The query, key, and value tensors must be sliced from a qkv
-        tensor of shape [batch_size, seq_len, 3 * num_heads * head_size].
-
        Args:
            query: shape = [batch_size, seq_len, num_heads * head_size]
            key: shape = [batch_size, seq_len, num_kv_heads * head_size]
@ -226,226 +81,202 @@ class PagedAttention(nn.Module):
                block_size, x]
            value_cache: shape = [num_blocks, num_kv_heads, head_size,
                block_size]
-            input_metadata: metadata for paged attention.
+            input_metadata: metadata for the inputs.
            cache_event: event to wait for the cache operations to finish.
-
        Returns:
            shape = [batch_size, seq_len, num_heads * head_size]
        """
-        batch_size, seq_len, _ = query.shape
+        batch_size, seq_len, hidden_size = query.shape
        # Reshape the query, key, and value tensors.
        query = query.view(-1, self.num_heads, self.head_size)
        key = key.view(-1, self.num_kv_heads, self.head_size)
        value = value.view(-1, self.num_kv_heads, self.head_size)
+        slot_mapping = input_metadata.slot_mapping.flatten()

-        # Pre-allocate the output tensor.
-        output = torch.empty_like(query)
-
-        # Compute the attention op for prompts.
-        num_prompt_tokens = input_metadata.num_prompt_tokens
-        if num_prompt_tokens > 0:
-            # Prompt run.
-            assert input_metadata.num_generation_tokens == 0
-            self.set_attn_bias(input_metadata, dtype=query.dtype)
-            self.multi_query_kv_attention(
-                output,
-                query,
-                key,
-                value,
-                input_metadata,
-            )
-
-        # Wait until the cache op is done.
        if cache_event is not None:
            cache_event.wait()

        # Reshape the keys and values and store them in the cache.
-        # When key_cache and value_cache are not provided, the new key
-        # and value vectors will not be cached.
+        # If key_cache and value_cache are not provided, the new key and value
+        # vectors will not be cached. This happens during the initial memory
+        # profiling run.
        if key_cache is not None and value_cache is not None:
-            key_to_cache = key
-            value_to_cache = value
-            slot_mapping = input_metadata.slot_mapping.view(-1)
-            if input_metadata.to_cache is not None:
-                key_to_cache = key_to_cache[input_metadata.to_cache]
-                value_to_cache = value_to_cache[input_metadata.to_cache]
-                slot_mapping = slot_mapping[input_metadata.to_cache]
-
            cache_ops.reshape_and_cache(
-                key_to_cache,
-                value_to_cache,
+                key,
+                value,
                key_cache,
                value_cache,
                slot_mapping,
            )

-        if input_metadata.num_generation_tokens > 0:
+        if input_metadata.is_prompt:
+            # Prompt run.
+            if self.num_kv_heads != self.num_heads:
+                # As of Nov 2023, xformers only supports MHA. For MQA/GQA,
+                # project the key and value tensors to the desired number of
+                # heads.
+                # TODO(woosuk): Use MQA/GQA kernels for higher performance.
+                query = query.view(query.shape[0], self.num_kv_heads,
+                                   self.num_queries_per_kv, query.shape[-1])
+                key = key[:, :,
+                          None, :].expand(key.shape[0], self.num_kv_heads,
+                                          self.num_queries_per_kv,
+                                          key.shape[-1])
+                value = value[:, :, None, :].expand(value.shape[0],
+                                                    self.num_kv_heads,
+                                                    self.num_queries_per_kv,
+                                                    value.shape[-1])
+
+            # Set attention bias if not provided. This typically happens at the
+            # very attention layer of every iteration.
+            # FIXME(woosuk): This is a hack.
+            if input_metadata.attn_bias is None:
+                if self.alibi_slopes is None:
+                    attn_bias = BlockDiagonalCausalMask.from_seqlens(
+                        [seq_len] * batch_size)
+                    if self.sliding_window is not None:
+                        attn_bias = attn_bias.make_local_attention(
+                            self.sliding_window)
+                    input_metadata.attn_bias = attn_bias
+                else:
+                    input_metadata.attn_bias = _make_alibi_bias(
+                        self.alibi_slopes, batch_size, seq_len, query.dtype)
+
+            # TODO(woosuk): Too many view operations. Let's try to reduce them
+            # in the future for code readability.
+            if self.alibi_slopes is None:
+                query = query.unsqueeze(0)
+                key = key.unsqueeze(0)
+                value = value.unsqueeze(0)
+            else:
+                query = query.unflatten(0, (batch_size, seq_len))
+                key = key.unflatten(0, (batch_size, seq_len))
+                value = value.unflatten(0, (batch_size, seq_len))
+
+            out = xops.memory_efficient_attention_forward(
+                query,
+                key,
+                value,
+                attn_bias=input_metadata.attn_bias,
+                p=0.0,
+                scale=self.scale,
+            )
+            output = out.view_as(query)
+        else:
            # Decoding run.
-            assert input_metadata.num_prompt_tokens == 0
-            assert key_cache is not None and value_cache is not None, (
-                "key_cache and value_cache must be provided when "
-                "generating tokens.")
-            # Compute the attention op for generation tokens.
-            self.single_query_cached_kv_attention(output, query, key_cache,
-                                                  value_cache, input_metadata,
-                                                  self.get_alibi_slopes())
+            output = _paged_attention(
+                query,
+                key_cache,
+                value_cache,
+                input_metadata,
+                self.head_mapping,
+                self.scale,
+                self.alibi_slopes,
+            )

        # Reshape the output tensor.
-        # NOTE(woosuk): The output tensor may include paddings.
-        return output.view(batch_size, seq_len,
-                           self.num_heads * self.head_size)
+        return output.view(batch_size, seq_len, hidden_size)


-class PagedAttentionWithRoPE(PagedAttention):
-    """PagedAttention with rotary positional embedding."""
+def _make_alibi_bias(
+    alibi_slopes: torch.Tensor,
+    batch_size: int,
+    seq_len: int,
+    dtype: torch.dtype,
+) -> LowerTriangularMaskWithTensorBias:
+    bias = torch.arange(seq_len, dtype=dtype)
+    # NOTE(zhuohan): HF uses
+    #     `bias = bias[None, :].repeat(prompt_len, 1)`
+    # here. We find that both biases give the same results, but
+    # the bias below more accurately follows the original ALiBi
+    # paper.
+    bias = bias[None, :] - bias[:, None]
+    bias = bias.to(alibi_slopes.device)

-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        rotary_dim: int,
-        max_position: int = 8192,
-        base: int = 10000,
-        num_kv_heads: Optional[int] = None,
-        is_neox_style: bool = True,
-        rope_scaling: Optional[Dict[str, Any]] = None,
-        sliding_window: Optional[int] = None,
-    ) -> None:
-        super().__init__(num_heads,
-                         head_size,
-                         scale,
-                         num_kv_heads,
-                         sliding_window=sliding_window)
-        self.rotary_emb = get_rope(head_size, rotary_dim, max_position, base,
-                                   is_neox_style, rope_scaling)
+    # When using custom attention bias, xformers requires the bias to
+    # be sliced from a tensor whose length is a multiple of 8.
+    padded_len = (seq_len + 7) // 8 * 8
+    bias = torch.empty(
+        batch_size,
+        alibi_slopes.shape[0],
+        seq_len,
+        padded_len,
+        device=alibi_slopes.device,
+        dtype=dtype,
+    )[:, :, :, :seq_len].copy_(bias)
+    bias.mul_(alibi_slopes[:, None, None])
+    attn_bias = LowerTriangularMaskWithTensorBias(bias)
+    return attn_bias

-    def forward(
-        self,
-        positions: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        key_cache: torch.Tensor,
-        value_cache: torch.Tensor,
-        input_metadata: InputMetadata,
-        cache_event: Optional[torch.cuda.Event],
-    ) -> torch.Tensor:
-        """ PagedAttention forward pass with rotary embedding.

-        Args:
-            positions: shape = [batch_size, seq_len]
-            query: shape = [batch_size, seq_len, num_heads * head_size]
-            key: shape = [batch_size, seq_len, num_kv_heads * head_size]
-            value: shape = [batch_size, seq_len, num_kv_heads * head_size]
-            key_cache: shape = [num_blocks, num_kv_heads, head_size/x,
-                block_size, x]
-            value_cache: shape = [num_blocks, num_kv_heads, head_size,
-                block_size]
-            input_metadata: metadata for paged attention.
-            cache_event: event to wait for the cache operations to finish.
+def _paged_attention(
+    query: torch.Tensor,
+    key_cache: torch.Tensor,
+    value_cache: torch.Tensor,
+    input_metadata: InputMetadata,
+    head_mapping: torch.Tensor,
+    scale: float,
+    alibi_slopes: Optional[torch.Tensor],
+) -> torch.Tensor:
+    output = torch.empty_like(query)

-        Returns:
-            shape = [batch_size, seq_len, num_heads * head_size]
-        """
-
-        # Apply rotary embedding to the query and key before passing them
-        # to the attention op.
-        query, key = self.rotary_emb(positions, query, key)
-        return super().forward(
+    block_size = value_cache.shape[3]
+    num_seqs, num_heads, head_size = query.shape
+    max_num_partitions = (
+        (input_metadata.max_context_len + _PARTITION_SIZE - 1) //
+        _PARTITION_SIZE)
+    # NOTE(woosuk): We use a simple heuristic to decide whether to use
+    # PagedAttention V1 or V2. If the number of partitions is 1, we use
+    # V1 to avoid the overhead of reduction. Also, if the number of
+    # sequences or heads is large, we use V1 since there is enough work
+    # to parallelize.
+    # TODO(woosuk): Tune this heuristic.
+    # For context len > 8192, use V2 kernel to avoid shared memory shortage.
+    use_v1 = input_metadata.max_context_len <= 8192 and (
+        max_num_partitions == 1 or num_seqs * num_heads > 512)
+    if use_v1:
+        # Run PagedAttention V1.
+        ops.paged_attention_v1(
+            output,
            query,
-            key,
-            value,
            key_cache,
            value_cache,
-            input_metadata,
-            cache_event,
+            head_mapping,
+            scale,
+            input_metadata.block_tables,
+            input_metadata.context_lens,
+            block_size,
+            input_metadata.max_context_len,
+            alibi_slopes,
        )
-
-
-class PagedAttentionWithALiBi(PagedAttention):
-    """PagedAttention with ALiBi attention bias."""
-
-    def __init__(self,
-                 num_heads: int,
-                 head_size: int,
-                 scale: float,
-                 slopes: List[float],
-                 num_kv_heads: Optional[int] = None) -> None:
-        super().__init__(num_heads, head_size, scale, num_kv_heads)
-        assert len(slopes) == num_heads
-
-        slopes = torch.tensor(slopes, dtype=torch.float32)
-        self.register_buffer("alibi_slopes", slopes, persistent=False)
-
-    def set_attn_bias(self, input_metadata: InputMetadata,
-                      dtype: torch.dtype) -> None:
-        if input_metadata.attn_bias is not None:
-            # Already set by a previous layer.
-            return
-        # Generates ALiBi mask based on the max prompt length.
-        max_prompt_len = input_metadata.max_prompt_len
-        bias = torch.arange(max_prompt_len, dtype=dtype)
-        # NOTE(zhuohan): HF uses
-        #     `bias = bias[None, :].repeat(prompt_len, 1)`
-        # here. We find that both biases give the same results, but
-        # the bias below more accurately follows the original ALiBi
-        # paper.
-        bias = bias[None, :] - bias[:, None]
-        bias = bias.to(self.alibi_slopes.device)
-
-        # When using custom attention bias, xformers requires the bias to
-        # be sliced from a tensor whose length is a multiple of 8.
-        padded_len = (max_prompt_len + 7) // 8 * 8
-        bias = torch.empty(
-            input_metadata.num_prompts,
-            self.num_heads,
-            max_prompt_len,
-            padded_len,
-            device=self.alibi_slopes.device,
-            dtype=dtype,
-        )[:, :, :, :max_prompt_len].copy_(bias)
-        bias.mul_(self.alibi_slopes[:, None, None])
-        attn_bias = LowerTriangularMaskWithTensorBias(bias)
-        input_metadata.attn_bias = attn_bias
-
-    def multi_query_kv_attention(
-        self,
-        output: torch.Tensor,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        input_metadata: InputMetadata,
-    ) -> torch.Tensor:
-        """Attention with ALiBi bias for the prompt tokens.
-
-        Args:
-            output: shape = [num_prompt_tokens, num_heads, head_size]
-            query: shape = [num_prompt_tokens, num_heads, head_size]
-            key: shape = [num_prompt_tokens, num_kv_heads, head_size]
-            value: shape = [num_prompt_tokens, num_kv_heads, head_size]
-            input_metadata: metadata for paged attention.
-        """
-        if self.num_kv_heads != self.num_heads:
-            # Project the key and value tensors to the desired number of heads.
-            key = torch.repeat_interleave(key, self.num_queries_per_kv, dim=1)
-            value = torch.repeat_interleave(value,
-                                            self.num_queries_per_kv,
-                                            dim=1)
-        batch_size = input_metadata.num_prompts
-        seq_len = input_metadata.max_prompt_len
-
-        out = xops.memory_efficient_attention_forward(
-            query.view(batch_size, seq_len, self.num_heads, self.head_size),
-            key.view(batch_size, seq_len, self.num_heads, self.head_size),
-            value.view(batch_size, seq_len, self.num_heads, self.head_size),
-            attn_bias=input_metadata.attn_bias,
-            p=0.0,
-            scale=self.scale,
+    else:
+        # Run PagedAttention V2.
+        assert _PARTITION_SIZE % block_size == 0
+        tmp_output = torch.empty(
+            size=(num_seqs, num_heads, max_num_partitions, head_size),
+            dtype=output.dtype,
+            device=output.device,
        )
-        # TODO(woosuk): Unnecessary copy. Optimize.
-        output.copy_(out.view(-1, self.num_heads, self.head_size))
-        return output
-
-    def get_alibi_slopes(self) -> Optional[torch.Tensor]:
-        return self.alibi_slopes
+        exp_sums = torch.empty(
+            size=(num_seqs, num_heads, max_num_partitions),
+            dtype=torch.float32,
+            device=output.device,
+        )
+        max_logits = torch.empty_like(exp_sums)
+        ops.paged_attention_v2(
+            output,
+            exp_sums,
+            max_logits,
+            tmp_output,
+            query,
+            key_cache,
+            value_cache,
+            head_mapping,
+            scale,
+            input_metadata.block_tables,
+            input_metadata.context_lens,
+            block_size,
+            input_metadata.max_context_len,
+            alibi_slopes,
+        )
+    return output
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@ -4,7 +4,7 @@ from typing import Optional, Tuple, Union
 import torch
 import torch.nn as nn

-from vllm import layernorm_ops
+from vllm._C import ops


 class RMSNorm(nn.Module):
@ -23,13 +23,33 @@ class RMSNorm(nn.Module):
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

+    def _forward(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        """PyTorch-native implementation equivalent to forward()."""
+        orig_dtype = x.dtype
+        x = x.to(torch.float32)
+        if residual is not None:
+            x = x + residual.to(torch.float32)
+            residual = x.to(orig_dtype)
+
+        variance = x.pow(2).mean(dim=-1, keepdim=True)
+        x = x * torch.rsqrt(variance + self.variance_epsilon)
+        x = x.to(orig_dtype) * self.weight
+        if residual is None:
+            return x
+        else:
+            return x, residual
+
    def forward(
        self,
        x: torch.Tensor,
        residual: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        if residual is not None:
-            layernorm_ops.fused_add_rms_norm(
+            ops.fused_add_rms_norm(
                x,
                residual,
                self.weight.data,
@ -37,7 +57,7 @@ class RMSNorm(nn.Module):
            )
            return x, residual
        out = torch.empty_like(x)
-        layernorm_ops.rms_norm(
+        ops.rms_norm(
            out,
            x,
            self.weight.data,
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional
 import torch
 from torch.nn.parameter import Parameter

-from vllm import quantization_ops
+from vllm._C import ops
 from vllm.model_executor.layers.linear import (LinearMethodBase,
                                               set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
@ -50,7 +50,7 @@ class AWQConfig(QuantizationConfig):
    def get_config_filenames() -> List[str]:
        return [
            "quant_config.json",  # E.g., casperhansen/vicuna-7b-v1.5-awq
-            "quantize_config.json",  # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq  # pylint: disable=line-too-long
+            "quantize_config.json",  # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq
        ]

    @classmethod
@ -151,8 +151,7 @@ class AWQLinearMethod(LinearMethodBase):
        pack_factor = self.quant_config.pack_factor
        out_shape = (x.shape[:-1] + (qweight.shape[-1] * pack_factor, ))
        reshaped_x = x.reshape(-1, x.shape[-1])
-        out = quantization_ops.awq_gemm(reshaped_x, qweight, scales, qzeros,
-                                        pack_factor)
+        out = ops.awq_gemm(reshaped_x, qweight, scales, qzeros, pack_factor)
        if bias is not None:
            out = out + bias
        return out.reshape(out_shape)
--- a/vllm/model_executor/layers/quantization/squeezellm.py
+++ b/vllm/model_executor/layers/quantization/squeezellm.py
@ -3,7 +3,7 @@ from typing import Any, Dict, List, Optional
 import torch
 from torch.nn.parameter import Parameter

-from vllm import quantization_ops
+from vllm._C import ops
 from vllm.model_executor.layers.linear import (LinearMethodBase,
                                               set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
@ -116,8 +116,7 @@ class SqueezeLLMLinearMethod(LinearMethodBase):
        reshaped_x = x.reshape(-1, x.shape[-1])
        # NOTE: The output tensor should be zero-initialized.
        out = torch.zeros(out_shape, device="cuda", dtype=torch.float16)
-        quantization_ops.squeezellm_gemm(reshaped_x, qweight, out,
-                                         lookup_table)
+        ops.squeezellm_gemm(reshaped_x, qweight, out, lookup_table)

        if bias is not None:
            out = out + bias
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@ -27,7 +27,20 @@ from typing import Any, Dict, Optional, Tuple, Union
 import torch
 import torch.nn as nn

-from vllm import pos_encoding_ops
+from vllm._C import ops
+
+
+def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
+    x1 = x[..., :x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2:]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def _rotate_gptj(x: torch.Tensor) -> torch.Tensor:
+    x1 = x[..., ::2]
+    x2 = x[..., 1::2]
+    x = torch.stack((-x2, x1), dim=-1)
+    return x.flatten(-2)


 class RotaryEmbedding(nn.Module):
@ -81,17 +94,57 @@ class RotaryEmbedding(nn.Module):
        cache = torch.cat((cos, sin), dim=-1)
        return cache

+    def _forward(
+        self,
+        positions: torch.Tensor,
+        query: torch.Tensor,
+        key: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """PyTorch-native implementation equivalent to forward()."""
+        query = query.view(*query.shape[:-1], -1, self.head_size)
+        key = key.view(*key.shape[:-1], -1, self.head_size)
+
+        query_rot = query[..., :self.rotary_dim]
+        key_rot = key[..., :self.rotary_dim]
+        if self.rotary_dim < self.head_size:
+            query_pass = query[..., self.rotary_dim:]
+            key_pass = key[..., self.rotary_dim:]
+
+        cos_sin = self.cos_sin_cache[positions]
+        cos, sin = cos_sin.chunk(2, dim=-1)
+        if self.is_neox_style:
+            # NOTE(woosuk): Here we assume that the positions tensor has the
+            # shape [batch_size, seq_len].
+            cos = cos.repeat(1, 1, 2).unsqueeze(-2)
+            sin = sin.repeat(1, 1, 2).unsqueeze(-2)
+        else:
+            cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2)
+            sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2)
+
+        rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj
+        query_rot = query_rot * cos + rotate_fn(query_rot) * sin
+        key_rot = key_rot * cos + rotate_fn(key_rot) * sin
+
+        if self.rotary_dim < self.head_size:
+            query = torch.cat((query_rot, query_pass), dim=-1)
+            key = torch.cat((key_rot, key_pass), dim=-1)
+        else:
+            query = query_rot
+            key = key_rot
+        query = query.flatten(-2)
+        key = key.flatten(-2)
+        return query, key
+
    def forward(
        self,
        positions: torch.Tensor,
        query: torch.Tensor,
        key: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        # pos_encoding_ops.rotary_embedding() is an in-place operation that
+        # ops.rotary_embedding() is an in-place operation that
        # updates the query and key tensors.
-        pos_encoding_ops.rotary_embedding(positions, query, key,
-                                          self.head_size, self.cos_sin_cache,
-                                          self.is_neox_style)
+        ops.rotary_embedding(positions, query, key, self.head_size,
+                             self.cos_sin_cache, self.is_neox_style)
        return query, key


@ -273,14 +326,22 @@ class YaRNScalingRotaryEmbedding(RotaryEmbedding):
        return cache


+_ROPE_DICT: Dict[Tuple, RotaryEmbedding] = {}
+
+
 def get_rope(
    head_size: int,
    rotary_dim: int,
    max_position: int,
    base: int,
-    is_neox_style: bool,
-    rope_scaling: Optional[Dict[str, Any]],
+    is_neox_style: bool = True,
+    rope_scaling: Optional[Dict[str, Any]] = None,
 ) -> RotaryEmbedding:
+    key = (head_size, rotary_dim, max_position, base, is_neox_style,
+           tuple(rope_scaling.items()) if rope_scaling is not None else None)
+    if key in _ROPE_DICT:
+        return _ROPE_DICT[key]
+
    if rope_scaling is None:
        rotary_emb = RotaryEmbedding(head_size, rotary_dim, max_position, base,
                                     is_neox_style)
@ -313,4 +374,5 @@ def get_rope(
                                                    **extra_kwargs)
        else:
            raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+    _ROPE_DICT[key] = rotary_emb
    return rotary_emb
--- a/vllm/model_executor/layers/sampler.py
+++ b/vllm/model_executor/layers/sampler.py
@ -4,12 +4,12 @@ from typing import Dict, List, Optional, Tuple
 import torch
 import torch.nn as nn

-from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.parallel_utils.communication_op import (
    tensor_model_parallel_all_gather)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sampling_params import SamplingParams, SamplingType
 from vllm.sequence import (PromptLogprobs, SampleLogprobs, SamplerOutput,
-                           SequenceData, SequenceGroupOutputs, SequenceOutputs)
+                           SequenceData, SequenceGroupOutput, SequenceOutput)

 _SAMPLING_EPS = 1e-5

@ -21,7 +21,7 @@ class Sampler(nn.Module):
    1. Discard the hidden states that are not used for sampling (i.e., all
        tokens except the final one in each prompt).
    2. Compute the logits for the next tokens.
-    3. Apply presence and frequency penalties.
+    3. Apply presence, frequency and repetition penalties.
    4. Apply temperature scaling.
    5. Apply top-p and top-k truncation.
    6. Sample the next tokens.
@ -37,31 +37,30 @@ class Sampler(nn.Module):
        self,
        embedding: torch.Tensor,
        hidden_states: torch.Tensor,
-        input_metadata: InputMetadata,
+        sampling_metadata: SamplingMetadata,
        embedding_bias: Optional[torch.Tensor] = None,
    ) -> SamplerOutput:
        # Get the hidden states that we use for sampling.
-        hidden_states = _prune_hidden_states(hidden_states, input_metadata)
+        hidden_states = _prune_hidden_states(hidden_states, sampling_metadata)

        # Get the logits for the next tokens.
        logits = _get_logits(hidden_states, embedding, embedding_bias,
                             self.vocab_size)

        # Apply logits processors (if any).
-        logits = _apply_logits_processors(logits, input_metadata)
+        logits = _apply_logits_processors(logits, sampling_metadata)
        # Apply presence and frequency penalties.
-        output_tokens = _get_output_tokens(input_metadata)
-        assert len(output_tokens) == logits.shape[0]
        presence_penalties, frequency_penalties, repetition_penalties = (
-            _get_penalties(input_metadata))
+            _get_penalties(sampling_metadata))
        assert len(presence_penalties) == logits.shape[0]
        assert len(frequency_penalties) == logits.shape[0]
        assert len(repetition_penalties) == logits.shape[0]
-        logits = _apply_penalties(logits, output_tokens, presence_penalties,
-                                  frequency_penalties, repetition_penalties)
+        logits = _apply_penalties(logits, sampling_metadata,
+                                  presence_penalties, frequency_penalties,
+                                  repetition_penalties)

        # Apply temperature scaling.
-        temperatures = _get_temperatures(input_metadata)
+        temperatures = _get_temperatures(sampling_metadata)
        assert len(temperatures) == logits.shape[0]
        if any(t != 1.0 for t in temperatures):
            t = torch.tensor(temperatures,
@ -72,7 +71,7 @@ class Sampler(nn.Module):

        # Apply top-p and top-k truncation.
        top_ps, top_ks, min_ps = _get_top_p_top_k_min_p(
-            input_metadata, self.vocab_size)
+            sampling_metadata, self.vocab_size)
        assert len(top_ps) == len(top_ks) == logits.shape[0]
        do_top_p = any(p < 1.0 - _SAMPLING_EPS for p in top_ps)
        do_top_k = any(k != self.vocab_size for k in top_ks)
@ -91,11 +90,11 @@ class Sampler(nn.Module):
        logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)

        # Sample the next tokens.
-        sample_results = _sample(probs, logprobs, input_metadata)
+        sample_results = _sample(probs, logprobs, sampling_metadata)
        # Get the logprobs query results.
        prompt_logprobs, sample_logprobs = _get_logprobs(
-            logprobs, input_metadata, sample_results)
-        return _build_sampler_output(sample_results, input_metadata,
+            logprobs, sampling_metadata, sample_results)
+        return _build_sampler_output(sample_results, sampling_metadata,
                                     prompt_logprobs, sample_logprobs)


@ -114,29 +113,30 @@ def _get_logits(hidden_states: torch.Tensor, embedding: torch.Tensor,

 def _prune_hidden_states(
    hidden_states: torch.Tensor,
-    input_metadata: InputMetadata,
+    sampling_metadata: SamplingMetadata,
 ) -> torch.Tensor:
    hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
-    return hidden_states.index_select(0, input_metadata.selected_token_indices)
+    return hidden_states.index_select(0,
+                                      sampling_metadata.selected_token_indices)


 def _get_penalties(
-    input_metadata: InputMetadata
+    sampling_metadata: SamplingMetadata
 ) -> Tuple[List[float], List[float], List[float]]:
    # Collect the presence and frequency penalties.
    presence_penalties: List[float] = []
    frequency_penalties: List[float] = []
    repetition_penalties: List[float] = []
-    for i, seq_group in enumerate(input_metadata.seq_groups):
+    for i, seq_group in enumerate(sampling_metadata.seq_groups):
        seq_ids, sampling_params = seq_group
        p = sampling_params.presence_penalty
        f = sampling_params.frequency_penalty
        r = sampling_params.repetition_penalty
-        if (i < input_metadata.num_prompts
+        if (i < sampling_metadata.num_prompts
                and sampling_params.prompt_logprobs is not None):
            # NOTE: We do not apply presence and frequency penalties for the
            # prompt token positions where we don't sample new tokens.
-            prompt_len = input_metadata.prompt_lens[i]
+            prompt_len = sampling_metadata.prompt_lens[i]
            presence_penalties += [0] * (prompt_len - 1)
            frequency_penalties += [0] * (prompt_len - 1)
            repetition_penalties += [1] * (prompt_len - 1)
@ -146,33 +146,66 @@ def _get_penalties(
    return presence_penalties, frequency_penalties, repetition_penalties


-def _get_output_tokens(input_metadata: InputMetadata) -> List[List[int]]:
+def _get_prompt_and_output_tokens(
+    sampling_metadata: SamplingMetadata,
+) -> Tuple[List[List[int]], List[List[int]]]:
+    prompt_tokens: List[List[int]] = []
    output_tokens: List[List[int]] = []
-    for i, seq_group in enumerate(input_metadata.seq_groups):
+    for i, seq_group in enumerate(sampling_metadata.seq_groups):
        seq_ids, sampling_params = seq_group
-        if (i < input_metadata.num_prompts
+        if (i < sampling_metadata.num_prompts
                and sampling_params.prompt_logprobs is not None):
            # NOTE: prompt token positions do not need output tokens to
            # compute penalties.
-            prompt_len = input_metadata.prompt_lens[i]
+            prompt_len = sampling_metadata.prompt_lens[i]
+            prompt_tokens.extend([] for _ in range(prompt_len - 1))
            output_tokens.extend([] for _ in range(prompt_len - 1))
        for seq_id in seq_ids:
-            seq_data = input_metadata.seq_data[seq_id]
+            seq_data = sampling_metadata.seq_data[seq_id]
+            prompt_tokens.append(seq_data.prompt_token_ids)
            output_tokens.append(seq_data.output_token_ids)
-    return output_tokens
+    return prompt_tokens, output_tokens


-def _apply_logits_processors(logits: torch.Tensor,
-                             input_metadata: InputMetadata) -> torch.Tensor:
+def _get_bin_counts_and_mask(
+    logits: torch.Tensor,
+    tokens: List[List[int]],
+    vocab_size: int,
+    num_seqs: int,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    max_len = max(len(tokens) for tokens in tokens)
+    padded_tokens = [
+        tokens + [vocab_size] * (max_len - len(tokens)) for tokens in tokens
+    ]
+    tokens_tensor = torch.tensor(padded_tokens,
+                                 dtype=torch.long,
+                                 device=logits.device)
+
+    # Compute the bin counts for the tokens.
+    # vocab_size + 1 for padding.
+    bin_counts = torch.zeros((num_seqs, vocab_size + 1),
+                             dtype=torch.long,
+                             device=logits.device)
+    bin_counts.scatter_add_(1, tokens_tensor, torch.ones_like(tokens_tensor))
+    bin_counts = bin_counts[:, :vocab_size]
+    mask = bin_counts > 0
+
+    return bin_counts, mask
+
+
+def _apply_logits_processors(
+    logits: torch.Tensor,
+    sampling_metadata: SamplingMetadata,
+) -> torch.Tensor:
    logits_row_idx = 0
    found_logits_processors = False
-    for seq_ids, sampling_params in input_metadata.seq_groups:
+    for seq_ids, sampling_params in sampling_metadata.seq_groups:
        logits_processors = sampling_params.logits_processors
        if logits_processors:
            found_logits_processors = True
            for seq_id in seq_ids:
                logits_row = logits[logits_row_idx]
-                token_ids = input_metadata.seq_data[seq_id].output_token_ids
+                token_ids = sampling_metadata.seq_data[seq_id].output_token_ids
                for logits_processor in logits_processors:
                    logits_row = logits_processor(token_ids, logits_row)
                logits[logits_row_idx] = logits_row
@ -186,15 +219,13 @@ def _apply_logits_processors(logits: torch.Tensor,

 def _apply_penalties(
    logits: torch.Tensor,
-    output_tokens: List[List[int]],
+    sampling_metadata: SamplingMetadata,
    presence_penalties: List[float],
    frequency_penalties: List[float],
    repetition_penalties: List[float],
 ) -> torch.Tensor:
    num_seqs, vocab_size = logits.shape
    for i in range(num_seqs):
-        if not output_tokens[i]:
-            continue
        p = presence_penalties[i]
        f = frequency_penalties[i]
        r = repetition_penalties[i]
@ -206,24 +237,15 @@ def _apply_penalties(
        # Return early if all sequences have zero penalties.
        return logits

-    max_output_len = max(len(tokens) for tokens in output_tokens)
-    padded_output_tokens = [
-        tokens + [vocab_size] * (max_output_len - len(tokens))
-        for tokens in output_tokens
-    ]
-    output_tokens_tensor = torch.tensor(padded_output_tokens,
-                                        dtype=torch.long,
-                                        device=logits.device)
+    prompt_tokens, output_tokens = (
+        _get_prompt_and_output_tokens(sampling_metadata))
+    assert len(prompt_tokens) == logits.shape[0]
+    assert len(output_tokens) == logits.shape[0]

-    # Compute the bin counts for the output tokens.
-    # vocab_size + 1 for padding.
-    bin_counts = torch.zeros((num_seqs, vocab_size + 1),
-                             dtype=torch.long,
-                             device=logits.device)
-    bin_counts.scatter_add_(1, output_tokens_tensor,
-                            torch.ones_like(output_tokens_tensor))
-    bin_counts = bin_counts[:, :vocab_size]  # Remove the padding bin.
-    mask = bin_counts > 0
+    prompt_bin_counts, prompt_mask = _get_bin_counts_and_mask(
+        logits, prompt_tokens, vocab_size, num_seqs)
+    output_bin_counts, output_mask = _get_bin_counts_and_mask(
+        logits, output_tokens, vocab_size, num_seqs)

    repetition_penalties = torch.tensor(repetition_penalties,
                                        dtype=logits.dtype,
@ -236,21 +258,21 @@ def _apply_penalties(
                                      device=logits.device)

    repetition_penalties = repetition_penalties[:, None].repeat(1, vocab_size)
-    repetition_penalties[~mask] = 1.0
+    repetition_penalties[~(prompt_mask | output_mask)] = 1.0
    logits = torch.where(logits > 0, logits / repetition_penalties,
                         logits * repetition_penalties)

    # We follow the definition in OpenAI API.
    # Refer to https://platform.openai.com/docs/api-reference/parameter-details
-    logits -= frequency_penalties.unsqueeze(dim=1) * bin_counts
-    logits -= presence_penalties.unsqueeze(dim=1) * mask
+    logits -= frequency_penalties.unsqueeze(dim=1) * output_bin_counts
+    logits -= presence_penalties.unsqueeze(dim=1) * output_mask
    return logits


-def _get_temperatures(input_metadata: InputMetadata) -> List[float]:
+def _get_temperatures(sampling_metadata: SamplingMetadata) -> List[float]:
    # Collect the temperatures for the logits.
    temperatures: List[float] = []
-    for i, seq_group in enumerate(input_metadata.seq_groups):
+    for i, seq_group in enumerate(sampling_metadata.seq_groups):
        seq_ids, sampling_params = seq_group
        temperature = sampling_params.temperature
        if temperature < _SAMPLING_EPS:
@ -258,22 +280,22 @@ def _get_temperatures(input_metadata: InputMetadata) -> List[float]:
            # (i.e., greedy sampling or beam search).
            # Set the temperature to 1 to avoid division by zero.
            temperature = 1.0
-        if (i < input_metadata.num_prompts
+        if (i < sampling_metadata.num_prompts
                and sampling_params.prompt_logprobs is not None):
-            prompt_len = input_metadata.prompt_lens[i]
+            prompt_len = sampling_metadata.prompt_lens[i]
            temperatures += [temperature] * (prompt_len - 1)
        temperatures += [temperature] * len(seq_ids)
    return temperatures


 def _get_top_p_top_k_min_p(
-    input_metadata: InputMetadata,
+    sampling_metadata: SamplingMetadata,
    vocab_size: int,
 ) -> Tuple[List[float], List[int], List[float]]:
    top_ps: List[float] = []
    top_ks: List[int] = []
    min_ps: List[float] = []
-    for i, seq_group in enumerate(input_metadata.seq_groups):
+    for i, seq_group in enumerate(sampling_metadata.seq_groups):
        seq_ids, sampling_params = seq_group
        top_p = sampling_params.top_p
        min_p = sampling_params.min_p
@ -281,9 +303,9 @@ def _get_top_p_top_k_min_p(
        top_k = min(sampling_params.top_k, vocab_size)
        # k=-1 means no truncation.
        top_k = vocab_size if top_k == -1 else top_k
-        if (i < input_metadata.num_prompts
+        if (i < sampling_metadata.num_prompts
                and sampling_params.prompt_logprobs is not None):
-            prompt_len = input_metadata.prompt_lens[i]
+            prompt_len = sampling_metadata.prompt_lens[i]
            top_ps += [top_p] * (prompt_len - 1)
            top_ks += [top_k] * (prompt_len - 1)
            min_ps += [min_p] * (prompt_len - 1)
@ -453,11 +475,11 @@ def _beam_search_sample(
 def _sample(
    probs: torch.Tensor,
    logprobs: torch.Tensor,
-    input_metadata: InputMetadata,
+    sampling_metadata: SamplingMetadata,
 ) -> List[Tuple[List[int], List[int]]]:
    categorized_seq_group_ids = {t: [] for t in SamplingType}
-    categorized_sample_indices = input_metadata.categorized_sample_indices
-    for i, seq_group in enumerate(input_metadata.seq_groups):
+    categorized_sample_indices = sampling_metadata.categorized_sample_indices
+    for i, seq_group in enumerate(sampling_metadata.seq_groups):
        _, sampling_params = seq_group
        sampling_type = sampling_params.sampling_type
        categorized_seq_group_ids[sampling_type].append(i)
@ -465,8 +487,8 @@ def _sample(
    sample_results_dict: Dict[int, Tuple[List[int], List[int]]] = {}
    for sampling_type in SamplingType:
        seq_group_ids = categorized_seq_group_ids[sampling_type]
-        seq_groups = [input_metadata.seq_groups[i] for i in seq_group_ids]
-        is_prompts = [i < input_metadata.num_prompts for i in seq_group_ids]
+        seq_groups = [sampling_metadata.seq_groups[i] for i in seq_group_ids]
+        is_prompts = [i < sampling_metadata.num_prompts for i in seq_group_ids]
        sample_indices = categorized_sample_indices[sampling_type]
        num_tokens = len(sample_indices)
        if num_tokens == 0:
@ -481,21 +503,22 @@ def _sample(
        elif sampling_type == SamplingType.BEAM:
            category_logprobs = logprobs[sample_indices]
            sample_results = _beam_search_sample(seq_groups, is_prompts,
-                                                 input_metadata.seq_data,
+                                                 sampling_metadata.seq_data,
                                                 category_logprobs)
        else:
            raise ValueError(f"Unsupported sampling type: {sampling_type}")
        sample_results_dict.update(zip(seq_group_ids, sample_results))

    sample_results = [
-        sample_results_dict[i] for i in range(len(input_metadata.seq_groups))
+        sample_results_dict[i]
+        for i in range(len(sampling_metadata.seq_groups))
    ]
    return sample_results


 def _get_logprobs(
    logprobs: torch.Tensor,
-    input_metadata: InputMetadata,
+    sampling_metadata: SamplingMetadata,
    sample_results: List[Tuple[List[int], List[int]]],
 ) -> Tuple[List[Optional[List[Optional[Dict[int, float]]]]], List[List[Dict[
        int, float]]]]:
@ -505,16 +528,16 @@ def _get_logprobs(
    largest_num_logprobs = 0
    sample_idx = 0
    for i, (seq_group, sample_result) in enumerate(
-            zip(input_metadata.seq_groups, sample_results)):
+            zip(sampling_metadata.seq_groups, sample_results)):
        seq_ids, sampling_params = seq_group
        next_token_ids, parent_ids = sample_result
        num_parent_seqs = len(seq_ids)
-        if (i < input_metadata.num_prompts
+        if (i < sampling_metadata.num_prompts
                and sampling_params.prompt_logprobs is not None):
            largest_num_logprobs = max(largest_num_logprobs,
                                       sampling_params.prompt_logprobs)
-            prompt_len = input_metadata.prompt_lens[i]
-            prompt_tokens = input_metadata.seq_data[
+            prompt_len = sampling_metadata.prompt_lens[i]
+            prompt_tokens = sampling_metadata.seq_data[
                seq_ids[0]].prompt_token_ids
            batched_logprobs_query_seq_indices.extend(
                sample_idx + j for j in range(prompt_len - 1))
@ -552,16 +575,16 @@ def _get_logprobs(
    sample_idx = 0
    query_result_idx = 0
    for i, (seq_group, sample_result) in enumerate(
-            zip(input_metadata.seq_groups, sample_results)):
+            zip(sampling_metadata.seq_groups, sample_results)):
        seq_ids, sampling_params = seq_group
        next_token_ids, parent_ids = sample_result

        # Prompt logprobs
-        if (i < input_metadata.num_prompts
+        if (i < sampling_metadata.num_prompts
                and sampling_params.prompt_logprobs is not None):
            num_logprobs = sampling_params.prompt_logprobs
-            prompt_len = input_metadata.prompt_lens[i]
-            prompt_tokens = input_metadata.seq_data[
+            prompt_len = sampling_metadata.prompt_lens[i]
+            prompt_tokens = sampling_metadata.seq_data[
                seq_ids[0]].prompt_token_ids
            group_prompt_logprobs: PromptLogprobs = [None]
            for token_id in prompt_tokens[1:]:
@ -607,13 +630,13 @@ def _get_logprobs(

 def _build_sampler_output(
    sample_results: List[Tuple[List[int], List[int]]],
-    input_metadata: InputMetadata,
+    sampling_metadata: SamplingMetadata,
    prompt_logprobs: List[Optional[PromptLogprobs]],
    sample_logprobs: List[SampleLogprobs],
 ) -> SamplerOutput:
    sampler_output = []
    for (seq_group, sample_result, group_prompt_logprobs,
-         group_sample_logprobs) in zip(input_metadata.seq_groups,
+         group_sample_logprobs) in zip(sampling_metadata.seq_groups,
                                       sample_results, prompt_logprobs,
                                       sample_logprobs):
        seq_ids, _ = seq_group
@ -623,7 +646,7 @@ def _build_sampler_output(
                                                      next_token_ids,
                                                      group_sample_logprobs):
            seq_outputs.append(
-                SequenceOutputs(seq_ids[parent_id], next_token_id, logprobs))
+                SequenceOutput(seq_ids[parent_id], next_token_id, logprobs))
        sampler_output.append(
-            SequenceGroupOutputs(seq_outputs, group_prompt_logprobs))
+            SequenceGroupOutput(seq_outputs, group_prompt_logprobs))
    return sampler_output
--- a/vllm/model_executor/model_loader.py
+++ b/vllm/model_executor/model_loader.py
@ -7,7 +7,7 @@ import torch.nn as nn
 from transformers import PretrainedConfig

 from vllm.config import ModelConfig
-from vllm.model_executor.models import *  # pylint: disable=wildcard-import
+from vllm.model_executor.models import *
 from vllm.model_executor.weight_utils import (get_quant_config,
                                              initialize_dummy_weights)

@ -87,9 +87,9 @@ def get_model(model_config: ModelConfig) -> nn.Module:
    with _set_default_torch_dtype(model_config.dtype):
        # Create a model instance.
        # The weights will be initialized as empty tensors.
-        model = model_class(model_config.hf_config, linear_method)
+        with torch.device("cuda"):
+            model = model_class(model_config.hf_config, linear_method)
        if model_config.load_format == "dummy":
-            model = model.cuda()
            # NOTE(woosuk): For accurate performance evaluation, we assign
            # random values to the weights.
            initialize_dummy_weights(model)
@ -97,5 +97,4 @@ def get_model(model_config: ModelConfig) -> nn.Module:
            # Load the weights from the cached or downloaded files.
            model.load_weights(model_config.model, model_config.download_dir,
                               model_config.load_format, model_config.revision)
-            model = model.cuda()
    return model.eval()
--- a/vllm/model_executor/models/aquila.py
+++ b/vllm/model_executor/models/aquila.py
@ -20,11 +20,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only LLaMA model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
+"""Inference-only LLaMA model compatible with HuggingFace weights."""
 from typing import Any, Dict, List, Optional, Tuple

 import torch
@ -32,16 +28,18 @@ from torch import nn

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.linear import (LinearMethodBase,
                                               MergedColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -142,15 +140,17 @@ class AquilaAttention(nn.Module):
            bias=False,
            linear_method=linear_method,
        )
-        self.attn = PagedAttentionWithRoPE(
-            self.num_heads,
+        self.rotary_emb = get_rope(
            self.head_dim,
-            self.scaling,
-            base=self.rope_theta,
-            max_position=self.max_position_embeddings,
            rotary_dim=self.head_dim,
-            num_kv_heads=self.num_kv_heads,
-            rope_scaling=rope_scaling)
+            max_position=self.max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
+                                   self.scaling,
+                                   num_kv_heads=self.num_kv_heads)

    def forward(
        self,
@ -162,9 +162,10 @@ class AquilaAttention(nn.Module):
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
        k_cache, v_cache = kv_cache
-        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
-                                input_metadata, cache_event)
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                cache_event)
        output, _ = self.o_proj(attn_output)
        return output

@ -261,10 +262,7 @@ class AquilaModel(nn.Module):
    ) -> torch.Tensor:
        hidden_states = self.embed_tokens(input_ids)
        for i in range(len(self.layers)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.layers[i]
            hidden_states = layer(
                positions,
@ -299,11 +297,18 @@ class AquilaForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata)
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
--- a/vllm/model_executor/models/baichuan.py
+++ b/vllm/model_executor/models/baichuan.py
@ -17,11 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only BaiChuan model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
+"""Inference-only BaiChuan model compatible with HuggingFace weights."""
 import math
 from typing import List, Optional, Tuple

@ -30,18 +26,19 @@ from torch import nn

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention import (PagedAttentionWithRoPE,
-                                                  PagedAttentionWithALiBi)
+from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (LinearMethodBase,
                                               MergedColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -154,17 +151,20 @@ class BaiChuanAttention(nn.Module):
            alibi_slopes = alibi_slopes[head_start:head_end].tolist()

            scaling = self.head_dim**-0.5
-            self.attn = PagedAttentionWithALiBi(self.num_heads, self.head_dim,
-                                                scaling, alibi_slopes)
+            self.attn = PagedAttention(self.num_heads,
+                                       self.head_dim,
+                                       scaling,
+                                       alibi_slopes=alibi_slopes)
        else:
-            self.scaling = self.head_dim**-0.5
-            self.attn = PagedAttentionWithRoPE(
-                self.num_heads,
+            self.rotary_emb = get_rope(
                self.head_dim,
-                self.scaling,
                rotary_dim=self.head_dim,
+                max_position=self.max_position_embeddings,
                base=self.rope_theta,
-                max_position=self.max_position_embeddings)
+            )
+            self.scaling = self.head_dim**-0.5
+            self.attn = PagedAttention(self.num_heads, self.head_dim,
+                                       self.scaling)

    def forward(
        self,
@ -176,14 +176,11 @@ class BaiChuanAttention(nn.Module):
    ) -> torch.Tensor:
        qkv, _ = self.W_pack(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        if self.postion_embedding != "ALIBI":
+            q, k = self.rotary_emb(positions, q, k)
        k_cache, v_cache = kv_cache
-        if self.postion_embedding == "ALIBI":
-            attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
-                                    cache_event)
-        else:
-            attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
-                                    input_metadata, cache_event)
-
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                cache_event)
        output, _ = self.o_proj(attn_output)
        return output

@ -281,10 +278,7 @@ class BaiChuanModel(nn.Module):
        hidden_states = self.embed_tokens(input_ids)
        residual = None
        for i in range(len(self.layers)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.layers[i]
            hidden_states, residual = layer(
                positions,
@ -318,11 +312,18 @@ class BaiChuanBaseForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata)
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
@ -340,6 +341,17 @@ class BaiChuanBaseForCausalLM(nn.Module):
                model_name_or_path, cache_dir, load_format, revision):
            if "rotary_emb.inv_freq" in name:
                continue
+            if name == "lm_head.weight":
+                # Unlike Baichuan, Baichuan2 normalizes the head weights. Refer to:
+                # https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat/blob/84603cde5ebffb6084e476cfaeceaf0b8b91fe54/modeling_baichuan.py#L508
+                # Distinguish between Baichuan and Baichuan2 by checking the
+                # vocab size. This is suggested by
+                # https://github.com/vllm-project/vllm/pull/1022#discussion_r1325652704
+                is_baichuan2 = self.config.vocab_size == 125696
+                if is_baichuan2:
+                    loaded_weight = torch.nn.functional.normalize(
+                        loaded_weight)
+
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
--- a/vllm/model_executor/models/bloom.py
+++ b/vllm/model_executor/models/bloom.py
@ -15,11 +15,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only BLOOM model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
+"""Inference-only BLOOM model compatible with HuggingFace weights."""
 import math
 from typing import List, Optional, Tuple

@ -29,7 +25,7 @@ from transformers import BloomConfig

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import PagedAttentionWithALiBi
+from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               LinearMethodBase,
                                               QKVParallelLinear,
@ -39,6 +35,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -110,8 +107,10 @@ class BloomAttention(nn.Module):
        alibi_slopes = alibi_slopes[head_start:head_end].tolist()

        scaling = self.head_dim**-0.5
-        self.attn = PagedAttentionWithALiBi(self.num_heads, self.head_dim,
-                                            scaling, alibi_slopes)
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
+                                   scaling,
+                                   alibi_slopes=alibi_slopes)

    def forward(
        self,
@ -256,10 +255,7 @@ class BloomModel(nn.Module):
        hidden_states = self.word_embeddings(input_ids)
        hidden_states = self.word_embeddings_layernorm(hidden_states)
        for i in range(len(self.h)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.h[i]
            hidden_states = layer(
                position_ids,
@ -293,11 +289,18 @@ class BloomForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.transformer(input_ids, positions, kv_caches,
                                         input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
-                                   input_metadata)
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
--- a/vllm/model_executor/models/chatglm.py
+++ b/vllm/model_executor/models/chatglm.py
@ -1,11 +1,7 @@
 # coding=utf-8
 # Adapted from
 # https://github.com/THUDM/ChatGLM2-6B
-"""Inference-only ChatGLM model compatible with THUDM weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
+"""Inference-only ChatGLM model compatible with THUDM weights."""
 from typing import List, Optional, Tuple

 import torch
@ -14,17 +10,19 @@ from torch.nn import LayerNorm

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (LinearMethodBase,
                                               MergedColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -79,13 +77,21 @@ class GLMAttention(nn.Module):
            linear_method=linear_method,
        )

-        self.attn = PagedAttentionWithRoPE(
+        # https://huggingface.co/THUDM/chatglm3-6b-32k/blob/e210410255278dd9d74463cf396ba559c0ef801c/modeling_chatglm.py#L141
+        rope_ratio = getattr(config, "rope_ratio", 1.0)
+        max_positions = getattr(config, "seq_length", 8192)
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim // 2,
+            max_position=max_positions,
+            base=10000 * rope_ratio,
+            is_neox_style=False,
+        )
+        self.attn = PagedAttention(
            self.num_heads,
            self.head_dim,
            self.scaling,
-            rotary_dim=self.head_dim // 2,
            num_kv_heads=self.num_kv_heads,
-            is_neox_style=False,
        )

    def forward(
@ -98,10 +104,9 @@ class GLMAttention(nn.Module):
    ) -> torch.Tensor:
        qkv, _ = self.query_key_value(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(position_ids, q, k)
        key_cache, value_cache = kv_cache
-
        context_layer = self.attn(
-            position_ids,
            q,
            k,
            v,
@ -110,9 +115,7 @@ class GLMAttention(nn.Module):
            input_metadata,
            cache_event,
        )
-
        attn_output, _ = self.dense(context_layer)
-
        return attn_output


@ -269,10 +272,7 @@ class GLMTransformer(nn.Module):
        cache_events: Optional[List[torch.cuda.Event]],
    ) -> torch.Tensor:
        for i in range(self.num_layers):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.layers[i]
            hidden_states = layer(
                hidden_states=hidden_states,
@ -351,11 +351,18 @@ class ChatGLMForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.transformer(input_ids, positions, kv_caches,
                                         input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
-                                   input_metadata)
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
--- a/vllm/model_executor/models/falcon.py
+++ b/vllm/model_executor/models/falcon.py
@ -28,13 +28,12 @@ from transformers import FalconConfig as HF_FalconConfig

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import (PagedAttention,
-                                                  PagedAttentionWithALiBi,
-                                                  PagedAttentionWithRoPE)
+from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               LinearMethodBase,
                                               QKVParallelLinear,
                                               RowParallelLinear)
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
@ -42,6 +41,7 @@ from vllm.model_executor.parallel_utils.communication_op import (
    tensor_model_parallel_all_reduce)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -144,14 +144,16 @@ class FalconAttention(nn.Module):
            rope_theta = getattr(config, "rope_theta", 10000)
            max_position_embeddings = getattr(config,
                                              "max_position_embeddings", 8192)
-            self.attn = PagedAttentionWithRoPE(
-                self.num_heads,
+            self.rotary_emb = get_rope(
                self.head_dim,
-                self.inv_norm_factor,
-                base=rope_theta,
-                max_position=max_position_embeddings,
                rotary_dim=self.head_dim,
-                num_kv_heads=self.num_kv_heads)
+                max_position=max_position_embeddings,
+                base=rope_theta,
+            )
+            self.attn = PagedAttention(self.num_heads,
+                                       self.head_dim,
+                                       self.inv_norm_factor,
+                                       num_kv_heads=self.num_kv_heads)
        elif self.use_alibi:
            tp_rank = get_tensor_model_parallel_rank()
            head_start = tp_rank * self.num_heads
@ -159,11 +161,11 @@ class FalconAttention(nn.Module):
            alibi_slopes = (_get_alibi_slopes(self.total_num_heads) *
                            self.inv_norm_factor)
            alibi_slopes = alibi_slopes[head_start:head_end].tolist()
-            self.attn = PagedAttentionWithALiBi(self.num_heads,
-                                                self.head_dim,
-                                                self.inv_norm_factor,
-                                                alibi_slopes,
-                                                num_kv_heads=self.num_kv_heads)
+            self.attn = PagedAttention(self.num_heads,
+                                       self.head_dim,
+                                       self.inv_norm_factor,
+                                       num_kv_heads=self.num_kv_heads,
+                                       alibi_slopes=alibi_slopes)
        else:
            self.attn = PagedAttention(self.num_heads,
                                       self.head_dim,
@ -182,13 +184,11 @@ class FalconAttention(nn.Module):
        if bias is not None:
            qkv += bias
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
-        k_cache, v_cache = kv_cache
        if self.use_rotary:
-            attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
-                                    input_metadata, cache_event)
-        else:
-            attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
-                                    cache_event)
+            q, k = self.rotary_emb(positions, q, k)
+        k_cache, v_cache = kv_cache
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                cache_event)
        attn_output, bias = self.dense(attn_output)
        return attn_output, bias

@ -353,10 +353,7 @@ class FalconModel(nn.Module):
    ) -> torch.Tensor:
        hidden_states = self.word_embeddings(input_ids)
        for i in range(len(self.h)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.h[i]
            hidden_states = layer(
                positions,
@ -393,7 +390,7 @@ class FalconForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.transformer(
            input_ids,
            positions,
@ -401,9 +398,15 @@ class FalconForCausalLM(nn.Module):
            input_metadata,
            cache_events,
        )
-        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata)
+        return hidden_states

+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
+        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
--- a/vllm/model_executor/models/gpt2.py
+++ b/vllm/model_executor/models/gpt2.py
@ -16,11 +16,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only GPT-2 model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
+"""Inference-only GPT-2 model compatible with HuggingFace weights."""
 from typing import List, Optional, Tuple

 import torch
@ -39,6 +35,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -206,10 +203,7 @@ class GPT2Model(nn.Module):
        hidden_states = inputs_embeds + position_embeds

        for i in range(len(self.h)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.h[i]
            hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
                                  cache_event)
@ -239,11 +233,18 @@ class GPT2LMHeadModel(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.transformer(input_ids, positions, kv_caches,
                                         input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
-                                   input_metadata)
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
--- a/vllm/model_executor/models/gpt_bigcode.py
+++ b/vllm/model_executor/models/gpt_bigcode.py
@ -17,11 +17,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only GPTBigCode model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
+"""Inference-only GPTBigCode model compatible with HuggingFace weights."""
 from typing import List, Optional, Tuple

 import torch
@ -40,6 +36,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -225,10 +222,7 @@ class GPTBigCodeModel(nn.Module):
        hidden_states = inputs_embeds + position_embeds

        for i in range(len(self.h)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.h[i]
            hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
                                  cache_event)
@ -258,11 +252,18 @@ class GPTBigCodeForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.transformer(input_ids, positions, kv_caches,
                                         input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
-                                   input_metadata)
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
--- a/vllm/model_executor/models/gpt_j.py
+++ b/vllm/model_executor/models/gpt_j.py
@ -15,11 +15,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only GPT-J model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
+"""Inference-only GPT-J model compatible with HuggingFace weights."""
 from typing import List, Optional, Tuple

 import torch
@ -28,16 +24,18 @@ from transformers import GPTJConfig

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               LinearMethodBase,
                                               QKVParallelLinear,
                                               RowParallelLinear)
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -81,15 +79,14 @@ class GPTJAttention(nn.Module):
        rope_theta = getattr(config, "rope_theta", 10000)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
-        self.attn = PagedAttentionWithRoPE(
-            self.num_heads,
+        self.rotary_emb = get_rope(
            self.head_size,
-            scaling,
-            config.rotary_dim,
-            base=rope_theta,
+            rotary_dim=config.rotary_dim,
            max_position=max_position_embeddings,
-            is_neox_style=False)
-        self.warmup = False
+            base=rope_theta,
+            is_neox_style=False,
+        )
+        self.attn = PagedAttention(self.num_heads, self.head_size, scaling)

    def forward(
        self,
@ -101,9 +98,10 @@ class GPTJAttention(nn.Module):
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self.rotary_emb(position_ids, q, k)
        k_cache, v_cache = kv_cache
-        attn_output = self.attn(position_ids, q, k, v, k_cache, v_cache,
-                                input_metadata, cache_event)
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                cache_event)
        attn_output, _ = self.out_proj(attn_output)
        return attn_output

@ -147,10 +145,7 @@ class GPTJBlock(nn.Module):
        linear_method: Optional[LinearMethodBase] = None,
    ):
        super().__init__()
-        if config.n_inner is None:
-            inner_dim = 4 * config.n_embd
-        else:
-            inner_dim = config.n_inner
+        inner_dim = 4 * config.n_embd if config.n_inner is None else config.n_inner
        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
        self.attn = GPTJAttention(config, linear_method)
        self.mlp = GPTJMLP(inner_dim, config, linear_method)
@ -205,10 +200,7 @@ class GPTJModel(nn.Module):
    ) -> torch.Tensor:
        hidden_states = self.wte(input_ids)
        for i in range(len(self.h)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.h[i]
            hidden_states = layer(
                position_ids,
@ -247,11 +239,18 @@ class GPTJForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.transformer(input_ids, positions, kv_caches,
                                         input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata, self.lm_head.bias)
+                                   sampling_metadata, self.lm_head.bias)
        return next_tokens

    def load_weights(self,
--- a/vllm/model_executor/models/gpt_neox.py
+++ b/vllm/model_executor/models/gpt_neox.py
@ -15,11 +15,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only GPT-NeoX model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
+"""Inference-only GPT-NeoX model compatible with HuggingFace weights."""
 from typing import List, Optional, Tuple

 import torch
@ -28,16 +24,18 @@ from transformers import GPTNeoXConfig

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               LinearMethodBase,
                                               QKVParallelLinear,
                                               RowParallelLinear)
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -81,13 +79,13 @@ class GPTNeoXAttention(nn.Module):
        rope_theta = getattr(config, "rope_theta", 10000)
        max_position_embeddings = getattr(config, "max_position_embeddings",
                                          8192)
-        self.attn = PagedAttentionWithRoPE(
-            self.num_heads,
+        self.rotary_emb = get_rope(
            self.head_size,
-            scaling,
-            rotary_dim,
+            rotary_dim=rotary_dim,
+            max_position=max_position_embeddings,
            base=rope_theta,
-            max_position=max_position_embeddings)
+        )
+        self.attn = PagedAttention(self.num_heads, self.head_size, scaling)

    def forward(
        self,
@ -99,9 +97,10 @@ class GPTNeoXAttention(nn.Module):
    ) -> torch.Tensor:
        qkv, _ = self.query_key_value(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self.rotary_emb(position_ids, q, k)
        k_cache, v_cache = kv_cache
-        attn_output = self.attn(position_ids, q, k, v, k_cache, v_cache,
-                                input_metadata, cache_event)
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                cache_event)
        output, _ = self.dense(attn_output)
        return output

@ -216,10 +215,7 @@ class GPTNeoXModel(nn.Module):
    ) -> torch.Tensor:
        hidden_states = self.embed_in(input_ids)
        for i in range(len(self.layers)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.layers[i]
            hidden_states = layer(
                position_ids,
@ -256,11 +252,18 @@ class GPTNeoXForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.gpt_neox(input_ids, positions, kv_caches,
                                      input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.embed_out.weight, hidden_states,
-                                   input_metadata)
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
--- a/vllm/model_executor/models/internlm.py
+++ b/vllm/model_executor/models/internlm.py
@ -7,17 +7,19 @@ from transformers import LlamaConfig

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (LinearMethodBase,
                                               MergedColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -92,13 +94,13 @@ class InternLMAttention(nn.Module):
            bias=bias,
            linear_method=linear_method,
        )
-        self.attn = PagedAttentionWithRoPE(
-            self.num_heads,
+        self.rotary_emb = get_rope(
            self.head_dim,
-            self.scaling,
-            base=self.rope_theta,
+            rotary_dim=self.head_dim,
            max_position=self.max_position_embeddings,
-            rotary_dim=self.head_dim)
+            base=self.rope_theta,
+        )
+        self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling)

    def forward(
        self,
@ -110,9 +112,10 @@ class InternLMAttention(nn.Module):
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
        k_cache, v_cache = kv_cache
-        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
-                                input_metadata, cache_event)
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                cache_event)
        output, _ = self.o_proj(attn_output)
        return output

@ -213,10 +216,7 @@ class InternLMModel(nn.Module):
        hidden_states = self.embed_tokens(input_ids)
        residual = None
        for i in range(len(self.layers)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.layers[i]
            hidden_states, residual = layer(
                positions,
@ -251,11 +251,18 @@ class InternLMForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata)
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@ -20,11 +20,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only LLaMA model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
+"""Inference-only LLaMA model compatible with HuggingFace weights."""
 from typing import Any, Dict, List, Optional, Tuple

 import torch
@ -33,17 +29,19 @@ from transformers import LlamaConfig

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (LinearMethodBase,
                                               MergedColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -130,15 +128,18 @@ class LlamaAttention(nn.Module):
            bias=False,
            linear_method=linear_method,
        )
-        self.attn = PagedAttentionWithRoPE(
-            self.num_heads,
+
+        self.rotary_emb = get_rope(
            self.head_dim,
-            self.scaling,
-            base=self.rope_theta,
-            max_position=self.max_position_embeddings,
            rotary_dim=self.head_dim,
-            num_kv_heads=self.num_kv_heads,
-            rope_scaling=rope_scaling)
+            max_position=max_position_embeddings,
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
+                                   self.scaling,
+                                   num_kv_heads=self.num_kv_heads)

    def forward(
        self,
@ -150,9 +151,10 @@ class LlamaAttention(nn.Module):
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
        k_cache, v_cache = kv_cache
-        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
-                                input_metadata, cache_event)
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                cache_event)
        output, _ = self.o_proj(attn_output)
        return output

@ -253,10 +255,7 @@ class LlamaModel(nn.Module):
        hidden_states = self.embed_tokens(input_ids)
        residual = None
        for i in range(len(self.layers)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.layers[i]
            hidden_states, residual = layer(
                positions,
@ -291,11 +290,18 @@ class LlamaForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata)
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
--- a/vllm/model_executor/models/mistral.py
+++ b/vllm/model_executor/models/mistral.py
@ -20,11 +20,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only Mistral model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
+"""Inference-only Mistral model compatible with HuggingFace weights."""
 from typing import List, Optional, Tuple

 import torch
@ -33,17 +29,19 @@ from transformers import MistralConfig

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (LinearMethodBase,
                                               MergedColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -128,14 +126,18 @@ class MistralAttention(nn.Module):
            bias=False,
            linear_method=linear_method,
        )
-        self.attn = PagedAttentionWithRoPE(self.num_heads,
-                                           self.head_dim,
-                                           self.scaling,
-                                           base=self.rope_theta,
-                                           max_position=max_position,
-                                           rotary_dim=self.head_dim,
-                                           num_kv_heads=self.num_kv_heads,
-                                           sliding_window=self.sliding_window)
+
+        self.rotary_emb = get_rope(
+            self.head_dim,
+            rotary_dim=self.head_dim,
+            max_position=max_position,
+            base=self.rope_theta,
+        )
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
+                                   self.scaling,
+                                   num_kv_heads=self.num_kv_heads,
+                                   sliding_window=self.sliding_window)

    def forward(
        self,
@ -147,9 +149,10 @@ class MistralAttention(nn.Module):
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
        k_cache, v_cache = kv_cache
-        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
-                                input_metadata, cache_event)
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                cache_event)
        output, _ = self.o_proj(attn_output)
        return output

@ -248,10 +251,7 @@ class MistralModel(nn.Module):
        hidden_states = self.embed_tokens(input_ids)
        residual = None
        for i in range(len(self.layers)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.layers[i]
            hidden_states, residual = layer(
                positions,
@ -286,11 +286,18 @@ class MistralForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata)
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
--- a/vllm/model_executor/models/mpt.py
+++ b/vllm/model_executor/models/mpt.py
@ -8,7 +8,7 @@ import torch.nn as nn

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import PagedAttentionWithALiBi
+from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               LinearMethodBase,
                                               QKVParallelLinear,
@ -18,6 +18,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -87,8 +88,10 @@ class MPTAttention(nn.Module):

        self.head_dim = self.d_model // self.total_num_heads
        scaling = self.head_dim**-0.5
-        self.attn = PagedAttentionWithALiBi(self.num_heads, self.head_dim,
-                                            scaling, alibi_slopes)
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
+                                   scaling,
+                                   alibi_slopes=alibi_slopes)

    def forward(
        self,
@ -203,10 +206,10 @@ class MPTModel(nn.Module):
        self.norm_f = nn.LayerNorm(config.d_model)
        if config.no_bias:
            for module in self.modules():
-                if hasattr(module, "bias"):
-                    if isinstance(module.bias, nn.Parameter):
-                        # Remove the bias term in Linear and LayerNorm.
-                        module.register_parameter("bias", None)
+                if hasattr(module, "bias") and isinstance(
+                        module.bias, nn.Parameter):
+                    # Remove the bias term in Linear and LayerNorm.
+                    module.register_parameter("bias", None)

    def forward(
        self,
@ -218,10 +221,7 @@ class MPTModel(nn.Module):
    ) -> torch.Tensor:
        hidden_states = self.wte(input_ids)
        for i in range(len(self.blocks)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            block = self.blocks[i]
            hidden_states = block(
                position_ids,
@ -257,11 +257,18 @@ class MPTForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.transformer(input_ids, positions, kv_caches,
                                         input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
-                                   input_metadata)
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
--- a/vllm/model_executor/models/opt.py
+++ b/vllm/model_executor/models/opt.py
@ -16,11 +16,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only OPT model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
+"""Inference-only OPT model compatible with HuggingFace weights."""
 from typing import List, Optional, Tuple

 import torch
@ -40,6 +36,7 @@ from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -129,9 +126,6 @@ class OPTDecoderLayer(nn.Module):
            linear_method=linear_method,
        )
        self.do_layer_norm_before = config.do_layer_norm_before
-        quant_config = getattr(linear_method, "quant_config", None)
-        self.activation_fn = get_act_fn(config.activation_function,
-                                        quant_config, config.ffn_dim)

        self.self_attn_layer_norm = nn.LayerNorm(
            self.embed_dim,
@ -142,6 +136,9 @@ class OPTDecoderLayer(nn.Module):
            bias=config.enable_bias,
            linear_method=linear_method,
        )
+        quant_config = getattr(linear_method, "quant_config", None)
+        self.activation_fn = get_act_fn(config.activation_function,
+                                        quant_config, config.ffn_dim)
        self.fc2 = RowParallelLinear(
            config.ffn_dim,
            self.embed_dim,
@ -257,10 +254,7 @@ class OPTDecoder(nn.Module):
        hidden_states = inputs_embeds + pos_embeds

        for i in range(len(self.layers)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.layers[i]
            hidden_states = layer(hidden_states, kv_caches[i], input_metadata,
                                  cache_event)
@ -315,11 +309,18 @@ class OPTForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.lm_head_weight, hidden_states,
-                                   input_metadata)
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
@ -338,6 +339,9 @@ class OPTForCausalLM(nn.Module):
                model_name_or_path, cache_dir, load_format, revision):
            if "lm_head.weight" in name:
                continue
+            if name.startswith("decoder."):
+                name = "model." + name
+
            for (param_name, weight_name, shard_id) in stacked_params_mapping:
                if weight_name not in name:
                    continue
--- a/vllm/model_executor/models/phi_1_5.py
+++ b/vllm/model_executor/models/phi_1_5.py
@ -34,11 +34,7 @@
 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-"""Inference-only Phi-1.5 model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
+"""Inference-only Phi-1.5 model compatible with HuggingFace weights."""
 from typing import List, Optional, Tuple

 import torch
@ -47,16 +43,18 @@ from transformers import PretrainedConfig

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import get_act_fn
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                               LinearMethodBase,
                                               QKVParallelLinear,
                                               RowParallelLinear)
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -123,13 +121,13 @@ class PhiAttention(nn.Module):
        # https://huggingface.co/microsoft/phi-1_5/blob/d212a789620c380ff32ca1d1ee9943a777360987/modeling_phi.py#L518
        rope_theta = 10000
        max_position_embeddings = getattr(config, "n_positions", 2048)
-        self.attn = PagedAttentionWithRoPE(
-            self.num_heads,
+        self.rotary_emb = get_rope(
            self.head_size,
-            scaling,
-            rotary_dim,
+            rotary_dim=rotary_dim,
+            max_position=max_position_embeddings,
            base=rope_theta,
-            max_position=max_position_embeddings)
+        )
+        self.attn = PagedAttention(self.num_heads, self.head_size, scaling)

    def forward(
        self,
@ -141,9 +139,10 @@ class PhiAttention(nn.Module):
    ) -> torch.Tensor:
        qkv, _ = self.Wqkv(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
+        q, k = self.rotary_emb(position_ids, q, k)
        k_cache, v_cache = kv_cache
-        attn_output = self.attn(position_ids, q, k, v, k_cache, v_cache,
-                                input_metadata, cache_event)
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                cache_event)
        output, _ = self.out_proj(attn_output)
        return output

@ -212,28 +211,6 @@ class PhiLayer(nn.Module):
        return hidden_states


-class PhiCausalLMHead(nn.Module):
-
-    def __init__(self, config: PretrainedConfig):
-        super().__init__()
-        self.ln = nn.LayerNorm(config.hidden_size,
-                               eps=config.layer_norm_epsilon)
-        self.linear = ParallelLMHead(config.vocab_size,
-                                     config.hidden_size,
-                                     bias=True)
-        self.sampler = Sampler(config.vocab_size)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        input_metadata: InputMetadata,
-    ):
-        hidden_states = self.ln(hidden_states)
-        next_tokens = self.sampler(self.linear.weight, hidden_states,
-                                   input_metadata, self.linear.bias)
-        return next_tokens
-
-
 class PhiModel(nn.Module):

    def __init__(self,
@ -255,13 +232,10 @@ class PhiModel(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.embd(input_ids)
        for i in range(self.config.num_hidden_layers):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.h[i]
            hidden_states = layer(
                positions,
@ -273,6 +247,17 @@ class PhiModel(nn.Module):
        return hidden_states


+class PhiCausalLMHead(nn.Module):
+
+    def __init__(self, config: PretrainedConfig):
+        super().__init__()
+        self.ln = nn.LayerNorm(config.hidden_size,
+                               eps=config.layer_norm_epsilon)
+        self.linear = ParallelLMHead(config.vocab_size,
+                                     config.hidden_size,
+                                     bias=True)
+
+
 class PhiForCausalLM(nn.Module):

    def __init__(self,
@ -284,6 +269,7 @@ class PhiForCausalLM(nn.Module):

        self.transformer = PhiModel(config, linear_method)
        self.lm_head = PhiCausalLMHead(config)
+        self.sampler = Sampler(config.vocab_size)

    def forward(
        self,
@ -292,11 +278,21 @@ class PhiForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.transformer(input_ids, positions, kv_caches,
                                         input_metadata, cache_events)
-        lm_logits = self.lm_head(hidden_states, input_metadata)
-        return lm_logits
+        hidden_states = self.lm_head.ln(hidden_states)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
+        head = self.lm_head.linear
+        next_tokens = self.sampler(head.weight, hidden_states,
+                                   sampling_metadata, head.bias)
+        return next_tokens

    def load_weights(self,
                     model_name_or_path: str,
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@ -3,11 +3,7 @@
 # https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py
 # Copyright (c) Alibaba Cloud.
 # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
-"""Inference-only QWen model compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
+"""Inference-only QWen model compatible with HuggingFace weights."""
 from typing import Any, Dict, List, Optional, Tuple

 import torch
@ -15,17 +11,19 @@ from torch import nn

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (LinearMethodBase,
                                               MergedColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -99,14 +97,15 @@ class QWenAttention(nn.Module):
            linear_method=linear_method,
        )
        self.scaling = self.head_dim**-0.5
-        self.attn = PagedAttentionWithRoPE(
-            self.num_heads,
+
+        self.rotary_emb = get_rope(
            self.head_dim,
-            self.scaling,
            rotary_dim=self.head_dim,
-            base=rope_theta,
            max_position=max_position_embeddings,
-            rope_scaling=rope_scaling)
+            base=rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = PagedAttention(self.num_heads, self.head_dim, self.scaling)

    def forward(
        self,
@ -118,10 +117,10 @@ class QWenAttention(nn.Module):
    ) -> torch.Tensor:
        qkv, _ = self.c_attn(hidden_states)
        q, k, v = qkv.chunk(chunks=3, dim=-1)
-
+        q, k = self.rotary_emb(positions, q, k)
        k_cache, v_cache = kv_cache
-        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
-                                input_metadata, cache_event)
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                cache_event)

        output, _ = self.c_proj(attn_output)
        return output
@ -213,10 +212,7 @@ class QWenModel(nn.Module):
        hidden_states = self.wte(input_ids)
        residual = None
        for i in range(len(self.h)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.h[i]
            hidden_states, residual = layer(
                positions,
@ -251,11 +247,18 @@ class QWenLMHeadModel(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.transformer(input_ids, positions, kv_caches,
                                         input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata)
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
--- a/vllm/model_executor/models/yi.py
+++ b/vllm/model_executor/models/yi.py
@ -20,11 +20,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Inference-only Yi model (https://01.ai) compatible with HuggingFace weights.
-
-The input of the model is flattened to a 1D tensor of tokens. The model uses
-InputMetadata to extract the original 2D shape of the input.
-"""
+"""Inference-only Yi model (https://01.ai) compatible with HuggingFace weights."""
 from typing import Any, Dict, List, Optional, Tuple

 import torch
@ -33,17 +29,19 @@ from vllm.transformers_utils.configs.yi import YiConfig

 from vllm.model_executor.input_metadata import InputMetadata
 from vllm.model_executor.layers.activation import SiluAndMul
-from vllm.model_executor.layers.attention import PagedAttentionWithRoPE
+from vllm.model_executor.layers.attention import PagedAttention
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (LinearMethodBase,
                                               MergedColumnParallelLinear,
                                               QKVParallelLinear,
                                               RowParallelLinear)
+from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
    VocabParallelEmbedding, ParallelLMHead)
 from vllm.model_executor.parallel_utils.parallel_state import (
    get_tensor_model_parallel_world_size)
+from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.model_executor.weight_utils import (default_weight_loader,
                                              hf_model_weights_iterator)
 from vllm.sequence import SamplerOutput
@ -130,15 +128,17 @@ class YiAttention(nn.Module):
            bias=False,
            linear_method=linear_method,
        )
-        self.attn = PagedAttentionWithRoPE(
-            self.num_heads,
+        self.rotary_emb = get_rope(
            self.head_dim,
-            self.scaling,
-            base=self.rope_theta,
-            max_position=self.max_position_embeddings,
            rotary_dim=self.head_dim,
-            num_kv_heads=self.num_kv_heads,
-            rope_scaling=rope_scaling)
+            max_position=max_position_embeddings,
+            base=self.rope_theta,
+            rope_scaling=rope_scaling,
+        )
+        self.attn = PagedAttention(self.num_heads,
+                                   self.head_dim,
+                                   self.scaling,
+                                   num_kv_heads=self.num_kv_heads)

    def forward(
        self,
@ -150,9 +150,10 @@ class YiAttention(nn.Module):
    ) -> torch.Tensor:
        qkv, _ = self.qkv_proj(hidden_states)
        q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
+        q, k = self.rotary_emb(positions, q, k)
        k_cache, v_cache = kv_cache
-        attn_output = self.attn(positions, q, k, v, k_cache, v_cache,
-                                input_metadata, cache_event)
+        attn_output = self.attn(q, k, v, k_cache, v_cache, input_metadata,
+                                cache_event)
        output, _ = self.o_proj(attn_output)
        return output

@ -249,10 +250,7 @@ class YiModel(nn.Module):
        hidden_states = self.embed_tokens(input_ids)
        residual = None
        for i in range(len(self.layers)):
-            if cache_events is None:
-                cache_event = None
-            else:
-                cache_event = cache_events[i]
+            cache_event = None if cache_events is None else cache_events[i]
            layer = self.layers[i]
            hidden_states, residual = layer(
                positions,
@ -287,11 +285,18 @@ class YiForCausalLM(nn.Module):
        kv_caches: List[KVCache],
        input_metadata: InputMetadata,
        cache_events: Optional[List[torch.cuda.Event]],
-    ) -> SamplerOutput:
+    ) -> torch.Tensor:
        hidden_states = self.model(input_ids, positions, kv_caches,
                                   input_metadata, cache_events)
+        return hidden_states
+
+    def sample(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> SamplerOutput:
        next_tokens = self.sampler(self.lm_head.weight, hidden_states,
-                                   input_metadata)
+                                   sampling_metadata)
        return next_tokens

    def load_weights(self,
--- a/vllm/model_executor/parallel_utils/parallel_state.py
+++ b/vllm/model_executor/parallel_utils/parallel_state.py
@ -2,7 +2,7 @@
 # Adapted from
 # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
 # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
-"""Model and data parallel groups."""
+"""Tensor and pipeline parallel groups."""

 import torch

@ -84,7 +84,7 @@ def initialize_model_parallel(


 def model_parallel_is_initialized():
-    """Check if model and data parallel groups are initialized."""
+    """Check if tensor and pipeline parallel groups are initialized."""
    return (_TENSOR_MODEL_PARALLEL_GROUP is not None
            and _PIPELINE_MODEL_PARALLEL_GROUP is not None)

--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@ -0,0 +1,43 @@
+from typing import Dict, List, Tuple
+
+import torch
+
+from vllm.sampling_params import SamplingParams, SamplingType
+from vllm.sequence import SequenceData
+
+
+class SamplingMetadata:
+    """Metadata for input sequences. Used in sampler.
+
+    Args:
+        seq_groups: List of (seq_ids, sampling_params).
+        seq_data: Seq_id -> SequenceData.
+        prompt_lens: Lengths of prompts.
+        selected_token_indices: Token indices selected for sampling.
+        categorized_sample_indices: SamplingType -> token indicies to sample.
+    """
+
+    def __init__(
+        self,
+        seq_groups: List[Tuple[List[int], SamplingParams]],
+        seq_data: Dict[int, SequenceData],
+        prompt_lens: List[int],
+        selected_token_indices: torch.Tensor,
+        categorized_sample_indices: Dict[SamplingType, torch.Tensor],
+    ) -> None:
+        self.seq_groups = seq_groups
+        self.seq_data = seq_data
+        self.prompt_lens = prompt_lens
+        self.selected_token_indices = selected_token_indices
+        self.categorized_sample_indices = categorized_sample_indices
+
+        self.num_prompts = len(prompt_lens)
+
+    def __repr__(self) -> str:
+        return (
+            "SamplingMetadata("
+            f"seq_groups={self.seq_groups}, "
+            f"seq_data={self.seq_data}, "
+            f"prompt_lens={self.prompt_lens}, "
+            f"selected_token_indices={self.selected_token_indices}, "
+            f"categorized_sample_indices={self.categorized_sample_indices})")
--- a/vllm/model_executor/weight_utils.py
+++ b/vllm/model_executor/weight_utils.py
@ -131,11 +131,9 @@ def prepare_hf_model_weights(
 ) -> Tuple[str, List[str], bool]:
    # Download model weights from huggingface.
    is_local = os.path.isdir(model_name_or_path)
-    if use_safetensors:
-        allow_patterns = ["*.safetensors"]
-    else:
-        # Some quantized models use .pt files for storing the weights.
-        allow_patterns = ["*.bin", "*.pt"]
+    # Some quantized models use .pt files for storing the weights.
+    allow_patterns = ["*.safetensors"
+                      ] if use_safetensors else ["*.bin", "*.pt"]
    if not is_local:
        # Use file lock to prevent multiple processes from
        # downloading the same model weights at the same time.
@ -242,7 +240,7 @@ def hf_model_weights_iterator(
    elif use_safetensors:
        for st_file in hf_weights_files:
            with safe_open(st_file, framework="pt") as f:
-                for name in f.keys():
+                for name in f.keys():  # noqa: SIM118
                    param = f.get_tensor(name)
                    yield name, param
    else:
--- a/vllm/sampling_params.py
+++ b/vllm/sampling_params.py
@ -42,9 +42,9 @@ class SamplingParams:
            model to use new tokens, while values < 0 encourage the model to
            repeat tokens.
        repetition_penalty: Float that penalizes new tokens based on whether
-            they appear in the generated text so far. Values > 1 encourage the
-            model to use new tokens, while values < 1 encourage the model to
-            repeat tokens.
+            they appear in the prompt and the generated text so far. Values > 1
+            encourage the model to use new tokens, while values < 1 encourage
+            the model to repeat tokens.
        temperature: Float that controls the randomness of the sampling. Lower
            values make the model more deterministic, while higher values make
            the model more random. Zero means greedy sampling.
@ -69,7 +69,7 @@ class SamplingParams:
            The returned output will not contain the stop strings.
        stop_token_ids: List of tokens that stop the generation when they are
            generated. The returned output will contain the stop tokens unless
-            the stop tokens are sepcial tokens.
+            the stop tokens are special tokens.
        ignore_eos: Whether to ignore the EOS token and continue generating
            tokens after the EOS token is generated.
        max_tokens: Maximum number of tokens to generate per output sequence.
@ -147,6 +147,8 @@ class SamplingParams:
            self._verify_non_beam_search()
            if self.temperature < _SAMPLING_EPS:
                # Zero temperature means greedy sampling.
+                self.top_p = 1.0
+                self.top_k = -1
                self._verify_greedy_sampling()

    def _verify_args(self) -> None:
@ -214,10 +216,6 @@ class SamplingParams:
        if self.best_of > 1:
            raise ValueError("best_of must be 1 when using greedy sampling."
                             f"Got {self.best_of}.")
-        if self.top_p < 1.0 - _SAMPLING_EPS:
-            raise ValueError("top_p must be 1 when using greedy sampling.")
-        if self.top_k != -1:
-            raise ValueError("top_k must be -1 when using greedy sampling.")

    @cached_property
    def sampling_type(self) -> SamplingType:
@ -241,6 +239,7 @@ class SamplingParams:
                f"length_penalty={self.length_penalty}, "
                f"early_stopping={self.early_stopping}, "
                f"stop={self.stop}, "
+                f"stop_token_ids={self.stop_token_ids}, "
                f"ignore_eos={self.ignore_eos}, "
                f"max_tokens={self.max_tokens}, "
                f"logprobs={self.logprobs}, "
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@ -352,7 +352,7 @@ class SequenceGroupMetadata:
        self.block_tables = block_tables


-class SequenceOutputs:
+class SequenceOutput:
    """The model output associated with a sequence.

    Args:
@ -374,40 +374,40 @@ class SequenceOutputs:
        self.logprobs = logprobs

    def __repr__(self) -> str:
-        return (f"SequenceOutputs(parent_seq_id={self.parent_seq_id}, "
+        return (f"SequenceOutput(parent_seq_id={self.parent_seq_id}, "
                f"output_token={self.output_token}, "
                f"logprobs={self.logprobs})")

    def __eq__(self, other: object) -> bool:
-        if not isinstance(other, SequenceOutputs):
+        if not isinstance(other, SequenceOutput):
            raise NotImplementedError()
        return (self.parent_seq_id == other.parent_seq_id
                and self.output_token == other.output_token
                and self.logprobs == other.logprobs)


-class SequenceGroupOutputs:
-    """The model outputs associated with a sequence group."""
+class SequenceGroupOutput:
+    """The model output associated with a sequence group."""

    def __init__(
        self,
-        samples: List[SequenceOutputs],
+        samples: List[SequenceOutput],
        prompt_logprobs: Optional[PromptLogprobs],
    ) -> None:
        self.samples = samples
        self.prompt_logprobs = prompt_logprobs

    def __repr__(self) -> str:
-        return (f"SequenceGroupOutputs(samples={self.samples}, "
+        return (f"SequenceGroupOutput(samples={self.samples}, "
                f"prompt_logprobs={self.prompt_logprobs})")

    def __eq__(self, other: object) -> bool:
-        if not isinstance(other, SequenceGroupOutputs):
+        if not isinstance(other, SequenceGroupOutput):
            raise NotImplementedError()
        return (self.samples == other.samples
                and self.prompt_logprobs == other.prompt_logprobs)


-# For each sequence group, we generate a list of SequenceOutputs object,
+# For each sequence group, we generate a list of SequenceOutput object,
 # each of which contains one possible candidate for the next token.
-SamplerOutput = List[SequenceGroupOutputs]
+SamplerOutput = List[SequenceGroupOutput]
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@ -2,7 +2,7 @@ from typing import Optional

 from transformers import AutoConfig, PretrainedConfig

-from vllm.transformers_utils.configs import *  # pylint: disable=wildcard-import
+from vllm.transformers_utils.configs import *

 _CONFIG_REGISTRY = {
    "aquila": AquilaConfig,
--- a/vllm/transformers_utils/configs/mpt.py
+++ b/vllm/transformers_utils/configs/mpt.py
@ -62,7 +62,6 @@ class MPTConfig(PretrainedConfig):
                 fc_type: str = 'torch',
                 verbose: Optional[int] = None,
                 **kwargs: Any):
-        # pylint: disable=line-too-long
        """The MPT configuration class.
        Args:
            d_model (int): The size of the embedding dimension of the model.
@ -139,10 +138,10 @@ class MPTConfig(PretrainedConfig):
        self.init_config = init_config
        self.fc_type = fc_type
        if verbose is not None:
-            warnings.warn(
-                DeprecationWarning(
-                    'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.'
-                ))
+            warnings.warn(DeprecationWarning(
+                'verbose argument for MPTConfig is now ignored and will be removed. Use python_log_level instead.'
+            ),
+                          stacklevel=2)
        if 'name' in kwargs:
            del kwargs['name']
        if 'loss_fn' in kwargs:
@ -150,8 +149,8 @@ class MPTConfig(PretrainedConfig):
        if self.attn_config.get('alibi', False):
            self.learned_pos_emb = False
            warnings.warn(
-                f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`'
-            )
+                f'alibi is turned on, setting `learned_pos_emb` to {self.learned_pos_emb}`',
+                stacklevel=2)
        super().__init__(**kwargs)
        self._validate_config()

@ -211,7 +210,8 @@ class MPTConfig(PretrainedConfig):
            )
        if not self.learned_pos_emb and (not self.attn_config['alibi']):
            warnings.warn(
-                'Positional information not being provided to the model.')
+                'Positional information not being provided to the model.',
+                stacklevel=2)
        if self.fc_type == 'te' or self.ffn_config['ffn_type'] == 'te_ln_mlp':
            try:
                # pylint: disable=import-outside-toplevel
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@ -4,6 +4,7 @@ from transformers import (AutoTokenizer, PreTrainedTokenizer,
                          PreTrainedTokenizerFast)

 from vllm.logger import init_logger
+from vllm.transformers_utils.tokenizers import *

 logger = init_logger(__name__)

@ -61,6 +62,18 @@ def get_tokenizer(
            raise RuntimeError(err_msg) from e
        else:
            raise e
+    except AttributeError as e:
+        if "BaichuanTokenizer" in str(e):
+            # This is for the error "'BaichuanTokenizer' object has no
+            # attribute 'sp_model'".
+            tokenizer = BaichuanTokenizer.from_pretrained(
+                tokenizer_name,
+                *args,
+                trust_remote_code=trust_remote_code,
+                tokenizer_revision=tokenizer_revision,
+                **kwargs)
+        else:
+            raise e

    if not isinstance(tokenizer, PreTrainedTokenizerFast):
        logger.warning(
--- a/vllm/transformers_utils/tokenizers/init.py
+++ b/vllm/transformers_utils/tokenizers/init.py
@ -0,0 +1,5 @@
+from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer
+
+__all__ = [
+    "BaichuanTokenizer",
+]
--- a/vllm/transformers_utils/tokenizers/baichuan.py
+++ b/vllm/transformers_utils/tokenizers/baichuan.py
@ -0,0 +1,263 @@
+# yapf: disable
+# Adapted from
+# https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py
+# This includes a fix suggested in
+# https://github.com/vllm-project/vllm/issues/1403#issuecomment-1767503058
+# Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved.
+
+import os
+from shutil import copyfile
+from typing import Any, Dict, List, Optional, Tuple
+
+import sentencepiece as spm
+from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {},
+    "tokenizer_file": {},
+}
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}
+
+
+class BaichuanTokenizer(PreTrainedTokenizer):
+    """
+    Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token=None,
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
+        add_bos_token=True,
+        add_eos_token=False,
+        clean_up_tokenization_spaces=False,
+        **kwargs,
+    ):
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        bos_token = (
+            AddedToken(bos_token, lstrip=False, rstrip=False)
+            if isinstance(bos_token, str)
+            else bos_token
+        )
+        eos_token = (
+            AddedToken(eos_token, lstrip=False, rstrip=False)
+            if isinstance(eos_token, str)
+            else eos_token
+        )
+        unk_token = (
+            AddedToken(unk_token, lstrip=False, rstrip=False)
+            if isinstance(unk_token, str)
+            else unk_token
+        )
+        pad_token = (
+            AddedToken(pad_token, lstrip=False, rstrip=False)
+            if isinstance(pad_token, str)
+            else pad_token
+        )
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            add_bos_token=add_bos_token,
+            add_eos_token=add_eos_token,
+            sp_model_kwargs=self.sp_model_kwargs,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(self.vocab_file)
+
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self.sp_model.get_piece_size()
+
+    def get_vocab(self):
+        """Returns vocab as a dict"""
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text):
+        """Returns a tokenized string."""
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        return token
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for i, token in enumerate(tokens):
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                if not prev_is_special and i != 0:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string
+
+    def save_vocabulary(
+        self, save_directory, filename_prefix: Optional[str] = None
+    ) -> Tuple[str]:
+        """
+        Save the vocabulary and special tokens file to a directory.
+
+        Args:
+            save_directory (`str`):
+                The directory in which to save the vocabulary.
+
+        Returns:
+            `Tuple(str)`: Paths to the files saved.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory,
+            (filename_prefix + "-" if filename_prefix else "")
+            + VOCAB_FILES_NAMES["vocab_file"],
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(
+            out_vocab_file
+        ) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+        output = bos_token_id + token_ids_0 + eos_token_id
+
+        if token_ids_1 is not None:
+            output = output + bos_token_id + token_ids_1 + eos_token_id
+
+        return output
+
+    def get_special_tokens_mask(
+        self,
+        token_ids_0: List[int],
+        token_ids_1: Optional[List[int]] = None,
+        already_has_special_tokens: bool = False,
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0,
+                token_ids_1=token_ids_1,
+                already_has_special_tokens=True,
+            )
+
+        bos_token_id = [1] if self.add_bos_token else []
+        eos_token_id = [1] if self.add_eos_token else []
+
+        if token_ids_1 is None:
+            return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+        return (
+            bos_token_id
+            + ([0] * len(token_ids_0))
+            + eos_token_id
+            + bos_token_id
+            + ([0] * len(token_ids_1))
+            + eos_token_id
+        )
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
+        sequence pair mask has the following format:
+
+        ```
+        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence |
+        ```
+
+        if token_ids_1 is None, only returns the first portion of the mask (0s).
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of ids.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
+        """
+        bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+        eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+        output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+
+        if token_ids_1 is not None:
+            output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+
+        return output
--- a/vllm/utils.py
+++ b/vllm/utils.py
@ -5,7 +5,7 @@ from platform import uname
 import psutil
 import torch

-from vllm import cuda_utils
+from vllm._C import cuda_utils


 class Device(enum.Enum):
@ -30,7 +30,7 @@ class Counter:
 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
    """Returns the maximum shared memory per thread block in bytes."""
    # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
-    cudaDevAttrMaxSharedMemoryPerBlockOptin = 97  # pylint: disable=invalid-name
+    cudaDevAttrMaxSharedMemoryPerBlockOptin = 97
    max_shared_mem = cuda_utils.get_device_attribute(
        cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu)
    return int(max_shared_mem)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Woosuk Kwon	0f90effc66	Bump up to v0.2.3 (#1903 )	2023-12-03 12:27:47 -08:00
Woosuk Kwon	464dd985e3	Fix num_gpus when TP > 1 (#1852 )	2023-12-03 12:24:30 -08:00
Massimiliano Pronesti	c07a442854	chore(examples-docs): upgrade to OpenAI V1 (#1785 )	2023-12-03 01:11:22 -08:00
Woosuk Kwon	cd3aa153a4	Fix broken worker test (#1900 )	2023-12-02 22:17:33 -08:00
Woosuk Kwon	9b294976a2	Add PyTorch-native implementation of custom layers (#1898 )	2023-12-02 21:18:40 -08:00
Simon Mo	5313c2cb8b	Add Production Metrics in Prometheus format (#1890 )	2023-12-02 16:37:44 -08:00
Woosuk Kwon	5f09cbdb63	Fix broken sampler tests (#1896 ) Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>	2023-12-02 16:06:17 -08:00
Simon Mo	4cefa9b49b	[Docs] Update the AWQ documentation to highlight performance issue (#1883 )	2023-12-02 15:52:47 -08:00
Jerry	f86bd6190a	Fix the typo in SamplingParams' docstring (#1886 )	2023-12-01 02:06:36 -08:00
Woosuk Kwon	e5452ddfd6	Normalize head weights for Baichuan 2 (#1876 )	2023-11-30 20:03:58 -08:00
Woosuk Kwon	d06980dfa7	Fix Baichuan tokenizer error (#1874 )	2023-11-30 18:35:50 -08:00
Adam Brusselback	66785cc05c	Support chat template and `echo` for chat API (#1756 )	2023-11-30 16:43:13 -08:00
Massimiliano Pronesti	05a38612b0	docs: add instruction for langchain (#1162 )	2023-11-30 10:57:44 -08:00
Roy	d27f4bae39	Fix rope cache key error (#1867 )	2023-11-30 08:29:28 -08:00
aisensiy	8d8c2f6ffe	Support max-model-len argument for throughput benchmark (#1858 )	2023-11-30 08:10:24 -08:00
Woosuk Kwon	51d3cb951d	Remove max_num_seqs in latency benchmark script (#1855 )	2023-11-30 00:00:32 -08:00
Woosuk Kwon	e74b1736a1	Add profile option to latency benchmark script (#1839 )	2023-11-29 23:42:52 -08:00
Allen	f07c1ceaa5	[FIX] Fix docker build error (#1831 ) (#1832 ) Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>	2023-11-29 23:06:50 -08:00
Jee Li	63b2206ad0	Avoid multiple instantiations of the RoPE class (#1828 )	2023-11-29 23:06:27 -08:00
Woosuk Kwon	27feead2f8	Refactor Worker & InputMetadata (#1843 )	2023-11-29 22:16:37 -08:00
Michael McCulloch	c782195662	Disable Logs Requests should Disable Logging of requests. (#1779 ) Co-authored-by: Michael McCulloch <mjm.gitlab@fastmail.com>	2023-11-29 21:50:02 -08:00
Simon Mo	0f621c2c7d	[Docs] Add information about using shared memory in docker (#1845 )	2023-11-29 18:33:56 -08:00
Woosuk Kwon	a9e4574261	Refactor Attention (#1840 )	2023-11-29 15:37:31 -08:00
FlorianJoncour	0229c386c5	Better integration with Ray Serve (#1821 ) Co-authored-by: FlorianJoncour <florian@zetta-sys.com>	2023-11-29 13:25:43 -08:00
Woosuk Kwon	a7b3e33078	[Fix] Fix RoPE in ChatGLM-32K (#1841 )	2023-11-29 13:01:19 -08:00
Zhuohan Li	e19a64c7ef	[FIX] Fix formatting error in main branch (#1822 )	2023-11-28 16:56:43 -08:00
Zhuohan Li	1cb4ad8de9	[FIX] Fix formatting error	2023-11-29 00:40:19 +00:00
explainerauthors	6ed068a71a	Use the type BlockTable (#1791 )	2023-11-28 16:34:05 -08:00
Zhuohan Li	708e6c18b0	[FIX] Fix class naming (#1803 )	2023-11-28 14:08:01 -08:00
Woosuk Kwon	b943890484	Fix OPT param names (#1819 )	2023-11-28 11:22:44 -08:00
explainerauthors	a1125ad4df	Correct comments in parallel_state.py (#1818 )	2023-11-28 10:19:35 -08:00
ljss	a8b150c595	Init model on GPU to reduce CPU memory footprint (#1796 )	2023-11-27 11:18:26 -08:00
Yunmo Chen	665cbcec4b	Added echo function to OpenAI API server. (#1504 )	2023-11-26 21:29:17 -08:00
Woosuk Kwon	7c600440f7	Fix model docstrings (#1764 )	2023-11-23 23:04:44 -08:00
Yanming W	e0c6f556e8	[Build] Avoid building too many extensions (#1624 )	2023-11-23 16:31:19 -08:00
ljss	de23687d16	Fix repetition penalty aligned with huggingface (#1577 )	2023-11-22 14:41:44 -08:00
ljss	4cea74c73b	Set top_p=0 and top_k=-1 in greedy sampling (#1748 )	2023-11-22 12:51:09 -08:00
Casper	a921d8be9d	[DOCS] Add engine args documentation (#1741 )	2023-11-22 12:31:27 -08:00
陈序	094f716bf2	Add stop_token_ids in SamplingParams.__repr__ (#1745 )	2023-11-21 20:13:53 -08:00
Zhuohan Li	7d761fe3c1	[FIX] Fix the case when `input_is_parallel=False` for `ScaledActivation` (#1737 )	2023-11-20 23:56:48 -08:00
Woosuk Kwon	cf35d8f3d7	[BugFix] Fix TP support for AWQ (#1731 )	2023-11-20 21:42:45 -08:00
boydfd	4bb6b67188	fix RAM OOM when load large models in tensor parallel mode. (#1395 ) Co-authored-by: ran_lin <rlin@thoughtworks.com>	2023-11-20 19:02:42 -08:00
ljss	819b18e7ba	Rewrite torch.repeat_interleave to remove cpu synchronization (#1599 )	2023-11-20 17:46:32 -08:00
Zhuofan	19849db573	[Fix] Fix bugs in scheduler (#1727 )	2023-11-20 16:10:50 -08:00
陈序	3d4ceb292c	Fix hanging in the scheduler caused by long prompts (#1534 )	2023-11-20 16:06:49 -08:00
Woosuk Kwon	f5a37c6c6c	[BugFix] Fix a bug in loading safetensors (#1732 )	2023-11-20 15:51:18 -08:00
Zhuohan Li	32c927b53f	[FIX] Update the doc link in README.md (#1730 )	2023-11-20 12:46:24 -08:00
Simon Mo	5ffc0d13a2	Migrate linter from `pylint` to `ruff` (#1665 )	2023-11-20 11:58:01 -08:00
Wen Sun	112627e8b2	[Docs] Fix the code block's format in deploying_with_docker page (#1722 )	2023-11-20 01:22:39 -08:00
Simon Mo	37c1e3c218	Documentation about official docker image (#1709 )	2023-11-19 20:56:26 -08:00
Woosuk Kwon	06e9ebebd5	Add instructions to install vLLM+cu118 (#1717 )	2023-11-18 23:48:58 -08:00