Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)

Compare commits: 76 commits
| SHA1 |
|---|
| 37cf1f27f2 |
| 45ea3c31a2 |
| df866cfebf |
| 29596317b0 |
| 7b86860ff5 |
| f386a9e56c |
| efe73d0575 |
| 853487bc1b |
| 9ff2af6d2b |
| 70ca5484f5 |
| 5358cce5ff |
| 2155e95ef1 |
| f95570a52d |
| b6e7e3d58f |
| e760fcef22 |
| 6bbf1795b7 |
| 9e0ef888f0 |
| 97abeb1daa |
| 34dad19e7b |
| 6db31e7a27 |
| 977180c912 |
| c40784c794 |
| baed180aa0 |
| 0b407479ef |
| 5eaf570050 |
| d8ee5a2ca4 |
| b9fca83256 |
| 32dffc2772 |
| c438183e99 |
| baba0389f7 |
| c6c22f16d3 |
| dd382e0fe3 |
| 849590a2a7 |
| a4c23314c0 |
| b942c094e3 |
| b4bab81660 |
| b91cb3fa5c |
| 71d1d75b7a |
| 72d14d0eed |
| e34d130c16 |
| 7721ef1786 |
| 8369b7c2a9 |
| 3eb4ad53f3 |
| 90a2769f20 |
| e60d422f19 |
| 0d914c81a2 |
| 6e428cdd7a |
| 93b9d9f499 |
| af107d5a0e |
| 31c5d0a1b7 |
| afb7cff1b9 |
| d2e841a10a |
| 14601f5fba |
| 042d131f39 |
| 8e807cdfa4 |
| e601efcb10 |
| 22dd9c2730 |
| a6d795d593 |
| a37d75bbec |
| edd270bc78 |
| 110df74332 |
| 1ad69e8375 |
| b8a498c9b2 |
| 923147b5e8 |
| 45877ef740 |
| 6e4bef1bea |
| 4ff79a136e |
| 448acad31e |
| eb0b2d2f08 |
| 3112271f6e |
| 1fd471e957 |
| 2c5ebec064 |
| 2e610deb72 |
| 6e2c19ce22 |
| 47db8c2c15 |
| 462b269280 |
@@ -48,10 +48,16 @@ function cpu_tests() {
   # Run basic model test
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
-    pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
-    pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
-    pytest -v -s tests/models/language/generation -m cpu_model
-    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model
+    # Note: disable until supports V1
+    # pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+    # pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+
+    # Note: disable Bart until supports V1
+    pytest -v -s tests/models/language/generation -m cpu_model \
+      --ignore=tests/models/language/generation/test_bart.py
+    VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
+      --ignore=tests/models/language/generation/test_bart.py
+
     pytest -v -s tests/models/language/pooling -m cpu_model
     pytest -v -s tests/models/multimodal/generation \
       --ignore=tests/models/multimodal/generation/test_mllama.py \

@@ -62,21 +68,15 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

+  # Note: disable it until supports V1
   # Run AWQ test
   # docker exec cpu-test-"$NUMA_NODE" bash -c "
   #   set -e
   #   VLLM_USE_V1=0 pytest -s -v \
   #   tests/quantization/test_ipex_quant.py"

-  # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
-    set -e
-    pytest -s -v -k cpu_model \
-      tests/basic_correctness/test_chunked_prefill.py"
-
   # online serving
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
@@ -11,8 +11,8 @@ container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head
 docker build -t ${image_name} -f docker/Dockerfile.xpu .

 # Setup cleanup
 remove_docker_container() {
   docker rm -f "${container_name}" || true;
   docker image rm -f "${image_name}" || true;
   docker system prune -f || true;
 }

@@ -27,4 +27,8 @@ docker run \
   "${image_name}" \
   sh -c '
     VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+    cd tests
+    pytest -v -s v1/core
   '
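For reference, the tensor-parallel runs added above can also be exercised through the offline Python API. The sketch below is illustrative only: it assumes the `LLM` constructor arguments `tensor_parallel_size`, `distributed_executor_backend`, `block_size`, and `enforce_eager` mirror the CLI flags passed to `generate.py`; it is not a copy of that example script.

```python
from vllm import LLM, SamplingParams

# Roughly equivalent to:
#   generate.py --model facebook/opt-125m --block-size 64 --enforce-eager \
#               -tp 2 --distributed-executor-backend mp
llm = LLM(
    model="facebook/opt-125m",
    block_size=64,
    enforce_eager=True,
    tensor_parallel_size=2,             # -tp 2
    distributed_executor_backend="mp",  # or "ray"
)

outputs = llm.generate(
    ["Hello, my name is"],
    SamplingParams(temperature=0.8, max_tokens=32),
)
for out in outputs:
    print(out.outputs[0].text)
```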
@@ -170,7 +170,7 @@ repos:
       # Keep `suggestion` last
       - id: suggestion
         name: Suggestion
-        entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
+        entry: bash -c 'echo "To bypass all the pre-commit hooks, add --no-verify to git commit. To skip a specific hook, prefix the commit command with SKIP=<hook-id>."'
         language: system
         verbose: true
         pass_filenames: false
@@ -701,6 +701,7 @@ class HuggingFaceDataset(BenchmarkDataset):
         self,
         dataset_path: str,
         dataset_split: str,
+        no_stream: bool = False,
         dataset_subset: Optional[str] = None,
         **kwargs,
     ) -> None:

@@ -708,6 +709,7 @@ class HuggingFaceDataset(BenchmarkDataset):

         self.dataset_split = dataset_split
         self.dataset_subset = dataset_subset
+        self.load_stream = not no_stream
         self.load_data()

     def load_data(self) -> None:

@@ -716,7 +718,7 @@ class HuggingFaceDataset(BenchmarkDataset):
             self.dataset_path,
             name=self.dataset_subset,
             split=self.dataset_split,
-            streaming=True,
+            streaming=self.load_stream,
         )
         self.data = self.data.shuffle(seed=self.random_seed)
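Taken together, the three hunks above make streaming optional when loading Hugging Face datasets. A minimal, self-contained sketch of that behavior is shown below; it assumes the `datasets` package, and the class name and dataset id are placeholders rather than the real benchmark code.

```python
from typing import Optional

from datasets import load_dataset  # Hugging Face `datasets` package


class HFDatasetSketch:
    """Minimal stand-in for the benchmark's HuggingFaceDataset."""

    def __init__(
        self,
        dataset_path: str,
        dataset_split: str,
        no_stream: bool = False,
        dataset_subset: Optional[str] = None,
    ) -> None:
        self.dataset_path = dataset_path
        self.dataset_split = dataset_split
        self.dataset_subset = dataset_subset
        # Streaming stays the default; --no-stream flips it off.
        self.load_stream = not no_stream

    def load_data(self):
        return load_dataset(
            self.dataset_path,
            name=self.dataset_subset,
            split=self.dataset_split,
            streaming=self.load_stream,  # was hard-coded to True before
        )


# Download the whole split up front instead of streaming it lazily:
ds = HFDatasetSketch("org/some-hf-dataset", "train", no_stream=True).load_data()
```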
@@ -825,6 +825,7 @@ def main(args: argparse.Namespace):
         dataset_subset=args.hf_subset,
         dataset_split=args.hf_split,
         random_seed=args.seed,
+        no_stream=args.no_stream,
     ).sample(
         num_requests=args.num_prompts,
         tokenizer=tokenizer,

@@ -1033,6 +1034,11 @@ def create_argument_parser():
         help="Path to the sharegpt/sonnet dataset. "
         "Or the huggingface dataset ID if using HF dataset.",
     )
+    parser.add_argument(
+        "--no-stream",
+        action="store_true",
+        help="Do not load the dataset in streaming mode.",
+    )
     parser.add_argument(
         "--max-concurrency",
         type=int,
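A small standard-library sketch of how the new flag is wired: `--no-stream` is a `store_true` argument, so it defaults to `False` and only disables streaming when explicitly passed. Everything except the flag name itself is illustrative.

```python
import argparse

parser = argparse.ArgumentParser(description="benchmark argument sketch")
parser.add_argument(
    "--no-stream",
    action="store_true",
    help="Do not load the dataset in streaming mode.",
)

args = parser.parse_args(["--no-stream"])
# The dataset constructor receives no_stream=True, so it sets
# load_stream = not no_stream = False and loads eagerly.
print(args.no_stream)  # True
```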
@@ -356,6 +356,7 @@ def get_requests(args, tokenizer):
     elif args.dataset_name == "burstgpt":
         dataset_cls = BurstGPTDataset
     elif args.dataset_name == "hf":
+        common_kwargs["no_stream"] = args.no_stream
         if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
             dataset_cls = VisionArenaDataset
             common_kwargs["dataset_subset"] = None

@@ -610,6 +611,11 @@ def create_argument_parser():
         help="Name of the dataset to benchmark on.",
         default="sharegpt",
     )
+    parser.add_argument(
+        "--no-stream",
+        action="store_true",
+        help="Do not load the dataset in streaming mode.",
+    )
     parser.add_argument(
         "--dataset",
         type=str,
@@ -153,7 +153,7 @@ struct ScaledEpilogueBias
       cutlass::epilogue::threadblock::Sm80EVT<Compute0, ScaleB, Accum>;

   using Compute1 = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;

  public:

@@ -210,7 +210,7 @@ struct ScaledEpilogueBiasAzp
       EVTComputeAzp>;

   using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;

  public:

@@ -288,7 +288,7 @@ struct ScaledEpilogueBiasAzpToken
       EVTComputeAcc>;

   using ComputeScaleBiasA = cutlass::epilogue::threadblock::VisitorCompute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;

  public:
@@ -195,7 +195,7 @@ struct ScaledEpilogueBias
       cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;

   using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;

  public:

@@ -238,7 +238,7 @@ struct ScaledEpilogueColumnBias
       cutlass::epilogue::fusion::Sm90EVT<Compute0, ScaleB, Accum>;

   using Compute1 = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;

  public:

@@ -295,7 +295,7 @@ struct ScaledEpilogueBiasAzp
       cutlass::epilogue::fusion::Sm90EVT<ComputeScaleB, ScaleB, EVTComputeAzp>;

   using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;

  public:

@@ -371,7 +371,7 @@ struct ScaledEpilogueBiasAzpToken
       cutlass::epilogue::fusion::Sm90EVT<ComputeScaleB, ScaleB, EVTComputeAcc>;

   using ComputeScaleBiasA = cutlass::epilogue::fusion::Sm90Compute<
-      cutlass::multiply_add, ElementD, float,
+      cutlass::homogeneous_multiply_add, ElementD, float,
       cutlass::FloatRoundStyle::round_to_nearest>;

  public:
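These epilogue structs fuse dequantization scales and an optional bias into the GEMM epilogue; the only change in the hunks above is swapping the compute functor from `cutlass::multiply_add` to `cutlass::homogeneous_multiply_add`. As a rough reference, the NumPy sketch below shows the scale-and-bias math suggested by the type names (per-token and per-channel scales applied to the int32 accumulator, plus a bias). It is an interpretation, not taken from the CUTLASS code, and the functor swap is assumed to leave this math unchanged.

```python
from typing import Optional

import numpy as np


def scaled_epilogue_bias(
    acc: np.ndarray,            # int32 GEMM accumulator, shape (M, N)
    a_scales: np.ndarray,       # per-token scales, shape (M, 1) or (1, 1)
    b_scales: np.ndarray,       # per-channel scales, shape (1, N) or (1, 1)
    bias: Optional[np.ndarray] = None,  # optional row bias, shape (N,)
) -> np.ndarray:
    # Inner node scales by B, outer multiply-add applies the A scale and bias.
    out = a_scales * (b_scales * acc.astype(np.float32))
    if bias is not None:
        out = out + bias
    return out


acc = np.random.randint(-128, 128, size=(4, 8)).astype(np.int32)
out = scaled_epilogue_bias(
    acc,
    a_scales=np.full((4, 1), 0.02, dtype=np.float32),
    b_scales=np.full((1, 8), 0.05, dtype=np.float32),
    bias=np.zeros(8, dtype=np.float32),
)
print(out.shape)  # (4, 8)
```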
@@ -7,7 +7,7 @@

 constexpr uint64_t THREADS_PER_EXPERT = 512;

-__global__ void compute_problem_sizes(const uint32_t* __restrict__ topk_ids,
+__global__ void compute_problem_sizes(const int32_t* __restrict__ topk_ids,
                                       int32_t* problem_sizes1,
                                       int32_t* problem_sizes2,
                                       int32_t* atomic_buffer,

@@ -62,7 +62,7 @@ __global__ void compute_expert_blockscale_offsets(
   }
 }

-__global__ void compute_arg_sorts(const uint32_t* __restrict__ topk_ids,
+__global__ void compute_arg_sorts(const int32_t* __restrict__ topk_ids,
                                   const int32_t* __restrict__ expert_offsets,
                                   int32_t* input_permutation,
                                   int32_t* output_permutation,

@@ -103,7 +103,7 @@ void get_cutlass_moe_mm_data_caller(

   int num_threads = min(THREADS_PER_EXPERT, topk_ids.numel());
   compute_problem_sizes<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const uint32_t*>(topk_ids.data_ptr()),
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
       static_cast<int32_t*>(problem_sizes1.data_ptr()),
       static_cast<int32_t*>(problem_sizes2.data_ptr()),
       static_cast<int32_t*>(atomic_buffer.data_ptr()), topk_ids.numel(), n, k);

@@ -120,7 +120,7 @@ void get_cutlass_moe_mm_data_caller(
       static_cast<int32_t*>(atomic_buffer.data_ptr()), num_experts);
   }
   compute_arg_sorts<<<num_experts, num_threads, 0, stream>>>(
-      static_cast<const uint32_t*>(topk_ids.data_ptr()),
+      static_cast<const int32_t*>(topk_ids.data_ptr()),
       static_cast<const int32_t*>(expert_offsets.data_ptr()),
       static_cast<int32_t*>(input_permutation.data_ptr()),
       static_cast<int32_t*>(output_permutation.data_ptr()),
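The dtype change above means `topk_ids` (the per-token expert routing table) is now consumed as `int32` throughout these kernels. As a rough illustration of the bookkeeping `compute_problem_sizes` performs, the Python sketch below counts how many routed tokens each expert receives; it is inferred from the kernel names and arguments rather than the CUDA source, and the `[m, n, k]` row layout is an assumption made for the example.

```python
import torch


def compute_problem_sizes_reference(
    topk_ids: torch.Tensor,  # int32, shape (num_tokens, top_k)
    num_experts: int,
    n: int,
    k: int,
) -> torch.Tensor:
    """Per-expert GEMM problem sizes implied by the routing table."""
    assert topk_ids.dtype == torch.int32  # matches the new kernel signature
    counts = torch.bincount(topk_ids.flatten().long(), minlength=num_experts)
    # Assumed [m, n, k] layout: m = number of tokens routed to the expert.
    return torch.stack(
        [counts, torch.full_like(counts, n), torch.full_like(counts, k)],
        dim=1,
    ).to(torch.int32)


topk_ids = torch.randint(0, 4, (16, 2), dtype=torch.int32)
print(compute_problem_sizes_reference(topk_ids, num_experts=4, n=512, k=256))
```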
@@ -47,7 +47,7 @@ FROM vllm-base AS vllm-openai

 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate hf_transfer 'modelscope!=1.15.0'
+    pip install accelerate hf_transfer pytest 'modelscope!=1.15.0'

 ENV VLLM_USAGE_SOURCE production-docker-image \
     TRITON_XPU_PROFILE 1
@@ -55,6 +55,7 @@ nav:
     - contributing/model/registration.md
     - contributing/model/tests.md
    - contributing/model/multimodal.md
+  - CI: contributing/ci
  - Design Documents:
    - V0: design
    - V1: design/v1
@@ -48,4 +48,4 @@ For more information, check out the following:
 - [vLLM announcing blog post](https://vllm.ai) (intro to PagedAttention)
 - [vLLM paper](https://arxiv.org/abs/2309.06180) (SOSP 2023)
 - [How continuous batching enables 23x throughput in LLM inference while reducing p50 latency](https://www.anyscale.com/blog/continuous-batching-llm-inference) by Cade Daniel et al.
-- [vLLM Meetups][meetups]
+- [vLLM Meetups](community/meetups.md)
@@ -64,7 +64,7 @@ vLLM provides experimental support for multi-modal models through the [vllm.mult
 Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models]
 via the `multi_modal_data` field in [vllm.inputs.PromptType][].

-Looking to add your own multi-modal model? Please follow the instructions listed [here][supports-multimodal].
+Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md).

 - [vllm.multimodal.MULTIMODAL_REGISTRY][]

@@ -16,7 +16,7 @@ vllm {chat,complete,serve,bench,collect-env,run-batch}

 Start the vLLM OpenAI Compatible API server.

-??? Examples
+??? console "Examples"

     ```bash
     # Start with a model
@@ -1,6 +1,3 @@
----
-title: Contact Us
----
-[](){ #contactus }
+# Contact Us

 --8<-- "README.md:contact-us"
@@ -1,7 +1,4 @@
----
-title: Meetups
----
-[](){ #meetups }
+# Meetups

 We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:

@@ -19,7 +19,20 @@ llm = LLM(model="ibm-granite/granite-3.1-8b-instruct",
 To ensure that vLLM initializes CUDA correctly, you should avoid calling related functions (e.g. [torch.cuda.set_device][])
 before initializing vLLM. Otherwise, you may run into an error like `RuntimeError: Cannot re-initialize CUDA in forked subprocess`.

-To control which devices are used, please instead set the `CUDA_VISIBLE_DEVICES` environment variable.
+To control which devices are used, you can either set the `CUDA_VISIBLE_DEVICES`
+environment variable, pass the `gpu_ids` parameter to the [LLM] constructor,
+or use the `--gpu-ids` option with `vllm serve`.
+
+```python
+from vllm import LLM
+
+# Use GPUs 0 and 2 for execution without setting CUDA_VISIBLE_DEVICES env var
+llm = LLM(
+    model="your-model",
+    gpu_ids=[0, 2],
+)
+```

 !!! note
     With tensor parallelism enabled, each process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism).

@@ -33,7 +46,7 @@ Quantized models take less memory at the cost of lower precision.
 Statically quantized models can be downloaded from HF Hub (some popular ones are available at [Red Hat AI](https://huggingface.co/RedHatAI))
 and used directly without extra configuration.

-Dynamic quantization is also supported via the `quantization` option -- see [here][quantization-index] for more details.
+Dynamic quantization is also supported via the `quantization` option -- see [here](../features/quantization/README.md) for more details.

 ## Context length and batch size

@@ -57,7 +70,7 @@ By default, we optimize model inference using CUDA graphs which take up extra me

 You can adjust `compilation_config` to achieve a better balance between inference speed and memory usage:

-??? Code
+??? code

     ```python
     from vllm import LLM

@@ -129,7 +142,7 @@ reduce the size of the processed multi-modal inputs, which in turn saves memory.

 Here are some examples:

-??? Code
+??? code

     ```python
     from vllm import LLM
@@ -1,12 +1,9 @@
----
-title: Engine Arguments
----
-[](){ #engine-args }
+# Engine Arguments

 Engine arguments control the behavior of the vLLM engine.

-- For [offline inference][offline-inference], they are part of the arguments to [LLM][vllm.LLM] class.
-- For [online serving][serving-openai-compatible-server], they are part of the arguments to `vllm serve`.
+- For [offline inference](../serving/offline_inference.md), they are part of the arguments to [LLM][vllm.LLM] class.
+- For [online serving](../serving/openai_compatible_server.md), they are part of the arguments to `vllm serve`.

 You can look at [EngineArgs][vllm.engine.arg_utils.EngineArgs] and [AsyncEngineArgs][vllm.engine.arg_utils.AsyncEngineArgs] to see the available engine arguments.

@@ -7,7 +7,7 @@ vLLM uses the following environment variables to configure the system:

 All environment variables used by vLLM are prefixed with `VLLM_`. **Special care should be taken for Kubernetes users**: please do not name the service as `vllm`, otherwise environment variables set by Kubernetes might conflict with vLLM's environment variables, because [Kubernetes sets environment variables for each service with the capitalized service name as the prefix](https://kubernetes.io/docs/concepts/services-networking/service/#environment-variables).

-??? Code
+??? code

     ```python
     --8<-- "vllm/envs.py:env-vars-definition"
@@ -20,4 +20,4 @@ model = LLM(
 )
 ```

-Our [list of supported models][supported-models] shows the model architectures that are recognized by vLLM.
+Our [list of supported models](../models/supported_models.md) shows the model architectures that are recognized by vLLM.
@@ -1,7 +1,4 @@
----
-title: Server Arguments
----
-[](){ #serve-args }
+# Server Arguments

 The `vllm serve` command is used to launch the OpenAI-compatible server.


@@ -13,7 +10,7 @@ To see the available CLI arguments, run `vllm serve --help`!
 ## Configuration file

 You can load CLI arguments via a [YAML](https://yaml.org/) config file.
-The argument names must be the long form of those outlined [above][serve-args].
+The argument names must be the long form of those outlined [above](serve_args.md).

 For example:

@@ -95,7 +95,7 @@ For additional features and advanced configurations, refer to the official [MkDo

 ## Testing

-??? note "Commands"
+??? console "Commands"

     ```bash
     pip install -r requirements/dev.txt
@@ -1,7 +1,4 @@
----
-title: Benchmark Suites
----
-[](){ #benchmarks }
+# Benchmark Suites

 vLLM contains two sets of benchmarks:

@@ -6,9 +6,9 @@ the failure?
 - Check the dashboard of current CI test failures:
   👉 [CI Failures Dashboard](https://github.com/orgs/vllm-project/projects/20)

 - If your failure **is already listed**, it's likely unrelated to your PR.
   Help fixing it is always welcome!
   - Leave comments with links to additional instances of the failure.
   - React with a 👍 to signal how many are affected.

 - If your failure **is not listed**, you should **file an issue**.

@@ -19,25 +19,25 @@ the failure?
   👉 [New CI Failure Report](https://github.com/vllm-project/vllm/issues/new?template=450-ci-failure.yml)

 - **Use this title format:**

     ```
     [CI Failure]: failing-test-job - regex/matching/failing:test
     ```

 - **For the environment field:**

     ```
     Still failing on main as of commit abcdef123
     ```

 - **In the description, include failing tests:**

     ```
     FAILED failing/test.py:failing_test1 - Failure description
     FAILED failing/test.py:failing_test2 - Failure description
     https://github.com/orgs/vllm-project/projects/20
     https://github.com/vllm-project/vllm/issues/new?template=400-bug-report.yml
     FAILED failing/test.py:failing_test3 - Failure description
     ```

 - **Attach logs** (collapsible section example):

@@ -45,17 +45,17 @@ the failure?
     <summary>Logs:</summary>

     ```text
     ERROR 05-20 03:26:38 [dump_input.py:68] Dumping input data
     --- Logging error ---
     Traceback (most recent call last):
       File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 203, in execute_model
         return self.model_executor.execute_model(scheduler_output)
     ...
     FAILED failing/test.py:failing_test1 - Failure description
     FAILED failing/test.py:failing_test2 - Failure description
     FAILED failing/test.py:failing_test3 - Failure description
     ```

     </details>

 ## Logs Wrangling

@@ -78,7 +78,7 @@ tail -525 ci_build.log | wl-copy

 ## Investigating a CI Test Failure

 1. Go to 👉 [Buildkite main branch](https://buildkite.com/vllm/ci/builds?branch=main)
 2. Bisect to find the first build that shows the issue.
 3. Add your findings to the GitHub issue.
 4. If you find a strong candidate PR, mention it in the issue and ping contributors.

@@ -97,9 +97,9 @@ CI test failures may be flaky. Use a bash loop to run repeatedly:

 If you submit a PR to fix a CI failure:

 - Link the PR to the issue:
   Add `Closes #12345` to the PR description.
 - Add the `ci-failure` label:
   This helps track it in the [CI Failures GitHub Project](https://github.com/orgs/vllm-project/projects/20).

 ## Other Resources
@@ -1,15 +1,12 @@
----
-title: Update PyTorch version on vLLM OSS CI/CD
----
+# Update PyTorch version on vLLM OSS CI/CD

 vLLM's current policy is to always use the latest PyTorch stable
 release in CI/CD. It is standard practice to submit a PR to update the
 PyTorch version as early as possible when a new [PyTorch stable
 release](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-cadence) becomes available.
 This process is non-trivial due to the gap between PyTorch
-releases. Using [#16859](https://github.com/vllm-project/vllm/pull/16859) as
-an example, this document outlines common steps to achieve this update along with
-a list of potential issues and how to address them.
+releases. Using <gh-pr:16859> as an example, this document outlines common steps to achieve this
+update along with a list of potential issues and how to address them.

 ## Test PyTorch release candidates (RCs)

@@ -19,11 +16,12 @@ by waiting for the next release or by implementing hacky workarounds in vLLM.
 The better solution is to test vLLM with PyTorch release candidates (RC) to ensure
 compatibility before each release.

-PyTorch release candidates can be downloaded from PyTorch test index at https://download.pytorch.org/whl/test.
-For example, torch2.7.0+cu12.8 RC can be installed using the following command:
+PyTorch release candidates can be downloaded from [PyTorch test index](https://download.pytorch.org/whl/test).
+For example, `torch2.7.0+cu12.8` RC can be installed using the following command:

-```
-uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/test/cu128
+```bash
+uv pip install torch torchvision torchaudio \
+    --index-url https://download.pytorch.org/whl/test/cu128
 ```

 When the final RC is ready for testing, it will be announced to the community

@@ -31,13 +29,28 @@ on the [PyTorch dev-discuss forum](https://dev-discuss.pytorch.org/c/release-ann
 After this announcement, we can begin testing vLLM integration by drafting a pull request
 following this 3-step process:

-1. Update requirements files in https://github.com/vllm-project/vllm/tree/main/requirements
-to point to the new releases for torch, torchvision, and torchaudio.
-2. Use `--extra-index-url https://download.pytorch.org/whl/test/<PLATFORM>` to
-get the final release candidates' wheels. Some common platforms are `cpu`, `cu128`,
-and `rocm6.2.4`.
-3. As vLLM uses uv, make sure that `unsafe-best-match` strategy is set either
-via `UV_INDEX_STRATEGY` env variable or via `--index-strategy unsafe-best-match`.
+1. Update [requirements files](https://github.com/vllm-project/vllm/tree/main/requirements)
+   to point to the new releases for `torch`, `torchvision`, and `torchaudio`.
+
+2. Use the following option to get the final release candidates' wheels. Some common platforms are `cpu`, `cu128`, and `rocm6.2.4`.
+
+    ```bash
+    --extra-index-url https://download.pytorch.org/whl/test/<PLATFORM>
+    ```
+
+3. Since vLLM uses `uv`, ensure the following index strategy is applied:
+
+    - Via environment variable:
+
+      ```bash
+      export UV_INDEX_STRATEGY=unsafe-best-match
+      ```
+
+    - Or via CLI flag:
+
+      ```bash
+      --index-strategy unsafe-best-match
+      ```

 If failures are found in the pull request, raise them as issues on vLLM and
 cc the PyTorch release team to initiate discussion on how to address them.

@@ -45,20 +58,25 @@ cc the PyTorch release team to initiate discussion on how to address them.
 ## Update CUDA version

 The PyTorch release matrix includes both stable and experimental [CUDA versions](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix). Due to limitations, only the latest stable CUDA version (for example,
-torch2.7.0+cu12.6) is uploaded to PyPI. However, vLLM may require a different CUDA version,
+`torch2.7.0+cu12.6`) is uploaded to PyPI. However, vLLM may require a different CUDA version,
 such as 12.8 for Blackwell support.
 This complicates the process as we cannot use the out-of-the-box
 `pip install torch torchvision torchaudio` command. The solution is to use
 `--extra-index-url` in vLLM's Dockerfiles.

-1. Use `--extra-index-url https://download.pytorch.org/whl/cu128` to install torch+cu128.
-2. Other important indexes at the moment include:
-    1. CPU ‒ https://download.pytorch.org/whl/cpu
-    2. ROCm ‒ https://download.pytorch.org/whl/rocm6.2.4 and https://download.pytorch.org/whl/rocm6.3
-    3. XPU ‒ https://download.pytorch.org/whl/xpu
-3. Update .buildkite/release-pipeline.yaml and .buildkite/scripts/upload-wheels.sh to
-match the CUDA version from step 1. This makes sure that the release vLLM wheel is tested
-on CI.
+- Important indexes at the moment include:
+
+  | Platform | `--extra-index-url` |
+  |----------|-----------------|
+  | CUDA 12.8| [https://download.pytorch.org/whl/cu128](https://download.pytorch.org/whl/cu128)|
+  | CPU | [https://download.pytorch.org/whl/cpu](https://download.pytorch.org/whl/cpu)|
+  | ROCm 6.2 | [https://download.pytorch.org/whl/rocm6.2.4](https://download.pytorch.org/whl/rocm6.2.4) |
+  | ROCm 6.3 | [https://download.pytorch.org/whl/rocm6.3](https://download.pytorch.org/whl/rocm6.3) |
+  | XPU | [https://download.pytorch.org/whl/xpu](https://download.pytorch.org/whl/xpu) |
+
+- Update the below files to match the CUDA version from step 1. This makes sure that the release vLLM wheel is tested on CI.
+  - `.buildkite/release-pipeline.yaml`
+  - `.buildkite/scripts/upload-wheels.sh`

 ## Address long vLLM build time

@@ -68,8 +86,8 @@ and timeout. Additionally, since vLLM's fastcheck pipeline runs in read-only mod
 it doesn't populate the cache, so re-running it to warm up the cache
 is ineffective.

-While ongoing efforts like [#17419](https://github.com/vllm-project/vllm/issues/17419)
-address the long build time at its source, the current workaround is to set VLLM_CI_BRANCH
+While ongoing efforts like [#17419](gh-issue:17419)
+address the long build time at its source, the current workaround is to set `VLLM_CI_BRANCH`
 to a custom branch provided by @khluu (`VLLM_CI_BRANCH=khluu/use_postmerge_q`)
 when manually triggering a build on Buildkite. This branch accomplishes two things:

@@ -89,17 +107,18 @@ releases (which would take too much time), they can be built from
 source to unblock the update process.

 ### FlashInfer
-Here is how to build and install it from source with torch2.7.0+cu128 in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):
+Here is how to build and install it from source with `torch2.7.0+cu128` in vLLM [Dockerfile](https://github.com/vllm-project/vllm/blob/27bebcd89792d5c4b08af7a65095759526f2f9e1/docker/Dockerfile#L259-L271):

 ```bash
 export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'
 export FLASHINFER_ENABLE_SM90=1
-uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
+uv pip install --system \
+    --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@v0.2.6.post1"
 ```

 One caveat is that building FlashInfer from source adds approximately 30
 minutes to the vLLM build time. Therefore, it's preferable to cache the wheel in a
-public location for immediate installation, such as https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl. For future releases, contact the PyTorch release
+public location for immediate installation, such as [this FlashInfer wheel link](https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.6.post1%2Bcu128torch2.7-cp39-abi3-linux_x86_64.whl). For future releases, contact the PyTorch release
 team if you want to get the package published there.

 ### xFormers

@@ -107,13 +126,15 @@ Similar to FlashInfer, here is how to build and install xFormers from source:

 ```bash
 export TORCH_CUDA_ARCH_LIST='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
-MAX_JOBS=16 uv pip install --system --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
+MAX_JOBS=16 uv pip install --system \
+    --no-build-isolation "git+https://github.com/facebookresearch/xformers@v0.0.30"
 ```

 ### Mamba

 ```bash
-uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
+uv pip install --system \
+    --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
 ```

 ### causal-conv1d

@@ -128,7 +149,6 @@ Rather than attempting to update all vLLM platforms in a single pull request, it
 to handle some platforms separately. The separation of requirements and Dockerfiles
 for different platforms in vLLM CI/CD allows us to selectively choose
 which platforms to update. For instance, updating XPU requires the corresponding
-release from https://github.com/intel/intel-extension-for-pytorch by Intel.
-While https://github.com/vllm-project/vllm/pull/16859 updated vLLM to PyTorch
-2.7.0 on CPU, CUDA, and ROCm, https://github.com/vllm-project/vllm/pull/17444
-completed the update for XPU.
+release from [Intel Extension for PyTorch](https://github.com/intel/intel-extension-for-pytorch) by Intel.
+While <gh-pr:16859> updated vLLM to PyTorch 2.7.0 on CPU, CUDA, and ROCm,
+<gh-pr:17444> completed the update for XPU.
@@ -1,7 +1,7 @@
 # Dockerfile

 We provide a <gh-file:docker/Dockerfile> to construct the image for running an OpenAI compatible server with vLLM.
-More information about deploying with Docker can be found [here][deployment-docker].
+More information about deploying with Docker can be found [here](../../deployment/docker.md).

 Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:

@@ -84,6 +84,7 @@ Below is an example of what the generated `CMakeUserPresets.json` might look lik
 ```
+
 **What do the various configurations mean?**

 - `CMAKE_CUDA_COMPILER`: Path to your `nvcc` binary. The script attempts to find this automatically.
 - `CMAKE_C_COMPILER_LAUNCHER`, `CMAKE_CXX_COMPILER_LAUNCHER`, `CMAKE_CUDA_COMPILER_LAUNCHER`: Setting these to `ccache` (or `sccache`) significantly speeds up rebuilds by caching compilation results. Ensure `ccache` is installed (e.g., `sudo apt install ccache` or `conda install ccache`). The script sets these by default.
 - `VLLM_PYTHON_EXECUTABLE`: Path to the Python executable in your vLLM development environment. The script will prompt for this, defaulting to the current Python environment if suitable.
@@ -1,12 +1,9 @@
----
-title: Summary
----
-[](){ #new-model }
+# Summary

 !!! important
     Many decoder language models can now be automatically loaded using the [Transformers backend][transformers-backend] without having to implement them in vLLM. See if `vllm serve <model>` works first!

-vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features][compatibility-matrix] to optimize their performance.
+vLLM models are specialized [PyTorch](https://pytorch.org/) models that take advantage of various [features](../../features/compatibility_matrix.md) to optimize their performance.

 The complexity of integrating a model into vLLM depends heavily on the model's architecture.
 The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
@@ -1,7 +1,4 @@
----
-title: Basic Model
----
-[](){ #new-model-basic }
+# Basic Model

 This guide walks you through the steps to implement a basic vLLM model.


@@ -27,7 +24,7 @@ All vLLM modules within the model must include a `prefix` argument in their cons

 The initialization code should look like this:

-??? Code
+??? code

     ```python
     from torch import nn

@@ -108,7 +105,7 @@ This method should load the weights from the HuggingFace's checkpoint file and a

 ## 5. Register your model

-See [this page][new-model-registration] for instructions on how to register your new model to be used by vLLM.
+See [this page](registration.md) for instructions on how to register your new model to be used by vLLM.

 ## Frequently Asked Questions

@@ -1,18 +1,15 @@
----
-title: Multi-Modal Support
----
-[](){ #supports-multimodal }
+# Multi-Modal Support

-This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs][multimodal-inputs].
+This document walks you through the steps to extend a basic model so that it accepts [multi-modal inputs](../../features/multimodal_inputs.md).

 ## 1. Update the base vLLM model

-It is assumed that you have already implemented the model in vLLM according to [these steps][new-model-basic].
+It is assumed that you have already implemented the model in vLLM according to [these steps](basic.md).
 Further update the model as follows:

 - Implement [get_placeholder_str][vllm.model_executor.models.interfaces.SupportsMultiModal.get_placeholder_str] to define the placeholder string which is used to represent the multi-modal item in the text prompt. This should be consistent with the chat template of the model.

-    ??? Code
+    ??? code

         ```python
         class YourModelForImage2Seq(nn.Module):

@@ -41,7 +38,7 @@ Further update the model as follows:

 - Implement [get_multimodal_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_multimodal_embeddings] that returns the embeddings from running the multimodal inputs through the multimodal tokenizer of the model. Below we provide a boilerplate of a typical implementation pattern, but feel free to adjust it to your own needs.

-    ??? Code
+    ??? code

         ```python
         class YourModelForImage2Seq(nn.Module):

@@ -71,7 +68,7 @@ Further update the model as follows:

 - Implement [get_input_embeddings][vllm.model_executor.models.interfaces.SupportsMultiModal.get_input_embeddings] to merge `multimodal_embeddings` with text embeddings from the `input_ids`. If input processing for the model is implemented correctly (see sections below), then you can leverage the utility function we provide to easily merge the embeddings.

-    ??? Code
+    ??? code

         ```python
         from .utils import merge_multimodal_embeddings

@@ -155,7 +152,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in

     Looking at the code of HF's `LlavaForConditionalGeneration`:

-    ??? Code
+    ??? code

         ```python
         # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L530-L544

@@ -179,7 +176,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
     The number of placeholder feature tokens per image is `image_features.shape[1]`.
     `image_features` is calculated inside the `get_image_features` method:

-    ??? Code
+    ??? code

         ```python
         # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/llava/modeling_llava.py#L290-L300

@@ -217,7 +214,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in

     To find the sequence length, we turn to the code of `CLIPVisionEmbeddings`:

-    ??? Code
+    ??? code

         ```python
         # https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/clip/modeling_clip.py#L247-L257

@@ -244,7 +241,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in

     Overall, the number of placeholder feature tokens for an image can be calculated as:

-    ??? Code
+    ??? code

         ```python
         def get_num_image_tokens(

@@ -269,7 +266,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
     Notice that the number of image tokens doesn't depend on the image width and height.
     We can simply use a dummy `image_size` to calculate the multimodal profiling data:

-    ??? Code
+    ??? code

         ```python
         # NOTE: In actuality, this is usually implemented as part of the

@@ -314,7 +311,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in

     Looking at the code of HF's `FuyuForCausalLM`:

-    ??? Code
+    ??? code

         ```python
         # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/modeling_fuyu.py#L311-L322

@@ -344,7 +341,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
     In `FuyuImageProcessor.preprocess`, the images are resized and padded to the target `FuyuImageProcessor.size`,
     returning the dimensions after resizing (but before padding) as metadata.

-    ??? Code
+    ??? code

         ```python
         # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L541-L544

@@ -382,7 +379,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in

     In `FuyuImageProcessor.preprocess_with_tokenizer_info`, the images are split into patches based on this metadata:

-    ??? Code
+    ??? code

         ```python
         # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L425

@@ -420,7 +417,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in

     The number of patches is in turn defined by `FuyuImageProcessor.get_num_patches`:

-    ??? Code
+    ??? code

         ```python
         # https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/image_processing_fuyu.py#L552-L562

@@ -457,7 +454,7 @@ Assuming that the memory usage increases with the number of tokens, the dummy in
|
|||||||
|
|
||||||
For the multimodal image profiling data, the logic is very similar to LLaVA:
|
For the multimodal image profiling data, the logic is very similar to LLaVA:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def get_dummy_mm_data(
|
def get_dummy_mm_data(
|
||||||
@ -483,7 +480,7 @@ Afterwards, create a subclass of [BaseMultiModalProcessor][vllm.multimodal.proce
|
|||||||
to fill in the missing details about HF processing.
|
to fill in the missing details about HF processing.
|
||||||
|
|
||||||
!!! info
|
!!! info
|
||||||
[Multi-Modal Data Processing][mm-processing]
|
[Multi-Modal Data Processing](../../design/mm_processing.md)
|
||||||
|
|
||||||
### Multi-modal fields
|
### Multi-modal fields
|
||||||
|
|
||||||
@ -546,7 +543,7 @@ return a schema of the tensors outputted by the HF processor that are related to
|
|||||||
In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA,
|
In order to support the use of [MultiModalFieldConfig.batched][] like in LLaVA,
|
||||||
we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]:
|
we remove the extra batch dimension by overriding [BaseMultiModalProcessor._call_hf_processor][]:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def _call_hf_processor(
|
def _call_hf_processor(
|
||||||
@ -623,7 +620,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
|||||||
It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
|
It simply repeats each input `image_token` a number of times equal to the number of placeholder feature tokens (`num_image_tokens`).
|
||||||
Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows:
|
Based on this, we override [_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] as follows:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def _get_prompt_updates(
|
def _get_prompt_updates(
|
||||||
@ -668,7 +665,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
|||||||
|
|
||||||
We define a helper function to return `ncols` and `nrows` directly:
|
We define a helper function to return `ncols` and `nrows` directly:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def get_image_feature_grid_size(
|
def get_image_feature_grid_size(
|
||||||
@ -698,7 +695,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
|||||||
|
|
||||||
Based on this, we can initially define our replacement tokens as:
|
Based on this, we can initially define our replacement tokens as:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def get_replacement(item_idx: int):
|
def get_replacement(item_idx: int):
|
||||||
@ -718,7 +715,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
|||||||
However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
|
However, this is not entirely correct. After `FuyuImageProcessor.preprocess_with_tokenizer_info` is called,
|
||||||
a BOS token (`<s>`) is also added to the promopt:
|
a BOS token (`<s>`) is also added to the promopt:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
|
# https://github.com/huggingface/transformers/blob/v4.48.3/src/transformers/models/fuyu/processing_fuyu.py#L417-L435
|
||||||
@ -745,7 +742,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
|||||||
To assign the vision embeddings to only the image tokens, instead of a string
|
To assign the vision embeddings to only the image tokens, instead of a string
|
||||||
you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]:
|
you can return an instance of [PromptUpdateDetails][vllm.multimodal.processing.PromptUpdateDetails]:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
hf_config = self.info.get_hf_config()
|
hf_config = self.info.get_hf_config()
|
||||||
@ -772,7 +769,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
|||||||
Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
|
Finally, noticing that the HF processor removes the `|ENDOFTEXT|` token from the tokenized prompt,
|
||||||
we can search for it to conduct the replacement at the start of the string:
|
we can search for it to conduct the replacement at the start of the string:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def _get_prompt_updates(
|
def _get_prompt_updates(
|
||||||
@ -819,7 +816,7 @@ Each [PromptUpdate][vllm.multimodal.processing.PromptUpdate] instance specifies
|
|||||||
After you have defined [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] (Step 2),
|
After you have defined [BaseProcessingInfo][vllm.multimodal.processing.BaseProcessingInfo] (Step 2),
|
||||||
[BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] (Step 3),
|
[BaseDummyInputsBuilder][vllm.multimodal.profiling.BaseDummyInputsBuilder] (Step 3),
|
||||||
and [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] (Step 4),
|
and [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] (Step 4),
|
||||||
decorate the model class with {meth}`MULTIMODAL_REGISTRY.register_processor <vllm.multimodal.registry.MultiModalRegistry.register_processor>`
|
decorate the model class with [MULTIMODAL_REGISTRY.register_processor][vllm.multimodal.processing.MultiModalRegistry.register_processor]
|
||||||
to register them to the multi-modal registry:
|
to register them to the multi-modal registry:
|
||||||
|
|
||||||
```diff
|
```diff
|
||||||
@ -846,7 +843,7 @@ Examples:
|
|||||||
|
|
||||||
### Handling prompt updates unrelated to multi-modal data
|
### Handling prompt updates unrelated to multi-modal data
|
||||||
|
|
||||||
[_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] assumes that each application of prompt update corresponds to one multi-modal item. If the HF processor performs additional processing regardless of how many multi-modal items there are, you should override [_apply_hf_processor_tokens_only][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_tokens_only] so that the processed token inputs are consistent with the result of applying the HF processor on text inputs. This is because token inputs bypass the HF processor according to [our design][mm-processing].
|
[_get_prompt_updates][vllm.multimodal.processing.BaseMultiModalProcessor._get_prompt_updates] assumes that each application of prompt update corresponds to one multi-modal item. If the HF processor performs additional processing regardless of how many multi-modal items there are, you should override [_apply_hf_processor_tokens_only][vllm.multimodal.processing.BaseMultiModalProcessor._apply_hf_processor_tokens_only] so that the processed token inputs are consistent with the result of applying the HF processor on text inputs. This is because token inputs bypass the HF processor according to [our design](../../design/mm_processing.md).
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
|
|
||||||
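As a worked example of the placeholder-token calculation discussed in the hunks above (not part of the diff; the config values are assumptions for illustration, roughly matching a 336x336 CLIP vision tower), the count is fixed regardless of the input image resolution:

```python
def get_num_image_tokens_example() -> int:
    # Illustrative values, assumed for a CLIP-based LLaVA-style checkpoint.
    image_size = 336          # CLIPVisionConfig.image_size
    patch_size = 14           # CLIPVisionConfig.patch_size
    num_patches = (image_size // patch_size) ** 2   # 24 * 24 = 576 patch embeddings
    seq_len = num_patches + 1                        # +1 for the class embedding
    # With the "default" vision_feature_select_strategy the class token is dropped again.
    return seq_len - 1                               # 576 placeholder feature tokens per image

assert get_num_image_tokens_example() == 576
```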
@@ -1,10 +1,7 @@
----
-title: Registering a Model
----
-[](){ #new-model-registration }
+# Registering a Model

 vLLM relies on a model registry to determine how to run each model.
-A list of pre-registered architectures can be found [here][supported-models].
+A list of pre-registered architectures can be found [here](../../models/supported_models.md).

 If your model is not on this list, you must register it to vLLM.
 This page provides detailed instructions on how to do so.
@@ -14,16 +11,16 @@ This page provides detailed instructions on how to do so.
 To add a model directly to the vLLM library, start by forking our [GitHub repository](https://github.com/vllm-project/vllm) and then [build it from source][build-from-source].
 This gives you the ability to modify the codebase and test your model.

-After you have implemented your model (see [tutorial][new-model-basic]), put it into the <gh-dir:vllm/model_executor/models> directory.
+After you have implemented your model (see [tutorial](basic.md)), put it into the <gh-dir:vllm/model_executor/models> directory.
 Then, add your model class to `_VLLM_MODELS` in <gh-file:vllm/model_executor/models/registry.py> so that it is automatically registered upon importing vLLM.
-Finally, update our [list of supported models][supported-models] to promote your model!
+Finally, update our [list of supported models](../../models/supported_models.md) to promote your model!

 !!! important
     The list of models in each section should be maintained in alphabetical order.

 ## Out-of-tree models

-You can load an external model [using a plugin][plugin-system] without modifying the vLLM codebase.
+You can load an external model [using a plugin](../../design/plugin_system.md) without modifying the vLLM codebase.

 To register the model, use the following code:

@@ -51,4 +48,4 @@ def register():

 !!! important
     If your model is a multimodal model, ensure the model class implements the [SupportsMultiModal][vllm.model_executor.models.interfaces.SupportsMultiModal] interface.
-    Read more about that [here][supports-multimodal].
+    Read more about that [here](multimodal.md).
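For orientation, the `register()` hook referenced in the last hunk usually boils down to a small call into `vllm.ModelRegistry` like the sketch below (a minimal sketch; `YourModelForCausalLM` and the package name are placeholders, and the exact code in the doc may differ):

```python
# Hypothetical plugin entry point, exposed by your package as `your_package:register`.
def register():
    from vllm import ModelRegistry

    # Import the model lazily so that registering the plugin stays cheap.
    from your_package.modeling import YourModelForCausalLM  # placeholder names

    if "YourModelForCausalLM" not in ModelRegistry.get_supported_archs():
        ModelRegistry.register_model("YourModelForCausalLM", YourModelForCausalLM)
```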
@@ -1,7 +1,4 @@
----
-title: Unit Testing
----
-[](){ #new-model-tests }
+# Unit Testing

 This page explains how to write unit tests to verify the implementation of your model.

@@ -125,7 +125,7 @@ to manually kill the profiler and generate your `nsys-rep` report.

 You can view these profiles either as summaries in the CLI, using `nsys stats [profile-file]`, or in the GUI by installing Nsight [locally following the directions here](https://developer.nvidia.com/nsight-systems/get-started).

-??? CLI example
+??? console "CLI example"

     ```bash
     nsys stats report1.nsys-rep

@@ -1,7 +1,4 @@
----
-title: Using Docker
----
-[](){ #deployment-docker }
+# Using Docker

 [](){ #deployment-docker-pre-built-image }

@@ -32,7 +29,7 @@ podman run --gpus all \
     --model mistralai/Mistral-7B-v0.1
 ```

-You can add any other [engine-args][engine-args] you need after the image tag (`vllm/vllm-openai:latest`).
+You can add any other [engine-args](../configuration/engine_args.md) you need after the image tag (`vllm/vllm-openai:latest`).

 !!! note
     You can either use the `ipc=host` flag or `--shm-size` flag to allow the
@@ -97,7 +94,7 @@ of PyTorch Nightly and should be considered **experimental**. Using the flag `--
 flags to speed up build process. However, ensure your `max_jobs` is substantially larger than `nvcc_threads` to get the most benefits.
 Keep an eye on memory usage with parallel jobs as it can be substantial (see example below).

-??? Command
+??? console "Command"

     ```bash
     # Example of building on Nvidia GH200 server. (Memory usage: ~15GB, Build time: ~1475s / ~25 min, Image size: 6.93GB)

docs/deployment/frameworks/anyscale.md (new file, +8 lines)
@@ -0,0 +1,8 @@
+# Anyscale
+
+[](){ #deployment-anyscale }
+
+[Anyscale](https://www.anyscale.com) is a managed, multi-cloud platform developed by the creators of Ray.
+It hosts Ray clusters inside your own AWS, GCP, or Azure account, delivering the flexibility of open-source Ray
+without the operational overhead of maintaining Kubernetes control planes, configuring autoscalers, or managing observability stacks.
+When serving large language models with vLLM, Anyscale can rapidly provision [production-ready HTTPS endpoints](https://docs.anyscale.com/examples/deploy-ray-serve-llms) or [fault-tolerant batch inference jobs](https://docs.anyscale.com/examples/ray-data-llm).

@@ -1,7 +1,4 @@
----
-title: Anything LLM
----
-[](){ #deployment-anything-llm }
+# Anything LLM

 [Anything LLM](https://github.com/Mintplex-Labs/anything-llm) is a full-stack application that enables you to turn any document, resource, or piece of content into context that any LLM can use as references during chatting.

@@ -1,7 +1,4 @@
----
-title: AutoGen
----
-[](){ #deployment-autogen }
+# AutoGen

 [AutoGen](https://github.com/microsoft/autogen) is a framework for creating multi-agent AI applications that can act autonomously or work alongside humans.

@@ -30,7 +27,7 @@ python -m vllm.entrypoints.openai.api_server \

 - Call it with AutoGen:

-??? Code
+??? code

     ```python
     import asyncio

@@ -1,7 +1,4 @@
----
-title: BentoML
----
-[](){ #deployment-bentoml }
+# BentoML

 [BentoML](https://github.com/bentoml/BentoML) allows you to deploy a large language model (LLM) server with vLLM as the backend, which exposes OpenAI-compatible endpoints. You can serve the model locally or containerize it as an OCI-compliant image and deploy it on Kubernetes.

@@ -1,7 +1,4 @@
----
-title: Cerebrium
----
-[](){ #deployment-cerebrium }
+# Cerebrium

 <p align="center">
     <img src="https://i.ibb.co/hHcScTT/Screenshot-2024-06-13-at-10-14-54.png" alt="vLLM_plus_cerebrium"/>
@@ -34,7 +31,7 @@ vllm = "latest"

 Next, let us add our code to handle inference for the LLM of your choice (`mistralai/Mistral-7B-Instruct-v0.1` for this example), add the following code to your `main.py`:

-??? Code
+??? code

     ```python
     from vllm import LLM, SamplingParams
@@ -64,7 +61,7 @@ cerebrium deploy

 If successful, you should be returned a CURL command that you can call inference against. Just remember to end the url with the function name you are calling (in our case`/run`)

-??? Command
+??? console "Command"

     ```python
     curl -X POST https://api.cortex.cerebrium.ai/v4/p-xxxxxx/vllm/run \
@@ -82,7 +79,7 @@ If successful, you should be returned a CURL command that you can call inference

 You should get a response like:

-??? Response
+??? console "Response"

     ```python
     {

@@ -1,7 +1,4 @@
----
-title: Chatbox
----
-[](){ #deployment-chatbox }
+# Chatbox

 [Chatbox](https://github.com/chatboxai/chatbox) is a desktop client for LLMs, available on Windows, Mac, Linux.

@@ -1,7 +1,4 @@
----
-title: Dify
----
-[](){ #deployment-dify }
+# Dify

 [Dify](https://github.com/langgenius/dify) is an open-source LLM app development platform. Its intuitive interface combines agentic AI workflow, RAG pipeline, agent capabilities, model management, observability features, and more, allowing you to quickly move from prototype to production.

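For readers who want to see what such an inference handler (for example the Cerebrium `main.py` mentioned above) boils down to, here is a minimal offline-inference sketch with the vLLM Python API; the model name and prompt are placeholders, not values from the diff:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="mistralai/Mistral-7B-Instruct-v0.1")  # placeholder model
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=128)

# Generate a completion for a single prompt and print the text.
outputs = llm.generate(["Explain KV caching in one paragraph."], sampling_params)
for output in outputs:
    print(output.outputs[0].text)
```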
@@ -1,7 +1,4 @@
----
-title: dstack
----
-[](){ #deployment-dstack }
+# dstack

 <p align="center">
     <img src="https://i.ibb.co/71kx6hW/vllm-dstack.png" alt="vLLM_plus_dstack"/>
@@ -26,7 +23,7 @@ dstack init

 Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-7b-chat-hf` for this example), create the following `serve.dstack.yml` file for the dstack `Service`:

-??? Config
+??? code "Config"

     ```yaml
     type: service
@@ -48,7 +45,7 @@ Next, to provision a VM instance with LLM of your choice (`NousResearch/Llama-2-

 Then, run the following CLI for provisioning:

-??? Command
+??? console "Command"

     ```console
     $ dstack run . -f serve.dstack.yml
@@ -79,7 +76,7 @@ Then, run the following CLI for provisioning:

 After the provisioning, you can interact with the model by using the OpenAI SDK:

-??? Code
+??? code

     ```python
     from openai import OpenAI
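For reference, an OpenAI-SDK query against such a provisioned endpoint typically looks like the sketch below; the gateway URL and token are illustrative placeholders, not values from this diff:

```python
from openai import OpenAI

# Hypothetical endpoint and token; replace with the values printed by your provider.
client = OpenAI(base_url="https://gateway.example.com", api_key="<your-access-token>")

completion = client.chat.completions.create(
    model="NousResearch/Llama-2-7b-chat-hf",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
)
print(completion.choices[0].message.content)
```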
@@ -1,7 +1,4 @@
----
-title: Haystack
----
-[](){ #deployment-haystack }
+# Haystack

 # Haystack

@@ -27,7 +24,7 @@ vllm serve mistralai/Mistral-7B-Instruct-v0.1

 - Use the `OpenAIGenerator` and `OpenAIChatGenerator` components in Haystack to query the vLLM server.

-??? Code
+??? code

     ```python
     from haystack.components.generators.chat import OpenAIChatGenerator

@@ -1,7 +1,4 @@
----
-title: Helm
----
-[](){ #deployment-helm }
+# Helm

 A Helm chart to deploy vLLM for Kubernetes

@@ -1,7 +1,4 @@
----
-title: LiteLLM
----
-[](){ #deployment-litellm }
+# LiteLLM

 [LiteLLM](https://github.com/BerriAI/litellm) call all LLM APIs using the OpenAI format [Bedrock, Huggingface, VertexAI, TogetherAI, Azure, OpenAI, Groq etc.]

@@ -34,7 +31,7 @@ vllm serve qwen/Qwen1.5-0.5B-Chat

 - Call it with litellm:

-??? Code
+??? code

     ```python
     import litellm
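For readers unfamiliar with LiteLLM, a call against an OpenAI-compatible vLLM server usually looks like the sketch below; the `hosted_vllm/` provider prefix and the local URL are assumptions based on LiteLLM's OpenAI-compatible provider support, not taken from this diff:

```python
import litellm

# Assumes a server started with: vllm serve qwen/Qwen1.5-0.5B-Chat
response = litellm.completion(
    model="hosted_vllm/qwen/Qwen1.5-0.5B-Chat",   # assumed provider prefix for a self-hosted vLLM server
    api_base="http://localhost:8000/v1",
    messages=[{"role": "user", "content": "Hello, what is the capital of France?"}],
)
print(response.choices[0].message.content)
```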
@@ -1,7 +1,4 @@
----
-title: Lobe Chat
----
-[](){ #deployment-lobe-chat }
+# Lobe Chat

 [Lobe Chat](https://github.com/lobehub/lobe-chat) is an open-source, modern-design ChatGPT/LLMs UI/Framework.

@@ -1,7 +1,4 @@
----
-title: LWS
----
-[](){ #deployment-lws }
+# LWS

 LeaderWorkerSet (LWS) is a Kubernetes API that aims to address common deployment patterns of AI/ML inference workloads.
 A major use case is for multi-host/multi-node distributed inference.
@@ -17,7 +14,7 @@ vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kuber

 Deploy the following yaml file `lws.yaml`

-??? Yaml
+??? code "Yaml"

     ```yaml
     apiVersion: leaderworkerset.x-k8s.io/v1
@@ -177,7 +174,7 @@ curl http://localhost:8080/v1/completions \

 The output should be similar to the following

-??? Output
+??? console "Output"

     ```text
     {

@@ -1,7 +1,4 @@
----
-title: Modal
----
-[](){ #deployment-modal }
+# Modal

 vLLM can be run on cloud GPUs with [Modal](https://modal.com), a serverless computing platform designed for fast auto-scaling.

@@ -1,7 +1,4 @@
----
-title: Open WebUI
----
-[](){ #deployment-open-webui }
+# Open WebUI

 1. Install the [Docker](https://docs.docker.com/engine/install/)

@@ -1,7 +1,4 @@
----
-title: Retrieval-Augmented Generation
----
-[](){ #deployment-retrieval-augmented-generation }
+# Retrieval-Augmented Generation

 [Retrieval-augmented generation (RAG)](https://en.wikipedia.org/wiki/Retrieval-augmented_generation) is a technique that enables generative artificial intelligence (Gen AI) models to retrieve and incorporate new information. It modifies interactions with a large language model (LLM) so that the model responds to user queries with reference to a specified set of documents, using this information to supplement information from its pre-existing training data. This allows LLMs to use domain-specific and/or updated information. Use cases include providing chatbot access to internal company data or generating responses based on authoritative sources.

@@ -1,7 +1,4 @@
----
-title: SkyPilot
----
-[](){ #deployment-skypilot }
+# SkyPilot

 <p align="center">
     <img src="https://imgur.com/yxtzPEu.png" alt="vLLM"/>
@@ -24,7 +21,7 @@ sky check

 See the vLLM SkyPilot YAML for serving, [serving.yaml](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm/serve.yaml).

-??? Yaml
+??? code "Yaml"

     ```yaml
     resources:
@@ -95,7 +92,7 @@ HF_TOKEN="your-huggingface-token" \

 SkyPilot can scale up the service to multiple service replicas with built-in autoscaling, load-balancing and fault-tolerance. You can do it by adding a services section to the YAML file.

-??? Yaml
+??? code "Yaml"

     ```yaml
     service:
@@ -111,7 +108,7 @@ SkyPilot can scale up the service to multiple service replicas with built-in aut
         max_completion_tokens: 1
 ```

-??? Yaml
+??? code "Yaml"

     ```yaml
     service:
@@ -186,7 +183,7 @@ vllm 2 1 xx.yy.zz.245 18 mins ago 1x GCP([Spot]{'L4': 1}) R

 After the service is READY, you can find a single endpoint for the service and access the service with the endpoint:

-??? Commands
+??? console "Commands"

     ```bash
     ENDPOINT=$(sky serve status --endpoint 8081 vllm)
@@ -220,7 +217,7 @@ service:

 This will scale the service up to when the QPS exceeds 2 for each replica.

-??? Yaml
+??? code "Yaml"

     ```yaml
     service:
@@ -285,7 +282,7 @@ sky serve down vllm

 It is also possible to access the Llama-3 service with a separate GUI frontend, so the user requests send to the GUI will be load-balanced across replicas.

-??? Yaml
+??? code "Yaml"

     ```yaml
     envs:

@@ -1,7 +1,4 @@
----
-title: Streamlit
----
-[](){ #deployment-streamlit }
+# Streamlit

 [Streamlit](https://github.com/streamlit/streamlit) lets you transform Python scripts into interactive web apps in minutes, instead of weeks. Build dashboards, generate reports, or create chat apps.

@@ -1,6 +1,3 @@
----
-title: NVIDIA Triton
----
-[](){ #deployment-triton }
+# NVIDIA Triton

 The [Triton Inference Server](https://github.com/triton-inference-server) hosts a tutorial demonstrating how to quickly deploy a simple [facebook/opt-125m](https://huggingface.co/facebook/opt-125m) model using vLLM. Please see [Deploying a vLLM model in Triton](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#deploying-a-vllm-model-in-triton) for more details.

@@ -1,7 +1,4 @@
----
-title: KServe
----
-[](){ #deployment-kserve }
+# KServe

 vLLM can be deployed with [KServe](https://github.com/kserve/kserve) on Kubernetes for highly scalable distributed model serving.

@@ -1,7 +1,4 @@
----
-title: KubeAI
----
-[](){ #deployment-kubeai }
+# KubeAI

 [KubeAI](https://github.com/substratusai/kubeai) is a Kubernetes operator that enables you to deploy and manage AI models on Kubernetes. It provides a simple and scalable way to deploy vLLM in production. Functionality such as scale-from-zero, load based autoscaling, model caching, and much more is provided out of the box with zero external dependencies.

@@ -1,7 +1,4 @@
----
-title: Llama Stack
----
-[](){ #deployment-llamastack }
+# Llama Stack

 vLLM is also available via [Llama Stack](https://github.com/meta-llama/llama-stack) .

@@ -1,7 +1,4 @@
----
-title: llmaz
----
-[](){ #deployment-llmaz }
+# llmaz

 [llmaz](https://github.com/InftyAI/llmaz) is an easy-to-use and advanced inference platform for large language models on Kubernetes, aimed for production use. It uses vLLM as the default model serving backend.

@@ -1,7 +1,4 @@
----
-title: Production stack
----
-[](){ #deployment-production-stack }
+# Production stack

 Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using the [vLLM production stack](https://github.com/vllm-project/production-stack). Born out of a Berkeley-UChicago collaboration, [vLLM production stack](https://github.com/vllm-project/production-stack) is an officially released, production-optimized codebase under the [vLLM project](https://github.com/vllm-project), designed for LLM deployment with:

@@ -44,7 +41,8 @@ vllm-deployment-router-859d8fb668-2x2b7 1/1 Running 0 2m38
 vllm-opt125m-deployment-vllm-84dfc9bd7-vb9bs 1/1 Running 0 2m38s
 ```

-**NOTE**: It may take some time for the containers to download the Docker images and LLM weights.
+!!! note
+    It may take some time for the containers to download the Docker images and LLM weights.

 ### Send a Query to the Stack

@@ -60,7 +58,7 @@ And then you can send out a query to the OpenAI-compatible API to check the avai
 curl -o- http://localhost:30080/models
 ```

-??? Output
+??? console "Output"

     ```json
     {
@@ -89,7 +87,7 @@ curl -X POST http://localhost:30080/completions \
 }'
 ```

-??? Output
+??? console "Output"

     ```json
     {
@@ -121,7 +119,7 @@ sudo helm uninstall vllm

 The core vLLM production stack configuration is managed with YAML. Here is the example configuration used in the installation above:

-??? Yaml
+??? code "Yaml"

     ```yaml
     servingEngineSpec:
@@ -152,6 +150,8 @@ In this YAML configuration:
 * **`requestGPU`**: Specifies the number of GPUs required.
 * **`pvcStorage`**: Allocates persistent storage for the model.

-**NOTE:** If you intend to set up two pods, please refer to this [YAML file](https://github.com/vllm-project/production-stack/blob/main/tutorials/assets/values-01-2pods-minimal-example.yaml).
+!!! note
+    If you intend to set up two pods, please refer to this [YAML file](https://github.com/vllm-project/production-stack/blob/main/tutorials/assets/values-01-2pods-minimal-example.yaml).

-**NOTE:** vLLM production stack offers many more features (*e.g.* CPU offloading and a wide range of routing algorithms). Please check out these [examples and tutorials](https://github.com/vllm-project/production-stack/tree/main/tutorials) and our [repo](https://github.com/vllm-project/production-stack) for more details!
+!!! tip
+    vLLM production stack offers many more features (*e.g.* CPU offloading and a wide range of routing algorithms). Please check out these [examples and tutorials](https://github.com/vllm-project/production-stack/tree/main/tutorials) and our [repo](https://github.com/vllm-project/production-stack) for more details!

@@ -1,7 +1,4 @@
----
-title: Using Kubernetes
----
-[](){ #deployment-k8s }
+# Using Kubernetes

 Deploying vLLM on Kubernetes is a scalable and efficient way to serve machine learning models. This guide walks you through deploying vLLM using native Kubernetes.

@@ -29,7 +26,7 @@ Alternatively, you can deploy vLLM to Kubernetes using any of the following:

 First, create a Kubernetes PVC and Secret for downloading and storing Hugging Face model:

-??? Config
+??? console "Config"

     ```bash
     cat <<EOF |kubectl apply -f -
@@ -57,7 +54,7 @@ First, create a Kubernetes PVC and Secret for downloading and storing Hugging Fa

 Next, start the vLLM server as a Kubernetes Deployment and Service:

-??? Config
+??? console "Config"

     ```bash
     cat <<EOF |kubectl apply -f -

@@ -1,7 +1,4 @@
----
-title: Using Nginx
----
-[](){ #nginxloadbalancer }
+# Using Nginx

 This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.

@@ -36,7 +33,7 @@ docker build . -f Dockerfile.nginx --tag nginx-lb

 Create a file named `nginx_conf/nginx.conf`. Note that you can add as many servers as you'd like. In the below example we'll start with two. To add more, add another `server vllmN:8000 max_fails=3 fail_timeout=10000s;` entry to `upstream backend`.

-??? Config
+??? console "Config"

     ```console
     upstream backend {
@@ -95,7 +92,7 @@ Notes:
 - The below example assumes GPU backend used. If you are using CPU backend, remove `--gpus device=ID`, add `VLLM_CPU_KVCACHE_SPACE` and `VLLM_CPU_OMP_THREADS_BIND` environment variables to the docker run command.
 - Adjust the model name that you want to use in your vLLM servers if you don't want to use `Llama-2-7b-chat-hf`.

-??? Commands
+??? console "Commands"

     ```console
     mkdir -p ~/.cache/huggingface/hub/

@@ -1,7 +1,4 @@
----
-title: Architecture Overview
----
-[](){ #arch-overview }
+# Architecture Overview

 This document provides an overview of the vLLM architecture.

@@ -22,7 +19,7 @@ server.

 Here is a sample of `LLM` class usage:

-??? Code
+??? code

     ```python
     from vllm import LLM, SamplingParams
@@ -74,7 +71,7 @@ python -m vllm.entrypoints.openai.api_server --model <model>

 That code can be found in <gh-file:vllm/entrypoints/openai/api_server.py>.

-More details on the API server can be found in the [OpenAI-Compatible Server][serving-openai-compatible-server] document.
+More details on the API server can be found in the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) document.

 ## LLM Engine

@@ -132,7 +129,7 @@ input tensors and capturing cudagraphs.
 ## Model

 Every model runner object has one model object, which is the actual
-`torch.nn.Module` instance. See [huggingface_integration][huggingface-integration] for how various
+`torch.nn.Module` instance. See [huggingface_integration](huggingface_integration.md) for how various
 configurations affect the class we ultimately get.

 ## Class Hierarchy

@@ -180,7 +177,7 @@ vision-language model.

 To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:

-??? Code
+??? code

     ```python
     class MyOldModel(nn.Module):
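For readers who have not seen the keyword-only constructor mentioned in the last hunk, the usual shim looks roughly like the sketch below; this is an illustrative sketch under the assumption that the old model took `config`, `cache_config`, and `quant_config` positionally, with field names taken from `vllm.config.VllmConfig`:

```python
import torch.nn as nn
from vllm.config import VllmConfig

class MyOldModel(nn.Module):
    # Old-style signature, as in the hunk above (stubbed here for self-containment).
    def __init__(self, config, cache_config=None, quant_config=None, **kwargs):
        super().__init__()
        self.config = config

class MyNewModel(MyOldModel):
    # New-style, keyword-only signature expected by current vLLM.
    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
        # Unpack the single config object into the arguments the old model expects.
        config = vllm_config.model_config.hf_config
        cache_config = vllm_config.cache_config
        quant_config = vllm_config.quant_config
        super().__init__(config, cache_config, quant_config)
```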
@@ -1,7 +1,4 @@
----
-title: Automatic Prefix Caching
----
-[](){ #design-automatic-prefix-caching }
+# Automatic Prefix Caching

 The core idea of [PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) is to partition the KV cache of each request into KV Blocks. Each block contains the attention keys and values for a fixed number of tokens. The PagedAttention algorithm allows these blocks to be stored in non-contiguous physical memory so that we can eliminate memory fragmentation by allocating the memory on demand.

@@ -1,7 +1,4 @@
----
-title: Integration with HuggingFace
----
-[](){ #huggingface-integration }
+# Integration with HuggingFace

 This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run `vllm serve`.

@@ -1,7 +1,4 @@
----
-title: vLLM Paged Attention
----
-[](){ #design-paged-attention }
+# vLLM Paged Attention

 Currently, vLLM utilizes its own implementation of a multi-head query
 attention kernel (`csrc/attention/attention_kernels.cu`).
@@ -448,7 +445,7 @@ elements of the entire head for all context tokens. However, overall,
 all results for output have been calculated but are just stored in
 different thread register memory.

-??? Code
+??? code

     ```cpp
     float* out_smem = reinterpret_cast<float*>(shared_mem);

@@ -1,9 +1,6 @@
----
-title: Multi-Modal Data Processing
----
-[](){ #mm-processing }
+# Multi-Modal Data Processing

-To enable various optimizations in vLLM such as [chunked prefill][chunked-prefill] and [prefix caching][automatic-prefix-caching], we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.
+To enable various optimizations in vLLM such as [chunked prefill][chunked-prefill] and [prefix caching](../features/automatic_prefix_caching.md), we use [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor] to provide the correspondence between placeholder feature tokens (e.g. `<image>`) and multi-modal inputs (e.g. the raw input image) based on the outputs of HF processor.

 Here are the main features of [BaseMultiModalProcessor][vllm.multimodal.processing.BaseMultiModalProcessor]:

@@ -1,19 +1,16 @@
----
-title: vLLM's Plugin System
----
-[](){ #plugin-system }
+# vLLM's Plugin System

 The community frequently requests the ability to extend vLLM with custom features. To facilitate this, vLLM includes a plugin system that allows users to add custom features without modifying the vLLM codebase. This document explains how plugins work in vLLM and how to create a plugin for vLLM.

 ## How Plugins Work in vLLM

-Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [Arch Overview][arch-overview]), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work.
+Plugins are user-registered code that vLLM executes. Given vLLM's architecture (see [Arch Overview](arch_overview.md)), multiple processes may be involved, especially when using distributed inference with various parallelism techniques. To enable plugins successfully, every process created by vLLM needs to load the plugin. This is done by the [load_general_plugins](https://github.com/vllm-project/vllm/blob/c76ac49d266e27aa3fea84ef2df1f813d24c91c7/vllm/plugins/__init__.py#L16) function in the `vllm.plugins` module. This function is called for every process created by vLLM before it starts any work.

 ## How vLLM Discovers Plugins

 vLLM's plugin system uses the standard Python `entry_points` mechanism. This mechanism allows developers to register functions in their Python packages for use by other packages. An example of a plugin:

-??? Code
+??? code

     ```python
     # inside `setup.py` file
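For context, the `entry_points` registration referenced in that hunk typically looks like the following sketch; the package and function names are placeholders, and the `vllm.general_plugins` group name is assumed here to be the entry-point group that vLLM scans for general plugins:

```python
# inside `setup.py` of a hypothetical plugin package
from setuptools import setup

setup(
    name="vllm-add-dummy-model",          # placeholder package name
    version="0.1",
    packages=["vllm_add_dummy_model"],
    entry_points={
        # vLLM loads every function registered under this group in each process it spawns.
        "vllm.general_plugins": [
            "register_dummy_model = vllm_add_dummy_model:register",
        ],
    },
)
```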
@@ -61,7 +61,7 @@ To address the above issues, I have designed and developed a local Tensor memory

# Install vLLM

??? console "Commands"

    ```shell
    # Enter the home directory or your working directory.

@@ -106,7 +106,7 @@ python3 disagg_prefill_proxy_xpyd.py &

### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)

??? console "Command"

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \

@@ -128,7 +128,7 @@ python3 disagg_prefill_proxy_xpyd.py &

### Decode1 (e.g. 10.0.1.3 or 10.0.1.1)

??? console "Command"

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \

@@ -150,7 +150,7 @@ python3 disagg_prefill_proxy_xpyd.py &

### Decode2 (e.g. 10.0.1.4 or 10.0.1.1)

??? console "Command"

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \

@@ -172,7 +172,7 @@ python3 disagg_prefill_proxy_xpyd.py &

### Decode3 (e.g. 10.0.1.5 or 10.0.1.1)

??? console "Command"

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \

@@ -203,7 +203,7 @@ python3 disagg_prefill_proxy_xpyd.py &

### Prefill1 (e.g. 10.0.1.2 or 10.0.1.1)

??? console "Command"

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=0 vllm serve {your model directory} \

@@ -225,7 +225,7 @@ python3 disagg_prefill_proxy_xpyd.py &

### Prefill2 (e.g. 10.0.1.3 or 10.0.1.1)

??? console "Command"

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=1 vllm serve {your model directory} \

@@ -247,7 +247,7 @@ python3 disagg_prefill_proxy_xpyd.py &

### Prefill3 (e.g. 10.0.1.4 or 10.0.1.1)

??? console "Command"

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=2 vllm serve {your model directory} \

@@ -269,7 +269,7 @@ python3 disagg_prefill_proxy_xpyd.py &

### Decode1 (e.g. 10.0.1.5 or 10.0.1.1)

??? console "Command"

    ```shell
    VLLM_USE_V1=1 CUDA_VISIBLE_DEVICES=3 vllm serve {your model directory} \

@@ -304,7 +304,7 @@ curl -X POST -s http://10.0.1.1:10001/v1/completions \

# Benchmark

??? console "Command"

    ```shell
    python3 benchmark_serving.py \
@@ -28,7 +28,7 @@ A unique aspect of vLLM's `torch.compile` integration, is that we guarantee all

In the very verbose logs, we can see:

??? console "Logs"

    ```text
    DEBUG 03-07 03:06:52 [decorators.py:203] Start compiling function <code object forward at 0x7f08acf40c90, file "xxx/vllm/model_executor/models/llama.py", line 339>

@@ -110,7 +110,7 @@ Then it will also compile a specific kernel just for batch size `1, 2, 4, 8`. At

When all the shapes are known, `torch.compile` can compare different configs, and often find some better configs to run the kernel. For example, we can see the following log:

??? console "Logs"

    ```
    AUTOTUNE mm(8x2048, 2048x3072)
@@ -1,14 +1,11 @@
# Automatic Prefix Caching

## Introduction

Automatic Prefix Caching (APC in short) caches the KV cache of existing queries, so that a new query can directly reuse the KV cache if it shares the same prefix with one of the existing queries, allowing the new query to skip the computation of the shared part.

!!! note
    Technical details on how vLLM implements APC can be found [here](../design/automatic_prefix_caching.md).

## Enabling APC in vLLM
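
The enabling example itself lies outside this hunk; as a minimal sketch under the assumption that the `enable_prefix_caching` engine argument is the switch (the model name is a placeholder, and recent vLLM versions may already enable APC by default):

```python
from vllm import LLM, SamplingParams

# Turn on APC so repeated prompt prefixes reuse cached KV blocks.
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enable_prefix_caching=True)

shared_prefix = "A long document that many queries share. " * 50
outputs = llm.generate(
    [shared_prefix + "Question: summarize the text above."],
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)
```
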
@@ -1,7 +1,4 @@
# Compatibility Matrix

The tables below show mutually exclusive features and the support on some hardware.

@@ -37,13 +34,13 @@ th:not(:first-child) {
}
</style>

| Feature | [CP][chunked-prefill] | [APC](automatic_prefix_caching.md) | [LoRA](lora.md) | <abbr title="Prompt Adapter">prmpt adptr</abbr> | [SD](spec_decode.md) | CUDA graph | <abbr title="Pooling Models">pooling</abbr> | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
| [APC](automatic_prefix_caching.md) | ✅ | ✅ | | | | | | | | | | | | | |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | | | | | | | | | | | | |
| <abbr title="Prompt Adapter">prmpt adptr</abbr> | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | |
| [SD](spec_decode.md) | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
| <abbr title="Pooling Models">pooling</abbr> | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [❌](gh-issue:7366) | ❌ | ❌ | [❌](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |

@@ -62,10 +59,10 @@ th:not(:first-child) {
| Feature | Volta | Turing | Ampere | Ada | Hopper | CPU | AMD | TPU |
|-----------------------------------------------------------|---------------------|-----------|-----------|--------|------------|--------------------|--------|-----|
| [CP][chunked-prefill] | [❌](gh-issue:2729) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [APC](automatic_prefix_caching.md) | [❌](gh-issue:3687) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| [LoRA](lora.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| <abbr title="Prompt Adapter">prmpt adptr</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | [❌](gh-issue:8475) | ✅ | ❌ |
| [SD](spec_decode.md) | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ |
| <abbr title="Pooling Models">pooling</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ❌ |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ |
@@ -1,7 +1,4 @@
# Disaggregated Prefilling (experimental)

This page introduces you to the disaggregated prefilling feature in vLLM.
@@ -1,7 +1,4 @@
# LoRA Adapters

This document shows you how to use [LoRA adapters](https://arxiv.org/abs/2106.09685) with vLLM on top of a base model.
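
The hunks below only show fragments of the page's example; for orientation, here is a condensed sketch of the offline flow it describes (model, adapter name, and path are illustrative, and `LoRARequest` is assumed to come from `vllm.lora.request`):

```python
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Base model with LoRA support enabled.
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_lora=True)

# LoRARequest(human-readable name, globally unique adapter ID, path to the adapter).
lora_request = LoRARequest("sql_adapter", 1, "/path/to/sql-lora-adapter")

outputs = llm.generate(
    ["Write a SQL query that counts users per country."],
    SamplingParams(max_tokens=64),
    lora_request=lora_request,
)
print(outputs[0].outputs[0].text)
```
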
@@ -29,7 +26,7 @@ We can now submit the prompts and call `llm.generate` with the `lora_request` pa
of `LoRARequest` is a human identifiable name, the second parameter is a globally unique ID for the adapter and
the third parameter is the path to the LoRA adapter.

??? code

    ```python
    sampling_params = SamplingParams(
|
|||||||
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
|
etc.), which will apply to all forthcoming requests. Upon querying the `/models` endpoint, we should see our LoRA along
|
||||||
with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it.):
|
with its base model (if `jq` is not installed, you can follow [this guide](https://jqlang.org/download/) to install it.):
|
||||||
|
|
||||||
??? Command
|
??? console "Command"
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl localhost:8000/v1/models | jq .
|
curl localhost:8000/v1/models | jq .
|
||||||
@ -172,7 +169,7 @@ Alternatively, follow these example steps to implement your own plugin:
|
|||||||
|
|
||||||
1. Implement the LoRAResolver interface.
|
1. Implement the LoRAResolver interface.
|
||||||
|
|
||||||
??? Example of a simple S3 LoRAResolver implementation
|
??? code "Example of a simple S3 LoRAResolver implementation"
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import os
|
import os
|
||||||
@ -238,7 +235,7 @@ The new format of `--lora-modules` is mainly to support the display of parent mo
|
|||||||
- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
|
- The `parent` field of LoRA model `sql-lora` now links to its base model `meta-llama/Llama-2-7b-hf`. This correctly reflects the hierarchical relationship between the base model and the LoRA adapter.
|
||||||
- The `root` field points to the artifact location of the lora adapter.
|
- The `root` field points to the artifact location of the lora adapter.
|
||||||
|
|
||||||
??? Command output
|
??? console "Command output"
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
$ curl http://localhost:8000/v1/models
|
$ curl http://localhost:8000/v1/models
|
||||||
|
@@ -1,7 +1,4 @@
# Multimodal Inputs

This page teaches you how to pass multi-modal inputs to [multi-modal models][supported-mm-models] in vLLM.

@@ -20,7 +17,7 @@ To input multi-modal data, follow this schema in [vllm.inputs.PromptType][]:

You can pass a single image to the `'image'` field of the multi-modal dictionary, as shown in the following examples:

??? code

    ```python
    from vllm import LLM
|
|||||||
|
|
||||||
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
|
To substitute multiple images inside the same text prompt, you can pass in a list of images instead:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
@ -146,7 +143,7 @@ for o in outputs:
|
|||||||
|
|
||||||
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
|
Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
@ -193,7 +190,7 @@ Full example: <gh-file:examples/offline_inference/audio_language.py>
|
|||||||
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
To input pre-computed embeddings belonging to a data type (i.e. image, video, or audio) directly to the language model,
|
||||||
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the corresponding field of the multi-modal dictionary.
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from vllm import LLM
|
from vllm import LLM
|
||||||
@ -220,7 +217,7 @@ pass a tensor of shape `(num_items, feature_size, hidden_size of LM)` to the cor
|
|||||||
|
|
||||||
For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
|
For Qwen2-VL and MiniCPM-V, we accept additional parameters alongside the embeddings:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Construct the prompt based on your model
|
# Construct the prompt based on your model
|
||||||
@ -288,7 +285,7 @@ vllm serve microsoft/Phi-3.5-vision-instruct --task generate \
|
|||||||
|
|
||||||
Then, you can use the OpenAI client as follows:
|
Then, you can use the OpenAI client as follows:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
@ -366,7 +363,7 @@ vllm serve llava-hf/llava-onevision-qwen2-0.5b-ov-hf --task generate --max-model
|
|||||||
|
|
||||||
Then, you can use the OpenAI client as follows:
|
Then, you can use the OpenAI client as follows:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
@ -430,7 +427,7 @@ vllm serve fixie-ai/ultravox-v0_5-llama-3_2-1b
|
|||||||
|
|
||||||
Then, you can use the OpenAI client as follows:
|
Then, you can use the OpenAI client as follows:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import base64
|
import base64
|
||||||
@ -486,7 +483,7 @@ Then, you can use the OpenAI client as follows:
|
|||||||
|
|
||||||
Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
|
Alternatively, you can pass `audio_url`, which is the audio counterpart of `image_url` for image input:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
chat_completion_from_url = client.chat.completions.create(
|
chat_completion_from_url = client.chat.completions.create(
|
||||||
@ -531,7 +528,7 @@ pass a tensor of shape to the corresponding field of the multi-modal dictionary.
|
|||||||
For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
|
For image embeddings, you can pass the base64-encoded tensor to the `image_embeds` field.
|
||||||
The following example demonstrates how to pass image embeddings to the OpenAI server:
|
The following example demonstrates how to pass image embeddings to the OpenAI server:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
image_embedding = torch.load(...)
|
image_embedding = torch.load(...)
|
||||||
|
@@ -1,7 +1,4 @@
# Quantization

Quantization trades off model precision for smaller memory footprint, allowing large models to be run on a wider range of devices.

@@ -1,7 +1,4 @@
# AutoAWQ

To create a new 4-bit quantized model, you can leverage [AutoAWQ](https://github.com/casper-hansen/AutoAWQ).
Quantization reduces the model's precision from BF16/FP16 to INT4 which effectively reduces the total model memory footprint.
|
|||||||
|
|
||||||
After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
|
After installing AutoAWQ, you are ready to quantize a model. Please refer to the [AutoAWQ documentation](https://casper-hansen.github.io/AutoAWQ/examples/#basic-quantization) for further details. Here is an example of how to quantize `mistralai/Mistral-7B-Instruct-v0.2`:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from awq import AutoAWQForCausalLM
|
from awq import AutoAWQForCausalLM
|
||||||
@ -51,7 +48,7 @@ python examples/offline_inference/llm_engine_example.py \
|
|||||||
|
|
||||||
AWQ models are also supported directly through the LLM entrypoint:
|
AWQ models are also supported directly through the LLM entrypoint:
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
@@ -1,7 +1,4 @@
# BitBLAS

vLLM now supports [BitBLAS](https://github.com/microsoft/BitBLAS) for more efficient and flexible model inference. Compared to other quantization frameworks, BitBLAS provides more precision combinations.

@@ -43,7 +40,7 @@ llm = LLM(

## Read gptq format checkpoint

??? code

    ```python
    from vllm import LLM
@@ -1,7 +1,4 @@
# BitsAndBytes

vLLM now supports [BitsAndBytes](https://github.com/TimDettmers/bitsandbytes) for more efficient model inference.
BitsAndBytes quantizes models to reduce memory usage and enhance performance without significantly sacrificing accuracy.
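
The rest of this page is not shown in the diff; as a rough sketch of the usage it describes (the model name is only an example, and the exact flags have shifted a bit across vLLM versions), a pre-quantized bitsandbytes checkpoint can be loaded through the quantization option:

```python
from vllm import LLM

# Load a 4-bit bitsandbytes checkpoint; weights stay quantized in memory,
# cutting the footprint substantially compared to FP16.
llm = LLM(
    model="unsloth/tinyllama-bnb-4bit",   # example pre-quantized model
    quantization="bitsandbytes",
)
print(llm.generate("The capital of France is")[0].outputs[0].text)
```
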
@@ -1,7 +1,4 @@
# FP8 W8A8

vLLM supports FP8 (8-bit floating point) weight and activation quantization using hardware acceleration on GPUs such as Nvidia H100 and AMD MI300x.
Currently, only Hopper and Ada Lovelace GPUs are officially supported for W8A8.
|
|||||||
|
|
||||||
Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
|
Since simple RTN does not require data for weight quantization and the activations are quantized dynamically, we do not need any calibration data for this quantization flow.
|
||||||
|
|
||||||
??? Code
|
??? code
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from llmcompressor.transformers import oneshot
|
from llmcompressor.transformers import oneshot
|
||||||
|
@@ -1,7 +1,4 @@
# GGUF

!!! warning
    Please note that GGUF support in vLLM is highly experimental and under-optimized at the moment, and it might be incompatible with other features. Currently, you can use GGUF as a way to reduce memory footprint. If you encounter any issues, please report them to the vLLM team.

@@ -41,7 +38,7 @@ vllm serve ./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf \

You can also use the GGUF model directly through the LLM entrypoint:

??? code

    ```python
    from vllm import LLM, SamplingParams
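    # (Truncated in this hunk. A minimal sketch of how the example plausibly
    # continues, reusing the GGUF file from the `vllm serve` command above; pointing
    # `tokenizer` at the original Hugging Face model is an assumption, not literal
    # doc code.)
    llm = LLM(
        model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
        tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    )
    sampling_params = SamplingParams(temperature=0.8, max_tokens=64)
    outputs = llm.generate(["Tell me a short joke."], sampling_params)
    print(outputs[0].outputs[0].text)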
@@ -1,7 +1,4 @@
# GPTQModel

To create a new 4-bit or 8-bit GPTQ quantized model, you can leverage [GPTQModel](https://github.com/ModelCloud/GPTQModel) from ModelCloud.AI.

@@ -31,7 +28,7 @@ After installing GPTQModel, you are ready to quantize a model. Please refer to t

Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:

??? code

    ```python
    from datasets import load_dataset

@@ -69,7 +66,7 @@ python examples/offline_inference/llm_engine_example.py \

GPTQModel quantized models are also supported directly through the LLM entrypoint:

??? code

    ```python
    from vllm import LLM, SamplingParams
@@ -1,7 +1,4 @@
# INT4 W4A16

vLLM supports quantizing weights to INT4 for memory savings and inference acceleration. This quantization method is particularly useful for reducing model size and maintaining low latency in workloads with low queries per second (QPS).

@@ -53,7 +50,7 @@ When quantizing weights to INT4, you need sample data to estimate the weight upd
It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:

??? code

    ```python
    from datasets import load_dataset

@@ -78,7 +75,7 @@ For a general-purpose instruction-tuned model, you can use a dataset like `ultra

Now, apply the quantization algorithms:

??? code

    ```python
    from llmcompressor.transformers import oneshot

@@ -141,7 +138,7 @@ lm_eval --model vllm \

The following is an example of an expanded quantization recipe you can tune to your own use case:

??? code

    ```python
    from compressed_tensors.quantization import (
@@ -1,7 +1,4 @@
# INT8 W8A8

vLLM supports quantizing weights and activations to INT8 for memory savings and inference acceleration.
This quantization method is particularly useful for reducing model size while maintaining good performance.

@@ -54,7 +51,7 @@ When quantizing activations to INT8, you need sample data to estimate the activa
It's best to use calibration data that closely matches your deployment data.
For a general-purpose instruction-tuned model, you can use a dataset like `ultrachat`:

??? code

    ```python
    from datasets import load_dataset

@@ -81,7 +78,7 @@ For a general-purpose instruction-tuned model, you can use a dataset like `ultra

Now, apply the quantization algorithms:

??? code

    ```python
    from llmcompressor.transformers import oneshot
@@ -14,7 +14,7 @@ You can quantize HuggingFace models using the example scripts provided in the Te

Below is an example showing how to quantize a model using modelopt's PTQ API:

??? code

    ```python
    import modelopt.torch.quantization as mtq

@@ -50,7 +50,7 @@ with torch.inference_mode():

The quantized checkpoint can then be deployed with vLLM. As an example, the following code shows how to deploy `nvidia/Llama-3.1-8B-Instruct-FP8`, which is the FP8 quantized checkpoint derived from `meta-llama/Llama-3.1-8B-Instruct`, using vLLM:

??? code

    ```python
    from vllm import LLM, SamplingParams
@@ -1,7 +1,4 @@
# Quantized KV Cache

## FP8 KV Cache

@@ -35,7 +32,7 @@ Studies have shown that FP8 E4M3 quantization typically only minimally degrades

Here is an example of how to enable FP8 quantization:

??? code

    ```python
    # To calculate kv cache scales on the fly enable the calculate_kv_scales
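    # (The example is truncated by this hunk. A minimal sketch of the rest, based on
    # the calculate_kv_scales parameter named above; the model name is illustrative
    # and exact keywords may vary across vLLM versions.)
    from vllm import LLM, SamplingParams

    llm = LLM(
        model="meta-llama/Llama-3.1-8B-Instruct",
        kv_cache_dtype="fp8",          # store the KV cache in FP8
        calculate_kv_scales=True,      # compute KV scales on the fly
    )
    sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
    out = llm.generate("San Francisco is a", sampling_params)
    print(out[0].outputs[0].text)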
@@ -73,7 +70,7 @@ pip install llmcompressor

Here's a complete example using `meta-llama/Llama-3.1-8B-Instruct` (most models can use this same pattern):

??? code

    ```python
    from datasets import load_dataset
@@ -1,7 +1,4 @@
# AMD Quark

Quantization can effectively reduce memory and bandwidth usage, accelerate computation and improve
throughput with minimal accuracy loss. vLLM can leverage [Quark](https://quark.docs.amd.com/latest/),

@@ -42,7 +39,7 @@ The Quark quantization process can be listed for 5 steps as below:
Quark uses [Transformers](https://huggingface.co/docs/transformers/en/index)
to fetch model and tokenizer.

??? code

    ```python
    from transformers import AutoTokenizer, AutoModelForCausalLM

@@ -65,7 +62,7 @@ Quark uses the [PyTorch Dataloader](https://pytorch.org/tutorials/beginner/basic
to load calibration data. For more details about how to use calibration datasets efficiently, please refer
to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calibration_datasets.html).

??? code

    ```python
    from datasets import load_dataset

@@ -98,7 +95,7 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
AutoSmoothQuant config file for Llama is
`examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json`.

??? code

    ```python
    from quark.torch.quantization import (Config, QuantizationConfig,

@@ -145,7 +142,7 @@ HuggingFace `safetensors`, you can refer to
[HuggingFace format exporting](https://quark.docs.amd.com/latest/pytorch/export/quark_export_hf.html)
for more exporting format details.

??? code

    ```python
    import torch

@@ -176,7 +173,7 @@ for more exporting format details.

Now, you can load and run the Quark quantized model directly through the LLM entrypoint:

??? code

    ```python
    from vllm import LLM, SamplingParams
@@ -1,7 +1,4 @@
# Supported Hardware

The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
@@ -15,7 +15,7 @@ pip install \
## Quantizing HuggingFace Models
You can quantize your own huggingface model with torchao, e.g. [transformers](https://huggingface.co/docs/transformers/main/en/quantization/torchao) and [diffusers](https://huggingface.co/docs/diffusers/en/quantization/torchao), and save the checkpoint to huggingface hub like [this](https://huggingface.co/jerryzh168/llama3-8b-int8wo) with the following example code:

??? code

    ```Python
    import torch
@@ -1,7 +1,4 @@
# Reasoning Outputs

vLLM offers support for reasoning models like [DeepSeek R1](https://huggingface.co/deepseek-ai/DeepSeek-R1), which are designed to generate outputs containing both reasoning steps and final conclusions.
@@ -33,7 +30,7 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \

Next, make a request to the model that should return the reasoning content in the response.

??? code

    ```python
    from openai import OpenAI

@@ -70,7 +67,7 @@ The `reasoning_content` field contains the reasoning steps that led to the final

Streaming chat completions are also supported for reasoning models. The `reasoning_content` field is available in the `delta` field in [chat completion response chunks](https://platform.openai.com/docs/api-reference/chat/streaming).

??? console "Json"

    ```json
    {

@@ -95,7 +92,7 @@ Streaming chat completions are also supported for reasoning models. The `reasoni

The OpenAI Python client library does not officially support the `reasoning_content` attribute for streaming output, but the client does accept extra attributes in the response. You can use `hasattr` to check whether the `reasoning_content` attribute is present in the response. For example:

??? code

    ```python
    from openai import OpenAI

@@ -152,7 +149,7 @@ Remember to check whether the `reasoning_content` exists in the response before

The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.

??? code

    ```python
    from openai import OpenAI

@@ -200,7 +197,7 @@ For more examples, please refer to <gh-file:examples/online_serving/openai_chat_

You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.

??? code

    ```python
    # import the required packages

@@ -258,7 +255,7 @@ You can add a new `ReasoningParser` similar to <gh-file:vllm/reasoning/deepseek_

Additionally, to enable structured output, you'll need to create a new `Reasoner` similar to the one in <gh-file:vllm/reasoning/deepseek_r1_reasoning_parser.py>.

??? code

    ```python
    @dataclass
@@ -1,7 +1,4 @@
# Speculative Decoding

!!! warning
    Please note that speculative decoding in vLLM is not yet optimized and does
@@ -18,7 +15,7 @@ Speculative decoding is a technique which improves inter-token latency in memory

The following code configures vLLM in an offline mode to use speculative decoding with a draft model, speculating 5 tokens at a time.

??? code

    ```python
    from vllm import LLM, SamplingParams
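    # (Hunk truncated here. A rough sketch of the configuration the paragraph above
    # describes; the model pair is only an example and the exact speculative_config
    # keys may differ between vLLM versions.)
    prompts = ["The future of AI is"]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    llm = LLM(
        model="facebook/opt-6.7b",
        speculative_config={
            "model": "facebook/opt-125m",     # small draft model
            "num_speculative_tokens": 5,      # speculate 5 tokens at a time
        },
    )
    outputs = llm.generate(prompts, sampling_params)
    print(outputs[0].outputs[0].text)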
@@ -62,7 +59,7 @@ python -m vllm.entrypoints.openai.api_server \

Then use a client:

??? code

    ```python
    from openai import OpenAI
@@ -103,7 +100,7 @@ Then use a client:
The following code configures vLLM to use speculative decoding where proposals are generated by
matching n-grams in the prompt. For more information read [this thread](https://x.com/joao_gante/status/1747322413006643259).

??? code

    ```python
    from vllm import LLM, SamplingParams
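    # (Hunk truncated here. A rough sketch of an n-gram configuration, under the
    # assumption that these speculative_config keys match the current interface;
    # values are illustrative.)
    llm = LLM(
        model="facebook/opt-6.7b",
        speculative_config={
            "method": "ngram",            # propose tokens by n-gram lookup in the prompt
            "num_speculative_tokens": 5,
            "prompt_lookup_max": 4,       # longest n-gram to match
        },
    )
    outputs = llm.generate(["The future of AI is"], SamplingParams(temperature=0.8))
    print(outputs[0].outputs[0].text)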
@@ -137,7 +134,7 @@ draft models that conditioning draft predictions on both context vectors and sam
For more information see [this blog](https://pytorch.org/blog/hitchhikers-guide-speculative-decoding/) or
[this technical report](https://arxiv.org/abs/2404.19124).

??? code

    ```python
    from vllm import LLM, SamplingParams
@@ -185,7 +182,7 @@ A variety of speculative models of this type are available on HF hub:
The following code configures vLLM to use speculative decoding where proposals are generated by
an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https://arxiv.org/pdf/2401.15077) based draft model. A more detailed example for offline mode, including how to extract request level acceptance rate, can be found [here](gh-file:examples/offline_inference/eagle.py).

??? code

    ```python
    from vllm import LLM, SamplingParams
@@ -217,8 +214,8 @@ an [EAGLE (Extrapolation Algorithm for Greater Language-model Efficiency)](https
A few important things to consider when using the EAGLE based draft models:

1. The EAGLE draft models available in the [HF repository for EAGLE models](https://huggingface.co/yuhuili) should
   be able to be loaded and used directly by vLLM after <gh-pr:12304>.
   If you are using a vLLM version before <gh-pr:12304>, please use the
   [script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
   and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.
@@ -228,7 +225,7 @@ A few important things to consider when using the EAGLE based draft models:

3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
   reported in the reference implementation [here](https://github.com/SafeAILab/EAGLE). This issue is under
   investigation and tracked here: <gh-issue:9565>.

A variety of EAGLE draft models are available on the Hugging Face hub:
@@ -269,7 +266,7 @@ speculative decoding, breaking down the guarantees into three key areas:
3. **vLLM Logprob Stability**
   \- vLLM does not currently guarantee stable token log probabilities (logprobs). This can result in different outputs for the
   same request across runs. For more details, see the FAQ section
   titled *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](../usage/faq.md).

While vLLM strives to ensure losslessness in speculative decoding, variations in generated outputs with and without speculative decoding
can occur due to the following factors:

@@ -278,7 +275,7 @@ can occur due to the following factors:

- **Batch Size and Numerical Stability**: Changes in batch size may cause variations in logprobs and output probabilities, potentially
  due to non-deterministic behavior in batched operations or numerical instability.

For mitigation strategies, please refer to the FAQ entry *Can the output of a prompt vary across runs in vLLM?* in the [FAQs](../usage/faq.md).

## Resources for vLLM contributors
@@ -1,7 +1,4 @@
# Structured Outputs

vLLM supports the generation of structured outputs using
[xgrammar](https://github.com/mlc-ai/xgrammar) or

@@ -21,7 +18,7 @@ The following parameters are supported, which must be added as extra parameters:
- `guided_grammar`: the output will follow the context free grammar.
- `structural_tag`: Follow a JSON schema within a set of specified tags within the generated text.

You can see the complete list of supported parameters on the [OpenAI-Compatible Server](../serving/openai_compatible_server.md) page.

Structured outputs are supported by default in the OpenAI-Compatible Server. You
may choose to specify the backend to use by setting the
@@ -33,7 +30,7 @@ text.

Now let's see an example for each of the cases, starting with the `guided_choice`, as it's the easiest one:

??? code

    ```python
    from openai import OpenAI

@@ -55,7 +52,7 @@ Now let's see an example for each of the cases, starting with the `guided_choic

The next example shows how to use the `guided_regex`. The idea is to generate an email address, given a simple regex template:

??? code

    ```python
    completion = client.chat.completions.create(

@@ -79,7 +76,7 @@ For this we can use the `guided_json` parameter in two different ways:

The next example shows how to use the `guided_json` parameter with a Pydantic model:

??? code

    ```python
    from pydantic import BaseModel
@@ -127,7 +124,7 @@ difficult to use, but it's really powerful. It allows us to define complete
languages like SQL queries. It works by using a context free EBNF grammar.
As an example, we can use it to define a specific format of simplified SQL queries:

??? code

    ```python
    simplified_sql_grammar = """

@@ -157,7 +154,7 @@ As an example, we can use it to define a specific format of simplified SQL queries:
    print(completion.choices[0].message.content)
    ```

See also: [full example](../examples/online_serving/structured_outputs.md)

## Reasoning Outputs
@@ -169,7 +166,7 @@ vllm serve deepseek-ai/DeepSeek-R1-Distill-Qwen-7B --reasoning-parser deepseek_r

Note that you can use reasoning with any provided structured outputs feature. The following uses one with JSON schema:

??? code

    ```python
    from pydantic import BaseModel

@@ -200,7 +197,7 @@ Note that you can use reasoning with any provided structured outputs feature. Th
    print("content: ", completion.choices[0].message.content)
    ```

See also: [full example](../examples/online_serving/structured_outputs.md)

## Experimental Automatic Parsing (OpenAI API)
@@ -212,7 +209,7 @@ For the following examples, vLLM was setup using `vllm serve meta-llama/Llama-3.

Here is a simple example demonstrating how to get structured output using Pydantic models:

??? code

    ```python
    from pydantic import BaseModel

@@ -248,7 +245,7 @@ Age: 28

Here is a more complex example using nested Pydantic models to handle a step-by-step math solution:

??? code

    ```python
    from typing import List
@@ -308,7 +305,7 @@ These parameters can be used in the same way as the parameters from the Online
Serving examples above. One example for the usage of the `choice` parameter is
shown below:

??? code

    ```python
    from vllm import LLM, SamplingParams
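    # (Hunk truncated here. A minimal sketch of offline guided choice decoding,
    # assuming the GuidedDecodingParams helper from vllm.sampling_params; the model
    # name is only an example.)
    from vllm.sampling_params import GuidedDecodingParams

    llm = LLM(model="HuggingFaceTB/SmolLM2-1.7B-Instruct")
    guided = GuidedDecodingParams(choice=["Positive", "Negative"])
    sampling_params = SamplingParams(guided_decoding=guided)
    outputs = llm.generate(
        "Classify this sentiment: vLLM is wonderful!",
        sampling_params=sampling_params,
    )
    print(outputs[0].outputs[0].text)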
@@ -325,4 +322,4 @@ shown below:
    print(outputs[0].outputs[0].text)
    ```

See also: [full example](../examples/online_serving/structured_outputs.md)
@@ -15,7 +15,7 @@ vllm serve meta-llama/Llama-3.1-8B-Instruct \

Next, make a request to the model that should result in it using the available tools:

??? code

    ```python
    from openai import OpenAI
@@ -268,10 +268,10 @@ Flags: `--tool-call-parser hermes`

Supported models:

* `MiniMaxAi/MiniMax-M1-40k` (use with <gh-file:examples/tool_chat_template_minimax_m1.jinja>)
* `MiniMaxAi/MiniMax-M1-80k` (use with <gh-file:examples/tool_chat_template_minimax_m1.jinja>)

Flags: `--tool-call-parser minimax --chat-template examples/tool_chat_template_minimax_m1.jinja`
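
Putting the flags together, serving one of these models with tool calling enabled would look roughly like the following sketch (any extra flags a MiniMax checkpoint may need, such as `--trust-remote-code`, are not covered by this hunk):

```shell
vllm serve MiniMaxAi/MiniMax-M1-40k \
    --enable-auto-tool-choice \
    --tool-call-parser minimax \
    --chat-template examples/tool_chat_template_minimax_m1.jinja
```
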
### DeepSeek-V3 Models (`deepseek_v3`)
@@ -299,20 +299,17 @@ Limitations:
 
 Example supported models:
 
-* `meta-llama/Llama-3.2-1B-Instruct`\* (use with <gh-file:examples/tool_chat_template_llama3.2_pythonic.jinja>)
-* `meta-llama/Llama-3.2-3B-Instruct`\* (use with <gh-file:examples/tool_chat_template_llama3.2_pythonic.jinja>)
+* `meta-llama/Llama-3.2-1B-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama3.2_pythonic.jinja>)
+* `meta-llama/Llama-3.2-3B-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama3.2_pythonic.jinja>)
 * `Team-ACE/ToolACE-8B` (use with <gh-file:examples/tool_chat_template_toolace.jinja>)
 * `fixie-ai/ultravox-v0_4-ToolACE-8B` (use with <gh-file:examples/tool_chat_template_toolace.jinja>)
-* `meta-llama/Llama-4-Scout-17B-16E-Instruct`\* (use with <gh-file:examples/tool_chat_template_llama4_pythonic.jinja>)
-* `meta-llama/Llama-4-Maverick-17B-128E-Instruct`\* (use with <gh-file:examples/tool_chat_template_llama4_pythonic.jinja>)
+* `meta-llama/Llama-4-Scout-17B-16E-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama4_pythonic.jinja>)
+* `meta-llama/Llama-4-Maverick-17B-128E-Instruct` ⚠️ (use with <gh-file:examples/tool_chat_template_llama4_pythonic.jinja>)
 
 Flags: `--tool-call-parser pythonic --chat-template {see_above}`
 
----
-**WARNING**
-Llama's smaller models frequently fail to emit tool calls in the correct format. Your mileage may vary.
-
----
+!!! warning
+    Llama's smaller models frequently fail to emit tool calls in the correct format. Your mileage may vary.
 
 ## How to write a tool parser plugin
 
@@ -320,7 +317,7 @@ A tool parser plugin is a Python file containing one or more ToolParser implemen
 
 Here is a summary of a plugin file:
 
-??? Code
+??? code
 
     ```python
 
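The plugin summary itself is elided from this hunk. As a rough, non-authoritative skeleton of what such a plugin file tends to contain; the import paths, decorator, and method signature below are assumptions to verify against the vLLM release in use:

```python
# Hypothetical plugin skeleton; check module paths and signatures against the
# vLLM version you run, since these interfaces are not guaranteed stable.
from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              ExtractedToolCallInformation)
from vllm.entrypoints.openai.tool_parsers import ToolParser, ToolParserManager


@ToolParserManager.register_module(["example"])
class ExampleToolParser(ToolParser):
    def __init__(self, tokenizer):
        super().__init__(tokenizer)

    # Non-streaming path: pull tool calls out of the model's raw text and
    # return them in the OpenAI tool_calls format.
    def extract_tool_calls(
        self,
        model_output: str,
        request: ChatCompletionRequest,
    ) -> ExtractedToolCallInformation:
        return ExtractedToolCallInformation(
            tools_called=False,
            tool_calls=[],
            content=model_output,
        )
```

Such a file would then be loaded with the server's `--tool-parser-plugin` option and selected via `--tool-call-parser example`, assuming those flags are present in the running release.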
@@ -1,7 +1,4 @@
----
-title: Installation
----
-[](){ #installation-index }
+# Installation
 
 vLLM supports the following hardware platforms:
 
@@ -76,7 +76,7 @@ Currently, there are no pre-built CPU wheels.
 
 ### Build image from source
 
-??? Commands
+??? console "Commands"
 
     ```bash
     docker build -f docker/Dockerfile.cpu \
@@ -149,7 +149,7 @@ vllm serve facebook/opt-125m
 
 - If using vLLM CPU backend on a machine with hyper-threading, it is recommended to bind only one OpenMP thread on each physical CPU core using `VLLM_CPU_OMP_THREADS_BIND` or using auto thread binding feature by default. On a hyper-threading enabled platform with 16 logical CPU cores / 8 physical CPU cores:
 
-??? Commands
+??? console "Commands"
 
     ```console
     $ lscpu -e # check the mapping between logical CPU cores and physical CPU cores
@@ -54,9 +54,6 @@ If the build has error like the following snippet where standard C++ headers can
 ```
 
 # --8<-- [end:build-wheel-from-source]
-# --8<-- [start:set-up-using-docker]
-
-# --8<-- [end:set-up-using-docker]
 # --8<-- [start:pre-built-images]
 
 # --8<-- [end:pre-built-images]
@@ -28,9 +28,6 @@ ARM CPU backend currently supports Float32, FP16 and BFloat16 datatypes.
 Testing has been conducted on AWS Graviton3 instances for compatibility.
 
 # --8<-- [end:build-wheel-from-source]
-# --8<-- [start:set-up-using-docker]
-
-# --8<-- [end:set-up-using-docker]
 # --8<-- [start:pre-built-images]
 
 # --8<-- [end:pre-built-images]
@@ -56,9 +56,6 @@ Execute the following commands to build and install vLLM from the source.
 ```
 
 # --8<-- [end:build-wheel-from-source]
-# --8<-- [start:set-up-using-docker]
-
-# --8<-- [end:set-up-using-docker]
 # --8<-- [start:pre-built-images]
 
 # --8<-- [end:pre-built-images]
@@ -31,9 +31,6 @@ vLLM initially supports basic model inferencing and serving on x86 CPU platform,
 - If you want to force enable AVX512_BF16 for the cross-compilation, please set environment variable `VLLM_CPU_AVX512BF16=1` before the building.
 
 # --8<-- [end:build-wheel-from-source]
-# --8<-- [start:set-up-using-docker]
-
-# --8<-- [end:set-up-using-docker]
 # --8<-- [start:pre-built-images]
 
 See [https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo](https://gallery.ecr.aws/q9t5s3a7/vllm-cpu-release-repo)
@@ -46,11 +46,11 @@ vLLM is a Python library that supports the following GPU variants. Select your G
 
 === "AMD ROCm"
 
-    There is no extra information on creating a new Python environment for this device.
+    --8<-- "docs/getting_started/installation/gpu/rocm.inc.md:set-up-using-python"
 
 === "Intel XPU"
 
-    There is no extra information on creating a new Python environment for this device.
+    --8<-- "docs/getting_started/installation/gpu/xpu.inc.md:set-up-using-python"
 
 ### Pre-built wheels
 
@@ -232,9 +232,6 @@ pip install -e .
 ```
 
 # --8<-- [end:build-wheel-from-source]
-# --8<-- [start:set-up-using-docker]
-
-# --8<-- [end:set-up-using-docker]
 # --8<-- [start:pre-built-images]
 
 See [deployment-docker-pre-built-image][deployment-docker-pre-built-image] for instructions on using the official Docker image.
@@ -261,4 +258,3 @@ See [deployment-docker-build-image-from-source][deployment-docker-build-image-fr
 See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information.
 
 # --8<-- [end:supported-features]
-# --8<-- [end:extra-information]
@@ -2,6 +2,9 @@
 
 vLLM supports AMD GPUs with ROCm 6.3.
 
+!!! tip
+    [Docker](#set-up-using-docker) is the recommended way to use vLLM on ROCm.
+
 !!! warning
     There are no pre-built wheels for this device, so you must either use the pre-built Docker image or build vLLM from source.
 
@@ -14,6 +17,8 @@ vLLM supports AMD GPUs with ROCm 6.3.
 # --8<-- [end:requirements]
 # --8<-- [start:set-up-using-python]
 
+There is no extra information on creating a new Python environment for this device.
+
 # --8<-- [end:set-up-using-python]
 # --8<-- [start:pre-built-wheels]
 
@@ -90,7 +95,7 @@ Currently, there are no pre-built ROCm wheels.
 
 4. Build vLLM. For example, vLLM on ROCM 6.3 can be built with the following steps:
 
-??? Commands
+??? console "Commands"
 
     ```bash
     pip install --upgrade pip
@@ -123,9 +128,7 @@ Currently, there are no pre-built ROCm wheels.
 - For MI300x (gfx942) users, to achieve optimal performance, please refer to [MI300x tuning guide](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/index.html) for performance optimization and tuning tips on system and workflow level.
   For vLLM, please refer to [vLLM performance optimization](https://rocm.docs.amd.com/en/latest/how-to/tuning-guides/mi300x/workload.html#vllm-performance-optimization).
 
-## Set up using Docker (Recommended)
-
-# --8<-- [end:set-up-using-docker]
+# --8<-- [end:build-wheel-from-source]
 # --8<-- [start:pre-built-images]
 
 The [AMD Infinity hub for vLLM](https://hub.docker.com/r/rocm/vllm/tags) offers a prebuilt, optimized
@@ -203,7 +206,7 @@ DOCKER_BUILDKIT=1 docker build \
 
 To run the above docker image `vllm-rocm`, use the below command:
 
-??? Command
+??? console "Command"
 
     ```bash
     docker run -it \
@@ -227,4 +230,3 @@ Where the `<path/to/model>` is the location where the model is stored, for examp
 See [feature-x-hardware][feature-x-hardware] compatibility matrix for feature support information.
 
 # --8<-- [end:supported-features]
-# --8<-- [end:extra-information]
Some files were not shown because too many files have changed in this diff.