[Doc]: fixing typos to improve docs (#24480)
Signed-off-by: Didier Durand <durand.didier@gmail.com>
@@ -169,7 +169,7 @@ All Llama 3.1, 3.2 and 4 models should be supported.

 The tool calling that is supported is the [JSON-based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. As for Llama 4 models, it is recommended to use the `llama4_pythonic` tool parser.

-Other tool calling formats like the built in python tool calling or custom tool calling are not supported.
+Other tool calling formats like the built-in python tool calling or custom tool calling are not supported.

 Known issues:

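Below is a rough, illustrative sketch of the JSON-based tool calling described in the hunk above, sent through vLLM's OpenAI-compatible API with the `openai` client. The endpoint, model name, `get_weather` schema, and the `llama3_json` parser name mentioned in the comment are assumptions for illustration, not taken from this diff.

```python
# Illustrative sketch only: query a vLLM OpenAI-compatible server with a tool
# definition. Assumes the server was started with tool calling enabled for a
# Llama model (e.g. with a JSON tool parser such as `llama3_json`); the
# endpoint, model name, and get_weather schema are placeholders.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

tools = [{
    "type": "function",
    "function": {
        "name": "get_weather",  # hypothetical function, for illustration
        "description": "Get the current weather for a city.",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

response = client.chat.completions.create(
    model="meta-llama/Llama-3.1-8B-Instruct",
    messages=[{"role": "user", "content": "What is the weather in Warsaw?"}],
    tools=tools,
)
print(response.choices[0].message.tool_calls)
```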
@@ -119,7 +119,7 @@ Currently, there are no pre-built ROCm wheels.
 This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.

 !!! tip
-- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
+- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm-up step before collecting perf numbers.
 - Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
 - To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
 - The ROCm version of PyTorch, ideally, should match the ROCm driver version.
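For reference, the same switch can be applied from Python by setting the environment variable before the engine is constructed. This is a minimal sketch using a placeholder model, not a ROCm-specific recipe.

```python
# Turn off Triton flash attention (per the tip above) so that CK
# flash-attention or PyTorch naive attention is used instead. The variable
# must be set before vLLM picks its attention backend; the model name is a
# placeholder.
import os

os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
outputs = llm.generate(["Hello, ROCm!"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)
```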
@@ -9,7 +9,7 @@
 <|system|>
 {{ system_message }}
 {%- if tools %}
-In addition to plain text responses, you can chose to call one or more of the provided functions.
+In addition to plain text responses, you can choose to call one or more of the provided functions.

 Use the following rule to decide when to call a function:
 * if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
@@ -19,7 +19,7 @@ If you decide to call functions:
 * prefix function calls with functools marker (no closing marker required)
 * all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
 * follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples
-* respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
+* respect the argument type formatting. E.g., if the type is number and format is float, write value 7 as 7.0
 * make sure you pick the right functions that match the user intent

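The template above asks the model to emit all calls as a single JSON list prefixed with a `functools` marker. The toy parser below illustrates that format; it is not the parser vLLM ships for this template, and the `get_weather` call is a made-up example.

```python
# Illustrative only: parse a completion that follows the functools[...] format
# described by the chat template above.
import json

def parse_functools_calls(text: str):
    marker = "functools"
    start = text.find(marker)
    if start == -1:
        return []  # plain-text response, no function calls
    # Everything after the marker is expected to be a single JSON list.
    return json.loads(text[start + len(marker):])

sample = 'functools[{"name": "get_weather", "arguments": {"city": "Warsaw", "unit": "celsius"}}]'
for call in parse_functools_calls(sample):
    print(call["name"], call["arguments"])
```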
@@ -25,7 +25,7 @@ class CustomUniExecutor(UniProcExecutor):
 timeout: Optional[float] = None,
 args: tuple = (),
 kwargs: Optional[dict] = None) -> list[Any]:
-# Drop marker to show that this was ran
+# Drop marker to show that this was run
 with open(".marker", "w"):
 ...
 return super().collective_rpc(method, timeout, args, kwargs)
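For context, this hunk comes from a test that overrides `collective_rpc` on a custom executor. A self-contained sketch of the same pattern is shown below; the import path and the type hint on the `method` parameter are assumptions that may differ across vLLM versions.

```python
# A hedged, self-contained variant of the pattern in the hunk above: a custom
# executor whose collective_rpc drops a marker file so a test can assert the
# override really ran. The import path is an assumption.
from typing import Any, Callable, Optional, Union

from vllm.executor.uniproc_executor import UniProcExecutor  # assumed path


class MarkerUniExecutor(UniProcExecutor):

    def collective_rpc(self,
                       method: Union[str, Callable],
                       timeout: Optional[float] = None,
                       args: tuple = (),
                       kwargs: Optional[dict] = None) -> list[Any]:
        # Drop marker to show that this override was run.
        with open(".marker", "w"):
            ...
        return super().collective_rpc(method, timeout, args, kwargs)
```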
@@ -79,7 +79,7 @@ def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
 )

 # Need to re-import huggingface_hub
-# and friends to setup offline mode
+# and friends to set up offline mode
 _re_import_modules()
 # Cached model files should be used in offline mode
 for model_config in MODEL_CONFIGS:
@@ -136,7 +136,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
 disable_connect,
 )
 # Need to re-import huggingface_hub
-# and friends to setup offline mode
+# and friends to set up offline mode
 _re_import_modules()
 engine_args = EngineArgs(model="facebook/opt-125m")
 LLM(**dataclasses.asdict(engine_args))
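Both offline-mode tests re-import `huggingface_hub` after flipping the environment because the library reads its offline flags at import time. The sketch below shows that reload idea in isolation; it is a simplified stand-in for the tests' `_re_import_modules()` helper, not its actual code.

```python
# Rough sketch of the "re-import to set up offline mode" idea: huggingface_hub
# evaluates HF_HUB_OFFLINE when its constants module is loaded, so changing
# the environment variable only takes effect after a reload.
import importlib
import os

os.environ["HF_HUB_OFFLINE"] = "1"

import huggingface_hub.constants

importlib.reload(huggingface_hub.constants)  # re-read offline-related env vars
print(huggingface_hub.constants.HF_HUB_OFFLINE)  # expected: True
```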
@@ -1247,7 +1247,7 @@ def baseline_scaled_mm(a: torch.Tensor,
 # then we would expand a to:
 # a = [[1, 1, 2, 2],
 # [3, 3, 4, 4]]
-# NOTE this function this function does not explicitly broadcast dimensions
+# NOTE this function does not explicitly broadcast dimensions
 # with an extent of 1, since this can be done implicitly by pytorch
 def group_broadcast(t, shape):
 for i, s in enumerate(shape):
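The comment above describes a group-wise broadcast that expands `[[1, 2], [3, 4]]` to `[[1, 1, 2, 2], [3, 3, 4, 4]]`. The snippet below is a minimal sketch of that behaviour using `repeat_interleave`; it is not the repository's `group_broadcast` implementation.

```python
import torch

def group_broadcast_sketch(t: torch.Tensor, shape) -> torch.Tensor:
    # Repeat each element contiguously along every dim whose target extent is
    # an integer multiple of the current extent (group-wise broadcast).
    for dim, target in enumerate(shape):
        current = t.shape[dim]
        if current != target:
            assert target % current == 0
            t = t.repeat_interleave(target // current, dim=dim)
    return t

a = torch.tensor([[1, 2], [3, 4]])
print(group_broadcast_sketch(a, (2, 4)))
# tensor([[1, 1, 2, 2],
#         [3, 3, 4, 4]])
```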
@@ -301,7 +301,7 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
 finished_requests_ids is larger than the maximum mamba block capacity.

 This could generally happen due to the fact that hybrid does support
-statelessness mechanism where it can cleanup new incoming requests in
+statelessness mechanism where it can clean up new incoming requests in
 a single step.
 """
 try:
@@ -322,7 +322,7 @@ def test_state_cleanup(
 This test is for verifying that the Hybrid state is cleaned up between
 steps.

-If its not cleaned, an error would be expected.
+If it's not cleaned, an error would be expected.
 """
 try:
 with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:
@@ -28,7 +28,7 @@ ACCURACY_CONFIGS = [
 expected_value=0.76), # no bias
 # NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
 # so only one of these tests can run in a single call to pytest. As
-# a follow up, move this into the LM-EVAL section of the CI.
+# a follow-up, move this into the LM-EVAL section of the CI.
 # GSM8KAccuracyTestConfig(
 #     model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
 #     expected_value=0.66), # bias in QKV layers
@@ -1117,7 +1117,7 @@ def initialize_model_parallel(
 "decode context model parallel group is already initialized")
 # Note(hc): In the current implementation of decode context parallel,
 # dcp_size must not exceed tp_size, because the world size does not
-# change by DCP, it simply reuse the GPUs of TP group, and split one
+# change by DCP, it simply reuses the GPUs of TP group, and split one
 # TP group into tp_size//dcp_size DCP groups.
 group_ranks = all_ranks.reshape(
 -1, decode_context_model_parallel_size).unbind(0)
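The note above says DCP reuses the TP group's GPUs and splits each TP group into `tp_size // dcp_size` DCP groups. The toy example below shows how the `reshape(...).unbind(0)` call partitions ranks, assuming a single TP group of 8 ranks and a DCP size of 2; the real code reshapes over all parallel dimensions.

```python
import torch

# Hypothetical sizes for illustration: one TP group covering 8 ranks, split
# into tp_size // dcp_size = 4 DCP groups of dcp_size = 2 ranks each.
tp_size, dcp_size = 8, 2
all_ranks = torch.arange(tp_size)

group_ranks = all_ranks.reshape(-1, dcp_size).unbind(0)
print([g.tolist() for g in group_ranks])
# [[0, 1], [2, 3], [4, 5], [6, 7]]
```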