[Doc]: fixing typos to improve docs (#24480)

Signed-off-by: Didier Durand <durand.didier@gmail.com>
Author: Didier Durand
Date: 2025-09-09 08:06:04 +02:00
Committed by: GitHub
Parent: 1823a00d67
Commit: 46876dff32
9 changed files with 12 additions and 12 deletions

@@ -169,7 +169,7 @@ All Llama 3.1, 3.2 and 4 models should be supported.
The tool calling that is supported is the [JSON-based tool calling](https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_1/#json-based-tool-calling). For [pythonic tool calling](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/text_prompt_format.md#zero-shot-function-calling) introduced by the Llama-3.2 models, see the `pythonic` tool parser below. As for Llama 4 models, it is recommended to use the `llama4_pythonic` tool parser.
-Other tool calling formats like the built in python tool calling or custom tool calling are not supported.
+Other tool calling formats like the built-in python tool calling or custom tool calling are not supported.
Known issues:

@@ -119,7 +119,7 @@ Currently, there are no pre-built ROCm wheels.
This may take 5-10 minutes. Currently, `pip install .` does not work for ROCm installation.
!!! tip
-- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
+- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm-up step before collecting perf numbers.
- Triton flash attention does not currently support sliding window attention. If using half precision, please use CK flash-attention for sliding window support.
- To use CK flash-attention or PyTorch naive attention, please use this flag `export VLLM_USE_TRITON_FLASH_ATTN=0` to turn off triton flash attention.
- The ROCm version of PyTorch, ideally, should match the ROCm driver version.
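To make the warm-up and `VLLM_USE_TRITON_FLASH_ATTN=0` tips above concrete, here is a minimal sketch, assuming a ROCm build of vLLM is installed; the model name and prompts are illustrative assumptions, not taken from this diff:

```python
import os
import time

# Turn off Triton flash attention so CK flash-attention (or PyTorch naive
# attention) is used instead, as described in the tip above. Setting the
# variable before vLLM is imported is the conservative choice.
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"

from vllm import LLM

llm = LLM(model="facebook/opt-125m")  # illustrative model choice

llm.generate("warm up")              # warm-up step, excluded from timing
start = time.perf_counter()
llm.generate("measured prompt")      # measured step
print(f"latency: {time.perf_counter() - start:.3f}s")
```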

@@ -9,7 +9,7 @@
<|system|>
{{ system_message }}
{%- if tools %}
-In addition to plain text responses, you can chose to call one or more of the provided functions.
+In addition to plain text responses, you can choose to call one or more of the provided functions.
Use the following rule to decide when to call a function:
* if the response can be generated from your internal knowledge (e.g., as in the case of queries like "What is the capital of Poland?"), do so
@@ -19,7 +19,7 @@ If you decide to call functions:
* prefix function calls with functools marker (no closing marker required)
* all function calls should be generated in a single JSON list formatted as functools[{"name": [function name], "arguments": [function arguments as JSON]}, ...]
* follow the provided JSON schema. Do not hallucinate arguments or values. Do to blindly copy values from the provided samples
-* respect the argument type formatting. E.g., if the type if number and format is float, write value 7 as 7.0
+* respect the argument type formatting. E.g., if the type is number and format is float, write value 7 as 7.0
* make sure you pick the right functions that match the user intent
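As a rough illustration of the `functools[...]` output format described above, a minimal parsing sketch; the helper name and the sample completion are illustrative assumptions, not part of the template:

```python
import json

def parse_functools_calls(completion: str):
    """Return the list of tool calls following the `functools` marker, or None."""
    marker = "functools"
    start = completion.find(marker)
    if start == -1:
        return None  # plain-text response, no function calls
    # Assumes the completion ends with the JSON list, as the rules above require.
    return json.loads(completion[start + len(marker):])

sample = 'functools[{"name": "get_weather", "arguments": {"city": "Warsaw", "days": 7.0}}]'
print(parse_functools_calls(sample))
# [{'name': 'get_weather', 'arguments': {'city': 'Warsaw', 'days': 7.0}}]
```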

@@ -25,7 +25,7 @@ class CustomUniExecutor(UniProcExecutor):
timeout: Optional[float] = None,
args: tuple = (),
kwargs: Optional[dict] = None) -> list[Any]:
-# Drop marker to show that this was ran
+# Drop marker to show that this was run
with open(".marker", "w"):
...
return super().collective_rpc(method, timeout, args, kwargs)
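A hedged sketch of how an executor like `CustomUniExecutor` might be exercised end to end; passing the class via `distributed_executor_backend` and the model name are assumptions, not shown in this diff:

```python
import os

from vllm import LLM

# Assumption: this vLLM version accepts an executor class for
# distributed_executor_backend; CustomUniExecutor is the class shown above.
llm = LLM(model="facebook/opt-125m",
          distributed_executor_backend=CustomUniExecutor)
llm.generate("ping")

# The overridden collective_rpc above should have dropped the marker file.
assert os.path.exists(".marker")
```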

@@ -79,7 +79,7 @@ def test_offline_mode(monkeypatch: pytest.MonkeyPatch):
)
# Need to re-import huggingface_hub
-# and friends to setup offline mode
+# and friends to set up offline mode
_re_import_modules()
# Cached model files should be used in offline mode
for model_config in MODEL_CONFIGS:
@@ -136,7 +136,7 @@ def test_model_from_huggingface_offline(monkeypatch: pytest.MonkeyPatch):
disable_connect,
)
# Need to re-import huggingface_hub
-# and friends to setup offline mode
+# and friends to set up offline mode
_re_import_modules()
engine_args = EngineArgs(model="facebook/opt-125m")
LLM(**dataclasses.asdict(engine_args))
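A minimal sketch of the offline behaviour these tests exercise, assuming the model is already present in the local Hugging Face cache; the model name and prompt are illustrative assumptions:

```python
import os

# Put huggingface_hub and transformers into offline mode before vLLM
# imports them, so only locally cached files are used.
os.environ["HF_HUB_OFFLINE"] = "1"
os.environ["TRANSFORMERS_OFFLINE"] = "1"

from vllm import LLM

llm = LLM(model="facebook/opt-125m")  # must already be cached locally
print(llm.generate("offline hello")[0].outputs[0].text)
```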

@@ -1247,7 +1247,7 @@ def baseline_scaled_mm(a: torch.Tensor,
# then we would expand a to:
# a = [[1, 1, 2, 2],
# [3, 3, 4, 4]]
-# NOTE this function this function does not explicitly broadcast dimensions
+# NOTE this function does not explicitly broadcast dimensions
# with an extent of 1, since this can be done implicitly by pytorch
def group_broadcast(t, shape):
for i, s in enumerate(shape):
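The expansion described in the comment above can be reproduced with `repeat_interleave`; a short sketch using the comment's own example values (the group size of 2 and everything else here are illustrative, independent of the truncated function body shown):

```python
import torch

a = torch.tensor([[1, 2],
                  [3, 4]])

# Repeat each per-group value along dim 1 so the (2, 2) tensor lines up
# with a (2, 4) operand, matching the expansion shown in the comment.
expanded = a.repeat_interleave(2, dim=1)
print(expanded)
# tensor([[1, 1, 2, 2],
#         [3, 3, 4, 4]])
```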

@@ -301,7 +301,7 @@ def test_fail_upon_inc_requests_and_finished_requests_lt_available_blocks(
finished_requests_ids is larger than the maximum mamba block capacity.
This could generally happen due to the fact that hybrid does support
-statelessness mechanism where it can cleanup new incoming requests in
+statelessness mechanism where it can clean up new incoming requests in
a single step.
"""
try:
@@ -322,7 +322,7 @@ def test_state_cleanup(
This test is for verifying that the Hybrid state is cleaned up between
steps.
-If its not cleaned, an error would be expected.
+If it's not cleaned, an error would be expected.
"""
try:
with vllm_runner(model, max_num_seqs=MAX_NUM_SEQS) as vllm_model:

@@ -28,7 +28,7 @@ ACCURACY_CONFIGS = [
expected_value=0.76), # no bias
# NOTE(rob): We cannot re-initialize vLLM in the same process for TPU,
# so only one of these tests can run in a single call to pytest. As
-# a follow up, move this into the LM-EVAL section of the CI.
+# a follow-up, move this into the LM-EVAL section of the CI.
# GSM8KAccuracyTestConfig(
# model_name="neuralmagic/Qwen2-7B-Instruct-quantized.w8a8",
# expected_value=0.66), # bias in QKV layers

@@ -1117,7 +1117,7 @@ def initialize_model_parallel(
"decode context model parallel group is already initialized")
# Note(hc): In the current implementation of decode context parallel,
# dcp_size must not exceed tp_size, because the world size does not
-# change by DCP, it simply reuse the GPUs of TP group, and split one
+# change by DCP, it simply reuses the GPUs of TP group, and split one
# TP group into tp_size//dcp_size DCP groups.
group_ranks = all_ranks.reshape(
-1, decode_context_model_parallel_size).unbind(0)
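A small sketch of the rank grouping the comment above describes, where each TP group is reused and split into `tp_size // dcp_size` DCP groups; the world size and DCP size below are illustrative assumptions:

```python
import torch

world_size, dcp_size = 8, 2  # assume tp_size == 8, dcp_size == 2

all_ranks = torch.arange(world_size)
# Each row becomes one decode-context-parallel group of dcp_size ranks,
# yielding tp_size // dcp_size == 4 groups over the same TP GPUs.
group_ranks = all_ranks.reshape(-1, dcp_size).unbind(0)
print([g.tolist() for g in group_ranks])
# [[0, 1], [2, 3], [4, 5], [6, 7]]
```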