[Doc] ruff format remaining Python examples (#26795)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung
2025-10-15 16:25:49 +08:00
committed by GitHub
parent 71557a5f7c
commit 6256697997
21 changed files with 166 additions and 105 deletions

View File

@@ -22,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
 from awq import AutoAWQForCausalLM
 from transformers import AutoTokenizer
-model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
-quant_path = 'mistral-instruct-v0.2-awq'
-quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
+model_path = "mistralai/Mistral-7B-Instruct-v0.2"
+quant_path = "mistral-instruct-v0.2-awq"
+quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
 # Load model
 model = AutoAWQForCausalLM.from_pretrained(
-    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    model_path,
+    low_cpu_mem_usage=True,
+    use_cache=False,
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
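The hunk above only loads the model and tokenizer. As a hedged sketch of the step that typically follows in AutoAWQ examples (method names are taken from the AutoAWQ project and should be treated as assumptions; the snippet reuses `model`, `tokenizer`, `quant_config`, and `quant_path` from above):

```python
# Run AWQ quantization with the config defined above, then persist the
# quantized weights and tokenizer so they can be served later.
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

print(f'Model is quantized and saved at "{quant_path}"')
```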

View File

@@ -34,7 +34,7 @@ llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
     trust_remote_code=True,
-    quantization="bitblas"
+    quantization="bitblas",
 )
 ```
@@ -53,6 +53,6 @@ llm = LLM(
     dtype=torch.float16,
     trust_remote_code=True,
     quantization="bitblas",
-    max_model_len=1024
+    max_model_len=1024,
 )
 ```

View File

@@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
 llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
-    trust_remote_code=True
+    trust_remote_code=True,
 )
 ```
@@ -43,7 +43,7 @@ llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
     trust_remote_code=True,
-    quantization="bitsandbytes"
+    quantization="bitsandbytes",
 )
 ```
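Both hunks construct an `LLM` but stop before inference. A minimal, self-contained sketch of generating with such a bitsandbytes checkpoint (the prompt and sampling values are illustrative):

```python
import torch
from vllm import LLM, SamplingParams

# Load the pre-quantized bitsandbytes checkpoint, mirroring the hunk above.
llm = LLM(
    model="unsloth/tinyllama-bnb-4bit",
    dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Illustrative sampling settings; tune these for your use case.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)
outputs = llm.generate(["The capital of France is"], sampling_params)
print(outputs[0].outputs[0].text)
```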

View File

@@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
@@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio
 # Configure the simple PTQ quantization
 recipe = QuantizationModifier(
-    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+    targets="Linear",
+    scheme="FP8_DYNAMIC",
+    ignore=["lm_head"],
+)
 # Apply the quantization algorithm.
 oneshot(model=model, recipe=recipe)
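After `oneshot` runs, the llm-compressor examples typically save the compressed checkpoint. A hedged sketch, reusing `MODEL_ID`, `model`, and `tokenizer` from the first hunk (the `save_compressed` keyword and the directory name are assumptions):

```python
# Write the quantized weights and tokenizer to disk so vLLM can load them.
# NOTE: `save_compressed=True` follows the llm-compressor convention for
# compressed-tensors checkpoints; treat the exact keyword as an assumption.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```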

View File

@@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
 conversation = [
     {
         "role": "system",
-        "content": "You are a helpful assistant"
+        "content": "You are a helpful assistant",
     },
     {
         "role": "user",
-        "content": "Hello"
+        "content": "Hello",
     },
     {
         "role": "assistant",
-        "content": "Hello! How can I assist you today?"
+        "content": "Hello! How can I assist you today?",
     },
     {
         "role": "user",
@@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # Create an LLM.
-llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-          tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+llm = LLM(
+    model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.chat(conversation, sampling_params)
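To round out the snippet, a short sketch of consuming the `RequestOutput` objects returned by `llm.chat` above:

```python
# Each RequestOutput holds the prompt plus one or more generated candidates.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```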

View File

@@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
 calibration_dataset = load_dataset(
     "allenai/c4",
     data_files="en/c4-train.00001-of-01024.json.gz",
-    split="train"
+    split="train",
 ).select(range(1024))["text"]
 quant_config = QuantizeConfig(bits=4, group_size=128)
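The calibration data and `QuantizeConfig` above feed GPTQModel's quantize-and-save flow. A minimal sketch, assuming the `load`/`quantize`/`save` API from the GPTQModel project (treat these method names and the output path as assumptions):

```python
from gptqmodel import GPTQModel

# Assumed GPTQModel usage: load the FP16 model with the quantization config,
# run GPTQ calibration on the dataset above, then write the quantized checkpoint.
model = GPTQModel.load("meta-llama/Llama-3.2-1B-Instruct", quant_config)
model.quantize(calibration_dataset, batch_size=2)
model.save("Llama-3.2-1B-Instruct-gptqmodel-4bit")
```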

View File

@@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
@@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
     },
     ignore=["lm_head"],
     update_size=NUM_CALIBRATION_SAMPLES,
-    dampening_frac=0.01
+    dampening_frac=0.01,
 )
 ```

View File

@@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```

View File

@@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
 from vllm import LLM, SamplingParams
 def main():
     model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
-    # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
+    # Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
     llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
     sampling_params = SamplingParams(temperature=0.8, top_p=0.9)

View File

@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
 from vllm import LLM, SamplingParams
 sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
-llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
-          kv_cache_dtype="fp8",
-          calculate_kv_scales=True)
+llm = LLM(
+    model="meta-llama/Llama-2-7b-chat-hf",
+    kv_cache_dtype="fp8",
+    calculate_kv_scales=True,
+)
 prompt = "London is the capital of"
 out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 print(out)

View File

@@ -48,7 +48,9 @@ to fetch model and tokenizer.
 MAX_SEQ_LEN = 512
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
 )
 model.eval()
@@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
 dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
 text_data = dataset["text"][:NUM_CALIBRATION_DATA]
-tokenized_outputs = tokenizer(text_data, return_tensors="pt",
-                              padding=True, truncation=True, max_length=MAX_SEQ_LEN)
-calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
-                              batch_size=BATCH_SIZE, drop_last=True)
+tokenized_outputs = tokenizer(
+    text_data,
+    return_tensors="pt",
+    padding=True,
+    truncation=True,
+    max_length=MAX_SEQ_LEN,
+)
+calib_dataloader = DataLoader(
+    tokenized_outputs['input_ids'],
+    batch_size=BATCH_SIZE,
+    drop_last=True,
+)
 ```
 ### 3. Set the Quantization Configuration
@@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
                                        load_quant_algo_config_from_file)
 # Define fp8/per-tensor/static spec.
-FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
-                                           is_dynamic=False).to_quantization_spec()
+FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
+    observer_method="min_max",
+    is_dynamic=False,
+).to_quantization_spec()
 # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
-global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
-                                         weight=FP8_PER_TENSOR_SPEC)
+global_quant_config = QuantizationConfig(
+    input_tensors=FP8_PER_TENSOR_SPEC,
+    weight=FP8_PER_TENSOR_SPEC,
+)
 # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
 KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
 kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
-kv_cache_quant_config = {name :
-    QuantizationConfig(input_tensors=global_quant_config.input_tensors,
-                       weight=global_quant_config.weight,
-                       output_tensors=KV_CACHE_SPEC)
-    for name in kv_cache_layer_names_for_llama}
+kv_cache_quant_config = {
+    name: QuantizationConfig(
+        input_tensors=global_quant_config.input_tensors,
+        weight=global_quant_config.weight,
+        output_tensors=KV_CACHE_SPEC,
+    )
+    for name in kv_cache_layer_names_for_llama
+}
 layer_quant_config = kv_cache_quant_config.copy()
 # Define algorithm config by config file.
-LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
-    'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
+LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
 algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
 EXCLUDE_LAYERS = ["lm_head"]
@@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
     layer_quant_config=layer_quant_config,
     kv_cache_quant_config=kv_cache_quant_config,
     exclude=EXCLUDE_LAYERS,
-    algo_config=algo_config)
+    algo_config=algo_config,
+)
 ```
 ### 4. Quantize the Model and Export
@@ -165,8 +182,11 @@ for more exporting format details.
 EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
 exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
 with torch.no_grad():
-    exporter.export_safetensors_model(freezed_model,
-        quant_config=quant_config, tokenizer=tokenizer)
+    exporter.export_safetensors_model(
+        freezed_model,
+        quant_config=quant_config,
+        tokenizer=tokenizer,
+    )
 ```
 ### 5. Evaluation in vLLM
@@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # Create an LLM.
-llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
-          kv_cache_dtype='fp8',quantization='quark')
+llm = LLM(
+    model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
+    kv_cache_dtype="fp8",
+    quantization="quark",
+)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)

View File

@@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
     api_key=openai_api_key,
     base_url=openai_api_base,
 )
-completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
-                                       prompt="San Francisco is a")
+completion = client.completions.create(
+    model="Qwen/Qwen2.5-1.5B-Instruct",
+    prompt="San Francisco is a",
+)
 print("Completion result:", completion)
 ```
@@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package:
     messages=[
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "Tell me a joke."},
-    ]
+    ],
 )
 print("Chat response:", chat_response)
 ```
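For reference, the `client` object used in the calls above is typically constructed like this against a local vLLM OpenAI-compatible server (the base URL is illustrative; vLLM does not check the API key by default, so any placeholder works):

```python
from openai import OpenAI

# Point the official OpenAI client at the local vLLM server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
```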

View File

@@ -60,7 +60,7 @@ from vllm import LLM
 llm = LLM(
     "s3://my-bucket/vllm/facebook/opt-125m/v1",
     load_format="tensorizer",
-    enable_lora=True
+    enable_lora=True,
 )
 ```
@@ -97,6 +97,6 @@ llm = LLM(
     "s3://my-bucket/vllm/facebook/opt-125m/v1",
     load_format="tensorizer",
     enable_lora=True,
-    model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}
+    model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}},
 )
 ```

View File

@@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
 conversation = [
     {
         "role": "system",
-        "content": "You are a helpful assistant"
+        "content": "You are a helpful assistant",
     },
     {
         "role": "user",
-        "content": "Hello"
+        "content": "Hello",
     },
     {
         "role": "assistant",
-        "content": "Hello! How can I assist you today?"
+        "content": "Hello! How can I assist you today?",
     },
     {
         "role": "user",

View File

@@ -130,8 +130,10 @@ It is designed for embedding models and cross-encoder models. Embedding models u
 from vllm import LLM
 llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling")
-(output,) = llm.score("What is the capital of France?",
-                      "The capital of Brazil is Brasilia.")
+(output,) = llm.score(
+    "What is the capital of France?",
+    "The capital of Brazil is Brasilia.",
+)
 score = output.outputs.score
 print(f"Score: {score}")
@@ -209,7 +211,7 @@ For models that support Matryoshka Embeddings but not recognized by vLLM, please
 Here is an example to serve a model with Matryoshka Embeddings enabled.
-```text
+```bash
 vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
 ```
@@ -220,11 +222,15 @@ You can change the output dimensions of embedding models that support Matryoshka
 ```python
 from vllm import LLM, PoolingParams
-llm = LLM(model="jinaai/jina-embeddings-v3",
-          runner="pooling",
-          trust_remote_code=True)
-outputs = llm.embed(["Follow the white rabbit."],
-                    pooling_params=PoolingParams(dimensions=32))
+llm = LLM(
+    model="jinaai/jina-embeddings-v3",
+    runner="pooling",
+    trust_remote_code=True,
+)
+outputs = llm.embed(
+    ["Follow the white rabbit."],
+    pooling_params=PoolingParams(dimensions=32),
+)
 print(outputs[0].outputs)
 ```
@@ -234,13 +240,13 @@ A code example can be found here: <gh-file:examples/offline_inference/pooling/em
 Use the following command to start vllm server.
-```text
+```bash
 vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
 ```
 You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
-```text
+```bash
 curl http://127.0.0.1:8000/v1/embeddings \
   -H 'accept: application/json' \
   -H 'Content-Type: application/json' \

View File

@@ -278,8 +278,8 @@ https_proxy=http://your.proxy.server:port vllm serve <model_name>
 ```python
 import os
-os.environ['http_proxy'] = 'http://your.proxy.server:port'
-os.environ['https_proxy'] = 'http://your.proxy.server:port'
+os.environ["http_proxy"] = "http://your.proxy.server:port"
+os.environ["https_proxy"] = "http://your.proxy.server:port"
 ```
 ### ModelScope

View File

@@ -243,10 +243,10 @@ try:
                 "remote_engine_id": None,  # Will be populated by vLLM
                 "remote_block_ids": None,  # Will be populated by vLLM
                 "remote_host": None,  # Will be populated by vLLM
-                "remote_port": None  # Will be populated by vLLM
+                "remote_port": None,  # Will be populated by vLLM
             }
         },
-        extra_headers={"X-Request-Id": request_id}
+        extra_headers={"X-Request-Id": request_id},
     )
     print("-" * 50)
@@ -262,7 +262,7 @@ try:
         extra_body={
             "kv_transfer_params": prefill_response.kv_transfer_params  # Pass KV cache info
         },
-        extra_headers={"X-Request-Id": request_id}  # Same request ID
+        extra_headers={"X-Request-Id": request_id},  # Same request ID
     )
     print("-" * 50)

View File

@@ -15,13 +15,15 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`
 ```python
 from langchain_community.llms import VLLM
-llm = VLLM(model="mosaicml/mpt-7b",
-           trust_remote_code=True,  # mandatory for hf models
-           max_new_tokens=128,
-           top_k=10,
-           top_p=0.95,
-           temperature=0.8,
-           # tensor_parallel_size=... # for distributed inference
+llm = VLLM(
+    model="mosaicml/mpt-7b",
+    trust_remote_code=True,  # mandatory for hf models
+    max_new_tokens=128,
+    top_k=10,
+    top_p=0.95,
+    temperature=0.8,
+    # for distributed inference
+    # tensor_parallel_size=...,
 )
 print(llm("What is the capital of France ?"))

View File

@@ -24,8 +24,8 @@ To call the server, in your preferred text editor, create a script that uses an
 completion = client.chat.completions.create(
     model="NousResearch/Meta-Llama-3-8B-Instruct",
     messages=[
-        {"role": "user", "content": "Hello!"}
-    ]
+        {"role": "user", "content": "Hello!"},
+    ],
 )
 print(completion.choices[0].message)
@@ -101,8 +101,13 @@ both a `type` and a `text` field. An example is provided below:
 completion = client.chat.completions.create(
     model="NousResearch/Meta-Llama-3-8B-Instruct",
     messages=[
-        {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]}
-    ]
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"},
+            ],
+        },
+    ],
 )
 ```
@@ -130,11 +135,11 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
 completion = client.chat.completions.create(
     model="NousResearch/Meta-Llama-3-8B-Instruct",
     messages=[
-        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
     ],
     extra_body={
-        "structured_outputs": {"choice": ["positive", "negative"]}
-    }
+        "structured_outputs": {"choice": ["positive", "negative"]},
+    },
 )
 ```
@@ -149,11 +154,11 @@ with `--enable-request-id-headers`.
 completion = client.chat.completions.create(
     model="NousResearch/Meta-Llama-3-8B-Instruct",
     messages=[
-        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
     ],
     extra_headers={
         "x-request-id": "sentiment-classification-00001",
-    }
+    },
 )
 print(completion._request_id)
@@ -162,7 +167,7 @@ with `--enable-request-id-headers`.
     prompt="A robot may not injure a human being",
     extra_headers={
         "x-request-id": "completion-test",
-    }
+    },
 )
 print(completion._request_id)
 ```
@@ -403,7 +408,7 @@ The Transcriptions API supports uploading audio files in various formats includi
     model="openai/whisper-large-v3-turbo",
     file=audio_file,
     language="en",
-    response_format="verbose_json"
+    response_format="verbose_json",
 )
 print(transcription.text)
@@ -825,8 +830,8 @@ You can pass multi-modal inputs to scoring models by passing `content` including
                         "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
                     },
                 },
-            ]
-        }
+            ],
+        },
     },
 )
 response.raise_for_status()

View File

@@ -152,7 +152,9 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
     """
     try:
         url = s3_client.generate_presigned_url(
-            ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in
+            ClientMethod=client_method,
+            Params=method_parameters,
+            ExpiresIn=expires_in,
         )
     except ClientError:
         raise
@@ -161,10 +163,16 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
 s3_client = boto3.client("s3")
 input_url = generate_presigned_url(
-    s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600
+    s3_client,
+    "get_object",
+    {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"},
+    expires_in=3600,
 )
 output_url = generate_presigned_url(
-    s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600
+    s3_client,
+    "put_object",
+    {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"},
+    expires_in=3600,
 )
 print(f"{input_url=}")
 print(f"{output_url=}")

View File

@@ -84,7 +84,7 @@ directly to load models:
 from vllm import LLM
 llm = LLM(
     "s3://my-bucket/vllm/facebook/opt-125m/v1",
-    load_format="tensorizer"
+    load_format="tensorizer",
 )
 ```