[Doc] ruff format remaining Python examples (#26795)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Cyrus Leung
2025-10-15 16:25:49 +08:00
committed by GitHub
parent 71557a5f7c
commit 6256697997
21 changed files with 166 additions and 105 deletions

View File

@@ -22,13 +22,15 @@ After installing AutoAWQ, you are ready to quantize a model. Please refer to the
 from awq import AutoAWQForCausalLM
 from transformers import AutoTokenizer
-model_path = 'mistralai/Mistral-7B-Instruct-v0.2'
-quant_path = 'mistral-instruct-v0.2-awq'
-quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" }
+model_path = "mistralai/Mistral-7B-Instruct-v0.2"
+quant_path = "mistral-instruct-v0.2-awq"
+quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
 # Load model
 model = AutoAWQForCausalLM.from_pretrained(
-    model_path, **{"low_cpu_mem_usage": True, "use_cache": False}
+    model_path,
+    low_cpu_mem_usage=True,
+    use_cache=False,
 )
 tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
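The hunk above only loads the model and tokenizer. As a hedged sketch of the step that typically follows in AutoAWQ examples (method names are taken from the AutoAWQ project and should be treated as assumptions; the snippet reuses `model`, `tokenizer`, `quant_config`, and `quant_path` from above):

```python
# Run AWQ quantization with the config defined above, then persist the
# quantized weights and tokenizer so they can be served later.
model.quantize(tokenizer, quant_config=quant_config)
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)

print(f'Model is quantized and saved at "{quant_path}"')
```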

View File

@@ -34,7 +34,7 @@ llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
     trust_remote_code=True,
-    quantization="bitblas"
+    quantization="bitblas",
 )
 ```
@@ -53,6 +53,6 @@ llm = LLM(
     dtype=torch.float16,
     trust_remote_code=True,
     quantization="bitblas",
-    max_model_len=1024
+    max_model_len=1024,
 )
 ```

View File

@@ -27,7 +27,7 @@ model_id = "unsloth/tinyllama-bnb-4bit"
 llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
-    trust_remote_code=True
+    trust_remote_code=True,
 )
 ```
@@ -43,7 +43,7 @@ llm = LLM(
     model=model_id,
     dtype=torch.bfloat16,
     trust_remote_code=True,
-    quantization="bitsandbytes"
+    quantization="bitsandbytes",
 )
 ```
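Both hunks construct an `LLM` but stop before inference. A minimal, self-contained sketch of generating with such a bitsandbytes checkpoint (the prompt and sampling values are illustrative):

```python
import torch
from vllm import LLM, SamplingParams

# Load the pre-quantized bitsandbytes checkpoint, mirroring the hunk above.
llm = LLM(
    model="unsloth/tinyllama-bnb-4bit",
    dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Illustrative sampling settings; tune these for your use case.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=64)
outputs = llm.generate(["The capital of France is"], sampling_params)
print(outputs[0].outputs[0].text)
```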

View File

@@ -41,7 +41,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
@@ -63,7 +65,10 @@ Since simple RTN does not require data for weight quantization and the activatio
 # Configure the simple PTQ quantization
 recipe = QuantizationModifier(
-    targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])
+    targets="Linear",
+    scheme="FP8_DYNAMIC",
+    ignore=["lm_head"],
+)
 # Apply the quantization algorithm.
 oneshot(model=model, recipe=recipe)
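After `oneshot` runs, the llm-compressor examples typically save the compressed checkpoint. A hedged sketch, reusing `MODEL_ID`, `model`, and `tokenizer` from the first hunk (the `save_compressed` keyword and the directory name are assumptions):

```python
# Write the quantized weights and tokenizer to disk so vLLM can load them.
# NOTE: `save_compressed=True` follows the llm-compressor convention for
# compressed-tensors checkpoints; treat the exact keyword as an assumption.
SAVE_DIR = MODEL_ID.split("/")[1] + "-FP8-Dynamic"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
```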

View File

@@ -47,15 +47,15 @@ You can also use the GGUF model directly through the LLM entrypoint:
 conversation = [
     {
         "role": "system",
-        "content": "You are a helpful assistant"
+        "content": "You are a helpful assistant",
     },
     {
         "role": "user",
-        "content": "Hello"
+        "content": "Hello",
     },
     {
         "role": "assistant",
-        "content": "Hello! How can I assist you today?"
+        "content": "Hello! How can I assist you today?",
     },
     {
         "role": "user",
@@ -67,8 +67,10 @@ You can also use the GGUF model directly through the LLM entrypoint:
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # Create an LLM.
-llm = LLM(model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
-          tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+llm = LLM(
+    model="./tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf",
+    tokenizer="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.chat(conversation, sampling_params)
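To round out the snippet, a short sketch of consuming the `RequestOutput` objects returned by `llm.chat` above:

```python
# Each RequestOutput holds the prompt plus one or more generated candidates.
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```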

View File

@@ -40,7 +40,7 @@ Here is an example of how to quantize `meta-llama/Llama-3.2-1B-Instruct`:
 calibration_dataset = load_dataset(
     "allenai/c4",
     data_files="en/c4-train.00001-of-01024.json.gz",
-    split="train"
+    split="train",
 ).select(range(1024))["text"]
 quant_config = QuantizeConfig(bits=4, group_size=128)
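The calibration data and `QuantizeConfig` above feed GPTQModel's quantize-and-save flow. A minimal sketch, assuming the `load`/`quantize`/`save` API from the GPTQModel project (treat these method names and the output path as assumptions):

```python
from gptqmodel import GPTQModel

# Assumed GPTQModel usage: load the FP16 model with the quantization config,
# run GPTQ calibration on the dataset above, then write the quantized checkpoint.
model = GPTQModel.load("meta-llama/Llama-3.2-1B-Instruct", quant_config)
model.quantize(calibration_dataset, batch_size=2)
model.save("Llama-3.2-1B-Instruct-gptqmodel-4bit")
```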

View File

@@ -39,7 +39,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```
@@ -166,7 +168,7 @@ The following is an example of an expanded quantization recipe you can tune to y
     },
     ignore=["lm_head"],
     update_size=NUM_CALIBRATION_SAMPLES,
-    dampening_frac=0.01
+    dampening_frac=0.01,
 )
 ```

View File

@@ -44,7 +44,9 @@ from transformers import AutoTokenizer, AutoModelForCausalLM
 MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
 )
 tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 ```

View File

@@ -56,9 +56,9 @@ The quantized checkpoint can then be deployed with vLLM. As an example, the foll
 from vllm import LLM, SamplingParams
 def main():
     model_id = "nvidia/Llama-3.1-8B-Instruct-FP8"
-    # Ensure you specify quantization='modelopt' when loading the modelopt checkpoint
+    # Ensure you specify quantization="modelopt" when loading the modelopt checkpoint
     llm = LLM(model=model_id, quantization="modelopt", trust_remote_code=True)
     sampling_params = SamplingParams(temperature=0.8, top_p=0.9)

View File

@@ -41,9 +41,11 @@ Here is an example of how to enable FP8 quantization:
 from vllm import LLM, SamplingParams
 sampling_params = SamplingParams(temperature=0.7, top_p=0.8)
-llm = LLM(model="meta-llama/Llama-2-7b-chat-hf",
-          kv_cache_dtype="fp8",
-          calculate_kv_scales=True)
+llm = LLM(
+    model="meta-llama/Llama-2-7b-chat-hf",
+    kv_cache_dtype="fp8",
+    calculate_kv_scales=True,
+)
 prompt = "London is the capital of"
 out = llm.generate(prompt, sampling_params)[0].outputs[0].text
 print(out)

View File

@@ -48,7 +48,9 @@ to fetch model and tokenizer.
 MAX_SEQ_LEN = 512
 model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID, device_map="auto", torch_dtype="auto",
+    MODEL_ID,
+    device_map="auto",
+    torch_dtype="auto",
 )
 model.eval()
@@ -75,10 +77,18 @@ to [Adding Calibration Datasets](https://quark.docs.amd.com/latest/pytorch/calib
 dataset = load_dataset("mit-han-lab/pile-val-backup", split="validation")
 text_data = dataset["text"][:NUM_CALIBRATION_DATA]
-tokenized_outputs = tokenizer(text_data, return_tensors="pt",
-                              padding=True, truncation=True, max_length=MAX_SEQ_LEN)
-calib_dataloader = DataLoader(tokenized_outputs['input_ids'],
-                              batch_size=BATCH_SIZE, drop_last=True)
+tokenized_outputs = tokenizer(
+    text_data,
+    return_tensors="pt",
+    padding=True,
+    truncation=True,
+    max_length=MAX_SEQ_LEN,
+)
+calib_dataloader = DataLoader(
+    tokenized_outputs['input_ids'],
+    batch_size=BATCH_SIZE,
+    drop_last=True,
+)
 ```
 ### 3. Set the Quantization Configuration
@@ -103,26 +113,32 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
                                        load_quant_algo_config_from_file)
 # Define fp8/per-tensor/static spec.
-FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(observer_method="min_max",
-                                           is_dynamic=False).to_quantization_spec()
+FP8_PER_TENSOR_SPEC = FP8E4M3PerTensorSpec(
+    observer_method="min_max",
+    is_dynamic=False,
+).to_quantization_spec()
 # Define global quantization config, input tensors and weight apply FP8_PER_TENSOR_SPEC.
-global_quant_config = QuantizationConfig(input_tensors=FP8_PER_TENSOR_SPEC,
-                                         weight=FP8_PER_TENSOR_SPEC)
+global_quant_config = QuantizationConfig(
+    input_tensors=FP8_PER_TENSOR_SPEC,
+    weight=FP8_PER_TENSOR_SPEC,
+)
 # Define quantization config for kv-cache layers, output tensors apply FP8_PER_TENSOR_SPEC.
 KV_CACHE_SPEC = FP8_PER_TENSOR_SPEC
 kv_cache_layer_names_for_llama = ["*k_proj", "*v_proj"]
-kv_cache_quant_config = {name :
-    QuantizationConfig(input_tensors=global_quant_config.input_tensors,
-                       weight=global_quant_config.weight,
-                       output_tensors=KV_CACHE_SPEC)
-    for name in kv_cache_layer_names_for_llama}
+kv_cache_quant_config = {
+    name: QuantizationConfig(
+        input_tensors=global_quant_config.input_tensors,
+        weight=global_quant_config.weight,
+        output_tensors=KV_CACHE_SPEC,
+    )
+    for name in kv_cache_layer_names_for_llama
+}
 layer_quant_config = kv_cache_quant_config.copy()
 # Define algorithm config by config file.
-LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE =
-    'examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json'
+LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE = "examples/torch/language_modeling/llm_ptq/models/llama/autosmoothquant_config.json"
 algo_config = load_quant_algo_config_from_file(LLAMA_AUTOSMOOTHQUANT_CONFIG_FILE)
 EXCLUDE_LAYERS = ["lm_head"]
@@ -131,7 +147,8 @@ kv-cache and the quantization algorithm is AutoSmoothQuant.
     layer_quant_config=layer_quant_config,
     kv_cache_quant_config=kv_cache_quant_config,
     exclude=EXCLUDE_LAYERS,
-    algo_config=algo_config)
+    algo_config=algo_config,
+)
 ```
 ### 4. Quantize the Model and Export
@@ -165,8 +182,11 @@ for more exporting format details.
 EXPORT_DIR = MODEL_ID.split("/")[1] + "-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant"
 exporter = ModelExporter(config=export_config, export_dir=EXPORT_DIR)
 with torch.no_grad():
-    exporter.export_safetensors_model(freezed_model,
-        quant_config=quant_config, tokenizer=tokenizer)
+    exporter.export_safetensors_model(
+        freezed_model,
+        quant_config=quant_config,
+        tokenizer=tokenizer,
+    )
 ```
 ### 5. Evaluation in vLLM
@@ -189,8 +209,11 @@ Now, you can load and run the Quark quantized model directly through the LLM ent
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 # Create an LLM.
-llm = LLM(model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
-          kv_cache_dtype='fp8',quantization='quark')
+llm = LLM(
+    model="Llama-2-70b-chat-hf-w-fp8-a-fp8-kvcache-fp8-pertensor-autosmoothquant",
+    kv_cache_dtype="fp8",
+    quantization="quark",
+)
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
 outputs = llm.generate(prompts, sampling_params)

View File

@@ -194,8 +194,10 @@ Since this server is compatible with OpenAI API, you can use it as a drop-in rep
     api_key=openai_api_key,
     base_url=openai_api_base,
 )
-completion = client.completions.create(model="Qwen/Qwen2.5-1.5B-Instruct",
-                                       prompt="San Francisco is a")
+completion = client.completions.create(
+    model="Qwen/Qwen2.5-1.5B-Instruct",
+    prompt="San Francisco is a",
+)
 print("Completion result:", completion)
 ```
@@ -239,7 +241,7 @@ Alternatively, you can use the `openai` Python package:
     messages=[
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "Tell me a joke."},
-    ]
+    ],
 )
 print("Chat response:", chat_response)
 ```
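For reference, the `client` object used in the calls above is typically constructed like this against a local vLLM OpenAI-compatible server (the base URL is illustrative; vLLM does not check the API key by default, so any placeholder works):

```python
from openai import OpenAI

# Point the official OpenAI client at the local vLLM server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)
```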

View File

@@ -60,7 +60,7 @@ from vllm import LLM
 llm = LLM(
     "s3://my-bucket/vllm/facebook/opt-125m/v1",
     load_format="tensorizer",
-    enable_lora=True
+    enable_lora=True,
 )
 ```
@@ -97,6 +97,6 @@ llm = LLM(
     "s3://my-bucket/vllm/facebook/opt-125m/v1",
     load_format="tensorizer",
     enable_lora=True,
-    model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}}
+    model_loader_extra_config={"deserialization_kwargs": {"num_readers": 2}},
 )
 ```

View File

@@ -98,15 +98,15 @@ and automatically applies the model's [chat template](https://huggingface.co/doc
 conversation = [
     {
         "role": "system",
-        "content": "You are a helpful assistant"
+        "content": "You are a helpful assistant",
     },
     {
         "role": "user",
-        "content": "Hello"
+        "content": "Hello",
     },
     {
         "role": "assistant",
-        "content": "Hello! How can I assist you today?"
+        "content": "Hello! How can I assist you today?",
     },
     {
         "role": "user",

View File

@@ -130,8 +130,10 @@ It is designed for embedding models and cross-encoder models. Embedding models u
 from vllm import LLM
 llm = LLM(model="BAAI/bge-reranker-v2-m3", runner="pooling")
-(output,) = llm.score("What is the capital of France?",
-                      "The capital of Brazil is Brasilia.")
+(output,) = llm.score(
+    "What is the capital of France?",
+    "The capital of Brazil is Brasilia.",
+)
 score = output.outputs.score
 print(f"Score: {score}")
@@ -209,7 +211,7 @@ For models that support Matryoshka Embeddings but not recognized by vLLM, please
 Here is an example to serve a model with Matryoshka Embeddings enabled.
-```text
+```bash
 vllm serve Snowflake/snowflake-arctic-embed-m-v1.5 --hf-overrides '{"matryoshka_dimensions":[256]}'
 ```
@@ -220,11 +222,15 @@ You can change the output dimensions of embedding models that support Matryoshka
 ```python
 from vllm import LLM, PoolingParams
-llm = LLM(model="jinaai/jina-embeddings-v3",
-          runner="pooling",
-          trust_remote_code=True)
-outputs = llm.embed(["Follow the white rabbit."],
-                    pooling_params=PoolingParams(dimensions=32))
+llm = LLM(
+    model="jinaai/jina-embeddings-v3",
+    runner="pooling",
+    trust_remote_code=True,
+)
+outputs = llm.embed(
+    ["Follow the white rabbit."],
+    pooling_params=PoolingParams(dimensions=32),
+)
 print(outputs[0].outputs)
 ```
@@ -234,13 +240,13 @@ A code example can be found here: <gh-file:examples/offline_inference/pooling/em
 Use the following command to start vllm server.
-```text
+```bash
 vllm serve jinaai/jina-embeddings-v3 --trust-remote-code
 ```
 You can change the output dimensions of embedding models that support Matryoshka Embeddings by using the dimensions parameter.
-```text
+```bash
 curl http://127.0.0.1:8000/v1/embeddings \
   -H 'accept: application/json' \
   -H 'Content-Type: application/json' \

View File

@@ -278,8 +278,8 @@ https_proxy=http://your.proxy.server:port vllm serve <model_name>
 ```python
 import os
-os.environ['http_proxy'] = 'http://your.proxy.server:port'
-os.environ['https_proxy'] = 'http://your.proxy.server:port'
+os.environ["http_proxy"] = "http://your.proxy.server:port"
+os.environ["https_proxy"] = "http://your.proxy.server:port"
 ```
 ### ModelScope

View File

@@ -243,10 +243,10 @@ try:
                 "remote_engine_id": None,  # Will be populated by vLLM
                 "remote_block_ids": None,  # Will be populated by vLLM
                 "remote_host": None,  # Will be populated by vLLM
-                "remote_port": None  # Will be populated by vLLM
+                "remote_port": None,  # Will be populated by vLLM
             }
         },
-        extra_headers={"X-Request-Id": request_id}
+        extra_headers={"X-Request-Id": request_id},
     )
     print("-" * 50)
@@ -262,7 +262,7 @@ try:
         extra_body={
             "kv_transfer_params": prefill_response.kv_transfer_params  # Pass KV cache info
         },
-        extra_headers={"X-Request-Id": request_id}  # Same request ID
+        extra_headers={"X-Request-Id": request_id},  # Same request ID
     )
     print("-" * 50)

View File

@@ -15,13 +15,15 @@ To run inference on a single or multiple GPUs, use `VLLM` class from `langchain`
 ```python
 from langchain_community.llms import VLLM
-llm = VLLM(model="mosaicml/mpt-7b",
-           trust_remote_code=True,  # mandatory for hf models
-           max_new_tokens=128,
-           top_k=10,
-           top_p=0.95,
-           temperature=0.8,
-           # tensor_parallel_size=... # for distributed inference
+llm = VLLM(
+    model="mosaicml/mpt-7b",
+    trust_remote_code=True,  # mandatory for hf models
+    max_new_tokens=128,
+    top_k=10,
+    top_p=0.95,
+    temperature=0.8,
+    # for distributed inference
+    # tensor_parallel_size=...,
 )
 print(llm("What is the capital of France ?"))

View File

@@ -24,8 +24,8 @@ To call the server, in your preferred text editor, create a script that uses an
 completion = client.chat.completions.create(
     model="NousResearch/Meta-Llama-3-8B-Instruct",
     messages=[
-        {"role": "user", "content": "Hello!"}
-    ]
+        {"role": "user", "content": "Hello!"},
+    ],
 )
 print(completion.choices[0].message)
@@ -101,8 +101,13 @@ both a `type` and a `text` field. An example is provided below:
 completion = client.chat.completions.create(
     model="NousResearch/Meta-Llama-3-8B-Instruct",
     messages=[
-        {"role": "user", "content": [{"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"}]}
-    ]
+        {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "Classify this sentiment: vLLM is wonderful!"},
+            ],
+        },
+    ],
 )
 ```
@@ -130,11 +135,11 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
 completion = client.chat.completions.create(
     model="NousResearch/Meta-Llama-3-8B-Instruct",
     messages=[
-        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
     ],
     extra_body={
-        "structured_outputs": {"choice": ["positive", "negative"]}
-    }
+        "structured_outputs": {"choice": ["positive", "negative"]},
+    },
 )
 ```
@@ -149,11 +154,11 @@ with `--enable-request-id-headers`.
 completion = client.chat.completions.create(
     model="NousResearch/Meta-Llama-3-8B-Instruct",
     messages=[
-        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
+        {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"},
     ],
     extra_headers={
         "x-request-id": "sentiment-classification-00001",
-    }
+    },
 )
 print(completion._request_id)
@@ -162,7 +167,7 @@ with `--enable-request-id-headers`.
     prompt="A robot may not injure a human being",
     extra_headers={
         "x-request-id": "completion-test",
-    }
+    },
 )
 print(completion._request_id)
 ```
@@ -403,7 +408,7 @@ The Transcriptions API supports uploading audio files in various formats includi
     model="openai/whisper-large-v3-turbo",
     file=audio_file,
     language="en",
-    response_format="verbose_json"
+    response_format="verbose_json",
 )
 print(transcription.text)
@@ -825,8 +830,8 @@ You can pass multi-modal inputs to scoring models by passing `content` including
                         "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
                     },
                 },
-            ]
-        }
+            ],
+        },
     },
 )
 response.raise_for_status()

View File

@@ -152,7 +152,9 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
     """
     try:
         url = s3_client.generate_presigned_url(
-            ClientMethod=client_method, Params=method_parameters, ExpiresIn=expires_in
+            ClientMethod=client_method,
+            Params=method_parameters,
+            ExpiresIn=expires_in,
         )
     except ClientError:
         raise
@@ -161,10 +163,16 @@ def generate_presigned_url(s3_client, client_method, method_parameters, expires_
 s3_client = boto3.client("s3")
 input_url = generate_presigned_url(
-    s3_client, "get_object", {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"}, 3600
+    s3_client,
+    "get_object",
+    {"Bucket": "MY_BUCKET", "Key": "MY_INPUT_FILE.jsonl"},
+    expires_in=3600,
 )
 output_url = generate_presigned_url(
-    s3_client, "put_object", {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"}, 3600
+    s3_client,
+    "put_object",
+    {"Bucket": "MY_BUCKET", "Key": "MY_OUTPUT_FILE.jsonl"},
+    expires_in=3600,
 )
 print(f"{input_url=}")
 print(f"{output_url=}")

View File

@@ -84,7 +84,7 @@ directly to load models:
 from vllm import LLM
 llm = LLM(
     "s3://my-bucket/vllm/facebook/opt-125m/v1",
-    load_format="tensorizer"
+    load_format="tensorizer",
 )
 ```