mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 14:53:52 +08:00
Compare commits
1 Commits
d31f7844f8
...
torch-2.8
Author | SHA1 | Date | |
---|---|---|---|
dabc03baa7 |
@ -840,3 +840,8 @@ Key capabilities:
|
|||||||
The following example shows how to deploy a large model like DeepSeek R1 with Ray Serve LLM: <gh-file:examples/online_serving/ray_serve_deepseek.py>.
|
The following example shows how to deploy a large model like DeepSeek R1 with Ray Serve LLM: <gh-file:examples/online_serving/ray_serve_deepseek.py>.
|
||||||
|
|
||||||
Learn more about Ray Serve LLM with the official [Ray Serve LLM documentation](https://docs.ray.io/en/latest/serve/llm/serving-llms.html).
|
Learn more about Ray Serve LLM with the official [Ray Serve LLM documentation](https://docs.ray.io/en/latest/serve/llm/serving-llms.html).
|
||||||
|
|
||||||
|
curl http://localhost:8002/v1/rerank -H "Content-Type: application/json" -d '{
|
||||||
|
"query": "What is the capital of France?",
|
||||||
|
"documents": ["The capital of France is Paris.", "The capital of Germany is Berlin."]
|
||||||
|
}'
|
@ -5,10 +5,4 @@ numba == 0.60.0; python_version == '3.9' # v0.61 doesn't support Python 3.9. Req
|
|||||||
numba == 0.61.2; python_version > '3.9'
|
numba == 0.61.2; python_version > '3.9'
|
||||||
|
|
||||||
# Dependencies for NVIDIA GPUs
|
# Dependencies for NVIDIA GPUs
|
||||||
ray[cgraph]>=2.48.0 # Ray Compiled Graph, required for pipeline parallelism in V1.
|
torch==2.8.0
|
||||||
torch==2.7.1
|
|
||||||
torchaudio==2.7.1
|
|
||||||
# These must be updated alongside torch
|
|
||||||
torchvision==0.22.1 # Required for phi3v processor. See https://github.com/pytorch/vision?tab=readme-ov-file#installation for corresponding version
|
|
||||||
# https://github.com/facebookresearch/xformers/releases/tag/v0.0.31
|
|
||||||
xformers==0.0.31; platform_system == 'Linux' and platform_machine == 'x86_64' # Requires PyTorch >= 2.7
|
|
@ -39,3 +39,20 @@ def test_models(example_prompts, model_name) -> None:
|
|||||||
expected_str = EXPECTED_STRS_MAP[model_name][i]
|
expected_str = EXPECTED_STRS_MAP[model_name][i]
|
||||||
assert expected_str == output_str, (
|
assert expected_str == output_str, (
|
||||||
f"Expected: {expected_str!r}\nvLLM: {output_str!r}")
|
f"Expected: {expected_str!r}\nvLLM: {output_str!r}")
|
||||||
|
|
||||||
|
curl http://localhost:8002/v1/embeddings \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"input": "Query: What is the capital of France? \n\nDocuments: \n1. Paris is the capital city of France.\n2. Berlin is the capital of Germany.\n \n Rank the documents from most to least relevant to the query and provide a relevance score",
|
||||||
|
"model": "$MODEL",
|
||||||
|
"encoding_format": "float"
|
||||||
|
}'
|
||||||
|
|
||||||
|
|
||||||
|
curl http://localhost:8002/v1/rerank \
|
||||||
|
-H "Content-Type: application/json" \
|
||||||
|
-d '{
|
||||||
|
"input": "Query: What is the capital of France? \n\nDocuments: \n1. Paris is the capital city of France.\n2. Berlin is the capital of Germany.\n \n Rank the documents from most to least relevant to the query and provide a relevance score",
|
||||||
|
"prompt": "Query: What is the capital of France? \n\nDocuments: \n1. Paris is the capital city of France.\n2. Berlin is the capital of Germany.\n \n Rank the documents from most to least relevant to the query and provide a relevance score",
|
||||||
|
"model": "BAAI/bge-reranker-v2-m3"
|
||||||
|
}'
|
||||||
|
Reference in New Issue
Block a user