[CI] Add mteb testing to test the accuracy of the embedding model (#17175)
requirements/test.in
@@ -33,6 +33,7 @@ num2words # required for smolvlm test
 opencv-python-headless >= 4.11.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]==0.4.8 # required for model evaluation test
+mteb>=1.38.11, <2 # required for mteb test
 transformers==4.51.3
 tokenizers==0.21.1
 huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
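
Taken together, this commit pins mteb in the test requirements, regenerates the requirements/test.txt lockfile accordingly, adds a server-based MTEB correctness test for an embedding model, and removes the unconditional skips from the existing per-model MTEB tests. The mteb package is the Massive Text Embedding Benchmark harness; as a rough, illustrative sketch of what that dependency provides (the task name, model choice, and result indexing below are assumptions for illustration, not taken from this commit), an evaluation amounts to handing the harness any object that exposes an encode() method:

# Illustrative sketch only: score an embedding model on one MTEB STS task.
# The task name, model, and result indexing are assumptions; the vLLM tests
# wire this up through their own mteb_utils helpers instead.
import mteb
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-m3")   # any object with .encode() works
tasks = mteb.get_tasks(tasks=["STS12"])
results = mteb.MTEB(tasks=tasks).run(model, output_folder=None)
print(results[0].scores["test"][0]["main_score"])

For STS-style tasks the main score is a Spearman correlation of similarity scores; whatever task list MTEB_EMBED_TASKS selects, the new test below compares that main score between vLLM and SentenceTransformers.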

requirements/test.txt
@@ -99,6 +99,7 @@ datasets==3.0.2
     # via
     #   evaluate
     #   lm-eval
+    #   mteb
 decorator==5.1.1
     # via librosa
 dill==0.3.8
@@ -124,6 +125,8 @@ email-validator==2.2.0
     # via pydantic
 encodec==0.1.1
     # via vocos
+eval-type-backport==0.2.2
+    # via mteb
 evaluate==0.4.3
     # via lm-eval
 fastparquet==2024.11.0
@@ -291,6 +294,8 @@ msgpack==1.1.0
     # via
     #   librosa
     #   ray
+mteb==1.38.11
+    # via -r requirements/test.in
 multidict==6.1.0
     # via
     #   aiohttp
@@ -331,6 +336,7 @@ numpy==1.26.4
     #   librosa
     #   matplotlib
     #   mistral-common
+    #   mteb
     #   numba
     #   numexpr
     #   opencv-python-headless
@@ -443,6 +449,8 @@ plotly==5.24.1
     # via genai-perf
 pluggy==1.5.0
     # via pytest
+polars==1.29.0
+    # via mteb
 pooch==1.8.2
     # via librosa
 portalocker==2.10.1
@@ -476,6 +484,7 @@ pydantic==2.9.2
     # via
     #   datamodel-code-generator
     #   mistral-common
+    #   mteb
 pydantic-core==2.23.4
     # via pydantic
 pygments==2.18.0
@@ -522,6 +531,8 @@ python-dateutil==2.9.0.post0
     #   typepy
 python-rapidjson==1.20
     # via tritonclient
+pytrec-eval-terrier==0.5.7
+    # via mteb
 pytz==2024.2
     # via
     #   pandas
@@ -564,6 +575,7 @@ requests==2.32.3
     #   huggingface-hub
     #   lm-eval
     #   mistral-common
+    #   mteb
     #   pooch
     #   ray
     #   responses
@@ -580,6 +592,7 @@ rfc3987==1.3.8
 rich==13.9.4
     # via
     #   genai-perf
+    #   mteb
     #   typer
 rouge-score==0.1.2
     # via lm-eval
@@ -607,16 +620,20 @@ scikit-learn==1.5.2
     # via
     #   librosa
     #   lm-eval
+    #   mteb
     #   sentence-transformers
 scipy==1.13.1
     # via
     #   librosa
+    #   mteb
     #   scikit-learn
     #   sentence-transformers
     #   statsmodels
     #   vocos
 sentence-transformers==3.2.1
-    # via -r requirements/test.in
+    # via
+    #   -r requirements/test.in
+    #   mteb
 sentencepiece==0.2.0
     # via mistral-common
 setuptools==77.0.3
@@ -696,6 +713,7 @@ torch==2.7.0+cu128
     #   fastsafetensors
     #   lm-eval
     #   mamba-ssm
+    #   mteb
     #   peft
     #   runai-model-streamer
     #   sentence-transformers
@@ -720,6 +738,7 @@ tqdm==4.66.6
     #   evaluate
     #   huggingface-hub
     #   lm-eval
+    #   mteb
     #   nltk
     #   peft
     #   pqdm
@@ -759,6 +778,7 @@ typing-extensions==4.12.2
     #   huggingface-hub
     #   librosa
     #   mistral-common
+    #   mteb
     #   pqdm
     #   pydantic
     #   pydantic-core

tests/entrypoints/openai/correctness/test_mteb.py (new file, 42 lines)
# SPDX-License-Identifier: Apache-2.0
import math
import os

import pytest

from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
                                                       OpenAIClientMtebEncoder,
                                                       run_mteb_embed_task,
                                                       run_mteb_embed_task_st)
from tests.utils import RemoteOpenAIServer

os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

MODEL_NAME = "BAAI/bge-m3"
DTYPE = "float16"
MAIN_SCORE = 0.7873427091972599


@pytest.fixture(scope="module")
def server():
    args = [
        "--task", "embed", "--dtype", DTYPE, "--enforce-eager",
        "--max-model-len", "512"
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


def test_mteb(server):
    client = server.get_client()
    encoder = OpenAIClientMtebEncoder(MODEL_NAME, client)
    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
    st_main_score = MAIN_SCORE or run_mteb_embed_task_st(
        MODEL_NAME, MTEB_EMBED_TASKS)

    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", st_main_score - vllm_main_score)

    assert math.isclose(st_main_score, vllm_main_score, rel_tol=1e-4)
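
The test above boots a vLLM OpenAI-compatible server for BAAI/bge-m3, wraps its /v1/embeddings endpoint in an MTEB-compatible encoder, runs the shared MTEB_EMBED_TASKS list against it, and compares the resulting main score with the pre-computed SentenceTransformers baseline MAIN_SCORE; the "MAIN_SCORE or run_mteb_embed_task_st(...)" expression only recomputes that baseline when the cached constant is removed or set to a falsy value, so CI does not have to re-run the SentenceTransformers reference on every build. The encoder wrapper itself comes from tests/models/language/pooling/mteb_utils.py and is not part of this diff; a minimal hypothetical sketch of what such a wrapper has to do (names and details assumed, not taken from the real helper) is:

# Hypothetical sketch of an OpenAI-client-backed MTEB encoder; the real
# OpenAIClientMtebEncoder lives in tests/models/language/pooling/mteb_utils.py
# and is not shown in this diff.
import numpy as np


class ClientMtebEncoderSketch:

    def __init__(self, model_name, client):
        self.model_name = model_name
        self.client = client  # an openai.OpenAI client pointed at the vLLM server

    def encode(self, sentences, **kwargs):
        # mteb only needs an encode() that maps a list of strings to a 2-D
        # array of embeddings; /v1/embeddings returns one vector per input.
        resp = self.client.embeddings.create(model=self.model_name,
                                             input=list(sentences))
        return np.array([item.embedding for item in resp.data])

With rel_tol=1e-4 in the assertion, the served embeddings have to reproduce the SentenceTransformers main score to within 0.01% even at float16.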
@@ -58,8 +58,6 @@ MODELS = [
 @pytest.mark.parametrize("model_info", MODELS)
 def test_models_mteb(hf_runner, vllm_runner,
                      model_info: EmbedModelInfo) -> None:
-    pytest.skip("Skipping mteb test.")
-
     from .mteb_utils import mteb_test_embed_models

     vllm_extra_kwargs: dict[str, Any] = {}
@@ -23,7 +23,6 @@ MODELS = [
 @pytest.mark.parametrize("model_info", MODELS)
 def test_models_mteb(hf_runner, vllm_runner,
                      model_info: EmbedModelInfo) -> None:
-    pytest.skip("Skipping mteb test.")
     from .mteb_utils import mteb_test_embed_models
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)

@@ -46,7 +46,6 @@ def test_models_mteb(
     vllm_runner,
     model_info: EmbedModelInfo,
 ) -> None:
-    pytest.skip("Skipping mteb test.")
     from .mteb_utils import mteb_test_embed_models
     mteb_test_embed_models(hf_runner, vllm_runner, model_info)

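
The three hunks above remove the unconditional pytest.skip("Skipping mteb test.") calls, so the pre-existing per-model MTEB tests in tests/models/language/pooling run again alongside the new server-based test. They delegate to mteb_test_embed_models from the same mteb_utils module, whose implementation is not part of this diff; in outline (a sketch under assumed encoder wrappers and an assumed task list, not the actual helper), the comparison it performs looks like this:

# Outline only: the real helper takes hf_runner/vllm_runner fixtures and wraps
# them into MTEB-compatible encoders; those details are assumed away here.
import math

import mteb


def compare_embed_models_sketch(st_encoder, vllm_encoder,
                                task_names=("STS12",)) -> None:
    tasks = mteb.get_tasks(tasks=list(task_names))

    def main_score(encoder) -> float:
        results = mteb.MTEB(tasks=tasks).run(encoder, output_folder=None)
        return results[0].scores["test"][0]["main_score"]

    # Same tolerance as the server-based test above.
    assert math.isclose(main_score(st_encoder), main_score(vllm_encoder),
                        rel_tol=1e-4)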