[CI] Add mteb testing to test the accuracy of the embedding model (#17175)

Author: wang.yuqi
Date: 2025-05-20 21:51:12 +08:00
Committed by: GitHub
Parent: d6c86d09ae
Commit: 86847700d7
6 changed files with 64 additions and 5 deletions

View File

@@ -33,6 +33,7 @@ num2words # required for smolvlm test
opencv-python-headless >= 4.11.0 # required for video test
datamodel_code_generator # required for minicpm3 test
lm-eval[api]==0.4.8 # required for model evaluation test
mteb>=1.38.11, <2 # required for mteb test
transformers==4.51.3
tokenizers==0.21.1
huggingface-hub[hf_xet]>=0.30.0 # Required for Xet downloads.
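
For context, a minimal sketch of how the newly pinned mteb package can score an embedding model on a single task. The task name, the use of sentence-transformers, and the result handling below are illustrative assumptions, not the exact code the CI test runs:

import mteb
from sentence_transformers import SentenceTransformer

# Illustrative only: score one small embedding task end to end.
model = SentenceTransformer("BAAI/bge-m3")  # the checkpoint the new CI test targets
tasks = mteb.get_tasks(tasks=["STS12"])     # any small embedding task works for a smoke check
results = mteb.MTEB(tasks=tasks).run(model)

for task_result in results:
    # Per-split metrics; the CI test compares each task's main score between
    # vLLM and SentenceTransformers (result layout varies across mteb versions).
    print(task_result.task_name, task_result.scores)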

View File

@@ -99,6 +99,7 @@ datasets==3.0.2
# via
# evaluate
# lm-eval
# mteb
decorator==5.1.1
# via librosa
dill==0.3.8
@@ -124,6 +125,8 @@ email-validator==2.2.0
# via pydantic
encodec==0.1.1
# via vocos
eval-type-backport==0.2.2
# via mteb
evaluate==0.4.3
# via lm-eval
fastparquet==2024.11.0
@@ -291,6 +294,8 @@ msgpack==1.1.0
# via
# librosa
# ray
mteb==1.38.11
# via -r requirements/test.in
multidict==6.1.0
# via
# aiohttp
@@ -331,6 +336,7 @@ numpy==1.26.4
# librosa
# matplotlib
# mistral-common
# mteb
# numba
# numexpr
# opencv-python-headless
@@ -443,6 +449,8 @@ plotly==5.24.1
# via genai-perf
pluggy==1.5.0
# via pytest
polars==1.29.0
# via mteb
pooch==1.8.2
# via librosa
portalocker==2.10.1
@@ -476,6 +484,7 @@ pydantic==2.9.2
# via
# datamodel-code-generator
# mistral-common
# mteb
pydantic-core==2.23.4
# via pydantic
pygments==2.18.0
@@ -522,6 +531,8 @@ python-dateutil==2.9.0.post0
# typepy
python-rapidjson==1.20
# via tritonclient
pytrec-eval-terrier==0.5.7
# via mteb
pytz==2024.2
# via
# pandas
@@ -564,6 +575,7 @@ requests==2.32.3
# huggingface-hub
# lm-eval
# mistral-common
# mteb
# pooch
# ray
# responses
@@ -580,6 +592,7 @@ rfc3987==1.3.8
rich==13.9.4
# via
# genai-perf
# mteb
# typer
rouge-score==0.1.2
# via lm-eval
@@ -607,16 +620,20 @@ scikit-learn==1.5.2
# via
# librosa
# lm-eval
# mteb
# sentence-transformers
scipy==1.13.1
# via
# librosa
# mteb
# scikit-learn
# sentence-transformers
# statsmodels
# vocos
sentence-transformers==3.2.1
# via -r requirements/test.in
# via
# -r requirements/test.in
# mteb
sentencepiece==0.2.0
# via mistral-common
setuptools==77.0.3
@@ -696,6 +713,7 @@ torch==2.7.0+cu128
# fastsafetensors
# lm-eval
# mamba-ssm
# mteb
# peft
# runai-model-streamer
# sentence-transformers
@@ -720,6 +738,7 @@ tqdm==4.66.6
# evaluate
# huggingface-hub
# lm-eval
# mteb
# nltk
# peft
# pqdm
@@ -759,6 +778,7 @@ typing-extensions==4.12.2
# huggingface-hub
# librosa
# mistral-common
# mteb
# pqdm
# pydantic
# pydantic-core

View File

@@ -0,0 +1,42 @@
# SPDX-License-Identifier: Apache-2.0
import math
import os

import pytest

from tests.models.language.pooling.mteb_utils import (MTEB_EMBED_TASKS,
                                                      OpenAIClientMtebEncoder,
                                                      run_mteb_embed_task,
                                                      run_mteb_embed_task_st)
from tests.utils import RemoteOpenAIServer

os.environ["VLLM_LOGGING_LEVEL"] = "WARNING"

MODEL_NAME = "BAAI/bge-m3"
DTYPE = "float16"
# Pre-computed SentenceTransformers reference score for MODEL_NAME on
# MTEB_EMBED_TASKS; set it to a falsy value to recompute the baseline via
# run_mteb_embed_task_st instead of using the cached constant.
MAIN_SCORE = 0.7873427091972599


@pytest.fixture(scope="module")
def server():
    # Serve the model through vLLM's OpenAI-compatible API in embedding mode.
    args = [
        "--task", "embed", "--dtype", DTYPE, "--enforce-eager",
        "--max-model-len", "512"
    ]

    with RemoteOpenAIServer(MODEL_NAME, args) as remote_server:
        yield remote_server


def test_mteb(server):
    client = server.get_client()
    encoder = OpenAIClientMtebEncoder(MODEL_NAME, client)
    vllm_main_score = run_mteb_embed_task(encoder, MTEB_EMBED_TASKS)
    st_main_score = MAIN_SCORE or run_mteb_embed_task_st(
        MODEL_NAME, MTEB_EMBED_TASKS)

    print("VLLM main score: ", vllm_main_score)
    print("SentenceTransformer main score: ", st_main_score)
    print("Difference: ", st_main_score - vllm_main_score)

    # The vLLM-served embeddings must reproduce the SentenceTransformers
    # reference score to within a small relative tolerance.
    assert math.isclose(st_main_score, vllm_main_score, rel_tol=1e-4)
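
The test exercises the full serving path: mteb gets its embeddings from the vLLM server's OpenAI-compatible endpoint rather than running the model in-process. A rough conceptual sketch of that wiring, assuming the standard openai Python client; the real OpenAIClientMtebEncoder lives in tests/models/language/pooling/mteb_utils.py and its naming, batching, and normalization may differ:

import numpy as np


class OpenAICompatibleEncoder:
    # Minimal mteb-style encoder that proxies /v1/embeddings (illustrative only,
    # not the helper used by the test above).

    def __init__(self, model_name, client):
        self.model_name = model_name
        self.client = client  # an openai.OpenAI client pointed at the vLLM server

    def encode(self, sentences, **kwargs):
        # mteb passes a list of strings and expects one embedding vector per string.
        resp = self.client.embeddings.create(model=self.model_name,
                                             input=list(sentences))
        return np.array([item.embedding for item in resp.data])

Routing through the HTTP endpoint means the score check covers request handling, batching, and pooling in the server, not just the model's forward pass.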

View File

@@ -58,8 +58,6 @@ MODELS = [
@pytest.mark.parametrize("model_info", MODELS)
def test_models_mteb(hf_runner, vllm_runner,
                     model_info: EmbedModelInfo) -> None:
    pytest.skip("Skipping mteb test.")
    from .mteb_utils import mteb_test_embed_models
    vllm_extra_kwargs: dict[str, Any] = {}

View File

@@ -23,7 +23,6 @@ MODELS = [
@pytest.mark.parametrize("model_info", MODELS)
def test_models_mteb(hf_runner, vllm_runner,
                     model_info: EmbedModelInfo) -> None:
    pytest.skip("Skipping mteb test.")
    from .mteb_utils import mteb_test_embed_models
    mteb_test_embed_models(hf_runner, vllm_runner, model_info)

View File

@@ -46,7 +46,6 @@ def test_models_mteb(
    vllm_runner,
    model_info: EmbedModelInfo,
) -> None:
    pytest.skip("Skipping mteb test.")
    from .mteb_utils import mteb_test_embed_models
    mteb_test_embed_models(hf_runner, vllm_runner, model_info)