mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 14:53:52 +08:00
[CI/Build] Tweak Marlin Nondeterminism Issues (#4713)
This commit is contained in:
@ -1,13 +1,11 @@
|
||||
"""Compares the outputs of gptq vs gptq_marlin
|
||||
Note: GPTQ and Marlin do not have bitwise correctness.
|
||||
As a result, in this test, we just confirm that the top selected tokens of the
|
||||
Marlin/GPTQ models are in the top 3 selections of each other.
|
||||
Marlin/GPTQ models are in the top 5 selections of each other.
|
||||
Note: Marlin internally uses locks to synchronize the threads. This can
|
||||
result in very slight nondeterminism for Marlin. As a result, we re-run the test
|
||||
up to 3 times to see if we pass.
|
||||
Note: This test currently fails running with --forked with the following:
|
||||
RuntimeError: Cannot re-initialize CUDA in forked subprocess.
|
||||
To use CUDA with multiprocessing, you must use the 'spawn' start method
|
||||
|
||||
Run `pytest tests/models/test_gptq_marlin.py`.
|
||||
"""
|
||||
import os
|
||||
@ -49,7 +47,7 @@ MODELS = [
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.flaky(reruns=2)
|
||||
@pytest.mark.flaky(reruns=3)
|
||||
@pytest.mark.skipif(gptq_marlin_not_supported,
|
||||
reason="gptq_marlin is not supported on this GPU type.")
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@ -75,7 +73,7 @@ def test_models(
|
||||
tensor_parallel_size=1)
|
||||
|
||||
gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
|
||||
example_prompts, max_tokens, num_logprobs)
|
||||
example_prompts[:-1], max_tokens, num_logprobs)
|
||||
del gptq_marlin_model
|
||||
|
||||
# Run gptq.
|
||||
@ -85,7 +83,7 @@ def test_models(
|
||||
quantization="gptq",
|
||||
max_model_len=MAX_MODEL_LEN,
|
||||
tensor_parallel_size=1)
|
||||
gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
|
||||
gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts[:-1],
|
||||
max_tokens,
|
||||
num_logprobs)
|
||||
del gptq_model
|
||||
|
Reference in New Issue
Block a user