Mirror of https://github.com/vllm-project/vllm-ascend.git, synced 2025-10-20 13:43:53 +08:00
ACLgraph enable: Test cases revisions for all features (#3388)
### What this PR does / why we need it?
This PR revises the test cases of various features in the repository, enabling aclgraph in those test cases.

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?
UT.

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
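The recurring pattern in the diff below: tests switch from `enforce_eager=True` to `enforce_eager=False` (or parametrize `enforce_eager`) so the ACL graph path is exercised, and the serve scripts drop the `--enforce-eager` flag. A minimal sketch of the parametrized form, under stated assumptions: the test name, prompt, and import path are illustrative, while `VllmRunner`, `generate_greedy`, and the model name follow the existing e2e tests.

```python
import pytest

# Assumed import location for the VllmRunner helper used throughout the e2e tests.
from tests.e2e.conftest import VllmRunner


@pytest.mark.parametrize("enforce_eager", [True, False])
def test_generation_with_aclgraph(enforce_eager):
    # enforce_eager=False lets vllm-ascend capture ACL graphs; True keeps eager mode.
    example_prompts = ["Hello, my name is"]
    with VllmRunner("Qwen/Qwen3-0.6B",
                    max_model_len=2048,
                    enforce_eager=enforce_eager,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_model.generate_greedy(example_prompts, 32)
```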
@@ -14,4 +14,4 @@ gpu_memory_utilization: 0.7
 apply_chat_template: False
 fewshot_as_multiturn: False
 trust_remote_code: True
-enforce_eager: True
+enforce_eager: False

@@ -52,8 +52,8 @@ def test_data_parallel_inference(model, max_tokens):
 "--node-rank",
 "0",
 "--trust-remote-code",
-"--enforce-eager",
 ]

 if model == "Qwen/Qwen3-30B-A3B":
 cmd.append("--enable-expert-parallel")

@@ -21,7 +21,7 @@ def test_e2e_ep_correctness(model_name):
 additional_config={"ascend_scheduler_config": {
 "enabled": True
 }},
-enforce_eager=True) as vllm_model:
+enforce_eager=False) as vllm_model:
 tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

 with VllmRunner(
@@ -31,7 +31,7 @@ def test_e2e_ep_correctness(model_name):
 additional_config={"ascend_scheduler_config": {
 "enabled": True
 }},
-enforce_eager=True) as vllm_model:
+enforce_eager=False) as vllm_model:
 ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

 check_outputs_equal(

@@ -16,7 +16,7 @@ def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
 max_num_seqs=16,
 tensor_parallel_size=2,
 distributed_executor_backend=distributed_executor_backend,
-enforce_eager=True) as vllm_model:
+enforce_eager=False) as vllm_model:
 output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)

 for i in range(len(EXPECTED_LORA_OUTPUT)):

@@ -52,7 +52,7 @@ def test_models_distributed_QwQ():
 dtype=dtype,
 tensor_parallel_size=2,
 distributed_executor_backend="mp",
-enforce_eager=True,
+enforce_eager=False,
 ) as vllm_model:
 vllm_model.generate_greedy(example_prompts, max_tokens)

@@ -163,11 +163,10 @@ def test_sp_for_qwen3_moe() -> None:
 vllm_model.generate(example_prompts, sampling_params)


-@pytest.mark.parametrize("enforce_eager", [True, False])
 @pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
-def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
+def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model):
 example_prompts = [
 "Hello, my name is",
 ]
@@ -176,7 +175,7 @@ def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
 with VllmRunner(
 snapshot_download(model),
 max_model_len=8192,
-enforce_eager=enforce_eager,
+enforce_eager=False,
 dtype="auto",
 tensor_parallel_size=2,
 quantization="ascend",
@@ -184,12 +183,10 @@ def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
 vllm_model.generate_greedy(example_prompts, max_tokens)


-@pytest.mark.parametrize("enforce_eager", [True, False])
 @pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"})
-def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(
-model, enforce_eager):
+def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(model):
 example_prompts = [
 "Hello, my name is",
 ]
@@ -198,7 +195,7 @@ def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(
 with VllmRunner(
 snapshot_download(model),
 max_model_len=8192,
-enforce_eager=enforce_eager,
+enforce_eager=False,
 dtype="auto",
 tensor_parallel_size=2,
 quantization="ascend",

@@ -62,7 +62,7 @@ INPUT_PROMPTS = [
 @pytest.mark.parametrize("max_tokens", [50])
 def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
 with VllmRunner(model,
-enforce_eager=True,
+enforce_eager=False,
 max_model_len=2048,
 tensor_parallel_size=2,
 gpu_memory_utilization=0.7) as vllm_model:
@@ -71,7 +71,7 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:

 with VllmRunner(model,
 enable_prefix_caching=False,
-enforce_eager=True,
+enforce_eager=False,
 max_model_len=2048,
 tensor_parallel_size=2,
 gpu_memory_utilization=0.7) as vllm_model:
@@ -96,7 +96,7 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
 'enabled': True,
 },
 },
-enforce_eager=True,
+enforce_eager=False,
 max_model_len=2048,
 tensor_parallel_size=2,
 gpu_memory_utilization=0.7) as vllm_model:
@@ -109,7 +109,7 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
 'enable_prefix_caching': True,
 },
 },
-enforce_eager=True,
+enforce_eager=False,
 max_model_len=2048,
 tensor_parallel_size=2,
 gpu_memory_utilization=0.7) as vllm_model:

@@ -33,47 +33,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]


 @pytest.mark.parametrize("model", MOE_MODELS)
-def test_external_launcher_eager(model):
-script = script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
-env = os.environ.copy()
-# TODO: Change to 2 when ci machine has 4 cards
-cmd = [
-sys.executable,
-str(script),
-"--model",
-model,
-"--tp-size",
-"2",
-"--proc-per-node",
-"2",
-"--trust-remote-code",
-"--enforce-eager",
-"--enable-expert-parallel",
-"--enable-sleep-mode",
-"--model-weight-gib",
-"20",
-]
-
-print(f"Running subprocess: {' '.join(cmd)}")
-proc = subprocess.run(
-cmd,
-env=env,
-stdout=subprocess.PIPE,
-stderr=subprocess.STDOUT,
-timeout=600,
-)
-output = proc.stdout.decode()
-
-print(output)
-
-assert "TP RANKS: [0]" in output
-assert "TP RANKS: [1]" in output
-assert "Generated text:" in output
-assert proc.returncode == 0
-
-
-@pytest.mark.parametrize("model", MOE_MODELS)
-def test_external_launcher_aclgraph(model):
+def test_external_launcher(model):
 script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
 env = os.environ.copy()
 # TODO: Change to 2 when ci machine has 4 cards
@@ -147,42 +107,3 @@ def test_external_launcher_dense(model):
 assert "TP RANKS: [1]" in output
 assert "Generated text:" in output
 assert proc.returncode == 0
-
-
-@pytest.mark.parametrize("model", MODELS)
-def test_external_launcher_dense_eager(model):
-script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
-env = os.environ.copy()
-# TODO: Change to 2 when ci machine has 4 cards
-cmd = [
-sys.executable,
-str(script),
-"--model",
-model,
-"--tp-size",
-"2",
-"--proc-per-node",
-"2",
-"--trust-remote-code",
-"--enforce-eager",
-"--enable-sleep-mode",
-"--model-weight-gib",
-"20",
-]
-
-print(f"Running subprocess: {' '.join(cmd)}")
-proc = subprocess.run(
-cmd,
-env=env,
-stdout=subprocess.PIPE,
-stderr=subprocess.STDOUT,
-timeout=600,
-)
-output = proc.stdout.decode()
-
-print(output)
-
-assert "TP RANKS: [0]" in output
-assert "TP RANKS: [1]" in output
-assert "Generated text:" in output
-assert proc.returncode == 0

@@ -73,7 +73,6 @@ run_tests_for_model() {
 BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 VLLM_ASCEND_LLMDD_RPC_PORT=5559 vllm serve $model_name \
 --port $PREFILL_PORT \
 --seed 1024 \
---enforce-eager \
 --disable-log-requests \
 --gpu-memory-utilization 0.8 \
 --kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'"
@@ -93,7 +92,6 @@ run_tests_for_model() {
 BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 VLLM_ASCEND_LLMDD_RPC_PORT=6000 vllm serve $model_name \
 --port $DECODE_PORT \
 --seed 1024 \
---enforce-eager \
 --disable-log-requests \
 --gpu-memory-utilization 0.8 \
 --kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'"

@@ -66,7 +66,6 @@ function run_prefill_instance() {
 --served-model-name Deepseek \
 --max-model-len 2000 \
 --trust-remote-code \
---enforce-eager \
 --kv-transfer-config "$KV_CONFIG"
 }

@@ -120,7 +119,6 @@ function run_decode_instance() {
 --max-num-batched-tokens 2000 \
 --trust-remote-code \
 --gpu-memory-utilization 0.9 \
---enforce-eager \
 --kv-transfer-config "$KV_CONFIG"
 }

@@ -71,7 +71,7 @@ def test_ngram_correctness(
 should be the same when using ngram speculative decoding.
 '''
 pytest.skip("Not current support for the test.")
-ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True)
+ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=False)
 ref_outputs = ref_llm.chat(test_prompts, sampling_config)
 del ref_llm
 with VllmRunner(model_name,
@@ -82,7 +82,7 @@ def test_ngram_correctness(
 "num_speculative_tokens": 3,
 },
 max_model_len=1024,
-enforce_eager=True) as runner:
+enforce_eager=False) as runner:
 spec_outputs = runner.model.chat(test_prompts, sampling_config)
 matches = 0
 misses = 0
@@ -111,7 +111,7 @@ def test_eagle_correctness(
 should be the same when using eagle speculative decoding.
 '''

-ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
+ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
 ref_outputs = ref_llm.chat(test_prompts, sampling_config)
 del ref_llm

@@ -129,7 +129,7 @@ def test_eagle_correctness(
 "max_model_len": 128,
 },
 max_model_len=128,
-enforce_eager=True,
+enforce_eager=False,
 ) as runner:
 spec_outputs = runner.model.chat(test_prompts, sampling_config)

@@ -9,7 +9,8 @@ from tests.e2e.model_utils import check_outputs_equal
 MODEL = "Qwen/Qwen3-0.6B"


-def test_concurrent_partial_prefill():
+@pytest.mark.parametrize("enforce_eager", [True, False])
+def test_concurrent_partial_prefill(enforce_eager):
 with VllmRunner(MODEL,
 additional_config={
 'ascend_scheduler_config': {
@@ -18,7 +19,7 @@ def test_concurrent_partial_prefill():
 },
 max_num_seqs=3,
 max_num_batched_tokens=2048,
-enforce_eager=True,
+enforce_eager=enforce_eager,
 max_model_len=2048,
 gpu_memory_utilization=0.7) as vllm_model:
 outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
|
|||||||
assert len(output.outputs) == 1
|
assert len(output.outputs) == 1
|
||||||
|
|
||||||
|
|
||||||
def test_prefix_cache_stats_is_recorded():
|
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||||
|
def test_prefix_cache_stats_is_recorded(enforce_eager):
|
||||||
with VllmRunner(MODEL,
|
with VllmRunner(MODEL,
|
||||||
additional_config={
|
additional_config={
|
||||||
'ascend_scheduler_config': {
|
'ascend_scheduler_config': {
|
||||||
@@ -37,7 +39,7 @@ def test_prefix_cache_stats_is_recorded():
 },
 max_num_seqs=3,
 max_num_batched_tokens=2048,
-enforce_eager=True,
+enforce_eager=enforce_eager,
 max_model_len=2048,
 gpu_memory_utilization=0.7) as vllm_model:
 # 17 tokens will make sure first 16 tokens are cached in a block

@@ -74,7 +74,7 @@ def test_end_to_end():
 sampling_params = SamplingParams(temperature=0, max_tokens=10)

 with VllmRunner("Qwen/Qwen3-0.6B",
-enforce_eager=True,
+enforce_eager=False,
 enable_sleep_mode=True) as runner:

 output = runner.model.generate(prompt, sampling_params)

@@ -43,12 +43,13 @@ def test_models(
 temperature=0.0,
 )

-with VllmRunner(model, long_prefill_token_threshold=20,
-enforce_eager=True) as vllm_model:
+with VllmRunner(model,
+long_prefill_token_threshold=20,
+enforce_eager=False) as vllm_model:
 output1 = vllm_model.generate(prompts, sampling_params)

 with VllmRunner(model,
-enforce_eager=True,
+enforce_eager=False,
 additional_config={
 'ascend_scheduler_config': {
 'enabled': True

@@ -29,7 +29,7 @@ def test_embed_models_correctness():
 with VllmRunner(
 model_name,
 task="embed",
-enforce_eager=True,
+enforce_eager=False,
 ) as vllm_runner:
 vllm_outputs = vllm_runner.encode(queries)

@@ -51,7 +51,7 @@ def test_ilama_lora(ilama_lora_files):
 max_loras=4,
 max_model_len=1024,
 max_num_seqs=16,
-enforce_eager=True) as vllm_model:
+enforce_eager=False) as vllm_model:

 output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1)
 for i in range(len(EXPECTED_LORA_OUTPUT)):

@@ -28,7 +28,7 @@ def test_quant_W8A8():
 with VllmRunner(
 snapshot_download("vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"),
 max_model_len=8192,
-enforce_eager=True,
+enforce_eager=False,
 gpu_memory_utilization=0.7,
 quantization="ascend",
 ) as vllm_model:

@@ -46,7 +46,7 @@ def test_multimodal_vl(prompt_template):
 "max_pixels": 1280 * 28 * 28,
 "fps": 1,
 },
-enforce_eager=True) as vllm_model:
+enforce_eager=False) as vllm_model:
 outputs = vllm_model.generate_greedy(prompts=prompts,
 images=images,
 max_tokens=64)