ACLgraph enable: Test cases revisions for all features (#3388)

### What this PR does / why we need it?
This PR revises the test cases for various features in this repository, adding ACL graph enablement to those tests.
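
Concretely, "enabling ACL graph" in these tests means dropping `--enforce-eager` / setting `enforce_eager=False`, so the model runs with graph capture instead of op-by-op eager execution. A minimal sketch of the pattern (the `VllmRunner` import path below is an assumption; the helper lives in the e2e test suite):

```python
# Sketch only: illustrates the enforce_eager switch these tests now use.
from tests.e2e.conftest import VllmRunner  # assumed import path for the e2e helper


def run_greedy_with_aclgraph(prompts, max_tokens=32):
    # enforce_eager=False lets vllm-ascend capture ACL graphs;
    # the previous enforce_eager=True forced eager execution.
    with VllmRunner("Qwen/Qwen3-0.6B",
                    enforce_eager=False,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        return vllm_model.generate_greedy(prompts, max_tokens)
```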

### Does this PR introduce _any_ user-facing change?
no

### How was this patch tested?
ut

- vLLM version: v0.11.0rc3
- vLLM main: https://github.com/vllm-project/vllm/commit/v0.11.0

Signed-off-by: lilinsiman <lilinsiman@gmail.com>
Commit 1b424fb7f1 (parent bf87606932), authored by lilinsiman on 2025-10-17 17:15:19 +08:00, committed by GitHub.
17 changed files with 34 additions and 117 deletions.

View File

@@ -14,4 +14,4 @@ gpu_memory_utilization: 0.7
 apply_chat_template: False
 fewshot_as_multiturn: False
 trust_remote_code: True
-enforce_eager: True
+enforce_eager: False

View File

@@ -52,8 +52,8 @@ def test_data_parallel_inference(model, max_tokens):
         "--node-rank",
         "0",
         "--trust-remote-code",
-        "--enforce-eager",
     ]

     if model == "Qwen/Qwen3-30B-A3B":
         cmd.append("--enable-expert-parallel")

View File

@@ -21,7 +21,7 @@ def test_e2e_ep_correctness(model_name):
                     additional_config={"ascend_scheduler_config": {
                         "enabled": True
                     }},
-                    enforce_eager=True) as vllm_model:
+                    enforce_eager=False) as vllm_model:
         tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     with VllmRunner(
@@ -31,7 +31,7 @@ def test_e2e_ep_correctness(model_name):
                     additional_config={"ascend_scheduler_config": {
                         "enabled": True
                     }},
-                    enforce_eager=True) as vllm_model:
+                    enforce_eager=False) as vllm_model:
         ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)

     check_outputs_equal(

View File

@@ -16,7 +16,7 @@ def test_ilama_lora_tp2(distributed_executor_backend, ilama_lora_files):
                     max_num_seqs=16,
                     tensor_parallel_size=2,
                     distributed_executor_backend=distributed_executor_backend,
-                    enforce_eager=True) as vllm_model:
+                    enforce_eager=False) as vllm_model:
         output = do_sample(vllm_model.model, ilama_lora_files, lora_id=2)

     for i in range(len(EXPECTED_LORA_OUTPUT)):

View File

@@ -52,7 +52,7 @@ def test_models_distributed_QwQ():
             dtype=dtype,
             tensor_parallel_size=2,
             distributed_executor_backend="mp",
-            enforce_eager=True,
+            enforce_eager=False,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
@@ -163,11 +163,10 @@ def test_sp_for_qwen3_moe() -> None:
         vllm_model.generate(example_prompts, sampling_params)


-@pytest.mark.parametrize("enforce_eager", [True, False])
 @pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM1": "1"})
-def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
+def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model):
     example_prompts = [
         "Hello, my name is",
     ]
@@ -176,7 +175,7 @@ def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
     with VllmRunner(
             snapshot_download(model),
             max_model_len=8192,
-            enforce_eager=enforce_eager,
+            enforce_eager=False,
             dtype="auto",
             tensor_parallel_size=2,
             quantization="ascend",
@@ -184,12 +183,10 @@ def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
         vllm_model.generate_greedy(example_prompts, max_tokens)


-@pytest.mark.parametrize("enforce_eager", [True, False])
 @pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"})
-def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(
-        model, enforce_eager):
+def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(model):
     example_prompts = [
         "Hello, my name is",
     ]
@@ -198,7 +195,7 @@ def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(
     with VllmRunner(
             snapshot_download(model),
             max_model_len=8192,
-            enforce_eager=enforce_eager,
+            enforce_eager=False,
             dtype="auto",
             tensor_parallel_size=2,
             quantization="ascend",

View File

@@ -62,7 +62,7 @@ INPUT_PROMPTS = [
 @pytest.mark.parametrize("max_tokens", [50])
 def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
     with VllmRunner(model,
-                    enforce_eager=True,
+                    enforce_eager=False,
                     max_model_len=2048,
                     tensor_parallel_size=2,
                     gpu_memory_utilization=0.7) as vllm_model:
@@ -71,7 +71,7 @@ def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
     with VllmRunner(model,
                     enable_prefix_caching=False,
-                    enforce_eager=True,
+                    enforce_eager=False,
                     max_model_len=2048,
                     tensor_parallel_size=2,
                     gpu_memory_utilization=0.7) as vllm_model:
@@ -96,7 +96,7 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
                         'enabled': True,
                     },
                 },
-                enforce_eager=True,
+                enforce_eager=False,
                 max_model_len=2048,
                 tensor_parallel_size=2,
                 gpu_memory_utilization=0.7) as vllm_model:
@@ -109,7 +109,7 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
                         'enable_prefix_caching': True,
                     },
                 },
-                enforce_eager=True,
+                enforce_eager=False,
                 max_model_len=2048,
                 tensor_parallel_size=2,
                 gpu_memory_utilization=0.7) as vllm_model:

View File

@@ -33,47 +33,7 @@ DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
 @pytest.mark.parametrize("model", MOE_MODELS)
-def test_external_launcher_eager(model):
-    script =
-    env = os.environ.copy()
-    # TODO: Change to 2 when ci machine has 4 cards
-    cmd = [
-        sys.executable,
-        str(script),
-        "--model",
-        model,
-        "--tp-size",
-        "2",
-        "--proc-per-node",
-        "2",
-        "--trust-remote-code",
-        "--enforce-eager",
-        "--enable-expert-parallel",
-        "--enable-sleep-mode",
-        "--model-weight-gib",
-        "20",
-    ]
-    print(f"Running subprocess: {' '.join(cmd)}")
-    proc = subprocess.run(
-        cmd,
-        env=env,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        timeout=600,
-    )
-    output = proc.stdout.decode()
-    print(output)
-    assert "TP RANKS: [0]" in output
-    assert "TP RANKS: [1]" in output
-    assert "Generated text:" in output
-    assert proc.returncode == 0
-
-
-@pytest.mark.parametrize("model", MOE_MODELS)
-def test_external_launcher_aclgraph(model):
+def test_external_launcher(model):
     script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
     env = os.environ.copy()
     # TODO: Change to 2 when ci machine has 4 cards
@@ -147,42 +107,3 @@ def test_external_launcher_dense(model):
     assert "TP RANKS: [1]" in output
     assert "Generated text:" in output
     assert proc.returncode == 0
-
-
-@pytest.mark.parametrize("model", MODELS)
-def test_external_launcher_dense_eager(model):
-    script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
-    env = os.environ.copy()
-    # TODO: Change to 2 when ci machine has 4 cards
-    cmd = [
-        sys.executable,
-        str(script),
-        "--model",
-        model,
-        "--tp-size",
-        "2",
-        "--proc-per-node",
-        "2",
-        "--trust-remote-code",
-        "--enforce-eager",
-        "--enable-sleep-mode",
-        "--model-weight-gib",
-        "20",
-    ]
-    print(f"Running subprocess: {' '.join(cmd)}")
-    proc = subprocess.run(
-        cmd,
-        env=env,
-        stdout=subprocess.PIPE,
-        stderr=subprocess.STDOUT,
-        timeout=600,
-    )
-    output = proc.stdout.decode()
-    print(output)
-    assert "TP RANKS: [0]" in output
-    assert "TP RANKS: [1]" in output
-    assert "Generated text:" in output
-    assert proc.returncode == 0

View File

@@ -73,7 +73,6 @@ run_tests_for_model() {
     BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 VLLM_ASCEND_LLMDD_RPC_PORT=5559 vllm serve $model_name \
         --port $PREFILL_PORT \
         --seed 1024 \
-        --enforce-eager \
         --disable-log-requests \
         --gpu-memory-utilization 0.8 \
         --kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_producer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'"
@@ -93,7 +92,6 @@ run_tests_for_model() {
     BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 VLLM_ASCEND_LLMDD_RPC_PORT=6000 vllm serve $model_name \
         --port $DECODE_PORT \
         --seed 1024 \
-        --enforce-eager \
         --disable-log-requests \
         --gpu-memory-utilization 0.8 \
         --kv-transfer-config '{\"kv_connector\":\"LLMDataDistCMgrConnector\",\"kv_role\":\"kv_consumer\",\"kv_buffer_device\":\"npu\",\"kv_parallel_size\":\"1\",\"kv_port\":\"20001\",\"engine_id\":\"0\",\"kv_connector_module_path\":\"vllm_ascend.distributed.llmdatadist_c_mgr_connector\"}'"

View File

@@ -66,7 +66,6 @@ function run_prefill_instance() {
         --served-model-name Deepseek \
         --max-model-len 2000 \
         --trust-remote-code \
-        --enforce-eager \
         --kv-transfer-config "$KV_CONFIG"
 }
@@ -120,7 +119,6 @@ function run_decode_instance() {
         --max-num-batched-tokens 2000 \
         --trust-remote-code \
         --gpu-memory-utilization 0.9 \
-        --enforce-eager \
         --kv-transfer-config "$KV_CONFIG"
 }

View File

@@ -71,7 +71,7 @@ def test_ngram_correctness(
     should be the same when using ngram speculative decoding.
     '''
     pytest.skip("Not current support for the test.")
-    ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True)
+    ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=False)
     ref_outputs = ref_llm.chat(test_prompts, sampling_config)
     del ref_llm

     with VllmRunner(model_name,
@@ -82,7 +82,7 @@ def test_ngram_correctness(
                         "num_speculative_tokens": 3,
                     },
                     max_model_len=1024,
-                    enforce_eager=True) as runner:
+                    enforce_eager=False) as runner:
         spec_outputs = runner.model.chat(test_prompts, sampling_config)
     matches = 0
     misses = 0
@@ -111,7 +111,7 @@ def test_eagle_correctness(
     should be the same when using eagle speculative decoding.
     '''
-    ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
+    ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=False)
     ref_outputs = ref_llm.chat(test_prompts, sampling_config)
     del ref_llm
@@ -129,7 +129,7 @@ def test_eagle_correctness(
                 "max_model_len": 128,
             },
             max_model_len=128,
-            enforce_eager=True,
+            enforce_eager=False,
     ) as runner:
         spec_outputs = runner.model.chat(test_prompts, sampling_config)

View File

@@ -9,7 +9,8 @@ from tests.e2e.model_utils import check_outputs_equal
 MODEL = "Qwen/Qwen3-0.6B"


-def test_concurrent_partial_prefill():
+@pytest.mark.parametrize("enforce_eager", [True, False])
+def test_concurrent_partial_prefill(enforce_eager):
     with VllmRunner(MODEL,
                     additional_config={
                         'ascend_scheduler_config': {
@@ -18,7 +19,7 @@ def test_concurrent_partial_prefill():
                     },
                     max_num_seqs=3,
                     max_num_batched_tokens=2048,
-                    enforce_eager=True,
+                    enforce_eager=enforce_eager,
                     max_model_len=2048,
                     gpu_memory_utilization=0.7) as vllm_model:
         outputs = vllm_model.model.generate(["Hello my name is Robert and I"] *
@@ -28,7 +29,8 @@ def test_concurrent_partial_prefill():
         assert len(output.outputs) == 1


-def test_prefix_cache_stats_is_recorded():
+@pytest.mark.parametrize("enforce_eager", [True, False])
+def test_prefix_cache_stats_is_recorded(enforce_eager):
     with VllmRunner(MODEL,
                     additional_config={
                         'ascend_scheduler_config': {
@@ -37,7 +39,7 @@ def test_prefix_cache_stats_is_recorded():
                     },
                     max_num_seqs=3,
                     max_num_batched_tokens=2048,
-                    enforce_eager=True,
+                    enforce_eager=enforce_eager,
                     max_model_len=2048,
                     gpu_memory_utilization=0.7) as vllm_model:
         # 17 tokens will make sure first 16 tokens are cached in a block
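
For the scheduler e2e tests above, the eager/graph choice is now a pytest parameter rather than a hard-coded `enforce_eager=True`, so both modes stay covered. A rough sketch of the resulting test, reconstructed from the hunks above (the import path, the inner `ascend_scheduler_config` contents, the prompt repeat count, and the result loop are assumptions filled in for illustration):

```python
import pytest

# Assumed import path for the shared e2e helper; not shown in this diff.
from tests.e2e.conftest import VllmRunner

MODEL = "Qwen/Qwen3-0.6B"


@pytest.mark.parametrize("enforce_eager", [True, False])
def test_concurrent_partial_prefill(enforce_eager):
    # Runs once in eager mode and once with ACL graph capture enabled.
    with VllmRunner(MODEL,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,  # follows the pattern used elsewhere in this PR
                        },
                    },
                    max_num_seqs=3,
                    max_num_batched_tokens=2048,
                    enforce_eager=enforce_eager,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        # The repeat count is illustrative; the diff elides the exact value.
        outputs = vllm_model.model.generate(["Hello my name is Robert and I"] * 3)
        for output in outputs:
            assert len(output.outputs) == 1
```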

View File

@@ -74,7 +74,7 @@ def test_end_to_end():
     sampling_params = SamplingParams(temperature=0, max_tokens=10)

     with VllmRunner("Qwen/Qwen3-0.6B",
-                    enforce_eager=True,
+                    enforce_eager=False,
                     enable_sleep_mode=True) as runner:
         output = runner.model.generate(prompt, sampling_params)

View File

@@ -43,12 +43,13 @@ def test_models(
         temperature=0.0,
     )

-    with VllmRunner(model, long_prefill_token_threshold=20,
-                    enforce_eager=True) as vllm_model:
+    with VllmRunner(model,
+                    long_prefill_token_threshold=20,
+                    enforce_eager=False) as vllm_model:
         output1 = vllm_model.generate(prompts, sampling_params)

     with VllmRunner(model,
-                    enforce_eager=True,
+                    enforce_eager=False,
                     additional_config={
                         'ascend_scheduler_config': {
                             'enabled': True

View File

@@ -29,7 +29,7 @@ def test_embed_models_correctness():
     with VllmRunner(
             model_name,
             task="embed",
-            enforce_eager=True,
+            enforce_eager=False,
     ) as vllm_runner:
         vllm_outputs = vllm_runner.encode(queries)

View File

@@ -51,7 +51,7 @@ def test_ilama_lora(ilama_lora_files):
                     max_loras=4,
                     max_model_len=1024,
                     max_num_seqs=16,
-                    enforce_eager=True) as vllm_model:
+                    enforce_eager=False) as vllm_model:
         output1 = do_sample(vllm_model.model, ilama_lora_files, lora_id=1)

     for i in range(len(EXPECTED_LORA_OUTPUT)):

View File

@@ -28,7 +28,7 @@ def test_quant_W8A8():
     with VllmRunner(
             snapshot_download("vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"),
             max_model_len=8192,
-            enforce_eager=True,
+            enforce_eager=False,
             gpu_memory_utilization=0.7,
             quantization="ascend",
     ) as vllm_model:

View File

@@ -46,7 +46,7 @@ def test_multimodal_vl(prompt_template):
                         "max_pixels": 1280 * 28 * 28,
                         "fps": 1,
                     },
-                    enforce_eager=True) as vllm_model:
+                    enforce_eager=False) as vllm_model:
         outputs = vllm_model.generate_greedy(prompts=prompts,
                                              images=images,
                                              max_tokens=64)