mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 14:53:52 +08:00
1268 lines
45 KiB
YAML
1268 lines
45 KiB
YAML
# In this file, you can add more tests to run either by adding a new step or
|
|
# adding a new command to an existing step. See different options here for examples.
|
|
|
|
# This script will be fed into the Jinja template in `test-template-aws.j2` at
|
|
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
|
|
# to generate the final pipeline yaml file.
|
|
|
|
# Documentation
|
|
# label(str): the name of the test. emojis allowed.
|
|
# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
|
|
# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
|
|
# fast_check_only(bool): run this test on the fastcheck pipeline only
|
|
# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
|
|
# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
|
|
# command(str): the single command to run for tests. incompatible with commands.
|
|
# commands(list): the list of commands to run for the test. incompatible with command.
|
|
# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
|
|
# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
|
|
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
|
|
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
|
|
# in this case, commands must be specified. the first command runs on the first host, the second
|
|
# command runs on the second host.
|
|
# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
|
|
# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
|
|
# and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
|
|
# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
|
|
# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
|
|
|
|
# When adding a test
|
|
# - If the test belongs to an existing group, add it there
|
|
# - If the test is short, add to any existing step
|
|
# - If the test takes more than 10min, then it is okay to create a new step.
|
|
# Note that all steps execute in parallel.
|
|
|
|
steps:
|
|
##### fast check tests #####
|
|
|
|
- label: Pytorch Nightly Dependency Override Check  # 2min
  # If this test fails, the nightly torch version is not compatible with some
  # of the dependencies. Check the error message and add the package to the
  # whitelist in /vllm/tools/generate_nightly_torch_test.py
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  soft_fail: true
  source_file_dependencies:
  - requirements/nightly_torch_test.txt
  commands:
  - bash standalone_tests/pytorch_nightly_dependency.sh
|
|
|
|
- label: Async Engine, Inputs, Utils, Worker Test  # 36min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/multimodal
  - tests/utils_
  commands:
  # tests marked cpu_test are excluded from this step's multimodal run
  - pytest -v -s -m 'not cpu_test' multimodal
  - pytest -v -s utils_
|
|
|
|
- label: Async Engine, Inputs, Utils, Worker Test (CPU)  # 4 mins
  timeout_in_minutes: 10
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/test_inputs.py
  - tests/test_outputs.py
  - tests/multimodal
  - tests/standalone_tests/lazy_imports.py
  - tests/transformers_utils
  no_gpu: true
  commands:
  - python3 standalone_tests/lazy_imports.py
  - pytest -v -s test_inputs.py
  - pytest -v -s test_outputs.py
  # only the tests marked cpu_test are selected from multimodal
  - pytest -v -s -m 'cpu_test' multimodal
  - pytest -v -s transformers_utils
|
|
|
|
- label: Python-only Installation Test  # 10min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - tests/standalone_tests/python_only_compile.sh
  - setup.py
  commands:
  - bash standalone_tests/python_only_compile.sh
|
|
|
|
- label: Basic Correctness Test  # 20min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/basic_correctness/test_basic_correctness
  - tests/basic_correctness/test_cpu_offload
  - tests/basic_correctness/test_cumem.py
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s basic_correctness/test_cumem.py
  - pytest -v -s basic_correctness/test_basic_correctness.py
  - pytest -v -s basic_correctness/test_cpu_offload.py
|
|
|
|
- label: Entrypoints Unit Tests  # 5min
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  timeout_in_minutes: 10
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  source_file_dependencies:
  - vllm/entrypoints
  - tests/entrypoints/
  commands:
  - pytest -v -s entrypoints/openai/tool_parsers
  # the second run ignores the llm, openai, offline_mode and pooling
  # directories and test_chat_utils.py
  - pytest -v -s entrypoints/ --ignore=entrypoints/llm --ignore=entrypoints/openai --ignore=entrypoints/offline_mode --ignore=entrypoints/test_chat_utils.py --ignore=entrypoints/pooling
|
|
|
|
- label: Entrypoints Integration Test (LLM)  # 30min
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/llm
  - tests/entrypoints/offline_mode
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_generate.py  # it needs a clean process
  - pytest -v -s entrypoints/offline_mode  # Needs to avoid interference with other tests
|
|
|
|
- label: Entrypoints Integration Test (API Server)  # 100min
  timeout_in_minutes: 130
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/openai
  - tests/entrypoints/test_chat_utils
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - PYTHONPATH=/vllm-workspace pytest -v -s entrypoints/openai/test_collective_rpc.py  # PYTHONPATH is needed to import custom Worker extension
  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_collective_rpc.py --ignore=entrypoints/openai/tool_parsers/
  - pytest -v -s entrypoints/test_chat_utils.py
|
|
|
|
- label: Entrypoints Integration Test (Pooling)
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  fast_check: true
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/entrypoints/pooling
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s entrypoints/pooling
|
|
|
|
- label: Distributed Tests (4 GPUs)  # 35min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - tests/distributed/test_utils
  - tests/distributed/test_pynccl
  - tests/distributed/test_events
  - tests/compile/test_basic_correctness
  - examples/offline_inference/rlhf.py
  - examples/offline_inference/rlhf_colocate.py
  # The script is run below as ../examples/offline_inference/data_parallel.py
  # (relative to working_dir), so the prefix is examples/, not tests/examples/.
  - examples/offline_inference/data_parallel.py
  - tests/v1/distributed
  - tests/v1/engine/test_engine_core_client.py
  - tests/distributed/test_symm_mem_allreduce.py
  commands:
  # test with torchrun tp=2 and external_dp=2
  - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=2 and pp=2
  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
  # test with torchrun tp=4 and dp=1
  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=2, pp=2 and dp=1
  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=1 and dp=4 with ep
  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with torchrun tp=2 and dp=2 with ep
  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
  # test with internal dp
  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s distributed/test_events.py
  - pytest -v -s distributed/test_symm_mem_allreduce.py
  # TODO: create a dedicated test section for multi-GPU example tests
  # when we have multiple distributed example tests
  - pushd ../examples/offline_inference
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
  - popd
|
|
|
|
- label: EPLB Algorithm Test  # 5min
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_algo.py
  commands:
  - pytest -v -s distributed/test_eplb_algo.py
|
|
|
|
- label: EPLB Execution Test  # 5min
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  timeout_in_minutes: 15
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/eplb
  - tests/distributed/test_eplb_execute.py
  commands:
  - pytest -v -s distributed/test_eplb_execute.py
|
|
|
|
- label: Metrics, Tracing Test  # 12min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_2
  # grade: Blocking
  num_gpus: 2
  source_file_dependencies:
  - vllm/
  - tests/v1/tracing
  commands:
  # Double-quoted scalar: a trailing backslash escapes the line break, so the
  # whole pip invocation is a single command string.
  - "pip install \
    'opentelemetry-sdk>=1.26.0' \
    'opentelemetry-api>=1.26.0' \
    'opentelemetry-exporter-otlp>=1.26.0' \
    'opentelemetry-semantic-conventions-ai>=0.4.1'"
  - pytest -v -s v1/tracing
|
|
|
|
##### fast check tests #####
|
|
##### 1 GPU test #####
|
|
|
|
- label: Regression Test  # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # NOTE(review): `grade` is an active key here, while the other steps in this
  # file keep it commented out -- confirm this is intended.
  grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/test_regression
  commands:
  - pip install modelscope
  - pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests"  # optional
|
|
|
|
- label: Engine Test  # 25min
  timeout_in_minutes: 40
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/engine
  - tests/tokenization
  - tests/test_sequence
  - tests/test_config
  - tests/test_logger
  - tests/test_vllm_port
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
  # OOM in the CI unless we run this separately
  - pytest -v -s tokenization
|
|
|
|
- label: V1 Test e2e + engine  # 30min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  # TODO: accuracy does not match, whether setting
  # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
  - pytest -v -s v1/e2e
  - pytest -v -s v1/engine
|
|
|
|
- label: V1 Test entrypoints  # 35min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  - pytest -v -s v1/entrypoints
|
|
|
|
- label: V1 Test others  # 42min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/v1
  commands:
  # split the test to avoid interference
  - pytest -v -s -m 'not cpu_test' v1/core
  - pytest -v -s v1/executor
  - pytest -v -s v1/kv_offload
  - pytest -v -s v1/sample
  - pytest -v -s v1/logits_processors
  - pytest -v -s v1/worker
  - pytest -v -s v1/spec_decode
  - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'not cpu_test' v1/metrics
  - pytest -v -s v1/test_oracle.py
  - pytest -v -s v1/test_request.py
  # Integration test for streaming correctness (requires special branch).
  - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
  - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
|
|
|
- label: V1 Test others (CPU)  # 5 mins
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/v1
  no_gpu: true
  commands:
  # split the test to avoid interference
  - pytest -v -s -m 'cpu_test' v1/core
  - pytest -v -s v1/structured_output
  - pytest -v -s v1/test_serial_utils.py
  - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
  - pytest -v -s -m 'cpu_test' v1/metrics
|
|
|
|
|
|
- label: Examples Test  # 30min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  working_dir: "/vllm-workspace/examples"
  source_file_dependencies:
  - vllm/entrypoints
  - examples/
  commands:
  - pip install tensorizer  # for tensorizer test
  - python3 offline_inference/basic/generate.py --model facebook/opt-125m
  - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
  - python3 offline_inference/basic/chat.py
  - python3 offline_inference/prefix_caching.py
  - python3 offline_inference/llm_engine_example.py
  - python3 offline_inference/audio_language.py --seed 0
  - python3 offline_inference/vision_language.py --seed 0
  - python3 offline_inference/vision_language_pooling.py --seed 0
  - python3 offline_inference/vision_language_multi_image.py --seed 0
  # serialize the model, then deserialize it from the written tensors file
  - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
  - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
  - python3 offline_inference/basic/classify.py
  - python3 offline_inference/basic/embed.py
  - python3 offline_inference/basic/score.py
  - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
|
|
|
|
- label: Platform Tests (CUDA)  # 4min
  timeout_in_minutes: 15
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/cuda
  commands:
  - pytest -v -s cuda/test_cuda_context.py
|
|
|
|
- label: Samplers Test  # 56min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - tests/samplers
  - tests/conftest.py
  commands:
  # the suite runs twice: once as-is, once with VLLM_USE_FLASHINFER_SAMPLER=1
  - pytest -v -s samplers
  - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
|
|
|
- label: LoRA Test %N  # 20min each
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
  # Folded block scalar: the lines below are joined with single spaces into one
  # command. A trailing backslash in a plain scalar is not a line continuation
  # and would stay in the command string as a literal character.
  - >-
    pytest -v -s lora
    --shard-id=$$BUILDKITE_PARALLEL_JOB
    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    --ignore=lora/test_chatglm3_tp.py
    --ignore=lora/test_llama_tp.py
    --ignore=lora/test_llm_with_multi_loras.py
  parallelism: 4
|
|
|
|
- label: PyTorch Compilation Unit Tests  # 15min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  # each test file is run in its own pytest invocation
  - pytest -v -s compile/test_pass_manager.py
  - pytest -v -s compile/test_fusion.py
  - pytest -v -s compile/test_fusion_attn.py
  - pytest -v -s compile/test_functionalization.py
  - pytest -v -s compile/test_silu_mul_quant_fusion.py
  - pytest -v -s compile/test_sequence_parallelism.py
  - pytest -v -s compile/test_async_tp.py
  - pytest -v -s compile/test_fusion_all_reduce.py
  - pytest -v -s compile/test_decorator.py
  - pytest -v -s compile/test_noop_elimination.py
  - pytest -v -s compile/test_aot_compile.py
|
|
|
|
- label: PyTorch Fullgraph Smoke Test  # 15min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s compile/piecewise/
|
|
|
|
- label: PyTorch Fullgraph Test  # 20min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/compile
  commands:
  - pytest -v -s compile/test_full_graph.py
|
|
|
|
- label: Kernels Core Operation Test  # 48min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - tests/kernels/core
  commands:
  - pytest -v -s kernels/core kernels/test_top_k_per_row.py
|
|
|
|
- label: Kernels Attention Test %N  # 23min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
  - csrc/attention/
  - vllm/attention
  - vllm/v1/attention
  - tests/kernels/attention
  commands:
  # sharded across the parallel jobs configured by `parallelism` below
  - pytest -v -s kernels/attention --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
|
|
|
|
- label: Kernels Quantization Test %N  # 64min
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
  - csrc/quantization/
  - vllm/model_executor/layers/quantization
  - tests/kernels/quantization
  commands:
  # sharded across the parallel jobs configured by `parallelism` below
  - pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
|
|
|
|
- label: Kernels MoE Test %N  # 40min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
  - csrc/quantization/cutlass_w8a8/moe/
  - csrc/moe/
  - tests/kernels/moe
  - vllm/model_executor/layers/fused_moe/
  - vllm/distributed/device_communicators/
  commands:
  # sharded across the parallel jobs configured by `parallelism` below
  - pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 2
|
|
|
|
- label: Kernels Mamba Test  # 31min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/mamba/
  - tests/kernels/mamba
  - vllm/model_executor/layers/mamba/ops
  commands:
  - pytest -v -s kernels/mamba
|
|
|
|
- label: Model Executor Test  # 23min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/model_executor
  - tests/model_executor
  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
  commands:
  # system packages are installed before the tests run
  - apt-get update && apt-get install -y curl libsodium23
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -v -s model_executor
  - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
|
|
|
- label: Benchmarks  # 11min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  # grade: Blocking
  working_dir: "/vllm-workspace/.buildkite"
  source_file_dependencies:
  - benchmarks/
  commands:
  - bash scripts/run-benchmarks.sh
|
|
|
|
- label: Benchmarks CLI Test  # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  # grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/benchmarks/
  commands:
  - pytest -v -s benchmarks/
|
|
|
|
- label: Quantization Test  # 70min
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  - tests/quantization
  commands:
  # temporary install here since we need nightly, will move to requirements/test.in
  # after torchao 0.12 release, and pin a working version of torchao nightly here

  # since torchao nightly is only compatible with torch nightly currently
  # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
  # we can only upgrade after this is resolved
  # TODO(jerryzh168): resolve the above comment
  - uv pip install --system torchao==0.13.0
  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
|
|
|
|
- label: LM Eval Small Models  # 53min
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
|
|
|
|
- label: OpenAI API correctness  # 22min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - csrc/
  - vllm/entrypoints/openai/
  - vllm/model_executor/models/whisper.py
  commands:  # LMEval+Transcription WER check
  - pytest -s entrypoints/openai/correctness/
|
|
|
|
- label: OpenAI-Compatible Tool Use  # 23 min
  timeout_in_minutes: 35
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  fast_check: false
  source_file_dependencies:
  - vllm/
  - tests/tool_use
  commands:
  - pytest -v -s -m 'not cpu_test' tool_use
|
|
|
|
- label: OpenAI-Compatible Tool Use (CPU)  # 5 mins
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  timeout_in_minutes: 10
  source_file_dependencies:
  - vllm/
  - tests/tool_use
  no_gpu: true
  commands:
  - pytest -v -s -m 'cpu_test' tool_use
|
|
|
|
##### models test #####
|
|
|
|
- label: Basic Models Tests (Initialization)
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/test_initialization.py
  commands:
  # Run a subset of model initialization tests
  - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
|
|
|
|
- label: Basic Models Tests (Extra Initialization) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_8
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - tests/models/test_initialization.py
  commands:
  # Only when vLLM model source is modified - test initialization of a large
  # subset of supported models (the complement of the small subset in the above
  # test.) Also run if model initialization test file is modified
  #
  # Folded block scalar: the lines are joined with single spaces into one
  # command; trailing backslashes in a plain scalar would stay literal.
  - >-
    pytest -v -s models/test_initialization.py
    -k 'not test_can_initialize_small_subset'
    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2
|
|
|
|
- label: Basic Models Tests (Other)
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/test_transformers.py
  - tests/models/test_registry.py
  commands:
  - pytest -v -s models/test_transformers.py models/test_registry.py
|
|
|
|
- label: Basic Models Test (Other CPU)  # 5min
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  timeout_in_minutes: 10
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/test_utils.py
  - tests/models/test_vision.py
  no_gpu: true
  commands:
  - pytest -v -s models/test_utils.py models/test_vision.py
|
|
|
|
- label: Language Models Tests (Standard)
  timeout_in_minutes: 25
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/language
  commands:
  # Test standard language models, excluding a subset of slow tests
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/language -m 'core_model and (not slow_test)'
|
|
|
|
- label: Language Models Tests (Extra Standard) %N
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/model_executor/models/
  - tests/models/language/pooling/test_embedding.py
  - tests/models/language/generation/test_common.py
  - tests/models/language/pooling/test_classification.py
  commands:
  # Shard slow subset of standard language models tests. Only run when model
  # source is modified, or when specified test files are modified
  - pip freeze | grep -E 'torch'
  # Folded block scalar: the lines are joined with single spaces into one
  # command; trailing backslashes in a plain scalar would stay literal.
  - >-
    pytest -v -s models/language -m 'core_model and slow_test'
    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2
|
|
|
|
- label: Language Models Tests (Hybrid) %N
  timeout_in_minutes: 75
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_8
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
  # Install fast path packages for testing against transformers
  # Note: also needed to run plamo2 model in vLLM
  - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
  - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
  # Shard hybrid language model tests
  #
  # Folded block scalar: the lines are joined with single spaces into one
  # command; trailing backslashes in a plain scalar would stay literal.
  - >-
    pytest -v -s models/language/generation
    -m hybrid_model
    --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
    --shard-id=$$BUILDKITE_PARALLEL_JOB
  parallelism: 2
|
|
|
|
- label: Language Models Test (Extended Generation)  # 80min
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation
  commands:
  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
  # NOTE(review): this pins v1.5.0.post8 while the Hybrid step pins v1.5.2 --
  # confirm whether the two versions should match.
  - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
  - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
|
|
|
- label: Language Models Test (PPL)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/generation_ppl_test
  commands:
  - pytest -v -s models/language/generation_ppl_test
|
|
|
|
- label: Language Models Test (Extended Pooling)  # 36min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling
  commands:
  - pytest -v -s models/language/pooling -m 'not core_model'
|
|
|
|
- label: Language Models Test (MTEB)
  timeout_in_minutes: 110
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/language/pooling_mteb_test
  commands:
  - pytest -v -s models/language/pooling_mteb_test
|
|
|
|
- label: Multi-Modal Processor Test  # 44min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/processing
|
|
|
|
- label: Multi-Modal Models Test (Standard)  # 60min
  timeout_in_minutes: 80
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  torch_nightly: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pip freeze | grep -E 'torch'
  - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
  - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
|
|
|
|
- label: Multi-Modal Models Test (Extended) 1
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal -m 'not core_model' --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing
|
|
|
|
# generation/test_common.py, split group 0, tests not marked `core_model`.
# Optional step.
- label: Multi-Modal Models Test (Extended) 2
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
|
|
|
|
# generation/test_common.py, split group 1, tests not marked `core_model`.
# Optional step.
- label: Multi-Modal Models Test (Extended) 3
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/models/multimodal
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
|
|
|
|
# Quantized-model tests; triggered by changes to the quantization layers or
# to the tests themselves.
- label: Quantized Models Test # 45 min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  # grade: Blocking
  source_file_dependencies:
  - vllm/model_executor/layers/quantization
  - tests/models/quantization
  commands:
  - pytest -v -s models/quantization
|
|
|
|
# This test is used only in PR development phase to test individual models and should never run on main
# As committed, the step only echoes a message; it is a placeholder that PR
# authors extend temporarily.
- label: Custom Models Test
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_1
  # grade: Blocking
  optional: true
  commands:
  - echo 'Testing custom models...'
  # PR authors can temporarily add commands below to test individual models
  # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
  # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
|
|
|
|
# Upgrades transformers to the current git HEAD of huggingface/transformers,
# then runs a selection of model tests and offline-inference examples against
# it. Optional step; paths are relative to the repository root (working_dir).
- label: Transformers Nightly Models Test
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_1
  working_dir: "/vllm-workspace/"
  optional: true
  commands:
  - pip install --upgrade git+https://github.com/huggingface/transformers
  - pytest -v -s tests/models/test_initialization.py
  - pytest -v -s tests/models/test_transformers.py
  - pytest -v -s tests/models/multimodal/processing/
  - pytest -v -s tests/models/multimodal/test_mapping.py
  - python3 examples/offline_inference/basic/chat.py
  - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
  # Whisper needs spawn method to avoid deadlock
  - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
|
|
|
# Kernel, quantization and fusion tests that run on a B200 GPU. Paths are
# relative to the repository root (working_dir).
- label: Blackwell Test # 38 min
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  # optional: true
  source_file_dependencies:
  - csrc/quantization/fp4/
  - csrc/attention/mla/
  - csrc/quantization/cutlass_w8a8/moe/
  - vllm/model_executor/layers/fused_moe/cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
  - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
  - vllm/v1/attention/backends/flashinfer.py
  - vllm/compilation/fusion.py
  - vllm/compilation/fusion_attn.py
  commands:
  # Log the visible GPUs, then run a basic chat example.
  - nvidia-smi
  - python3 examples/offline_inference/basic/chat.py
  # Attention
  # num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
  - pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
  - pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
  - pytest -v -s tests/kernels/attention/test_cutlass_mla_decode.py
  - pytest -v -s tests/kernels/attention/test_flashinfer_mla_decode.py
  # Quantization
  - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
  - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
  - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
  - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
  - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
  - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
  - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
  # Fusion
  - pytest -v -s tests/compile/test_fusion_all_reduce.py
  - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
  - pytest -v -s tests/kernels/moe/test_flashinfer.py
  - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
|
|
|
# GPQA correctness eval of openai/gpt-oss-20b on a B200 GPU (invoked with
# --metric 0.58). Optional step.
- label: Blackwell GPT-OSS Eval
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  optional: true # run on nightlies
  source_file_dependencies:
  - tests/evals/gpt_oss
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
  - uv pip install --system 'gpt-oss[eval]==0.0.5'
  - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
|
|
|
|
# Runs tests/quantization/test_blackwell_moe.py on a B200 GPU whenever one of
# the listed model, fused-MoE, quantization or FlashInfer-backend sources changes.
- label: Blackwell Quantized MoE Test
  timeout_in_minutes: 60
  working_dir: "/vllm-workspace/"
  gpu: b200
  source_file_dependencies:
  - tests/quantization/test_blackwell_moe.py
  - vllm/model_executor/models/deepseek_v2.py
  - vllm/model_executor/models/gpt_oss.py
  - vllm/model_executor/models/llama4.py
  - vllm/model_executor/layers/fused_moe
  - vllm/model_executor/layers/quantization/compressed_tensors
  - vllm/model_executor/layers/quantization/modelopt.py
  - vllm/model_executor/layers/quantization/mxfp4.py
  - vllm/v1/attention/backends/flashinfer.py
  commands:
  - pytest -s -v tests/quantization/test_blackwell_moe.py
|
|
|
|
# GSM8K correctness eval on a B200 GPU at tensor-parallel size 1, using the
# model list in configs/models-blackwell.txt. Optional step.
- label: Blackwell LM Eval Small Models
  timeout_in_minutes: 120
  gpu: b200
  optional: true # run on nightlies
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
|
|
|
|
##### 1 GPU test #####
|
|
##### multi gpus test #####
|
|
|
|
# Communication-op and shared-memory (shm) tests from tests/distributed,
# run with 2 GPUs.
- label: Distributed Comm Ops Test # 7min
  timeout_in_minutes: 20
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/distributed
  - tests/distributed
  commands:
  - pytest -v -s distributed/test_comm_ops.py
  - pytest -v -s distributed/test_shm_broadcast.py
  - pytest -v -s distributed/test_shm_buffer.py
  - pytest -v -s distributed/test_shm_storage.py
|
|
|
|
# Simulated two-node run (num_nodes: 2) with 2 GPUs per node. `commands`
# holds one nested list per node: the first list runs on the first node, the
# second list on the second node.
- label: 2 Node Tests (4 GPUs in total) # 16min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_4
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  num_nodes: 2
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  # The data-parallel example is run below as ../examples/... relative to the
  # tests/ working directory, so the dependency is tracked under examples/
  # (it was previously listed under tests/examples/).
  - examples/offline_inference/data_parallel.py
  commands:
  - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
    - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
  - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
    - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
    - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
    - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
|
|
|
# Two-GPU distributed tests: data-parallel, multi-API-server, collective RPC,
# compile, same-node, sequence-parallel, shutdown and worker memory-snapshot
# tests. Paths are relative to tests/ (working_dir).
- label: Distributed Tests (2 GPUs) # 68min
  timeout_in_minutes: 90
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/compilation/
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/worker/worker_base.py
  - vllm/v1/engine/
  - vllm/v1/worker/
  - tests/compile/test_basic_correctness.py
  - tests/compile/test_wrapper.py
  - tests/distributed/
  - tests/entrypoints/llm/test_collective_rpc.py
  - tests/v1/distributed
  - tests/v1/entrypoints/openai/test_multi_api_servers.py
  - tests/v1/shutdown
  - tests/v1/worker/test_worker_memory_snapshot.py
  commands:
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
  - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  # The step passes only if the expected success line appears in the output.
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - pytest -v -s distributed/test_sequence_parallel.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
|
|
|
# Two-GPU model tests: basic correctness, the sharded-state loader, and the
# model tests carrying the `distributed(num_gpus=2)` marker.
- label: Distributed Model Tests (2 GPUs) # 37min
  timeout_in_minutes: 50
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/model_executor/model_loader/sharded_state_loader.py
  - vllm/model_executor/models/
  - tests/basic_correctness/
  - tests/model_executor/model_loader/test_sharded_state_loader.py
  - tests/models/
  commands:
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
  # Avoid importing model tests that cause CUDA reinitialization error
  - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
  - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
|
|
|
|
# Plugin tests. Each test plugin is pip-installed in editable mode right
# before the tests that need it; the dummy-platform and prithvi plugins are
# uninstalled again before the following tests run.
- label: Plugin Tests (2 GPUs) # 40min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  source_file_dependencies:
  - vllm/plugins/
  - tests/plugins/
  commands:
  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
  - pip install -e ./plugins/vllm_add_dummy_platform
  - pytest -v -s plugins_tests/test_platform_plugins.py
  - pip uninstall vllm_add_dummy_platform -y
  # end platform plugin tests
  # begin io_processor plugins test, all the code in between uses the prithvi_io_processor plugin
  - pip install -e ./plugins/prithvi_io_processor_plugin
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
  - pip uninstall prithvi_io_processor_plugin -y
  # end io_processor plugins test
  # other tests continue here:
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
  - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
|
|
|
# Runs the pipeline-parallel tests (test_pp_cudagraph.py and
# test_pipeline_parallel.py) with 4 GPUs.
- label: Pipeline + Context Parallelism Test # 45min
  timeout_in_minutes: 60
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  source_file_dependencies:
  - vllm/distributed/
  - vllm/engine/
  - vllm/executor/
  - vllm/model_executor/models/
  - tests/distributed/
  commands:
  - pytest -v -s distributed/test_pp_cudagraph.py
  - pytest -v -s distributed/test_pipeline_parallel.py
|
|
|
|
# LoRA tests that need several GPUs (4 here); each pytest run stops at the
# first failure (-x).
- label: LoRA TP Test (Distributed) # 17 min
  timeout_in_minutes: 30
  mirror_hardwares: [amdexperimental, amdproduction]
  agent_pool: mi325_4
  # grade: Blocking
  num_gpus: 4
  source_file_dependencies:
  - vllm/lora
  - tests/lora
  commands:
  # FIXME: find out which code initializes CUDA before the test runs.
  # Until that is fixed, the spawn method has to be used for these tests.
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  # LoRA contains tensor-parallelism-specific processing logic that can only
  # be validated with multiple GPUs.
  - pytest -v -s -x lora/test_chatglm3_tp.py
  - pytest -v -s -x lora/test_llama_tp.py
  - pytest -v -s -x lora/test_llm_with_multi_loras.py
|
|
|
|
|
|
# Weight-loading test script driven by the model list in
# weight_loading/models.txt, with 2 GPUs. Optional step.
- label: Weight Loading Multiple GPU Test # 33min
  timeout_in_minutes: 45
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
|
|
|
# Same weight-loading script as the regular step, but with the model list in
# weight_loading/models-large.txt and on 2 A100 GPUs. Optional step.
- label: Weight Loading Multiple GPU Test - Large Models # optional
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  # grade: Blocking
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  gpu: a100
  optional: true
  source_file_dependencies:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
|
|
|
|
|
##### multi gpus test #####
|
|
##### A100 test #####
|
|
|
|
# Distributed tests on 4 A100 GPUs: custom all-reduce, CA buffer sharing,
# basic correctness and the Mixtral LoRA test. Optional step.
- label: Distributed Tests (A100) # optional
  gpu: a100
  optional: true
  num_gpus: 4
  source_file_dependencies:
  - vllm/
  commands:
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
  - torchrun --nproc_per_node=2 distributed/test_ca_buffer_sharing.py
  - TARGET_TEST_SUITE=A100 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  - pytest -v -s -x lora/test_mixtral.py
|
|
|
|
# lm-eval correctness test at tensor-parallel size 4 on 4 A100 GPUs, using the
# model list in configs/models-large.txt. Runs from the lm-eval-harness
# directory. Optional step.
- label: LM Eval Large Models # optional
  gpu: a100
  optional: true
  num_gpus: 4
  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
  source_file_dependencies:
  - csrc/
  - vllm/model_executor/layers/quantization
  commands:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
|
|
|
##### H200 test #####
|
|
# Context-parallel test plus a data-parallel (dp-size=2) offline-inference
# example with Qwen/Qwen1.5-MoE-A2.7B on 2 H200 GPUs. Optional step.
# The label previously read "Distrubted"; the spelling is corrected here.
- label: Distributed Tests (H200) # optional
  gpu: h200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
  - pytest -v -s tests/distributed/test_context_parallel.py
  # NOTE(review): CUDA_VISIBLE_DEVICES=1,2 names device indices 1 and 2 while
  # num_gpus is 2 -- confirm this matches how the H200 agents expose GPUs.
  - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
|
|
|
##### B200 test #####
|
|
# Context-parallel and NCCL symmetric-memory all-reduce tests on 2 B200 GPUs.
# Optional step.
- label: Distributed Tests (B200) # optional
  gpu: b200
  optional: true
  working_dir: "/vllm-workspace/"
  num_gpus: 2
  commands:
  - pytest -v -s tests/distributed/test_context_parallel.py
  - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
|
|
|
##### RL Integration Tests #####
|
|
# Runs the Prime-RL integration script with 2 GPUs. Optional step; triggered
# by changes under vllm/ or to the script itself.
- label: Prime-RL Integration Test # 15min
  mirror_hardwares: [amdexperimental]
  agent_pool: mi325_2
  # grade: Blocking
  timeout_in_minutes: 30
  optional: true
  num_gpus: 2
  working_dir: "/vllm-workspace"
  source_file_dependencies:
  - vllm/
  - .buildkite/scripts/run-prime-rl-test.sh
  commands:
  - bash .buildkite/scripts/run-prime-rl-test.sh
|