Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)
[3/n][CI] Load Quantization test models with S3 (#13570)
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
@@ -57,6 +57,57 @@ MODELS_ON_S3 = [
     "ArthurZ/Ilama-3.2-1B",
     "llava-hf/llava-1.5-7b-hf",
     "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "JackFram/llama-160m",
+    "ai21labs/Jamba-tiny-random",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
+    "nm-testing/Phi-3-mini-128k-instruct-FP8",
+    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
+    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
+    "AMead10/Llama-3.2-1B-Instruct-AWQ",
+    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
+    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
+    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
+    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
+    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
+    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
+    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
+    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
+    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
+    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
+    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
+    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
+    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
+    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
+    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
+    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
 ]
 
 MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
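Together with MODEL_WEIGHTS_S3_BUCKET, the list above lets CI point a test model at the S3 mirror instead of pulling it from Hugging Face. A minimal sketch of that resolution step, assuming a one-prefix-per-repo-ID bucket layout; the helper name and layout are illustrative, not this PR's actual test plumbing:

# Illustrative only: the bucket layout and helper are assumptions,
# not code from this PR.
MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"

def resolve_test_model(model_id: str, models_on_s3: list[str]) -> str:
    """Point a mirrored model at the CI bucket; fall back to the HF ID."""
    if model_id in models_on_s3:
        return f"{MODEL_WEIGHTS_S3_BUCKET}/{model_id}"
    return model_id

# resolve_test_model("JackFram/llama-160m", MODELS_ON_S3)
# -> "s3://vllm-ci-model-weights/JackFram/llama-160m"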
@@ -27,8 +27,6 @@ from vllm.model_executor.layers.quantization import (QuantizationConfig,
 from vllm.platforms import current_platform
 from vllm.utils import PlaceholderModule
 
-logger = init_logger(__name__)
-
 try:
     from runai_model_streamer import SafetensorsStreamer
 except (ImportError, OSError):
@@ -39,6 +37,8 @@ except (ImportError, OSError):
     SafetensorsStreamer = runai_model_streamer.placeholder_attr(
         "SafetensorsStreamer")
 
+logger = init_logger(__name__)
+
 # use system-level temp directory for file locks, so that multiple users
 # can share the same lock without error.
 # lock files in the temp directory will be automatically deleted when the
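The second file's change is purely an ordering fix: logger = init_logger(__name__) moves below the optional-import block. That block follows vLLM's PlaceholderModule pattern for soft dependencies; condensed, it looks like the sketch below (the runai_model_streamer = PlaceholderModule(...) assignment sits just outside the hunk and is inferred from the placeholder_attr call):

from vllm.utils import PlaceholderModule

try:
    from runai_model_streamer import SafetensorsStreamer
except (ImportError, OSError):
    # Defer the failure: touching SafetensorsStreamer later raises a
    # descriptive error instead of breaking import of this module.
    runai_model_streamer = PlaceholderModule("runai_model_streamer")
    SafetensorsStreamer = runai_model_streamer.placeholder_attr(
        "SafetensorsStreamer")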
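The trailing context lines document the locking scheme around weight downloads: lock files live in the system-level temp directory so every user on a machine contends on the same lock. A minimal sketch of that scheme, assuming the filelock package (a vLLM dependency); the helper name is illustrative:

import os
import tempfile

import filelock

def get_model_lock(model_name: str) -> filelock.BaseFileLock:
    # Flatten "org/repo" IDs so the lock name is a single file name.
    lock_name = model_name.replace("/", "-") + ".lock"
    # System temp dir: shared across users, cleaned up by the OS.
    return filelock.SoftFileLock(
        os.path.join(tempfile.gettempdir(), lock_name))

# Usage: serialize concurrent downloads of the same model.
with get_model_lock("JackFram/llama-160m"):
    pass  # download / convert weights here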