Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)
[3/n][CI] Load Quantization test models with S3 (#13570)
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
@@ -57,6 +57,57 @@ MODELS_ON_S3 = [
     "ArthurZ/Ilama-3.2-1B",
     "llava-hf/llava-1.5-7b-hf",
     "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+    "JackFram/llama-160m",
+    "ai21labs/Jamba-tiny-random",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
+    "nm-testing/Phi-3-mini-128k-instruct-FP8",
+    "nm-testing/Qwen2-0.5B-Instruct-FP8-SkipQKV",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
+    "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symTrue",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head-symFalse",
+    "AMead10/Llama-3.2-1B-Instruct-AWQ",
+    "shuyuej/Llama-3.2-1B-Instruct-GPTQ",
+    "ModelCloud/Qwen1.5-1.8B-Chat-GPTQ-4bits-dynamic-cfg-with-lm_head",
+    "ModelCloud/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-10-25-2024",
+    "TheBloke/TinyLlama-1.1B-Chat-v1.0-GPTQ",
+    "neuralmagic/Meta-Llama-3-8B-Instruct-FP8",
+    "amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test",
+    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+    "nm-testing/tinyllama-oneshot-w8-channel-a8-tensor",
+    "nm-testing/asym-w8w8-int8-static-per-tensor-tiny-llama",
+    "neuralmagic/Llama-3.2-1B-quantized.w8a8",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Dynamic-Asym",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Sym",
+    "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-Static-Per-Tensor-Asym",
+    "nm-testing/tinyllama-oneshot-w8w8-test-static-shape-change",
+    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2",
+    "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2-asym",
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2",
+    "nm-testing/tinyllama-oneshot-w8a8-channel-dynamic-token-v2-asym",
+    "nm-testing/tinyllama-oneshot-w4a16-channel-v2",
+    "nm-testing/tinyllama-oneshot-w4a16-group128-v2",
+    "nm-testing/tinyllama-oneshot-w8a16-per-channel",
+    "nm-testing/llama7b-one-shot-2_4-w4a16-marlin24-t",
+    "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test",
+    "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-2of4-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-Per-Tensor-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Static-testing",
+    "nm-testing/Meta-Llama-3-8B-Instruct-FP8-Dynamic-IA-Per-Tensor-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_fp8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-chnl_wts_tensor_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_per_tok_dyn_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-gsm8k-pruned.2of4-tensor_wts_tensor_act_int8-BitM",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Channel-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Static-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-INT8-Dynamic-IA-Per-Tensor-Weight-testing",
+    "nm-testing/TinyLlama-1.1B-Chat-v1.0-2of4-Sparse-Dense-Compressor",
+    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
 ]
 
 MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"
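Together with MODEL_WEIGHTS_S3_BUCKET, the list above lets CI point a test model at the S3 mirror instead of pulling it from Hugging Face. A minimal sketch of that resolution step, assuming a one-prefix-per-repo-ID bucket layout; the helper name and layout are illustrative, not this PR's actual test plumbing:

# Illustrative only: the bucket layout and helper are assumptions,
# not code from this PR.
MODEL_WEIGHTS_S3_BUCKET = "s3://vllm-ci-model-weights"

def resolve_test_model(model_id: str, models_on_s3: list[str]) -> str:
    """Point a mirrored model at the CI bucket; fall back to the HF ID."""
    if model_id in models_on_s3:
        return f"{MODEL_WEIGHTS_S3_BUCKET}/{model_id}"
    return model_id

# resolve_test_model("JackFram/llama-160m", MODELS_ON_S3)
# -> "s3://vllm-ci-model-weights/JackFram/llama-160m"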
@@ -27,8 +27,6 @@ from vllm.model_executor.layers.quantization import (QuantizationConfig,
 from vllm.platforms import current_platform
 from vllm.utils import PlaceholderModule
 
-logger = init_logger(__name__)
-
 try:
     from runai_model_streamer import SafetensorsStreamer
 except (ImportError, OSError):
@@ -39,6 +37,8 @@ except (ImportError, OSError):
     SafetensorsStreamer = runai_model_streamer.placeholder_attr(
         "SafetensorsStreamer")
 
+logger = init_logger(__name__)
+
 # use system-level temp directory for file locks, so that multiple users
 # can share the same lock without error.
 # lock files in the temp directory will be automatically deleted when the
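The second file's change is purely an ordering fix: logger = init_logger(__name__) moves below the optional-import block. That block follows vLLM's PlaceholderModule pattern for soft dependencies; condensed, it looks like the sketch below (the runai_model_streamer = PlaceholderModule(...) assignment sits just outside the hunk and is inferred from the placeholder_attr call):

from vllm.utils import PlaceholderModule

try:
    from runai_model_streamer import SafetensorsStreamer
except (ImportError, OSError):
    # Defer the failure: touching SafetensorsStreamer later raises a
    # descriptive error instead of breaking import of this module.
    runai_model_streamer = PlaceholderModule("runai_model_streamer")
    SafetensorsStreamer = runai_model_streamer.placeholder_attr(
        "SafetensorsStreamer")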
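The trailing context lines document the locking scheme around weight downloads: lock files live in the system-level temp directory so every user on a machine contends on the same lock. A minimal sketch of that scheme, assuming the filelock package (a vLLM dependency); the helper name is illustrative:

import os
import tempfile

import filelock

def get_model_lock(model_name: str) -> filelock.BaseFileLock:
    # Flatten "org/repo" IDs so the lock name is a single file name.
    lock_name = model_name.replace("/", "-") + ".lock"
    # System temp dir: shared across users, cleaned up by the OS.
    return filelock.SoftFileLock(
        os.path.join(tempfile.gettempdir(), lock_name))

# Usage: serialize concurrent downloads of the same model.
with get_model_lock("JackFram/llama-160m"):
    pass  # download / convert weights here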