Generate _ModelInfo properties file when loading to improve loading speed (#23558)

Signed-off-by: Manoel Marques <manoel.marques@ibm.com>
Signed-off-by: Manoel Marques <manoelmrqs@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Manoel Marques authored on 2025-09-20 07:51:13 -04:00, committed by GitHub
parent 032d661d27
commit bf8b26cad1
4 changed files with 167 additions and 3 deletions

vllm/logging_utils/__init__.py

@@ -2,7 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.logging_utils.formatter import NewLineFormatter
from vllm.logging_utils.log_time import logtime

__all__ = [
    "NewLineFormatter",
    "logtime",
]

vllm/logging_utils/log_time.py

@@ -0,0 +1,32 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Provides a timeslice logging decorator
"""

import functools
import time


def logtime(logger, msg=None):
    """
    Logs the execution time of the decorated function.
    Always place it beneath other decorators.
    """

    def _inner(func):

        @functools.wraps(func)
        def _wrapper(*args, **kwargs):
            start = time.perf_counter()
            result = func(*args, **kwargs)
            elapsed = time.perf_counter() - start

            prefix = f"Function '{func.__module__}.{func.__qualname__}'" \
                if msg is None else msg
            logger.debug("%s: Elapsed time %.7f secs", prefix, elapsed)
            return result

        return _wrapper

    return _inner
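For illustration, a minimal sketch of how the decorator could be used. The function, logger name, and message below are hypothetical; `logtime` itself is re-exported from `vllm.logging_utils`, as the `__init__.py` change above shows:

import logging
import time

from vllm.logging_utils import logtime

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("vllm.example")  # hypothetical logger name


@logtime(logger=logger, msg="Weight loading")
def load_weights():  # hypothetical function, for illustration only
    time.sleep(0.1)


load_weights()
# DEBUG:vllm.example:Weight loading: Elapsed time 0.1001234 secs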

vllm/model_executor/model_loader/weight_utils.py

@@ -11,6 +11,7 @@ import tempfile
import time
from collections import defaultdict
from collections.abc import Generator
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Callable, Optional, Union
@@ -98,6 +99,49 @@ def get_lock(model_name_or_path: Union[str, Path],
    return lock


@contextmanager
def atomic_writer(filepath: Union[str, Path],
                  mode: str = 'w',
                  encoding: Optional[str] = None):
    """
    Context manager that provides an atomic file writing routine.

    The context manager writes to a temporary file and, if successful,
    atomically replaces the original file.

    Args:
        filepath (str or Path): The path to the file to write.
        mode (str): The file mode for the temporary file (e.g., 'w', 'wb').
        encoding (str): The encoding for text mode.

    Yields:
        file object: A handle to the temporary file.
    """
    # Create a temporary file in the same directory as the target file
    # to ensure it's on the same filesystem for an atomic replace.
    temp_dir = os.path.dirname(filepath)
    temp_fd, temp_path = tempfile.mkstemp(dir=temp_dir)

    try:
        # Open the temporary file for writing
        with os.fdopen(temp_fd, mode=mode, encoding=encoding) as temp_file:
            yield temp_file

        # If the 'with' block completes successfully,
        # perform the atomic replace.
        os.replace(temp_path, filepath)
    except Exception:
        logger.exception(
            "Error during atomic write. Original file '%s' not modified",
            filepath)
        raise
    finally:
        # Clean up the temporary file if it still exists.
        if os.path.exists(temp_path):
            os.remove(temp_path)
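As a usage sketch (the path and payload are hypothetical), any caller can wrap its file handle in this context manager so readers never observe a half-written file:

import json

from vllm.model_executor.model_loader.weight_utils import atomic_writer

# Hypothetical path and payload, for illustration only. If json.dump
# raises midway, the original file (if any) is left untouched.
with atomic_writer("/tmp/example.json", encoding="utf-8") as f:
    json.dump({"dtype": "bfloat16"}, f, indent=2)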
def maybe_download_from_modelscope(
        model: str,
        revision: Optional[str] = None,

vllm/model_executor/models/registry.py

@@ -4,7 +4,9 @@
Whenever you add an architecture to this page, please also update
`tests/models/registry.py` with example HuggingFace models for it.
"""
import hashlib
import importlib
import json
import os
import pickle
import subprocess
@@ -12,16 +14,19 @@ import sys
import tempfile
from abc import ABC, abstractmethod
from collections.abc import Set
from dataclasses import asdict, dataclass, field
from functools import lru_cache
from pathlib import Path
from typing import Callable, Optional, TypeVar, Union

import torch.nn as nn
import transformers

from vllm import envs
from vllm.config import (ModelConfig, iter_architecture_defaults,
                         try_match_architecture_defaults)
from vllm.logger import init_logger
from vllm.logging_utils import logtime
from vllm.transformers_utils.dynamic_module import (
    try_get_class_from_dynamic_module)
@@ -421,10 +426,91 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
    module_name: str
    class_name: str

    @staticmethod
    def _get_cache_dir() -> Path:
        return Path(envs.VLLM_CACHE_ROOT) / "modelinfos"

    def _get_cache_filename(self) -> str:
        cls_name = f"{self.module_name}-{self.class_name}".replace(".", "-")
        return f"{cls_name}.json"
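For illustration, a hypothetical module/class pair and the cache filename these two helpers would derive for it:

# Both names are hypothetical, chosen only to show the mapping.
module_name = "vllm.model_executor.models.llama"
class_name = "LlamaForCausalLM"

cls_name = f"{module_name}-{class_name}".replace(".", "-")
print(f"{cls_name}.json")
# -> vllm-model_executor-models-llama-LlamaForCausalLM.json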
    def _load_modelinfo_from_cache(self,
                                   module_hash: str) -> _ModelInfo | None:
        try:
            try:
                modelinfo_path = self._get_cache_dir(
                ) / self._get_cache_filename()
                with open(modelinfo_path, encoding="utf-8") as file:
                    mi_dict = json.load(file)
            except FileNotFoundError:
                logger.debug(("Cached model info file "
                              "for class %s.%s not found"), self.module_name,
                             self.class_name)
                return None

            if mi_dict["hash"] != module_hash:
                logger.debug(("Cached model info file "
                              "for class %s.%s is stale"), self.module_name,
                             self.class_name)
                return None

            # File not changed, use cached _ModelInfo properties
            return _ModelInfo(**mi_dict["modelinfo"])
        except Exception:
            logger.exception(("Error reading cached model info "
                              "for class %s.%s"), self.module_name,
                             self.class_name)
            return None
    def _save_modelinfo_to_cache(self, mi: _ModelInfo,
                                 module_hash: str) -> None:
        """Save the model info as a JSON file in the cache directory."""
        from vllm.model_executor.model_loader.weight_utils import atomic_writer
        try:
            modelinfo_dict = {
                "hash": module_hash,
                "modelinfo": asdict(mi),
            }
            cache_dir = self._get_cache_dir()
            cache_dir.mkdir(parents=True, exist_ok=True)
            modelinfo_path = cache_dir / self._get_cache_filename()
            with atomic_writer(modelinfo_path, encoding='utf-8') as f:
                json.dump(modelinfo_dict, f, indent=2)
        except Exception:
            logger.exception("Error saving model info cache.")
    @logtime(logger=logger, msg="Registry inspect model class")
    def inspect_model_cls(self) -> _ModelInfo:
        model_path = Path(
            __file__).parent / f"{self.module_name.split('.')[-1]}.py"
        assert model_path.exists(), \
            f"Model {self.module_name} expected to be on path {model_path}"
        # Hash the module source so the cache is invalidated whenever
        # the model file changes.
        with open(model_path, "rb") as f:
            module_hash = hashlib.md5(f.read()).hexdigest()

        mi = self._load_modelinfo_from_cache(module_hash)
        if mi is not None:
            logger.debug(("Loaded model info "
                          "for class %s.%s from cache"), self.module_name,
                         self.class_name)
            return mi
        else:
            logger.debug(("Cached model info "
                          "for class %s.%s missing or stale; "
                          "inspecting the model class instead."),
                         self.module_name, self.class_name)

        # Performed in another process to avoid initializing CUDA
        mi = _run_in_subprocess(
            lambda: _ModelInfo.from_model_cls(self.load_model_cls()))
        logger.debug("Loaded model info for class %s.%s", self.module_name,
                     self.class_name)

        # Save the freshly inspected model info to the cache.
        self._save_modelinfo_to_cache(mi, module_hash)

        return mi
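Putting it together, a cache entry on disk could be read back as in this sketch. The path assumes the default VLLM_CACHE_ROOT of ~/.cache/vllm and the hypothetical llama filename from the earlier example; the "modelinfo" keys are whatever asdict() emits for _ModelInfo and are not enumerated here:

import json
from pathlib import Path

# Hypothetical cache file location, for illustration only.
path = (Path.home() / ".cache" / "vllm" / "modelinfos" /
        "vllm-model_executor-models-llama-LlamaForCausalLM.json")

entry = json.loads(path.read_text(encoding="utf-8"))
entry["hash"]       # md5 hex digest of the model module's source file
entry["modelinfo"]  # keyword arguments for _ModelInfo(**...)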
    def load_model_cls(self) -> type[nn.Module]:
        mod = importlib.import_module(self.module_name)