Generate _ModelInfo properties file when loading to improve loading speed (#23558)

Signed-off-by: Manoel Marques <manoel.marques@ibm.com>
Signed-off-by: Manoel Marques <manoelmrqs@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Manoel Marques authored on 2025-09-20 07:51:13 -04:00, committed by GitHub
parent 032d661d27
commit bf8b26cad1
4 changed files with 167 additions and 3 deletions

vllm/logging_utils/__init__.py

@@ -2,7 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.logging_utils.formatter import NewLineFormatter
from vllm.logging_utils.log_time import logtime

__all__ = [
    "NewLineFormatter",
    "logtime",
]

vllm/logging_utils/log_time.py

@@ -0,0 +1,32 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Provides a timeslice logging decorator
"""

import functools
import time


def logtime(logger, msg=None):
    """
    Logs the execution time of the decorated function.
    Always place it beneath other decorators.
    """

    def _inner(func):

        @functools.wraps(func)
        def _wrapper(*args, **kwargs):
            start = time.perf_counter()
            result = func(*args, **kwargs)
            elapsed = time.perf_counter() - start

            prefix = f"Function '{func.__module__}.{func.__qualname__}'" \
                if msg is None else msg
            logger.debug("%s: Elapsed time %.7f secs", prefix, elapsed)
            return result

        return _wrapper

    return _inner
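For illustration, a minimal sketch of how the decorator could be used. The function, logger name, and message below are hypothetical; `logtime` itself is re-exported from `vllm.logging_utils`, as the `__init__.py` change above shows:

import logging
import time

from vllm.logging_utils import logtime

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("vllm.example")  # hypothetical logger name


@logtime(logger=logger, msg="Weight loading")
def load_weights():  # hypothetical function, for illustration only
    time.sleep(0.1)


load_weights()
# DEBUG:vllm.example:Weight loading: Elapsed time 0.1001234 secs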

vllm/model_executor/model_loader/weight_utils.py

@@ -11,6 +11,7 @@ import tempfile
import time
from collections import defaultdict
from collections.abc import Generator
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Callable, Optional, Union
@@ -98,6 +99,49 @@ def get_lock(model_name_or_path: Union[str, Path],
    return lock


@contextmanager
def atomic_writer(filepath: Union[str, Path],
                  mode: str = 'w',
                  encoding: Optional[str] = None):
    """
    Context manager that provides an atomic file writing routine.

    The context manager writes to a temporary file and, if successful,
    atomically replaces the original file.

    Args:
        filepath (str or Path): The path to the file to write.
        mode (str): The file mode for the temporary file (e.g., 'w', 'wb').
        encoding (str): The encoding for text mode.

    Yields:
        file object: A handle to the temporary file.
    """
    # Create a temporary file in the same directory as the target file
    # to ensure it's on the same filesystem for an atomic replace.
    temp_dir = os.path.dirname(filepath)
    temp_fd, temp_path = tempfile.mkstemp(dir=temp_dir)

    try:
        # Open the temporary file for writing
        with os.fdopen(temp_fd, mode=mode, encoding=encoding) as temp_file:
            yield temp_file

        # If the 'with' block completes successfully,
        # perform the atomic replace.
        os.replace(temp_path, filepath)
    except Exception:
        logger.exception(
            "Error during atomic write. Original file '%s' not modified",
            filepath)
        raise
    finally:
        # Clean up the temporary file if it still exists.
        if os.path.exists(temp_path):
            os.remove(temp_path)
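As a usage sketch (the path and payload are hypothetical), any caller can wrap its file handle in this context manager so readers never observe a half-written file:

import json

from vllm.model_executor.model_loader.weight_utils import atomic_writer

# Hypothetical path and payload, for illustration only. If json.dump
# raises midway, the original file (if any) is left untouched.
with atomic_writer("/tmp/example.json", encoding="utf-8") as f:
    json.dump({"dtype": "bfloat16"}, f, indent=2)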
def maybe_download_from_modelscope(
        model: str,
        revision: Optional[str] = None,

vllm/model_executor/models/registry.py

@@ -4,7 +4,9 @@
Whenever you add an architecture to this page, please also update
`tests/models/registry.py` with example HuggingFace models for it.
"""
import hashlib
import importlib
import json
import os
import pickle
import subprocess
@@ -12,16 +14,19 @@ import sys
import tempfile
from abc import ABC, abstractmethod
from collections.abc import Set
from dataclasses import asdict, dataclass, field
from functools import lru_cache
from pathlib import Path
from typing import Callable, Optional, TypeVar, Union

import torch.nn as nn
import transformers

from vllm import envs
from vllm.config import (ModelConfig, iter_architecture_defaults,
                         try_match_architecture_defaults)
from vllm.logger import init_logger
from vllm.logging_utils import logtime
from vllm.transformers_utils.dynamic_module import (
    try_get_class_from_dynamic_module)
@@ -421,10 +426,91 @@ class _LazyRegisteredModel(_BaseRegisteredModel):
    module_name: str
    class_name: str

    @staticmethod
    def _get_cache_dir() -> Path:
        return Path(envs.VLLM_CACHE_ROOT) / "modelinfos"

    def _get_cache_filename(self) -> str:
        cls_name = f"{self.module_name}-{self.class_name}".replace(".", "-")
        return f"{cls_name}.json"
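For illustration, a hypothetical module/class pair and the cache filename these two helpers would derive for it:

# Both names are hypothetical, chosen only to show the mapping.
module_name = "vllm.model_executor.models.llama"
class_name = "LlamaForCausalLM"

cls_name = f"{module_name}-{class_name}".replace(".", "-")
print(f"{cls_name}.json")
# -> vllm-model_executor-models-llama-LlamaForCausalLM.json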
    def _load_modelinfo_from_cache(self,
                                   module_hash: str) -> _ModelInfo | None:
        try:
            try:
                modelinfo_path = self._get_cache_dir(
                ) / self._get_cache_filename()
                with open(modelinfo_path, encoding="utf-8") as file:
                    mi_dict = json.load(file)
            except FileNotFoundError:
                logger.debug(("Cached model info file "
                              "for class %s.%s not found"), self.module_name,
                             self.class_name)
                return None

            if mi_dict["hash"] != module_hash:
                logger.debug(("Cached model info file "
                              "for class %s.%s is stale"), self.module_name,
                             self.class_name)
                return None

            # File not changed, use cached _ModelInfo properties
            return _ModelInfo(**mi_dict["modelinfo"])
        except Exception:
            logger.exception(("Error reading cached model info "
                              "for class %s.%s"), self.module_name,
                             self.class_name)
            return None
    def _save_modelinfo_to_cache(self, mi: _ModelInfo,
                                 module_hash: str) -> None:
        """Save the model info as a JSON file in the cache directory."""
        from vllm.model_executor.model_loader.weight_utils import atomic_writer
        try:
            modelinfo_dict = {
                "hash": module_hash,
                "modelinfo": asdict(mi),
            }
            cache_dir = self._get_cache_dir()
            cache_dir.mkdir(parents=True, exist_ok=True)
            modelinfo_path = cache_dir / self._get_cache_filename()
            with atomic_writer(modelinfo_path, encoding='utf-8') as f:
                json.dump(modelinfo_dict, f, indent=2)
        except Exception:
            logger.exception("Error saving model info cache.")
    @logtime(logger=logger, msg="Registry inspect model class")
    def inspect_model_cls(self) -> _ModelInfo:
        model_path = Path(
            __file__).parent / f"{self.module_name.split('.')[-1]}.py"
        assert model_path.exists(), \
            f"Model {self.module_name} expected to be on path {model_path}"
        # Hash the module source so the cache is invalidated whenever
        # the model file changes.
        with open(model_path, "rb") as f:
            module_hash = hashlib.md5(f.read()).hexdigest()

        mi = self._load_modelinfo_from_cache(module_hash)
        if mi is not None:
            logger.debug(("Loaded model info "
                          "for class %s.%s from cache"), self.module_name,
                         self.class_name)
            return mi
        else:
            logger.debug(("Cached model info "
                          "for class %s.%s missing or stale; "
                          "inspecting the model class instead."),
                         self.module_name, self.class_name)

        # Performed in another process to avoid initializing CUDA
        mi = _run_in_subprocess(
            lambda: _ModelInfo.from_model_cls(self.load_model_cls()))
        logger.debug("Loaded model info for class %s.%s", self.module_name,
                     self.class_name)

        # Save the freshly inspected model info to the cache.
        self._save_modelinfo_to_cache(mi, module_hash)

        return mi
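Putting it together, a cache entry on disk could be read back as in this sketch. The path assumes the default VLLM_CACHE_ROOT of ~/.cache/vllm and the hypothetical llama filename from the earlier example; the "modelinfo" keys are whatever asdict() emits for _ModelInfo and are not enumerated here:

import json
from pathlib import Path

# Hypothetical cache file location, for illustration only.
path = (Path.home() / ".cache" / "vllm" / "modelinfos" /
        "vllm-model_executor-models-llama-LlamaForCausalLM.json")

entry = json.loads(path.read_text(encoding="utf-8"))
entry["hash"]       # md5 hex digest of the model module's source file
entry["modelinfo"]  # keyword arguments for _ModelInfo(**...)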
    def load_model_cls(self) -> type[nn.Module]:
        mod = importlib.import_module(self.module_name)