Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 14:53:52 +08:00)
Compare commits: d31f7844f8...wentao-opt (9 commits)

99c02cce50
2789316b0a
96d5d7b959
4652ff1e02
b44c430a1d
0fbcfd64f7
c96ef07ec6
8d8c88ec87
3967657957
@@ -22,6 +22,7 @@ from vllm.compilation.partition_rules import (
     resolve_defined_ops,
 )
 from vllm.config import CompilationConfig, CUDAGraphMode, VllmConfig
+from vllm.distributed.parallel_state import is_local_first_rank
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import is_torch_equal_or_newer

@@ -245,9 +246,9 @@ class CompilerManager:
         if graph_index == 0:
             # adds some info logging for the first graph
             if runtime_shape is None:
-                logger.info("Cache the graph for dynamic shape for later use")
+                logger.debug("Cache the graph for dynamic shape for later use")
             else:
-                logger.info(
+                logger.debug(
                     "Cache the graph of shape %s for later use", str(runtime_shape)
                 )
         if runtime_shape is None:

@@ -603,12 +604,14 @@ class VllmBackend:

         disable_cache = envs.VLLM_DISABLE_COMPILE_CACHE

-        if disable_cache:
-            logger.info("vLLM's torch.compile cache is disabled.")
-        else:
-            logger.info(
-                "Using cache directory: %s for vLLM's torch.compile", local_cache_dir
-            )
+        if is_local_first_rank():
+            if disable_cache:
+                logger.info_once("vLLM's torch.compile cache is disabled.")
+            else:
+                logger.info_once(
+                    "Using cache directory: %s for vLLM's torch.compile",
+                    local_cache_dir,
+                )

         self.compiler_manager.initialize_cache(
             local_cache_dir, disable_cache, self.prefix

@@ -620,7 +623,7 @@ class VllmBackend:
         from .monitor import torch_compile_start_time

         dynamo_time = time.time() - torch_compile_start_time
-        logger.info("Dynamo bytecode transform time: %.2f s", dynamo_time)
+        logger.debug("Dynamo bytecode transform time: %.2f s", dynamo_time)
         self.compilation_config.compilation_time += dynamo_time

         # we control the compilation process, each instance can only be

@@ -34,7 +34,7 @@ def _can_p2p(rank: int, world_size: int) -> bool:
         if i == rank:
             continue
         if envs.VLLM_SKIP_P2P_CHECK:
-            logger.info("Skipping P2P check and trusting the driver's P2P report.")
+            logger.debug("Skipping P2P check and trusting the driver's P2P report.")
             return torch.cuda.can_device_access_peer(rank, i)
         if not gpu_p2p_access_check(rank, i):
             return False

@@ -310,7 +310,7 @@ class MessageQueue:
             remote_addr_ipv6=remote_addr_ipv6,
         )

-        logger.info("vLLM message queue communication handle: %s", self.handle)
+        logger.debug("vLLM message queue communication handle: %s", self.handle)

     def export_handle(self) -> Handle:
         return self.handle

@@ -1157,7 +1157,7 @@ def init_distributed_environment(
         ip = parallel_config.data_parallel_master_ip
         port = parallel_config.get_next_dp_init_port()
         distributed_init_method = get_distributed_init_method(ip, port)
-        logger.info(
+        logger.debug(
             "Adjusting world_size=%d rank=%d distributed_init_method=%s for DP",
             world_size,
             rank,

@@ -1322,7 +1322,7 @@ def initialize_model_parallel(
         group_ranks, get_world_group().local_rank, backend, group_name="ep"
     )

-    logger.info(
+    logger.info_once(
         "rank %s in world size %s is assigned as "
         "DP rank %s, PP rank %s, TP rank %s, EP rank %s",
         rank,

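The hunk above switches the per-rank assignment log from logger.info to logger.info_once, vLLM's deduplicated logging call. As a rough sketch of the underlying idea only (the helper below and its message cache are illustrative assumptions, not vLLM's implementation), a log-once wrapper can deduplicate on the format string:

import logging

logger = logging.getLogger("example")
_seen: set[str] = set()  # process-local cache of format strings already logged


def info_once(msg: str, *args: object) -> None:
    # Minimal illustration of "log once per process": the first call with a
    # given format string logs at INFO, later calls with it are dropped.
    if msg in _seen:
        return
    _seen.add(msg)
    logger.info(msg, *args)
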
@@ -1623,6 +1623,29 @@ def is_global_first_rank() -> bool:
     return True


+def is_local_first_rank() -> bool:
+    """
+    Check if the current process is the first local rank (rank 0 on its node).
+    """
+    try:
+        # prefer the initialized world group if available
+        global _WORLD
+        if _WORLD is not None:
+            return _WORLD.local_rank == 0
+
+        if not torch.distributed.is_initialized():
+            return True
+
+        # fallback to environment-provided local rank if available
+        # note: envs.LOCAL_RANK is set when using env:// launchers (e.g., torchrun)
+        try:
+            return int(envs.LOCAL_RANK) == 0  # type: ignore[arg-type]
+        except Exception:
+            return torch.distributed.get_rank() == 0
+    except Exception:
+        return True
+
+
 def _node_count(pg: ProcessGroup | StatelessProcessGroup) -> int:
     """
     Returns the total number of nodes in the process group.

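The new is_local_first_rank() helper is what the VllmBackend and DefaultModelLoader hunks use to emit node-level rather than process-level log lines. A minimal sketch of the calling pattern (the function load_something and its message are hypothetical; only the two imports come from the diff):

from vllm.distributed.parallel_state import is_local_first_rank
from vllm.logger import init_logger

logger = init_logger(__name__)


def load_something(path: str) -> None:
    # Every rank still does its own work; only the first rank on each node
    # emits the INFO line, so an 8-GPU node prints it once, not 8 times.
    if is_local_first_rank():
        logger.info("Loading from %s", path)
    ...  # per-rank work runs unconditionally
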
@@ -13,6 +13,7 @@ from transformers.utils import SAFE_WEIGHTS_INDEX_NAME

 from vllm.config import ModelConfig
 from vllm.config.load import LoadConfig
+from vllm.distributed.parallel_state import is_local_first_rank
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.torchao import torchao_version_at_least
 from vllm.model_executor.model_loader.base_loader import BaseModelLoader

@@ -311,10 +312,12 @@ class DefaultModelLoader(BaseModelLoader):
             loaded_weights = load_weights_and_online_quantize(self, model, model_config)

         self.counter_after_loading_weights = time.perf_counter()
-        logger.info(
-            "Loading weights took %.2f seconds",
-            self.counter_after_loading_weights - self.counter_before_loading_weights,
-        )
+        if is_local_first_rank():
+            logger.info(
+                "Loading weights took %.2f seconds",
+                self.counter_after_loading_weights
+                - self.counter_before_loading_weights,
+            )
         # We only enable strict check for non-quantized models
         # that have loaded weights tracking currently.
         if model_config.quantization is None and loaded_weights is not None:

@@ -416,7 +416,7 @@ def download_weights_from_hf(
                 e,
             )

-    logger.info("Using model weights format %s", allow_patterns)
+    logger.debug("Using model weights format %s", allow_patterns)
     # Use file lock to prevent multiple processes from
     # downloading the same model weights at the same time.
     with get_lock(model_name_or_path, cache_dir):

@@ -222,10 +222,12 @@ def resolve_current_platform_cls_qualname() -> str:
         )
     elif len(activated_builtin_plugins) == 1:
         platform_cls_qualname = builtin_platform_plugins[activated_builtin_plugins[0]]()
-        logger.info("Automatically detected platform %s.", activated_builtin_plugins[0])
+        logger.debug(
+            "Automatically detected platform %s.", activated_builtin_plugins[0]
+        )
     else:
         platform_cls_qualname = "vllm.platforms.interface.UnspecifiedPlatform"
-        logger.info("No platform detected, vLLM is running on UnspecifiedPlatform")
+        logger.debug("No platform detected, vLLM is running on UnspecifiedPlatform")
     return platform_cls_qualname

@@ -37,7 +37,7 @@ class GCDebugConfig:
         except Exception:
             self.enabled = False
             logger.error("Failed to parse VLLM_GC_DEBUG(%s)", envs.VLLM_GC_DEBUG)
-        logger.info("GC Debug Config. %s", str(self))
+        logger.debug("GC Debug Config. %s", str(self))

     def __repr__(self) -> str:
         return f"enabled:{self.enabled},top_objects:{self.top_objects}"

@@ -19,7 +19,6 @@ import zmq

 from vllm.config import ParallelConfig, VllmConfig
 from vllm.distributed import stateless_destroy_torch_distributed_process_group
-from vllm.distributed.parallel_state import is_global_first_rank
 from vllm.envs import enable_envs_cache
 from vllm.logger import init_logger
 from vllm.logging_utils.dump_input import dump_engine_exception

@@ -93,7 +92,7 @@ class EngineCore:
         load_general_plugins()

         self.vllm_config = vllm_config
-        if is_global_first_rank():
+        if vllm_config.parallel_config.data_parallel_rank == 0:
             logger.info(
                 "Initializing a V1 LLM engine (v%s) with config: %s",
                 VLLM_VERSION,

@@ -727,7 +726,7 @@ class EngineCoreProc(EngineCore):
         )

         # Receive initialization message.
-        logger.info("Waiting for init message from front-end.")
+        logger.debug("Waiting for init message from front-end.")
         if not handshake_socket.poll(timeout=HANDSHAKE_TIMEOUT_MINS * 60_000):
             raise RuntimeError(
                 "Did not receive response from front-end "

@@ -188,7 +188,7 @@ class LoggingStatLogger(StatLoggerBase):

     def log_engine_initialized(self):
         if self.vllm_config.cache_config.num_gpu_blocks:
-            logger.info(
+            logger.debug(
                 "Engine %03d: vllm cache_config_info with initialization "
                 "after num_gpu_blocks is: %d",
                 self.engine_index,

@@ -8,6 +8,7 @@ from packaging import version

 from vllm import envs
 from vllm.config.model import LogprobsMode
+from vllm.distributed.parallel_state import is_global_first_rank
 from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform

@@ -33,7 +34,8 @@ class TopKTopPSampler(nn.Module):
     ):
         if envs.VLLM_USE_FLASHINFER_SAMPLER:
             # Users must opt in explicitly via VLLM_USE_FLASHINFER_SAMPLER=1.
-            logger.info_once("Using FlashInfer for top-p & top-k sampling.")
+            if is_global_first_rank():
+                logger.info_once("Using FlashInfer for top-p & top-k sampling.")
             self.forward = self.forward_cuda
         else:
             logger.debug_once(

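Per the comment in the hunk above, the FlashInfer sampling path is only taken when the user opts in through VLLM_USE_FLASHINFER_SAMPLER. A hypothetical opt-in from the launching process (set the variable before the engine is constructed):

import os

# Hypothetical opt-in: enable the FlashInfer top-p/top-k sampling path.
os.environ["VLLM_USE_FLASHINFER_SAMPLER"] = "1"

# from vllm import LLM  # construct the engine after the variable is set
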
@@ -2840,7 +2840,8 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         Args:
             eep_scale_up: the model loading is for elastic EP scale up.
         """
-        logger.info("Starting to load model %s...", self.model_config.model)
+        if is_global_first_rank():
+            logger.info_once("Starting to load model %s...", self.model_config.model)
         if eep_scale_up:
             from vllm.distributed.parallel_state import get_ep_group

@@ -20,7 +20,10 @@ from vllm.distributed import (
     set_custom_all_reduce,
 )
 from vllm.distributed.kv_transfer import ensure_kv_transfer_initialized
-from vllm.distributed.parallel_state import get_pp_group, get_tp_group
+from vllm.distributed.parallel_state import (
+    get_pp_group,
+    get_tp_group,
+)
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.model_executor import set_random_seed