Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-20 14:53:52 +08:00.
[misc][distributed] improve libcudart.so finding (#7127)
This commit is contained in:
@ -4,9 +4,6 @@ convenient for use when we just need to call a few functions.
|
||||
"""
|
||||
|
||||
import ctypes
|
||||
import glob
|
||||
import os
|
||||
import sys
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
@ -36,24 +33,25 @@ class Function:
|
||||
argtypes: List[Any]
|
||||
|
||||
|
||||
def get_pytorch_default_cudart_library_path() -> str:
    """Locate the CUDA runtime shared library shipped in NVIDIA pip wheels.

    Scans every entry on ``sys.path`` for an ``nvidia/cuda_runtime/lib``
    directory and returns the first ``libcudart.so.*`` file found there.
    ``sys.path`` order determines priority.

    Returns:
        Path to the first matching ``libcudart.so.*[0-9]`` file.

    Raises:
        ValueError: if no candidate library exists anywhere on ``sys.path``.
    """
    # code borrowed from https://github.com/pytorch/pytorch/blob/1cae60a87e5bdda8bcf55724a862eeed98a9747e/torch/__init__.py#L284 # noqa
    lib_folder = "cuda_runtime"
    lib_name = "libcudart.so.*[0-9]"
    for path in sys.path:
        nvidia_path = os.path.join(path, "nvidia")
        if not os.path.exists(nvidia_path):
            continue
        candidate_lib_paths = glob.glob(
            os.path.join(nvidia_path, lib_folder, "lib", lib_name))
        if candidate_lib_paths:
            # First match wins; the original guarded with `not lib_path`
            # and broke out of the loop, which is exactly an early return.
            return candidate_lib_paths[0]
    raise ValueError(f"{lib_name} not found in the system path {sys.path}")
|
||||
def find_loaded_library(lib_name) -> Optional[str]:
    """
    According to https://man7.org/linux/man-pages/man5/proc_pid_maps.5.html,
    the file `/proc/self/maps` lists the memory mappings of the current
    process, including every shared library it has loaded. Scan it to
    recover the filesystem path of a loaded library.

    Returns the library path, or None when no mapping mentions `lib_name`.
    """ # noqa
    with open("/proc/self/maps") as maps_file:
        for record in maps_file:
            if lib_name not in record:
                continue
            # The pathname field is the tail of the record, beginning at
            # the first '/' character.
            return record[record.index("/"):].strip()
    # the library is not loaded in the current process
    return None
|
||||
|
||||
|
||||
class CudaRTLibrary:
|
||||
@ -100,7 +98,9 @@ class CudaRTLibrary:
|
||||
|
||||
def __init__(self, so_file: Optional[str] = None):
|
||||
if so_file is None:
|
||||
so_file = get_pytorch_default_cudart_library_path()
|
||||
so_file = find_loaded_library("libcudart.so")
|
||||
assert so_file is not None, \
|
||||
"libcudart.so is not loaded in the current process"
|
||||
if so_file not in CudaRTLibrary.path_to_library_cache:
|
||||
lib = ctypes.CDLL(so_file)
|
||||
CudaRTLibrary.path_to_library_cache[so_file] = lib
|
||||
|
@ -145,6 +145,7 @@ def can_actually_p2p(
|
||||
p_tgt.start()
|
||||
p_src.join()
|
||||
p_tgt.join()
|
||||
assert p_src.exitcode == 0 and p_tgt.exitcode == 0
|
||||
result: List[bool] = []
|
||||
for src, tgt in zip(batch_src, batch_tgt):
|
||||
a = result_queue.get()
|
||||
@ -221,7 +222,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
|
||||
# wrap raised exception to provide more information
|
||||
raise RuntimeError(
|
||||
f"Error happened when batch testing "
|
||||
f"peer-to-peer access from {batch_src} to {batch_tgt}") from e
|
||||
f"peer-to-peer access from {batch_src} to {batch_tgt}:\n"
|
||||
f"{returned.stderr.decode()}") from e
|
||||
result = pickle.loads(returned.stdout)
|
||||
for _i, _j, r in zip(batch_src, batch_tgt, result):
|
||||
cache[f"{_i}->{_j}"] = r
|
||||
|
Reference in New Issue
Block a user