701 lines
		
	
	
		
			25 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			701 lines
		
	
	
		
			25 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| # SPDX-License-Identifier: Apache-2.0
 | |
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 | |
| 
 | |
| import ctypes
 | |
| import importlib.util
 | |
| import json
 | |
| import logging
 | |
| import os
 | |
| import re
 | |
| import subprocess
 | |
| import sys
 | |
| from pathlib import Path
 | |
| from shutil import which
 | |
| 
 | |
| import torch
 | |
| from packaging.version import Version, parse
 | |
| from setuptools import Extension, setup
 | |
| from setuptools.command.build_ext import build_ext
 | |
| from setuptools_scm import get_version
 | |
| from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
 | |
| 
 | |
| 
 | |
def load_module_from_path(module_name, path):
    """Load the Python file at ``path`` as a module named ``module_name``.

    The new module object is registered in ``sys.modules`` under
    ``module_name`` before its code is executed, and is then returned.
    """
    module_spec = importlib.util.spec_from_file_location(module_name, path)
    loaded_module = importlib.util.module_from_spec(module_spec)
    # Register first so the entry already exists while the module body runs.
    sys.modules[module_name] = loaded_module
    module_spec.loader.exec_module(loaded_module)
    return loaded_module
 | |
| 
 | |
| 
 | |
ROOT_DIR = Path(__file__).parent
logger = logging.getLogger(__name__)

# cannot import envs directly because it depends on vllm,
#  which is not installed yet
envs = load_module_from_path('envs', os.path.join(ROOT_DIR, 'vllm', 'envs.py'))

# Start from the value exposed by vllm/envs.py; the platform checks below may
# override it.
VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE

if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
    logger.warning(
        "VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
    VLLM_TARGET_DEVICE = "cpu"
elif not sys.platform.startswith(("linux", "darwin")):
    # Neither Linux nor macOS: build with the "empty" target device.
    # The first literal ends with a space so that the concatenated message
    # does not read "MacOS.Building".
    logger.warning(
        "vLLM only supports Linux platform (including WSL) and MacOS. "
        "Building on %s, "
        "so vLLM may not be able to run correctly", sys.platform)
    VLLM_TARGET_DEVICE = "empty"
elif (sys.platform.startswith("linux") and torch.version.cuda is None
      and os.getenv("VLLM_TARGET_DEVICE") is None
      and torch.version.hip is None):
    # if cuda or hip is not available and VLLM_TARGET_DEVICE is not set,
    # fallback to cpu
    VLLM_TARGET_DEVICE = "cpu"

# CUDA version that get_vllm_version() compares against when deciding whether
# to append a version suffix.
MAIN_CUDA_VERSION = "12.8"
 | |
| 
 | |
| 
 | |
def is_sccache_available() -> bool:
    """Report whether ``shutil.which`` can locate an ``sccache`` executable."""
    sccache_path = which("sccache")
    return sccache_path is not None
 | |
| 
 | |
| 
 | |
def is_ccache_available() -> bool:
    """Report whether ``shutil.which`` can locate a ``ccache`` executable."""
    ccache_path = which("ccache")
    return ccache_path is not None
 | |
| 
 | |
| 
 | |
def is_ninja_available() -> bool:
    """Report whether ``shutil.which`` can locate a ``ninja`` executable."""
    ninja_path = which("ninja")
    return ninja_path is not None
 | |
| 
 | |
| 
 | |
def is_url_available(url: str, timeout: float = 30.0) -> bool:
    """Check whether opening ``url`` yields a response with status 200.

    Args:
        url: The URL to open.
        timeout: Seconds passed to ``urlopen`` as its ``timeout`` argument,
            so that a stalled connection cannot block the build forever.

    Returns:
        ``True`` only if the URL could be opened and the response status was
        200. Any exception raised while opening the URL or reading the status
        (including a timeout) results in ``False``.
    """
    from urllib.request import urlopen

    try:
        with urlopen(url, timeout=timeout) as response:
            return response.status == 200
    except Exception:
        # Best-effort probe: every failure simply means "not available".
        return False
 | |
| 
 | |
| 
 | |
class CMakeExtension(Extension):
    """Extension declared with an empty source list and a CMake directory.

    The extension is created with ``sources=[]`` and ``py_limited_api=True``;
    the absolute path of ``cmake_lists_dir`` is stored on the instance as
    ``cmake_lists_dir``.
    """

    def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
        # Extra keyword arguments are forwarded unchanged to ``Extension``.
        super().__init__(name, sources=[], py_limited_api=True, **kwa)
        resolved_dir = os.path.abspath(cmake_lists_dir)
        self.cmake_lists_dir = resolved_dir
 | |
| 
 | |
| 
 | |
class cmake_build_ext(build_ext):
    """``build_ext`` command that configures, builds and installs the
    extensions by invoking ``cmake`` as a subprocess."""

    # A dict of extension directories that have been configured.
    # Stored on the class, so it is shared by all instances of the command.
    did_config: dict[str, bool] = {}

    #
    # Determine number of compilation jobs and optionally nvcc compile threads.
    #
    def compute_num_jobs(self):
        """Return ``(num_jobs, nvcc_threads)`` for the build.

        ``num_jobs`` comes from ``envs.MAX_JOBS`` when it is set, otherwise
        from the CPU affinity set (or the CPU count as a fallback).
        ``nvcc_threads`` is ``None`` unless this is a CUDA build whose nvcc
        is at least version 11.2; in that case it is ``envs.NVCC_THREADS``
        (default 1) and ``num_jobs`` is divided by it, with a floor of 1.
        """
        # `num_jobs` is either the value of the MAX_JOBS environment variable
        # (if defined) or the number of CPUs available.
        num_jobs = envs.MAX_JOBS
        if num_jobs is not None:
            num_jobs = int(num_jobs)
            logger.info("Using MAX_JOBS=%d as the number of jobs.", num_jobs)
        else:
            try:
                # os.sched_getaffinity() isn't universally available, so fall
                #  back to os.cpu_count() if we get an error here.
                num_jobs = len(os.sched_getaffinity(0))
            except AttributeError:
                # os.cpu_count() returns None when the count cannot be
                # determined; use a single job in that case so the integer
                # division below cannot fail.
                num_jobs = os.cpu_count() or 1

        nvcc_threads = None
        if _is_cuda() and get_nvcc_cuda_version() >= Version("11.2"):
            # `nvcc_threads` is either the value of the NVCC_THREADS
            # environment variable (if defined) or 1.
            # when it is set, we reduce `num_jobs` to avoid
            # overloading the system.
            nvcc_threads = envs.NVCC_THREADS
            if nvcc_threads is not None:
                nvcc_threads = int(nvcc_threads)
                logger.info(
                    "Using NVCC_THREADS=%d as the number of nvcc threads.",
                    nvcc_threads)
            else:
                nvcc_threads = 1
            num_jobs = max(1, num_jobs // nvcc_threads)

        return num_jobs, nvcc_threads

    #
    # Perform cmake configuration for a single extension.
    #
    def configure(self, ext: CMakeExtension) -> None:
        """Run the CMake configure step for ``ext`` inside ``build_temp``.

        Each ``cmake_lists_dir`` is configured at most once; later calls for
        the same directory return immediately.

        Raises:
            subprocess.CalledProcessError: if the ``cmake`` command fails.
        """
        # If we've already configured using the CMakeLists.txt for
        # this extension, exit early.
        if ext.cmake_lists_dir in cmake_build_ext.did_config:
            return

        cmake_build_ext.did_config[ext.cmake_lists_dir] = True

        # Select the build type.
        # Note: optimization level + debug info are set by the build type
        default_cfg = "Debug" if self.debug else "RelWithDebInfo"
        cfg = envs.CMAKE_BUILD_TYPE or default_cfg

        cmake_args = [
            '-DCMAKE_BUILD_TYPE={}'.format(cfg),
            '-DVLLM_TARGET_DEVICE={}'.format(VLLM_TARGET_DEVICE),
        ]

        verbose = envs.VERBOSE
        if verbose:
            cmake_args += ['-DCMAKE_VERBOSE_MAKEFILE=ON']

        # Prefer sccache over ccache when both are installed.
        if is_sccache_available():
            cmake_args += [
                '-DCMAKE_C_COMPILER_LAUNCHER=sccache',
                '-DCMAKE_CXX_COMPILER_LAUNCHER=sccache',
                '-DCMAKE_CUDA_COMPILER_LAUNCHER=sccache',
                '-DCMAKE_HIP_COMPILER_LAUNCHER=sccache',
            ]
        elif is_ccache_available():
            cmake_args += [
                '-DCMAKE_C_COMPILER_LAUNCHER=ccache',
                '-DCMAKE_CXX_COMPILER_LAUNCHER=ccache',
                '-DCMAKE_CUDA_COMPILER_LAUNCHER=ccache',
                '-DCMAKE_HIP_COMPILER_LAUNCHER=ccache',
            ]

        # Pass the python executable to cmake so it can find an exact
        # match.
        cmake_args += ['-DVLLM_PYTHON_EXECUTABLE={}'.format(sys.executable)]

        # Pass the python path to cmake so it can reuse the build dependencies
        # on subsequent calls to python.
        cmake_args += ['-DVLLM_PYTHON_PATH={}'.format(":".join(sys.path))]

        # Override the base directory for FetchContent downloads to $ROOT/.deps
        # This allows sharing dependencies between profiles,
        # and plays more nicely with sccache.
        # To override this, set the FETCHCONTENT_BASE_DIR environment variable.
        fc_base_dir = os.path.join(ROOT_DIR, ".deps")
        fc_base_dir = os.environ.get("FETCHCONTENT_BASE_DIR", fc_base_dir)
        cmake_args += ['-DFETCHCONTENT_BASE_DIR={}'.format(fc_base_dir)]

        #
        # Setup parallelism and build tool
        #
        num_jobs, nvcc_threads = self.compute_num_jobs()

        if nvcc_threads:
            cmake_args += ['-DNVCC_THREADS={}'.format(nvcc_threads)]

        if is_ninja_available():
            build_tool = ['-G', 'Ninja']
            cmake_args += [
                '-DCMAKE_JOB_POOL_COMPILE:STRING=compile',
                '-DCMAKE_JOB_POOLS:STRING=compile={}'.format(num_jobs),
            ]
        else:
            # Default build tool to whatever cmake picks.
            build_tool = []
        # Make sure we use the nvcc from CUDA_HOME
        if _is_cuda():
            cmake_args += [f'-DCMAKE_CUDA_COMPILER={CUDA_HOME}/bin/nvcc']
        subprocess.check_call(
            ['cmake', ext.cmake_lists_dir, *build_tool, *cmake_args],
            cwd=self.build_temp)

    def build_extensions(self) -> None:
        """Configure every extension, build all targets, then install them.

        Raises:
            RuntimeError: if running ``cmake --version`` raises ``OSError``.
            subprocess.CalledProcessError: if a ``cmake`` command fails.
        """
        # Ensure that CMake is present and working
        try:
            subprocess.check_output(['cmake', '--version'])
        except OSError as e:
            raise RuntimeError('Cannot find CMake executable') from e

        # Create build directory if it does not exist.
        os.makedirs(self.build_temp, exist_ok=True)

        targets = []

        def target_name(s: str) -> str:
            # CMake target/component names omit the package prefixes.
            return s.removeprefix("vllm.").removeprefix("vllm_flash_attn.")

        # Build all the extensions
        for ext in self.extensions:
            self.configure(ext)
            targets.append(target_name(ext.name))

        num_jobs, _ = self.compute_num_jobs()

        build_args = [
            "--build",
            ".",
            f"-j={num_jobs}",
            *[f"--target={name}" for name in targets],
        ]

        subprocess.check_call(["cmake", *build_args], cwd=self.build_temp)

        # Compare like with like below: `outdir` is an absolute Path, while
        # `self.build_temp` is a plain string, and a Path never compares equal
        # to a str.
        build_temp_dir = Path(self.build_temp).absolute()

        # Install the libraries
        for ext in self.extensions:
            # Install the extension into the proper location
            outdir = Path(self.get_ext_fullpath(ext.name)).parent.absolute()

            # Skip if the install directory is the same as the build directory
            if outdir == build_temp_dir:
                continue

            # CMake appends the extension prefix to the install path,
            # and outdir already contains that prefix, so we need to remove it.
            prefix = outdir
            for _ in range(ext.name.count('.')):
                prefix = prefix.parent

            # prefix here should actually be the same for all components
            install_args = [
                "cmake", "--install", ".", "--prefix", prefix, "--component",
                target_name(ext.name)
            ]
            subprocess.check_call(install_args, cwd=self.build_temp)

    def run(self):
        """Run the standard ``build_ext`` flow, then copy the built
        ``vllm_flash_attn`` Python files into the source tree."""
        # First, run the standard build_ext command to compile the extensions
        super().run()

        # copy vllm/vllm_flash_attn/**/*.py from self.build_lib to current
        # directory so that they can be included in the editable build
        import glob
        files = glob.glob(os.path.join(self.build_lib, "vllm",
                                       "vllm_flash_attn", "**", "*.py"),
                          recursive=True)
        for file in files:
            # Keep the path relative to vllm/vllm_flash_attn/ and re-root it
            # under the current working directory.
            dst_file = os.path.join("vllm/vllm_flash_attn",
                                    file.split("vllm/vllm_flash_attn/")[-1])
            print(f"Copying {file} to {dst_file}")
            os.makedirs(os.path.dirname(dst_file), exist_ok=True)
            self.copy_file(file, dst_file)
 | |
| 
 | |
| 
 | |
class repackage_wheel(build_ext):
    """Extracts libraries and other files from an existing wheel."""

    def get_base_commit_in_main_branch(self) -> str:
        """Return the commit (or ``"nightly"``) whose wheel should be used.

        Returns ``"nightly"`` when
        ``envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL`` is truthy or when the
        lookup fails with anything other than ``ValueError``. Otherwise
        returns the merge base of the upstream main commit and the current
        branch.

        Raises:
            ValueError: a ``ValueError`` raised during the lookup is re-raised
                (as a new ``ValueError`` without context) instead of falling
                back to ``"nightly"``.
        """
        # Force to use the nightly wheel. This is mainly used for CI testing.
        if envs.VLLM_TEST_USE_PRECOMPILED_NIGHTLY_WHEEL:
            return "nightly"

        try:
            # Get the latest commit hash of the upstream main branch.
            resp_json = subprocess.check_output([
                "curl", "-s",
                "https://api.github.com/repos/vllm-project/vllm/commits/main"
            ]).decode("utf-8")
            upstream_main_commit = json.loads(resp_json)["sha"]

            # Check if the upstream_main_commit exists in the local repo
            try:
                subprocess.check_output(
                    ["git", "cat-file", "-e", f"{upstream_main_commit}"])
            except subprocess.CalledProcessError:
                # If not present, fetch it from the remote repository.
                # Note that this does not update any local branches,
                # but ensures that this commit ref and its history are
                # available in our local repo.
                subprocess.check_call([
                    "git", "fetch", "https://github.com/vllm-project/vllm",
                    "main"
                ])

            # Then get the commit hash of the current branch that is the same as
            # the upstream main commit.
            current_branch = subprocess.check_output(
                ["git", "branch", "--show-current"]).decode("utf-8").strip()

            base_commit = subprocess.check_output([
                "git", "merge-base", f"{upstream_main_commit}", current_branch
            ]).decode("utf-8").strip()
            return base_commit
        except ValueError as err:
            raise ValueError(err) from None
        except Exception as err:
            logger.warning(
                "Failed to get the base commit in the main branch. "
                "Using the nightly wheel. The libraries in this "
                "wheel may not be compatible with your dev branch: %s", err)
            return "nightly"

    def run(self) -> None:
        """Locate (or download) a precompiled wheel and extract its files.

        The wheel location is taken from the
        ``VLLM_PRECOMPILED_WHEEL_LOCATION`` environment variable when set;
        otherwise a URL is built from the base commit, falling back to the
        nightly wheel URL if the first URL is not available. A location that
        is an existing file is used directly; anything else is downloaded
        into a temporary directory that is removed after extraction.

        Raises:
            AssertionError: if this is not a CUDA build.
            setuptools.errors.SetupError: if the download fails.
        """
        assert _is_cuda(
        ), "VLLM_USE_PRECOMPILED is only supported for CUDA builds"

        wheel_location = os.getenv("VLLM_PRECOMPILED_WHEEL_LOCATION", None)
        if wheel_location is None:
            base_commit = self.get_base_commit_in_main_branch()
            wheel_location = f"https://wheels.vllm.ai/{base_commit}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
            # Fallback to nightly wheel if latest commit wheel is unavailable,
            # in this rare case, the nightly release CI hasn't finished on main.
            if not is_url_available(wheel_location):
                wheel_location = "https://wheels.vllm.ai/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"

        if os.path.isfile(wheel_location):
            print(f"Using existing wheel={wheel_location}")
            self._extract_from_wheel(wheel_location)
            return

        import tempfile
        from urllib.request import urlretrieve

        # Download the wheel from a given URL, assume
        # the filename is the last part of the URL
        wheel_filename = wheel_location.split("/")[-1]

        # The temporary directory (and the downloaded wheel inside it) is
        # deleted when the `with` block exits, so no download is left behind.
        with tempfile.TemporaryDirectory(prefix="vllm-wheels") as temp_dir:
            wheel_path = os.path.join(temp_dir, wheel_filename)

            print(f"Downloading wheel from {wheel_location} to {wheel_path}")

            try:
                urlretrieve(wheel_location, filename=wheel_path)
            except Exception as e:
                from setuptools.errors import SetupError

                raise SetupError(
                    f"Failed to get vLLM wheel from {wheel_location}") from e

            self._extract_from_wheel(wheel_path)

    def _extract_from_wheel(self, wheel_path: str) -> None:
        """Extract the compiled libraries and the ``vllm_flash_attn`` Python
        files from ``wheel_path`` and register the libraries in the
        module-level ``package_data``."""
        import zipfile

        with zipfile.ZipFile(wheel_path) as wheel:
            files_to_copy = {
                "vllm/_C.abi3.so",
                "vllm/_moe_C.abi3.so",
                "vllm/_flashmla_C.abi3.so",
                "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
                "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
                "vllm/cumem_allocator.abi3.so",
                # "vllm/_version.py", # not available in nightly wheels yet
            }

            file_members = [
                member for member in wheel.filelist
                if member.filename in files_to_copy
            ]

            # vllm_flash_attn python code:
            # Regex from
            #  `glob.translate('vllm/vllm_flash_attn/**/*.py', recursive=True)`
            compiled_regex = re.compile(
                r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py")
            # fullmatch() is required: match() only anchors at the start, so
            # names such as "foo.pyc" or "foo.pyi" would also be accepted.
            file_members += [
                member for member in wheel.filelist
                if compiled_regex.fullmatch(member.filename)
            ]

            for file in file_members:
                print(f"Extracting and including {file.filename} "
                      "from existing wheel")
                package_name = os.path.dirname(file.filename).replace("/", ".")
                file_name = os.path.basename(file.filename)

                # The package entry is created even for .py files, although
                # only non-.py files are appended to it.
                package_files = package_data.setdefault(package_name, [])

                wheel.extract(file)
                if file_name.endswith(".py"):
                    # python files shouldn't be added to package_data
                    continue

                package_files.append(file_name)
 | |
| 
 | |
| 
 | |
def _is_hpu() -> bool:
    """Decide whether the build targets HPU.

    When the ``VLLM_TARGET_DEVICE`` environment variable equals the resolved
    ``VLLM_TARGET_DEVICE``, the answer is simply whether that value is
    ``"hpu"``. Otherwise ``hl-smi`` is run, and if that fails on Linux the
    ``lsmod`` output is searched for ``habanalabs``.
    """
    # if VLLM_TARGET_DEVICE env var was set explicitly, skip HPU autodetection
    if os.getenv("VLLM_TARGET_DEVICE", None) == VLLM_TARGET_DEVICE:
        return VLLM_TARGET_DEVICE == "hpu"

    probe_errors = (FileNotFoundError, PermissionError,
                    subprocess.CalledProcessError)

    # First probe: does `hl-smi` run successfully?
    try:
        smi_result = subprocess.run(["hl-smi"],
                                    capture_output=True,
                                    check=True)
    except probe_errors:
        pass
    else:
        return smi_result.returncode == 0

    # Second probe (Linux only): is the habanalabs driver loaded?
    if not sys.platform.startswith("linux"):
        return False
    try:
        module_count = subprocess.check_output(
            'lsmod | grep habanalabs | wc -l', shell=True)
        return int(module_count) > 0
    except (ValueError, ) + probe_errors:
        return False
 | |
| 
 | |
| 
 | |
def _no_device() -> bool:
    """Return True when ``VLLM_TARGET_DEVICE`` is ``"empty"``."""
    target_device = VLLM_TARGET_DEVICE
    return target_device == "empty"
 | |
| 
 | |
| 
 | |
def _is_cuda() -> bool:
    """Return True for a CUDA build.

    Requires ``VLLM_TARGET_DEVICE == "cuda"``, a torch build that reports a
    CUDA version, and none of the neuron/TPU/HPU checks returning True.
    """
    if VLLM_TARGET_DEVICE != "cuda" or torch.version.cuda is None:
        return False
    # Evaluated lazily and in this order, like the original `or` chain.
    return not (_is_neuron() or _is_tpu() or _is_hpu())
 | |
| 
 | |
| 
 | |
def _is_hip() -> bool:
    """Return True when the target device is ``"cuda"`` or ``"rocm"`` and the
    torch build reports a HIP version."""
    targets_gpu = VLLM_TARGET_DEVICE in ("cuda", "rocm")
    return targets_gpu and torch.version.hip is not None
 | |
| 
 | |
| 
 | |
def _is_neuron() -> bool:
    """Return True when ``VLLM_TARGET_DEVICE`` is ``"neuron"``."""
    target_device = VLLM_TARGET_DEVICE
    return target_device == "neuron"
 | |
| 
 | |
| 
 | |
def _is_tpu() -> bool:
    """Return True when ``VLLM_TARGET_DEVICE`` is ``"tpu"``."""
    target_device = VLLM_TARGET_DEVICE
    return target_device == "tpu"
 | |
| 
 | |
| 
 | |
def _is_cpu() -> bool:
    """Return True when ``VLLM_TARGET_DEVICE`` is ``"cpu"``."""
    target_device = VLLM_TARGET_DEVICE
    return target_device == "cpu"
 | |
| 
 | |
| 
 | |
def _is_xpu() -> bool:
    """Return True when ``VLLM_TARGET_DEVICE`` is ``"xpu"``."""
    target_device = VLLM_TARGET_DEVICE
    return target_device == "xpu"
 | |
| 
 | |
| 
 | |
def _build_custom_ops() -> bool:
    """Return True when the build is a CUDA, HIP or CPU build."""
    # any() over a generator stops at the first True, so the checks run
    # lazily and in the same order as an `or` chain.
    device_checks = (_is_cuda, _is_hip, _is_cpu)
    return any(check() for check in device_checks)
 | |
| 
 | |
| 
 | |
def get_rocm_version():
    """Read the ROCm version from ``ROCM_HOME/lib/librocm-core.so``.

    Returns:
        A ``"major.minor.patch"`` string when the library's
        ``getROCmVersion`` call returns 0. ``None`` when the library file is
        missing, when the call returns a non-zero value, or when any
        exception is raised along the way.
    """
    # see https://github.com/ROCm/rocm-core/blob/d11f5c20d500f729c393680a01fa902ebf92094b/rocm_version.cpp#L21
    try:
        lib_path = Path(ROCM_HOME) / "lib" / "librocm-core.so"
        if not lib_path.is_file():
            return None

        rocm_core = ctypes.CDLL(lib_path)
        uint32_ptr = ctypes.POINTER(ctypes.c_uint32)

        # Declare the C signature: three uint32 out-pointers, uint32 result.
        query_version = rocm_core.getROCmVersion
        query_version.restype = ctypes.c_uint32
        query_version.argtypes = [uint32_ptr, uint32_ptr, uint32_ptr]

        major, minor, patch = (ctypes.c_uint32() for _ in range(3))
        status = query_version(ctypes.byref(major), ctypes.byref(minor),
                               ctypes.byref(patch))
        if status != 0:
            return None
        return f"{major.value}.{minor.value}.{patch.value}"
    except Exception:
        return None
 | |
| 
 | |
| 
 | |
def get_neuronxcc_version():
    """Return the version string declared by the installed ``neuronxcc``.

    Reads ``neuronxcc/version/__init__.py`` from the ``purelib``
    site-packages directory and extracts the single-quoted ``__version__``
    value.

    Raises:
        RuntimeError: if the file contains no matching ``__version__`` line.
    """
    import sysconfig
    purelib_dir = sysconfig.get_paths()["purelib"]
    version_path = os.path.join(purelib_dir, "neuronxcc", "version",
                                "__init__.py")

    # Load the whole version module source as text
    with open(version_path) as version_fp:
        source_text = version_fp.read()

    # Extract the version using a regular expression
    found = re.search(r"__version__ = '(\S+)'", source_text)
    if not found:
        raise RuntimeError("Could not find Neuron version in the output")
    return found.group(1)
 | |
| 
 | |
| 
 | |
def get_nvcc_cuda_version() -> Version:
    """Get the CUDA version from nvcc.

    Runs ``CUDA_HOME/bin/nvcc -V`` and parses the token that follows the word
    ``release`` in its output, up to the first comma.

    Adapted from https://github.com/NVIDIA/apex/blob/8b7a1ff183741dd8f9b87e7bafd04cfde99cea28/setup.py
    """
    assert CUDA_HOME is not None, "CUDA_HOME is not set"
    nvcc_path = CUDA_HOME + "/bin/nvcc"
    tokens = subprocess.check_output([nvcc_path, "-V"],
                                     universal_newlines=True).split()
    # The token right after "release" looks like "<version>,".
    release_token = tokens[tokens.index("release") + 1]
    return parse(release_token.split(",")[0])
 | |
| 
 | |
| 
 | |
def get_gaudi_sw_version():
    """
    Returns the driver version.

    Runs ``hl-smi`` through the shell and parses the third line of its
    output: spaces are removed, the text after the first ``:`` is taken, its
    last character is dropped, and everything from the first ``-`` onwards is
    discarded. Returns ``"0.0.0"`` when the command exits with a non-zero
    status or prints nothing.
    """
    # Enable console printing for `hl-smi` check.
    # ENABLE_CONSOLE is added on top of the inherited environment: passing a
    # bare dict as `env` would replace the whole environment (including PATH)
    # of the child process.
    output = subprocess.run("hl-smi",
                            shell=True,
                            text=True,
                            capture_output=True,
                            env={
                                **os.environ, "ENABLE_CONSOLE": "true"
                            })
    if output.returncode == 0 and output.stdout:
        return output.stdout.split("\n")[2].replace(
            " ", "").split(":")[1][:-1].split("-")[0]
    return "0.0.0"  # when hl-smi is not available
 | |
| 
 | |
| 
 | |
def get_vllm_version() -> str:
    """Build the package version string, with a device-specific suffix.

    The base version comes from ``get_version`` (which is asked to write
    ``vllm/_version.py``). The device checks run in a fixed order and only
    the first matching device contributes a suffix.

    Raises:
        RuntimeError: if none of the device checks match.
    """
    version = get_version(write_to="vllm/_version.py")
    # dev versions might contain +, in which case further parts are joined
    # with a dot instead
    sep = "." if "+" in version else "+"

    if _no_device():
        if envs.VLLM_TARGET_DEVICE == "empty":
            version += f"{sep}empty"
        return version

    if _is_cuda():
        if envs.VLLM_USE_PRECOMPILED:
            version += f"{sep}precompiled"
            return version
        cuda_version = str(get_nvcc_cuda_version())
        # skip the suffix for source tarball, required for pypi
        if cuda_version != MAIN_CUDA_VERSION and "sdist" not in sys.argv:
            cuda_version_str = cuda_version.replace(".", "")[:3]
            version += f"{sep}cu{cuda_version_str}"
        return version

    if _is_hip():
        # Get the Rocm Version
        rocm_version = get_rocm_version() or torch.version.hip
        if rocm_version and rocm_version != MAIN_CUDA_VERSION:
            version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
        return version

    if _is_neuron():
        # Get the Neuron version
        neuron_version = str(get_neuronxcc_version())
        if neuron_version != MAIN_CUDA_VERSION:
            neuron_version_str = neuron_version.replace(".", "")[:3]
            version += f"{sep}neuron{neuron_version_str}"
        return version

    if _is_hpu():
        # Get the Intel Gaudi Software Suite version
        gaudi_sw_version = str(get_gaudi_sw_version())
        if gaudi_sw_version != MAIN_CUDA_VERSION:
            gaudi_sw_version = gaudi_sw_version.replace(".", "")[:3]
            version += f"{sep}gaudi{gaudi_sw_version}"
        return version

    if _is_tpu():
        version += f"{sep}tpu"
        return version

    if _is_cpu():
        if envs.VLLM_TARGET_DEVICE == "cpu":
            version += f"{sep}cpu"
        return version

    if _is_xpu():
        version += f"{sep}xpu"
        return version

    raise RuntimeError("Unknown runtime environment")
 | |
| 
 | |
| 
 | |
def get_requirements() -> list[str]:
    """Get Python package dependencies from the ``requirements`` directory.

    The requirements file is chosen by the first matching device check. For
    CUDA builds whose torch CUDA major version is not 12, entries containing
    ``vllm-flash-attn`` are dropped.

    Raises:
        ValueError: if none of the device checks match.
    """
    requirements_dir = ROOT_DIR / "requirements"

    def _read_requirements(filename: str) -> list[str]:
        # Read one requirements file, following "-r <file>" includes
        # recursively and dropping blank lines, "#" comments and "--" options.
        with open(requirements_dir / filename) as f:
            requirements = f.read().strip().split("\n")
        resolved_requirements = []
        for line in requirements:
            if line.startswith("-r "):
                resolved_requirements += _read_requirements(line.split()[1])
            elif not line.startswith("--") and not line.startswith(
                    "#") and line.strip() != "":
                resolved_requirements.append(line)
        return resolved_requirements

    if _no_device():
        requirements = _read_requirements("common.txt")
    elif _is_cuda():
        requirements = _read_requirements("cuda.txt")
        # Only the major component is needed; taking the first field also
        # works if the version string has more than two components.
        cuda_major = torch.version.cuda.split(".")[0]
        if cuda_major != "12":
            # vllm-flash-attn is built only for CUDA 12.x.
            # Skip for other versions.
            requirements = [
                req for req in requirements if "vllm-flash-attn" not in req
            ]
    elif _is_hip():
        requirements = _read_requirements("rocm.txt")
    elif _is_neuron():
        requirements = _read_requirements("neuron.txt")
    elif _is_hpu():
        requirements = _read_requirements("hpu.txt")
    elif _is_tpu():
        requirements = _read_requirements("tpu.txt")
    elif _is_cpu():
        requirements = _read_requirements("cpu.txt")
    elif _is_xpu():
        requirements = _read_requirements("xpu.txt")
    else:
        raise ValueError(
            "Unsupported platform, please use CUDA, ROCm, Neuron, HPU, "
            "TPU, XPU, or CPU.")
    return requirements
 | |
| 
 | |
| 
 | |
# Non-Python files shipped with the package. repackage_wheel adds entries to
# this dict when it extracts files from a precompiled wheel.
package_data = {
    "vllm": [
        "py.typed",
        "model_executor/layers/fused_moe/configs/*.json",
        "model_executor/layers/quantization/utils/configs/*.json",
    ]
}

# CMake extensions to build, selected by target device. The append order is
# kept stable because it is also the order in which targets are passed on.
ext_modules = []

if _is_cuda() or _is_hip():
    ext_modules += [CMakeExtension(name="vllm._moe_C")]

if _is_hip():
    ext_modules += [CMakeExtension(name="vllm._rocm_C")]

if _is_cuda():
    ext_modules += [CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C")]
    if envs.VLLM_USE_PRECOMPILED or get_nvcc_cuda_version() >= Version("12.3"):
        ext_modules += [
            # FA3 requires CUDA 12.3 or later
            CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa3_C"),
            # Optional since this doesn't get built (produce an .so file) when
            # not targeting a hopper system
            CMakeExtension(name="vllm._flashmla_C", optional=True),
        ]
    ext_modules += [CMakeExtension(name="vllm.cumem_allocator")]

if _build_custom_ops():
    ext_modules += [CMakeExtension(name="vllm._C")]

if _no_device():
    ext_modules = []

# Only override build_ext when there is something to build: precompiled
# builds repackage an existing wheel, all others go through CMake.
cmdclass = {}
if ext_modules:
    build_ext_command = (repackage_wheel
                         if envs.VLLM_USE_PRECOMPILED else cmake_build_ext)
    cmdclass["build_ext"] = build_ext_command

extras_require = {
    "bench": ["pandas", "datasets"],
    "tensorizer": ["tensorizer>=2.9.0"],
    "fastsafetensors": ["fastsafetensors >= 0.1.10"],
    "runai": ["runai-model-streamer", "runai-model-streamer-s3", "boto3"],
    "audio": ["librosa", "soundfile"],  # Required for audio processing
    "video": []  # Kept for backwards compatibility
}

setup(
    # static metadata should rather go in pyproject.toml
    version=get_vllm_version(),
    ext_modules=ext_modules,
    install_requires=get_requirements(),
    extras_require=extras_require,
    cmdclass=cmdclass,
    package_data=package_data,
)
 |