mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 14:53:52 +08:00
[ci] fix wheel names for arm wheels (#24898)
Signed-off-by: simon-mo <simon.mo@hey.com>
This commit is contained in:
@ -1,24 +1,22 @@
|
||||
steps:
|
||||
# aarch64 + CUDA builds. The PyTorch 2.8 aarch64 + CUDA wheel is only available for CUDA 12.9.
|
||||
- label: "Build arm64 wheel - CUDA 12.9"
|
||||
depends_on: ~
|
||||
id: build-wheel-arm64-cuda-12-9
|
||||
agents:
|
||||
queue: arm64_cpu_queue_postmerge
|
||||
commands:
|
||||
# NOTE: torch_cuda_arch_list is derived from the upstream PyTorch build files here:
|
||||
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
|
||||
- "mkdir artifacts"
|
||||
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
|
||||
- "bash .buildkite/scripts/upload-wheels.sh"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build CUDA 12.8 wheel"
|
||||
key: block-build-cu128-wheel
|
||||
|
||||
- label: "Build wheel - CUDA 12.8"
|
||||
depends_on: block-build-cu128-wheel
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-8
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
@ -30,12 +28,8 @@ steps:
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
- block: "Build CUDA 12.6 wheel"
|
||||
key: block-build-cu126-wheel
|
||||
depends_on: ~
|
||||
|
||||
- label: "Build wheel - CUDA 12.6"
|
||||
depends_on: block-build-cu126-wheel
|
||||
depends_on: ~
|
||||
id: build-wheel-cuda-12-6
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
@ -102,8 +96,6 @@ steps:
|
||||
depends_on:
|
||||
- create-multi-arch-manifest
|
||||
- build-wheel-cuda-12-8
|
||||
- build-wheel-cuda-12-6
|
||||
- build-wheel-cuda-12-9
|
||||
id: annotate-release-workflow
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
|
@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
||||
To download the wheel:
|
||||
\`\`\`
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
|
||||
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
|
||||
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
|
||||
\`\`\`
|
||||
|
||||
To download and upload the image:
|
||||
|
||||
\`\`\`
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
|
||||
docker tag vllm/vllm-openai vllm/vllm-openai:latest
|
||||
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
docker push vllm/vllm-openai:latest
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
|
||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
|
||||
docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||
docker push vllm/vllm-openai:latest-x86_64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
|
||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
|
||||
docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
docker push vllm/vllm-openai:latest-aarch64
|
||||
docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
|
||||
|
||||
docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
|
||||
docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
|
||||
docker manifest push vllm/vllm-openai:latest
|
||||
docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||
\`\`\`
|
||||
EOF
|
@ -196,6 +196,7 @@ ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
|
||||
# Flag to control whether to use pre-built vLLM wheels
|
||||
ARG VLLM_USE_PRECOMPILED=""
|
||||
ARG VLLM_MAIN_CUDA_VERSION=""
|
||||
|
||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
@ -213,6 +214,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
&& export SCCACHE_IDLE_TIMEOUT=0 \
|
||||
&& export CMAKE_BUILD_TYPE=Release \
|
||||
&& export VLLM_USE_PRECOMPILED="${VLLM_USE_PRECOMPILED}" \
|
||||
&& export VLLM_MAIN_CUDA_VERSION="${VLLM_MAIN_CUDA_VERSION}" \
|
||||
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
|
||||
&& sccache --show-stats \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38 \
|
||||
|
6
setup.py
6
setup.py
@ -56,8 +56,6 @@ elif (sys.platform.startswith("linux") and torch.version.cuda is None
|
||||
# fallback to cpu
|
||||
VLLM_TARGET_DEVICE = "cpu"
|
||||
|
||||
MAIN_CUDA_VERSION = "12.8"
|
||||
|
||||
|
||||
def is_sccache_available() -> bool:
|
||||
return which("sccache") is not None and \
|
||||
@ -507,7 +505,7 @@ def get_vllm_version() -> str:
|
||||
version += f"{sep}precompiled"
|
||||
else:
|
||||
cuda_version = str(get_nvcc_cuda_version())
|
||||
if cuda_version != MAIN_CUDA_VERSION:
|
||||
if cuda_version != envs.VLLM_MAIN_CUDA_VERSION:
|
||||
cuda_version_str = cuda_version.replace(".", "")[:3]
|
||||
# skip this for source tarball, required for pypi
|
||||
if "sdist" not in sys.argv:
|
||||
@ -515,7 +513,7 @@ def get_vllm_version() -> str:
|
||||
elif _is_hip():
|
||||
# Get the ROCm version
|
||||
rocm_version = get_rocm_version() or torch.version.hip
|
||||
if rocm_version and rocm_version != MAIN_CUDA_VERSION:
|
||||
if rocm_version and rocm_version != envs.VLLM_MAIN_CUDA_VERSION:
|
||||
version += f"{sep}rocm{rocm_version.replace('.', '')[:3]}"
|
||||
elif _is_tpu():
|
||||
version += f"{sep}tpu"
|
||||
|
@ -70,6 +70,7 @@ if TYPE_CHECKING:
|
||||
VLLM_VIDEO_LOADER_BACKEND: str = "opencv"
|
||||
VLLM_MM_INPUT_CACHE_GIB: int = 4
|
||||
VLLM_TARGET_DEVICE: str = "cuda"
|
||||
VLLM_MAIN_CUDA_VERSION: str = "12.8"
|
||||
MAX_JOBS: Optional[str] = None
|
||||
NVCC_THREADS: Optional[str] = None
|
||||
VLLM_USE_PRECOMPILED: bool = False
|
||||
@ -249,6 +250,11 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_TARGET_DEVICE":
|
||||
lambda: os.getenv("VLLM_TARGET_DEVICE", "cuda").lower(),
|
||||
|
||||
# Main CUDA version of vLLM, supporting [12.6, 12.8, 12.9],
|
||||
# 12.8 is the default. This follows PyTorch but can be overridden.
|
||||
"VLLM_MAIN_CUDA_VERSION":
|
||||
lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.8",
|
||||
|
||||
# Maximum number of compilation jobs to run in parallel.
|
||||
# By default this is the number of CPUs
|
||||
"MAX_JOBS":
|
||||
|
Reference in New Issue
Block a user