Compare commits


7 Commits

c3d9640b09  Use gpu_1_queue (2025-07-30 18:08:58 -04:00)
02f7780716  Consolidate args (2025-07-30 14:40:00 -04:00)
            Signed-off-by: mgoin <michael@neuralmagic.com>
61d568a19d  Update dockerfile image (2025-07-30 13:01:55 -04:00)
            Signed-off-by: mgoin <mgoin64@gmail.com>
629468aa71  Use flashinfer-python location (2025-07-30 12:58:07 -04:00)
            Signed-off-by: mgoin <michael@neuralmagic.com>
b15005dc12  Simplify! (2025-07-30 12:41:14 -04:00)
            Signed-off-by: mgoin <michael@neuralmagic.com>
67ba6a9487  Fix index (2025-07-30 11:56:36 -04:00)
            Signed-off-by: mgoin <michael@neuralmagic.com>
f626cc9300  Add FlashInfer wheel building capability to buildkite pipeline (2025-07-30 11:48:04 -04:00)
            Signed-off-by: mgoin <michael@neuralmagic.com>
5 changed files with 139 additions and 29 deletions


@@ -41,6 +41,20 @@ steps:
    env:
      DOCKER_BUILDKIT: "1"

  - block: "Build FlashInfer wheel"
    key: block-build-flashinfer-wheel
    depends_on: ~

  - label: "Build and upload FlashInfer wheel - CUDA 12.8"
    depends_on: block-build-flashinfer-wheel
    id: build-upload-flashinfer-wheel
    agents:
      queue: gpu_1_queue
    commands:
      - "bash .buildkite/scripts/build-upload-flashinfer-wheel.sh 12.8.1"
    env:
      DOCKER_BUILDKIT: "1"

  - block: "Build release image"
    depends_on: ~
    key: block-release-image-build
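
Unblocking the step simply runs the command above on a gpu_1_queue agent. A minimal local equivalent is sketched below, assuming Docker with BuildKit and AWS credentials for the vllm-wheels bucket are configured on the host:

    # Sketch of running the new pipeline step by hand (not part of the diff).
    # Docker/BuildKit and credentials for s3://vllm-wheels are assumed.
    DOCKER_BUILDKIT=1 bash .buildkite/scripts/build-upload-flashinfer-wheel.sh 12.8.1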

.buildkite/scripts/build-upload-flashinfer-wheel.sh (new file, 57 lines)

@@ -0,0 +1,57 @@
#!/usr/bin/env bash
set -ex

CUDA_VERSION="${1:-12.8.1}"
# FlashInfer version controlled in tools/flashinfer-build.sh

echo "Building FlashInfer wheel for CUDA ${CUDA_VERSION} using vLLM Dockerfile"

# Build the FlashInfer wheel using the existing Dockerfile stage
DOCKER_BUILDKIT=1 docker build \
    --build-arg max_jobs=16 \
    --build-arg USE_SCCACHE=1 \
    --build-arg CUDA_VERSION="${CUDA_VERSION}" \
    --tag flashinfer-wheel-builder:${CUDA_VERSION} \
    --target flashinfer-wheel-builder \
    --progress plain \
    -f docker/Dockerfile .

# Extract the wheel
mkdir -p artifacts/dist
docker run --rm -v $(pwd)/artifacts:/output_host flashinfer-wheel-builder:${CUDA_VERSION} \
    bash -c 'cp /output/*.whl /output_host/dist/ && chmod -R a+rw /output_host'

# Upload the wheel to S3
echo "Uploading FlashInfer wheel to S3..."
wheel_files=(artifacts/dist/*.whl)

# Check that exactly one wheel is found
if [[ ${#wheel_files[@]} -ne 1 ]]; then
    echo "Error: Expected exactly one wheel file in artifacts/dist/, but found ${#wheel_files[@]}"
    exit 1
fi

# Get the single wheel file
wheel="${wheel_files[0]}"
echo "Processing FlashInfer wheel: $wheel"

# Rename 'linux' to 'manylinux1' in the wheel filename for compatibility
new_wheel="${wheel/linux/manylinux1}"
if [[ "$wheel" != "$new_wheel" ]]; then
    mv -- "$wheel" "$new_wheel"
    wheel="$new_wheel"
    echo "Renamed wheel to: $wheel"
fi

# Extract the version from the wheel
version=$(unzip -p "$wheel" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
wheel_name=$(basename "$wheel")
echo "FlashInfer version: $version"

# Upload the wheel to S3 under flashinfer-python directory
aws s3 cp "$wheel" "s3://vllm-wheels/flashinfer-python/"

echo "✅ FlashInfer wheel built and uploaded successfully for CUDA ${CUDA_VERSION}"
echo "📦 Wheel: $wheel_name (version $version)"
ls -la artifacts/dist/
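
To illustrate the rename and upload steps, here is a sketch with an assumed wheel filename; the real name depends on the FlashInfer build, but the pattern follows the example URL kept in docker/Dockerfile:

    # Illustration only; the filename is assumed, not produced by this PR.
    wheel="artifacts/dist/flashinfer_python-0.2.9rc2-cp39-abi3-linux_x86_64.whl"
    echo "${wheel/linux/manylinux1}"
    # -> artifacts/dist/flashinfer_python-0.2.9rc2-cp39-abi3-manylinux1_x86_64.whl
    # The renamed wheel is then uploaded under s3://vllm-wheels/flashinfer-python/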

docker/Dockerfile

@@ -268,6 +268,15 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
    else \
        echo "Skipping wheel size check."; \
    fi

#################### FLASHINFER WHEEL BUILD IMAGE ####################
FROM base AS flashinfer-wheel-builder
ARG CUDA_VERSION

COPY tools/flashinfer-build.sh /tmp/flashinfer-build.sh
RUN --mount=type=cache,target=/root/.cache/uv \
    . /etc/environment && \
    BUILD_WHEEL=true /tmp/flashinfer-build.sh
#################### EXTENSION Build IMAGE ####################

#################### DEV IMAGE ####################
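
A quick way to sanity-check this stage locally is to list the wheel it leaves in /output; the image tag below is assumed to match the one used by build-upload-flashinfer-wheel.sh above:

    # Assumes the stage was already built and tagged as in the Buildkite script.
    docker run --rm flashinfer-wheel-builder:12.8.1 ls -la /output/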
@@ -391,35 +400,11 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# Install FlashInfer from source
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
-# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
-# We use `--force-reinstall --no-deps` to avoid issues with the existing FlashInfer wheel.
-ARG FLASHINFER_GIT_REF="v0.2.9rc2"
-RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
-    . /etc/environment
-    git clone --depth 1 --recursive --shallow-submodules \
-        --branch ${FLASHINFER_GIT_REF} \
-        ${FLASHINFER_GIT_REPO} flashinfer
-    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
-    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
-    if [[ "${CUDA_VERSION}" == 11.* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
-    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
-    else
-        # CUDA 12.8+ supports 10.0a and 12.0
-        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
-    fi
-    echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
-    # Needed to build AOT kernels
-    pushd flashinfer
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            python3 -m flashinfer.aot
-        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
-            uv pip install --system --no-build-isolation --force-reinstall --no-deps .
-    popd
-    rm -rf flashinfer
-BASH
+# Version controlled in tools/flashinfer-build.sh - keep in sync with requirements/cuda.txt
+COPY tools/flashinfer-build.sh /tmp/flashinfer-build.sh
+RUN --mount=type=cache,target=/root/.cache/uv \
+    . /etc/environment && \
+    /tmp/flashinfer-build.sh
COPY examples examples
COPY benchmarks benchmarks
COPY ./vllm/collect_env.py .
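
Because the ref is now pinned inside tools/flashinfer-build.sh rather than via a build ARG, one hypothetical way to try a different FlashInfer ref without editing the script is to set the environment variable it reads:

    # Hypothetical override; the script defaults to v0.2.9rc2 when unset.
    # Keep any override in sync with requirements/cuda.txt.
    FLASHINFER_GIT_REF=v0.2.9rc2 CUDA_VERSION=12.8.1 bash tools/flashinfer-build.sh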

Binary file not shown (image; 119 KiB before, 143 KiB after)

tools/flashinfer-build.sh (new executable file, 54 lines)

@@ -0,0 +1,54 @@
#!/usr/bin/env bash
set -ex

# Build FlashInfer with AOT kernels
# This script is used by both the Dockerfile and standalone wheel building

# FlashInfer configuration - keep FLASHINFER_GIT_REF in sync with requirements/cuda.txt
FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
FLASHINFER_GIT_REF="${FLASHINFER_GIT_REF:-v0.2.9rc2}"  # Must match requirements/cuda.txt
CUDA_VERSION="${CUDA_VERSION:-12.8.1}"
BUILD_WHEEL="${BUILD_WHEEL:-false}"

echo "🏗️ Building FlashInfer ${FLASHINFER_GIT_REF} for CUDA ${CUDA_VERSION}"

# Clone FlashInfer
git clone --depth 1 --recursive --shallow-submodules \
    --branch ${FLASHINFER_GIT_REF} \
    ${FLASHINFER_GIT_REPO} flashinfer

# Set CUDA arch list based on CUDA version
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
if [[ "${CUDA_VERSION}" == 11.* ]]; then
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
    # CUDA 12.8+ supports 10.0a and 12.0
    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
fi

echo "🏗️ Building FlashInfer for arches: ${FI_TORCH_CUDA_ARCH_LIST}"

# Build AOT kernels and install/build wheel
pushd flashinfer
    TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
        python3 -m flashinfer.aot

    if [[ "${BUILD_WHEEL}" == "true" ]]; then
        # Build wheel for distribution
        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
            uv pip wheel --no-deps --wheel-dir /wheels .
        mkdir -p /output && cp /wheels/*.whl /output/
        echo "✅ FlashInfer wheel built successfully"
    else
        # Install directly (for Dockerfile)
        TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
            uv pip install --system --no-build-isolation --force-reinstall --no-deps .
        echo "✅ FlashInfer installed successfully"
    fi
popd

# Cleanup
rm -rf flashinfer
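
For clarity, these are the two ways docker/Dockerfile invokes this script, shown with the default environment values above left in place:

    # flashinfer-wheel-builder stage: build a wheel and leave it in /output
    BUILD_WHEEL=true /tmp/flashinfer-build.sh
    # final vLLM image: AOT-compile and install FlashInfer into the system environment
    /tmp/flashinfer-build.sh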