Add flashinfer-build.sh and register precompiled cu128 wheel in Dockerfile (#25782)

Signed-off-by: mgoin <mgoin64@gmail.com>
2025-10-20 14:53:52 +08:00 · 2025-09-26 21:54:09 -04:00
parent 3958b96bf5
commit 92da847cf5
2 changed files with 83 additions and 10 deletions
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -391,18 +391,28 @@ RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
    git clone --depth 1 --recursive --shallow-submodules \
        --branch ${FLASHINFER_GIT_REF} \
        ${FLASHINFER_GIT_REPO} flashinfer
+    # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
+    # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
+    if [[ "${CUDA_VERSION}" == 11.* ]]; then
+        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
+    elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
+        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
+    else
+        # CUDA 12.8+ supports 10.0a and 12.0
+        FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
+    fi
    pushd flashinfer
-        if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
-            # Exclude CUDA arches for older versions (11.x and 12.0-12.7)
-            # TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
-            if [[ "${CUDA_VERSION}" == 11.* ]]; then
-                FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
-            elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
-                FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
-            else
-                # CUDA 12.8+ supports 10.0a and 12.0
-                FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
+        if [[ "${CUDA_VERSION}" == 12.8.* ]] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then
+            # NOTE: To make new precompiled wheels, see tools/flashinfer-build.sh
+            echo "🏗️  Installing FlashInfer from pre-compiled wheel"
+            uv pip install --system https://wheels.vllm.ai/flashinfer-python/flashinfer_python-0.3.1-cp39-abi3-manylinux1_x86_64.whl \
+                --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+            if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
+                # Download pre-compiled cubins
+                TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
+                    python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
            fi
+        elif [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
            echo "🏗️  Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
            export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
            # HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
--- a/tools/flashinfer-build.sh
+++ b/tools/flashinfer-build.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Builds FlashInfer with AOT kernels. Requires FLASHINFER_GIT_REF and CUDA_VERSION; BUILD_WHEEL (default "true") selects wheel build vs. direct install.
+
+set -ex  # -e: stop at the first failing command; -x: trace each command as it runs
+
+# Configuration: the git ref and CUDA version are read from the environment; BUILD_WHEEL falls back to "true" when unset or empty
+FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
+FLASHINFER_GIT_REF="${FLASHINFER_GIT_REF}"
+CUDA_VERSION="${CUDA_VERSION}"
+BUILD_WHEEL="${BUILD_WHEEL:-true}"
+
+if [[ -z "${FLASHINFER_GIT_REF}" ]]; then
+    echo "❌ FLASHINFER_GIT_REF must be specified" >&2
+    exit 1
+fi
+
+if [[ -z "${CUDA_VERSION}" ]]; then
+    echo "❌ CUDA_VERSION must be specified" >&2
+    exit 1
+fi
+
+echo "🏗️  Building FlashInfer ${FLASHINFER_GIT_REF} for CUDA ${CUDA_VERSION}"
+
+# Shallow-clone the requested ref (with its submodules) into ./flashinfer
+git clone --depth 1 --recursive --shallow-submodules \
+    --branch ${FLASHINFER_GIT_REF} \
+    ${FLASHINFER_GIT_REPO} flashinfer
+
+# Select the CUDA arch list by glob-matching CUDA_VERSION
+# Older versions (11.x and 12.0-12.7) get a reduced list. NOTE(review): the 12.[0-7]* glob also matches two-digit minors such as 12.10
+if [[ "${CUDA_VERSION}" == 11.* ]]; then
+    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
+elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
+    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
+else
+    # Every other version is treated as CUDA 12.8+, which supports 10.0a and 12.0
+    FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
+fi
+
+echo "🏗️ Building FlashInfer AOT for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
+
+pushd flashinfer
+    # Export UV_TORCH_BACKEND as cu<major><minor> (e.g. 12.8.1 -> cu128) so the wheel is built for the correct CUDA version
+    export UV_TORCH_BACKEND=cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
+
+    # Build AOT kernels for the selected arches; the list is exported under both variable names before flashinfer.aot runs
+    export TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
+    export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
+    python3 -m flashinfer.aot
+    
+    if [[ "${BUILD_WHEEL}" == "true" ]]; then
+        # Build a wheel for distribution; output goes to ../flashinfer-dist, next to the clone
+        uv build --no-build-isolation --wheel --out-dir ../flashinfer-dist .
+        echo "✅ FlashInfer wheel built successfully in flashinfer-dist/"
+    else
+        # Any other BUILD_WHEEL value: install into the system environment instead (for Dockerfile)
+        uv pip install --system --no-build-isolation --force-reinstall .
+        echo "✅ FlashInfer installed successfully"
+    fi
+popd
+
+# Cleanup: remove the cloned source tree only; flashinfer-dist/ is left in place
+rm -rf flashinfer