[bugfix] fix blackwell deepep installation (#22255)

2025-10-20 14:53:52 +08:00 · 2025-08-06 01:26:09 +08:00
parent 469b3ffaaa
commit 59a0b8554b
2 changed files with 12 additions and 6 deletions
--- a/tools/ep_kernels/README.md
+++ b/tools/ep_kernels/README.md
@@ -13,16 +13,16 @@ All scripts accept a positional argument as workspace path for staging the build

 ## Usage

-### Single-node
-
 ```bash
-bash install_python_libraries.sh
+# For Hopper GPUs
+TORCH_CUDA_ARCH_LIST="9.0" bash install_python_libraries.sh
+# For Blackwell GPUs
+TORCH_CUDA_ARCH_LIST="10.0" bash install_python_libraries.sh
 ```

-### Multi-node
+Additional step for multi-node deployment:

 ```bash
-bash install_python_libraries.sh
 sudo bash configure_system_drivers.sh
 sudo reboot # Reboot is required to load the new driver
 ```
--- a/tools/ep_kernels/install_python_libraries.sh
+++ b/tools/ep_kernels/install_python_libraries.sh
@@ -29,6 +29,12 @@ if [ -z "$CUDA_HOME" ]; then
    exit 1
 fi

+# TORCH_CUDA_ARCH_LIST must be set by the caller; only its presence is checked here, not its value
+if [ -z "$TORCH_CUDA_ARCH_LIST" ]; then
+    echo "TORCH_CUDA_ARCH_LIST is not set, please set it to your desired architecture."
+    exit 1
+fi
+
 # disable all features except IBGDA
 export NVSHMEM_IBGDA_SUPPORT=1

@@ -95,7 +101,7 @@ clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py"
 cd pplx-kernels
 # see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
 # PIP_NO_BUILD_ISOLATION=0 disables build isolation
-PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install -vvv -e  .
+PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e  .
 popd

 # build and install deepep, require pytorch installed