[bugfix] fix blackwell deepep installation (#22255)

This commit is contained in:
youkaichao
2025-08-06 01:26:09 +08:00
committed by GitHub
parent 469b3ffaaa
commit 59a0b8554b
2 changed files with 12 additions and 6 deletions

View File

@ -13,16 +13,16 @@ All scripts accept a positional argument as workspace path for staging the build
## Usage
### Single-node
```bash
bash install_python_libraries.sh
# for hopper
TORCH_CUDA_ARCH_LIST="9.0" bash install_python_libraries.sh
# for blackwell
TORCH_CUDA_ARCH_LIST="10.0" bash install_python_libraries.sh
```
### Multi-node
Additional step for multi-node deployment:
```bash
bash install_python_libraries.sh
sudo bash configure_system_drivers.sh
sudo reboot # Reboot is required to load the new driver
```

View File

@ -29,6 +29,12 @@ if [ -z "$CUDA_HOME" ]; then
exit 1
fi
# TORCH_CUDA_ARCH_LIST must be supplied by the caller. Only its presence is
# checked here; the value itself is not validated.
[ -n "$TORCH_CUDA_ARCH_LIST" ] || {
    echo "TORCH_CUDA_ARCH_LIST is not set, please set it to your desired architecture."
    exit 1
}
# disable all features except IBGDA
export NVSHMEM_IBGDA_SUPPORT=1
@ -95,7 +101,7 @@ clone_repo "https://github.com/ppl-ai/pplx-kernels" "pplx-kernels" "setup.py"
cd pplx-kernels
# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
# PIP_NO_BUILD_ISOLATION=0 disables build isolation
PIP_NO_BUILD_ISOLATION=0 TORCH_CUDA_ARCH_LIST=9.0a+PTX pip install -vvv -e .
PIP_NO_BUILD_ISOLATION=0 pip install -vvv -e .
popd
# build and install deepep; requires pytorch to be installed