mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-03 07:24:58 +08:00
Compare commits
8 Commits
csl/lintru
...
lucaskabel
| Author | SHA1 | Date | |
|---|---|---|---|
| 1207f9ab93 | |||
| fcfb6bab89 | |||
| 95bd114806 | |||
| ec68abdc38 | |||
| ee417d1806 | |||
| 4a8afeaffb | |||
| 90cba401a0 | |||
| 6e4c4d9e57 |
@ -1,15 +0,0 @@
|
||||
version: 1
|
||||
paths:
|
||||
include:
|
||||
- "**/*.py"
|
||||
exclude:
|
||||
- ".*"
|
||||
- ".*/**"
|
||||
- "**/.*/**"
|
||||
- "**/.*"
|
||||
- "**/_*/**"
|
||||
- "**/_*.py"
|
||||
- "**/test/**"
|
||||
- "**/benchmarks/**"
|
||||
- "**/test_*.py"
|
||||
- "**/*_test.py"
|
||||
@ -7,15 +7,6 @@ if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
|
||||
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
|
||||
fi
|
||||
|
||||
if [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
|
||||
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0"
|
||||
fi
|
||||
|
||||
# Compress the fatbin with -compress-mode=size for CUDA 13
|
||||
if [[ "$DESIRED_CUDA" == *"13"* ]]; then
|
||||
export TORCH_NVCC_FLAGS="-compress-mode=size"
|
||||
fi
|
||||
|
||||
SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
|
||||
source $SCRIPTPATH/aarch64_ci_setup.sh
|
||||
|
||||
|
||||
@ -77,24 +77,21 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
|
||||
wheelname = os.path.basename(wheel_path)
|
||||
os.mkdir(f"{folder}/tmp")
|
||||
os.system(f"unzip {wheel_path} -d {folder}/tmp")
|
||||
# Common libraries for all CUDA versions
|
||||
common_libs = [
|
||||
# Non-NVIDIA system libraries
|
||||
"/lib64/libgomp.so.1",
|
||||
"/usr/lib64/libgfortran.so.5",
|
||||
"/acl/build/libarm_compute.so",
|
||||
"/acl/build/libarm_compute_graph.so",
|
||||
# Common CUDA libraries (same for all versions)
|
||||
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_lapack_core.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_core.so.0",
|
||||
libs_to_copy = [
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
|
||||
"/usr/local/cuda/lib64/libcudnn.so.9",
|
||||
"/usr/local/cuda/lib64/libcublas.so.12",
|
||||
"/usr/local/cuda/lib64/libcublasLt.so.12",
|
||||
"/usr/local/cuda/lib64/libcudart.so.12",
|
||||
"/usr/local/cuda/lib64/libcufft.so.11",
|
||||
"/usr/local/cuda/lib64/libcusparse.so.12",
|
||||
"/usr/local/cuda/lib64/libcusparseLt.so.0",
|
||||
"/usr/local/cuda/lib64/libcusolver.so.11",
|
||||
"/usr/local/cuda/lib64/libcurand.so.10",
|
||||
"/usr/local/cuda/lib64/libnccl.so.2",
|
||||
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
|
||||
"/usr/local/cuda/lib64/libnvJitLink.so.12",
|
||||
"/usr/local/cuda/lib64/libnvrtc.so.12",
|
||||
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_graph.so.9",
|
||||
@ -102,41 +99,22 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
|
||||
"/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
|
||||
"/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
|
||||
"/usr/local/cuda/lib64/libcufile.so.0",
|
||||
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
|
||||
"/usr/local/cuda/lib64/libcusparse.so.12",
|
||||
"/lib64/libgomp.so.1",
|
||||
"/usr/lib64/libgfortran.so.5",
|
||||
"/acl/build/libarm_compute.so",
|
||||
"/acl/build/libarm_compute_graph.so",
|
||||
"/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
|
||||
"/usr/local/lib/libnvpl_lapack_core.so.0",
|
||||
"/usr/local/lib/libnvpl_blas_core.so.0",
|
||||
]
|
||||
|
||||
# CUDA version-specific libraries
|
||||
if "130" in desired_cuda:
|
||||
version_specific_libs = [
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
|
||||
"/usr/local/cuda/lib64/libcublas.so.13",
|
||||
"/usr/local/cuda/lib64/libcublasLt.so.13",
|
||||
"/usr/local/cuda/lib64/libcudart.so.13",
|
||||
"/usr/local/cuda/lib64/libcufft.so.12",
|
||||
"/usr/local/cuda/lib64/libcusolver.so.12",
|
||||
"/usr/local/cuda/lib64/libnvJitLink.so.13",
|
||||
"/usr/local/cuda/lib64/libnvrtc.so.13",
|
||||
"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
|
||||
if "129" in desired_cuda:
|
||||
libs_to_copy += [
|
||||
"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
|
||||
"/usr/local/cuda/lib64/libcufile.so.0",
|
||||
"/usr/local/cuda/lib64/libcufile_rdma.so.1",
|
||||
]
|
||||
elif "12" in desired_cuda:
|
||||
# Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
|
||||
minor_version = desired_cuda[-1]
|
||||
version_specific_libs = [
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
|
||||
"/usr/local/cuda/lib64/libcublas.so.12",
|
||||
"/usr/local/cuda/lib64/libcublasLt.so.12",
|
||||
"/usr/local/cuda/lib64/libcudart.so.12",
|
||||
"/usr/local/cuda/lib64/libcufft.so.11",
|
||||
"/usr/local/cuda/lib64/libcusolver.so.11",
|
||||
"/usr/local/cuda/lib64/libnvJitLink.so.12",
|
||||
"/usr/local/cuda/lib64/libnvrtc.so.12",
|
||||
f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
|
||||
]
|
||||
|
||||
# Combine all libraries
|
||||
libs_to_copy = common_libs + version_specific_libs
|
||||
|
||||
# Copy libraries to unzipped_folder/a/lib
|
||||
for lib_path in libs_to_copy:
|
||||
@ -230,7 +208,7 @@ if __name__ == "__main__":
|
||||
build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
|
||||
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
|
||||
if enable_cuda:
|
||||
build_vars += "MAX_JOBS=5 "
|
||||
build_vars = "MAX_JOBS=5 " + build_vars
|
||||
|
||||
override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
|
||||
desired_cuda = os.getenv("DESIRED_CUDA")
|
||||
|
||||
@ -120,8 +120,8 @@ If your new Docker image needs a library installed from a specific pinned commit
|
||||
If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`:
|
||||
```bash
|
||||
docker build \
|
||||
....
|
||||
--build-arg "NEW_ARG_1=${NEW_ARG_1}"
|
||||
....
|
||||
--build-arg "NEW_ARG_1=${NEW_ARG_1}"
|
||||
```
|
||||
|
||||
3. **Update Dockerfile logic**:
|
||||
|
||||
@ -64,10 +64,6 @@ FROM cuda as cuda12.9
|
||||
RUN bash ./install_cuda.sh 12.9
|
||||
ENV DESIRED_CUDA=12.9
|
||||
|
||||
FROM cuda as cuda13.0
|
||||
RUN bash ./install_cuda.sh 13.0
|
||||
ENV DESIRED_CUDA=13.0
|
||||
|
||||
FROM ${ROCM_IMAGE} as rocm
|
||||
ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||
ADD ./common/install_mkl.sh install_mkl.sh
|
||||
@ -80,10 +76,10 @@ ADD ./common/install_mnist.sh install_mnist.sh
|
||||
RUN bash ./install_mnist.sh
|
||||
|
||||
FROM base as all_cuda
|
||||
COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8
|
||||
COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6
|
||||
COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8
|
||||
COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9
|
||||
COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0
|
||||
|
||||
# Final step
|
||||
FROM ${BASE_TARGET} as final
|
||||
|
||||
@ -76,13 +76,10 @@ elif [[ "$image" == *cuda*linter* ]]; then
|
||||
elif [[ "$image" == *linter* ]]; then
|
||||
# Use a separate Dockerfile for linter to keep a small image size
|
||||
DOCKERFILE="linter/Dockerfile"
|
||||
elif [[ "$image" == *riscv* ]]; then
|
||||
# Use RISC-V specific Dockerfile
|
||||
DOCKERFILE="ubuntu-cross-riscv/Dockerfile"
|
||||
fi
|
||||
|
||||
_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152
|
||||
_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96
|
||||
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
|
||||
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
|
||||
if [[ "$image" == *rocm* ]]; then
|
||||
_UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
|
||||
_UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
|
||||
@ -114,19 +111,31 @@ case "$tag" in
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
|
||||
CUDA_VERSION=13.0.0
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.8.1
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.8.1
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
ANACONDA_PYTHON_VERSION=3.12
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
UCX_COMMIT=${_UCX_COMMIT}
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
TRITON=yes
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
|
||||
CUDA_VERSION=12.8.1
|
||||
ANACONDA_PYTHON_VERSION=3.13
|
||||
GCC_VERSION=9
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
@ -156,13 +165,13 @@ case "$tag" in
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3-clang12-onnx)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
CLANG_VERSION=12
|
||||
VISION=yes
|
||||
ONNX=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3.10-clang12)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
pytorch-linux-jammy-py3.9-clang12)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
CLANG_VERSION=12
|
||||
VISION=yes
|
||||
TRITON=yes
|
||||
@ -197,24 +206,23 @@ case "$tag" in
|
||||
UCC_COMMIT=${_UCC_COMMIT}
|
||||
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
|
||||
;;
|
||||
pytorch-linux-jammy-xpu-n-1-py3)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
pytorch-linux-jammy-xpu-2025.0-py3)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
XPU_VERSION=2025.0
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-xpu-2025.1-py3)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
XPU_VERSION=2025.1
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-xpu-n-py3)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
XPU_VERSION=2025.2
|
||||
NINJA_VERSION=1.9.0
|
||||
TRITON=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
|
||||
# TODO (huydhn): Upgrade this to Python >= 3.10
|
||||
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
@ -223,8 +231,8 @@ case "$tag" in
|
||||
DOCS=yes
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
;;
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
CUDA_VERSION=12.8.1
|
||||
CLANG_VERSION=12
|
||||
VISION=yes
|
||||
@ -235,8 +243,8 @@ case "$tag" in
|
||||
CLANG_VERSION=18
|
||||
VISION=yes
|
||||
;;
|
||||
pytorch-linux-jammy-py3.10-gcc11)
|
||||
ANACONDA_PYTHON_VERSION=3.10
|
||||
pytorch-linux-jammy-py3.9-gcc11)
|
||||
ANACONDA_PYTHON_VERSION=3.9
|
||||
GCC_VERSION=11
|
||||
VISION=yes
|
||||
KATEX=yes
|
||||
@ -277,6 +285,7 @@ case "$tag" in
|
||||
GCC_VERSION=11
|
||||
ACL=yes
|
||||
VISION=yes
|
||||
CONDA_CMAKE=yes
|
||||
OPENBLAS=yes
|
||||
# snadampal: skipping llvm src build install because the current version
|
||||
# from pytorch/llvm:9.0.1 is x86 specific
|
||||
@ -287,15 +296,13 @@ case "$tag" in
|
||||
GCC_VERSION=11
|
||||
ACL=yes
|
||||
VISION=yes
|
||||
CONDA_CMAKE=yes
|
||||
OPENBLAS=yes
|
||||
# snadampal: skipping llvm src build install because the current version
|
||||
# from pytorch/llvm:9.0.1 is x86 specific
|
||||
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
||||
INDUCTOR_BENCHMARKS=yes
|
||||
;;
|
||||
pytorch-linux-noble-riscv64-py3.12-gcc14)
|
||||
GCC_VERSION=14
|
||||
;;
|
||||
*)
|
||||
# Catch-all for builds that are not hardcoded.
|
||||
VISION=yes
|
||||
@ -416,14 +423,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
|
||||
fi
|
||||
|
||||
if [ -n "$GCC_VERSION" ]; then
|
||||
if [[ "$image" == *riscv* ]]; then
|
||||
# Check RISC-V cross-compilation toolchain version
|
||||
if !(drun riscv64-linux-gnu-gcc-${GCC_VERSION} --version 2>&1 | grep -q " $GCC_VERSION\\W"); then
|
||||
echo "RISC-V GCC_VERSION=$GCC_VERSION, but:"
|
||||
drun riscv64-linux-gnu-gcc-${GCC_VERSION} --version
|
||||
exit 1
|
||||
fi
|
||||
elif !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then
|
||||
if !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then
|
||||
echo "GCC_VERSION=$GCC_VERSION, but:"
|
||||
drun gcc --version
|
||||
exit 1
|
||||
|
||||
@ -1,2 +0,0 @@
|
||||
transformers==4.54.0
|
||||
soxr==0.5.0
|
||||
1
.ci/docker/ci_commit_pins/huggingface.txt
Normal file
1
.ci/docker/ci_commit_pins/huggingface.txt
Normal file
@ -0,0 +1 @@
|
||||
243e186efbf7fb93328dd6b34927a4e8c8f24395
|
||||
@ -1 +0,0 @@
|
||||
v2.27.7-1
|
||||
@ -1 +1 @@
|
||||
74a23feff57432129df84d8099e622773cf77925
|
||||
e03a63be43e33596f7f0a43b0f530353785e4a59
|
||||
|
||||
@ -1 +1 @@
|
||||
d0e80f39c562c70986fc548fa6e5852ad86e16e7
|
||||
ae324eeac8e102a2b40370e341460f3791353398
|
||||
|
||||
@ -10,7 +10,7 @@ else
|
||||
arch_path='sbsa'
|
||||
fi
|
||||
|
||||
NVSHMEM_VERSION=3.3.24
|
||||
NVSHMEM_VERSION=3.3.9
|
||||
|
||||
function install_cuda {
|
||||
version=$1
|
||||
@ -62,16 +62,14 @@ function install_nvshmem {
|
||||
mkdir -p "${tmpdir}" && cd "${tmpdir}"
|
||||
|
||||
# nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
|
||||
# This pattern is a lie as it is not consistent across versions, for 3.3.9 it was cuda_ver-arch-nvshhem-ver
|
||||
filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
|
||||
suffix=".tar.xz"
|
||||
url="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/linux-${arch_path}/${filename}${suffix}"
|
||||
filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
|
||||
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"
|
||||
|
||||
# download, unpack, install
|
||||
wget -q "${url}"
|
||||
tar xf "${filename}${suffix}"
|
||||
cp -a "${filename}/include/"* /usr/local/cuda/include/
|
||||
cp -a "${filename}/lib/"* /usr/local/cuda/lib64/
|
||||
tar xf "${filename}.tar.gz"
|
||||
cp -a "libnvshmem/include/"* /usr/local/cuda/include/
|
||||
cp -a "libnvshmem/lib/"* /usr/local/cuda/lib64/
|
||||
|
||||
# cleanup
|
||||
cd ..
|
||||
@ -128,6 +126,74 @@ function install_129 {
|
||||
ldconfig
|
||||
}
|
||||
|
||||
function prune_124 {
|
||||
echo "Pruning CUDA 12.4"
|
||||
#####################################################################################
|
||||
# CUDA 12.4 prune static libs
|
||||
#####################################################################################
|
||||
export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
|
||||
export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
|
||||
|
||||
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
|
||||
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
|
||||
|
||||
if [[ -n "$OVERRIDE_GENCODE" ]]; then
|
||||
export GENCODE=$OVERRIDE_GENCODE
|
||||
fi
|
||||
if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
|
||||
export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
|
||||
fi
|
||||
|
||||
# all CUDA libs except CuDNN and CuBLAS
|
||||
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
|
||||
| xargs -I {} bash -c \
|
||||
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
|
||||
|
||||
# prune CuDNN and CuBLAS
|
||||
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
|
||||
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
|
||||
|
||||
#####################################################################################
|
||||
# CUDA 12.4 prune visual tools
|
||||
#####################################################################################
|
||||
export CUDA_BASE="/usr/local/cuda-12.4/"
|
||||
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
|
||||
}
|
||||
|
||||
function prune_126 {
|
||||
echo "Pruning CUDA 12.6"
|
||||
#####################################################################################
|
||||
# CUDA 12.6 prune static libs
|
||||
#####################################################################################
|
||||
export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
|
||||
export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
|
||||
|
||||
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
|
||||
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
|
||||
|
||||
if [[ -n "$OVERRIDE_GENCODE" ]]; then
|
||||
export GENCODE=$OVERRIDE_GENCODE
|
||||
fi
|
||||
if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
|
||||
export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
|
||||
fi
|
||||
|
||||
# all CUDA libs except CuDNN and CuBLAS
|
||||
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
|
||||
| xargs -I {} bash -c \
|
||||
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
|
||||
|
||||
# prune CuDNN and CuBLAS
|
||||
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
|
||||
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
|
||||
|
||||
#####################################################################################
|
||||
# CUDA 12.6 prune visual tools
|
||||
#####################################################################################
|
||||
export CUDA_BASE="/usr/local/cuda-12.6/"
|
||||
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
|
||||
}
|
||||
|
||||
function install_128 {
|
||||
CUDNN_VERSION=9.8.0.87
|
||||
echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
|
||||
@ -146,38 +212,18 @@ function install_128 {
|
||||
ldconfig
|
||||
}
|
||||
|
||||
function install_130 {
|
||||
CUDNN_VERSION=9.12.0.46
|
||||
echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
|
||||
# install CUDA 13.0 in the same container
|
||||
install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
|
||||
|
||||
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
|
||||
install_cudnn 13 $CUDNN_VERSION
|
||||
|
||||
install_nvshmem 13 $NVSHMEM_VERSION
|
||||
|
||||
CUDA_VERSION=13.0 bash install_nccl.sh
|
||||
|
||||
CUDA_VERSION=13.0 bash install_cusparselt.sh
|
||||
|
||||
ldconfig
|
||||
}
|
||||
|
||||
# idiomatic parameter and option handling in sh
|
||||
while test $# -gt 0
|
||||
do
|
||||
case "$1" in
|
||||
12.4) install_124;
|
||||
12.4) install_124; prune_124
|
||||
;;
|
||||
12.6|12.6.*) install_126;
|
||||
12.6|12.6.*) install_126; prune_126
|
||||
;;
|
||||
12.8|12.8.*) install_128;
|
||||
;;
|
||||
12.9|12.9.*) install_129;
|
||||
;;
|
||||
13.0|13.0.*) install_130;
|
||||
;;
|
||||
*) echo "bad argument $1"; exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
@ -5,15 +5,7 @@ set -ex
|
||||
# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
|
||||
mkdir tmp_cusparselt && cd tmp_cusparselt
|
||||
|
||||
if [[ ${CUDA_VERSION:0:4} =~ "13" ]]; then
|
||||
arch_path='sbsa'
|
||||
export TARGETARCH=${TARGETARCH:-$(uname -m)}
|
||||
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
|
||||
arch_path='x86_64'
|
||||
fi
|
||||
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.8.0.4_cuda13-archive"
|
||||
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
|
||||
elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
|
||||
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
|
||||
arch_path='sbsa'
|
||||
export TARGETARCH=${TARGETARCH:-$(uname -m)}
|
||||
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
|
||||
|
||||
@ -5,7 +5,9 @@ set -ex
|
||||
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
|
||||
|
||||
function install_huggingface() {
|
||||
pip_install -r huggingface-requirements.txt
|
||||
local version
|
||||
commit=$(get_pinned_commit huggingface)
|
||||
pip_install "git+https://github.com/huggingface/transformers@${commit}"
|
||||
}
|
||||
|
||||
function install_timm() {
|
||||
@ -24,12 +26,15 @@ function install_torchbench() {
|
||||
|
||||
python install.py --continue_on_fail
|
||||
|
||||
# TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
|
||||
# is regressing speedup metric. This needs to be investigated further
|
||||
pip install transformers==4.38.1
|
||||
|
||||
echo "Print all dependencies after TorchBench is installed"
|
||||
python -mpip freeze
|
||||
popd
|
||||
|
||||
chown -R jenkins torchbench
|
||||
chown -R jenkins /opt/conda
|
||||
}
|
||||
|
||||
# Pango is needed for weasyprint which is needed for doctr
|
||||
@ -43,4 +48,4 @@ install_huggingface
|
||||
install_timm
|
||||
|
||||
# Clean up
|
||||
conda_run pip uninstall -y torch torchvision torchaudio triton torchao
|
||||
conda_run pip uninstall -y torch torchvision torchaudio triton
|
||||
|
||||
@ -7,8 +7,6 @@ if [[ ${CUDA_VERSION:0:2} == "11" ]]; then
|
||||
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt)
|
||||
elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
|
||||
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt)
|
||||
elif [[ ${CUDA_VERSION:0:2} == "13" ]]; then
|
||||
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu13.txt)
|
||||
else
|
||||
echo "Unexpected CUDA_VERSION ${CUDA_VERSION}"
|
||||
exit 1
|
||||
|
||||
@ -19,8 +19,8 @@ pip_install \
|
||||
transformers==4.36.2
|
||||
|
||||
pip_install coloredlogs packaging
|
||||
pip_install onnxruntime==1.22.1
|
||||
pip_install onnxscript==0.4.0
|
||||
pip_install onnxruntime==1.18.1
|
||||
pip_install onnxscript==0.3.1
|
||||
|
||||
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
|
||||
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
|
||||
|
||||
@ -57,7 +57,7 @@ if [ ! -f setup.py ]; then
|
||||
cd python
|
||||
fi
|
||||
|
||||
pip_install pybind11==3.0.1
|
||||
pip_install pybind11==2.13.6
|
||||
|
||||
# TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
|
||||
as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py
|
||||
|
||||
@ -44,12 +44,8 @@ function install_ucc() {
|
||||
|
||||
./autogen.sh
|
||||
|
||||
if [[ -n "$CUDA_VERSION" && $CUDA_VERSION == 13* ]]; then
|
||||
NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86"
|
||||
else
|
||||
# We only run distributed tests on Tesla M60 and A10G
|
||||
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
|
||||
fi
|
||||
# We only run distributed tests on Tesla M60 and A10G
|
||||
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
|
||||
|
||||
if [[ -n "$ROCM_VERSION" ]]; then
|
||||
if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
|
||||
|
||||
@ -65,14 +65,10 @@ function install_ubuntu() {
|
||||
|
||||
function install_rhel() {
|
||||
. /etc/os-release
|
||||
if [[ "${ID}" == "rhel" ]]; then
|
||||
if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
|
||||
echo "RHEL version ${VERSION_ID} not supported"
|
||||
exit
|
||||
fi
|
||||
elif [[ "${ID}" == "almalinux" ]]; then
|
||||
# Workaround for almalinux8 which used by quay.io/pypa/manylinux_2_28_x86_64
|
||||
VERSION_ID="8.8"
|
||||
|
||||
if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
|
||||
echo "RHEL version ${VERSION_ID} not supported"
|
||||
exit
|
||||
fi
|
||||
|
||||
dnf install -y 'dnf-command(config-manager)'
|
||||
@ -150,11 +146,11 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
|
||||
XPU_DRIVER_VERSION="/lts/2350"
|
||||
fi
|
||||
|
||||
# Default use Intel® oneAPI Deep Learning Essentials 2025.1
|
||||
if [[ "$XPU_VERSION" == "2025.2" ]]; then
|
||||
XPU_PACKAGES="intel-deep-learning-essentials-2025.2"
|
||||
else
|
||||
# Default use Intel® oneAPI Deep Learning Essentials 2025.0
|
||||
if [[ "$XPU_VERSION" == "2025.1" ]]; then
|
||||
XPU_PACKAGES="intel-deep-learning-essentials-2025.1"
|
||||
else
|
||||
XPU_PACKAGES="intel-deep-learning-essentials-2025.0"
|
||||
fi
|
||||
|
||||
# The installation depends on the base OS
|
||||
|
||||
@ -69,11 +69,6 @@ RUN bash ./install_cuda.sh 12.9
|
||||
RUN bash ./install_magma.sh 12.9
|
||||
RUN ln -sf /usr/local/cuda-12.9 /usr/local/cuda
|
||||
|
||||
FROM cuda as cuda13.0
|
||||
RUN bash ./install_cuda.sh 13.0
|
||||
RUN bash ./install_magma.sh 13.0
|
||||
RUN ln -sf /usr/local/cuda-13.0 /usr/local/cuda
|
||||
|
||||
FROM cpu as rocm
|
||||
ARG ROCM_VERSION
|
||||
ARG PYTORCH_ROCM_ARCH
|
||||
|
||||
@ -175,6 +175,6 @@ ENV XPU_DRIVER_TYPE ROLLING
|
||||
RUN python3 -m pip install --upgrade pip && \
|
||||
python3 -mpip install cmake==3.28.4
|
||||
ADD ./common/install_xpu.sh install_xpu.sh
|
||||
ENV XPU_VERSION 2025.2
|
||||
ENV XPU_VERSION 2025.1
|
||||
RUN bash ./install_xpu.sh && rm install_xpu.sh
|
||||
RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd
|
||||
|
||||
@ -67,12 +67,6 @@ case ${image} in
|
||||
DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"
|
||||
MANY_LINUX_VERSION="2_28"
|
||||
;;
|
||||
manylinux2_28-builder:cuda13*)
|
||||
TARGET=cuda_final
|
||||
GPU_IMAGE=amd64/almalinux:8
|
||||
DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"
|
||||
MANY_LINUX_VERSION="2_28"
|
||||
;;
|
||||
manylinuxaarch64-builder:cuda*)
|
||||
TARGET=cuda_final
|
||||
GPU_IMAGE=amd64/almalinux:8
|
||||
|
||||
@ -263,6 +263,11 @@ scipy==1.14.1 ; python_version >= "3.12"
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
|
||||
tb-nightly==2.13.0a20230426
|
||||
#Description: TensorBoard
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
|
||||
# needed by torchgen utils
|
||||
typing-extensions>=4.10.0
|
||||
#Description: type hints for python
|
||||
@ -339,7 +344,7 @@ onnx==1.18.0
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
|
||||
onnxscript==0.4.0
|
||||
onnxscript==0.3.1
|
||||
#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
|
||||
#Pinned versions:
|
||||
#test that import:
|
||||
@ -379,7 +384,7 @@ dataclasses_json==0.6.7
|
||||
cmake==4.0.0
|
||||
#Description: required for building
|
||||
|
||||
tlparse==0.4.0
|
||||
tlparse==0.3.30
|
||||
#Description: required for log parsing
|
||||
|
||||
cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
sphinx==5.3.0
|
||||
#Description: This is used to generate PyTorch docs
|
||||
#Pinned versions: 5.3.0
|
||||
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
|
||||
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2
|
||||
|
||||
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
|
||||
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
|
||||
|
||||
@ -1,155 +0,0 @@
|
||||
# Cross-compilation Docker container for RISC-V architecture
|
||||
ARG UBUNTU_VERSION
|
||||
FROM --platform=linux/amd64 ubuntu:${UBUNTU_VERSION} as base
|
||||
|
||||
ARG UBUNTU_VERSION
|
||||
|
||||
ENV GCC_VERSION=14
|
||||
ENV PYTHON_VERSION=3.12.3
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV CC=riscv64-linux-gnu-gcc-${GCC_VERSION}
|
||||
ENV CXX=riscv64-linux-gnu-g++-${GCC_VERSION}
|
||||
ENV QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/
|
||||
ENV SYSROOT=/opt/sysroot
|
||||
|
||||
# Install basic dependencies
|
||||
RUN apt-get update && apt-get install -y \
|
||||
ninja-build \
|
||||
autoconf \
|
||||
automake \
|
||||
libtool \
|
||||
patchelf \
|
||||
ccache \
|
||||
git \
|
||||
wget \
|
||||
python3-pip \
|
||||
python3-venv \
|
||||
python-is-python3 \
|
||||
cmake \
|
||||
sudo \
|
||||
lsb-release \
|
||||
gcc-${GCC_VERSION}-riscv64-linux-gnu \
|
||||
g++-${GCC_VERSION}-riscv64-linux-gnu \
|
||||
pkg-config \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install user
|
||||
COPY ./common/install_user.sh install_user.sh
|
||||
RUN bash ./install_user.sh && rm install_user.sh
|
||||
|
||||
FROM base as python
|
||||
ARG ZLIB_VERSION=1.3.1
|
||||
ARG FFI_VERSION=3.4.6
|
||||
ARG BZ2_VERSION=1.0.8
|
||||
ARG XZ_VERSION=5.4.6
|
||||
ARG OPENSSL_VERSION=3.2.1
|
||||
|
||||
# Set up sysroot directory for dependencies
|
||||
ENV PKG_CONFIG_PATH=${SYSROOT}/lib/pkgconfig
|
||||
ENV PKG_CONFIG_SYSROOT_DIR=${SYSROOT}
|
||||
|
||||
WORKDIR /opt
|
||||
|
||||
# Build zlib (for compression)
|
||||
RUN echo "--- Building zlib ---" \
|
||||
&& wget -c https://www.zlib.net/zlib-${ZLIB_VERSION}.tar.gz \
|
||||
&& tar -xf zlib-${ZLIB_VERSION}.tar.gz --no-same-permissions --no-same-owner \
|
||||
&& cd zlib-${ZLIB_VERSION}/ \
|
||||
&& mkdir build && cd build \
|
||||
&& ../configure --prefix=${SYSROOT} \
|
||||
&& make -j$(nproc) && make install \
|
||||
&& cd ../..
|
||||
|
||||
# Build libffi (for ctypes module)
|
||||
RUN echo "--- Building libffi ---" \
|
||||
&& wget -c https://github.com/libffi/libffi/releases/download/v${FFI_VERSION}/libffi-${FFI_VERSION}.tar.gz \
|
||||
&& tar -xf libffi-${FFI_VERSION}.tar.gz --no-same-permissions --no-same-owner \
|
||||
&& cd libffi-${FFI_VERSION}/ \
|
||||
&& mkdir build && cd build \
|
||||
&& ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \
|
||||
&& make -j$(nproc) && make install \
|
||||
&& cd ../..
|
||||
|
||||
# Build bzip2 (for bz2 module)
|
||||
RUN echo "--- Building bzip2 ---" \
|
||||
&& wget -c https://sourceware.org/pub/bzip2/bzip2-${BZ2_VERSION}.tar.gz \
|
||||
&& tar -xf bzip2-${BZ2_VERSION}.tar.gz --no-same-permissions --no-same-owner \
|
||||
&& cd bzip2-${BZ2_VERSION}/ \
|
||||
&& make CC=riscv64-linux-gnu-gcc-${GCC_VERSION} bzip2 bzip2recover libbz2.a \
|
||||
&& make CC=riscv64-linux-gnu-gcc-${GCC_VERSION} -f Makefile-libbz2_so \
|
||||
&& make install PREFIX=${SYSROOT} \
|
||||
&& cp libbz2.so.${BZ2_VERSION} ${SYSROOT}/lib/ \
|
||||
&& cd ${SYSROOT}/lib/ \
|
||||
&& ln -sf libbz2.so.${BZ2_VERSION} libbz2.so.1.0 \
|
||||
&& ln -sf libbz2.so.1.0 libbz2.so \
|
||||
&& cd /opt/
|
||||
|
||||
# Build xz (for lzma module)
|
||||
RUN echo "--- Building xz ---" \
|
||||
&& wget -c https://github.com/tukaani-project/xz/releases/download/v${XZ_VERSION}/xz-${XZ_VERSION}.tar.gz \
|
||||
&& tar -xf xz-${XZ_VERSION}.tar.gz --no-same-permissions --no-same-owner \
|
||||
&& cd xz-${XZ_VERSION} \
|
||||
&& mkdir build && cd build \
|
||||
&& ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \
|
||||
&& make -j$(nproc) && make install \
|
||||
&& cd ../..
|
||||
|
||||
# Build OpenSSL (for ssl module)
|
||||
RUN echo "--- Building OpenSSL ---" \
|
||||
&& wget -c https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz \
|
||||
&& tar -xf openssl-${OPENSSL_VERSION}.tar.gz --no-same-permissions --no-same-owner \
|
||||
&& cd openssl-${OPENSSL_VERSION}/ \
|
||||
&& mkdir build && cd build \
|
||||
&& ../Configure linux64-riscv64 --prefix=${SYSROOT} \
|
||||
&& make -j$(nproc) && make install_sw \
|
||||
&& cd ../..
|
||||
|
||||
# Build SQLite3 (for sqlite3 module)
|
||||
RUN echo "--- Building SQLite3 ---" \
|
||||
&& wget -c https://www.sqlite.org/2024/sqlite-autoconf-3450200.tar.gz \
|
||||
&& tar -xf sqlite-autoconf-3450200.tar.gz --no-same-permissions --no-same-owner \
|
||||
&& cd sqlite-autoconf-3450200 \
|
||||
&& mkdir build && cd build \
|
||||
&& ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \
|
||||
&& make -j$(nproc) && make install \
|
||||
&& cd ../..
|
||||
|
||||
# Build and install RISC-V Python with all modules
|
||||
RUN wget -c https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
|
||||
&& tar -xf Python-${PYTHON_VERSION}.tgz --no-same-permissions --no-same-owner \
|
||||
&& cd Python-${PYTHON_VERSION} \
|
||||
&& mkdir build && cd build \
|
||||
&& ../configure \
|
||||
--host=riscv64-linux-gnu \
|
||||
--build=x86_64-linux-gnu \
|
||||
--prefix=${SYSROOT} \
|
||||
--enable-shared \
|
||||
--disable-ipv6 \
|
||||
--with-build-python=/usr/bin/python3 \
|
||||
--with-ensurepip=no \
|
||||
ac_cv_file__dev_ptmx=yes \
|
||||
ac_cv_file__dev_ptc=no \
|
||||
&& make -j$(nproc) \
|
||||
&& make install
|
||||
|
||||
FROM base as final
|
||||
COPY --from=python /opt/sysroot /opt/sysroot
|
||||
|
||||
# Install crossenv and cmake
|
||||
RUN pip install crossenv cmake==4.0.0 --break-system-packages \
|
||||
&& /usr/bin/python3 -m crossenv ${SYSROOT}/bin/python3 /opt/riscv-cross-env
|
||||
|
||||
# Add pip-installed cmake binaries to PATH
|
||||
ENV PATH="/usr/local/bin:${PATH}"
|
||||
|
||||
# Set up cross Python environment
|
||||
SHELL ["/bin/bash", "-c"]
|
||||
RUN source /opt/riscv-cross-env/bin/activate \
|
||||
&& pip install setuptools pyyaml typing_extensions wheel
|
||||
|
||||
# Set default environment variables for PyTorch build
|
||||
ENV Python_ROOT_DIR=${SYSROOT}
|
||||
ENV OPENSSL_ROOT_DIR=${SYSROOT}
|
||||
|
||||
USER jenkins
|
||||
CMD ["bash"]
|
||||
@ -96,11 +96,11 @@ ARG ANACONDA_PYTHON_VERSION
|
||||
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
|
||||
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
|
||||
COPY ./common/common_utils.sh common_utils.sh
|
||||
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
|
||||
COPY ci_commit_pins/huggingface.txt huggingface.txt
|
||||
COPY ci_commit_pins/timm.txt timm.txt
|
||||
COPY ci_commit_pins/torchbench.txt torchbench.txt
|
||||
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
|
||||
|
||||
# (optional) Install non-default Ninja version
|
||||
ARG NINJA_VERSION
|
||||
|
||||
@ -56,10 +56,10 @@ RUN rm install_openssl.sh
|
||||
ARG INDUCTOR_BENCHMARKS
|
||||
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
|
||||
COPY ./common/common_utils.sh common_utils.sh
|
||||
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
|
||||
COPY ci_commit_pins/huggingface.txt huggingface.txt
|
||||
COPY ci_commit_pins/timm.txt timm.txt
|
||||
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
|
||||
|
||||
# Install XPU Dependencies
|
||||
ARG XPU_VERSION
|
||||
|
||||
@ -66,7 +66,6 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
|
||||
# (optional) Install UCC
|
||||
ARG UCX_COMMIT
|
||||
ARG UCC_COMMIT
|
||||
ARG CUDA_VERSION
|
||||
ENV UCX_COMMIT $UCX_COMMIT
|
||||
ENV UCC_COMMIT $UCC_COMMIT
|
||||
ENV UCX_HOME /usr
|
||||
@ -97,11 +96,11 @@ RUN rm install_openssl.sh
|
||||
ARG INDUCTOR_BENCHMARKS
|
||||
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
|
||||
COPY ./common/common_utils.sh common_utils.sh
|
||||
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
|
||||
COPY ci_commit_pins/huggingface.txt huggingface.txt
|
||||
COPY ci_commit_pins/timm.txt timm.txt
|
||||
COPY ci_commit_pins/torchbench.txt torchbench.txt
|
||||
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
|
||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
|
||||
|
||||
ARG TRITON
|
||||
ARG TRITON_CPU
|
||||
@ -182,6 +181,7 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
|
||||
RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi
|
||||
|
||||
# AWS specific CUDA build guidance
|
||||
ENV TORCH_CUDA_ARCH_LIST Maxwell
|
||||
ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all"
|
||||
ENV CUDA_PATH /usr/local/cuda
|
||||
|
||||
|
||||
@ -7,4 +7,4 @@ set -ex
|
||||
|
||||
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
||||
|
||||
USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
|
||||
USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
|
||||
|
||||
@ -1,31 +0,0 @@
|
||||
# 🔧 Lumen_cli
|
||||
A Python CLI tool for building and testing PyTorch-based components, using a YAML configuration file for structured, repeatable workflows.
|
||||
|
||||
|
||||
## Features
|
||||
- **Build**
|
||||
- external projects (e.g. vLLM)
|
||||
|
||||
## 📦 Installation
|
||||
Run the following at the root of the PyTorch repo:
|
||||
```bash
|
||||
pip install -e .ci/lumen_cli
|
||||
```
|
||||
|
||||
## Run the cli tool
|
||||
The CLI tool must be run from the root of the PyTorch repo. For example, to build the external vLLM target:
|
||||
```bash
|
||||
python -m cli.run build external vllm
|
||||
```
|
||||
This runs the build steps for the vLLM project with the default behaviour.
|
||||
|
||||
to see help messages, run
|
||||
```bash
|
||||
python3 -m cli.run --help
|
||||
```
|
||||
|
||||
## Add customized external build logics
|
||||
To add the build logic for a new external target:
|
||||
1. create the build function in cli/lib folder
|
||||
2. register your target and the main build function at EXTERNAL_BUILD_TARGET_DISPATCH in `cli/build_cli/register_build.py`
|
||||
3. [optional] create your ci config file in .github/ci_configs/${EXTERNAL_PACKAGE_NAME}.yaml
|
||||
@ -1,37 +0,0 @@
|
||||
import argparse
|
||||
import logging
|
||||
|
||||
from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec
|
||||
from cli.lib.core.vllm.vllm_build import VllmBuildRunner
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Maps targets to their argparse configuration and runner
|
||||
# it adds new target to path python -m cli.run build external {target} with buildrunner
|
||||
_TARGETS: dict[str, TargetSpec] = {
|
||||
"vllm": {
|
||||
"runner": VllmBuildRunner,
|
||||
"help": "Build vLLM using docker buildx.",
|
||||
}
|
||||
# add yours ...
|
||||
}
|
||||
|
||||
|
||||
def register_build_commands(subparsers: argparse._SubParsersAction) -> None:
    """Attach the ``build`` command tree to the top-level CLI parser.

    Creates ``build`` with a required ``build_command`` sub-command, adds
    ``build external`` beneath it, and registers every entry of ``_TARGETS``
    as a target of ``build external``.

    Args:
        subparsers: Sub-parser action of the parent parser to extend.
    """
    build = subparsers.add_parser(
        "build",
        help="Build related commands",
        formatter_class=RichHelp,
    )
    build_commands = build.add_subparsers(dest="build_command", required=True)

    # One line per known target, shown in the `build external --help` text.
    target_lines = [
        f" {name:12} {spec.get('help', '')}" for name, spec in _TARGETS.items()
    ]
    description = "Build third-party targets.\n\nAvailable targets:\n" + "\n".join(
        target_lines
    )

    external = build_commands.add_parser(
        "external",
        help="Build external targets",
        description=description,
        formatter_class=RichHelp,
    )
    register_targets(external, _TARGETS)
|
||||
@ -1,71 +0,0 @@
|
||||
"""
|
||||
Cli Argparser Utility helpers for CLI tasks.
|
||||
|
||||
"""
|
||||
|
||||
import argparse
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
|
||||
try:
|
||||
from typing import Any, Callable, Required, TypedDict # Python 3.11+
|
||||
except ImportError:
|
||||
from typing import Any, Callable, TypedDict
|
||||
|
||||
from typing_extensions import Required # Fallback for Python <3.11
|
||||
|
||||
|
||||
class BaseRunner(ABC):
    """Abstract base for CLI runners.

    A runner is constructed with the parsed CLI arguments and performs its
    work in :meth:`run`.
    """

    def __init__(self, args: Any) -> None:
        # Keep the parsed arguments available to subclasses.
        self.args = args

    @abstractmethod
    def run(self) -> None:
        """Execute the runner's main logic; subclasses must implement this."""
|
||||
|
||||
|
||||
# Pretty help: keep newlines + show defaults
|
||||
class RichHelp(
    argparse.ArgumentDefaultsHelpFormatter,
    argparse.RawDescriptionHelpFormatter,
):
    """Help formatter that combines argparse's defaults and raw-description formatters."""
|
||||
|
||||
|
||||
class TargetSpec(TypedDict, total=False):
    """Specification of one CLI target sub-command.

    Only ``runner`` is required; the remaining keys are optional.
    """

    # Runner class instantiated with the parsed args; its run() is invoked.
    runner: Required[type[BaseRunner]]
    # Short help string shown in the parent parser's target listing.
    help: str
    # Long description for the target's own --help output.
    description: str
    # Hook that adds target-specific arguments to the target's parser.
    add_arguments: Callable[[argparse.ArgumentParser], None]
|
||||
|
||||
|
||||
def register_targets(
    parser: argparse.ArgumentParser,
    target_specs: dict[str, TargetSpec],
    common_args: Callable[[argparse.ArgumentParser], None] = lambda _: None,
) -> None:
    """Register one sub-command per entry of ``target_specs`` on ``parser``.

    Args:
        parser: Parser that receives the required ``target`` sub-command.
        target_specs: Mapping of target name to its specification.
        common_args: Hook applied to every target parser after the
            target-specific arguments have been added.
    """
    choices = "{" + ",".join(target_specs.keys()) + "}"
    subcommands = parser.add_subparsers(
        dest="target",
        required=True,
        metavar=choices,
    )

    for name, spec in target_specs.items():
        # An explicit description wins; otherwise use the runner's docstring.
        description = spec.get("description") or spec["runner"].__doc__ or ""

        target_parser = subcommands.add_parser(
            name,
            help=spec.get("help", ""),
            description=description.strip(),
            formatter_class=RichHelp,
        )

        # The runner class is bound as a default argument so each target
        # dispatches to its own runner rather than the loop's last value.
        def _dispatch(args, cls=spec["runner"]):
            return cls(args).run()

        target_parser.set_defaults(func=_dispatch, _runner_class=spec["runner"])

        extra_arguments = spec.get("add_arguments")
        if callable(extra_arguments):
            extra_arguments(target_parser)
        if common_args:
            common_args(target_parser)
|
||||
@ -1,42 +0,0 @@
|
||||
"""
|
||||
Docker Utility helpers for CLI tasks.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
import docker
|
||||
from docker.errors import APIError, NotFound
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# lazy singleton so we don't reconnect every call
|
||||
_docker_client: Optional[docker.DockerClient] = None
|
||||
|
||||
|
||||
def _get_client() -> docker.DockerClient:
    """Return the module-wide Docker client, creating it on first use."""
    global _docker_client
    if _docker_client is not None:
        return _docker_client
    _docker_client = docker.from_env()
    return _docker_client
|
||||
|
||||
|
||||
def local_image_exists(
    image_name: str, client: Optional[docker.DockerClient] = None
) -> bool:
    """Report whether ``image_name`` can be fetched from the Docker client.

    Args:
        image_name: Image reference to look up; an empty value returns False.
        client: Docker client to query; the shared module client is used
            when omitted.

    Returns:
        True when the lookup succeeds, False when it raises ``NotFound`` or
        ``APIError`` (the error is logged).
    """
    if not image_name:
        return False

    docker_client = client or _get_client()
    try:
        docker_client.images.get(image_name)
    except (NotFound, APIError) as err:
        # Prefer the API's own explanation when the exception carries one.
        reason = getattr(err, "explanation", str(err))
        logger.error(
            "Error when checking Docker image '%s': %s",
            image_name,
            reason,
        )
        return False
    else:
        return True
|
||||
@ -1,110 +0,0 @@
|
||||
"""
|
||||
Environment Variables and Dataclasses Utility helpers for CLI tasks.
|
||||
"""
|
||||
|
||||
import os
|
||||
from dataclasses import field, fields, is_dataclass, MISSING
|
||||
from pathlib import Path
|
||||
from textwrap import indent
|
||||
from typing import Optional, Union
|
||||
|
||||
from cli.lib.common.utils import str2bool
|
||||
|
||||
|
||||
def get_env(name: str, default: str = "") -> str:
    """Return environment variable ``name``, or ``default`` when it is unset or empty."""
    value = os.environ.get(name)
    if value:
        return value
    return default
|
||||
|
||||
|
||||
def env_path_optional(
    name: str,
    default: Optional[Union[str, Path]] = None,
    resolve: bool = True,
) -> Optional[Path]:
    """Read environment variable ``name`` as a ``Path``.

    Args:
        name: Environment variable to read.
        default: Value used when the variable is unset or empty.
        resolve: When True, return the resolved path.

    Returns:
        The path, or None when neither the variable nor ``default`` gives a value.
    """
    raw = get_env(name) or default
    if raw:
        candidate = Path(raw)
        return candidate.resolve() if resolve else candidate
    return None
|
||||
|
||||
|
||||
def env_path(
    name: str,
    default: Optional[Union[str, Path]] = None,
    resolve: bool = True,
) -> Path:
    """Read environment variable ``name`` as a required ``Path``.

    Raises:
        ValueError: If neither the variable nor ``default`` provides a value.
    """
    found = env_path_optional(name, default, resolve)
    if found:
        return found
    raise ValueError(f"Missing path value for {name}")
|
||||
|
||||
|
||||
def env_bool(
    name: str,
    default: bool = False,
) -> bool:
    """Read environment variable ``name`` as a boolean.

    Returns ``default`` when the variable is unset or empty; otherwise the
    value is converted with ``str2bool``.
    """
    raw = get_env(name)
    return str2bool(raw) if raw else default
|
||||
|
||||
|
||||
def env_bool_field(
    name: str,
    default: bool = False,
):
    """Build a dataclass field whose default is read from env var ``name`` as a bool."""

    def _read() -> bool:
        # Runs each time the default factory is invoked, not at definition time.
        return env_bool(name, default)

    return field(default_factory=_read)
|
||||
|
||||
|
||||
def env_path_field(
    name: str,
    default: Union[str, Path] = "",
    *,
    resolve: bool = True,
) -> Path:
    """Build a dataclass field whose default is read from env var ``name`` as a Path.

    The return annotation names the field's value type; the object returned
    is the ``dataclasses.field`` descriptor.
    """

    def _read() -> Path:
        return env_path(name, default, resolve=resolve)

    return field(default_factory=_read)
|
||||
|
||||
|
||||
def env_str_field(
    name: str,
    default: str = "",
) -> str:
    """Build a dataclass field whose default is read from env var ``name`` as a string.

    The return annotation names the field's value type; the object returned
    is the ``dataclasses.field`` descriptor.
    """

    def _read() -> str:
        return get_env(name, default)

    return field(default_factory=_read)
|
||||
|
||||
|
||||
def generate_dataclass_help(cls) -> str:
    """Render one ``name = default`` line per field of dataclass ``cls``.

    A field's value is its plain default, else the result of calling its
    default factory (``<error: ...>`` if the factory raises), else
    ``<required>``.

    Raises:
        TypeError: If ``cls`` is not a dataclass.
    """
    if not is_dataclass(cls):
        raise TypeError(f"{cls} is not a dataclass")

    rendered = []
    for f in fields(cls):
        if f.default is not MISSING:
            shown = f.default
        elif f.default_factory is not MISSING:
            try:
                shown = f.default_factory()
            except Exception as e:
                # Show the failure in the help text instead of aborting it.
                shown = f"<error: {e}>"
        else:
            shown = "<required>"
        rendered.append(f"{f.name:<22} = {repr(shown)}")

    return indent("\n".join(rendered), " ")
|
||||
|
||||
|
||||
def with_params_help(params_cls: type, title: str = "Parameter defaults"):
    """Class decorator that appends the help table for ``params_cls`` to a docstring.

    The table comes from ``generate_dataclass_help(params_cls)`` and is added
    under a ``title:`` heading.

    Raises:
        TypeError: If ``params_cls`` is not a dataclass.
    """
    if not is_dataclass(params_cls):
        raise TypeError(f"{params_cls} must be a dataclass")

    def _append_help(cls: type) -> type:
        table = generate_dataclass_help(params_cls)
        existing = cls.__doc__ or ""
        cls.__doc__ = existing + f"\n\n{title}:\n{table}"
        return cls

    return _append_help
|
||||
@ -1,143 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import textwrap
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING
|
||||
|
||||
from cli.lib.common.utils import get_wheels
|
||||
from jinja2 import Template
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable, Mapping
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_TPL_CONTENT = Template(
|
||||
textwrap.dedent("""\
|
||||
## {{ title }}
|
||||
|
||||
```{{ lang }}
|
||||
{{ content }}
|
||||
```
|
||||
""")
|
||||
)
|
||||
|
||||
_TPL_LIST_ITEMS = Template(
|
||||
textwrap.dedent("""\
|
||||
## {{ title }}
|
||||
{% for it in items %}
|
||||
- {{ it.pkg }}: {{ it.relpath }}
|
||||
{% else %}
|
||||
_(no item found)_
|
||||
{% endfor %}
|
||||
""")
|
||||
)
|
||||
|
||||
_TPL_TABLE = Template(
|
||||
textwrap.dedent("""\
|
||||
{%- if rows %}
|
||||
| {{ cols | join(' | ') }} |
|
||||
|{%- for _ in cols %} --- |{%- endfor %}
|
||||
{%- for r in rows %}
|
||||
| {%- for c in cols %} {{ r.get(c, "") }} |{%- endfor %}
|
||||
{%- endfor %}
|
||||
{%- else %}
|
||||
_(no data)_
|
||||
{%- endif %}
|
||||
""")
|
||||
)
|
||||
|
||||
|
||||
def gh_summary_path() -> Path | None:
    """Return ``GITHUB_STEP_SUMMARY`` as a Path, or None when it is unset or empty."""
    raw = os.environ.get("GITHUB_STEP_SUMMARY")
    if not raw:
        return None
    return Path(raw)
|
||||
|
||||
|
||||
def write_gh_step_summary(md: str, *, append_content: bool = True) -> bool:
    """Write Markdown to the GitHub step summary file, if one is configured.

    The text is dedented, stripped, and terminated with a single newline
    before being written.

    Args:
        md: Markdown content to write.
        append_content: Append to the file when True (default); otherwise
            overwrite it.

    Returns:
        True if the content was written, False if no summary path is set.
    """
    summary_file = gh_summary_path()
    if not summary_file:
        logger.info("[gh-summary] GITHUB_STEP_SUMMARY not set, skipping write.")
        return False

    payload = textwrap.dedent(md).strip() + "\n"
    with summary_file.open("a" if append_content else "w", encoding="utf-8") as out:
        out.write(payload)
    return True
|
||||
|
||||
|
||||
def md_heading(text: str, level: int = 2) -> str:
    """Return a Markdown heading line; ``level`` is clamped to the range 1-6."""
    clamped = min(level, 6)
    if clamped < 1:
        clamped = 1
    hashes = "#" * clamped
    return f"{hashes} {text}\n"
|
||||
|
||||
|
||||
def md_details(summary: str, content: str) -> str:
    """Return a collapsible ``<details>`` block wrapping ``content`` under ``summary``."""
    pieces = [
        "<details>",
        f"<summary>{summary}</summary>",
        "",  # blank line before the content
        f"{content}",
        "",  # blank line after the content
        "</details>",
        "",  # trailing newline
    ]
    return "\n".join(pieces)
|
||||
|
||||
|
||||
def summarize_content_from_file(
    output_dir: Path,
    freeze_file: str,
    title: str = "Content from file",
    code_lang: str = "",  # e.g. "text" or "ini"
) -> bool:
    """Write the contents of ``output_dir / freeze_file`` to the step summary.

    Args:
        output_dir: Directory containing the file.
        freeze_file: File name relative to ``output_dir``.
        title: Title passed to the content template.
        code_lang: Language tag passed to the content template.

    Returns:
        False if the file does not exist; otherwise the result of
        ``write_gh_step_summary``.
    """
    source = Path(output_dir) / freeze_file
    if not source.exists():
        return False
    body = source.read_text(encoding="utf-8").strip()
    return write_gh_step_summary(render_content(body, title=title, lang=code_lang))
|
||||
|
||||
|
||||
def summarize_wheels(path: Path, title: str = "Wheels", max_depth: int = 3):
    """Write the wheels found under ``path`` to the step summary.

    Returns:
        False when no wheels are found; otherwise the result of
        ``write_gh_step_summary``.
    """
    wheels = get_wheels(path, max_depth=max_depth)
    if wheels:
        return write_gh_step_summary(render_list(wheels, title=title))
    return False
|
||||
|
||||
|
||||
def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str:
    """Render a list of dicts as a Markdown table via the Jinja table template.

    Columns are the union of all row keys, in the order each key is first
    seen, so the same input always yields the same column order.

    Args:
        rows: Mappings of column name to cell value.

    Returns:
        The rendered table, stripped and terminated with a single newline.
    """
    rows = list(rows)
    # dict.fromkeys de-duplicates while keeping first-seen order; a set here
    # would make the column order depend on string hashing.
    cols = list(dict.fromkeys(k for r in rows for k in r))
    md = _TPL_TABLE.render(cols=cols, rows=rows).strip() + "\n"
    return md
|
||||
|
||||
|
||||
def render_list(
    items: Iterable[Mapping[str, str]],
    *,
    title: str = "List",
) -> str:
    """Render ``items`` as a titled Markdown list via the list template.

    Args:
        items: Entries exposing ``pkg`` and ``relpath``, which the template
            reads for each item (the shape produced by ``get_wheels``).
        title: Heading text for the list.

    Returns:
        The rendered Markdown.
    """
    return _TPL_LIST_ITEMS.render(title=title, items=items)
|
||||
|
||||
|
||||
def render_content(
    content: str,
    *,
    title: str = "Content",
    lang: str = "text",
) -> str:
    """Render ``content`` as a titled fenced code block via the content template.

    Args:
        content: Text placed inside the fence.
        title: Heading text above the fence.
        lang: Language tag of the fence.
    """
    return _TPL_CONTENT.render(title=title, content=content, lang=lang)
|
||||
@ -1,69 +0,0 @@
|
||||
"""
|
||||
Git Utility helpers for CLI tasks.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
from cli.lib.common.path_helper import remove_dir
|
||||
from git import GitCommandError, RemoteProgress, Repo
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PrintProgress(RemoteProgress):
    """Progress reporter that logs git operation progress."""

    def __init__(self, interval: int = 5):
        super().__init__()
        # Last percentage that was logged; -1 means nothing logged yet.
        self._last_percent = -1
        # Only percentages divisible by this value are logged.
        self._interval = interval

    def update(self, op_code, cur, max=None, message=""):
        """Log progress when the percentage changes to a multiple of the interval.

        When ``cur`` or ``max`` is missing, the current line or ``message``
        is logged as-is if there is one.
        """
        text = self._cur_line or message
        if not (max and cur):
            if text:
                logger.info(text)
            return

        percent = int(cur / max * 100)
        if percent == self._last_percent or percent % self._interval != 0:
            return
        self._last_percent = percent
        logger.info("Progress: %d%% - %s", percent, text)
|
||||
|
||||
|
||||
def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules=False):
    """Clone ``repo`` and check out the commit pinned for ``target``.

    Args:
        target: Name used to look up the pinned commit and as the default
            destination.
        repo: Repository URL passed to ``Repo.clone_from``.
        dst: Destination directory; defaults to ``target``. It is removed
            before cloning.
        update_submodules: When True, initialise and update submodules
            recursively.

    Returns:
        A ``(repository, commit)`` tuple.

    Raises:
        GitCommandError: Logged and re-raised when a git operation fails.
    """
    dst = dst or target

    try:
        logger.info("Cloning %s to %s", target, dst)

        # Start from a clean destination, then clone and fetch all refs/tags.
        remove_dir(dst)
        cloned = Repo.clone_from(repo, dst, progress=PrintProgress())
        cloned.git.fetch("--all", "--tags")

        # Move the working tree to the pinned commit.
        pinned = get_post_build_pinned_commit(target)
        logger.info("Checking out pinned %s commit %s", target, pinned)
        cloned.git.checkout(pinned)

        if update_submodules and cloned.submodules:
            logger.info("Updating %d submodule(s)", len(cloned.submodules))
            for submodule in cloned.submodules:
                submodule.update(init=True, recursive=True, progress=PrintProgress())

        logger.info("Successfully cloned %s", target)
        return cloned, pinned

    except GitCommandError as e:
        logger.error("Git operation failed: %s", e)
        raise
|
||||
|
||||
|
||||
def get_post_build_pinned_commit(name: str, prefix=".github/ci_commit_pins") -> str:
    """Return the stripped contents of the pin file ``<prefix>/<name>.txt``.

    Raises:
        FileNotFoundError: If the pin file does not exist.
    """
    pin_file = Path(prefix) / f"{name}.txt"
    if pin_file.exists():
        return pin_file.read_text(encoding="utf-8").strip()
    raise FileNotFoundError(f"Pin file not found: {pin_file}")
|
||||
@ -1,14 +0,0 @@
|
||||
"""
|
||||
Logger Utility helpers for CLI tasks.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import sys
|
||||
|
||||
|
||||
def setup_logging(level: int = logging.INFO):
    """Configure root logging to stdout with a timestamped format at ``level``."""
    log_format = "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
    logging.basicConfig(level=level, format=log_format, stream=sys.stdout)
|
||||
@ -1,62 +0,0 @@
|
||||
"""Path utility helpers for CLI tasks."""
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_path(path: Union[str, Path], resolve: bool = False) -> Path:
    """Convert ``path`` to a ``Path``, optionally resolved.

    Raises:
        ValueError: If ``path`` is falsy (None or an empty string).
    """
    if path:
        converted = Path(path)
        if resolve:
            return converted.resolve()
        return converted
    raise ValueError("Path cannot be None or empty")
|
||||
|
||||
|
||||
def ensure_dir_exists(path: Union[str, Path]) -> Path:
    """Create ``path`` and any missing parents, then return it as a Path."""
    directory = get_path(path)
    directory.mkdir(parents=True, exist_ok=True)
    return directory
|
||||
|
||||
|
||||
def remove_dir(path: Union[str, Path, None]) -> None:
    """Recursively remove ``path`` if it exists; do nothing for a falsy path."""
    if path:
        target = get_path(path)
        if target.exists():
            shutil.rmtree(target)
|
||||
|
||||
|
||||
def force_create_dir(path: Union[str, Path]) -> Path:
    """Remove ``path`` if present, then create it again as an empty directory."""
    remove_dir(path)
    fresh = ensure_dir_exists(path)
    return fresh
|
||||
|
||||
|
||||
def copy(src: Union[str, Path], dst: Union[str, Path]) -> None:
    """Copy a file or directory tree from ``src`` to ``dst``.

    Both paths are resolved first, and the parent directory of ``dst`` is
    created when missing. Directories are merged into an existing ``dst``.

    Raises:
        FileNotFoundError: If ``src`` does not exist.
        ValueError: If ``src`` is neither a regular file nor a directory.
    """
    source = get_path(src, resolve=True)
    destination = get_path(dst, resolve=True)

    if not source.exists():
        raise FileNotFoundError(f"Source does not exist: {source}")

    destination.parent.mkdir(parents=True, exist_ok=True)

    if source.is_file():
        shutil.copy2(source, destination)
        return
    if source.is_dir():
        shutil.copytree(source, destination, dirs_exist_ok=True)
        return
    raise ValueError(f"Unsupported path type: {source}")
|
||||
|
||||
|
||||
def is_path_exist(path: Union[str, Path, None]) -> bool:
    """Return True when ``path`` is non-empty and exists on disk."""
    if not path:
        return False
    return get_path(path).exists()
|
||||
@ -1,71 +0,0 @@
|
||||
import glob
|
||||
import logging
|
||||
import shlex
|
||||
import shutil
|
||||
import sys
|
||||
from collections.abc import Iterable
|
||||
from importlib.metadata import PackageNotFoundError, version # noqa: UP035
|
||||
from typing import Optional, Union
|
||||
|
||||
from cli.lib.common.utils import run_command
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def pip_install_packages(
    packages: Iterable[str] = (),
    env=None,
    *,
    requirements: Optional[str] = None,
    constraints: Optional[str] = None,
    prefer_uv: bool = False,
) -> None:
    """Install packages with pip (or ``uv pip``) using the current interpreter.

    Args:
        packages: Package specifiers appended to the install command.
        env: Extra environment passed through to ``run_command``.
        requirements: Requirements file passed with ``-r``.
        constraints: Constraints file passed with ``-c``.
        prefer_uv: Use ``uv pip install`` when a ``uv`` executable is on PATH.
    """
    # NOTE(review): availability is probed via the `uv` executable on PATH,
    # while the command runs `python -m uv`; confirm both refer to the same
    # installation.
    use_uv = prefer_uv and shutil.which("uv") is not None
    installer = ["uv", "pip", "install"] if use_uv else ["pip", "install"]

    cmd = [sys.executable, "-m"] + installer
    if requirements:
        cmd.extend(["-r", requirements])
    if constraints:
        cmd.extend(["-c", constraints])
    cmd.extend(packages)

    # Quote each argument so run_command's shlex.split recovers it intact.
    command_line = " ".join(shlex.quote(part) for part in cmd)
    logger.info("pip installing packages: %s", command_line)
    run_command(command_line, env=env)
|
||||
|
||||
|
||||
def pip_install_first_match(pattern: str, extras: Optional[str] = None, pref_uv=False):
    """Install the first file matching glob ``pattern``, optionally with extras.

    Args:
        pattern: Glob pattern resolved by ``first_matching_pkg``.
        extras: Extras appended as ``[extras]`` to the install target.
        pref_uv: Forwarded to ``pip_install_packages`` as ``prefer_uv``.
    """
    wheel = first_matching_pkg(pattern)
    if extras:
        target = f"{wheel}[{extras}]"
    else:
        target = wheel
    logger.info("Installing %s...", target)
    pip_install_packages([target], prefer_uv=pref_uv)
|
||||
|
||||
|
||||
def run_python(args: Union[str, list[str]], env=None):
    """Run the current Python interpreter with ``args``.

    Args:
        args: Argument list, or a string that is split with ``shlex.split``.
        env: Extra environment passed through to ``run_command``.
    """
    argv = shlex.split(args) if isinstance(args, str) else args
    full_cmd = [sys.executable] + argv
    # Re-quote so run_command's own shlex.split recovers the same argv.
    run_command(" ".join(shlex.quote(part) for part in full_cmd), env=env)
|
||||
|
||||
|
||||
def pkg_exists(name: str) -> bool:
    """Return True if distribution ``name`` is installed, logging the outcome."""
    try:
        installed_version = version(name)
    except PackageNotFoundError:
        logger.info("%s is not installed", name)
        return False
    else:
        logger.info("%s already exist with version: %s", name, installed_version)
        return True
|
||||
|
||||
|
||||
def first_matching_pkg(pattern: str) -> str:
    """Return the lexicographically first path matching glob ``pattern``.

    Raises:
        FileNotFoundError: If nothing matches.
    """
    candidates = glob.glob(pattern)
    if candidates:
        return min(candidates)
    raise FileNotFoundError(f"No wheel matching: {pattern}")
|
||||
@ -1,139 +0,0 @@
|
||||
"""
|
||||
General Utility helpers for CLI tasks.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import shlex
|
||||
import subprocess
|
||||
import sys
|
||||
from contextlib import contextmanager
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def run_command(
    cmd: str,
    use_shell: bool = False,
    log_cmd: bool = True,
    cwd: Optional[str] = None,
    env: Optional[dict] = None,
    check: bool = True,
) -> int:
    """Run a command, streaming its output to this process's stdout/stderr.

    Args:
        cmd: Command line to run.
        use_shell: Run the string through ``/bin/bash`` when True; otherwise
            split it with ``shlex.split`` and run it without a shell.
        log_cmd: Log the command before running it.
        cwd: Working directory for the child process.
        env: Variables layered on top of the current environment.
        check: Raise when the command exits with a non-zero status.

    Returns:
        The command's exit code.

    Raises:
        subprocess.CalledProcessError: If ``check`` is True and the exit
            code is non-zero.
    """
    if use_shell:
        args, log_prefix, executable = cmd, "[shell]", "/bin/bash"
    else:
        args, log_prefix, executable = shlex.split(cmd), "[cmd]", None

    if log_cmd:
        logger.info("%s %s", log_prefix, cmd if use_shell else " ".join(args))

    # The child sees the current environment with `env` entries taking priority.
    child_env = dict(os.environ)
    child_env.update(env or {})

    # check=False: failures are reported below so they can be logged first.
    result = subprocess.run(
        args,
        shell=use_shell,
        executable=executable,
        stdout=sys.stdout,
        stderr=sys.stderr,
        cwd=cwd,
        env=child_env,
        check=False,
    )

    exit_code = result.returncode
    if check and exit_code != 0:
        logger.error("%s Command failed (exit %s): %s", log_prefix, exit_code, cmd)
        raise subprocess.CalledProcessError(exit_code, args)

    return exit_code
|
||||
|
||||
|
||||
def str2bool(value: Optional[str]) -> bool:
    """Convert an environment-variable style string to a boolean.

    Matching ignores case and surrounding whitespace. Falsy input (``None``
    or an empty string) is treated as False.

    Args:
        value: Text to interpret.

    Returns:
        True for 1/true/t/yes/y/on/enable/enabled/found, False for
        0/false/f/no/n/off/disable/disabled.

    Raises:
        ValueError: If ``value`` is a truthy non-string, or a string that is
            not one of the recognised spellings.
    """
    if not value:
        return False
    if not isinstance(value, str):
        raise ValueError(
            f"Expected a string value for boolean conversion, got {type(value)}"
        )
    value = value.strip().lower()

    true_value_set = {"1", "true", "t", "yes", "y", "on", "enable", "enabled", "found"}
    # "disabled" mirrors "enabled" in the true set; without it the natural
    # opposite of an accepted value would be rejected as invalid.
    false_value_set = {"0", "false", "f", "no", "n", "off", "disable", "disabled"}

    if value in true_value_set:
        return True
    if value in false_value_set:
        return False
    raise ValueError(f"Invalid string value for boolean conversion: {value}")
|
||||
|
||||
|
||||
@contextmanager
def temp_environ(updates: dict[str, str]):
    """Temporarily apply ``updates`` to ``os.environ``.

    On exit, each touched variable is restored to its previous value, or
    removed if it was not set before.

    Args:
        updates: Environment variables to set for the duration of the block.
    """
    # A sentinel separates "was unset" from any real previous value.
    unset = object()
    previous = {}
    for key in updates:
        previous[key] = os.environ.get(key, unset)

    try:
        os.environ.update(updates)
        yield
    finally:
        for key, old_value in previous.items():
            if old_value is unset:
                os.environ.pop(key, None)
            else:
                os.environ[key] = old_value  # type: ignore[arg-type]
|
||||
|
||||
|
||||
@contextmanager
def working_directory(path: str):
    """Temporarily change the working directory to ``path``.

    An empty ``path`` makes this a no-op context. The previous working
    directory is restored on exit.
    """
    if path:
        origin = os.getcwd()
        try:
            os.chdir(path)
            yield
        finally:
            os.chdir(origin)
    else:
        # Nothing to change: run the block in the current directory.
        yield
|
||||
|
||||
|
||||
def get_wheels(
    output_dir: Path,
    max_depth: Optional[int] = None,
) -> list[dict[str, str]]:
    """Find ``.whl`` files under ``output_dir``.

    Args:
        output_dir: Directory to search.
        max_depth: Deepest directory level to include, where the root is
            level 0. ``None`` searches the whole tree.

    Returns:
        One ``{"pkg": ..., "relpath": ...}`` dict per wheel. ``pkg`` is the
        file name up to its first ``-`` and ``relpath`` is the path relative
        to ``output_dir``. Directories are visited in sorted order and files
        are sorted within each directory. The list is empty when
        ``output_dir`` does not exist.
    """
    root = Path(output_dir)
    if not root.exists():
        return []

    items: list[dict[str, str]] = []
    for dirpath, dirnames, filenames in os.walk(root):
        current = Path(dirpath)
        depth = len(current.relative_to(root).parts)

        if max_depth is not None and depth >= max_depth:
            # Children would exceed max_depth: prune them instead of walking
            # the subtree only to discard every entry.
            dirnames[:] = []
            if depth > max_depth:
                continue
        else:
            # In-place sort makes os.walk descend in a deterministic order.
            dirnames.sort()

        for fname in sorted(filenames):
            if fname.endswith(".whl"):
                items.append(
                    {
                        "pkg": fname.split("-")[0],
                        "relpath": str((current / fname).relative_to(root)),
                    }
                )
    return items
|
||||
@ -1,256 +0,0 @@
|
||||
import logging
|
||||
import os
|
||||
import textwrap
|
||||
from typing import Any
|
||||
|
||||
from cli.lib.common.gh_summary import write_gh_step_summary
|
||||
from cli.lib.common.git_helper import clone_external_repo
|
||||
from cli.lib.common.pip_helper import pip_install_packages
|
||||
from cli.lib.common.utils import run_command, temp_environ, working_directory
|
||||
from jinja2 import Template
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
_TPL_VLLM_INFO = Template(
|
||||
textwrap.dedent("""\
|
||||
## Vllm against Pytorch CI Test Summary
|
||||
**Vllm Commit**: [{{ vllm_commit }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})
|
||||
{%- if torch_sha %}
|
||||
**Pytorch Commit**: [{{ torch_sha }}](https://github.com/pytorch/pytorch/commit/{{ torch_sha }})
|
||||
{%- endif %}
|
||||
""")
|
||||
)
|
||||
|
||||
|
||||
def sample_vllm_test_library():
    """Return the built-in vLLM test plans, keyed by plan name.

    A simple sample to unblock vLLM CI development, mimicking
    https://github.com/vllm-project/vllm/blob/main/.buildkite/test-pipeline.yaml
    See run_test_plan for how a plan is executed.

    Each plan has a ``title``, an ``id`` and a list of shell ``steps``; some
    also carry ``env_vars``, ``package_install``, ``num_gpus`` or
    ``parallelism``.
    """
    # TODO(elainewy): Read from yaml file to handle the env and tests for vllm

    def spawn_env():
        # A fresh dict per plan so plans never share one mutable mapping.
        return {"VLLM_WORKER_MULTIPROC_METHOD": "spawn"}

    def pytest_vs(*paths):
        # One verbose, uncaptured pytest invocation per test path.
        return [f"pytest -v -s {path}" for path in paths]

    # Flag groups shared by the two sharded LoRA commands below.
    shard_flags = (
        "--shard-id=$$BUILDKITE_PARALLEL_JOB "
        "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT"
    )
    lora_tp_ignores = (
        "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py"
    )

    return {
        "vllm_basic_correctness_test": {
            "title": "Basic Correctness Test",
            "id": "vllm_basic_correctness_test",
            "env_vars": spawn_env(),
            "steps": pytest_vs(
                "basic_correctness/test_cumem.py",
                "basic_correctness/test_basic_correctness.py",
                "basic_correctness/test_cpu_offload.py",
            )
            + [
                "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
            ],
        },
        "vllm_basic_models_test": {
            "title": "Basic models test",
            "id": "vllm_basic_models_test",
            "steps": pytest_vs(
                "models/test_transformers.py",
                "models/test_registry.py",
                "models/test_utils.py",
                "models/test_vision.py",
                "models/test_initialization.py",
            ),
        },
        "vllm_entrypoints_test": {
            "title": "Entrypoints Test ",
            "id": "vllm_entrypoints_test",
            "env_vars": spawn_env(),
            "steps": [
                "pytest -v -s entrypoints/llm"
                " --ignore=entrypoints/llm/test_lazy_outlines.py"
                " --ignore=entrypoints/llm/test_generate.py"
                " --ignore=entrypoints/llm/test_generate_multiple_loras.py"
                " --ignore=entrypoints/llm/test_collective_rpc.py",
                "pytest -v -s entrypoints/llm/test_lazy_outlines.py",
                "pytest -v -s entrypoints/llm/test_generate.py ",
                "pytest -v -s entrypoints/llm/test_generate_multiple_loras.py",
                "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
            ],
        },
        "vllm_regression_test": {
            "title": "Regression Test",
            "id": "vllm_regression_test",
            "package_install": ["modelscope"],
            "steps": pytest_vs("test_regression.py"),
        },
        "vllm_lora_tp_test_distributed": {
            "title": "LoRA TP Test (Distributed)",
            "id": "vllm_lora_tp_test_distributed",
            "env_vars": spawn_env(),
            "num_gpus": 4,
            "steps": [
                "pytest -v -s -x lora/test_chatglm3_tp.py",
                "echo $VLLM_WORKER_MULTIPROC_METHOD",
                "pytest -v -s -x lora/test_llama_tp.py",
                "pytest -v -s -x lora/test_multi_loras_with_tp.py",
            ],
        },
        "vllm_lora_280_failure_test": {
            "title": "LoRA 280 failure test",
            "id": "vllm_lora_280_failure_test",
            "steps": ["pytest -v lora/test_quant_model.py"],
        },
        "vllm_multi_model_processor_test": {
            "title": "Multi-Modal Processor Test",
            "id": "vllm_multi_model_processor_test",
            "package_install": ["git+https://github.com/TIGER-AI-Lab/Mantis.git"],
            "steps": [
                "pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py",
            ],
        },
        "vllm_pytorch_compilation_unit_tests": {
            "title": "PyTorch Compilation Unit Tests",
            "id": "vllm_pytorch_compilation_unit_tests",
            "steps": pytest_vs(
                "compile/test_pass_manager.py",
                "compile/test_fusion.py",
                "compile/test_fusion_attn.py",
                "compile/test_silu_mul_quant_fusion.py",
                "compile/test_sequence_parallelism.py",
                "compile/test_async_tp.py",
                "compile/test_fusion_all_reduce.py",
                "compile/test_decorator.py",
            ),
        },
        # TODO(elainewy):need to add g6 with 4 gpus to run this test
        "vllm_lora_test": {
            "title": "LoRA Test %N",
            "id": "lora_test",
            "parallelism": 4,
            "steps": [
                "echo '[checking] list sharded lora tests:'",
                f"pytest -q --collect-only lora {shard_flags} {lora_tp_ignores}",
                "echo '[checking] Done. list lora tests'",
                f"pytest -v -s lora {shard_flags} {lora_tp_ignores}",
            ],
        },
    }
|
||||
|
||||
|
||||
def check_parallelism(tests: Any, title: str, shard_id: int = 0, num_shards: int = 0):
    """
    Tell whether a test plan entry is sharded and validate the shard arguments.

    Args:
        tests: test plan entry; its optional "parallelism" value is the number
            of shards the plan expects.
        title: human-readable plan title, only used in error messages.
        shard_id: zero-based shard index (it is substituted for
            $$BUILDKITE_PARALLEL_JOB in the plan steps).
        num_shards: total number of shards requested by the caller.

    Returns:
        False when the plan declares no parallelism (missing, 0 or 1),
        True when it is sharded and the shard arguments are consistent.

    Raises:
        RuntimeError: if num_shards differs from the plan's parallelism, or
            shard_id is outside [0, num_shards).
    """
    parallelism = int(tests.get("parallelism", "0"))
    if parallelism <= 1:
        return False

    # Check the shard count first so the error names the real mismatch.
    if num_shards != parallelism:
        raise RuntimeError(
            f"Test {title} expects {parallelism} shards, but invalid {num_shards} is provided"
        )

    # Shard ids are zero-based, so shard_id == num_shards is already out of range.
    if not 0 <= shard_id < num_shards:
        raise RuntimeError(
            f"Test {title} expects {num_shards} shards, but invalid {shard_id} is provided"
        )

    return True
|
||||
|
||||
|
||||
def run_test_plan(
    test_plan: str,
    test_target: str,
    tests_map: dict[str, Any],
    shard_id: int = 0,
    num_shards: int = 0,
):
    """
    Run every step of the plan named ``test_plan`` from ``tests_map``.

    Optional "package_install" entries are installed first. The steps then run
    as shell commands inside the plan's "working_directory" (default "tests")
    with its "env_vars" applied. For sharded plans the "%N" title marker and
    the $$BUILDKITE_PARALLEL_* placeholders in each step are filled in from
    ``shard_id`` / ``num_shards``.

    All steps are attempted even if an earlier one fails.

    Raises:
        RuntimeError: if the plan is unknown, or if any step exits non-zero.
    """
    logger.info("run %s tests.....", test_target)
    if test_plan not in tests_map:
        raise RuntimeError(
            f"test {test_plan} not found, please add it to test plan pool"
        )

    plan = tests_map[test_plan]
    extra_pkgs = plan.get("package_install", [])
    title = plan.get("title", "unknown test")

    sharded = check_parallelism(plan, title, shard_id, num_shards)
    if sharded:
        title = title.replace("%N", f"{shard_id}/{num_shards}")

    logger.info("Running tests: %s", title)
    if extra_pkgs:
        logger.info("Installing packages: %s", extra_pkgs)
        pip_install_packages(packages=extra_pkgs, prefer_uv=True)

    workdir = plan.get("working_directory", "tests")
    env_vars = plan.get("env_vars", {})
    with working_directory(workdir), temp_environ(env_vars):
        failed_steps = []
        for raw_step in plan["steps"]:
            logger.info("Running step: %s", raw_step)
            step = raw_step
            if sharded:
                step = replace_buildkite_placeholders(raw_step, shard_id, num_shards)
                logger.info("Running parallel step: %s", step)
            # check=False: collect failures instead of aborting on the first one
            exit_code = run_command(cmd=step, check=False, use_shell=True)
            if exit_code != 0:
                failed_steps.append(step)
            logger.info("Finish running step: %s", step)
        if failed_steps:
            logger.error("Failed tests: %s", failed_steps)
            raise RuntimeError(
                f"{len(failed_steps)} pytest runs failed: {failed_steps}"
            )
        logger.info("Done. All tests passed")
|
||||
|
||||
|
||||
def clone_vllm(dst: str = "vllm"):
    """
    Clone the vLLM GitHub repository (with submodules) into ``dst``.

    Returns the second element of ``clone_external_repo``'s result, which the
    callers treat as the checked-out vLLM commit.
    """
    _repo, vllm_commit = clone_external_repo(
        target="vllm",
        repo="https://github.com/vllm-project/vllm.git",
        dst=dst,
        update_submodules=True,
    )
    return vllm_commit
|
||||
|
||||
|
||||
def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str:
    """
    Substitute the Buildkite parallel-job placeholders in a step command.

    $$BUILDKITE_PARALLEL_JOB_COUNT becomes ``num_shards`` and
    $$BUILDKITE_PARALLEL_JOB becomes ``shard_id``.
    """
    # Longest placeholder first: "..._JOB" is a prefix of "..._JOB_COUNT", so
    # replacing it first would corrupt the count placeholder.
    substitutions = (
        ("$$BUILDKITE_PARALLEL_JOB_COUNT", str(num_shards)),
        ("$$BUILDKITE_PARALLEL_JOB", str(shard_id)),
    )
    for placeholder, value in substitutions:
        step = step.replace(placeholder, value)
    return step
|
||||
|
||||
|
||||
def summarize_build_info(vllm_commit: str) -> bool:
    """
    Render the vLLM build-info template and write it to the GitHub step summary.

    The torch SHA is read from the GITHUB_SHA environment variable (None when
    unset). Returns whatever ``write_gh_step_summary`` returns.
    """
    rendered = _TPL_VLLM_INFO.render(
        vllm_commit=vllm_commit,
        torch_sha=os.getenv("GITHUB_SHA"),
    )
    # Normalise to exactly one trailing newline
    return write_gh_step_summary(rendered.strip() + "\n")
|
||||
@ -1,285 +0,0 @@
|
||||
import logging
|
||||
import os
|
||||
import textwrap
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from cli.lib.common.cli_helper import BaseRunner
|
||||
from cli.lib.common.docker_helper import local_image_exists
|
||||
from cli.lib.common.envs_helper import (
|
||||
env_bool_field,
|
||||
env_path_field,
|
||||
env_str_field,
|
||||
with_params_help,
|
||||
)
|
||||
from cli.lib.common.gh_summary import (
|
||||
gh_summary_path,
|
||||
summarize_content_from_file,
|
||||
summarize_wheels,
|
||||
)
|
||||
from cli.lib.common.path_helper import (
|
||||
copy,
|
||||
ensure_dir_exists,
|
||||
force_create_dir,
|
||||
get_path,
|
||||
is_path_exist,
|
||||
)
|
||||
from cli.lib.common.utils import run_command
|
||||
from cli.lib.core.vllm.lib import clone_vllm, summarize_build_info
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
# Default directory for docker build artifacts; get_result_path falls back to
# it when no path is given
_DEFAULT_RESULT_PATH = "./shared"

# Folder name, relative to the vllm work directory, that receives a copy of the
# local torch wheels so they are available to the docker build
_VLLM_TEMP_FOLDER = "tmp"
|
||||
|
||||
|
||||
@dataclass
class VllmBuildParameters:
    """
    Parameters defining the vllm external input configurations.
    Combine with VllmDockerBuildArgs to define the vllm build environment
    """

    # USE_TORCH_WHEEL: when true, use local Torch wheels; requires TORCH_WHEELS_PATH.
    # Otherwise docker build pull torch nightly during build
    # TORCH_WHEELS_PATH: directory containing local torch wheels when use_torch_whl is True
    use_torch_whl: bool = env_bool_field("USE_TORCH_WHEEL", True)
    torch_whls_path: Path = env_path_field("TORCH_WHEELS_PATH", "./dist")

    # USE_LOCAL_BASE_IMAGE: when true, use an existing local Docker base image; requires BASE_IMAGE
    # Otherwise, pull dockerfile's default image remotely
    # BASE_IMAGE: name:tag (only needed when use_local_base_image is True)
    use_local_base_image: bool = env_bool_field("USE_LOCAL_BASE_IMAGE", True)
    base_image: str = env_str_field("BASE_IMAGE")

    # USE_LOCAL_DOCKERFILE: when true("1"), use a local Dockerfile; requires DOCKERFILE_PATH.
    # otherwise, use vllm's default dockerfile.torch_nightly for build
    # DOCKERFILE_PATH: path to Dockerfile used when use_local_dockerfile is True
    use_local_dockerfile: bool = env_bool_field("USE_LOCAL_DOCKERFILE", True)
    dockerfile_path: Path = env_path_field(
        "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
    )

    # OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
    output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")

    # --- Build args ----------------------------------------------------------
    target_stage: str = env_str_field("TARGET_STAGE", "export-wheels")

    tag_name: str = env_str_field("TAG", "vllm-wheels")

    cuda_version: str = env_str_field("CUDA_VERSION", "12.8.1")

    python_version: str = env_str_field("PYTHON_VERSION", "3.12")

    max_jobs: str = env_str_field("MAX_JOBS", "64")

    sccache_bucket: str = env_str_field("SCCACHE_BUCKET")

    sccache_region: str = env_str_field("SCCACHE_REGION")

    torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")

    def __post_init__(self):
        """
        Validate that every enabled "use local ..." flag has a usable resource.

        Raises:
            ValueError: if an enabled flag's resource is empty or fails its
                check function, or if output_dir is empty.
        """
        # (flag attribute, resource attribute, check function, error message)
        checks = [
            (
                "use_torch_whl",
                "torch_whls_path",
                is_path_exist,
                "TORCH_WHEELS_PATH is not provided, but USE_TORCH_WHEEL is set to 1",
            ),
            (
                "use_local_base_image",
                "base_image",
                local_image_exists,
                f"BASE_IMAGE {self.base_image} does not found, but USE_LOCAL_BASE_IMAGE is set to 1",
            ),
            (
                "use_local_dockerfile",
                "dockerfile_path",
                is_path_exist,
                " DOCKERFILE_PATH path does not found, but USE_LOCAL_DOCKERFILE is set to 1",
            ),
        ]
        for flag_name, attr_name, check_func, error_msg in checks:
            if not getattr(self, flag_name):
                # Log the flag's name; its value (False) carries no information
                logger.info("flag %s is not set", flag_name)
                continue
            value = getattr(self, attr_name)
            if not value or not check_func(value):
                raise ValueError(error_msg)
        if not self.output_dir:
            raise ValueError("missing required output_dir")
|
||||
|
||||
|
||||
@with_params_help(VllmBuildParameters)
class VllmBuildRunner(BaseRunner):
    """
    Build vLLM using docker buildx.

    Environment variable options:
        "USE_TORCH_WHEEL": "1: use local wheels; 0: pull nightly from pypi",
        "TORCH_WHEELS_PATH": "Path to local wheels (when USE_TORCH_WHEEL=1)",

        "USE_LOCAL_BASE_IMAGE": "1: use local base image; 0: default image",
        "BASE_IMAGE": "name:tag to indicate base image the dockerfile depends on (when USE_LOCAL_BASE_IMAGE=1)",

        "USE_LOCAL_DOCKERFILE": "1: use local Dockerfile; 0: vllm repo default dockerfile.torch_nightly",
        "DOCKERFILE_PATH": "Path to Dockerfile (when USE_LOCAL_DOCKERFILE=1)",

        "OUTPUT_DIR": "e.g. './shared'",

        "TORCH_CUDA_ARCH_LIST": "e.g. '8.0' or '8.0;9.0'",
        "CUDA_VERSION": "e.g. '12.8.1'",
        "PYTHON_VERSION": "e.g. '3.12'",
        "MAX_JOBS": "e.g. '64'",
        "SCCACHE_BUCKET": "e.g. 'my-bucket'",
        "SCCACHE_REGION": "e.g. 'us-west-2'",
    """

    def __init__(self, args=None):
        # args is not read here; all build inputs come from VllmBuildParameters
        self.work_directory = "vllm"

    def run(self):
        """
        main function to run vllm build
        1. prepare vllm build environment
        2. prepare the docker build command args
        3. run docker build
        """
        inputs = VllmBuildParameters()
        logger.info("Running vllm build with inputs: %s", inputs)
        vllm_commit = clone_vllm()

        self.cp_dockerfile_if_exist(inputs)
        # cp torch wheels from root direct to vllm workspace if exist
        self.cp_torch_whls_if_exist(inputs)

        # make sure the output dir to store the build artifacts exist
        ensure_dir_exists(Path(inputs.output_dir))

        cmd = self._generate_docker_build_cmd(inputs)
        logger.info("Running docker build: \n %s", cmd)

        try:
            run_command(cmd, cwd="vllm", env=os.environ.copy())
        finally:
            # Always attempt the summary, even when the docker build fails
            self.genearte_vllm_build_summary(vllm_commit, inputs)

    def genearte_vllm_build_summary(
        self, vllm_commit: str, inputs: VllmBuildParameters
    ):
        """
        Write build info, the pip package summary and the torch/vllm wheel
        listings to the GitHub step summary. Does nothing when
        ``gh_summary_path()`` is falsy.
        """
        if not gh_summary_path():
            return logger.info("Skipping, not detect GH Summary env var....")
        logger.info("Generate GH Summary ...")
        # summarize vllm build info
        summarize_build_info(vllm_commit)

        # summarize vllm build artifacts
        vllm_artifact_dir = inputs.output_dir / "wheels"
        summarize_content_from_file(
            vllm_artifact_dir,
            "build_summary.txt",
            title="Vllm build env pip package summary",
        )
        summarize_wheels(
            inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts"
        )
        summarize_wheels(vllm_artifact_dir, max_depth=3, title="Vllm Wheels Artifacts")

    def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str:
        """
        Copy the local torch wheels into the temp folder of the vllm work
        directory. Returns that folder, or "" when use_torch_whl is off.
        """
        if not inputs.use_torch_whl:
            return ""
        tmp_dir = f"./{self.work_directory}/{_VLLM_TEMP_FOLDER}"
        tmp_path = Path(tmp_dir)
        force_create_dir(tmp_path)
        copy(inputs.torch_whls_path, tmp_dir)
        return tmp_dir

    def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters):
        """
        Copy the local Dockerfile to docker/Dockerfile.nightly_torch in the
        vllm work directory, which is the file the build command passes to
        ``-f``. No-op when use_local_dockerfile is off.
        """
        if not inputs.use_local_dockerfile:
            logger.info("using vllm default dockerfile.torch_nightly for build")
            return
        dockerfile_path = get_path(inputs.dockerfile_path, resolve=True)
        vllm_torch_dockerfile = Path(
            f"./{self.work_directory}/docker/Dockerfile.nightly_torch"
        )
        copy(dockerfile_path, vllm_torch_dockerfile)

    def get_result_path(self, path):
        """
        Get the absolute path of the result path
        """
        if not path:
            path = _DEFAULT_RESULT_PATH
        abs_path = get_path(path, resolve=True)
        return abs_path

    def _get_torch_wheel_path_arg(self, torch_whl_dir: Optional[Path]) -> str:
        """
        Return the TORCH_WHEELS_PATH build arg, or "" when no wheel dir is given.

        The arg points at the temp folder inside the build context, not at
        ``torch_whl_dir`` itself.
        """
        if not torch_whl_dir:
            return ""
        return f"--build-arg TORCH_WHEELS_PATH={_VLLM_TEMP_FOLDER}"

    def _get_base_image_args(self, inputs: VllmBuildParameters) -> tuple[str, str, str]:
        """
        Returns:
            - base_image_arg: docker buildx arg string for base image
            - final_base_image_arg: docker buildx arg string for vllm-base stage
            - pull_flag: "--pull=false" when the image exists locally, "" otherwise

        All three are "" when use_local_base_image is off.
        """
        if not inputs.use_local_base_image:
            return "", "", ""

        base_image = inputs.base_image

        # set both base image and final base image to the same local image
        base_image_arg = f"--build-arg BUILD_BASE_IMAGE={base_image}"
        final_base_image_arg = f"--build-arg FINAL_BASE_IMAGE={base_image}"

        if local_image_exists(base_image):
            pull_flag = "--pull=false"
            return base_image_arg, final_base_image_arg, pull_flag
        # Pass the name itself: the original wrapped it in a set literal, which
        # logged "{'name:tag'}".
        logger.info(
            "[INFO] Local image not found:%s will try to pull from remote", base_image
        )
        return base_image_arg, final_base_image_arg, ""

    def _generate_docker_build_cmd(
        self,
        inputs: VllmBuildParameters,
    ) -> str:
        """
        Assemble the ``docker buildx build`` command line from ``inputs``.

        Optional arguments that do not apply expand to empty strings.
        USE_SCCACHE is 1 only when both the sccache bucket and region are set.
        """
        base_image_arg, final_base_image_arg, pull_flag = self._get_base_image_args(
            inputs
        )
        # NOTE(review): the wheel path is passed regardless of use_torch_whl, so
        # TORCH_WHEELS_PATH is presumably set even when no wheels were copied.
        # Confirm this is what the Dockerfile expects.
        torch_arg = self._get_torch_wheel_path_arg(inputs.torch_whls_path)

        # Backslash-newline inside the literal joins the lines, so the result
        # is a single-line command.
        return textwrap.dedent(
            f"""
            docker buildx build \
                --output type=local,dest={inputs.output_dir} \
                -f docker/Dockerfile.nightly_torch \
                {pull_flag} \
                {torch_arg} \
                {base_image_arg} \
                {final_base_image_arg} \
                --build-arg max_jobs={inputs.max_jobs} \
                --build-arg CUDA_VERSION={inputs.cuda_version} \
                --build-arg PYTHON_VERSION={inputs.python_version} \
                --build-arg USE_SCCACHE={int(bool(inputs.sccache_bucket and inputs.sccache_region))} \
                --build-arg SCCACHE_BUCKET_NAME={inputs.sccache_bucket} \
                --build-arg SCCACHE_REGION_NAME={inputs.sccache_region} \
                --build-arg torch_cuda_arch_list='{inputs.torch_cuda_arch_list}' \
                --target {inputs.target_stage} \
                -t {inputs.tag_name} \
                --progress=plain .
        """
        ).strip()
|
||||
@ -1,263 +0,0 @@
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from collections.abc import Iterable
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from cli.lib.common.cli_helper import BaseRunner
|
||||
from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env
|
||||
from cli.lib.common.path_helper import copy, remove_dir
|
||||
from cli.lib.common.pip_helper import (
|
||||
pip_install_first_match,
|
||||
pip_install_packages,
|
||||
pkg_exists,
|
||||
run_python,
|
||||
)
|
||||
from cli.lib.common.utils import run_command, working_directory
|
||||
from cli.lib.core.vllm.lib import clone_vllm, run_test_plan, sample_vllm_test_library
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
class VllmTestParameters:
    """
    Parameters defining the vllm external test input

    !!!DO NOT ADD SECRETS IN THIS CLASS!!!
    You may store the *name* of an environment variable here if it differs from
    the secret's own name; fetch the secrets themselves directly from the
    environment at runtime.
    """

    torch_whls_path: Path = env_path_field("WHEELS_PATH", "./dist")

    vllm_whls_path: Path = env_path_field(
        "VLLM_WHEELS_PATH", "./dist/external/vllm/wheels"
    )

    torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")

    def __post_init__(self):
        """Raise ValueError when either wheel directory does not exist."""
        required_dirs = (
            (self.torch_whls_path, "missing torch_whls_path"),
            (self.vllm_whls_path, "missing vllm_whls_path"),
        )
        for whl_dir, message in required_dirs:
            if not whl_dir.exists():
                raise ValueError(message)
|
||||
|
||||
|
||||
class TestInpuType(Enum):
    """Kind of test input a VllmTestRunner was configured with."""

    # a named test plan was supplied via the --test-plan argument
    TEST_PLAN = "test_plan"
    # no recognised input; VllmTestRunner.run raises ValueError for this value
    UNKNOWN = "unknown"
|
||||
|
||||
|
||||
# Runner that prepares a vllm checkout (clone, wheel install, test dependencies,
# env vars) and then executes the selected test plan, optionally sharded.
class VllmTestRunner(BaseRunner):
    def __init__(self, args: Any):
        self.work_directory = "vllm"
        self.test_plan = ""
        self.test_type = TestInpuType.UNKNOWN

        self.shard_id = args.shard_id
        self.num_shards = args.num_shards

        if args.test_plan:
            self.test_plan = args.test_plan
            self.test_type = TestInpuType.TEST_PLAN

        # Matches the structure of the artifacts.zip from the torch build
        # NOTE(review): despite the *_REGEX names these look like glob patterns
        # handed to pip_install_first_match; confirm against that helper.
        self.TORCH_WHL_PATH_REGEX = "torch*.whl"
        self.TORCH_WHL_EXTRA = "opt-einsum"
        self.TORCH_ADDITIONAL_WHLS_REGEX = [
            "vision/torchvision*.whl",
            "audio/torchaudio*.whl",
        ]

        # Match the structure of the artifacts.zip from vllm external build
        self.VLLM_TEST_WHLS_REGEX = [
            "xformers/*.whl",
            "vllm/vllm*.whl",
            "flashinfer-python/flashinfer*.whl",
        ]

    def prepare(self):
        """
        prepare test environment for vllm. This includes clone vllm repo, install all wheels, test dependencies and set env
        """
        params = VllmTestParameters()
        logger.info("Display VllmTestParameters %s", params)
        self._set_envs(params)

        clone_vllm(dst=self.work_directory)
        with working_directory(self.work_directory):
            # drop the "vllm" directory inside the checkout before installing wheels
            remove_dir(Path("vllm"))
            self._install_wheels(params)
            self._install_dependencies()
        # verify the torches are not overridden by test dependencies
        check_versions()

    def run(self):
        """
        main function to run vllm test
        """
        self.prepare()
        with working_directory(self.work_directory):
            if self.test_type == TestInpuType.TEST_PLAN:
                # shard arguments are only forwarded when sharding was requested
                if self.num_shards > 1:
                    run_test_plan(
                        self.test_plan,
                        "vllm",
                        sample_vllm_test_library(),
                        self.shard_id,
                        self.num_shards,
                    )
                else:
                    run_test_plan(self.test_plan, "vllm", sample_vllm_test_library())
            else:
                raise ValueError(f"Unknown test type {self.test_type}")

    def _install_wheels(self, params: VllmTestParameters):
        # Installs torch (only if missing), then torchvision/torchaudio, then
        # the vllm wheels, each located by pattern under the wheel directories.
        logger.info("Running vllm test with inputs: %s", params)
        if not pkg_exists("torch"):
            # install torch from local whls if it's not installed yet.
            torch_p = f"{str(params.torch_whls_path)}/{self.TORCH_WHL_PATH_REGEX}"
            pip_install_first_match(torch_p, self.TORCH_WHL_EXTRA)

        torch_whls_path = [
            f"{str(params.torch_whls_path)}/{whl_path}"
            for whl_path in self.TORCH_ADDITIONAL_WHLS_REGEX
        ]
        for torch_whl in torch_whls_path:
            pip_install_first_match(torch_whl)
        logger.info("Done. Installed torch and other torch-related wheels ")

        logger.info("Installing vllm wheels")
        vllm_whls_path = [
            f"{str(params.vllm_whls_path)}/{whl_path}"
            for whl_path in self.VLLM_TEST_WHLS_REGEX
        ]
        for vllm_whl in vllm_whls_path:
            pip_install_first_match(vllm_whl)
        logger.info("Done. Installed vllm wheels")

    def _install_test_dependencies(self):
        """
        This method replaces torch dependencies with local torch wheel info in
        requirements/test.in file from vllm repo. then generates the test.txt
        in runtime
        """
        logger.info("generate test.txt from requirements/test.in with local torch whls")
        preprocess_test_in()
        # the repo's pinned test.txt is reused as the constraint file for the compile below
        copy("requirements/test.txt", "snapshot_constraint.txt")

        run_command(
            f"{sys.executable} -m uv pip compile requirements/test.in "
            "-o test.txt "
            "--index-strategy unsafe-best-match "
            "--constraint snapshot_constraint.txt "
            "--torch-backend cu128"
        )
        pip_install_packages(requirements="test.txt", prefer_uv=True)
        logger.info("Done. installed requirements for test dependencies")

    def _install_dependencies(self):
        # Paths are relative, so this must run inside the vllm checkout.
        pip_install_packages(packages=["-e", "tests/vllm_test_utils"], prefer_uv=True)
        pip_install_packages(packages=["hf_transfer"], prefer_uv=True)
        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

        # using script from vllm repo to remove all torch packages from requirements txt
        run_python("use_existing_torch.py")

        # install common packages
        for requirements in ["requirements/common.txt", "requirements/build.txt"]:
            pip_install_packages(
                requirements=requirements,
                prefer_uv=True,
            )
        # install test packages
        self._install_test_dependencies()

    def _set_envs(self, inputs: VllmTestParameters):
        # Exports TORCH_CUDA_ARCH_LIST and HF_TOKEN for the test run.
        # An unsupported arch list only warns; a missing HF token or arch list raises.
        os.environ["TORCH_CUDA_ARCH_LIST"] = inputs.torch_cuda_arch_list
        if not validate_cuda(get_env("TORCH_CUDA_ARCH_LIST")):
            logger.warning(
                "Missing supported TORCH_CUDA_ARCH_LIST. "
                "Currently support TORCH_CUDA_ARCH_LIST env var "
                "with supported arch [8.0, 8.9, 9.0]"
            )

        # HF_TOKEN is always overwritten from VLLM_TEST_HUGGING_FACE_TOKEN
        os.environ["HF_TOKEN"] = os.getenv("VLLM_TEST_HUGGING_FACE_TOKEN", "")
        if not get_env("HF_TOKEN"):
            raise ValueError(
                "missing required HF_TOKEN, please set VLLM_TEST_HUGGING_FACE_TOKEN env var"
            )
        if not get_env("TORCH_CUDA_ARCH_LIST"):
            raise ValueError(
                "missing required TORCH_CUDA_ARCH_LIST, please set TORCH_CUDA_ARCH_LIST env var"
            )
|
||||
|
||||
|
||||
def preprocess_test_in(
    target_file: str = "requirements/test.in", additional_packages: Iterable[str] = ()
):
    """
    Rewrite ``target_file`` in place so it uses the locally installed torch wheels.

    Lines that start with torch, torchvision, torchaudio, xformers, mamba_ssm
    or any name in ``additional_packages`` followed by "==", "@" or ">="
    (case-insensitive) are removed. The ``pip freeze`` entries of the form
    "<torch|torchvision|torchaudio> @ file://<LOCAL_PATH>" are then written at
    the top of the file, followed by a blank line and the remaining lines.

    Args:
        target_file: requirements file to rewrite, relative to the current
            working directory.
        additional_packages: extra package names whose requirement lines
            should also be removed.
    """
    pkgs_to_remove = [
        "torch",
        "torchvision",
        "torchaudio",
        "xformers",
        "mamba_ssm",
        *(additional_packages or ()),
    ]
    # Escape the names so regex metacharacters in a package name (e.g. ".")
    # are matched literally instead of acting as wildcards.
    names = "|".join(re.escape(pkg) for pkg in pkgs_to_remove)
    pattern = re.compile(rf"^({names})\s*(==|@|>=)", re.IGNORECASE)

    # Read current requirements and drop the lines for the packages above
    target_path = Path(target_file)
    kept_lines = [
        line for line in target_path.read_text().splitlines() if not pattern.match(line)
    ]

    # Get local installed torch/vision/audio from pip freeze
    # This is hacky, but it works
    pip_freeze = subprocess.check_output(["pip", "freeze"], text=True)
    local_wheel = re.compile(
        r"^(torch|torchvision|torchaudio)\s*@\s*file://", re.IGNORECASE
    )
    header_lines = [line for line in pip_freeze.splitlines() if local_wheel.match(line)]

    # Write back: header_lines + blank + kept_lines
    out_lines = header_lines + [""] + kept_lines
    target_path.write_text("\n".join(out_lines) + "\n")
    logger.info("[INFO] Updated %s", target_file)
|
||||
|
||||
|
||||
def validate_cuda(value: str) -> bool:
    """
    Return True when every arch in a TORCH_CUDA_ARCH_LIST value is supported.

    Entries may be separated by whitespace or by ";" (e.g. "8.0 9.0" or
    "8.0;9.0"). An empty value contains no unsupported entry and returns True.
    """
    VALID_VALUES = {"8.0", "8.9", "9.0"}
    # Treat ";" like whitespace so "8.0;9.0" is checked entry by entry instead
    # of being rejected as one unknown token.
    return all(v in VALID_VALUES for v in value.replace(";", " ").split())
|
||||
|
||||
|
||||
def check_versions():
    """
    Probe the torch/vllm related packages with ``pkg_exists``.

    The return values are not inspected here.
    """
    logger.info("Double check installed packages")
    for package in ("torch", "xformers", "torchvision", "torchaudio", "vllm"):
        pkg_exists(package)
    logger.info("Done. checked installed packages")
|
||||
@ -1,40 +0,0 @@
|
||||
# main.py
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
|
||||
from cli.build_cli.register_build import register_build_commands
|
||||
from cli.lib.common.logger import setup_logging
|
||||
from cli.test_cli.register_test import register_test_commands
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def main():
    """
    Entry point of the CLI: build the parser tree, parse argv, configure
    logging and dispatch to the handler stored in ``args.func``.
    """
    # Top-level parser with a mandatory first-level command
    parser = argparse.ArgumentParser(description="Lumos CLI")
    subparsers = parser.add_subparsers(dest="command", required=True)
    parser.add_argument(
        "--log-level", default="INFO", help="Log level (DEBUG, INFO, WARNING, ERROR)"
    )

    # Second-level subcommands ("build" first, then "test")
    for register in (register_build_commands, register_test_commands):
        register(subparsers)

    # Parse only after every option has been registered
    args = parser.parse_args()

    # Unknown level names fall back to INFO
    level = getattr(logging, args.log_level.upper(), logging.INFO)
    setup_logging(level)
    logger.debug("Parsed args: %s", args)

    if not hasattr(args, "func"):
        parser.print_help()
        return
    args.func(args)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,62 +0,0 @@
|
||||
import argparse
|
||||
import logging
|
||||
|
||||
from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec
|
||||
from cli.lib.core.vllm.vllm_test import VllmTestRunner
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Maps test targets to their argparse configuration and runner.
# Each entry is registered under the "test external" command, i.e. it becomes
# `python -m cli.run test external {target}` and is executed by its runner.
_TARGETS: dict[str, TargetSpec] = {
    "vllm": {
        "runner": VllmTestRunner,
        "help": "test vLLM with pytorch main",
    }
    # add yours ...
}
|
||||
|
||||
|
||||
def common_args(parser: argparse.ArgumentParser) -> None:
    """
    Register the arguments shared by every test target on ``parser``:
    --shard-id, --num-shards (both int, default 1) and the required
    -tp/--test-plan option.
    """
    sharding_options = (
        ("--shard-id", "a shard id to run, e.g. '0,1,2,3'"),
        ("--num-shards", "a number of shards to run, e.g. '4'"),
    )
    for flag, help_text in sharding_options:
        parser.add_argument(flag, type=int, default=1, help=help_text)

    # A required mutually-exclusive group: exactly one test input must be given
    test_input = parser.add_mutually_exclusive_group(required=True)
    test_input.add_argument(
        "-tp",
        "--test-plan",
        type=str,
        help="a pre-defined test plan to run, e.g. 'basic_correctness_test'",
    )
|
||||
|
||||
|
||||
def register_test_commands(subparsers: argparse._SubParsersAction) -> None:
    """
    Add the "test" command and its "external" subcommand to ``subparsers``,
    then register every entry of ``_TARGETS`` under "test external".
    """
    test_parser = subparsers.add_parser(
        "test",
        help="test related commands",
        formatter_class=RichHelp,
    )
    test_subparsers = test_parser.add_subparsers(dest="test_command", required=True)

    # One "name  help" line per target, shown in the "external" description
    target_lines = [
        f" {name:12} {spec.get('help', '')}" for name, spec in _TARGETS.items()
    ]
    overview = "\n".join(target_lines)

    external_parser = test_subparsers.add_parser(
        "external",
        help="Test external targets",
        description="Test third-party targets.\n\nAvailable targets:\n" + overview,
        formatter_class=RichHelp,
    )
    register_targets(external_parser, _TARGETS, common_args=common_args)
|
||||
@ -1,23 +0,0 @@
|
||||
[project]
|
||||
name = "lumen-ci"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"pyyaml==6.0.2",
|
||||
"GitPython==3.1.45",
|
||||
"docker==7.1.0",
|
||||
"pytest==7.3.2",
|
||||
"uv==0.8.6"
|
||||
]
|
||||
|
||||
[tool.setuptools]
|
||||
packages = ["cli"]
|
||||
|
||||
[tool.setuptools.package-dir]
|
||||
cli = "cli"
|
||||
|
||||
[tool.ruff.lint]
|
||||
# Enable preview mode for linting
|
||||
preview = true
|
||||
|
||||
# Now you can select your preview rules, like RUF048
|
||||
extend-select = ["RUF048"]
|
||||
@ -1,47 +0,0 @@
|
||||
# tests/test_cli.py
|
||||
import io
|
||||
import sys
|
||||
import unittest
|
||||
from contextlib import redirect_stderr, redirect_stdout
|
||||
from unittest.mock import patch
|
||||
|
||||
from cli.run import main
|
||||
|
||||
|
||||
class TestArgparseCLI(unittest.TestCase):
    """Tests that drive ``cli.run.main`` through a patched ``sys.argv``."""

    # Decorators apply bottom-up, so the __init__ patch is the first mock argument.
    @patch("cli.build_cli.register_build.VllmBuildRunner.run", return_value=None)
    @patch("cli.build_cli.register_build.VllmBuildRunner.__init__", return_value=None)
    def test_cli_run_build_external(self, mock_init, mock_run):
        """`build external vllm` constructs VllmBuildRunner once and calls run()."""
        from cli.run import main  # import after patches if needed

        test_args = ["cli.run", "build", "external", "vllm"]
        with patch.object(sys, "argv", test_args):
            # argparse may call sys.exit on error; capture to avoid test aborts
            try:
                main()
            except SystemExit:
                pass
        mock_init.assert_called_once()  # got constructed
        mock_run.assert_called_once_with()  # run() called

    def test_build_help(self):
        """`build --help` exits with code 0 and prints usage mentioning "external"."""
        test_args = ["cli.run", "build", "--help"]

        with patch.object(sys, "argv", test_args):
            stdout = io.StringIO()
            stderr = io.StringIO()

            # --help always raises SystemExit(0)
            with self.assertRaises(SystemExit) as cm:
                with redirect_stdout(stdout), redirect_stderr(stderr):
                    main()

            self.assertEqual(cm.exception.code, 0)

            output = stdout.getvalue()
            self.assertIn("usage", output)
            self.assertIn("external", output)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@ -1,115 +0,0 @@
|
||||
import argparse
|
||||
import io
|
||||
import unittest
|
||||
from contextlib import redirect_stderr
|
||||
from unittest.mock import patch
|
||||
|
||||
from cli.lib.common.cli_helper import BaseRunner, register_targets, RichHelp, TargetSpec
|
||||
|
||||
|
||||
# ---- Dummy runners for unittests----
|
||||
# The docstring text below is asserted verbatim by
# test_runner_docstring_used_as_description_when_missing; do not reword it.
class FooRunner(BaseRunner):
    """Foo description from docstring."""

    def run(self) -> None:  # replaced by mock
        pass
|
||||
|
||||
|
||||
# Second dummy runner; it has no docstring and no extra arguments.
class BarRunner(BaseRunner):
    def run(self) -> None:  # replaced by mock
        pass
|
||||
|
||||
|
||||
def add_foo_args(p: argparse.ArgumentParser) -> None:
    """Add the required integer option ``--x`` used by the "foo" target."""
    p.add_argument(
        "--x",
        required=True,
        type=int,
        help="x value",
    )
|
||||
|
||||
|
||||
def common_args(p: argparse.ArgumentParser) -> None:
    """Add the boolean ``--verbose`` flag shared by all dummy targets."""
    p.add_argument(
        "--verbose",
        help="verbose flag",
        action="store_true",
    )
|
||||
|
||||
|
||||
def build_parser(specs: dict[str, TargetSpec]) -> argparse.ArgumentParser:
    """Return an "app" parser with ``specs`` registered through register_targets."""
    root = argparse.ArgumentParser(prog="app", formatter_class=RichHelp)
    register_targets(parser=root, target_specs=specs, common_args=common_args)
    return root
|
||||
|
||||
|
||||
def get_subparser(
    parser: argparse.ArgumentParser, name: str
) -> argparse.ArgumentParser:
    """
    Return the sub-parser registered as ``name`` on ``parser``.

    Relies on argparse's private attributes to find the first subparsers action.
    """
    subparser_actions = (
        action
        for action in parser._subparsers._group_actions  # type: ignore[attr-defined]
        if isinstance(action, argparse._SubParsersAction)
    )
    first_action = next(subparser_actions)
    return first_action.choices[name]
|
||||
|
||||
|
||||
class TestRegisterTargets(unittest.TestCase):
    """Tests for ``register_targets`` using the dummy Foo/Bar runners."""

    def test_metavar_lists_targets(self):
        """The subparsers metavar lists the targets in registration order."""
        specs: dict[str, TargetSpec] = {
            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
            "bar": {"runner": BarRunner},
        }
        parser = build_parser(specs)
        subparsers_action = next(
            a
            for a in parser._subparsers._group_actions  # type: ignore[attr-defined]
            if isinstance(a, argparse._SubParsersAction)
        )
        self.assertEqual(subparsers_action.metavar, "{foo,bar}")

    def test_add_arguments_and_common_args_present(self):
        """Target-specific and common arguments both appear in the target's help."""
        specs: dict[str, TargetSpec] = {
            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
        }
        parser = build_parser(specs)
        foo = get_subparser(parser, "foo")
        help_text = foo.format_help()
        self.assertIn("--x", help_text)
        self.assertIn("--verbose", help_text)

    def test_runner_constructed_with_ns_and_run_called(self):
        """``ns.func`` builds the runner from the parsed Namespace and calls run()."""
        specs: dict[str, TargetSpec] = {
            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
        }
        parser = build_parser(specs)

        with (
            patch.object(FooRunner, "__init__", return_value=None) as mock_init,
            patch.object(FooRunner, "run", return_value=None) as mock_run,
        ):
            ns = parser.parse_args(["foo", "--x", "3", "--verbose"])
            ns.func(ns)  # set by register_targets
            # __init__ received the Namespace
            self.assertEqual(mock_init.call_count, 1)
            # the unpacking also checks there was exactly one positional argument
            (called_ns,), _ = mock_init.call_args
            self.assertIsInstance(called_ns, argparse.Namespace)
            # run() called with no args
            mock_run.assert_called_once_with()

    def test_runner_docstring_used_as_description_when_missing(self):
        """Without an explicit description the runner docstring shows up in the help."""
        specs: dict[str, TargetSpec] = {
            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
        }
        parser = build_parser(specs)
        foo = get_subparser(parser, "foo")
        help_text = foo.format_help()
        self.assertIn("Foo description from docstring.", help_text)

    def test_missing_target_raises_systemexit_with_usage(self):
        """Parsing with no target exits and prints usage to stderr."""
        specs: dict[str, TargetSpec] = {"foo": {"runner": FooRunner}}
        parser = build_parser(specs)
        buf = io.StringIO()
        with self.assertRaises(SystemExit), redirect_stderr(buf):
            parser.parse_args([])
        err = buf.getvalue()
        self.assertIn("usage:", err)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this module's tests when it is executed directly as a script.
    unittest.main()
|
||||
@ -1,75 +0,0 @@
|
||||
import unittest
|
||||
from unittest import mock
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import docker.errors as derr
|
||||
from cli.lib.common.docker_helper import _get_client, local_image_exists
|
||||
|
||||
|
||||
class TestDockerImageHelpers(unittest.TestCase):
    """Tests for ``local_image_exists`` and the cached ``_get_client`` helper."""

    def setUp(self):
        # Force the module-level client singleton back to None for every test.
        singleton_reset = mock.patch(
            "cli.lib.common.docker_helper._docker_client", None
        )
        self.addCleanup(singleton_reset.stop)
        singleton_reset.start()

    def test_local_image_exists_true(self):
        """A client whose ``images.get`` returns normally means the image exists."""
        fake_client = MagicMock()
        fake_client.images.get.return_value = object()
        self.assertTrue(local_image_exists("repo:tag", client=fake_client))

    def test_local_image_exists_not_found_false(self):
        """``docker.errors.NotFound`` is reported as 'image missing'."""
        fake_client = MagicMock()
        fake_client.images.get.side_effect = derr.NotFound("nope")
        self.assertFalse(local_image_exists("missing:latest", client=fake_client))

    def test_local_image_exists_api_error_false(self):
        """``docker.errors.APIError`` is also reported as 'image missing'."""
        fake_client = MagicMock()
        fake_client.images.get.side_effect = derr.APIError("boom", None)
        self.assertFalse(local_image_exists("broken:tag", client=fake_client))

    def test_local_image_exists_uses_lazy_singleton(self):
        """``_get_client`` calls ``docker.from_env`` once and then reuses the result."""
        fake_client = MagicMock()
        with mock.patch(
            "cli.lib.common.docker_helper.docker.from_env",
            return_value=fake_client,
        ) as mock_from_env:
            # The first call creates and caches the client; the second must
            # return the same object without another from_env() call.
            for _ in range(2):
                self.assertIs(_get_client(), fake_client)
                mock_from_env.assert_called_once()

    def test_local_image_exists_without_client_param_calls_get_client_once(self):
        """Without ``client=``, each lookup goes through ``_get_client``."""
        fake_client = MagicMock()
        with mock.patch(
            "cli.lib.common.docker_helper._get_client", return_value=fake_client
        ) as mock_get_client:
            image_refs = ("repo:tag", "repo:tag2")
            for ref in image_refs:
                local_image_exists(ref)

            # _get_client is consulted on every call (the caching of
            # docker.from_env happens inside _get_client itself).
            self.assertEqual(mock_get_client.call_count, 2)
            self.assertEqual(fake_client.images.get.call_count, 2)
            for ref in image_refs:
                fake_client.images.get.assert_any_call(ref)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this module's tests when it is executed directly as a script.
    unittest.main()
|
||||
@ -1,149 +0,0 @@
|
||||
import os
|
||||
import unittest
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import cli.lib.common.envs_helper as m
|
||||
|
||||
|
||||
class TestEnvHelpers(unittest.TestCase):
    """Tests for the environment-variable helpers in ``cli.lib.common.envs_helper``."""

    def setUp(self):
        # Snapshot the process environment so tearDown can put it back.
        self._env_backup = dict(os.environ)

    def tearDown(self):
        # Drop whatever the test left behind and reinstate the snapshot.
        os.environ.clear()
        os.environ.update(self._env_backup)

    @staticmethod
    def _only_env(**variables):
        """Return a patcher that makes ``os.environ`` contain exactly ``variables``."""
        return patch.dict(os.environ, variables, clear=True)

    # -------- get_env --------
    def test_get_env_unset_returns_default(self):
        with self._only_env():
            self.assertEqual(m.get_env("FOO", "default"), "default")

    def test_get_env_empty_returns_default(self):
        with self._only_env(FOO=""):
            self.assertEqual(m.get_env("FOO", "default"), "default")

    def test_get_env_set_returns_value(self):
        with self._only_env(FOO="bar"):
            self.assertEqual(m.get_env("FOO", "default"), "bar")

    def test_get_env_not_exist_returns_default(self):
        with self._only_env(FOO="bar"):
            self.assertEqual(m.get_env("TEST_NOT_EXIST", "default"), "default")

    def test_get_env_not_exist_without_default(self):
        with self._only_env(FOO="bar"):
            self.assertEqual(m.get_env("TEST_NOT_EXIST"), "")

    # -------- env_bool --------
    def test_env_bool_uses_default_when_unset(self):
        with self._only_env():
            self.assertTrue(m.env_bool("FLAG", default=True))
            self.assertFalse(m.env_bool("FLAG", default=False))

    def test_env_bool_uses_str2bool_when_set(self):
        # Substitute str2bool so the test does not depend on its exact rules.
        truthy_words = {"1", "true", "yes", "on", "y"}

        def fake_str2bool(s: str) -> bool:
            return s.lower() in truthy_words

        with (
            self._only_env(FLAG="yEs"),
            patch.object(m, "str2bool", fake_str2bool),
        ):
            self.assertTrue(m.env_bool("FLAG", default=False))

    # -------- env_path_optional / env_path --------
    def test_env_path_optional_unset_returns_none_by_default(self):
        with self._only_env():
            self.assertIsNone(m.env_path_optional("P"))

    def test_env_path_optional_unset_returns_none_when_env_var_is_empty(self):
        with self._only_env(P=""):
            self.assertIsNone(m.env_path_optional("P"))

    def test_env_path_optional_unset_returns_default_str(self):
        # A string default with the default resolve=True yields an absolute path.
        default_str = "x/y"
        with self._only_env():
            result = m.env_path_optional("P", default=default_str)
            self.assertIsInstance(result, Path)
            self.assertIsNotNone(result)
            if result is not None:
                self.assertTrue(result.is_absolute())
                self.assertEqual(result.parts[-2:], ("x", "y"))

    def test_env_path_optional_unset_returns_default_path_no_resolve(self):
        default_path = Path("z")
        with self._only_env():
            result = m.env_path_optional("P", default=default_path, resolve=False)
            self.assertEqual(result, default_path)

    def test_env_path_optional_respects_resolve_true(self):
        with self._only_env(P="a/b"):
            result = m.env_path_optional("P", resolve=True)
            self.assertIsInstance(result, Path)
            if result is not None:
                self.assertTrue(result.is_absolute())

    def test_env_path_optional_respects_resolve_false(self):
        with self._only_env(P="rel/dir"):
            result = m.env_path_optional("P", resolve=False)
            self.assertEqual(result, Path("rel/dir"))
            if result is not None:
                self.assertFalse(result.is_absolute())

    def test_env_path_raises_when_missing_and_default_none(self):
        with self._only_env(), self.assertRaises(ValueError):
            m.env_path("P", None, resolve=True)

    def test_env_path_returns_path_when_present(self):
        expected = Path("./b").resolve()
        with self._only_env(P=str(expected)):
            self.assertEqual(m.env_path("P", None, resolve=True), expected)

    # -------- dataclass field helpers --------
    def test_dataclass_fields_read_env_at_instantiation(self):
        @dataclass
        class Cfg:
            flag: bool = m.env_bool_field("FLAG", default=False)
            out: Path = m.env_path_field("OUT", default="ab", resolve=True)
            name: str = m.env_str_field("NAME", default="anon")

        # First instance: every field is supplied through the environment.
        with self._only_env(FLAG="true", OUT="outdir", NAME="alice"):
            first = Cfg()
            self.assertTrue(first.flag)
            self.assertIsInstance(first.out, Path)
            self.assertTrue(first.out.is_absolute())
            self.assertEqual(first.name, "alice")
            first.name = "bob"  # instances stay mutable after creation
            self.assertEqual(first.name, "bob")

        # Second instance under a different environment picks up new values.
        with self._only_env(FLAG="false", NAME=""):
            second = Cfg()
            self.assertFalse(second.flag)  # str2bool("false") -> False
            self.assertIn("ab", str(second.out))
            self.assertIsInstance(second.out, Path)
            self.assertTrue(second.out.is_absolute())
            self.assertEqual(second.name, "anon")  # empty -> fallback to default

    def test_dataclass_path_field_with_default_value(self):
        @dataclass
        class C2:
            out: Path = m.env_path_field("OUT", default="some/dir", resolve=False)

        with self._only_env():
            self.assertEqual(C2().out, Path("some/dir"))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this module's tests when it is executed directly as a script.
    unittest.main()
|
||||
@ -1,122 +0,0 @@
|
||||
# test_path_utils.py
|
||||
# Run: pytest -q
|
||||
|
||||
import os
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
|
||||
from cli.lib.common.path_helper import (
|
||||
copy,
|
||||
ensure_dir_exists,
|
||||
force_create_dir,
|
||||
get_path,
|
||||
is_path_exist,
|
||||
remove_dir,
|
||||
)
|
||||
|
||||
|
||||
class TestPathHelper(unittest.TestCase):
    """Tests for the filesystem helpers imported from ``cli.lib.common.path_helper``."""

    def setUp(self):
        # Every test gets its own scratch directory, removed again in tearDown.
        self.tmpdir = TemporaryDirectory()
        self.tmp_path = Path(self.tmpdir.name)

    def tearDown(self):
        self.tmpdir.cleanup()

    # -------- get_path --------
    def test_get_path_returns_path_for_str(self):
        # Use relative path to avoid absolute-ness
        rel_str = "sub/f.txt"
        previous_cwd = os.getcwd()
        os.chdir(self.tmp_path)
        try:
            p = get_path(rel_str, resolve=False)
        finally:
            # Restore the cwd before tearDown deletes the scratch directory;
            # otherwise the process is left inside a removed directory and
            # later os.getcwd() calls raise FileNotFoundError.
            os.chdir(previous_cwd)
        self.assertIsInstance(p, Path)
        self.assertFalse(p.is_absolute())
        self.assertEqual(str(p), rel_str)

    def test_get_path_resolves(self):
        rel_str = "sub/f.txt"
        p = get_path(str(self.tmp_path / rel_str), resolve=True)
        self.assertTrue(p.is_absolute())
        self.assertTrue(str(p).endswith(rel_str))

    def test_get_path_with_path_input(self):
        p_in = self.tmp_path / "sub/f.txt"
        p_out = get_path(p_in, resolve=False)
        # assertEqual reports both values on failure, unlike assertTrue(a == b).
        self.assertEqual(str(p_out), str(p_in))

    def test_get_path_with_none_raises(self):
        with self.assertRaises(ValueError):
            get_path(None)  # type: ignore[arg-type]

    def test_get_path_invalid_type_raises(self):
        with self.assertRaises(TypeError):
            get_path(123)  # type: ignore[arg-type]

    # -------- ensure_dir_exists / force_create_dir / remove_dir --------
    def test_ensure_dir_exists_creates_and_is_idempotent(self):
        d = self.tmp_path / "made"
        ensure_dir_exists(d)
        self.assertTrue(d.exists() and d.is_dir())
        # A second call on an existing directory must not raise.
        ensure_dir_exists(d)

    def test_force_create_dir_clears_existing(self):
        d = self.tmp_path / "fresh"
        (d / "inner").mkdir(parents=True)
        (d / "inner" / "f.txt").write_text("x")
        force_create_dir(d)
        self.assertTrue(d.exists())
        self.assertEqual(list(d.iterdir()), [])

    def test_remove_dir_none_is_noop(self):
        remove_dir(None)  # type: ignore[arg-type]

    def test_remove_dir_nonexistent_is_noop(self):
        ghost = self.tmp_path / "ghost"
        remove_dir(ghost)

    def test_remove_dir_accepts_str(self):
        d = self.tmp_path / "to_rm"
        d.mkdir()
        remove_dir(str(d))
        self.assertFalse(d.exists())

    # -------- copy --------
    def test_copy_file_to_file(self):
        src = self.tmp_path / "src.txt"
        # The destination's parent directory ("out") does not exist beforehand.
        dst = self.tmp_path / "out" / "dst.txt"
        src.write_text("hello")
        copy(src, dst)
        self.assertEqual(dst.read_text(), "hello")

    def test_copy_dir_to_new_dir(self):
        src = self.tmp_path / "srcdir"
        (src / "a").mkdir(parents=True)
        (src / "a" / "f.txt").write_text("content")
        dst = self.tmp_path / "destdir"
        copy(src, dst)
        self.assertEqual((dst / "a" / "f.txt").read_text(), "content")

    def test_copy_dir_into_existing_dir_overwrite_true_merges(self):
        src = self.tmp_path / "srcdir"
        dst = self.tmp_path / "destdir"
        (src / "x").mkdir(parents=True)
        (src / "x" / "new.txt").write_text("new")
        dst.mkdir()
        (dst / "existing.txt").write_text("old")
        copy(src, dst)
        # Pre-existing content survives and the new tree is added alongside it.
        self.assertEqual((dst / "existing.txt").read_text(), "old")
        self.assertEqual((dst / "x" / "new.txt").read_text(), "new")

    def test_is_str_path_exist(self):
        p = self.tmp_path / "x.txt"
        p.write_text("1")
        self.assertTrue(is_path_exist(str(p)))
        self.assertTrue(is_path_exist(p))
        self.assertFalse(is_path_exist(str(self.tmp_path / "missing")))
        self.assertFalse(is_path_exist(self.tmp_path / "missing"))
        self.assertFalse(is_path_exist(""))
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this module's tests when it is executed directly as a script.
    unittest.main()
|
||||
@ -1,185 +0,0 @@
|
||||
# tests/test_run_test_plan.py
|
||||
import importlib
|
||||
from contextlib import nullcontext
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import MagicMock
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
# Dotted import path of the module under test.
MOD = "cli.lib.core.vllm.lib"

# Dotted path of run_test_plan, derived from MOD so the two stay in sync.
run_test_plan_import_path = MOD + ".run_test_plan"
|
||||
|
||||
|
||||
def _get_cmd(c):
|
||||
# Support both kwargs and positional args
|
||||
return c.kwargs.get("cmd", c.args[0] if c.args else None)
|
||||
|
||||
|
||||
def _get_check(c):
|
||||
if "check" in c.kwargs:
|
||||
return c.kwargs["check"]
|
||||
# If positional, assume second arg is 'check' when present; default False
|
||||
return c.args[1] if len(c.args) > 1 else False
|
||||
|
||||
|
||||
@pytest.fixture
def patch_module(monkeypatch):
    """
    Replace 'pip_install_packages', 'run_command', 'working_directory',
    'temp_environ' and 'logger' in the target module with test doubles, and
    return a namespace exposing the module, the doubles and the call records.
    """
    module = importlib.import_module(MOD)

    # Call records for the two context-manager helpers.
    temp_calls: list[dict] = []
    workdir_calls: list[str] = []

    def fake_working_directory(path: str):
        # Record the requested directory; do not actually change directory.
        workdir_calls.append(path)
        return nullcontext()

    def fake_temp_env(map: dict[str, str]):
        # Record the requested variables; do not touch os.environ.
        temp_calls.append(map)
        return nullcontext()

    pip_install_packages = MagicMock(name="pip_install_packages")
    run_command = MagicMock(name="run_command", return_value=0)
    logger = SimpleNamespace(
        info=MagicMock(name="logger.info"),
        error=MagicMock(name="logger.error"),
    )

    # raising=True makes the fixture fail if the module lacks the attribute.
    replacements = {
        "pip_install_packages": pip_install_packages,
        "run_command": run_command,
        "working_directory": fake_working_directory,
        "temp_environ": fake_temp_env,
        "logger": logger,
    }
    for attr_name, double in replacements.items():
        monkeypatch.setattr(module, attr_name, double, raising=True)

    return SimpleNamespace(
        module=module,
        run_test_plan=module.run_test_plan,  # expose to avoid getattr("constant") (Ruff B009)
        pip_install_packages=pip_install_packages,
        run_command=run_command,
        temp_calls=temp_calls,
        workdir_calls=workdir_calls,
        logger=logger,
    )
|
||||
|
||||
|
||||
def test_success_runs_all_steps_and_uses_env_and_workdir(monkeypatch, patch_module):
    """A passing plan runs each step once with the plan's env vars and working dir."""
    expected_steps = (
        "export A=x && pytest -q",
        "export B=y && pytest -q tests/unit",
    )
    plan = {
        "title": "Basic suite",
        "package_install": [],
        "working_directory": "tests",
        "env_vars": {"GLOBAL_FLAG": "1"},
        "steps": list(expected_steps),
    }

    # Queue successful exit codes; the plan has two steps, so the third
    # queued value is never used.
    patch_module.run_command.side_effect = [0, 0, 0]

    patch_module.run_test_plan("basic", "cpu", {"basic": plan})

    recorded = patch_module.run_command.call_args_list
    commands = [_get_cmd(entry) for entry in recorded]
    check_flags = [_get_check(entry) for entry in recorded]

    assert commands == list(expected_steps)
    assert all(flag is False for flag in check_flags)

    assert patch_module.workdir_calls == ["tests"]
    assert patch_module.temp_calls == [{"GLOBAL_FLAG": "1"}]
|
||||
|
||||
|
||||
def test_installs_packages_when_present(monkeypatch, patch_module):
    """Packages listed under 'package_install' are installed in one call, preferring uv."""
    required_packages = ["timm==1.0.0", "flash-attn"]
    plan = {
        "title": "Needs deps",
        "package_install": list(required_packages),
        "steps": ["pytest -q"],
    }
    patch_module.run_command.return_value = 0

    patch_module.module.run_test_plan("with_pkgs", "gpu", {"with_pkgs": plan})

    patch_module.pip_install_packages.assert_called_once_with(
        packages=required_packages,
        prefer_uv=True,
    )
|
||||
|
||||
|
||||
def test_raises_on_missing_plan(patch_module):
    """Asking for a plan name absent from the map raises RuntimeError naming it."""
    with pytest.raises(RuntimeError) as excinfo:
        patch_module.module.run_test_plan("nope", "cpu", tests_map={})

    message = str(excinfo.value)
    assert "test nope not found" in message
|
||||
|
||||
|
||||
def test_aggregates_failures_and_raises(monkeypatch, patch_module):
    """All steps run even after a failure, and the failures are reported together."""
    # Each step paired with the exit code the fake run_command returns for it.
    step_outcomes = [
        ("pytest test_a.py", 0),  # pass
        ("pytest test_b.py", 1),  # fail
        ("pytest test_c.py", 2),  # fail
    ]
    tests_map = {
        "mix": {
            "title": "Some pass some fail",
            "steps": [step for step, _ in step_outcomes],
        }
    }
    patch_module.run_command.side_effect = [code for _, code in step_outcomes]

    with pytest.raises(RuntimeError) as excinfo:
        patch_module.module.run_test_plan("mix", "cpu", tests_map)

    assert "2 pytest runs failed" in str(excinfo.value)
    # The failed steps are logged through a single logger.error call.
    patch_module.logger.error.assert_called_once()
    # No step was skipped because an earlier one failed.
    assert patch_module.run_command.call_count == 3
|
||||
|
||||
|
||||
def test_custom_working_directory_used(patch_module):
    """The plan's 'working_directory' is the directory passed to working_directory()."""
    plan = {
        "title": "Custom wd",
        "working_directory": "examples/ci",
        "steps": ["pytest -q"],
    }
    patch_module.run_command.return_value = 0

    patch_module.module.run_test_plan("customwd", "cpu", {"customwd": plan})

    assert patch_module.workdir_calls == ["examples/ci"]
|
||||
@ -1,143 +0,0 @@
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from cli.lib.common.utils import temp_environ, working_directory # <-- replace import
|
||||
|
||||
|
||||
class EnvIsolatedTestCase(unittest.TestCase):
    """Base class that snapshots os.environ and CWD for isolation.

    ``setUp`` records the environment and the current working directory and
    then switches into a fresh temporary directory. ``tearDown`` switches
    back, removes that directory and restores the environment.
    """

    def setUp(self):
        self._env_backup = dict(os.environ)

        # Snapshot/repair CWD if it's gone
        try:
            self._cwd_backup = os.getcwd()
        except FileNotFoundError:
            # If CWD no longer exists, switch to a safe place and record that
            self._cwd_backup = tempfile.gettempdir()
            os.chdir(self._cwd_backup)

        # Create a temporary directory for the test to run in
        self._temp_dir = tempfile.mkdtemp()
        os.chdir(self._temp_dir)

    def tearDown(self):
        # Imported here because the module-level imports do not include shutil.
        import shutil

        # Restore cwd first (before cleaning up temp dir)
        try:
            os.chdir(self._cwd_backup)
        except OSError:
            os.chdir(tempfile.gettempdir())

        # Best-effort removal: ignore_errors=True already suppresses failures,
        # so no surrounding try/except is needed.
        shutil.rmtree(self._temp_dir, ignore_errors=True)

        # Restore env: drop variables added during the test, then reset the
        # rest to their snapshotted values.
        for added_key in set(os.environ) - set(self._env_backup):
            os.environ.pop(added_key, None)
        os.environ.update(self._env_backup)
|
||||
|
||||
|
||||
class TestTempEnviron(EnvIsolatedTestCase):
    """Tests that ``temp_environ`` applies overrides and undoes them on exit."""

    def test_sets_and_restores_new_var(self):
        name = "TEST_TMP_ENV_NEW"
        self.assertNotIn(name, os.environ)

        with temp_environ({name: "123"}):
            self.assertEqual(os.environ[name], "123")

        # A variable that did not exist before is removed again.
        self.assertNotIn(name, os.environ)

    def test_overwrites_and_restores_existing_var(self):
        name = "TEST_TMP_ENV_OVERWRITE"
        os.environ[name] = "orig"

        with temp_environ({name: "override"}):
            self.assertEqual(os.environ[name], "override")

        # A pre-existing variable gets its old value back.
        self.assertEqual(os.environ[name], "orig")

    def test_multiple_vars_and_missing_cleanup(self):
        fresh_name = "TEST_ENV_V1"
        existing_name = "TEST_ENV_V2"
        os.environ.pop(fresh_name, None)
        os.environ[existing_name] = "keep"

        overrides = {fresh_name: "a", existing_name: "b"}
        with temp_environ(overrides):
            self.assertEqual(os.environ[fresh_name], "a")
            self.assertEqual(os.environ[existing_name], "b")

        self.assertNotIn(fresh_name, os.environ)  # newly-added -> removed
        self.assertEqual(os.environ[existing_name], "keep")  # pre-existing -> restored

    def test_restores_even_on_exception(self):
        name = "TEST_TMP_ENV_EXCEPTION"
        self.assertNotIn(name, os.environ)

        with self.assertRaises(RuntimeError):
            with temp_environ({name: "x"}):
                self.assertEqual(os.environ[name], "x")
                raise RuntimeError("boom")

        # Cleanup must also happen when the body raises.
        self.assertNotIn(name, os.environ)
|
||||
|
||||
|
||||
class TestWorkingDirectory(EnvIsolatedTestCase):
    """Tests that ``working_directory`` changes the cwd and restores it on exit."""

    def test_changes_and_restores(self):
        initial_cwd = Path.cwd()
        with tempfile.TemporaryDirectory() as scratch:
            destination = Path(scratch) / "wd"
            destination.mkdir()

            with working_directory(str(destination)):
                self.assertEqual(Path.cwd().resolve(), destination.resolve())

        self.assertEqual(Path.cwd(), initial_cwd)

    def test_noop_when_empty_path(self):
        initial_cwd = Path.cwd()
        with working_directory(""):
            # An empty path must leave the cwd untouched inside the block ...
            self.assertEqual(Path.cwd(), initial_cwd)
        # ... and after it.
        self.assertEqual(Path.cwd(), initial_cwd)

    def test_restores_on_exception(self):
        initial_cwd = Path.cwd()

        with tempfile.TemporaryDirectory() as scratch:
            destination = Path(scratch) / "wd_exc"
            destination.mkdir()

            with self.assertRaises(ValueError):
                with working_directory(str(destination)):
                    # Normalize both sides to handle /var -> /private/var
                    self.assertEqual(Path.cwd().resolve(), destination.resolve())
                    raise ValueError("boom")

        self.assertEqual(Path.cwd().resolve(), initial_cwd.resolve())

    def test_raises_for_missing_dir(self):
        initial_cwd = Path.cwd()
        with tempfile.TemporaryDirectory() as scratch:
            absent = Path(scratch) / "does_not_exist"
            with self.assertRaises(FileNotFoundError):
                # os.chdir should raise before yielding
                with working_directory(str(absent)):
                    pass
        self.assertEqual(Path.cwd(), initial_cwd)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run this module's tests with verbose output when executed directly.
    unittest.main(verbosity=2)
|
||||
@ -1,176 +0,0 @@
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import cli.lib.core.vllm.vllm_build as vllm_build
|
||||
|
||||
|
||||
# Dotted path of the module under test; used to build mock.patch targets.
_VLLM_BUILD_MODULE = "cli.lib.core.vllm.vllm_build"
|
||||
|
||||
|
||||
class TestVllmBuildParameters(unittest.TestCase):
    """Tests for the environment-driven validation in ``VllmBuildParameters``."""

    def _chdir_to_fresh_tempdir(self) -> None:
        """Run the rest of the test inside a new temporary directory.

        The previous working directory is restored and the temporary
        directory removed once the test finishes, so the process is never
        left inside a deleted directory.
        """
        try:
            previous_cwd = os.getcwd()
        except FileNotFoundError:
            # The inherited cwd was already deleted; fall back to a safe place.
            previous_cwd = tempfile.gettempdir()
        scratch = tempfile.TemporaryDirectory()
        # Cleanups run last-in-first-out: the cwd is restored first, then the
        # scratch directory is deleted.
        self.addCleanup(scratch.cleanup)
        self.addCleanup(os.chdir, previous_cwd)
        os.chdir(scratch.name)

    @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=True)
    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=True)
    @patch(
        "cli.lib.common.envs_helper.env_path_optional",
        side_effect=lambda name, default=None, resolve=True: {
            "DOCKERFILE_PATH": Path("/abs/vllm/Dockerfile"),
            "TORCH_WHEELS_PATH": Path("/abs/dist"),
            "OUTPUT_DIR": Path("/abs/shared"),
        }.get(name, Path(default) if default is not None else None),
    )
    @patch.dict(
        os.environ,
        {
            "USE_TORCH_WHEEL": "1",
            "USE_LOCAL_BASE_IMAGE": "1",
            "USE_LOCAL_DOCKERFILE": "1",
            "BASE_IMAGE": "my/image:tag",
            "DOCKERFILE_PATH": "vllm/Dockerfile",
            "TORCH_WHEELS_PATH": "dist",
            "OUTPUT_DIR": "shared",
        },
        clear=True,
    )
    def test_params_success_normalizes_and_validates(
        self, mock_env_path, mock_is_path, mock_local_img
    ):
        """With every check mocked to succeed, the fields hold the patched absolute paths."""
        params = vllm_build.VllmBuildParameters()
        self.assertEqual(params.torch_whls_path, Path("/abs/dist"))
        self.assertEqual(params.dockerfile_path, Path("/abs/vllm/Dockerfile"))
        self.assertEqual(params.output_dir, Path("/abs/shared"))
        self.assertEqual(params.base_image, "my/image:tag")

    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False)
    @patch.dict(
        os.environ, {"USE_TORCH_WHEEL": "1", "TORCH_WHEELS_PATH": "dist"}, clear=True
    )
    def test_params_missing_torch_whls_raises(self, _is_path):
        """A missing wheels path raises ValueError mentioning TORCH_WHEELS_PATH."""
        self._chdir_to_fresh_tempdir()
        with self.assertRaises(ValueError) as cm:
            vllm_build.VllmBuildParameters(
                use_local_base_image=False,
                use_local_dockerfile=False,
            )
        err = cm.exception
        self.assertIn("TORCH_WHEELS_PATH", str(err))

    @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=False)
    @patch.dict(
        os.environ, {"USE_LOCAL_BASE_IMAGE": "1", "BASE_IMAGE": "img:tag"}, clear=True
    )
    def test_params_missing_local_base_image_raises(self, _local_img):
        """A base image that is not available locally raises ValueError mentioning BASE_IMAGE."""
        self._chdir_to_fresh_tempdir()
        with self.assertRaises(ValueError) as cm:
            vllm_build.VllmBuildParameters(
                use_torch_whl=False,
                use_local_dockerfile=False,
            )
        err = cm.exception
        self.assertIn("BASE_IMAGE", str(err))

    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False)
    @patch.dict(
        os.environ,
        {"USE_LOCAL_DOCKERFILE": "1", "DOCKERFILE_PATH": "Dockerfile"},
        clear=True,
    )
    def test_params_missing_dockerfile_raises(self, _is_path):
        """A missing Dockerfile raises ValueError mentioning DOCKERFILE_PATH."""
        self._chdir_to_fresh_tempdir()
        with self.assertRaises(ValueError) as cm:
            vllm_build.VllmBuildParameters(
                use_torch_whl=False,
                use_local_base_image=False,
            )
        err = cm.exception
        self.assertIn("DOCKERFILE_PATH", str(err))

    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False)
    @patch.dict(
        os.environ,
        {"OUTPUT_DIR": ""},
        clear=True,
    )
    def test_params_missing_output_dir(self, _is_path):
        """An empty OUTPUT_DIR makes construction fail with FileNotFoundError."""
        with self.assertRaises(FileNotFoundError):
            vllm_build.VllmBuildParameters()
|
||||
|
||||
|
||||
class TestBuildCmdAndRun(unittest.TestCase):
    """Tests for ``VllmBuildRunner``: docker command generation and ``run()``."""

    @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=True)
    def test_generate_docker_build_cmd_includes_bits(self, _exists):
        """The generated command contains every expected flag and build argument."""
        runner = vllm_build.VllmBuildRunner()
        inputs = MagicMock()
        inputs.output_dir = Path("/abs/out")
        inputs.use_local_base_image = True
        inputs.base_image = "img:tag"
        inputs.torch_whls_path = Path("./vllm/tmp")
        inputs.max_jobs = 64
        inputs.cuda_version = "12.8.1"
        inputs.python_version = "3.12"
        inputs.sccache_bucket = "my-bucket"
        inputs.sccache_region = "us-west-2"
        inputs.torch_cuda_arch_list = "8.0;9.0"
        inputs.target_stage = "export-wheels"
        inputs.tag_name = "vllm-wheels"

        cmd = runner._generate_docker_build_cmd(inputs)
        # Collapse every whitespace run to one space so that line breaks and
        # indentation in the generated command do not affect the checks.
        squashed = " ".join(cmd.split())

        expected_fragments = (
            "--output type=local,dest=/abs/out",
            "-f docker/Dockerfile.nightly_torch",
            "--pull=false",
            "--build-arg TORCH_WHEELS_PATH=tmp",
            "--build-arg BUILD_BASE_IMAGE=img:tag",
            "--build-arg FINAL_BASE_IMAGE=img:tag",
            "--build-arg max_jobs=64",
            "--build-arg CUDA_VERSION=12.8.1",
            "--build-arg PYTHON_VERSION=3.12",
            "--build-arg USE_SCCACHE=1",
            "--build-arg SCCACHE_BUCKET_NAME=my-bucket",
            "--build-arg SCCACHE_REGION_NAME=us-west-2",
            "--build-arg torch_cuda_arch_list='8.0;9.0'",
            "--target export-wheels",
            "-t vllm-wheels",
        )
        # subTest reports every missing fragment instead of stopping at the first.
        for fragment in expected_fragments:
            with self.subTest(fragment=fragment):
                self.assertIn(fragment, squashed)

    @patch(f"{_VLLM_BUILD_MODULE}.run_command")
    @patch(f"{_VLLM_BUILD_MODULE}.ensure_dir_exists")
    @patch(f"{_VLLM_BUILD_MODULE}.clone_vllm")
    @patch.object(
        vllm_build.VllmBuildRunner,
        "_generate_docker_build_cmd",
        return_value="docker buildx ...",
    )
    @patch.dict(
        os.environ,
        {
            "USE_TORCH_WHEEL": "0",
            "USE_LOCAL_BASE_IMAGE": "0",
            "USE_LOCAL_DOCKERFILE": "0",
            "OUTPUT_DIR": "shared",
        },
        clear=True,
    )
    def test_run_calls_clone_prepare_and_build(
        self, mock_gen, mock_clone, mock_ensure, mock_run
    ):
        """``run()`` clones, ensures the output dir, builds the command and runs it in 'vllm'."""
        params = MagicMock()
        params.output_dir = Path("shared")
        params.use_local_dockerfile = False
        params.use_torch_whl = False

        with patch(f"{_VLLM_BUILD_MODULE}.VllmBuildParameters", return_value=params):
            runner = vllm_build.VllmBuildRunner()
            runner.run()

        mock_clone.assert_called_once()
        mock_ensure.assert_called_once_with(Path("shared"))
        mock_gen.assert_called_once_with(params)
        mock_run.assert_called_once()
        _, kwargs = mock_run.call_args
        # Use a unittest assertion rather than a bare assert, which is
        # stripped when Python runs with -O.
        self.assertEqual(kwargs.get("cwd"), "vllm")
|
||||
@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
|
||||
magma/build_magma.sh
|
||||
|
||||
.PHONY: all
|
||||
all: magma-cuda130
|
||||
all: magma-cuda129
|
||||
all: magma-cuda128
|
||||
all: magma-cuda126
|
||||
@ -26,12 +25,6 @@ clean:
|
||||
$(RM) -r magma-*
|
||||
$(RM) -r output
|
||||
|
||||
.PHONY: magma-cuda130
|
||||
magma-cuda130: DESIRED_CUDA := 13.0
|
||||
magma-cuda130: CUDA_ARCH_LIST := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
|
||||
magma-cuda130:
|
||||
$(DOCKER_RUN)
|
||||
|
||||
.PHONY: magma-cuda129
|
||||
magma-cuda129: DESIRED_CUDA := 12.9
|
||||
magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
|
||||
|
||||
@ -28,7 +28,6 @@ pushd ${PACKAGE_DIR}/magma-${MAGMA_VERSION}
|
||||
patch < ${PACKAGE_FILES}/CMake.patch
|
||||
patch < ${PACKAGE_FILES}/cmakelists.patch
|
||||
patch -p0 < ${PACKAGE_FILES}/thread_queue.patch
|
||||
patch -p1 < ${PACKAGE_FILES}/cuda13.patch
|
||||
patch -p1 < ${PACKAGE_FILES}/getrf_shfl.patch
|
||||
patch -p1 < ${PACKAGE_FILES}/getrf_nbparam.patch
|
||||
# The build.sh script expects to be executed from the sources root folder
|
||||
@ -38,7 +37,6 @@ popd
|
||||
# Package recipe, license and tarball
|
||||
# Folder and package name are backward compatible for the build workflow
|
||||
cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh
|
||||
cp ${PACKAGE_FILES}/cuda13.patch ${PACKAGE_RECIPE}/cuda13.patch
|
||||
cp ${PACKAGE_FILES}/thread_queue.patch ${PACKAGE_RECIPE}/thread_queue.patch
|
||||
cp ${PACKAGE_FILES}/cmakelists.patch ${PACKAGE_RECIPE}/cmakelists.patch
|
||||
cp ${PACKAGE_FILES}/getrf_shfl.patch ${PACKAGE_RECIPE}/getrf_shfl.patch
|
||||
|
||||
@ -1,26 +0,0 @@
|
||||
diff --git a/interface_cuda/interface.cpp b/interface_cuda/interface.cpp
|
||||
index 73fed1b20..e77519bfe 100644
|
||||
--- a/interface_cuda/interface.cpp
|
||||
+++ b/interface_cuda/interface.cpp
|
||||
@@ -438,14 +438,20 @@ magma_print_environment()
|
||||
cudaDeviceProp prop;
|
||||
err = cudaGetDeviceProperties( &prop, dev );
|
||||
check_error( err );
|
||||
+ #ifdef MAGMA_HAVE_CUDA
|
||||
+#if CUDA_VERSION < 13000
|
||||
printf( "%% device %d: %s, %.1f MHz clock, %.1f MiB memory, capability %d.%d\n",
|
||||
dev,
|
||||
prop.name,
|
||||
prop.clockRate / 1000.,
|
||||
+#else
|
||||
+ printf( "%% device %d: %s, ??? MHz clock, %.1f MiB memory, capability %d.%d\n",
|
||||
+ dev,
|
||||
+ prop.name,
|
||||
+#endif
|
||||
prop.totalGlobalMem / (1024.*1024.),
|
||||
prop.major,
|
||||
prop.minor );
|
||||
- #ifdef MAGMA_HAVE_CUDA
|
||||
int arch = prop.major*100 + prop.minor*10;
|
||||
if ( arch < MAGMA_CUDA_ARCH_MIN ) {
|
||||
printf("\n"
|
||||
@ -5,6 +5,10 @@ set -ex
|
||||
SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
|
||||
|
||||
case "${GPU_ARCH_TYPE:-BLANK}" in
|
||||
BLANK)
|
||||
# Legacy behavior for CircleCI
|
||||
bash "${SCRIPTPATH}/build_cuda.sh"
|
||||
;;
|
||||
cuda)
|
||||
bash "${SCRIPTPATH}/build_cuda.sh"
|
||||
;;
|
||||
|
||||
@ -66,9 +66,6 @@ case ${CUDA_VERSION} in
|
||||
TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
|
||||
fi
|
||||
;;
|
||||
13.0)
|
||||
TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
|
||||
;;
|
||||
12.6)
|
||||
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
|
||||
;;
|
||||
@ -113,15 +110,11 @@ DEPS_SONAME=(
|
||||
)
|
||||
|
||||
|
||||
# CUDA_VERSION 12.*, 13.*
|
||||
if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
|
||||
# CUDA_VERSION 12.6, 12.8, 12.9
|
||||
if [[ $CUDA_VERSION == 12* ]]; then
|
||||
export USE_STATIC_CUDNN=0
|
||||
# Try parallelizing nvcc as well
|
||||
TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
|
||||
# Compress the fatbin with -compress-mode=size for CUDA 13
|
||||
if [[ $CUDA_VERSION == 13* ]]; then
|
||||
export TORCH_NVCC_FLAGS="$TORCH_NVCC_FLAGS -compress-mode=size"
|
||||
fi
|
||||
export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
|
||||
if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
|
||||
echo "Bundling with cudnn and cublas."
|
||||
DEPS_LIST+=(
|
||||
@ -141,7 +134,6 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
|
||||
"/usr/local/cuda/lib64/libnvrtc-builtins.so"
|
||||
"/usr/local/cuda/lib64/libcufile.so.0"
|
||||
"/usr/local/cuda/lib64/libcufile_rdma.so.1"
|
||||
"/usr/local/cuda/lib64/libnvshmem_host.so.3"
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12"
|
||||
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so"
|
||||
)
|
||||
@ -160,7 +152,6 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
|
||||
"libcudart.so.12"
|
||||
"libnvrtc.so.12"
|
||||
"libnvrtc-builtins.so"
|
||||
"libnvshmem_host.so.3"
|
||||
"libcufile.so.0"
|
||||
"libcufile_rdma.so.1"
|
||||
"libcupti.so.12"
|
||||
@ -174,29 +165,22 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
|
||||
else
|
||||
echo "Using nvidia libs from pypi."
|
||||
CUDA_RPATHS=(
|
||||
'$ORIGIN/../../nvidia/cublas/lib'
|
||||
'$ORIGIN/../../nvidia/cuda_cupti/lib'
|
||||
'$ORIGIN/../../nvidia/cuda_nvrtc/lib'
|
||||
'$ORIGIN/../../nvidia/cuda_runtime/lib'
|
||||
'$ORIGIN/../../nvidia/cudnn/lib'
|
||||
'$ORIGIN/../../nvidia/nvshmem/lib'
|
||||
'$ORIGIN/../../nvidia/nccl/lib'
|
||||
'$ORIGIN/../../nvidia/cufft/lib'
|
||||
'$ORIGIN/../../nvidia/curand/lib'
|
||||
'$ORIGIN/../../nvidia/cusolver/lib'
|
||||
'$ORIGIN/../../nvidia/cusparse/lib'
|
||||
'$ORIGIN/../../nvidia/cusparselt/lib'
|
||||
'$ORIGIN/../../cusparselt/lib'
|
||||
'$ORIGIN/../../nvidia/nccl/lib'
|
||||
'$ORIGIN/../../nvidia/nvshmem/lib'
|
||||
'$ORIGIN/../../nvidia/nvtx/lib'
|
||||
'$ORIGIN/../../nvidia/cufile/lib'
|
||||
)
|
||||
if [[ $CUDA_VERSION == 13* ]]; then
|
||||
CUDA_RPATHS+=('$ORIGIN/../../nvidia/cu13/lib')
|
||||
else
|
||||
CUDA_RPATHS+=(
|
||||
'$ORIGIN/../../nvidia/cublas/lib'
|
||||
'$ORIGIN/../../nvidia/cuda_cupti/lib'
|
||||
'$ORIGIN/../../nvidia/cuda_nvrtc/lib'
|
||||
'$ORIGIN/../../nvidia/cuda_runtime/lib'
|
||||
'$ORIGIN/../../nvidia/cufft/lib'
|
||||
'$ORIGIN/../../nvidia/curand/lib'
|
||||
'$ORIGIN/../../nvidia/cusolver/lib'
|
||||
'$ORIGIN/../../nvidia/cusparse/lib'
|
||||
'$ORIGIN/../../cusparselt/lib'
|
||||
'$ORIGIN/../../nvidia/nvtx/lib'
|
||||
'$ORIGIN/../../nvidia/cufile/lib'
|
||||
)
|
||||
fi
|
||||
|
||||
CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
|
||||
export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
|
||||
export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
|
||||
|
||||
@ -25,7 +25,6 @@ source /opt/intel/oneapi/mpi/latest/env/vars.sh
|
||||
export USE_STATIC_MKL=1
|
||||
export USE_ONEMKL=1
|
||||
export USE_XCCL=1
|
||||
export USE_MPI=0
|
||||
|
||||
WHEELHOUSE_DIR="wheelhousexpu"
|
||||
LIBTORCH_HOUSE_DIR="libtorch_housexpu"
|
||||
|
||||
@ -50,6 +50,9 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
|
||||
export ATEN_THREADING=NATIVE
|
||||
fi
|
||||
|
||||
# Enable LLVM dependency for TensorExpr testing
|
||||
export USE_LLVM=/opt/llvm
|
||||
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
|
||||
|
||||
if ! which conda; then
|
||||
# In ROCm CIs, we are doing cross compilation on build machines with
|
||||
@ -92,27 +95,6 @@ if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
|
||||
export ACL_ROOT_DIR=/ComputeLibrary
|
||||
fi
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then
|
||||
if [[ -f /opt/riscv-cross-env/bin/activate ]]; then
|
||||
# shellcheck disable=SC1091
|
||||
source /opt/riscv-cross-env/bin/activate
|
||||
else
|
||||
echo "Activation file not found"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
export CMAKE_CROSSCOMPILING=TRUE
|
||||
export CMAKE_SYSTEM_NAME=Linux
|
||||
export CMAKE_SYSTEM_PROCESSOR=riscv64
|
||||
|
||||
export USE_CUDA=0
|
||||
export USE_MKLDNN=0
|
||||
|
||||
export SLEEF_TARGET_EXEC_USE_QEMU=ON
|
||||
sudo chown -R jenkins /var/lib/jenkins/workspace /opt
|
||||
|
||||
fi
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then
|
||||
POSSIBLE_JAVA_HOMES=()
|
||||
POSSIBLE_JAVA_HOMES+=(/usr/local)
|
||||
@ -173,7 +155,6 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
||||
source /opt/intel/oneapi/mpi/latest/env/vars.sh
|
||||
# Enable XCCL build
|
||||
export USE_XCCL=1
|
||||
export USE_MPI=0
|
||||
# XPU kineto feature dependencies are not fully ready; disable the kineto build as a temporary workaround
|
||||
export USE_KINETO=0
|
||||
export TORCH_XPU_ARCH_LIST=pvc
|
||||
@ -195,16 +176,8 @@ fi
|
||||
|
||||
# We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
|
||||
# memory to build and will OOM
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then
|
||||
J=2 # default to 2 jobs
|
||||
case "$RUNNER" in
|
||||
linux.12xlarge.memory|linux.24xlarge.memory)
|
||||
J=24
|
||||
;;
|
||||
esac
|
||||
echo "Building FlashAttention with job limit $J"
|
||||
export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j ${J}"
|
||||
export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2"
|
||||
fi
|
||||
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
|
||||
@ -219,6 +192,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
|
||||
export USE_ASAN=1
|
||||
export REL_WITH_DEB_INFO=1
|
||||
export UBSAN_FLAGS="-fno-sanitize-recover=all"
|
||||
unset USE_LLVM
|
||||
fi
|
||||
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then
|
||||
@ -239,7 +213,7 @@ fi
|
||||
|
||||
# Do not change workspace permissions for ROCm and s390x CI jobs
|
||||
# as it can leave workspace with bad permissions for cancelled jobs
|
||||
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && -d /var/lib/jenkins/workspace ]]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
|
||||
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
|
||||
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
|
||||
cleanup_workspace() {
|
||||
@ -284,7 +258,8 @@ else
|
||||
# XLA test build fails when WERROR=1
|
||||
# set only when building other architectures
|
||||
# or building non-XLA tests.
|
||||
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *xla* && "$BUILD_ENVIRONMENT" != *riscv64* ]]; then
|
||||
if [[ "$BUILD_ENVIRONMENT" != *rocm* &&
|
||||
"$BUILD_ENVIRONMENT" != *xla* ]]; then
|
||||
# Install numpy-2.0.2 for builds which are backward compatible with 1.X
|
||||
python -mpip install numpy==2.0.2
|
||||
|
||||
@ -421,7 +396,7 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];
|
||||
# don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
|
||||
python tools/stats/export_test_times.py
|
||||
fi
|
||||
# don't do this for bazel or s390x or riscv64 as they don't use sccache
|
||||
if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
|
||||
# don't do this for bazel or s390x as they don't use sccache
|
||||
if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
|
||||
print_sccache_stats
|
||||
fi
|
||||
|
||||
@ -300,3 +300,24 @@ except RuntimeError as e:
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
###############################################################################
|
||||
# Check for C++ ABI compatibility to GCC-11 - GCC 13
|
||||
###############################################################################
|
||||
if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" == 'manywheel' ]]; then
|
||||
pushd /tmp
|
||||
# Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html
|
||||
# gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19
|
||||
# gcc 11 - CUDA 11.8, xpu, rocm
|
||||
# gcc 13 - CUDA 12.6, 12.8 and cpu
|
||||
# Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426
|
||||
if [[ "$(uname -m)" == "s390x" ]]; then
|
||||
cxx_abi="19"
|
||||
elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then
|
||||
cxx_abi="18"
|
||||
else
|
||||
cxx_abi="16"
|
||||
fi
|
||||
python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)"
|
||||
popd
|
||||
fi
|
||||
|
||||
@ -149,19 +149,6 @@ function get_pinned_commit() {
|
||||
cat .github/ci_commit_pins/"${1}".txt
|
||||
}
|
||||
|
||||
# Export TORCH_CUDA_ARCH_LIST when BUILD_ENVIRONMENT names a CUDA build.
# - If nvidia-smi is on PATH, the value is the last line of its
#   compute_cap CSV query output.
# - Otherwise, when TEST_CONFIG contains "nogpu", the value is 8.0.
# - In any other case the variable is exported with whatever value it
#   already has.
# Non-CUDA build environments are left untouched.
function detect_cuda_arch() {
  # Guard: nothing to do unless this is a CUDA build environment.
  if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
    return 0
  fi

  if command -v nvidia-smi; then
    TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1)
  elif [[ "${TEST_CONFIG}" == *nogpu* ]]; then
    # There won't be nvidia-smi in nogpu tests, so just set TORCH_CUDA_ARCH_LIST
    # to the default minimum supported value here
    TORCH_CUDA_ARCH_LIST=8.0
  fi

  export TORCH_CUDA_ARCH_LIST
}
|
||||
|
||||
function install_torchaudio() {
|
||||
local commit
|
||||
commit=$(get_pinned_commit audio)
|
||||
|
||||
@ -35,10 +35,11 @@ fi
|
||||
|
||||
print_cmake_info
|
||||
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
|
||||
USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
|
||||
# Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
|
||||
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
|
||||
else
|
||||
# NB: we always build with distributed; USE_DISTRIBUTED=0 turns off all
|
||||
# backends (specifically the gloo backend), so test that this case works too
|
||||
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
|
||||
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
|
||||
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
|
||||
fi
|
||||
if which sccache > /dev/null; then
|
||||
|
||||
@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
|
||||
fi
|
||||
popd
|
||||
|
||||
python -mpip install -r requirements.txt
|
||||
|
||||
# enable debug asserts in serialization
|
||||
export TORCH_SERIALIZATION_DEBUG=1
|
||||
|
||||
python -mpip install --no-input -r requirements.txt
|
||||
|
||||
setup_test_python() {
|
||||
# The CircleCI worker hostname doesn't resolve to an address.
|
||||
# This environment variable makes ProcessGroupGloo default to
|
||||
@ -178,15 +174,10 @@ checkout_install_torchbench() {
|
||||
# to install and test other models
|
||||
python install.py --continue_on_fail
|
||||
fi
|
||||
popd
|
||||
|
||||
pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt
|
||||
# https://github.com/pytorch/pytorch/issues/160689 to remove torchao because
|
||||
# its current version 0.12.0 doesn't work with transformers 4.54.0
|
||||
pip uninstall -y torchao
|
||||
|
||||
echo "Print all dependencies after TorchBench is installed"
|
||||
python -mpip freeze
|
||||
popd
|
||||
}
|
||||
|
||||
torchbench_setup_macos() {
|
||||
@ -306,47 +297,6 @@ test_torchbench_smoketest() {
|
||||
fi
|
||||
|
||||
done
|
||||
echo "Pytorch benchmark on mps device completed"
|
||||
}
|
||||
|
||||
# Run the AOTInductor inference smoke benchmarks (torchbench + HuggingFace)
# on the mps device.
#   $1 - index into the dtype table (undefined float16 bfloat16 notset);
#        "notset" is run with the --float32 flag.
# Each benchmark is run once with --performance and once with --accuracy.
# Individual benchmark failures are ignored (|| true). CSV results are written
# under test/test-reports.
test_aoti_torchbench_smoketest() {
  print_cmake_info

  echo "Launching AOTInductor torchbench setup"
  pip_benchmark_deps
  # shellcheck disable=SC2119,SC2120
  torchbench_setup_macos

  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

  local device=mps
  local dtypes=(undefined float16 bfloat16 notset)
  local dtype=${dtypes[$1]}
  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
  local mode

  echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"

  # Translate the dtype name into the benchmark CLI flag.
  local dtype_arg
  case "$dtype" in
    notset) dtype_arg="--float32" ;;
    *) dtype_arg="--${dtype}" ;;
  esac

  touch "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv"
  for model in "${models[@]}"; do
    # Performance first, then accuracy, for every model.
    for mode in performance accuracy; do
      PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
        "--${mode}" --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
        --output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_${mode}.csv" || true
    done
  done

  echo "Launching HuggingFace inference performance run for AOT Inductor and dtype ${dtype}"
  for mode in performance accuracy; do
    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
      "--${mode}" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
      --output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_${mode}.csv" || true
  done

  echo "Pytorch benchmark on mps device completed"
}
|
||||
@ -395,8 +345,6 @@ elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then
|
||||
test_timm_perf
|
||||
elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then
|
||||
test_torchbench_smoketest "${SHARD_NUMBER}"
|
||||
elif [[ $TEST_CONFIG == *"aot_inductor_perf_smoketest"* ]]; then
|
||||
test_aoti_torchbench_smoketest "${SHARD_NUMBER}"
|
||||
elif [[ $TEST_CONFIG == *"mps"* ]]; then
|
||||
test_python_mps
|
||||
elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then
|
||||
|
||||
@ -45,7 +45,6 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
|
||||
# DTensor tests
|
||||
time python test/run_test.py --verbose -i distributed/tensor/test_random_ops
|
||||
time python test/run_test.py --verbose -i distributed/tensor/test_dtensor_compile
|
||||
time python test/run_test.py --verbose -i distributed/tensor/test_utils.py
|
||||
|
||||
# DeviceMesh test
|
||||
time python test/run_test.py --verbose -i distributed/test_device_mesh
|
||||
|
||||
@ -91,7 +91,6 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
||||
export VALGRIND=OFF
|
||||
fi
|
||||
|
||||
detect_cuda_arch
|
||||
|
||||
if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then
|
||||
# There are additional warnings on s390x, maybe due to newer gcc.
|
||||
@ -496,14 +495,6 @@ test_inductor_cpp_wrapper_shard() {
|
||||
-k 'take' \
|
||||
--shard "$1" "$NUM_TEST_SHARDS" \
|
||||
--verbose
|
||||
|
||||
if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
|
||||
python test/run_test.py \
|
||||
--include inductor/test_mkldnn_pattern_matcher \
|
||||
-k 'xpu' \
|
||||
--shard "$1" "$NUM_TEST_SHARDS" \
|
||||
--verbose
|
||||
fi
|
||||
}
|
||||
|
||||
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
|
||||
@ -1060,10 +1051,20 @@ test_libtorch_api() {
|
||||
mkdir -p $TEST_REPORTS_DIR
|
||||
|
||||
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
|
||||
"$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
|
||||
else
|
||||
# Exclude IMethodTest that relies on torch::deploy, which will instead be run in test_deploy
|
||||
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
|
||||
|
||||
# On s390x, pytorch is built without llvm.
|
||||
# Even if it were built with llvm, llvm currently doesn't support the features used on s390x, and the
|
||||
# test fails with errors like:
|
||||
# JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
|
||||
# unknown file: Failure
|
||||
# C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
|
||||
if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
|
||||
python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
|
||||
fi
|
||||
fi
|
||||
|
||||
# quantization is not fully supported on s390x yet
|
||||
@ -1638,10 +1639,6 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then
|
||||
install_torchvision
|
||||
build_xla
|
||||
test_xla
|
||||
elif [[ "$TEST_CONFIG" == *vllm* ]]; then
|
||||
echo "vLLM CI uses TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST"
|
||||
(cd .ci/lumen_cli && python -m pip install -e .)
|
||||
python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS"
|
||||
elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
|
||||
test_executorch
|
||||
elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
|
||||
@ -1695,6 +1692,7 @@ elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
|
||||
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
|
||||
install_torchaudio
|
||||
install_torchvision
|
||||
install_torchao
|
||||
id=$((SHARD_NUMBER-1))
|
||||
# https://github.com/opencv/opencv-python/issues/885
|
||||
pip_install opencv-python==4.8.0.74
|
||||
|
||||
@ -61,10 +61,9 @@ if "%USE_XPU%"=="1" (
|
||||
call "C:\Program Files (x86)\Intel\oneAPI\compiler\latest\env\vars.bat"
|
||||
call "C:\Program Files (x86)\Intel\oneAPI\ocloc\latest\env\vars.bat"
|
||||
if errorlevel 1 exit /b 1
|
||||
:: Reduce build time
|
||||
SET TORCH_XPU_ARCH_LIST=bmg
|
||||
:: Re-setup python env for build
|
||||
call pip install -r requirements.txt
|
||||
:: Reduce build time. Only have MTL self-hosted runner now
|
||||
SET TORCH_XPU_ARCH_LIST=xe-lpg
|
||||
SET USE_KINETO=0
|
||||
)
|
||||
|
||||
@echo on
|
||||
|
||||
@ -44,7 +44,7 @@ python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==
|
||||
python -m pip install z3-solver==4.15.1.0
|
||||
|
||||
# Install tlparse for test\dynamo\test_structured_trace.py UTs.
|
||||
python -m pip install tlparse==0.4.0
|
||||
python -m pip install tlparse==0.3.30
|
||||
|
||||
# Install parameterized
|
||||
python -m pip install parameterized==0.8.1
|
||||
|
||||
@ -37,7 +37,7 @@ IF "%CUDA_PATH_V126%"=="" (
|
||||
)
|
||||
|
||||
IF "%BUILD_VISION%" == "" (
|
||||
set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0
|
||||
set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0
|
||||
set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
|
||||
) ELSE (
|
||||
set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90
|
||||
|
||||
@ -1,59 +0,0 @@
|
||||
@echo off
|
||||
|
||||
set MODULE_NAME=pytorch
|
||||
|
||||
IF NOT EXIST "setup.py" IF NOT EXIST "%MODULE_NAME%" (
|
||||
call internal\clone.bat
|
||||
cd %~dp0
|
||||
) ELSE (
|
||||
call internal\clean.bat
|
||||
)
|
||||
IF ERRORLEVEL 1 goto :eof
|
||||
|
||||
call internal\check_deps.bat
|
||||
IF ERRORLEVEL 1 goto :eof
|
||||
|
||||
REM Check for optional components
|
||||
|
||||
set USE_CUDA=
|
||||
set CMAKE_GENERATOR=Visual Studio 15 2017 Win64
|
||||
|
||||
IF "%NVTOOLSEXT_PATH%"=="" (
|
||||
IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib" (
|
||||
set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
|
||||
) ELSE (
|
||||
echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing
|
||||
exit /b 1
|
||||
)
|
||||
)
|
||||
|
||||
IF "%CUDA_PATH_V130%"=="" (
|
||||
IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\nvcc.exe" (
|
||||
set "CUDA_PATH_V130=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0"
|
||||
) ELSE (
|
||||
echo CUDA 13.0 not found, failing
|
||||
exit /b 1
|
||||
)
|
||||
)
|
||||
|
||||
IF "%BUILD_VISION%" == "" (
|
||||
set TORCH_CUDA_ARCH_LIST=7.5;8.0;8.6;9.0;10.0;12.0
|
||||
set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
|
||||
) ELSE (
|
||||
set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
|
||||
)
|
||||
|
||||
set "CUDA_PATH=%CUDA_PATH_V130%"
|
||||
set "PATH=%CUDA_PATH_V130%\bin;%PATH%"
|
||||
|
||||
:optcheck
|
||||
|
||||
call internal\check_opts.bat
|
||||
IF ERRORLEVEL 1 goto :eof
|
||||
|
||||
if exist "%NIGHTLIES_PYTORCH_ROOT%" cd %NIGHTLIES_PYTORCH_ROOT%\..
|
||||
call %~dp0\internal\copy.bat
|
||||
IF ERRORLEVEL 1 goto :eof
|
||||
|
||||
call %~dp0\internal\setup.bat
|
||||
IF ERRORLEVEL 1 goto :eof
|
||||
@ -26,7 +26,6 @@ if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%
|
||||
if %CUDA_VER% EQU 126 goto cuda126
|
||||
if %CUDA_VER% EQU 128 goto cuda128
|
||||
if %CUDA_VER% EQU 129 goto cuda129
|
||||
if %CUDA_VER% EQU 130 goto cuda130
|
||||
|
||||
echo CUDA %CUDA_VERSION_STR% is not supported
|
||||
exit /b 1
|
||||
@ -114,33 +113,6 @@ xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"
|
||||
|
||||
goto cuda_common
|
||||
|
||||
:cuda130
|
||||
|
||||
set CUDA_INSTALL_EXE=cuda_13.0.0_windows.exe
|
||||
if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
|
||||
curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore
|
||||
if errorlevel 1 exit /b 1
|
||||
set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
|
||||
set "ARGS="
|
||||
)
|
||||
|
||||
set CUDNN_FOLDER=cudnn-windows-x86_64-9.12.0.46_cuda13-archive
|
||||
set CUDNN_LIB_FOLDER="lib"
|
||||
set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
|
||||
if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
|
||||
curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore
|
||||
if errorlevel 1 exit /b 1
|
||||
set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
|
||||
)
|
||||
|
||||
@REM cuDNN 8.3+ requires zlib to be installed on the path
|
||||
echo Installing ZLIB dlls
|
||||
curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
|
||||
7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib"
|
||||
xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"
|
||||
|
||||
goto cuda_common
|
||||
|
||||
:cuda_common
|
||||
:: NOTE: We only install CUDA if we don't have it installed already.
|
||||
:: With GHA runners these should be pre-installed as part of our AMI process
|
||||
|
||||
@ -1,22 +1,12 @@
|
||||
set ADDITIONAL_OPTIONS=""
|
||||
set PYTHON_EXEC="python"
|
||||
|
||||
|
||||
if "%DESIRED_PYTHON%" == "3.13t" (
|
||||
echo Python version is set to 3.13t
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
|
||||
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
|
||||
set PYTHON_EXEC="python3.13t"
|
||||
) else if "%DESIRED_PYTHON%"=="3.14" (
|
||||
echo Python version is set to 3.14 or 3.14t
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
|
||||
) else if "%DESIRED_PYTHON%"=="3.14t" (
|
||||
echo Python version is set to 3.14 or 3.14t
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
|
||||
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
|
||||
set PYTHON_EXEC="python3.14t"
|
||||
) else (
|
||||
echo Python version is set to %DESIRED_PYTHON%
|
||||
echo DESIRED_PYTHON not defined, Python version is set to %DESIRED_PYTHON%
|
||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/%DESIRED_PYTHON%.0/python-%DESIRED_PYTHON%.0-amd64.exe" %= @lint-ignore =%
|
||||
)
|
||||
|
||||
|
||||
@ -13,9 +13,9 @@ if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"
|
||||
:xpu_bundle_install_start
|
||||
|
||||
set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI
|
||||
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
|
||||
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe
|
||||
set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product
|
||||
set XPU_BUNDLE_VERSION=2025.1.3+5
|
||||
set XPU_BUNDLE_VERSION=2025.0.1+20
|
||||
set XPU_BUNDLE_INSTALLED=0
|
||||
set XPU_BUNDLE_UNINSTALL=0
|
||||
set XPU_EXTRA_URL=NULL
|
||||
@ -24,9 +24,9 @@ set XPU_EXTRA_VERSION=2025.0.1+1226
|
||||
set XPU_EXTRA_INSTALLED=0
|
||||
set XPU_EXTRA_UNINSTALL=0
|
||||
|
||||
if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] (
|
||||
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
|
||||
set XPU_BUNDLE_VERSION=2025.2.1+20
|
||||
if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] (
|
||||
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
|
||||
set XPU_BUNDLE_VERSION=2025.1.3+5
|
||||
)
|
||||
|
||||
:: Check if XPU bundle is target version or already installed
|
||||
@ -90,3 +90,14 @@ if errorlevel 1 exit /b 1
|
||||
del xpu_extra.exe
|
||||
|
||||
:xpu_install_end
|
||||
|
||||
if not "%XPU_ENABLE_KINETO%"=="1" goto install_end
|
||||
:: Install Level Zero SDK
|
||||
set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip
|
||||
curl -k -L %XPU_EXTRA_LZ_URL% --output "%SRC_DIR%\temp_build\level_zero_sdk.zip"
|
||||
echo "Installing level zero SDK..."
|
||||
7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero"
|
||||
set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%"
|
||||
del "%SRC_DIR%\temp_build\level_zero_sdk.zip"
|
||||
|
||||
:install_end
|
||||
|
||||
@ -7,8 +7,6 @@ call "internal\install_python.bat"
|
||||
|
||||
%PYTHON_EXEC% --version
|
||||
set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%"
|
||||
if "%DESIRED_PYTHON%" == "3.14t" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake
|
||||
if "%DESIRED_PYTHON%" == "3.14" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake
|
||||
if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake
|
||||
if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake
|
||||
if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake
|
||||
|
||||
@ -128,35 +128,16 @@ export MACOSX_DEPLOYMENT_TARGET=10.15
|
||||
export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
|
||||
|
||||
SETUPTOOLS_PINNED_VERSION="==70.1.0"
|
||||
PYYAML_PINNED_VERSION="==5.3"
|
||||
PYYAML_PINNED_VERSION="=5.3"
|
||||
EXTRA_CONDA_INSTALL_FLAGS=""
|
||||
CONDA_ENV_CREATE_FLAGS=""
|
||||
RENAME_WHEEL=true
|
||||
case $desired_python in
|
||||
3.14t)
|
||||
echo "Using 3.14 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
PYYAML_PINNED_VERSION=">=6.0.1"
|
||||
NUMPY_PINNED_VERSION="==2.1.0"
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
RENAME_WHEEL=false
|
||||
;;
|
||||
3.14)
|
||||
echo "Using 3.14t deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
PYYAML_PINNED_VERSION=">=6.0.1"
|
||||
NUMPY_PINNED_VERSION="==2.1.0"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
|
||||
desired_python="3.14.0rc1"
|
||||
RENAME_WHEEL=false
|
||||
;;
|
||||
3.13t)
|
||||
echo "Using 3.13 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
PYYAML_PINNED_VERSION=">=6.0.1"
|
||||
NUMPY_PINNED_VERSION="==2.1.0"
|
||||
NUMPY_PINNED_VERSION="=2.1.0"
|
||||
CONDA_ENV_CREATE_FLAGS="python-freethreading"
|
||||
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
|
||||
desired_python="3.13"
|
||||
@ -166,35 +147,35 @@ case $desired_python in
|
||||
echo "Using 3.13 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
PYYAML_PINNED_VERSION=">=6.0.1"
|
||||
NUMPY_PINNED_VERSION="==2.1.0"
|
||||
NUMPY_PINNED_VERSION="=2.1.0"
|
||||
;;
|
||||
3.12)
|
||||
echo "Using 3.12 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
PYYAML_PINNED_VERSION=">=6.0.1"
|
||||
NUMPY_PINNED_VERSION="==2.0.2"
|
||||
NUMPY_PINNED_VERSION="=2.0.2"
|
||||
;;
|
||||
3.11)
|
||||
echo "Using 3.11 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
PYYAML_PINNED_VERSION=">=5.3"
|
||||
NUMPY_PINNED_VERSION="==2.0.2"
|
||||
NUMPY_PINNED_VERSION="=2.0.2"
|
||||
;;
|
||||
3.10)
|
||||
echo "Using 3.10 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
PYYAML_PINNED_VERSION=">=5.3"
|
||||
NUMPY_PINNED_VERSION="==2.0.2"
|
||||
NUMPY_PINNED_VERSION="=2.0.2"
|
||||
;;
|
||||
3.9)
|
||||
echo "Using 3.9 deps"
|
||||
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
|
||||
PYYAML_PINNED_VERSION=">=5.3"
|
||||
NUMPY_PINNED_VERSION="==2.0.2"
|
||||
NUMPY_PINNED_VERSION="=2.0.2"
|
||||
;;
|
||||
*)
|
||||
echo "Using default deps"
|
||||
NUMPY_PINNED_VERSION="==1.11.3"
|
||||
NUMPY_PINNED_VERSION="=1.11.3"
|
||||
;;
|
||||
esac
|
||||
|
||||
@ -203,18 +184,12 @@ tmp_env_name="wheel_py$python_nodot"
|
||||
conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS}
|
||||
source activate "$tmp_env_name"
|
||||
|
||||
PINNED_PACKAGES=(
|
||||
"setuptools${SETUPTOOLS_PINNED_VERSION}"
|
||||
"pyyaml${PYYAML_PINNED_VERSION}"
|
||||
"numpy${NUMPY_PINNED_VERSION}"
|
||||
)
|
||||
retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt"
|
||||
pip install requests ninja typing-extensions
|
||||
retry pip install -r "${pytorch_rootdir}/requirements-build.txt"
|
||||
pip install "numpy=${NUMPY_PINNED_VERSION}" "pyyaml${PYYAML_PINNED_VERSION}" requests ninja "setuptools${SETUPTOOLS_PINNED_VERSION}" typing-extensions
|
||||
retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
|
||||
retry brew install libomp
|
||||
|
||||
# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
|
||||
# is built as part of tensorpipe submodule
|
||||
# For USE_DISTRIBUTED=1 on macOS, need libuv, which is built as part of tensorpipe submodule
|
||||
export USE_DISTRIBUTED=1
|
||||
|
||||
export USE_MKLDNN=OFF
|
||||
|
||||
@ -75,8 +75,8 @@ TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
|
||||
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for all the wheel builds, hence append TRITON_CONSTRAINT
|
||||
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
|
||||
|
||||
# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
|
||||
if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
|
||||
# CUDA 12.9 builds have triton for Linux and Linux aarch64 binaries.
|
||||
if [[ "$DESIRED_CUDA" == "cu129" ]]; then
|
||||
TRITON_CONSTRAINT="platform_system == 'Linux'"
|
||||
fi
|
||||
|
||||
|
||||
@ -51,12 +51,16 @@ s3_upload() {
|
||||
s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/"
|
||||
fi
|
||||
(
|
||||
cache_control_flag=""
|
||||
if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then
|
||||
cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'"
|
||||
fi
|
||||
for pkg in ${PKG_DIR}/*.${extension}; do
|
||||
(
|
||||
set -x
|
||||
shm_id=$(sha256sum "${pkg}" | awk '{print $1}')
|
||||
${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \
|
||||
--metadata "checksum-sha256=${shm_id}"
|
||||
--metadata "checksum-sha256=${shm_id}" ${cache_control_flag}
|
||||
)
|
||||
done
|
||||
)
|
||||
|
||||
@ -15,7 +15,8 @@ fi
|
||||
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
|
||||
export VC_YEAR=2022
|
||||
export USE_SCCACHE=0
|
||||
export XPU_VERSION=2025.2
|
||||
export XPU_VERSION=2025.1
|
||||
export XPU_ENABLE_KINETO=1
|
||||
fi
|
||||
|
||||
echo "Free space on filesystem before build:"
|
||||
|
||||
@ -8,7 +8,7 @@ export VC_YEAR=2022
|
||||
|
||||
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
|
||||
export VC_YEAR=2022
|
||||
export XPU_VERSION=2025.2
|
||||
export XPU_VERSION=2025.1
|
||||
fi
|
||||
|
||||
pushd "$PYTORCH_ROOT/.ci/pytorch/"
|
||||
|
||||
1
.flake8
1
.flake8
@ -48,7 +48,6 @@ per-file-ignores =
|
||||
torch/__init__.py: F401,TOR901
|
||||
torch/_custom_op/impl.py: TOR901
|
||||
torch/_export/serde/upgrade.py: TOR901
|
||||
torch/_functorch/predispatch.py: TOR901
|
||||
torch/_functorch/vmap.py: TOR901
|
||||
torch/_inductor/test_operators.py: TOR901
|
||||
torch/_library/abstract_impl.py: TOR901
|
||||
|
||||
1
.github/actionlint.yaml
vendored
1
.github/actionlint.yaml
vendored
@ -54,7 +54,6 @@ self-hosted-runner:
|
||||
- linux.rocm.gpu.2
|
||||
- linux.rocm.gpu.4
|
||||
# gfx942 runners
|
||||
- linux.rocm.gpu.gfx942.1
|
||||
- linux.rocm.gpu.gfx942.2
|
||||
- linux.rocm.gpu.gfx942.4
|
||||
- rocm-docker
|
||||
|
||||
@ -1,81 +0,0 @@
|
||||
# .github/workflows/build-external.yml
|
||||
name: Build External packages
|
||||
|
||||
description: build external packages for PyTorch
|
||||
|
||||
inputs:
|
||||
cuda-arch-list:
|
||||
description: TORCH_CUDA_ARCH_LIST (e.g., "8.0;8.9;9.0")
|
||||
type: string
|
||||
required: true
|
||||
default: ""
|
||||
docker-image:
|
||||
description: Base image to use
|
||||
type: string
|
||||
required: true
|
||||
build-targets:
|
||||
description: Build targets
|
||||
type: string
|
||||
required: true
|
||||
torch-wheel-dir:
|
||||
description: Directory to built torch wheel
|
||||
type: string
|
||||
required: false
|
||||
default: dist
|
||||
output-dir:
|
||||
description: Directory to store build artifact
|
||||
default: external
|
||||
type: string
|
||||
required: false
|
||||
|
||||
outputs:
|
||||
build_time:
|
||||
description: "Total build time in seconds"
|
||||
value: ${{ steps.build-external.outputs.build_time }}
|
||||
output_dir:
|
||||
description: "Directory where build artifact is stored"
|
||||
value: ${{ steps.build-external.outputs.output_dir }}
|
||||
|
||||
runs:
|
||||
using: composite
|
||||
steps:
|
||||
- name: Build external packages in sequence
|
||||
id: build-external
|
||||
env:
|
||||
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
|
||||
SCCACHE_REGION: us-east-1
|
||||
TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
|
||||
BASE_IMAGE: ${{ inputs.docker-image }}
|
||||
BUILD_TARGETS: ${{ inputs.build-targets }}
|
||||
PARENT_OUTPUT_DIR: ${{ inputs.output-dir}}
|
||||
|
||||
shell: bash
|
||||
run: |
|
||||
set -euo pipefail
|
||||
python3 --version
|
||||
docker images
|
||||
START_TIME=$(date +%s)
|
||||
(
|
||||
cd .ci/lumen_cli
|
||||
python3 -m pip install -e .
|
||||
)
|
||||
MAX_JOBS="$(nproc --ignore=6)"
|
||||
export MAX_JOBS
|
||||
|
||||
# Split the comma-separated list and build each target
|
||||
IFS=',' read -ra TARGETS <<< "$BUILD_TARGETS"
|
||||
for target in "${TARGETS[@]}"; do
|
||||
OUTPUT_DIR="$PARENT_OUTPUT_DIR/$target"
|
||||
export OUTPUT_DIR
|
||||
echo "Building external package: $target in directory $OUTPUT_DIR"
|
||||
python3 -m cli.run build external "$target"
|
||||
|
||||
done
|
||||
|
||||
END_TIME=$(date +%s)
|
||||
{
|
||||
echo "build_time=$((END_TIME - START_TIME))"
|
||||
if [ -d "$PARENT_OUTPUT_DIR" ]; then
|
||||
echo "output_dir=$PARENT_OUTPUT_DIR"
|
||||
fi
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
15
.github/actions/checkout-pytorch/action.yml
vendored
15
.github/actions/checkout-pytorch/action.yml
vendored
@ -57,21 +57,6 @@ runs:
|
||||
submodules: ${{ inputs.submodules }}
|
||||
show-progress: false
|
||||
|
||||
- name: Clean submodules post checkout
|
||||
id: clean-submodules
|
||||
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
|
||||
shell: bash
|
||||
env:
|
||||
NO_SUDO: ${{ inputs.no-sudo }}
|
||||
run: |
|
||||
cd "${GITHUB_WORKSPACE}"
|
||||
# Clean stale submodule dirs
|
||||
if [ -z "${NO_SUDO}" ]; then
|
||||
sudo git submodule foreach --recursive git clean -ffdx
|
||||
else
|
||||
git submodule foreach --recursive git clean -ffdx
|
||||
fi
|
||||
|
||||
- name: Clean workspace (try again)
|
||||
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' &&
|
||||
(steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }}
|
||||
|
||||
5
.github/actions/setup-rocm/action.yml
vendored
5
.github/actions/setup-rocm/action.yml
vendored
@ -59,6 +59,11 @@ runs:
|
||||
echo "$msg"
|
||||
exit 1
|
||||
fi
|
||||
if [[ $ngpu -eq 1 ]]; then
|
||||
echo "Error: only 1 GPU detected, at least 2 GPUs are needed for distributed jobs"
|
||||
echo "$msg"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Runner diskspace health check
|
||||
uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
|
||||
|
||||
2
.github/ci_commit_pins/audio.txt
vendored
2
.github/ci_commit_pins/audio.txt
vendored
@ -1 +1 @@
|
||||
0757bbb660855272f7dd8d31cc84e7c631522805
|
||||
0c22347335f4c9a5b92a2f5bad65e05e2464c184
|
||||
|
||||
2
.github/ci_commit_pins/vllm.txt
vendored
2
.github/ci_commit_pins/vllm.txt
vendored
@ -1 +1 @@
|
||||
862f2ef893d9751db0a92bd2d4ae0e3d9677872f
|
||||
7e3a8dc90670fd312ce1e0d4eba9bf11c571e3ad
|
||||
|
||||
2
.github/ci_commit_pins/xla.txt
vendored
2
.github/ci_commit_pins/xla.txt
vendored
@ -1 +1 @@
|
||||
763e5b78d4fcd74a9e812256656c075f99d9a781
|
||||
b6a5b82b9948b610fa4c304d0d869c82b8f17db1
|
||||
|
||||
439
.github/ci_configs/vllm/Dockerfile.tmp_vllm
vendored
439
.github/ci_configs/vllm/Dockerfile.tmp_vllm
vendored
@ -1,439 +0,0 @@
|
||||
# TODO(elainwy): remove this file after the torch nightly dockerfile is in sync in vllm repo
|
||||
# The vLLM Dockerfile is used to construct vLLM image against torch nightly and torch main that can be directly used for testing
|
||||
|
||||
ARG CUDA_VERSION=12.8.1
|
||||
ARG PYTHON_VERSION=3.12
|
||||
|
||||
# BUILD_BASE_IMAGE: used to set up Python and build the xformers and vllm wheels. It can be replaced with a different base image from the local machine,
|
||||
# by default, it uses the torch-nightly-base stage from this docker image
|
||||
ARG BUILD_BASE_IMAGE=torch-nightly-base
|
||||
|
||||
# FINAL_BASE_IMAGE: used to set up the vllm-installed environment and build flashinfer,
|
||||
# by default, it uses devel-ubuntu22.04 official image.
|
||||
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
||||
|
||||
|
||||
#################### TORCH NIGHTLY BASE IMAGE ####################
|
||||
# A base image for building vLLM with devel ubuntu 22.04, this is mainly used to build vllm in the vLLM Buildkite CI
|
||||
From nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base
|
||||
ARG CUDA_VERSION=12.8.1
|
||||
ARG PYTHON_VERSION=3.12
|
||||
ARG TARGETPLATFORM
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
|
||||
|
||||
# Install Python and other dependencies if they are not already present
|
||||
RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \
|
||||
echo "Installing Python ${PYTHON_VERSION}..." && \
|
||||
echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \
|
||||
echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \
|
||||
apt-get update -y && \
|
||||
apt-get install -y ccache software-properties-common git curl sudo && \
|
||||
for i in 1 2 3; do \
|
||||
add-apt-repository -y ppa:deadsnakes/ppa && break || \
|
||||
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
|
||||
done && \
|
||||
apt-get update -y && \
|
||||
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \
|
||||
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \
|
||||
update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \
|
||||
ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \
|
||||
curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \
|
||||
else \
|
||||
echo "Python ${PYTHON_VERSION} already present, skipping setup."; \
|
||||
fi \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
|
||||
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
|
||||
# as it was causing spam when compiling the CUTLASS kernels
|
||||
# Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519)
|
||||
RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) && \
|
||||
if [ "$current_gcc_version" -lt 10 ]; then \
|
||||
echo "GCC version is $current_gcc_version, installing gcc-10..."; \
|
||||
apt-get update && \
|
||||
apt-get install -y gcc-10 g++-10 && \
|
||||
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 && \
|
||||
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \
|
||||
else \
|
||||
echo "GCC version is $current_gcc_version, no need to install gcc-10."; \
|
||||
fi && \
|
||||
gcc --version && g++ --version
|
||||
|
||||
# install uv for faster pip installs
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
python3 -m pip install uv==0.8.4
|
||||
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
#################### TORCH NIGHTLY BASE IMAGE ####################
|
||||
|
||||
|
||||
#################### BASE BUILD IMAGE ####################
|
||||
# A base image for building vLLM with torch nightly or torch wheels
|
||||
# prepare basic build environment
|
||||
FROM ${BUILD_BASE_IMAGE} AS base
|
||||
USER root
|
||||
|
||||
# Workaround for https://github.com/openai/triton/issues/2507 and
|
||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||
# this won't be needed for future versions of this docker image
|
||||
# or future versions of triton.
|
||||
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
|
||||
# Install uv for faster pip installs if it is not already installed
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if ! python3 -m uv --version >/dev/null 2>&1; then \
|
||||
python3 -m pip install uv==0.8.4; \
|
||||
fi
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
# install build and runtime dependencies
|
||||
COPY requirements/common.txt requirements/common.txt
|
||||
COPY use_existing_torch.py use_existing_torch.py
|
||||
COPY pyproject.toml pyproject.toml
|
||||
|
||||
# install build and runtime dependencies without stable torch version
|
||||
RUN python3 use_existing_torch.py
|
||||
|
||||
# default mount file as placeholder, this just avoids the mount error
|
||||
# change to a different vllm folder if this does not exist anymore
|
||||
ARG TORCH_WHEELS_PATH="./requirements"
|
||||
ARG PINNED_TORCH_VERSION
|
||||
|
||||
# Install torch, torchaudio and torchvision based on the input
|
||||
# if TORCH_WHEELS_PATH is default "./requirements", it will pull the nightly versions using pip
|
||||
# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine
|
||||
RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
|
||||
echo "[INFO] Installing torch wheels to build vllm"; \
|
||||
torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \
|
||||
vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \
|
||||
audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \
|
||||
uv pip install --system "${torch_whl}[opt-einsum]"; \
|
||||
uv pip install --system "${vision_whl}"; \
|
||||
uv pip install --system "${audio_whl}"; \
|
||||
elif [ -n "$PINNED_TORCH_VERSION" ]; then \
|
||||
echo "[INFO] Installing pinned torch nightly version to build vllm: $PINNED_TORCH_VERSION"; \
|
||||
uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu128; \
|
||||
else \
|
||||
echo "[INFO] Installing torch nightly with latest one to build vllm"; \
|
||||
uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128; \
|
||||
fi
|
||||
|
||||
# Install numba 0.61.2 for cuda environment
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system numba==0.61.2
|
||||
|
||||
# Install common dependencies from vllm common.txt
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/common.txt
|
||||
|
||||
|
||||
# Must be set before installing xformers, so it can install the correct version of xformers.
|
||||
ARG exformer_cuda_arch_list='7.5;8.0+PTX;9.0a'
|
||||
ENV TORCH_CUDA_ARCH_LIST=${exformer_cuda_arch_list}
|
||||
|
||||
ARG max_jobs=16
|
||||
ENV MAX_JOBS=${max_jobs}
|
||||
|
||||
RUN echo ${TORCH_CUDA_ARCH_LIST}
|
||||
RUN echo ${MAX_JOBS}
|
||||
RUN pip freeze | grep -E 'ninja'
|
||||
|
||||
# Build xformers with cuda and torch nightly/wheel
|
||||
# following official xformers guidance: https://github.com/facebookresearch/xformers#build
|
||||
# sha for https://github.com/facebookresearch/xformers/tree/v0.0.31
|
||||
ARG XFORMERS_COMMIT=eb0946a363464da96ea40afd1a7f72a907c25497
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
echo 'git clone xformers...' \
|
||||
&& git clone https://github.com/facebookresearch/xformers.git --recursive \
|
||||
&& cd xformers \
|
||||
&& git checkout ${XFORMERS_COMMIT} \
|
||||
&& git submodule update --init --recursive \
|
||||
&& echo 'finish git clone xformers...' \
|
||||
&& rm -rf build \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
|
||||
&& cd .. \
|
||||
&& rm -rf xformers
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system xformers-dist/*.whl --verbose
|
||||
|
||||
# Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
|
||||
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
|
||||
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
|
||||
|
||||
RUN cat torch_build_versions.txt
|
||||
RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'
|
||||
|
||||
#################### BASE BUILD IMAGE ####################
|
||||
|
||||
|
||||
#################### WHEEL BUILD IMAGE ####################
|
||||
# Image used to build vllm wheel
|
||||
FROM base AS build
|
||||
ARG TARGETPLATFORM
|
||||
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
COPY . .
|
||||
|
||||
RUN python3 use_existing_torch.py
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/build.txt
|
||||
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
# Max jobs used by Ninja to build extensions
|
||||
ARG max_jobs=16
|
||||
ENV MAX_JOBS=${max_jobs}
|
||||
ARG nvcc_threads=4
|
||||
ENV NVCC_THREADS=$nvcc_threads
|
||||
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
|
||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||
|
||||
ARG USE_SCCACHE
|
||||
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
||||
ARG SCCACHE_REGION_NAME=us-west-2
|
||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||
|
||||
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||
echo "Installing sccache..." \
|
||||
&& curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
|
||||
&& tar -xzf sccache.tar.gz \
|
||||
&& sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
|
||||
&& rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
|
||||
&& export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
|
||||
&& export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
|
||||
&& export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
|
||||
&& export SCCACHE_IDLE_TIMEOUT=0 \
|
||||
&& export CMAKE_BUILD_TYPE=Release \
|
||||
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
|
||||
&& sccache --show-stats \
|
||||
&& python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38 \
|
||||
&& sccache --show-stats; \
|
||||
fi
|
||||
|
||||
ARG vllm_target_device="cuda"
|
||||
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "$USE_SCCACHE" != "1" ]; then \
|
||||
# Clean any existing CMake artifacts
|
||||
rm -rf .deps && \
|
||||
mkdir -p .deps && \
|
||||
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
|
||||
python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \
|
||||
fi
|
||||
|
||||
RUN echo "[DEBUG] Listing current directory:" && \
|
||||
ls -al && \
|
||||
echo "[DEBUG] Showing torch_build_versions.txt content:" && \
|
||||
cat torch_build_versions.txt
|
||||
|
||||
#################### WHEEL BUILD IMAGE ####################
|
||||
|
||||
|
||||
################### VLLM INSTALLED IMAGE ####################
|
||||
# Setup clean environment for vLLM for test and api server using ubuntu22.04 with AOT flashinfer
|
||||
FROM ${FINAL_BASE_IMAGE} AS vllm-base
|
||||
USER root
|
||||
# prepare for environment starts
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
|
||||
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
|
||||
|
||||
# Install Python and other dependencies if they are not already present
|
||||
RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \
|
||||
echo "Installing Python ${PYTHON_VERSION}..." && \
|
||||
echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \
|
||||
echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \
|
||||
apt-get update -y && \
|
||||
apt-get install -y ccache software-properties-common git curl sudo && \
|
||||
for i in 1 2 3; do \
|
||||
add-apt-repository -y ppa:deadsnakes/ppa && break || \
|
||||
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
|
||||
done && \
|
||||
apt-get update -y && \
|
||||
apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \
|
||||
update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \
|
||||
update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \
|
||||
ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \
|
||||
curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \
|
||||
else \
|
||||
echo "Python ${PYTHON_VERSION} already present, skipping setup."; \
|
||||
fi \
|
||||
&& python3 --version && python3 -m pip --version
|
||||
|
||||
|
||||
# Get the torch versions, and whls used in previous stages for consistency
|
||||
COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
|
||||
COPY --from=base /workspace/xformers-dist /wheels/xformers
|
||||
COPY --from=build /workspace/vllm-dist /wheels/vllm
|
||||
RUN echo "[DEBUG] Listing current directory before torch install step:" && \
|
||||
ls -al && \
|
||||
echo "[DEBUG] Showing torch_build_versions.txt content:" && \
|
||||
cat torch_build_versions.txt
|
||||
|
||||
# Workaround for https://github.com/openai/triton/issues/2507 and
|
||||
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
|
||||
# this won't be needed for future versions of this docker image
|
||||
# or future versions of triton.
|
||||
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
|
||||
|
||||
|
||||
# Install uv for faster pip installs if it is not already installed
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
if ! python3 -m uv --version > /dev/null 2>&1; then \
|
||||
python3 -m pip install uv==0.8.4; \
|
||||
fi
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
# Default mount file as placeholder, this just avoids the mount error
|
||||
ARG TORCH_WHEELS_PATH="./requirements"
|
||||
# Install torch, torchaudio and torchvision
|
||||
# if TORCH_WHEELS_PATH is default "./requirements", it will pull the nightly versions using pip using torch_build_versions.txt
|
||||
# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine
|
||||
RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
|
||||
--mount=type=cache,target=/root/.cache/uv \
|
||||
if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
|
||||
torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \
|
||||
vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \
|
||||
audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \
|
||||
echo "[INFO] Use wheels to build : '${torch_whl}' '${audio_whl}' '${vision_whl}'"; \
|
||||
uv pip install --system "${torch_whl}[opt-einsum]"; \
|
||||
uv pip install --system "${vision_whl}"; \
|
||||
uv pip install --system "${audio_whl}"; \
|
||||
else \
|
||||
echo "[INFO] Installing torch versions from torch_build_versions.txt"; \
|
||||
uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128; \
|
||||
fi
|
||||
|
||||
# Install the vllm wheel from previous stage
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system /wheels/vllm/*.whl --verbose
|
||||
|
||||
# Install xformers wheel from previous stage
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system /wheels/xformers/*.whl --verbose
|
||||
|
||||
|
||||
# Build flashinfer from source.
|
||||
ARG torch_cuda_arch_list='8.0;8.9;9.0a'
|
||||
# install package for build flashinfer
|
||||
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
|
||||
|
||||
RUN pip install build==1.3.0
|
||||
RUN pip freeze | grep -E 'setuptools|packaging|build'
|
||||
|
||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||
# Build flashinfer for torch nightly from source around 10 mins
|
||||
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
||||
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
|
||||
ARG FLASHINFER_GIT_REF="v0.2.14.post1"
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
git clone --depth 1 --recursive --shallow-submodules \
|
||||
--branch ${FLASHINFER_GIT_REF} \
|
||||
${FLASHINFER_GIT_REPO} flashinfer \
|
||||
&& echo "Building FlashInfer with AOT for arches: ${torch_cuda_arch_list}" \
|
||||
&& cd flashinfer \
|
||||
&& python3 -m flashinfer.aot \
|
||||
&& python3 -m build --no-isolation --wheel --outdir ../wheels/flashinfer \
|
||||
&& cd .. \
|
||||
&& rm -rf flashinfer
|
||||
|
||||
# install flashinfer python
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system wheels/flashinfer/*.whl --verbose
|
||||
|
||||
# Logging to confirm the torch versions
|
||||
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
|
||||
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm\|^flashinfer' > build_summary.txt
|
||||
################### VLLM INSTALLED IMAGE ####################
|
||||
|
||||
|
||||
#################### UNITTEST IMAGE #############################
|
||||
FROM vllm-base as test
|
||||
|
||||
ENV UV_HTTP_TIMEOUT=500
|
||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||
ENV UV_LINK_MODE=copy
|
||||
|
||||
COPY tests/ tests/
|
||||
COPY examples examples
|
||||
COPY benchmarks benchmarks
|
||||
COPY ./vllm/collect_env.py .
|
||||
COPY requirements/common.txt requirements/common.txt
|
||||
COPY use_existing_torch.py use_existing_torch.py
|
||||
COPY pyproject.toml pyproject.toml
|
||||
# Install build and runtime dependencies without stable torch version
|
||||
COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt
|
||||
|
||||
RUN python3 use_existing_torch.py
|
||||
|
||||
# install packages
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/common.txt
|
||||
# enable fast downloads from hf (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system hf_transfer
|
||||
ENV HF_HUB_ENABLE_HF_TRANSFER 1
|
||||
|
||||
# install development dependencies (for testing)
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -e tests/vllm_test_utils
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system -r requirements/nightly_torch_test.txt
|
||||
|
||||
# Workaround for #17068
|
||||
# pinned commit for v2.2.4
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@95d8aba8a8c75aedcaa6143713b11e745e7cd0d9#egg=mamba-ssm"
|
||||
|
||||
# Logging to confirm the torch versions
|
||||
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
|
||||
|
||||
# Logging to confirm all the packages are installed
|
||||
RUN pip freeze
|
||||
|
||||
#################### UNITTEST IMAGE #############################
|
||||
|
||||
#################### EXPORT STAGE ####################
|
||||
FROM scratch as export-wheels
|
||||
|
||||
# Just copy the wheels we prepared in previous stages
|
||||
COPY --from=base /workspace/xformers-dist /wheels/xformers
|
||||
COPY --from=build /workspace/vllm-dist /wheels/vllm
|
||||
COPY --from=vllm-base /workspace/build_summary.txt /wheels/build_summary.txt
|
||||
COPY --from=vllm-base /workspace/wheels/flashinfer /wheels/flashinfer-python
|
||||
24
.github/dependabot.yml
vendored
24
.github/dependabot.yml
vendored
@ -1,24 +0,0 @@
|
||||
version: 2
|
||||
updates:
|
||||
# Update to the latest transformers version with dependabot
|
||||
- package-ecosystem: "pip"
|
||||
directory: "/.ci/docker/ci_commit_pins"
|
||||
schedule:
|
||||
interval: "daily"
|
||||
target-branch: "main"
|
||||
allow:
|
||||
- dependency-name: "transformers"
|
||||
ignore:
|
||||
- dependency-name: "*"
|
||||
update-types: ["version-update:semver-patch"]
|
||||
commit-message:
|
||||
prefix: "[Dependabot] Update"
|
||||
include: "scope"
|
||||
labels:
|
||||
- "dependencies"
|
||||
- "open source"
|
||||
- "python"
|
||||
- "topic: not user facing"
|
||||
- "module: ci"
|
||||
- "module: inductor"
|
||||
- "ciflow/inductor"
|
||||
2
.github/pytorch-probot.yml
vendored
2
.github/pytorch-probot.yml
vendored
@ -22,12 +22,10 @@ ciflow_push_tags:
|
||||
- ciflow/rocm
|
||||
- ciflow/rocm-mi300
|
||||
- ciflow/s390
|
||||
- ciflow/riscv64
|
||||
- ciflow/slow
|
||||
- ciflow/trunk
|
||||
- ciflow/unstable
|
||||
- ciflow/xpu
|
||||
- ciflow/vllm
|
||||
- ciflow/torchbench
|
||||
- ciflow/op-benchmark
|
||||
- ciflow/pull
|
||||
|
||||
5
.github/requirements/conda-env-macOS-ARM64
vendored
Normal file
5
.github/requirements/conda-env-macOS-ARM64
vendored
Normal file
@ -0,0 +1,5 @@
|
||||
# Not pinning certifi so that we can always get the latest certificates
|
||||
certifi
|
||||
pip=23.2.1
|
||||
pkg-config=0.29.2
|
||||
wheel=0.37.1
|
||||
@ -28,7 +28,7 @@ pyyaml==6.0.2
|
||||
scipy==1.12.0
|
||||
setuptools==72.1.0
|
||||
sympy==1.13.3
|
||||
tlparse==0.4.0
|
||||
tlparse==0.3.30
|
||||
tensorboard==2.13.0
|
||||
typing-extensions==4.12.2
|
||||
unittest-xml-reporting<=3.2.0,>=2.0.0
|
||||
|
||||
31
.github/scripts/amd/package_triton_wheel.sh
vendored
31
.github/scripts/amd/package_triton_wheel.sh
vendored
@ -1,4 +1,3 @@
|
||||
#!/bin/bash
|
||||
set -ex
|
||||
|
||||
# Set ROCM_HOME isn't available, use ROCM_PATH if set or /opt/rocm
|
||||
@ -51,15 +50,29 @@ do
|
||||
cp $lib $TRITON_ROCM_DIR/lib/
|
||||
done
|
||||
|
||||
# Required ROCm libraries
|
||||
if [[ "${MAJOR_VERSION}" == "6" ]]; then
|
||||
libamdhip="libamdhip64.so.6"
|
||||
else
|
||||
libamdhip="libamdhip64.so.5"
|
||||
fi
|
||||
|
||||
# Required ROCm libraries - ROCm 6.0
|
||||
ROCM_SO=(
|
||||
"libamdhip64.so"
|
||||
"libhsa-runtime64.so"
|
||||
"libdrm.so"
|
||||
"libdrm_amdgpu.so"
|
||||
"libamd_comgr.so"
|
||||
"librocprofiler-register.so"
|
||||
"${libamdhip}"
|
||||
"libhsa-runtime64.so.1"
|
||||
"libdrm.so.2"
|
||||
"libdrm_amdgpu.so.1"
|
||||
)
|
||||
if [[ $ROCM_INT -ge 60400 ]]; then
|
||||
ROCM_SO+=("libamd_comgr.so.3")
|
||||
else
|
||||
ROCM_SO+=("libamd_comgr.so.2")
|
||||
fi
|
||||
|
||||
if [[ $ROCM_INT -ge 60100 ]]; then
|
||||
ROCM_SO+=("librocprofiler-register.so.0")
|
||||
fi
|
||||
|
||||
for lib in "${ROCM_SO[@]}"
|
||||
do
|
||||
@ -81,6 +94,10 @@ do
|
||||
fi
|
||||
|
||||
cp $file_path $TRITON_ROCM_DIR/lib
|
||||
# When running locally, and not building a wheel, we need to satisfy shared objects requests that don't look for versions
|
||||
LINKNAME=$(echo $lib | sed -e 's/\.so.*/.so/g')
|
||||
ln -sf $lib $TRITON_ROCM_DIR/lib/$LINKNAME
|
||||
|
||||
done
|
||||
|
||||
# Copy Include Files
|
||||
|
||||
16
.github/scripts/amd/patch_triton_wheel.sh
vendored
16
.github/scripts/amd/patch_triton_wheel.sh
vendored
@ -19,13 +19,15 @@ replace_needed_sofiles() {
|
||||
find $1 -name '*.so*' -o -name 'ld.lld' | while read sofile; do
|
||||
origname=$2
|
||||
patchedname=$3
|
||||
set +e
|
||||
origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
|
||||
ERRCODE=$?
|
||||
set -e
|
||||
if [ "$ERRCODE" -eq "0" ]; then
|
||||
echo "patching $sofile entry $origname to $patchedname"
|
||||
$PATCHELF_BIN --replace-needed $origname $patchedname $sofile
|
||||
if [[ "$origname" != "$patchedname" ]]; then
|
||||
set +e
|
||||
origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
|
||||
ERRCODE=$?
|
||||
set -e
|
||||
if [ "$ERRCODE" -eq "0" ]; then
|
||||
echo "patching $sofile entry $origname to $patchedname"
|
||||
$PATCHELF_BIN --replace-needed $origname $patchedname $sofile
|
||||
fi
|
||||
fi
|
||||
done
|
||||
}
|
||||
|
||||
107
.github/scripts/generate_binary_build_matrix.py
vendored
107
.github/scripts/generate_binary_build_matrix.py
vendored
@ -16,19 +16,17 @@ from typing import Optional
|
||||
|
||||
|
||||
# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
|
||||
CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
|
||||
CUDA_ARCHES = ["12.6", "12.8", "12.9"]
|
||||
CUDA_STABLE = "12.8"
|
||||
CUDA_ARCHES_FULL_VERSION = {
|
||||
"12.6": "12.6.3",
|
||||
"12.8": "12.8.1",
|
||||
"12.9": "12.9.1",
|
||||
"13.0": "13.0.0",
|
||||
}
|
||||
CUDA_ARCHES_CUDNN_VERSION = {
|
||||
"12.6": "9",
|
||||
"12.8": "9",
|
||||
"12.9": "9",
|
||||
"13.0": "9",
|
||||
}
|
||||
|
||||
# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
|
||||
@ -40,7 +38,7 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]
|
||||
|
||||
CPU_S390X_ARCH = ["cpu-s390x"]
|
||||
|
||||
CUDA_AARCH64_ARCHES = ["12.9-aarch64", "13.0-aarch64"]
|
||||
CUDA_AARCH64_ARCHES = ["12.9-aarch64"]
|
||||
|
||||
|
||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
@ -56,7 +54,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
|
||||
@ -73,7 +71,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
|
||||
@ -90,49 +88,32 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
|
||||
),
|
||||
"13.0": (
|
||||
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
|
||||
),
|
||||
"xpu": (
|
||||
"intel-cmplr-lib-rt==2025.2.1 | "
|
||||
"intel-cmplr-lib-ur==2025.2.1 | "
|
||||
"intel-cmplr-lic-rt==2025.2.1 | "
|
||||
"intel-sycl-rt==2025.2.1 | "
|
||||
"oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"onemkl-sycl-blas==2025.2.0 | "
|
||||
"onemkl-sycl-dft==2025.2.0 | "
|
||||
"onemkl-sycl-lapack==2025.2.0 | "
|
||||
"onemkl-sycl-rng==2025.2.0 | "
|
||||
"onemkl-sycl-sparse==2025.2.0 | "
|
||||
"dpcpp-cpp-rt==2025.2.1 | "
|
||||
"intel-opencl-rt==2025.2.1 | "
|
||||
"mkl==2025.2.0 | "
|
||||
"intel-openmp==2025.2.1 | "
|
||||
"tbb==2022.2.0 | "
|
||||
"tcmlib==1.4.0 | "
|
||||
"umf==0.11.0 | "
|
||||
"intel-pti==0.13.1"
|
||||
"intel-cmplr-lib-rt==2025.1.1 | "
|
||||
"intel-cmplr-lib-ur==2025.1.1 | "
|
||||
"intel-cmplr-lic-rt==2025.1.1 | "
|
||||
"intel-sycl-rt==2025.1.1 | "
|
||||
"oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
|
||||
"onemkl-sycl-blas==2025.1.0 | "
|
||||
"onemkl-sycl-dft==2025.1.0 | "
|
||||
"onemkl-sycl-lapack==2025.1.0 | "
|
||||
"onemkl-sycl-rng==2025.1.0 | "
|
||||
"onemkl-sycl-sparse==2025.1.0 | "
|
||||
"dpcpp-cpp-rt==2025.1.1 | "
|
||||
"intel-opencl-rt==2025.1.1 | "
|
||||
"mkl==2025.1.0 | "
|
||||
"intel-openmp==2025.1.1 | "
|
||||
"tbb==2022.1.0 | "
|
||||
"tcmlib==1.3.0 | "
|
||||
"umf==0.10.0 | "
|
||||
"intel-pti==0.12.3"
|
||||
),
|
||||
}
|
||||
|
||||
@ -143,7 +124,9 @@ def get_nccl_wheel_version(arch_version: str) -> str:
|
||||
requirements = map(
|
||||
str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
|
||||
)
|
||||
return next(x for x in requirements if x.startswith("nvidia-nccl")).split("==")[1]
|
||||
return next(x for x in requirements if x.startswith("nvidia-nccl-cu")).split("==")[
|
||||
1
|
||||
]
|
||||
|
||||
|
||||
def read_nccl_pin(arch_version: str) -> str:
|
||||
@ -210,7 +193,7 @@ LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
|
||||
"cpu": "libtorch-cxx11-builder:cpu",
|
||||
}
|
||||
|
||||
FULL_PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
|
||||
FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
|
||||
|
||||
|
||||
def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
|
||||
@ -240,8 +223,6 @@ def generate_libtorch_matrix(
|
||||
if os == "linux":
|
||||
arches += CUDA_ARCHES
|
||||
arches += ROCM_ARCHES
|
||||
if "13.0" in arches:
|
||||
arches.remove("13.0")
|
||||
elif os == "windows":
|
||||
arches += CUDA_ARCHES
|
||||
if libtorch_variants is None:
|
||||
@ -333,8 +314,8 @@ def generate_wheels_matrix(
|
||||
# TODO: Enable python 3.13t on cpu-s390x
|
||||
if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
|
||||
continue
|
||||
# TODO: Enable python 3.14 for rest
|
||||
if os not in ["linux", "linux-aarch64", "macos-arm64", "windows"] and (
|
||||
# TODO: Enable python 3.14 on non linux OSes
|
||||
if os != "linux" and (
|
||||
python_version == "3.14" or python_version == "3.14t"
|
||||
):
|
||||
continue
|
||||
@ -342,7 +323,7 @@ def generate_wheels_matrix(
|
||||
# cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
|
||||
|
||||
if (
|
||||
arch_version in ["13.0", "12.9", "12.8", "12.6"]
|
||||
arch_version in ["12.9", "12.8", "12.6"]
|
||||
and os == "linux"
|
||||
or arch_version in CUDA_AARCH64_ARCHES
|
||||
):
|
||||
@ -375,6 +356,29 @@ def generate_wheels_matrix(
|
||||
), # include special case for aarch64 build, remove the -aarch64 postfix
|
||||
}
|
||||
)
|
||||
# Special build building to use on Colab. Python 3.11 for 12.6 CUDA
|
||||
if python_version == "3.11" and arch_version == CUDA_STABLE:
|
||||
ret.append(
|
||||
{
|
||||
"python_version": python_version,
|
||||
"gpu_arch_type": gpu_arch_type,
|
||||
"gpu_arch_version": gpu_arch_version,
|
||||
"desired_cuda": translate_desired_cuda(
|
||||
gpu_arch_type, gpu_arch_version
|
||||
),
|
||||
"container_image": WHEEL_CONTAINER_IMAGES[
|
||||
arch_version
|
||||
].split(":")[0],
|
||||
"container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[
|
||||
arch_version
|
||||
].split(":")[1],
|
||||
"package_type": package_type,
|
||||
"pytorch_extra_install_requirements": "",
|
||||
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950
|
||||
".", "_"
|
||||
),
|
||||
}
|
||||
)
|
||||
else:
|
||||
ret.append(
|
||||
{
|
||||
@ -405,7 +409,6 @@ def generate_wheels_matrix(
|
||||
return ret
|
||||
|
||||
|
||||
validate_nccl_dep_consistency("13.0")
|
||||
validate_nccl_dep_consistency("12.9")
|
||||
validate_nccl_dep_consistency("12.8")
|
||||
validate_nccl_dep_consistency("12.6")
|
||||
|
||||
BIN
.github/scripts/gql_mocks.json.gz
vendored
BIN
.github/scripts/gql_mocks.json.gz
vendored
Binary file not shown.
185
.github/scripts/test_trymerge.py
vendored
185
.github/scripts/test_trymerge.py
vendored
@ -27,7 +27,6 @@ from trymerge import (
|
||||
get_drci_classifications,
|
||||
gh_get_team_members,
|
||||
GitHubPR,
|
||||
iter_issue_timeline_until_comment,
|
||||
JobCheckState,
|
||||
main as trymerge_main,
|
||||
MandatoryChecksMissingError,
|
||||
@ -35,8 +34,6 @@ from trymerge import (
|
||||
RE_GHSTACK_DESC,
|
||||
read_merge_rules,
|
||||
remove_job_name_suffix,
|
||||
sha_from_committed_event,
|
||||
sha_from_force_push_after,
|
||||
validate_revert,
|
||||
)
|
||||
|
||||
@ -73,9 +70,6 @@ def mock_query(
|
||||
if key in mocked_queries:
|
||||
return mocked_queries[key]
|
||||
|
||||
# TODO: Remove me once https://github.com/pytorch/pytorch/issues/160489 is resolved
|
||||
raise ValueError(f"Key {key} could not be found in gql_mocks")
|
||||
|
||||
try:
|
||||
rc = fallback_function(*args)
|
||||
except HTTPError as err:
|
||||
@ -127,7 +121,7 @@ def mock_parse_args(revert: bool = False, force: bool = False) -> Any:
|
||||
self.force = force
|
||||
self.pr_num = 76123
|
||||
self.dry_run = True
|
||||
self.comment_id = 12345 # Set to non-zero value
|
||||
self.comment_id = 0
|
||||
self.reason = "this is for testing"
|
||||
self.ignore_current = False
|
||||
self.check_mergeability = False
|
||||
@ -155,9 +149,9 @@ def mock_revert(
|
||||
def mock_merge(
|
||||
pr: GitHubPR,
|
||||
repo: GitRepo,
|
||||
comment_id: int,
|
||||
dry_run: bool = False,
|
||||
skip_mandatory_checks: bool = False,
|
||||
comment_id: Optional[int] = None,
|
||||
timeout_minutes: int = 400,
|
||||
stale_pr_days: int = 3,
|
||||
ignore_current: bool = False,
|
||||
@ -473,9 +467,9 @@ class TestTryMerge(TestCase):
|
||||
mock_merge.assert_called_once_with(
|
||||
mock.ANY,
|
||||
mock.ANY,
|
||||
comment_id=mock.ANY,
|
||||
dry_run=mock.ANY,
|
||||
skip_mandatory_checks=True,
|
||||
comment_id=mock.ANY,
|
||||
ignore_current=False,
|
||||
)
|
||||
|
||||
@ -488,9 +482,9 @@ class TestTryMerge(TestCase):
|
||||
mock_merge.assert_called_once_with(
|
||||
mock.ANY,
|
||||
mock.ANY,
|
||||
comment_id=mock.ANY,
|
||||
dry_run=mock.ANY,
|
||||
skip_mandatory_checks=False,
|
||||
comment_id=mock.ANY,
|
||||
ignore_current=False,
|
||||
)
|
||||
|
||||
@ -1141,176 +1135,5 @@ Pull Request resolved: https://github.com/pytorch/pytorch/pull/154394"""
|
||||
)
|
||||
|
||||
|
||||
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
|
||||
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
|
||||
@mock.patch(
|
||||
"trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
|
||||
)
|
||||
class TestTimelineFunctions(TestCase):
|
||||
"""Tests for the new timeline-related functions"""
|
||||
|
||||
def test_sha_from_committed_event(self, *args: Any) -> None:
|
||||
"""Test extracting SHA from committed event"""
|
||||
# Based on actual GitHub API format - committed events have "sha" at top level
|
||||
event = {
|
||||
"event": "committed",
|
||||
"sha": "fb21ce932ded6670c918804a0d9151b773770a7c",
|
||||
}
|
||||
self.assertEqual(
|
||||
sha_from_committed_event(event), "fb21ce932ded6670c918804a0d9151b773770a7c"
|
||||
)
|
||||
|
||||
# Test with missing SHA
|
||||
event_no_sha = {"event": "committed"}
|
||||
self.assertIsNone(sha_from_committed_event(event_no_sha))
|
||||
|
||||
def test_sha_from_force_push_after(self, *args: Any) -> None:
|
||||
"""Test extracting SHA from force push event"""
|
||||
# NOTE: The current function doesn't handle the actual GitHub API format
|
||||
# Real force push events have "commit_id" at top level, but this function
|
||||
# looks for "after", "after_commit", "after_sha", or "head_sha" fields
|
||||
|
||||
# Test with the legacy format the current function handles
|
||||
event_legacy = {
|
||||
"event": "head_ref_force_pushed",
|
||||
"after": {"sha": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e"},
|
||||
}
|
||||
self.assertEqual(
|
||||
sha_from_force_push_after(event_legacy),
|
||||
"ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
|
||||
)
|
||||
|
||||
# Test with current GitHub API format (should return None with current implementation)
|
||||
event_real_api = {
|
||||
"event": "head_ref_force_pushed",
|
||||
"commit_id": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
|
||||
}
|
||||
self.assertEqual(
|
||||
sha_from_force_push_after(event_real_api),
|
||||
"ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
|
||||
) # Current function doesn't handle commit_id
|
||||
|
||||
# Test with missing SHA
|
||||
event_no_sha = {"event": "head_ref_force_pushed"}
|
||||
self.assertIsNone(sha_from_force_push_after(event_no_sha))
|
||||
|
||||
@mock.patch("trymerge.gh_fetch_json_list")
|
||||
def test_iter_issue_timeline_until_comment(
|
||||
self, mock_gh_fetch_json_list: Any, *args: Any
|
||||
) -> None:
|
||||
"""Test timeline iteration until target comment"""
|
||||
# Mock timeline data based on actual GitHub API format
|
||||
timeline_data = [
|
||||
{"event": "commented", "id": 100, "body": "first comment"},
|
||||
{"event": "committed", "sha": "fb21ce932ded6670c918804a0d9151b773770a7c"},
|
||||
{"event": "commented", "id": 200, "body": "target comment"},
|
||||
{"event": "commented", "id": 300, "body": "after target"},
|
||||
]
|
||||
mock_gh_fetch_json_list.return_value = timeline_data
|
||||
|
||||
# Test iteration stops at target comment
|
||||
events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 200))
|
||||
self.assertEqual(len(events), 3) # Should stop at target comment
|
||||
self.assertEqual(events[0]["event"], "commented")
|
||||
self.assertEqual(events[0]["id"], 100)
|
||||
self.assertEqual(events[1]["event"], "committed")
|
||||
self.assertEqual(events[1]["sha"], "fb21ce932ded6670c918804a0d9151b773770a7c")
|
||||
self.assertEqual(events[2]["event"], "commented")
|
||||
self.assertEqual(events[2]["id"], 200)
|
||||
|
||||
@mock.patch("trymerge.gh_fetch_json_list")
|
||||
def test_iter_issue_timeline_until_comment_not_found(
|
||||
self, mock_gh_fetch_json_list: Any, *args: Any
|
||||
) -> None:
|
||||
"""Test timeline iteration when target comment is not found"""
|
||||
# Mock empty timeline
|
||||
mock_gh_fetch_json_list.return_value = []
|
||||
|
||||
events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 999))
|
||||
self.assertEqual(len(events), 0)
|
||||
|
||||
@mock.patch("trymerge.iter_issue_timeline_until_comment")
|
||||
def test_get_commit_sha_at_comment_commit_after_comment(
|
||||
self, mock_iter_timeline: Any, *args: Any
|
||||
) -> None:
|
||||
"""Test get_commit_sha_at_comment returns correct SHA after comment"""
|
||||
mock_iter_timeline.return_value = [
|
||||
{"event": "committed", "sha": "commit1"},
|
||||
{"event": "committed", "sha": "commit2"},
|
||||
{"event": "commented", "id": 100},
|
||||
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
|
||||
]
|
||||
pr = GitHubPR("pytorch", "pytorch", 77700)
|
||||
sha = pr.get_commit_sha_at_comment(100)
|
||||
self.assertEqual(sha, "commit2")
|
||||
|
||||
@mock.patch("trymerge.iter_issue_timeline_until_comment")
|
||||
def test_get_commit_sha_at_comment_force_push_before_comment(
|
||||
self, mock_iter_timeline: Any, *args: Any
|
||||
) -> None:
|
||||
mock_iter_timeline.return_value = [
|
||||
{"event": "committed", "sha": "commit1"},
|
||||
{"event": "committed", "sha": "commit2"},
|
||||
{"event": "head_ref_force_pushed", "commit_id": "commit3"},
|
||||
{"event": "commented", "id": 100},
|
||||
]
|
||||
pr = GitHubPR("pytorch", "pytorch", 77700)
|
||||
sha = pr.get_commit_sha_at_comment(100)
|
||||
self.assertEqual(sha, "commit3")
|
||||
|
||||
@mock.patch("trymerge.iter_issue_timeline_until_comment")
|
||||
def test_get_commit_sha_at_comment_force_push_before_comment_legacy_mode(
|
||||
self, mock_iter_timeline: Any, *args: Any
|
||||
) -> None:
|
||||
mock_iter_timeline.return_value = [
|
||||
{"event": "committed", "sha": "commit1"},
|
||||
{"event": "committed", "sha": "commit2"},
|
||||
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
|
||||
{"event": "commented", "id": 100},
|
||||
]
|
||||
pr = GitHubPR("pytorch", "pytorch", 77700)
|
||||
sha = pr.get_commit_sha_at_comment(100)
|
||||
self.assertEqual(sha, "commit3")
|
||||
|
||||
@mock.patch("trymerge.iter_issue_timeline_until_comment")
|
||||
def test_get_commit_sha_at_comment_multiple_comments(
|
||||
self, mock_iter_timeline: Any, *args: Any
|
||||
) -> None:
|
||||
mock_iter_timeline.return_value = [
|
||||
{"event": "committed", "sha": "commit1"},
|
||||
{"event": "commented", "id": 100},
|
||||
{"event": "committed", "sha": "commit2"},
|
||||
{"event": "commented", "id": 200},
|
||||
{"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
|
||||
{"event": "commented", "id": 300},
|
||||
]
|
||||
pr = GitHubPR("pytorch", "pytorch", 77700)
|
||||
sha = pr.get_commit_sha_at_comment(200)
|
||||
self.assertEqual(sha, "commit2")
|
||||
sha = pr.get_commit_sha_at_comment(300)
|
||||
self.assertEqual(sha, "commit3")
|
||||
|
||||
@mock.patch("trymerge.iter_issue_timeline_until_comment")
|
||||
def test_get_commit_sha_at_comment_no_events(
|
||||
self, mock_iter_timeline: Any, *args: Any
|
||||
) -> None:
|
||||
mock_iter_timeline.return_value = [
|
||||
{"event": "commented", "id": 100},
|
||||
{"event": "labeled", "label": {"name": "test"}},
|
||||
]
|
||||
pr = GitHubPR("pytorch", "pytorch", 77700)
|
||||
sha = pr.get_commit_sha_at_comment(100)
|
||||
self.assertIsNone(sha)
|
||||
|
||||
@mock.patch("trymerge.iter_issue_timeline_until_comment")
|
||||
def test_get_commit_sha_at_comment_exception(
|
||||
self, mock_iter_timeline: Any, *args: Any
|
||||
) -> None:
|
||||
mock_iter_timeline.side_effect = Exception("API error")
|
||||
pr = GitHubPR("pytorch", "pytorch", 77700)
|
||||
sha = pr.get_commit_sha_at_comment(100)
|
||||
self.assertIsNone(sha)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
201
.github/scripts/trymerge.py
vendored
201
.github/scripts/trymerge.py
vendored
@ -108,6 +108,10 @@ GH_CHECKSUITES_FRAGMENT = """
|
||||
fragment PRCheckSuites on CheckSuiteConnection {
|
||||
edges {
|
||||
node {
|
||||
app {
|
||||
name
|
||||
databaseId
|
||||
}
|
||||
workflowRun {
|
||||
workflow {
|
||||
name
|
||||
@ -450,63 +454,6 @@ HAS_NO_CONNECTED_DIFF_TITLE = (
|
||||
IGNORABLE_FAILED_CHECKS_THESHOLD = 10
|
||||
|
||||
|
||||
def iter_issue_timeline_until_comment(
|
||||
org: str, repo: str, issue_number: int, target_comment_id: int, max_pages: int = 200
|
||||
) -> Any:
|
||||
"""
|
||||
Yield timeline entries in order until (and including) the entry whose id == target_comment_id
|
||||
for a 'commented' event. Stops once the target comment is encountered.
|
||||
"""
|
||||
page = 1
|
||||
|
||||
while page <= max_pages:
|
||||
url = (
|
||||
f"https://api.github.com/repos/{org}/{repo}/issues/{issue_number}/timeline"
|
||||
)
|
||||
params = {"per_page": 100, "page": page}
|
||||
|
||||
batch = gh_fetch_json_list(url, params)
|
||||
|
||||
if not batch:
|
||||
return
|
||||
for ev in batch:
|
||||
# The target is the issue comment row with event == "commented" and id == issue_comment_id
|
||||
if ev.get("event") == "commented" and ev.get("id") == target_comment_id:
|
||||
yield ev # nothing in the timeline after this matters, so stop early
|
||||
return
|
||||
yield ev
|
||||
if len(batch) < 100:
|
||||
return
|
||||
page += 1
|
||||
|
||||
# If we got here without finding the comment, then we either hit a bug or some github PR
|
||||
# has a _really_ long timeline.
|
||||
# The max # of pages found on any pytorch/pytorch PR at the time of this change was 41
|
||||
raise RuntimeError(
|
||||
f"Could not find a merge commit in the first {max_pages} pages of the timeline at url {url}."
|
||||
f"This is most likely a bug, please report it to the @pytorch/pytorch-dev-infra team."
|
||||
)
|
||||
|
||||
|
||||
def sha_from_committed_event(ev: dict[str, Any]) -> Optional[str]:
|
||||
"""Extract SHA from committed event in timeline"""
|
||||
return ev.get("sha")
|
||||
|
||||
|
||||
def sha_from_force_push_after(ev: dict[str, Any]) -> Optional[str]:
|
||||
"""Extract SHA from force push event in timeline"""
|
||||
# The current GitHub API format
|
||||
commit_id = ev.get("commit_id")
|
||||
if commit_id:
|
||||
return str(commit_id)
|
||||
|
||||
# Legacy format
|
||||
after = ev.get("after") or ev.get("after_commit") or {}
|
||||
if isinstance(after, dict):
|
||||
return after.get("sha") or after.get("oid")
|
||||
return ev.get("after_sha") or ev.get("head_sha")
|
||||
|
||||
|
||||
def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any:
|
||||
rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no)
|
||||
return rc["data"]["repository"]["pullRequest"]
|
||||
@ -794,24 +741,16 @@ class GitHubPR:
|
||||
def last_commit(self) -> Any:
|
||||
return self.info["commits"]["nodes"][-1]["commit"]
|
||||
|
||||
def last_commit_sha(self, default: Optional[str] = None) -> str:
|
||||
# for commits, the oid is the sha
|
||||
|
||||
if default is None:
|
||||
return str(self.last_commit()["oid"])
|
||||
|
||||
return str(self.last_commit().get("oid", default))
|
||||
|
||||
def get_merge_base(self) -> str:
|
||||
if self.merge_base:
|
||||
return self.merge_base
|
||||
|
||||
last_commit_sha = self.last_commit_sha()
|
||||
last_commit_oid = self.last_commit()["oid"]
|
||||
# NB: We could use self.base_ref() here for regular PR, however, that doesn't
|
||||
# work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base,
|
||||
# so let's just use main instead
|
||||
self.merge_base = gh_fetch_merge_base(
|
||||
self.org, self.project, last_commit_sha, self.default_branch()
|
||||
self.org, self.project, last_commit_oid, self.default_branch()
|
||||
)
|
||||
|
||||
# Fallback to baseRefOid if the API call fails, i.e. rate limit. Note that baseRefOid
|
||||
@ -900,44 +839,6 @@ class GitHubPR:
|
||||
def get_commit_count(self) -> int:
|
||||
return int(self.info["commits_with_authors"]["totalCount"])
|
||||
|
||||
def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]:
|
||||
"""
|
||||
Get the PR head commit SHA that was present when a specific comment was posted.
|
||||
This ensures we only merge the state of the PR at the time the merge command was issued,
|
||||
not any subsequent commits that may have been pushed after.
|
||||
|
||||
Returns None if no head-changing events found before the comment or if the comment was not found.
|
||||
"""
|
||||
head = None
|
||||
|
||||
try:
|
||||
for event in iter_issue_timeline_until_comment(
|
||||
self.org, self.project, self.pr_num, comment_id
|
||||
):
|
||||
etype = event.get("event")
|
||||
if etype == "committed":
|
||||
sha = sha_from_committed_event(event)
|
||||
if sha:
|
||||
head = sha
|
||||
print(f"Timeline: Found commit event for SHA {sha}")
|
||||
elif etype == "head_ref_force_pushed":
|
||||
sha = sha_from_force_push_after(event)
|
||||
if sha:
|
||||
head = sha
|
||||
print(f"Timeline: Found force push event for SHA {sha}")
|
||||
elif etype == "commented":
|
||||
if event.get("id") == comment_id:
|
||||
print(f"Timeline: Found final comment with sha {sha}")
|
||||
return head
|
||||
except Exception as e:
|
||||
print(
|
||||
f"Warning: Failed to reconstruct timeline for comment {comment_id}: {e}"
|
||||
)
|
||||
return None
|
||||
|
||||
print(f"Did not find comment with id {comment_id} in the PR timeline")
|
||||
return None
|
||||
|
||||
def get_pr_creator_login(self) -> str:
|
||||
return cast(str, self.info["author"]["login"])
|
||||
|
||||
@ -1254,7 +1155,7 @@ class GitHubPR:
|
||||
*,
|
||||
skip_mandatory_checks: bool = False,
|
||||
dry_run: bool = False,
|
||||
comment_id: int,
|
||||
comment_id: Optional[int] = None,
|
||||
ignore_current_checks: Optional[list[str]] = None,
|
||||
) -> None:
|
||||
# Raises exception if matching rule is not found
|
||||
@ -1270,7 +1171,7 @@ class GitHubPR:
|
||||
skip_internal_checks=can_skip_internal_checks(self, comment_id),
|
||||
ignore_current_checks=ignore_current_checks,
|
||||
)
|
||||
additional_merged_prs = self.merge_changes_locally(
|
||||
additional_merged_prs = self.merge_changes(
|
||||
repo, skip_mandatory_checks, comment_id
|
||||
)
|
||||
|
||||
@ -1299,7 +1200,7 @@ class GitHubPR:
|
||||
broken_trunk_checks=ignorable_checks.get("BROKEN_TRUNK", []),
|
||||
flaky_checks=ignorable_checks.get("FLAKY", []),
|
||||
unstable_checks=ignorable_checks.get("UNSTABLE", []),
|
||||
last_commit_sha=self.last_commit_sha(default=""),
|
||||
last_commit_sha=self.last_commit().get("oid", ""),
|
||||
merge_base_sha=self.get_merge_base(),
|
||||
merge_commit_sha=merge_commit_sha,
|
||||
is_failed=False,
|
||||
@ -1320,7 +1221,7 @@ class GitHubPR:
|
||||
dry_run=dry_run,
|
||||
)
|
||||
|
||||
def merge_changes_locally(
|
||||
def merge_changes(
|
||||
self,
|
||||
repo: GitRepo,
|
||||
skip_mandatory_checks: bool = False,
|
||||
@ -1329,15 +1230,27 @@ class GitHubPR:
|
||||
skip_all_rule_checks: bool = False,
|
||||
) -> list["GitHubPR"]:
|
||||
"""
|
||||
:param skip_all_rule_checks: If true, skips all rule checks on ghstack PRs, useful for dry-running merge locally
|
||||
:param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally
|
||||
"""
|
||||
branch_to_merge_into = self.default_branch() if branch is None else branch
|
||||
if repo.current_branch() != branch_to_merge_into:
|
||||
repo.checkout(branch_to_merge_into)
|
||||
if not self.is_ghstack_pr():
|
||||
msg = self.gen_commit_message()
|
||||
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
|
||||
repo.fetch(self.last_commit()["oid"], pr_branch_name)
|
||||
repo._run_git("merge", "--squash", pr_branch_name)
|
||||
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
|
||||
|
||||
# It's okay to skip the commit SHA check for ghstack PRs since
|
||||
# authoring requires write access to the repo.
|
||||
if self.is_ghstack_pr():
|
||||
# Did the PR change since we started the merge?
|
||||
pulled_sha = repo.show_ref(pr_branch_name)
|
||||
latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
|
||||
if pulled_sha != latest_pr_status.last_commit()["oid"]:
|
||||
raise RuntimeError(
|
||||
"PR has been updated since CI checks last passed. Please rerun the merge command."
|
||||
)
|
||||
return []
|
||||
else:
|
||||
return self.merge_ghstack_into(
|
||||
repo,
|
||||
skip_mandatory_checks,
|
||||
@ -1345,48 +1258,6 @@ class GitHubPR:
|
||||
skip_all_rule_checks=skip_all_rule_checks,
|
||||
)
|
||||
|
||||
msg = self.gen_commit_message()
|
||||
pr_branch_name = f"__pull-request-{self.pr_num}__init__"
|
||||
|
||||
# Determine which commit SHA to merge
|
||||
commit_to_merge = None
|
||||
if not comment_id:
|
||||
raise ValueError("Must provide --comment-id when merging regular PRs")
|
||||
|
||||
# Get the commit SHA that was present when the comment was made
|
||||
commit_to_merge = self.get_commit_sha_at_comment(comment_id)
|
||||
if not commit_to_merge:
|
||||
raise RuntimeError(
|
||||
f"Could not find commit that was pushed before comment {comment_id}"
|
||||
)
|
||||
|
||||
# Validate that this commit is the latest commit on the PR
|
||||
latest_commit = self.last_commit_sha()
|
||||
if commit_to_merge != latest_commit:
|
||||
raise RuntimeError(
|
||||
f"Commit {commit_to_merge} was HEAD when comment {comment_id} was posted "
|
||||
f"but now the latest commit on the PR is {latest_commit}. "
|
||||
f"Please re-issue the merge command to merge the latest commit."
|
||||
)
|
||||
|
||||
print(f"Merging commit {commit_to_merge} locally")
|
||||
|
||||
repo.fetch(commit_to_merge, pr_branch_name)
|
||||
repo._run_git("merge", "--squash", pr_branch_name)
|
||||
repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
|
||||
|
||||
# Did the PR change since we started the merge?
|
||||
pulled_sha = repo.show_ref(pr_branch_name)
|
||||
latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
|
||||
if (
|
||||
pulled_sha != latest_pr_status.last_commit_sha()
|
||||
or pulled_sha != commit_to_merge
|
||||
):
|
||||
raise RuntimeError(
|
||||
"PR has been updated since CI checks last passed. Please rerun the merge command."
|
||||
)
|
||||
return []
|
||||
|
||||
|
||||
class MergeRuleFailedError(RuntimeError):
|
||||
def __init__(self, message: str, rule: Optional["MergeRule"] = None) -> None:
|
||||
@ -1591,7 +1462,7 @@ def find_matching_merge_rule(
|
||||
pending_checks = []
|
||||
failed_checks = []
|
||||
|
||||
hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit_sha()}"
|
||||
hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}"
|
||||
if len(failed_checks) > 0:
|
||||
if reject_reason_score < 30000:
|
||||
reject_reason_score = 30000
|
||||
@ -2289,14 +2160,14 @@ def categorize_checks(
|
||||
def merge(
|
||||
pr: GitHubPR,
|
||||
repo: GitRepo,
|
||||
comment_id: int,
|
||||
dry_run: bool = False,
|
||||
skip_mandatory_checks: bool = False,
|
||||
comment_id: Optional[int] = None,
|
||||
timeout_minutes: int = 400,
|
||||
stale_pr_days: int = 3,
|
||||
ignore_current: bool = False,
|
||||
) -> None:
|
||||
initial_commit_sha = pr.last_commit_sha()
|
||||
initial_commit_sha = pr.last_commit()["oid"]
|
||||
pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}"
|
||||
print(f"Attempting merge of {initial_commit_sha} ({pr_link})")
|
||||
|
||||
@ -2367,7 +2238,7 @@ def merge(
|
||||
f"Attempting merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} ({elapsed_time / 60} minutes elapsed)"
|
||||
)
|
||||
pr = GitHubPR(pr.org, pr.project, pr.pr_num)
|
||||
if initial_commit_sha != pr.last_commit_sha():
|
||||
if initial_commit_sha != pr.last_commit()["oid"]:
|
||||
raise RuntimeError(
|
||||
"New commits were pushed while merging. Please rerun the merge command."
|
||||
)
|
||||
@ -2534,7 +2405,7 @@ def main() -> None:
|
||||
if args.check_mergeability:
|
||||
if pr.is_ghstack_pr():
|
||||
get_ghstack_prs(repo, pr) # raises error if out of sync
|
||||
pr.merge_changes_locally(
|
||||
pr.merge_changes(
|
||||
repo,
|
||||
skip_mandatory_checks=True,
|
||||
skip_all_rule_checks=True,
|
||||
@ -2549,18 +2420,12 @@ def main() -> None:
|
||||
gh_post_pr_comment(org, project, args.pr_num, message, dry_run=args.dry_run)
|
||||
return
|
||||
try:
|
||||
# Ensure comment id is set, else fail
|
||||
if not args.comment_id:
|
||||
raise ValueError(
|
||||
"Comment ID is required for merging PRs, please provide it using --comment-id"
|
||||
)
|
||||
|
||||
merge(
|
||||
pr,
|
||||
repo,
|
||||
comment_id=args.comment_id,
|
||||
dry_run=args.dry_run,
|
||||
skip_mandatory_checks=args.force,
|
||||
comment_id=args.comment_id,
|
||||
ignore_current=args.ignore_current,
|
||||
)
|
||||
except Exception as e:
|
||||
@ -2582,7 +2447,7 @@ def main() -> None:
|
||||
broken_trunk_checks=[],
|
||||
flaky_checks=[],
|
||||
unstable_checks=[],
|
||||
last_commit_sha=pr.last_commit_sha(default=""),
|
||||
last_commit_sha=pr.last_commit().get("oid", ""),
|
||||
merge_base_sha=pr.get_merge_base(),
|
||||
is_failed=True,
|
||||
skip_mandatory_checks=args.force,
|
||||
|
||||
3
.github/scripts/windows/build_magma.bat
vendored
3
.github/scripts/windows/build_magma.bat
vendored
@ -35,9 +35,6 @@ cd magma
|
||||
mkdir build && cd build
|
||||
|
||||
set GPU_TARGET=All
|
||||
if "%CUVER_NODOT%" == "130" (
|
||||
set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
|
||||
)
|
||||
if "%CUVER_NODOT%" == "129" (
|
||||
set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
|
||||
)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user