typing tvm.py

Type backend torchxla
typing registry.py
2025-11-03 07:24:58 +08:00 · 2025-08-11 14:57:13 -07:00 · 2025-08-11 14:37:34 -07:00 · 2025-08-11 14:09:50 -07:00 · 2025-08-11 13:51:06 -07:00 · 2025-08-11 13:43:05 -07:00
1809 changed files with 80897 additions and 89628 deletions
--- a/.bc-linter.yml
+++ b/.bc-linter.yml
@ -1,15 +0,0 @@
-version: 1
-paths:
-include:
-  - "**/*.py"
-exclude:
-  - ".*"
-  - ".*/**"
-  - "**/.*/**"
-  - "**/.*"
-  - "**/_*/**"
-  - "**/_*.py"
-  - "**/test/**"
-  - "**/benchmarks/**"
-  - "**/test_*.py"
-  - "**/*_test.py"
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -7,15 +7,6 @@ if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
 fi

-if [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0"
-fi
-
-# Compress the fatbin with -compress-mode=size for CUDA 13
-if [[ "$DESIRED_CUDA" == *"13"* ]]; then
-    export TORCH_NVCC_FLAGS="-compress-mode=size"
-fi
-
 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
 source $SCRIPTPATH/aarch64_ci_setup.sh

--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -77,24 +77,21 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
    wheelname = os.path.basename(wheel_path)
    os.mkdir(f"{folder}/tmp")
    os.system(f"unzip {wheel_path} -d {folder}/tmp")
-    # Common libraries for all CUDA versions
-    common_libs = [
-        # Non-NVIDIA system libraries
-        "/lib64/libgomp.so.1",
-        "/usr/lib64/libgfortran.so.5",
-        "/acl/build/libarm_compute.so",
-        "/acl/build/libarm_compute_graph.so",
-        # Common CUDA libraries (same for all versions)
-        "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-        "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-        "/usr/local/lib/libnvpl_lapack_core.so.0",
-        "/usr/local/lib/libnvpl_blas_core.so.0",
+    libs_to_copy = [
+        "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
        "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
        "/usr/local/cuda/lib64/libcudnn.so.9",
+        "/usr/local/cuda/lib64/libcublas.so.12",
+        "/usr/local/cuda/lib64/libcublasLt.so.12",
+        "/usr/local/cuda/lib64/libcudart.so.12",
+        "/usr/local/cuda/lib64/libcufft.so.11",
+        "/usr/local/cuda/lib64/libcusparse.so.12",
        "/usr/local/cuda/lib64/libcusparseLt.so.0",
+        "/usr/local/cuda/lib64/libcusolver.so.11",
        "/usr/local/cuda/lib64/libcurand.so.10",
        "/usr/local/cuda/lib64/libnccl.so.2",
-        "/usr/local/cuda/lib64/libnvshmem_host.so.3",
+        "/usr/local/cuda/lib64/libnvJitLink.so.12",
+        "/usr/local/cuda/lib64/libnvrtc.so.12",
        "/usr/local/cuda/lib64/libcudnn_adv.so.9",
        "/usr/local/cuda/lib64/libcudnn_cnn.so.9",
        "/usr/local/cuda/lib64/libcudnn_graph.so.9",
@ -102,41 +99,22 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
        "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
        "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
        "/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
-        "/usr/local/cuda/lib64/libcufile.so.0",
-        "/usr/local/cuda/lib64/libcufile_rdma.so.1",
-        "/usr/local/cuda/lib64/libcusparse.so.12",
+        "/lib64/libgomp.so.1",
+        "/usr/lib64/libgfortran.so.5",
+        "/acl/build/libarm_compute.so",
+        "/acl/build/libarm_compute_graph.so",
+        "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
+        "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
+        "/usr/local/lib/libnvpl_lapack_core.so.0",
+        "/usr/local/lib/libnvpl_blas_core.so.0",
    ]

-    # CUDA version-specific libraries
-    if "130" in desired_cuda:
-        version_specific_libs = [
-            "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
-            "/usr/local/cuda/lib64/libcublas.so.13",
-            "/usr/local/cuda/lib64/libcublasLt.so.13",
-            "/usr/local/cuda/lib64/libcudart.so.13",
-            "/usr/local/cuda/lib64/libcufft.so.12",
-            "/usr/local/cuda/lib64/libcusolver.so.12",
-            "/usr/local/cuda/lib64/libnvJitLink.so.13",
-            "/usr/local/cuda/lib64/libnvrtc.so.13",
-            "/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
+    if "129" in desired_cuda:
+        libs_to_copy += [
+            "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
+            "/usr/local/cuda/lib64/libcufile.so.0",
+            "/usr/local/cuda/lib64/libcufile_rdma.so.1",
        ]
-    elif "12" in desired_cuda:
-        # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
-        minor_version = desired_cuda[-1]
-        version_specific_libs = [
-            "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
-            "/usr/local/cuda/lib64/libcublas.so.12",
-            "/usr/local/cuda/lib64/libcublasLt.so.12",
-            "/usr/local/cuda/lib64/libcudart.so.12",
-            "/usr/local/cuda/lib64/libcufft.so.11",
-            "/usr/local/cuda/lib64/libcusolver.so.11",
-            "/usr/local/cuda/lib64/libnvJitLink.so.12",
-            "/usr/local/cuda/lib64/libnvrtc.so.12",
-            f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
-        ]
-
-    # Combine all libraries
-    libs_to_copy = common_libs + version_specific_libs

    # Copy libraries to unzipped_folder/a/lib
    for lib_path in libs_to_copy:
@ -230,7 +208,7 @@ if __name__ == "__main__":
    build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
    if enable_cuda:
-        build_vars += "MAX_JOBS=5 "
+        build_vars = "MAX_JOBS=5 " + build_vars

    override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
    desired_cuda = os.getenv("DESIRED_CUDA")
--- a/.ci/docker/README.md
+++ b/.ci/docker/README.md
@ -120,8 +120,8 @@ If your new Docker image needs a library installed from a specific pinned commit
   If you're introducing a new argument to the Docker build, make sure to add it in the Docker build step in `.ci/docker/build.sh`:
   ```bash
   docker build \
-     ....
-     --build-arg "NEW_ARG_1=${NEW_ARG_1}"
+      ....
+      --build-arg "NEW_ARG_1=${NEW_ARG_1}"
   ```

 3. **Update Dockerfile logic**:
--- a/.ci/docker/almalinux/Dockerfile
+++ b/.ci/docker/almalinux/Dockerfile
@ -64,10 +64,6 @@ FROM cuda as cuda12.9
 RUN bash ./install_cuda.sh 12.9
 ENV DESIRED_CUDA=12.9

-FROM cuda as cuda13.0
-RUN bash ./install_cuda.sh 13.0
-ENV DESIRED_CUDA=13.0
-
 FROM ${ROCM_IMAGE} as rocm
 ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
 ADD ./common/install_mkl.sh install_mkl.sh
@ -80,10 +76,10 @@ ADD ./common/install_mnist.sh install_mnist.sh
 RUN bash ./install_mnist.sh

 FROM base as all_cuda
+COPY --from=cuda11.8  /usr/local/cuda-11.8 /usr/local/cuda-11.8
 COPY --from=cuda12.6  /usr/local/cuda-12.6 /usr/local/cuda-12.6
 COPY --from=cuda12.8  /usr/local/cuda-12.8 /usr/local/cuda-12.8
 COPY --from=cuda12.9  /usr/local/cuda-12.9 /usr/local/cuda-12.9
-COPY --from=cuda13.0  /usr/local/cuda-13.0 /usr/local/cuda-13.0

 # Final step
 FROM ${BASE_TARGET} as final
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -76,13 +76,10 @@ elif [[ "$image" == *cuda*linter* ]]; then
 elif [[ "$image" == *linter* ]]; then
  # Use a separate Dockerfile for linter to keep a small image size
  DOCKERFILE="linter/Dockerfile"
-elif [[ "$image" == *riscv* ]]; then
-  # Use RISC-V specific Dockerfile
-  DOCKERFILE="ubuntu-cross-riscv/Dockerfile"
 fi

-_UCX_COMMIT=7836b165abdbe468a2f607e7254011c07d788152
-_UCC_COMMIT=430e241bf5d38cbc73fc7a6b89155397232e3f96
+_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
+_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
 if [[ "$image" == *rocm* ]]; then
  _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
  _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
@ -114,19 +111,31 @@ case "$tag" in
    UCC_COMMIT=${_UCC_COMMIT}
    TRITON=yes
    ;;
-  pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
-    CUDA_VERSION=13.0.0
+  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.8.1
    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=11
+    GCC_VERSION=9
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
+  pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
    CUDA_VERSION=12.8.1
-    ANACONDA_PYTHON_VERSION=3.10
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=9
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    TRITON=yes
+    INDUCTOR_BENCHMARKS=yes
+    ;;
+  pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.8.1
+    ANACONDA_PYTHON_VERSION=3.13
    GCC_VERSION=9
    VISION=yes
    KATEX=yes
@ -156,13 +165,13 @@ case "$tag" in
    TRITON=yes
    ;;
  pytorch-linux-jammy-py3-clang12-onnx)
-    ANACONDA_PYTHON_VERSION=3.10
+    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=12
    VISION=yes
    ONNX=yes
    ;;
-  pytorch-linux-jammy-py3.10-clang12)
-    ANACONDA_PYTHON_VERSION=3.10
+  pytorch-linux-jammy-py3.9-clang12)
+    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=12
    VISION=yes
    TRITON=yes
@ -197,24 +206,23 @@ case "$tag" in
    UCC_COMMIT=${_UCC_COMMIT}
    PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
    ;;
-  pytorch-linux-jammy-xpu-n-1-py3)
-    ANACONDA_PYTHON_VERSION=3.10
+  pytorch-linux-jammy-xpu-2025.0-py3)
+    ANACONDA_PYTHON_VERSION=3.9
+    GCC_VERSION=11
+    VISION=yes
+    XPU_VERSION=2025.0
+    NINJA_VERSION=1.9.0
+    TRITON=yes
+    ;;
+  pytorch-linux-jammy-xpu-2025.1-py3)
+    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    VISION=yes
    XPU_VERSION=2025.1
    NINJA_VERSION=1.9.0
    TRITON=yes
    ;;
-  pytorch-linux-jammy-xpu-n-py3)
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=11
-    VISION=yes
-    XPU_VERSION=2025.2
-    NINJA_VERSION=1.9.0
-    TRITON=yes
-    ;;
-  pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
-    # TODO (huydhn): Upgrade this to Python >= 3.10
+  pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    VISION=yes
@ -223,8 +231,8 @@ case "$tag" in
    DOCS=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-clang12)
-    ANACONDA_PYTHON_VERSION=3.10
+  pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
+    ANACONDA_PYTHON_VERSION=3.9
    CUDA_VERSION=12.8.1
    CLANG_VERSION=12
    VISION=yes
@ -235,8 +243,8 @@ case "$tag" in
    CLANG_VERSION=18
    VISION=yes
    ;;
-  pytorch-linux-jammy-py3.10-gcc11)
-    ANACONDA_PYTHON_VERSION=3.10
+  pytorch-linux-jammy-py3.9-gcc11)
+    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    VISION=yes
    KATEX=yes
@ -277,6 +285,7 @@ case "$tag" in
    GCC_VERSION=11
    ACL=yes
    VISION=yes
+    CONDA_CMAKE=yes
    OPENBLAS=yes
    # snadampal: skipping llvm src build install because the current version
    # from pytorch/llvm:9.0.1 is x86 specific
@ -287,15 +296,13 @@ case "$tag" in
    GCC_VERSION=11
    ACL=yes
    VISION=yes
+    CONDA_CMAKE=yes
    OPENBLAS=yes
    # snadampal: skipping llvm src build install because the current version
    # from pytorch/llvm:9.0.1 is x86 specific
    SKIP_LLVM_SRC_BUILD_INSTALL=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-noble-riscv64-py3.12-gcc14)
-    GCC_VERSION=14
-    ;;
  *)
    # Catch-all for builds that are not hardcoded.
    VISION=yes
@ -416,14 +423,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
 fi

 if [ -n "$GCC_VERSION" ]; then
-  if [[ "$image" == *riscv* ]]; then
-    # Check RISC-V cross-compilation toolchain version
-    if !(drun riscv64-linux-gnu-gcc-${GCC_VERSION} --version 2>&1 | grep -q " $GCC_VERSION\\W"); then
-      echo "RISC-V GCC_VERSION=$GCC_VERSION, but:"
-      drun riscv64-linux-gnu-gcc-${GCC_VERSION} --version
-      exit 1
-    fi
-  elif !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then
+  if !(drun gcc --version 2>&1 | grep -q " $GCC_VERSION\\W"); then
    echo "GCC_VERSION=$GCC_VERSION, but:"
    drun gcc --version
    exit 1
--- a/.ci/docker/ci_commit_pins/huggingface-requirements.txt
+++ b/.ci/docker/ci_commit_pins/huggingface-requirements.txt
@ -1,2 +0,0 @@
-transformers==4.54.0
-soxr==0.5.0
--- a/.ci/docker/ci_commit_pins/huggingface.txt
+++ b/.ci/docker/ci_commit_pins/huggingface.txt
@ -0,0 +1 @@
+243e186efbf7fb93328dd6b34927a4e8c8f24395
--- a/.ci/docker/ci_commit_pins/nccl-cu13.txt
+++ b/.ci/docker/ci_commit_pins/nccl-cu13.txt
@ -1 +0,0 @@
-v2.27.7-1
--- a/.ci/docker/ci_commit_pins/torchbench.txt
+++ b/.ci/docker/ci_commit_pins/torchbench.txt
@ -1 +1 @@
-74a23feff57432129df84d8099e622773cf77925
+e03a63be43e33596f7f0a43b0f530353785e4a59
--- a/.ci/docker/ci_commit_pins/triton-xpu.txt
+++ b/.ci/docker/ci_commit_pins/triton-xpu.txt
@ -1 +1 @@
-d0e80f39c562c70986fc548fa6e5852ad86e16e7
+ae324eeac8e102a2b40370e341460f3791353398
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -10,7 +10,7 @@ else
  arch_path='sbsa'
 fi

-NVSHMEM_VERSION=3.3.24
+NVSHMEM_VERSION=3.3.9

 function install_cuda {
  version=$1
@ -62,16 +62,14 @@ function install_nvshmem {
  mkdir -p "${tmpdir}" && cd "${tmpdir}"

  # nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
-  # This pattern is a lie as it is not consistent across versions, for 3.3.9 it was cuda_ver-arch-nvshhem-ver
-  filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
-  suffix=".tar.xz"
-  url="https://developer.download.nvidia.com/compute/nvshmem/redist/libnvshmem/linux-${arch_path}/${filename}${suffix}"
+  filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
+  url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"

  # download, unpack, install
  wget -q "${url}"
-  tar xf "${filename}${suffix}"
-  cp -a "${filename}/include/"* /usr/local/cuda/include/
-  cp -a "${filename}/lib/"*     /usr/local/cuda/lib64/
+  tar xf "${filename}.tar.gz"
+  cp -a "libnvshmem/include/"* /usr/local/cuda/include/
+  cp -a "libnvshmem/lib/"*     /usr/local/cuda/lib64/

  # cleanup
  cd ..
@ -128,6 +126,74 @@ function install_129 {
  ldconfig
 }

+function prune_124 {
+  echo "Pruning CUDA 12.4"
+  #####################################################################################
+  # CUDA 12.4: run nvprune with the -gencode lists below, rewriting each archive in place
+  #####################################################################################
+  export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
+  export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
+
+  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"  # GENCODE plus sm_61
+
+  if [[ -n "$OVERRIDE_GENCODE" ]]; then  # a non-empty OVERRIDE_GENCODE replaces the default list
+      export GENCODE=$OVERRIDE_GENCODE
+  fi
+  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then  # same override hook for the list used on cuBLAS
+      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
+  fi
+
+  # every entry matching "\.a" except culibos, cudart, cudnn, cublas, metis; NOTE(review): pattern is unanchored, so it also matches ".a" mid-name - confirm intended
+  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
+      | xargs -I {} bash -c \
+                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+  # prune the cuBLAS and cuBLASLt static archives with the GENCODE_CUDNN list (no cuDNN archive is pruned here)
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+  #####################################################################################
+  # CUDA 12.4: delete the visual tools (libnvvp, nsightee_plugins, Nsight Compute/Systems dirs)
+  #####################################################################################
+  export CUDA_BASE="/usr/local/cuda-12.4/"
+  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
+}
+
+function prune_126 {
+  echo "Pruning CUDA 12.6"
+  #####################################################################################
+  # CUDA 12.6: run nvprune with the -gencode lists below, rewriting each archive in place
+  #####################################################################################
+  export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
+  export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
+
+  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"  # GENCODE plus sm_61
+
+  if [[ -n "$OVERRIDE_GENCODE" ]]; then  # a non-empty OVERRIDE_GENCODE replaces the default list
+      export GENCODE=$OVERRIDE_GENCODE
+  fi
+  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then  # same override hook for the list used on cuBLAS
+      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
+  fi
+
+  # every entry matching "\.a" except culibos, cudart, cudnn, cublas, metis; NOTE(review): pattern is unanchored, so it also matches ".a" mid-name - confirm intended
+  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
+      | xargs -I {} bash -c \
+                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+  # prune the cuBLAS and cuBLASLt static archives with the GENCODE_CUDNN list (no cuDNN archive is pruned here)
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+  #####################################################################################
+  # CUDA 12.6: delete the visual tools (libnvvp, nsightee_plugins, Nsight Compute/Systems dirs)
+  #####################################################################################
+  export CUDA_BASE="/usr/local/cuda-12.6/"
+  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
+}
+
 function install_128 {
  CUDNN_VERSION=9.8.0.87
  echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
@ -146,38 +212,18 @@ function install_128 {
  ldconfig
 }

-function install_130 {
-  CUDNN_VERSION=9.12.0.46
-  echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
-  # install CUDA 13.0 in the same container
-  install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
-
-  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-  install_cudnn 13 $CUDNN_VERSION
-
-  install_nvshmem 13 $NVSHMEM_VERSION
-
-  CUDA_VERSION=13.0 bash install_nccl.sh
-
-  CUDA_VERSION=13.0 bash install_cusparselt.sh
-
-  ldconfig
-}
-
 # idiomatic parameter and option handling in sh
 while test $# -gt 0
 do
    case "$1" in
-    12.4) install_124;
+    12.4) install_124; prune_124
        ;;
-    12.6|12.6.*) install_126;
+    12.6|12.6.*) install_126; prune_126
        ;;
    12.8|12.8.*) install_128;
        ;;
    12.9|12.9.*) install_129;
        ;;
-    13.0|13.0.*) install_130;
-        ;;
    *) echo "bad argument $1"; exit 1
        ;;
    esac
--- a/.ci/docker/common/install_cusparselt.sh
+++ b/.ci/docker/common/install_cusparselt.sh
@ -5,15 +5,7 @@ set -ex
 # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
 mkdir tmp_cusparselt && cd tmp_cusparselt

-if [[ ${CUDA_VERSION:0:4} =~ "13" ]]; then
-    arch_path='sbsa'
-    export TARGETARCH=${TARGETARCH:-$(uname -m)}
-    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
-        arch_path='x86_64'
-    fi
-    CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.8.0.4_cuda13-archive"
-    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
-elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
+if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
    arch_path='sbsa'
    export TARGETARCH=${TARGETARCH:-$(uname -m)}
    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
--- a/.ci/docker/common/install_inductor_benchmark_deps.sh
+++ b/.ci/docker/common/install_inductor_benchmark_deps.sh
@ -5,7 +5,9 @@ set -ex
 source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

 function install_huggingface() {
-  pip_install -r huggingface-requirements.txt
+  local commit  # scope the pin to this function instead of leaking a global; declared apart from the assignment
+  commit=$(get_pinned_commit huggingface)  # ref to install from; get_pinned_commit is defined outside this hunk (presumably common_utils.sh - confirm)
+  pip_install "git+https://github.com/huggingface/transformers@${commit}"  # install transformers from GitHub at that ref
 }

 function install_timm() {
@ -24,12 +26,15 @@ function install_torchbench() {

  python install.py --continue_on_fail

+  # TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
+  # is regressing speedup metric. This needs to be investigated further
+  pip install transformers==4.38.1
+
  echo "Print all dependencies after TorchBench is installed"
  python -mpip freeze
  popd

  chown -R jenkins torchbench
-  chown -R jenkins /opt/conda
 }

 # Pango is needed for weasyprint which is needed for doctr
@ -43,4 +48,4 @@ install_huggingface
 install_timm

 # Clean up
-conda_run pip uninstall -y torch torchvision torchaudio triton torchao
+conda_run pip uninstall -y torch torchvision torchaudio triton
--- a/.ci/docker/common/install_nccl.sh
+++ b/.ci/docker/common/install_nccl.sh
@ -7,8 +7,6 @@ if [[ ${CUDA_VERSION:0:2} == "11" ]]; then
  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt)
 elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt)
-elif [[ ${CUDA_VERSION:0:2} == "13" ]]; then
-  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu13.txt)
 else
  echo "Unexpected CUDA_VERSION ${CUDA_VERSION}"
  exit 1
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -19,8 +19,8 @@ pip_install \
  transformers==4.36.2

 pip_install coloredlogs packaging
-pip_install onnxruntime==1.22.1
-pip_install onnxscript==0.4.0
+pip_install onnxruntime==1.18.1
+pip_install onnxscript==0.3.1

 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
 # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@ -57,7 +57,7 @@ if [ ! -f setup.py ]; then
  cd python
 fi

-pip_install pybind11==3.0.1
+pip_install pybind11==2.13.6

 # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
 as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py
--- a/.ci/docker/common/install_ucc.sh
+++ b/.ci/docker/common/install_ucc.sh
@ -44,12 +44,8 @@ function install_ucc() {

  ./autogen.sh

-  if [[ -n "$CUDA_VERSION"  && $CUDA_VERSION == 13* ]]; then
-    NVCC_GENCODE="-gencode=arch=compute_86,code=compute_86"
-  else
-    # We only run distributed tests on Tesla M60 and A10G
-    NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
-  fi
+  # We only run distributed tests on Tesla M60 and A10G
+  NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"

  if [[ -n "$ROCM_VERSION" ]]; then
    if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -65,14 +65,10 @@ function install_ubuntu() {

 function install_rhel() {
    . /etc/os-release
-    if [[ "${ID}" == "rhel" ]]; then
-        if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
-            echo "RHEL version ${VERSION_ID} not supported"
-            exit
-        fi
-    elif [[ "${ID}" == "almalinux" ]]; then
-        # Workaround for almalinux8 which used by quay.io/pypa/manylinux_2_28_x86_64
-        VERSION_ID="8.8"
+
+    if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
+        echo "RHEL version ${VERSION_ID} not supported"
+        exit
    fi

    dnf install -y 'dnf-command(config-manager)'
@ -150,11 +146,11 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
    XPU_DRIVER_VERSION="/lts/2350"
 fi

-# Default use Intel® oneAPI Deep Learning Essentials 2025.1
-if [[ "$XPU_VERSION" == "2025.2" ]]; then
-    XPU_PACKAGES="intel-deep-learning-essentials-2025.2"
-else
+# Default use Intel® oneAPI Deep Learning Essentials 2025.0
+if [[ "$XPU_VERSION" == "2025.1" ]]; then
    XPU_PACKAGES="intel-deep-learning-essentials-2025.1"
+else
+    XPU_PACKAGES="intel-deep-learning-essentials-2025.0"
 fi

 # The installation depends on the base OS
--- a/.ci/docker/libtorch/Dockerfile
+++ b/.ci/docker/libtorch/Dockerfile
@ -69,11 +69,6 @@ RUN bash ./install_cuda.sh 12.9
 RUN bash ./install_magma.sh 12.9
 RUN ln -sf /usr/local/cuda-12.9 /usr/local/cuda

-FROM cuda as cuda13.0
-RUN bash ./install_cuda.sh 13.0
-RUN bash ./install_magma.sh 13.0
-RUN ln -sf /usr/local/cuda-13.0 /usr/local/cuda
-
 FROM cpu as rocm
 ARG ROCM_VERSION
 ARG PYTORCH_ROCM_ARCH
--- a/.ci/docker/manywheel/Dockerfile_2_28
+++ b/.ci/docker/manywheel/Dockerfile_2_28
@ -175,6 +175,6 @@ ENV XPU_DRIVER_TYPE ROLLING
 RUN python3 -m pip install --upgrade pip && \
    python3 -mpip install cmake==3.28.4
 ADD ./common/install_xpu.sh install_xpu.sh
-ENV XPU_VERSION 2025.2
+ENV XPU_VERSION 2025.1
 RUN bash ./install_xpu.sh && rm install_xpu.sh
 RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -67,12 +67,6 @@ case ${image} in
        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"
        MANY_LINUX_VERSION="2_28"
        ;;
-    manylinux2_28-builder:cuda13*)
-        TARGET=cuda_final
-        GPU_IMAGE=amd64/almalinux:8
-        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"
-        MANY_LINUX_VERSION="2_28"
-        ;;
    manylinuxaarch64-builder:cuda*)
        TARGET=cuda_final
        GPU_IMAGE=amd64/almalinux:8
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -263,6 +263,11 @@ scipy==1.14.1 ; python_version >= "3.12"
 #Pinned versions:
 #test that import:

+tb-nightly==2.13.0a20230426
+#Description: TensorBoard
+#Pinned versions:
+#test that import:
+
 # needed by torchgen utils
 typing-extensions>=4.10.0
 #Description: type hints for python
@ -339,7 +344,7 @@ onnx==1.18.0
 #Pinned versions:
 #test that import:

-onnxscript==0.4.0
+onnxscript==0.3.1
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
@ -379,7 +384,7 @@ dataclasses_json==0.6.7
 cmake==4.0.0
 #Description: required for building

-tlparse==0.4.0
+tlparse==0.3.30
 #Description: required for log parsing

 cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -1,7 +1,7 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@722b7e6f9ca512fcc526ad07d62b3d28c50bb6cd#egg=pytorch_sphinx_theme2

 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
--- a/.ci/docker/ubuntu-cross-riscv/Dockerfile
+++ b/.ci/docker/ubuntu-cross-riscv/Dockerfile
@ -1,155 +0,0 @@
-# Cross-compilation Docker container for RISC-V architecture
-ARG UBUNTU_VERSION
-FROM --platform=linux/amd64 ubuntu:${UBUNTU_VERSION} as base
-
-ARG UBUNTU_VERSION
-
-ENV GCC_VERSION=14
-ENV PYTHON_VERSION=3.12.3
-ENV DEBIAN_FRONTEND=noninteractive
-ENV CC=riscv64-linux-gnu-gcc-${GCC_VERSION}
-ENV CXX=riscv64-linux-gnu-g++-${GCC_VERSION}
-ENV QEMU_LD_PREFIX=/usr/riscv64-linux-gnu/
-ENV SYSROOT=/opt/sysroot
-
-# Install basic dependencies
-RUN apt-get update && apt-get install -y \
-    ninja-build \
-    autoconf \
-    automake \
-    libtool \
-    patchelf \
-    ccache \
-    git \
-    wget \
-    python3-pip \
-    python3-venv \
-    python-is-python3 \
-    cmake \
-    sudo \
-    lsb-release \
-    gcc-${GCC_VERSION}-riscv64-linux-gnu \
-    g++-${GCC_VERSION}-riscv64-linux-gnu \
-    pkg-config \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install user
-COPY ./common/install_user.sh install_user.sh
-RUN bash ./install_user.sh && rm install_user.sh
-
-FROM base as python
-ARG ZLIB_VERSION=1.3.1
-ARG FFI_VERSION=3.4.6
-ARG BZ2_VERSION=1.0.8
-ARG XZ_VERSION=5.4.6
-ARG OPENSSL_VERSION=3.2.1
-
-# Set up sysroot directory for dependencies
-ENV PKG_CONFIG_PATH=${SYSROOT}/lib/pkgconfig
-ENV PKG_CONFIG_SYSROOT_DIR=${SYSROOT}
-
-WORKDIR /opt
-
-# Build zlib (for compression)
-RUN echo "--- Building zlib ---" \
-    && wget -c https://www.zlib.net/zlib-${ZLIB_VERSION}.tar.gz \
-    && tar -xf zlib-${ZLIB_VERSION}.tar.gz --no-same-permissions --no-same-owner \
-    && cd zlib-${ZLIB_VERSION}/ \
-    && mkdir build && cd build \
-    && ../configure --prefix=${SYSROOT} \
-    && make -j$(nproc) && make install \
-    && cd ../..
-
-# Build libffi (for ctypes module)
-RUN echo "--- Building libffi ---" \
-    && wget -c https://github.com/libffi/libffi/releases/download/v${FFI_VERSION}/libffi-${FFI_VERSION}.tar.gz \
-    && tar -xf libffi-${FFI_VERSION}.tar.gz --no-same-permissions --no-same-owner \
-    && cd libffi-${FFI_VERSION}/ \
-    && mkdir build && cd build \
-    && ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \
-    && make -j$(nproc) && make install \
-    && cd ../..
-
-# Build bzip2 (for bz2 module)
-RUN echo "--- Building bzip2 ---" \
-    && wget -c https://sourceware.org/pub/bzip2/bzip2-${BZ2_VERSION}.tar.gz \
-    && tar -xf bzip2-${BZ2_VERSION}.tar.gz --no-same-permissions --no-same-owner \
-    && cd bzip2-${BZ2_VERSION}/ \
-    && make CC=riscv64-linux-gnu-gcc-${GCC_VERSION} bzip2 bzip2recover libbz2.a \
-    && make CC=riscv64-linux-gnu-gcc-${GCC_VERSION} -f Makefile-libbz2_so \
-    && make install PREFIX=${SYSROOT} \
-    && cp libbz2.so.${BZ2_VERSION} ${SYSROOT}/lib/ \
-    && cd ${SYSROOT}/lib/ \
-    && ln -sf libbz2.so.${BZ2_VERSION} libbz2.so.1.0 \
-    && ln -sf libbz2.so.1.0 libbz2.so \
-    && cd /opt/
-
-# Build xz (for lzma module)
-RUN echo "--- Building xz ---" \
-    && wget -c https://github.com/tukaani-project/xz/releases/download/v${XZ_VERSION}/xz-${XZ_VERSION}.tar.gz \
-    && tar -xf xz-${XZ_VERSION}.tar.gz --no-same-permissions --no-same-owner \
-    && cd xz-${XZ_VERSION} \
-    && mkdir build && cd build \
-    && ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \
-    && make -j$(nproc) && make install \
-    && cd ../..
-
-# Build OpenSSL (for ssl module)
-RUN echo "--- Building OpenSSL ---" \
-    && wget -c https://www.openssl.org/source/openssl-${OPENSSL_VERSION}.tar.gz \
-    && tar -xf openssl-${OPENSSL_VERSION}.tar.gz --no-same-permissions --no-same-owner \
-    && cd openssl-${OPENSSL_VERSION}/ \
-    && mkdir build && cd build \
-    && ../Configure linux64-riscv64 --prefix=${SYSROOT} \
-    && make -j$(nproc) && make install_sw \
-    && cd ../..
-
-# Build SQLite3 (for sqlite3 module)
-RUN echo "--- Building SQLite3 ---" \
-    && wget -c https://www.sqlite.org/2024/sqlite-autoconf-3450200.tar.gz \
-    && tar -xf sqlite-autoconf-3450200.tar.gz --no-same-permissions --no-same-owner \
-    && cd sqlite-autoconf-3450200 \
-    && mkdir build && cd build \
-    && ../configure --prefix=${SYSROOT} --host=riscv64-linux-gnu --build=x86_64-linux-gnu \
-    && make -j$(nproc) && make install \
-    && cd ../..
-
-# Build and install RISC-V Python with all modules
-RUN wget -c https://www.python.org/ftp/python/${PYTHON_VERSION}/Python-${PYTHON_VERSION}.tgz \
-    && tar -xf Python-${PYTHON_VERSION}.tgz --no-same-permissions --no-same-owner \
-    && cd Python-${PYTHON_VERSION} \
-    && mkdir build && cd build \
-    && ../configure \
-        --host=riscv64-linux-gnu \
-        --build=x86_64-linux-gnu \
-        --prefix=${SYSROOT} \
-        --enable-shared \
-        --disable-ipv6 \
-        --with-build-python=/usr/bin/python3 \
-        --with-ensurepip=no \
-        ac_cv_file__dev_ptmx=yes \
-        ac_cv_file__dev_ptc=no \
-    && make -j$(nproc) \
-    && make install
-
-FROM base as final
-COPY --from=python             /opt/sysroot                       /opt/sysroot
-
-# Install crossenv and cmake
-RUN pip install crossenv cmake==4.0.0 --break-system-packages \
-    && /usr/bin/python3 -m crossenv ${SYSROOT}/bin/python3 /opt/riscv-cross-env
-
-# Add pip-installed cmake binaries to PATH
-ENV PATH="/usr/local/bin:${PATH}"
-
-# Set up cross Python environment
-SHELL ["/bin/bash", "-c"]
-RUN source /opt/riscv-cross-env/bin/activate \
-    && pip install setuptools pyyaml typing_extensions wheel
-
-# Set default environment variables for PyTorch build
-ENV Python_ROOT_DIR=${SYSROOT}
-ENV OPENSSL_ROOT_DIR=${SYSROOT}
-
-USER jenkins
-CMD ["bash"]
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@ -96,11 +96,11 @@ ARG ANACONDA_PYTHON_VERSION
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
+COPY ci_commit_pins/huggingface.txt huggingface.txt
 COPY ci_commit_pins/timm.txt timm.txt
 COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt

 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
--- a/.ci/docker/ubuntu-xpu/Dockerfile
+++ b/.ci/docker/ubuntu-xpu/Dockerfile
@ -56,10 +56,10 @@ RUN rm install_openssl.sh
 ARG INDUCTOR_BENCHMARKS
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
+COPY ci_commit_pins/huggingface.txt huggingface.txt
 COPY ci_commit_pins/timm.txt timm.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

 # Install XPU Dependencies
 ARG XPU_VERSION
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -66,7 +66,6 @@ ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
 # (optional) Install UCC
 ARG UCX_COMMIT
 ARG UCC_COMMIT
-ARG CUDA_VERSION
 ENV UCX_COMMIT $UCX_COMMIT
 ENV UCC_COMMIT $UCC_COMMIT
 ENV UCX_HOME /usr
@ -97,11 +96,11 @@ RUN rm install_openssl.sh
 ARG INDUCTOR_BENCHMARKS
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
+COPY ci_commit_pins/huggingface.txt huggingface.txt
 COPY ci_commit_pins/timm.txt timm.txt
 COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt

 ARG TRITON
 ARG TRITON_CPU
@ -182,6 +181,7 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
 RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi

 # AWS specific CUDA build guidance
+ENV TORCH_CUDA_ARCH_LIST Maxwell
 ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all"
 ENV CUDA_PATH /usr/local/cuda

--- a/.ci/libtorch/build.sh
+++ b/.ci/libtorch/build.sh
@ -7,4 +7,4 @@ set -ex

 SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

-USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
+USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
--- a/.ci/lumen_cli/README.md
+++ b/.ci/lumen_cli/README.md
@ -1,31 +0,0 @@
-# 🔧 Lumen_cli
-A Python CLI tool for building and testing PyTorch-based components, using a YAML configuration file for structured, repeatable workflows.
-
-
-## Features
- **Build**
-    - external projects (e.g. vLLM)
-
-## 📦 Installation
-at the root of the pytorch repo
-```bash
-pip install -e .ci/lumen_cli
-```
-
-## Run the cli tool
-The cli tool must be used at root of pytorch repo, as example to run build external vllm:
-```bash
-python -m cli.run build external vllm
-```
-this will run the build steps with default behaviour for vllm project.
-
-to see help messages, run
-```bash
-python3 -m cli.run --help
-```
-
-## Add customized external build logics
-To add a new external build, for instance, add a new external build logics:
-1. create the build function in cli/lib folder
-2. register your target and the main build function at  EXTERNAL_BUILD_TARGET_DISPATCH in `cli/build_cli/register_build.py`
-3. [optional] create your ci config file in .github/ci_configs/${EXTERNAL_PACKAGE_NAME}.yaml
--- a/.ci/lumen_cli/cli/build_cli/register_build.py
+++ b/.ci/lumen_cli/cli/build_cli/register_build.py
@ -1,37 +0,0 @@
-import argparse
-import logging
-
-from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec
-from cli.lib.core.vllm.vllm_build import VllmBuildRunner
-
-
-logger = logging.getLogger(__name__)
-
-# Maps targets to their argparse configuration and runner
-# it adds new target to path python -m cli.run build external {target} with buildrunner
-_TARGETS: dict[str, TargetSpec] = {
-    "vllm": {
-        "runner": VllmBuildRunner,
-        "help": "Build vLLM using docker buildx.",
-    }
-    # add yours ...
-}
-
-
-def register_build_commands(subparsers: argparse._SubParsersAction) -> None:
-    build_parser = subparsers.add_parser(
-        "build",
-        help="Build related commands",
-        formatter_class=RichHelp,
-    )
-    build_subparsers = build_parser.add_subparsers(dest="build_command", required=True)
-    overview = "\n".join(
-        f"  {name:12} {spec.get('help', '')}" for name, spec in _TARGETS.items()
-    )
-    external_parser = build_subparsers.add_parser(
-        "external",
-        help="Build external targets",
-        description="Build third-party targets.\n\nAvailable targets:\n" + overview,
-        formatter_class=RichHelp,
-    )
-    register_targets(external_parser, _TARGETS)
--- a/.ci/lumen_cli/cli/lib/common/cli_helper.py
+++ b/.ci/lumen_cli/cli/lib/common/cli_helper.py
@ -1,71 +0,0 @@
-"""
-Cli Argparser Utility helpers for CLI tasks.
-
-"""
-
-import argparse
-from abc import ABC, abstractmethod
-
-
-try:
-    from typing import Any, Callable, Required, TypedDict  # Python 3.11+
-except ImportError:
-    from typing import Any, Callable, TypedDict
-
-    from typing_extensions import Required  # Fallback for Python <3.11
-
-
-class BaseRunner(ABC):
-    def __init__(self, args: Any) -> None:
-        self.args = args
-
-    @abstractmethod
-    def run(self) -> None:
-        """runs main logics, required"""
-
-
-# Pretty help: keep newlines + show defaults
-class RichHelp(
-    argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter
-):
-    pass
-
-
-class TargetSpec(TypedDict, total=False):
-    """CLI subcommand specification with bA."""
-
-    runner: Required[type[BaseRunner]]
-    help: str
-    description: str
-    add_arguments: Callable[[argparse.ArgumentParser], None]
-
-
-def register_targets(
-    parser: argparse.ArgumentParser,
-    target_specs: dict[str, TargetSpec],
-    common_args: Callable[[argparse.ArgumentParser], None] = lambda _: None,
-) -> None:
-    """Register target subcommands."""
-    targets = parser.add_subparsers(
-        dest="target",
-        required=True,
-        metavar="{" + ",".join(target_specs.keys()) + "}",
-    )
-
-    for name, spec in target_specs.items():
-        desc = spec.get("description") or spec["runner"].__doc__ or ""
-
-        p = targets.add_parser(
-            name,
-            help=spec.get("help", ""),
-            description=desc.strip(),
-            formatter_class=RichHelp,
-        )
-        p.set_defaults(
-            func=lambda args, cls=spec["runner"]: cls(args).run(),
-            _runner_class=spec["runner"],
-        )
-        if "add_arguments" in spec and callable(spec["add_arguments"]):
-            spec["add_arguments"](p)
-        if common_args:
-            common_args(p)
--- a/.ci/lumen_cli/cli/lib/common/docker_helper.py
+++ b/.ci/lumen_cli/cli/lib/common/docker_helper.py
@ -1,42 +0,0 @@
-"""
-Docker Utility helpers for CLI tasks.
-"""
-
-import logging
-from typing import Optional
-
-import docker
-from docker.errors import APIError, NotFound
-
-
-logger = logging.getLogger(__name__)
-
-# lazy singleton so we don't reconnect every call
-_docker_client: Optional[docker.DockerClient] = None
-
-
-def _get_client() -> docker.DockerClient:
-    global _docker_client
-    if _docker_client is None:
-        _docker_client = docker.from_env()
-    return _docker_client
-
-
-def local_image_exists(
-    image_name: str, client: Optional[docker.DockerClient] = None
-) -> bool:
-    """Return True if a local Docker image exists."""
-    if not image_name:
-        return False
-
-    client = client or _get_client()
-    try:
-        client.images.get(image_name)
-        return True
-    except (NotFound, APIError) as e:
-        logger.error(
-            "Error when checking Docker image '%s': %s",
-            image_name,
-            e.explanation if hasattr(e, "explanation") else str(e),
-        )
-        return False
--- a/.ci/lumen_cli/cli/lib/common/envs_helper.py
+++ b/.ci/lumen_cli/cli/lib/common/envs_helper.py
@ -1,110 +0,0 @@
-"""
-Environment Variables and Dataclasses Utility helpers for CLI tasks.
-"""
-
-import os
-from dataclasses import field, fields, is_dataclass, MISSING
-from pathlib import Path
-from textwrap import indent
-from typing import Optional, Union
-
-from cli.lib.common.utils import str2bool
-
-
-def get_env(name: str, default: str = "") -> str:
-    """Get environment variable with default fallback."""
-    return os.environ.get(name) or default
-
-
-def env_path_optional(
-    name: str,
-    default: Optional[Union[str, Path]] = None,
-    resolve: bool = True,
-) -> Optional[Path]:
-    """Get environment variable as optional Path."""
-    val = get_env(name) or default
-    if not val:
-        return None
-
-    path = Path(val)
-    return path.resolve() if resolve else path
-
-
-def env_path(
-    name: str,
-    default: Optional[Union[str, Path]] = None,
-    resolve: bool = True,
-) -> Path:
-    """Get environment variable as Path, raise if missing."""
-    path = env_path_optional(name, default, resolve)
-    if not path:
-        raise ValueError(f"Missing path value for {name}")
-    return path
-
-
-def env_bool(
-    name: str,
-    default: bool = False,
-) -> bool:
-    val = get_env(name)
-    if not val:
-        return default
-    return str2bool(val)
-
-
-def env_bool_field(
-    name: str,
-    default: bool = False,
-):
-    return field(default_factory=lambda: env_bool(name, default))
-
-
-def env_path_field(
-    name: str,
-    default: Union[str, Path] = "",
-    *,
-    resolve: bool = True,
-) -> Path:
-    return field(default_factory=lambda: env_path(name, default, resolve=resolve))
-
-
-def env_str_field(
-    name: str,
-    default: str = "",
-) -> str:
-    return field(default_factory=lambda: get_env(name, default))
-
-
-def generate_dataclass_help(cls) -> str:
-    """Auto-generate help text for dataclass fields."""
-    if not is_dataclass(cls):
-        raise TypeError(f"{cls} is not a dataclass")
-
-    def get_value(f):
-        if f.default is not MISSING:
-            return f.default
-        if f.default_factory is not MISSING:
-            try:
-                return f.default_factory()
-            except Exception as e:
-                return f"<error: {e}>"
-        return "<required>"
-
-    lines = [f"{f.name:<22} = {repr(get_value(f))}" for f in fields(cls)]
-    return indent("\n".join(lines), "    ")
-
-
-def with_params_help(params_cls: type, title: str = "Parameter defaults"):
-    """
-    Class decorator that appends a help table generated from another dataclass
-    (e.g., VllmParameters) to the decorated class's docstring.
-    """
-    if not is_dataclass(params_cls):
-        raise TypeError(f"{params_cls} must be a dataclass")
-
-    def _decorator(cls: type) -> type:
-        block = generate_dataclass_help(params_cls)
-        cls.__doc__ = (cls.__doc__ or "") + f"\n\n{title}:\n{block}"
-        return cls
-
-    return _decorator
--- a/.ci/lumen_cli/cli/lib/common/gh_summary.py
+++ b/.ci/lumen_cli/cli/lib/common/gh_summary.py
@ -1,143 +0,0 @@
-from __future__ import annotations
-
-import logging
-import os
-import textwrap
-from pathlib import Path
-from typing import TYPE_CHECKING
-
-from cli.lib.common.utils import get_wheels
-from jinja2 import Template
-
-
-if TYPE_CHECKING:
-    from collections.abc import Iterable, Mapping
-
-
-logger = logging.getLogger(__name__)
-
-_TPL_CONTENT = Template(
-    textwrap.dedent("""\
-    ## {{ title }}
-
-    ```{{ lang }}
-    {{ content }}
-    ```
-""")
-)
-
-_TPL_LIST_ITEMS = Template(
-    textwrap.dedent("""\
-    ## {{ title }}
-    {% for it in items %}
-    - {{ it.pkg }}: {{ it.relpath }}
-    {% else %}
-    _(no item found)_
-    {% endfor %}
-    """)
-)
-
-_TPL_TABLE = Template(
-    textwrap.dedent("""\
-    {%- if rows %}
-    | {{ cols | join(' | ') }} |
-    |{%- for _ in cols %} --- |{%- endfor %}
-    {%- for r in rows %}
-    | {%- for c in cols %} {{ r.get(c, "") }} |{%- endfor %}
-    {%- endfor %}
-    {%- else %}
-    _(no data)_
-    {%- endif %}
-""")
-)
-
-
-def gh_summary_path() -> Path | None:
-    """Return the Path to the GitHub step summary file, or None if not set."""
-    p = os.environ.get("GITHUB_STEP_SUMMARY")
-    return Path(p) if p else None
-
-
-def write_gh_step_summary(md: str, *, append_content: bool = True) -> bool:
-    """
-    Write Markdown content to the GitHub Step Summary file if GITHUB_STEP_SUMMARY is set.
-    append_content: default true, if True, append to the end of the file, else overwrite the whole file
-
-    Returns:
-        True if written successfully (in GitHub Actions environment),
-        False if skipped (e.g., running locally where the variable is not set).
-    """
-    sp = gh_summary_path()
-    if not sp:
-        logger.info("[gh-summary] GITHUB_STEP_SUMMARY not set, skipping write.")
-        return False
-
-    md_clean = textwrap.dedent(md).strip() + "\n"
-
-    mode = "a" if append_content else "w"
-    with sp.open(mode, encoding="utf-8") as f:
-        f.write(md_clean)
-    return True
-
-
-def md_heading(text: str, level: int = 2) -> str:
-    """Generate a Markdown heading string with the given level (1-6)."""
-    return f"{'#' * max(1, min(level, 6))} {text}\n"
-
-
-def md_details(summary: str, content: str) -> str:
-    """Generate a collapsible <details> block with a summary and inner content."""
-    return f"<details>\n<summary>{summary}</summary>\n\n{content}\n\n</details>\n"
-
-
-def summarize_content_from_file(
-    output_dir: Path,
-    freeze_file: str,
-    title: str = "Content from file",
-    code_lang: str = "",  # e.g. "text" or "ini"
-) -> bool:
-    f = Path(output_dir) / freeze_file
-    if not f.exists():
-        return False
-    content = f.read_text(encoding="utf-8").strip()
-    md = render_content(content, title=title, lang=code_lang)
-    return write_gh_step_summary(md)
-
-
-def summarize_wheels(path: Path, title: str = "Wheels", max_depth: int = 3):
-    items = get_wheels(path, max_depth=max_depth)
-    if not items:
-        return False
-    md = render_list(items, title=title)
-    return write_gh_step_summary(md)
-
-
-def md_kv_table(rows: Iterable[Mapping[str, str | int | float]]) -> str:
-    """
-    Render a list of dicts as a Markdown table using Jinja template.
-    """
-    rows = list(rows)
-    cols = list({k for r in rows for k in r.keys()})
-    md = _TPL_TABLE.render(cols=cols, rows=rows).strip() + "\n"
-    return md
-
-
-def render_list(
-    items: Iterable[str],
-    *,
-    title: str = "List",
-) -> str:
-    tpl = _TPL_LIST_ITEMS
-    md = tpl.render(title=title, items=items)
-    return md
-
-
-def render_content(
-    content: str,
-    *,
-    title: str = "Content",
-    lang: str = "text",
-) -> str:
-    tpl = _TPL_CONTENT
-    md = tpl.render(title=title, content=content, lang=lang)
-    return md
--- a/.ci/lumen_cli/cli/lib/common/git_helper.py
+++ b/.ci/lumen_cli/cli/lib/common/git_helper.py
@ -1,69 +0,0 @@
-"""
-Git Utility helpers for CLI tasks.
-"""
-
-import logging
-from pathlib import Path
-
-from cli.lib.common.path_helper import remove_dir
-from git import GitCommandError, RemoteProgress, Repo
-
-
-logger = logging.getLogger(__name__)
-
-
-class PrintProgress(RemoteProgress):
-    """Simple progress logger for git operations."""
-
-    def __init__(self, interval: int = 5):
-        super().__init__()
-        self._last_percent = -1
-        self._interval = interval
-
-    def update(self, op_code, cur, max=None, message=""):
-        msg = self._cur_line or message
-        if max and cur:
-            percent = int(cur / max * 100)
-            if percent != self._last_percent and percent % self._interval == 0:
-                self._last_percent = percent
-                logger.info("Progress: %d%% - %s", percent, msg)
-        elif msg:
-            logger.info(msg)
-
-
-def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules=False):
-    """Clone repository with pinned commit and optional submodules."""
-    dst = dst or target
-
-    try:
-        logger.info("Cloning %s to %s", target, dst)
-
-        # Clone and fetch
-        remove_dir(dst)
-        r = Repo.clone_from(repo, dst, progress=PrintProgress())
-        r.git.fetch("--all", "--tags")
-
-        # Checkout pinned commit
-        commit = get_post_build_pinned_commit(target)
-        logger.info("Checking out pinned %s commit %s", target, commit)
-        r.git.checkout(commit)
-
-        # Update submodules if requested
-        if update_submodules and r.submodules:
-            logger.info("Updating %d submodule(s)", len(r.submodules))
-            for sm in r.submodules:
-                sm.update(init=True, recursive=True, progress=PrintProgress())
-
-        logger.info("Successfully cloned %s", target)
-        return r, commit
-
-    except GitCommandError as e:
-        logger.error("Git operation failed: %s", e)
-        raise
-
-
-def get_post_build_pinned_commit(name: str, prefix=".github/ci_commit_pins") -> str:
-    path = Path(prefix) / f"{name}.txt"
-    if not path.exists():
-        raise FileNotFoundError(f"Pin file not found: {path}")
-    return path.read_text(encoding="utf-8").strip()
--- a/.ci/lumen_cli/cli/lib/common/logger.py
+++ b/.ci/lumen_cli/cli/lib/common/logger.py
@ -1,14 +0,0 @@
-"""
-Logger Utility helpers for CLI tasks.
-"""
-
-import logging
-import sys
-
-
-def setup_logging(level: int = logging.INFO):
-    logging.basicConfig(
-        level=level,
-        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
-        stream=sys.stdout,
-    )
--- a/.ci/lumen_cli/cli/lib/common/path_helper.py
+++ b/.ci/lumen_cli/cli/lib/common/path_helper.py
@ -1,62 +0,0 @@
-"""Path utility helpers for CLI tasks."""
-
-import logging
-import shutil
-from pathlib import Path
-from typing import Union
-
-
-logger = logging.getLogger(__name__)
-
-
-def get_path(path: Union[str, Path], resolve: bool = False) -> Path:
-    """Convert to Path object, optionally resolving to absolute path."""
-    if not path:
-        raise ValueError("Path cannot be None or empty")
-    result = Path(path)
-    return result.resolve() if resolve else result
-
-
-def ensure_dir_exists(path: Union[str, Path]) -> Path:
-    """Create directory if it doesn't exist."""
-    path_obj = get_path(path)
-    path_obj.mkdir(parents=True, exist_ok=True)
-    return path_obj
-
-
-def remove_dir(path: Union[str, Path, None]) -> None:
-    """Remove directory if it exists."""
-    if not path:
-        return
-    path_obj = get_path(path)
-    if path_obj.exists():
-        shutil.rmtree(path_obj)
-
-
-def force_create_dir(path: Union[str, Path]) -> Path:
-    """Remove directory if exists, then create fresh empty directory."""
-    remove_dir(path)
-    return ensure_dir_exists(path)
-
-
-def copy(src: Union[str, Path], dst: Union[str, Path]) -> None:
-    """Copy file or directory from src to dst."""
-    src_path = get_path(src, resolve=True)
-    dst_path = get_path(dst, resolve=True)
-
-    if not src_path.exists():
-        raise FileNotFoundError(f"Source does not exist: {src_path}")
-
-    dst_path.parent.mkdir(parents=True, exist_ok=True)
-
-    if src_path.is_file():
-        shutil.copy2(src_path, dst_path)
-    elif src_path.is_dir():
-        shutil.copytree(src_path, dst_path, dirs_exist_ok=True)
-    else:
-        raise ValueError(f"Unsupported path type: {src_path}")
-
-
-def is_path_exist(path: Union[str, Path, None]) -> bool:
-    """Check if path exists."""
-    return bool(path and get_path(path).exists())
--- a/.ci/lumen_cli/cli/lib/common/pip_helper.py
+++ b/.ci/lumen_cli/cli/lib/common/pip_helper.py
@ -1,71 +0,0 @@
-import glob
-import logging
-import shlex
-import shutil
-import sys
-from collections.abc import Iterable
-from importlib.metadata import PackageNotFoundError, version  # noqa: UP035
-from typing import Optional, Union
-
-from cli.lib.common.utils import run_command
-
-
-logger = logging.getLogger(__name__)
-
-
-def pip_install_packages(
-    packages: Iterable[str] = (),
-    env=None,
-    *,
-    requirements: Optional[str] = None,
-    constraints: Optional[str] = None,
-    prefer_uv: bool = False,
-) -> None:
-    use_uv = prefer_uv and shutil.which("uv") is not None
-    base = (
-        [sys.executable, "-m", "uv", "pip", "install"]
-        if use_uv
-        else [sys.executable, "-m", "pip", "install"]
-    )
-    cmd = base[:]
-    if requirements:
-        cmd += ["-r", requirements]
-    if constraints:
-        cmd += ["-c", constraints]
-    cmd += list(packages)
-    logger.info("pip installing packages: %s", " ".join(map(shlex.quote, cmd)))
-    run_command(" ".join(map(shlex.quote, cmd)), env=env)
-
-
-def pip_install_first_match(pattern: str, extras: Optional[str] = None, pref_uv=False):
-    wheel = first_matching_pkg(pattern)
-    target = f"{wheel}[{extras}]" if extras else wheel
-    logger.info("Installing %s...", target)
-    pip_install_packages([target], prefer_uv=pref_uv)
-
-
-def run_python(args: Union[str, list[str]], env=None):
-    """
-    Run the python in the current environment.
-    """
-    if isinstance(args, str):
-        args = shlex.split(args)
-    cmd = [sys.executable] + args
-    run_command(" ".join(map(shlex.quote, cmd)), env=env)
-
-
-def pkg_exists(name: str) -> bool:
-    try:
-        pkg_version = version(name)
-        logger.info("%s already exist with version: %s", name, pkg_version)
-        return True
-    except PackageNotFoundError:
-        logger.info("%s is not installed", name)
-        return False
-
-
-def first_matching_pkg(pattern: str) -> str:
-    matches = sorted(glob.glob(pattern))
-    if not matches:
-        raise FileNotFoundError(f"No wheel matching: {pattern}")
-    return matches[0]
--- a/.ci/lumen_cli/cli/lib/common/utils.py
+++ b/.ci/lumen_cli/cli/lib/common/utils.py
@ -1,139 +0,0 @@
-"""
-General Utility helpers for CLI tasks.
-"""
-
-import logging
-import os
-import shlex
-import subprocess
-import sys
-from contextlib import contextmanager
-from pathlib import Path
-from typing import Optional
-
-
-logger = logging.getLogger(__name__)
-
-
-def run_command(
-    cmd: str,
-    use_shell: bool = False,
-    log_cmd: bool = True,
-    cwd: Optional[str] = None,
-    env: Optional[dict] = None,
-    check: bool = True,
-) -> int:
-    """Run a command with optional shell execution."""
-    if use_shell:
-        args = cmd
-        log_prefix = "[shell]"
-        executable = "/bin/bash"
-    else:
-        args = shlex.split(cmd)
-        log_prefix = "[cmd]"
-        executable = None
-
-    if log_cmd:
-        display_cmd = cmd if use_shell else " ".join(args)
-        logger.info("%s %s", log_prefix, display_cmd)
-
-    run_env = {**os.environ, **(env or {})}
-
-    proc = subprocess.run(
-        args,
-        shell=use_shell,
-        executable=executable,
-        stdout=sys.stdout,
-        stderr=sys.stderr,
-        cwd=cwd,
-        env=run_env,
-        check=False,
-    )
-
-    if check and proc.returncode != 0:
-        logger.error(
-            "%s Command failed (exit %s): %s", log_prefix, proc.returncode, cmd
-        )
-        raise subprocess.CalledProcessError(
-            proc.returncode, args if not use_shell else cmd
-        )
-
-    return proc.returncode
-
-
-def str2bool(value: Optional[str]) -> bool:
-    """Convert environment variables to boolean values."""
-    if not value:
-        return False
-    if not isinstance(value, str):
-        raise ValueError(
-            f"Expected a string value for boolean conversion, got {type(value)}"
-        )
-    value = value.strip().lower()
-
-    true_value_set = {"1", "true", "t", "yes", "y", "on", "enable", "enabled", "found"}
-    false_value_set = {"0", "false", "f", "no", "n", "off", "disable"}
-
-    if value in true_value_set:
-        return True
-    if value in false_value_set:
-        return False
-    raise ValueError(f"Invalid string value for boolean conversion: {value}")
-
-
-@contextmanager
-def temp_environ(updates: dict[str, str]):
-    """
-    Temporarily set environment variables and restore them after the block.
-    Args:
-        updates: Dict of environment variables to set.
-    """
-    missing = object()
-    old: dict[str, str | object] = {k: os.environ.get(k, missing) for k in updates}
-    try:
-        os.environ.update(updates)
-        yield
-    finally:
-        for k, v in old.items():
-            if v is missing:
-                os.environ.pop(k, None)
-            else:
-                os.environ[k] = v  # type: ignore[arg-type]
-
-
-@contextmanager
-def working_directory(path: str):
-    """
-    Temporarily change the working directory inside a context.
-    """
-    if not path:
-        # No-op context
-        yield
-        return
-    prev_cwd = os.getcwd()
-    try:
-        os.chdir(path)
-        yield
-    finally:
-        os.chdir(prev_cwd)
-
-
-def get_wheels(
-    output_dir: Path,
-    max_depth: Optional[int] = None,
-) -> list[str]:
-    """Return a list of wheels found in the given output directory."""
-    root = Path(output_dir)
-    if not root.exists():
-        return []
-    items = []
-    for dirpath, _, filenames in os.walk(root):
-        depth = Path(dirpath).relative_to(root).parts
-        if max_depth is not None and len(depth) > max_depth:
-            continue
-        for fname in sorted(filenames):
-            if fname.endswith(".whl"):
-                pkg = fname.split("-")[0]
-                relpath = str((Path(dirpath) / fname).relative_to(root))
-                items.append({"pkg": pkg, "relpath": relpath})
-    return items
--- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py
@ -1,256 +0,0 @@
-import logging
-import os
-import textwrap
-from typing import Any
-
-from cli.lib.common.gh_summary import write_gh_step_summary
-from cli.lib.common.git_helper import clone_external_repo
-from cli.lib.common.pip_helper import pip_install_packages
-from cli.lib.common.utils import run_command, temp_environ, working_directory
-from jinja2 import Template
-
-
-logger = logging.getLogger(__name__)
-
-_TPL_VLLM_INFO = Template(
-    textwrap.dedent("""\
-    ##  Vllm against Pytorch CI Test Summary
-    **Vllm Commit**: [{{ vllm_commit }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})
-    {%- if torch_sha %}
-    **Pytorch Commit**: [{{ torch_sha }}](https://github.com/pytorch/pytorch/commit/{{ torch_sha }})
-    {%- endif %}
-""")
-)
-
-
-def sample_vllm_test_library():
-    """
-    Simple sample to unblock the vllm ci development, which is mimic to
-    https://github.com/vllm-project/vllm/blob/main/.buildkite/test-pipeline.yaml
-    see run_test_plan for more details
-    """
-    # TODO(elainewy): Read from yaml file to handle the env and tests for vllm
-    return {
-        "vllm_basic_correctness_test": {
-            "title": "Basic Correctness Test",
-            "id": "vllm_basic_correctness_test",
-            "env_vars": {
-                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
-            },
-            "steps": [
-                "pytest -v -s basic_correctness/test_cumem.py",
-                "pytest -v -s basic_correctness/test_basic_correctness.py",
-                "pytest -v -s basic_correctness/test_cpu_offload.py",
-                "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
-            ],
-        },
-        "vllm_basic_models_test": {
-            "title": "Basic models test",
-            "id": "vllm_basic_models_test",
-            "steps": [
-                "pytest -v -s models/test_transformers.py",
-                "pytest -v -s models/test_registry.py",
-                "pytest -v -s models/test_utils.py",
-                "pytest -v -s models/test_vision.py",
-                "pytest -v -s models/test_initialization.py",
-            ],
-        },
-        "vllm_entrypoints_test": {
-            "title": "Entrypoints Test ",
-            "id": "vllm_entrypoints_test",
-            "env_vars": {
-                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
-            },
-            "steps": [
-                " ".join(
-                    [
-                        "pytest",
-                        "-v",
-                        "-s",
-                        "entrypoints/llm",
-                        "--ignore=entrypoints/llm/test_lazy_outlines.py",
-                        "--ignore=entrypoints/llm/test_generate.py",
-                        "--ignore=entrypoints/llm/test_generate_multiple_loras.py",
-                        "--ignore=entrypoints/llm/test_collective_rpc.py",
-                    ]
-                ),
-                "pytest -v -s entrypoints/llm/test_lazy_outlines.py",
-                "pytest -v -s entrypoints/llm/test_generate.py ",
-                "pytest -v -s entrypoints/llm/test_generate_multiple_loras.py",
-                "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
-            ],
-        },
-        "vllm_regression_test": {
-            "title": "Regression Test",
-            "id": "vllm_regression_test",
-            "package_install": ["modelscope"],
-            "steps": [
-                "pytest -v -s test_regression.py",
-            ],
-        },
-        "vllm_lora_tp_test_distributed": {
-            "title": "LoRA TP Test (Distributed)",
-            "id": "vllm_lora_tp_test_distributed",
-            "env_vars": {
-                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
-            },
-            "num_gpus": 4,
-            "steps": [
-                "pytest -v -s -x lora/test_chatglm3_tp.py",
-                "echo $VLLM_WORKER_MULTIPROC_METHOD",
-                "pytest -v -s -x lora/test_llama_tp.py",
-                "pytest -v -s -x lora/test_multi_loras_with_tp.py",
-            ],
-        },
-        "vllm_lora_280_failure_test": {
-            "title": "LoRA 280 failure test",
-            "id": "vllm_lora_280_failure_test",
-            "steps": ["pytest -v lora/test_quant_model.py"],
-        },
-        "vllm_multi_model_processor_test": {
-            "title": "Multi-Modal Processor Test",
-            "id": "vllm_multi_model_processor_test",
-            "package_install": ["git+https://github.com/TIGER-AI-Lab/Mantis.git"],
-            "steps": [
-                "pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py",
-            ],
-        },
-        "vllm_pytorch_compilation_unit_tests": {
-            "title": "PyTorch Compilation Unit Tests",
-            "id": "vllm_pytorch_compilation_unit_tests",
-            "steps": [
-                "pytest -v -s compile/test_pass_manager.py",
-                "pytest -v -s compile/test_fusion.py",
-                "pytest -v -s compile/test_fusion_attn.py",
-                "pytest -v -s compile/test_silu_mul_quant_fusion.py",
-                "pytest -v -s compile/test_sequence_parallelism.py",
-                "pytest -v -s compile/test_async_tp.py",
-                "pytest -v -s compile/test_fusion_all_reduce.py",
-                "pytest -v -s compile/test_decorator.py",
-            ],
-        },
-        # TODO(elainewy):need to add g6 with 4 gpus to run this test
-        "vllm_lora_test": {
-            "title": "LoRA Test %N",
-            "id": "lora_test",
-            "parallelism": 4,
-            "steps": [
-                "echo '[checking] list sharded lora tests:'",
-                " ".join(
-                    [
-                        "pytest -q --collect-only lora",
-                        "--shard-id=$$BUILDKITE_PARALLEL_JOB",
-                        "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT",
-                        "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py",
-                    ]
-                ),
-                "echo '[checking] Done. list lora tests'",
-                " ".join(
-                    [
-                        "pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB",
-                        "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT",
-                        "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py",
-                    ]
-                ),
-            ],
-        },
-    }
-
-
-def check_parallelism(tests: Any, title: str, shard_id: int = 0, num_shards: int = 0):
-    """
-    a method to check if the test plan is parallelism or not.
-    """
-    parallelism = int(tests.get("parallelism", "0"))
-    is_parallel = parallelism and parallelism > 1
-
-    if not is_parallel:
-        return False
-
-    if shard_id > num_shards:
-        raise RuntimeError(
-            f"Test {title} expects {num_shards} shards, but invalid {shard_id} is provided"
-        )
-
-    if num_shards != parallelism:
-        raise RuntimeError(
-            f"Test {title} expects {parallelism} shards, but invalid {num_shards} is provided"
-        )
-
-    return True
-
-
-def run_test_plan(
-    test_plan: str,
-    test_target: str,
-    tests_map: dict[str, Any],
-    shard_id: int = 0,
-    num_shards: int = 0,
-):
-    """
-    a method to run list of tests based on the test plan.
-    """
-    logger.info("run %s tests.....", test_target)
-    if test_plan not in tests_map:
-        raise RuntimeError(
-            f"test {test_plan} not found, please add it to test plan pool"
-        )
-    tests = tests_map[test_plan]
-    pkgs = tests.get("package_install", [])
-    title = tests.get("title", "unknown test")
-
-    is_parallel = check_parallelism(tests, title, shard_id, num_shards)
-    if is_parallel:
-        title = title.replace("%N", f"{shard_id}/{num_shards}")
-
-    logger.info("Running tests: %s", title)
-    if pkgs:
-        logger.info("Installing packages: %s", pkgs)
-        pip_install_packages(packages=pkgs, prefer_uv=True)
-    with (
-        working_directory(tests.get("working_directory", "tests")),
-        temp_environ(tests.get("env_vars", {})),
-    ):
-        failures = []
-        for step in tests["steps"]:
-            logger.info("Running step: %s", step)
-            if is_parallel:
-                step = replace_buildkite_placeholders(step, shard_id, num_shards)
-                logger.info("Running parallel step: %s", step)
-            code = run_command(cmd=step, check=False, use_shell=True)
-            if code != 0:
-                failures.append(step)
-            logger.info("Finish running step: %s", step)
-        if failures:
-            logger.error("Failed tests: %s", failures)
-            raise RuntimeError(f"{len(failures)} pytest runs failed: {failures}")
-        logger.info("Done. All tests passed")
-
-
-def clone_vllm(dst: str = "vllm"):
-    _, commit = clone_external_repo(
-        target="vllm",
-        repo="https://github.com/vllm-project/vllm.git",
-        dst=dst,
-        update_submodules=True,
-    )
-    return commit
-
-
-def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str:
-    mapping = {
-        "$$BUILDKITE_PARALLEL_JOB_COUNT": str(num_shards),
-        "$$BUILDKITE_PARALLEL_JOB": str(shard_id),
-    }
-    for k in sorted(mapping, key=len, reverse=True):
-        step = step.replace(k, mapping[k])
-    return step
-
-
-def summarize_build_info(vllm_commit: str) -> bool:
-    torch_sha = os.getenv("GITHUB_SHA")
-    md = (
-        _TPL_VLLM_INFO.render(vllm_commit=vllm_commit, torch_sha=torch_sha).strip()
-        + "\n"
-    )
-    return write_gh_step_summary(md)
--- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py
@ -1,285 +0,0 @@
-import logging
-import os
-import textwrap
-from dataclasses import dataclass
-from pathlib import Path
-from typing import Optional
-
-from cli.lib.common.cli_helper import BaseRunner
-from cli.lib.common.docker_helper import local_image_exists
-from cli.lib.common.envs_helper import (
-    env_bool_field,
-    env_path_field,
-    env_str_field,
-    with_params_help,
-)
-from cli.lib.common.gh_summary import (
-    gh_summary_path,
-    summarize_content_from_file,
-    summarize_wheels,
-)
-from cli.lib.common.path_helper import (
-    copy,
-    ensure_dir_exists,
-    force_create_dir,
-    get_path,
-    is_path_exist,
-)
-from cli.lib.common.utils import run_command
-from cli.lib.core.vllm.lib import clone_vllm, summarize_build_info
-
-
-logger = logging.getLogger(__name__)
-
-
-# Default path for docker build artifacts
-_DEFAULT_RESULT_PATH = "./shared"
-
-# Temp folder in vllm work place to cp torch whls in vllm work directory for docker build
-_VLLM_TEMP_FOLDER = "tmp"
-
-
-@dataclass
-class VllmBuildParameters:
-    """
-    Parameters defining the vllm external input configurations.
-    Combine with VllmDockerBuildArgs to define the vllm build environment
-    """
-
-    # USE_TORCH_WHEEL: when true, use local Torch wheels; requires TORCH_WHEELS_PATH.
-    # Otherwise docker build pull torch nightly during build
-    # TORCH_WHEELS_PATH: directory containing local torch wheels when use_torch_whl is True
-    use_torch_whl: bool = env_bool_field("USE_TORCH_WHEEL", True)
-    torch_whls_path: Path = env_path_field("TORCH_WHEELS_PATH", "./dist")
-
-    # USE_LOCAL_BASE_IMAGE: when true, use an existing local Docker base image; requires BASE_IMAGE
-    # Otherwise, pull dockerfile's default image remotely
-    # BASE_IMAGE: name:tag (only needed when use_local_base_image is True)
-    use_local_base_image: bool = env_bool_field("USE_LOCAL_BASE_IMAGE", True)
-    base_image: str = env_str_field("BASE_IMAGE")
-
-    # USE_LOCAL_DOCKERFILE: when true("1"), use a local Dockerfile; requires DOCKERFILE_PATH.
-    # otherwise, use vllm's default dockerfile.torch_nightly for build
-    # DOCKERFILE_PATH: path to Dockerfile used when use_local_dockerfile is True"
-    use_local_dockerfile: bool = env_bool_field("USE_LOCAL_DOCKERFILE", True)
-    dockerfile_path: Path = env_path_field(
-        "DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
-    )
-
-    # OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
-    output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")
-
-    # --- Build args ----------------------------------------------------------
-    target_stage: str = env_str_field("TARGET_STAGE", "export-wheels")
-
-    tag_name: str = env_str_field("TAG", "vllm-wheels")
-
-    cuda_version: str = env_str_field("CUDA_VERSION", "12.8.1")
-
-    python_version: str = env_str_field("PYTHON_VERSION", "3.12")
-
-    max_jobs: str = env_str_field("MAX_JOBS", "64")
-
-    sccache_bucket: str = env_str_field("SCCACHE_BUCKET")
-
-    sccache_region: str = env_str_field("SCCACHE_REGION")
-
-    torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")
-
-    def __post_init__(self):
-        checks = [
-            (
-                self.use_torch_whl,  # flag
-                True,  # trigger_value
-                "torch_whls_path",  # resource
-                is_path_exist,  # check_func
-                "TORCH_WHEELS_PATH is not provided, but USE_TORCH_WHEEL is set to 1",
-            ),
-            (
-                self.use_local_base_image,
-                True,
-                "base_image",
-                local_image_exists,
-                f"BASE_IMAGE {self.base_image} does not found, but USE_LOCAL_BASE_IMAGE is set to 1",
-            ),
-            (
-                self.use_local_dockerfile,
-                True,
-                "dockerfile_path",
-                is_path_exist,
-                " DOCKERFILE_PATH path does not found, but USE_LOCAL_DOCKERFILE is set to 1",
-            ),
-        ]
-        for flag, trigger_value, attr_name, check_func, error_msg in checks:
-            value = getattr(self, attr_name)
-            if flag == trigger_value:
-                if not value or not check_func(value):
-                    raise ValueError(error_msg)
-            else:
-                logger.info("flag  %s is not set", flag)
-        if not self.output_dir:
-            raise ValueError("missing required output_dir")
-
-
-@with_params_help(VllmBuildParameters)
-class VllmBuildRunner(BaseRunner):
-    """
-    Build vLLM using docker buildx.
-
-    Environment variable options:
-        "USE_TORCH_WHEEL":      "1: use local wheels; 0: pull nightly from pypi",
-        "TORCH_WHEELS_PATH":    "Path to local wheels (when USE_TORCH_WHEEL=1)",
-
-        "USE_LOCAL_BASE_IMAGE": "1: use local base image; 0: default image",
-         "BASE_IMAGE":           "name:tag to indicate base image the dockerfile depends on (when USE_LOCAL_BASE_IMAGE=1)",
-
-        "USE_LOCAL_DOCKERFILE": "1: use local Dockerfile; 0: vllm repo default dockerfile.torch_nightly",
-        "DOCKERFILE_PATH":      "Path to Dockerfile (when USE_LOCAL_DOCKERFILE=1)",
-
-        "OUTPUT_DIR":           "e.g. './shared'",
-
-        "TORCH_CUDA_ARCH_LIST": "e.g. '8.0' or '8.0;9.0'",
-        "CUDA_VERSION":         "e.g. '12.8.1'",
-        "PYTHON_VERSION":       "e.g. '3.12'",
-        "MAX_JOBS":             "e.g. '64'",
-        "SCCACHE_BUCKET":       "e.g. 'my-bucket'",
-        "SCCACHE_REGION":       "e.g. 'us-west-2'",
-    """
-
-    def __init__(self, args=None):
-        self.work_directory = "vllm"
-
-    def run(self):
-        """
-        main function to run vllm build
-        1. prepare vllm build environment
-        2. prepare the docker build command args
-        3. run docker build
-        """
-        inputs = VllmBuildParameters()
-        logger.info("Running vllm build with inputs: %s", inputs)
-        vllm_commit = clone_vllm()
-
-        self.cp_dockerfile_if_exist(inputs)
-        # cp torch wheels from root direct to vllm workspace if exist
-        self.cp_torch_whls_if_exist(inputs)
-
-        # make sure the output dir to store the build artifacts exist
-        ensure_dir_exists(Path(inputs.output_dir))
-
-        cmd = self._generate_docker_build_cmd(inputs)
-        logger.info("Running docker build: \n %s", cmd)
-
-        try:
-            run_command(cmd, cwd="vllm", env=os.environ.copy())
-        finally:
-            self.genearte_vllm_build_summary(vllm_commit, inputs)
-
-    def genearte_vllm_build_summary(
-        self, vllm_commit: str, inputs: VllmBuildParameters
-    ):
-        if not gh_summary_path():
-            return logger.info("Skipping, not detect GH Summary env var....")
-        logger.info("Generate GH Summary ...")
-        # summarize vllm build info
-        summarize_build_info(vllm_commit)
-
-        # summarize vllm build artifacts
-        vllm_artifact_dir = inputs.output_dir / "wheels"
-        summarize_content_from_file(
-            vllm_artifact_dir,
-            "build_summary.txt",
-            title="Vllm build env pip package summary",
-        )
-        summarize_wheels(
-            inputs.torch_whls_path, max_depth=3, title="Torch Wheels Artifacts"
-        )
-        summarize_wheels(vllm_artifact_dir, max_depth=3, title="Vllm Wheels Artifacts")
-
-    def cp_torch_whls_if_exist(self, inputs: VllmBuildParameters) -> str:
-        if not inputs.use_torch_whl:
-            return ""
-        tmp_dir = f"./{self.work_directory}/{_VLLM_TEMP_FOLDER}"
-        tmp_path = Path(tmp_dir)
-        force_create_dir(tmp_path)
-        copy(inputs.torch_whls_path, tmp_dir)
-        return tmp_dir
-
-    def cp_dockerfile_if_exist(self, inputs: VllmBuildParameters):
-        if not inputs.use_local_dockerfile:
-            logger.info("using vllm default dockerfile.torch_nightly for build")
-            return
-        dockerfile_path = get_path(inputs.dockerfile_path, resolve=True)
-        vllm_torch_dockerfile = Path(
-            f"./{self.work_directory}/docker/Dockerfile.nightly_torch"
-        )
-        copy(dockerfile_path, vllm_torch_dockerfile)
-
-    def get_result_path(self, path):
-        """
-        Get the absolute path of the result path
-        """
-        if not path:
-            path = _DEFAULT_RESULT_PATH
-        abs_path = get_path(path, resolve=True)
-        return abs_path
-
-    def _get_torch_wheel_path_arg(self, torch_whl_dir: Optional[Path]) -> str:
-        if not torch_whl_dir:
-            return ""
-        return f"--build-arg TORCH_WHEELS_PATH={_VLLM_TEMP_FOLDER}"
-
-    def _get_base_image_args(self, inputs: VllmBuildParameters) -> tuple[str, str, str]:
-        """
-        Returns:
-            - base_image_arg: docker buildx arg string for base image
-            - final_base_image_arg:  docker buildx arg string for vllm-base stage
-            - pull_flag: --pull=true or --pull=false depending on whether the image exists locally
-        """
-        if not inputs.use_local_base_image:
-            return "", "", ""
-
-        base_image = inputs.base_image
-
-        # set both base image and final base image to the same local image
-        base_image_arg = f"--build-arg BUILD_BASE_IMAGE={base_image}"
-        final_base_image_arg = f"--build-arg FINAL_BASE_IMAGE={base_image}"
-
-        if local_image_exists(base_image):
-            pull_flag = "--pull=false"
-            return base_image_arg, final_base_image_arg, pull_flag
-        logger.info(
-            "[INFO] Local image not found:%s will try to pull from remote", {base_image}
-        )
-        return base_image_arg, final_base_image_arg, ""
-
-    def _generate_docker_build_cmd(
-        self,
-        inputs: VllmBuildParameters,
-    ) -> str:
-        base_image_arg, final_base_image_arg, pull_flag = self._get_base_image_args(
-            inputs
-        )
-        torch_arg = self._get_torch_wheel_path_arg(inputs.torch_whls_path)
-
-        return textwrap.dedent(
-            f"""
-            docker buildx build \
-                --output type=local,dest={inputs.output_dir} \
-                -f docker/Dockerfile.nightly_torch \
-                {pull_flag} \
-                {torch_arg} \
-                {base_image_arg} \
-                {final_base_image_arg} \
-                --build-arg max_jobs={inputs.max_jobs} \
-                --build-arg CUDA_VERSION={inputs.cuda_version} \
-                --build-arg PYTHON_VERSION={inputs.python_version} \
-                --build-arg USE_SCCACHE={int(bool(inputs.sccache_bucket and inputs.sccache_region))} \
-                --build-arg SCCACHE_BUCKET_NAME={inputs.sccache_bucket} \
-                --build-arg SCCACHE_REGION_NAME={inputs.sccache_region} \
-                --build-arg torch_cuda_arch_list='{inputs.torch_cuda_arch_list}' \
-                --target {inputs.target_stage} \
-                -t {inputs.tag_name} \
-                --progress=plain .
-        """
-        ).strip()
--- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py
@ -1,263 +0,0 @@
-import logging
-import os
-import re
-import subprocess
-import sys
-from collections.abc import Iterable
-from dataclasses import dataclass
-from enum import Enum
-from pathlib import Path
-from typing import Any
-
-from cli.lib.common.cli_helper import BaseRunner
-from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env
-from cli.lib.common.path_helper import copy, remove_dir
-from cli.lib.common.pip_helper import (
-    pip_install_first_match,
-    pip_install_packages,
-    pkg_exists,
-    run_python,
-)
-from cli.lib.common.utils import run_command, working_directory
-from cli.lib.core.vllm.lib import clone_vllm, run_test_plan, sample_vllm_test_library
-
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class VllmTestParameters:
-    """
-    Parameters defining the vllm external test input
-
-    !!!DO NOT ADD SECRETS IN THIS CLASS!!!
-    you can put environment variable name in VllmTestParameters if it's not the same as the secret one
-    fetch secrests directly from env variables during runtime
-    """
-
-    torch_whls_path: Path = env_path_field("WHEELS_PATH", "./dist")
-
-    vllm_whls_path: Path = env_path_field(
-        "VLLM_WHEELS_PATH", "./dist/external/vllm/wheels"
-    )
-
-    torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")
-
-    def __post_init__(self):
-        if not self.torch_whls_path.exists():
-            raise ValueError("missing torch_whls_path")
-        if not self.vllm_whls_path.exists():
-            raise ValueError("missing vllm_whls_path")
-
-
-class TestInpuType(Enum):
-    TEST_PLAN = "test_plan"
-    UNKNOWN = "unknown"
-
-
-class VllmTestRunner(BaseRunner):
-    def __init__(self, args: Any):
-        self.work_directory = "vllm"
-        self.test_plan = ""
-        self.test_type = TestInpuType.UNKNOWN
-
-        self.shard_id = args.shard_id
-        self.num_shards = args.num_shards
-
-        if args.test_plan:
-            self.test_plan = args.test_plan
-            self.test_type = TestInpuType.TEST_PLAN
-
-        # Matches the structeur in the artifacts.zip from torcb build
-        self.TORCH_WHL_PATH_REGEX = "torch*.whl"
-        self.TORCH_WHL_EXTRA = "opt-einsum"
-        self.TORCH_ADDITIONAL_WHLS_REGEX = [
-            "vision/torchvision*.whl",
-            "audio/torchaudio*.whl",
-        ]
-
-        # Match the structure of the artifacts.zip from vllm external build
-        self.VLLM_TEST_WHLS_REGEX = [
-            "xformers/*.whl",
-            "vllm/vllm*.whl",
-            "flashinfer-python/flashinfer*.whl",
-        ]
-
-    def prepare(self):
-        """
-        prepare test environment for vllm. This includes clone vllm repo, install all wheels, test dependencies and set env
-        """
-        params = VllmTestParameters()
-        logger.info("Display VllmTestParameters %s", params)
-        self._set_envs(params)
-
-        clone_vllm(dst=self.work_directory)
-        with working_directory(self.work_directory):
-            remove_dir(Path("vllm"))
-            self._install_wheels(params)
-            self._install_dependencies()
-        # verify the torches are not overridden by test dependencies
-        check_versions()
-
-    def run(self):
-        """
-        main function to run vllm test
-        """
-        self.prepare()
-        with working_directory(self.work_directory):
-            if self.test_type == TestInpuType.TEST_PLAN:
-                if self.num_shards > 1:
-                    run_test_plan(
-                        self.test_plan,
-                        "vllm",
-                        sample_vllm_test_library(),
-                        self.shard_id,
-                        self.num_shards,
-                    )
-                else:
-                    run_test_plan(self.test_plan, "vllm", sample_vllm_test_library())
-            else:
-                raise ValueError(f"Unknown test type {self.test_type}")
-
-    def _install_wheels(self, params: VllmTestParameters):
-        logger.info("Running vllm test with inputs: %s", params)
-        if not pkg_exists("torch"):
-            # install torch from local whls if it's not installed yet.
-            torch_p = f"{str(params.torch_whls_path)}/{self.TORCH_WHL_PATH_REGEX}"
-            pip_install_first_match(torch_p, self.TORCH_WHL_EXTRA)
-
-        torch_whls_path = [
-            f"{str(params.torch_whls_path)}/{whl_path}"
-            for whl_path in self.TORCH_ADDITIONAL_WHLS_REGEX
-        ]
-        for torch_whl in torch_whls_path:
-            pip_install_first_match(torch_whl)
-        logger.info("Done. Installed torch and other torch-related wheels ")
-
-        logger.info("Installing vllm wheels")
-        vllm_whls_path = [
-            f"{str(params.vllm_whls_path)}/{whl_path}"
-            for whl_path in self.VLLM_TEST_WHLS_REGEX
-        ]
-        for vllm_whl in vllm_whls_path:
-            pip_install_first_match(vllm_whl)
-        logger.info("Done. Installed vllm wheels")
-
-    def _install_test_dependencies(self):
-        """
-        This method replaces torch dependencies with local torch wheel info in
-        requirements/test.in file from vllm repo. then generates the test.txt
-        in runtime
-        """
-        logger.info("generate test.txt from requirements/test.in with local torch whls")
-        preprocess_test_in()
-        copy("requirements/test.txt", "snapshot_constraint.txt")
-
-        run_command(
-            f"{sys.executable} -m uv pip compile requirements/test.in "
-            "-o test.txt "
-            "--index-strategy unsafe-best-match "
-            "--constraint snapshot_constraint.txt "
-            "--torch-backend cu128"
-        )
-        pip_install_packages(requirements="test.txt", prefer_uv=True)
-        logger.info("Done. installed requirements for test dependencies")
-
-    def _install_dependencies(self):
-        pip_install_packages(packages=["-e", "tests/vllm_test_utils"], prefer_uv=True)
-        pip_install_packages(packages=["hf_transfer"], prefer_uv=True)
-        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-
-        # using script from vllm repo to remove all torch packages from requirements txt
-        run_python("use_existing_torch.py")
-
-        # install common packages
-        for requirements in ["requirements/common.txt", "requirements/build.txt"]:
-            pip_install_packages(
-                requirements=requirements,
-                prefer_uv=True,
-            )
-        # install test packages
-        self._install_test_dependencies()
-
-    def _set_envs(self, inputs: VllmTestParameters):
-        os.environ["TORCH_CUDA_ARCH_LIST"] = inputs.torch_cuda_arch_list
-        if not validate_cuda(get_env("TORCH_CUDA_ARCH_LIST")):
-            logger.warning(
-                "Missing supported TORCH_CUDA_ARCH_LIST. "
-                "Currently support TORCH_CUDA_ARCH_LIST env var "
-                "with supported arch [8.0, 8.9, 9.0]"
-            )
-
-        os.environ["HF_TOKEN"] = os.getenv("VLLM_TEST_HUGGING_FACE_TOKEN", "")
-        if not get_env("HF_TOKEN"):
-            raise ValueError(
-                "missing required HF_TOKEN, please set VLLM_TEST_HUGGING_FACE_TOKEN env var"
-            )
-        if not get_env("TORCH_CUDA_ARCH_LIST"):
-            raise ValueError(
-                "missing required TORCH_CUDA_ARCH_LIST, please set TORCH_CUDA_ARCH_LIST env var"
-            )
-
-
-def preprocess_test_in(
-    target_file: str = "requirements/test.in", additional_packages: Iterable[str] = ()
-):
-    """
-    This modifies the target_file file in place in vllm work directory.
-    It removes torch and unwanted packages in target_file and replace with local torch whls
-    package  with format "$WHEEL_PACKAGE_NAME @ file://<LOCAL_PATH>"
-    """
-    additional_package_to_move = list(additional_packages or ())
-    pkgs_to_remove = [
-        "torch",
-        "torchvision",
-        "torchaudio",
-        "xformers",
-        "mamba_ssm",
-    ] + additional_package_to_move
-    # Read current requirements
-    target_path = Path(target_file)
-    lines = target_path.read_text().splitlines()
-
-    pkgs_to_add = []
-
-    # Remove lines starting with the package names (==, @, >=) — case-insensitive
-    pattern = re.compile(rf"^({'|'.join(pkgs_to_remove)})\s*(==|@|>=)", re.IGNORECASE)
-    kept_lines = [line for line in lines if not pattern.match(line)]
-
-    # Get local installed torch/vision/audio from pip freeze
-    # This is hacky, but it works
-    pip_freeze = subprocess.check_output(["pip", "freeze"], text=True)
-    header_lines = [
-        line
-        for line in pip_freeze.splitlines()
-        if re.match(
-            r"^(torch|torchvision|torchaudio)\s*@\s*file://", line, re.IGNORECASE
-        )
-    ]
-
-    # Write back: header_lines + blank + kept_lines
-    out_lines = header_lines + [""] + kept_lines
-    if pkgs_to_add:
-        out_lines += [""] + pkgs_to_add
-
-    out = "\n".join(out_lines) + "\n"
-    target_path.write_text(out)
-    logger.info("[INFO] Updated %s", target_file)
-
-
-def validate_cuda(value: str) -> bool:
-    VALID_VALUES = {"8.0", "8.9", "9.0"}
-    return all(v in VALID_VALUES for v in value.split())
-
-
-def check_versions():
-    """
-    check installed packages version
-    """
-    logger.info("Double check installed packages")
-    patterns = ["torch", "xformers", "torchvision", "torchaudio", "vllm"]
-    for pkg in patterns:
-        pkg_exists(pkg)
-    logger.info("Done. checked installed packages")
--- a/.ci/lumen_cli/cli/run.py
+++ b/.ci/lumen_cli/cli/run.py
@ -1,40 +0,0 @@
-# main.py
-
-import argparse
-import logging
-
-from cli.build_cli.register_build import register_build_commands
-from cli.lib.common.logger import setup_logging
-from cli.test_cli.register_test import register_test_commands
-
-
-logger = logging.getLogger(__name__)
-
-
-def main():
-    # Define top-level parser
-    parser = argparse.ArgumentParser(description="Lumos CLI")
-    subparsers = parser.add_subparsers(dest="command", required=True)
-    parser.add_argument(
-        "--log-level", default="INFO", help="Log level (DEBUG, INFO, WARNING, ERROR)"
-    )
-
-    # registers second-level subcommands
-    register_build_commands(subparsers)
-    register_test_commands(subparsers)
-
-    # parse args after all options are registered
-    args = parser.parse_args()
-
-    # setup global logging
-    setup_logging(getattr(logging, args.log_level.upper(), logging.INFO))
-    logger.debug("Parsed args: %s", args)
-
-    if hasattr(args, "func"):
-        args.func(args)
-    else:
-        parser.print_help()
-
-
-if __name__ == "__main__":
-    main()
--- a/.ci/lumen_cli/cli/test_cli/register_test.py
+++ b/.ci/lumen_cli/cli/test_cli/register_test.py
@ -1,62 +0,0 @@
-import argparse
-import logging
-
-from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec
-from cli.lib.core.vllm.vllm_test import VllmTestRunner
-
-
-logger = logging.getLogger(__name__)
-
-# Maps targets to their argparse configuration and runner
-# it adds new target to path python -m cli.run build external {target} with buildrunner
-_TARGETS: dict[str, TargetSpec] = {
-    "vllm": {
-        "runner": VllmTestRunner,
-        "help": "test vLLM with pytorch main",
-    }
-    # add yours ...
-}
-
-
-def common_args(parser: argparse.ArgumentParser) -> None:
-    """
-    Add common CLI arguments to the given parser.
-    """
-    parser.add_argument(
-        "--shard-id",
-        type=int,
-        default=1,
-        help="a shard id to run, e.g. '0,1,2,3'",
-    )
-    parser.add_argument(
-        "--num-shards",
-        type=int,
-        default=1,
-        help="a number of shards to run, e.g. '4'",
-    )
-    group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument(
-        "-tp",
-        "--test-plan",
-        type=str,
-        help="a pre-defined test plan to run, e.g. 'basic_correctness_test'",
-    )
-
-
-def register_test_commands(subparsers: argparse._SubParsersAction) -> None:
-    build_parser = subparsers.add_parser(
-        "test",
-        help="test related commands",
-        formatter_class=RichHelp,
-    )
-    build_subparsers = build_parser.add_subparsers(dest="test_command", required=True)
-    overview = "\n".join(
-        f"  {name:12} {spec.get('help', '')}" for name, spec in _TARGETS.items()
-    )
-    external_parser = build_subparsers.add_parser(
-        "external",
-        help="Test external targets",
-        description="Test third-party targets.\n\nAvailable targets:\n" + overview,
-        formatter_class=RichHelp,
-    )
-    register_targets(external_parser, _TARGETS, common_args=common_args)
--- a/.ci/lumen_cli/pyproject.toml
+++ b/.ci/lumen_cli/pyproject.toml
@ -1,23 +0,0 @@
-[project]
-name = "lumen-ci"
-version = "0.1.0"
-dependencies = [
-    "pyyaml==6.0.2",
-    "GitPython==3.1.45",
-    "docker==7.1.0",
-    "pytest==7.3.2",
-    "uv==0.8.6"
-]
-
-[tool.setuptools]
-packages = ["cli"]
-
-[tool.setuptools.package-dir]
-cli = "cli"
-
-[tool.ruff.lint]
-# Enable preview mode for linting
-preview = true
-
-# Now you can select your preview rules, like RUF048
-extend-select = ["RUF048"]
--- a/.ci/lumen_cli/tests/test_app.py
+++ b/.ci/lumen_cli/tests/test_app.py
@ -1,47 +0,0 @@
-# tests/test_cli.py
-import io
-import sys
-import unittest
-from contextlib import redirect_stderr, redirect_stdout
-from unittest.mock import patch
-
-from cli.run import main
-
-
-class TestArgparseCLI(unittest.TestCase):
-    @patch("cli.build_cli.register_build.VllmBuildRunner.run", return_value=None)
-    @patch("cli.build_cli.register_build.VllmBuildRunner.__init__", return_value=None)
-    def test_cli_run_build_external(self, mock_init, mock_run):
-        from cli.run import main  # import after patches if needed
-
-        test_args = ["cli.run", "build", "external", "vllm"]
-        with patch.object(sys, "argv", test_args):
-            # argparse may call sys.exit on error; capture to avoid test aborts
-            try:
-                main()
-            except SystemExit:
-                pass
-        mock_init.assert_called_once()  # got constructed
-        mock_run.assert_called_once_with()  # run() called
-
-    def test_build_help(self):
-        test_args = ["cli.run", "build", "--help"]
-
-        with patch.object(sys, "argv", test_args):
-            stdout = io.StringIO()
-            stderr = io.StringIO()
-
-            # --help always raises SystemExit(0)
-            with self.assertRaises(SystemExit) as cm:
-                with redirect_stdout(stdout), redirect_stderr(stderr):
-                    main()
-
-            self.assertEqual(cm.exception.code, 0)
-
-            output = stdout.getvalue()
-            self.assertIn("usage", output)
-            self.assertIn("external", output)
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/.ci/lumen_cli/tests/test_cli_helper.py
+++ b/.ci/lumen_cli/tests/test_cli_helper.py
@ -1,115 +0,0 @@
-import argparse
-import io
-import unittest
-from contextlib import redirect_stderr
-from unittest.mock import patch
-
-from cli.lib.common.cli_helper import BaseRunner, register_targets, RichHelp, TargetSpec
-
-
-# ---- Dummy runners for unittests----
-class FooRunner(BaseRunner):
-    """Foo description from docstring."""
-
-    def run(self) -> None:  # replaced by mock
-        pass
-
-
-class BarRunner(BaseRunner):
-    def run(self) -> None:  # replaced by mock
-        pass
-
-
-def add_foo_args(p: argparse.ArgumentParser) -> None:
-    p.add_argument("--x", type=int, required=True, help="x value")
-
-
-def common_args(p: argparse.ArgumentParser) -> None:
-    p.add_argument("--verbose", action="store_true", help="verbose flag")
-
-
-def build_parser(specs: dict[str, TargetSpec]) -> argparse.ArgumentParser:
-    parser = argparse.ArgumentParser(prog="app", formatter_class=RichHelp)
-    register_targets(
-        parser=parser,
-        target_specs=specs,
-        common_args=common_args,
-    )
-    return parser
-
-
-def get_subparser(
-    parser: argparse.ArgumentParser, name: str
-) -> argparse.ArgumentParser:
-    subparsers_action = next(
-        a
-        for a in parser._subparsers._group_actions  # type: ignore[attr-defined]
-        if isinstance(a, argparse._SubParsersAction)
-    )
-    return subparsers_action.choices[name]
-
-
-class TestRegisterTargets(unittest.TestCase):
-    def test_metavar_lists_targets(self):
-        specs: dict[str, TargetSpec] = {
-            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
-            "bar": {"runner": BarRunner},
-        }
-        parser = build_parser(specs)
-        subparsers_action = next(
-            a
-            for a in parser._subparsers._group_actions  # type: ignore[attr-defined]
-            if isinstance(a, argparse._SubParsersAction)
-        )
-        self.assertEqual(subparsers_action.metavar, "{foo,bar}")
-
-    def test_add_arguments_and_common_args_present(self):
-        specs: dict[str, TargetSpec] = {
-            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
-        }
-        parser = build_parser(specs)
-        foo = get_subparser(parser, "foo")
-        help_text = foo.format_help()
-        self.assertIn("--x", help_text)
-        self.assertIn("--verbose", help_text)
-
-    def test_runner_constructed_with_ns_and_run_called(self):
-        specs: dict[str, TargetSpec] = {
-            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
-        }
-        parser = build_parser(specs)
-
-        with (
-            patch.object(FooRunner, "__init__", return_value=None) as mock_init,
-            patch.object(FooRunner, "run", return_value=None) as mock_run,
-        ):
-            ns = parser.parse_args(["foo", "--x", "3", "--verbose"])
-            ns.func(ns)  # set by register_targets
-            # __init__ received the Namespace
-            self.assertEqual(mock_init.call_count, 1)
-            (called_ns,), _ = mock_init.call_args
-            self.assertIsInstance(called_ns, argparse.Namespace)
-            # run() called with no args
-            mock_run.assert_called_once_with()
-
-    def test_runner_docstring_used_as_description_when_missing(self):
-        specs: dict[str, TargetSpec] = {
-            "foo": {"runner": FooRunner, "add_arguments": add_foo_args},
-        }
-        parser = build_parser(specs)
-        foo = get_subparser(parser, "foo")
-        help_text = foo.format_help()
-        self.assertIn("Foo description from docstring.", help_text)
-
-    def test_missing_target_raises_systemexit_with_usage(self):
-        specs: dict[str, TargetSpec] = {"foo": {"runner": FooRunner}}
-        parser = build_parser(specs)
-        buf = io.StringIO()
-        with self.assertRaises(SystemExit), redirect_stderr(buf):
-            parser.parse_args([])
-        err = buf.getvalue()
-        self.assertIn("usage:", err)
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/.ci/lumen_cli/tests/test_docker_helper.py
+++ b/.ci/lumen_cli/tests/test_docker_helper.py
@ -1,75 +0,0 @@
-import unittest
-from unittest import mock
-from unittest.mock import MagicMock
-
-import docker.errors as derr
-from cli.lib.common.docker_helper import _get_client, local_image_exists
-
-
-class TestDockerImageHelpers(unittest.TestCase):
-    def setUp(self):
-        # Reset the singleton in the target module
-        patcher = mock.patch("cli.lib.common.docker_helper._docker_client", None)
-        self.addCleanup(patcher.stop)
-        patcher.start()
-
-    def test_local_image_exists_true(self):
-        # Mock a docker client whose images.get returns an object (no exception)
-        mock_client = MagicMock()
-        mock_client.images.get.return_value = object()
-        ok = local_image_exists("repo:tag", client=mock_client)
-        self.assertTrue(ok)
-
-    def test_local_image_exists_not_found_false(self):
-        mock_client = MagicMock()
-        # Raise docker.errors.NotFound
-        mock_client.images.get.side_effect = derr.NotFound("nope")
-        ok = local_image_exists("missing:latest", client=mock_client)
-        self.assertFalse(ok)
-
-    def test_local_image_exists_api_error_false(self):
-        mock_client = MagicMock()
-        mock_client.images.get.side_effect = derr.APIError("boom", None)
-
-        ok = local_image_exists("broken:tag", client=mock_client)
-        self.assertFalse(ok)
-
-    def test_local_image_exists_uses_lazy_singleton(self):
-        # Patch docker.from_env used by _get_client()
-        with mock.patch(
-            "cli.lib.common.docker_helper.docker.from_env"
-        ) as mock_from_env:
-            mock_docker_client = MagicMock()
-            mock_from_env.return_value = mock_docker_client
-
-            # First call should create and cache the client
-            c1 = _get_client()
-            self.assertIs(c1, mock_docker_client)
-            mock_from_env.assert_called_once()
-
-            # Second call should reuse cached client (no extra from_env calls)
-            c2 = _get_client()
-            self.assertIs(c2, mock_docker_client)
-            mock_from_env.assert_called_once()  # still once
-
-    def test_local_image_exists_without_client_param_calls_get_client_once(self):
-        # Ensure _get_client is called and cached; local_image_exists should reuse it
-        with mock.patch("cli.lib.common.docker_helper._get_client") as mock_get_client:
-            mock_client = MagicMock()
-            mock_get_client.return_value = mock_client
-
-            # 1st call
-            local_image_exists("repo:tag")
-            # 2nd call
-            local_image_exists("repo:tag2")
-
-            # local_image_exists should call _get_client each time,
-            # but your _get_client itself caches docker.from_env.
-            self.assertEqual(mock_get_client.call_count, 2)
-            self.assertEqual(mock_client.images.get.call_count, 2)
-            mock_client.images.get.assert_any_call("repo:tag")
-            mock_client.images.get.assert_any_call("repo:tag2")
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/.ci/lumen_cli/tests/test_envs_helper.py
+++ b/.ci/lumen_cli/tests/test_envs_helper.py
@ -1,149 +0,0 @@
-import os
-import unittest
-from dataclasses import dataclass
-from pathlib import Path
-from unittest.mock import patch
-
-import cli.lib.common.envs_helper as m
-
-
-class TestEnvHelpers(unittest.TestCase):
-    def setUp(self):
-        # Keep a copy of the original environment to restore later
-        self._env_backup = dict(os.environ)
-
-    def tearDown(self):
-        # Restore environment to original state
-        os.environ.clear()
-        os.environ.update(self._env_backup)
-
-    # -------- get_env --------
-    def test_get_env_unset_returns_default(self):
-        with patch.dict(os.environ, {}, clear=True):
-            self.assertEqual(m.get_env("FOO", "default"), "default")
-
-    def test_get_env_empty_returns_default(self):
-        with patch.dict(os.environ, {"FOO": ""}, clear=True):
-            self.assertEqual(m.get_env("FOO", "default"), "default")
-
-    def test_get_env_set_returns_value(self):
-        with patch.dict(os.environ, {"FOO": "bar"}, clear=True):
-            self.assertEqual(m.get_env("FOO", "default"), "bar")
-
-    def test_get_env_not_exist_returns_default(self):
-        with patch.dict(os.environ, {"FOO": "bar"}, clear=True):
-            self.assertEqual(m.get_env("TEST_NOT_EXIST", "default"), "default")
-
-    def test_get_env_not_exist_without_default(self):
-        with patch.dict(os.environ, {"FOO": "bar"}, clear=True):
-            self.assertEqual(m.get_env("TEST_NOT_EXIST"), "")
-
-    # -------- env_bool --------
-    def test_env_bool_uses_default_when_unset(self):
-        with patch.dict(os.environ, {}, clear=True):
-            self.assertTrue(m.env_bool("FLAG", default=True))
-            self.assertFalse(m.env_bool("FLAG", default=False))
-
-    def test_env_bool_uses_str2bool_when_set(self):
-        # Patch str2bool used by env_bool so we don't depend on its exact behavior
-        def fake_str2bool(s: str) -> bool:
-            return s.lower() in {"1", "true", "yes", "on", "y"}
-
-        with (
-            patch.dict(os.environ, {"FLAG": "yEs"}, clear=True),
-            patch.object(m, "str2bool", fake_str2bool),
-        ):
-            self.assertTrue(m.env_bool("FLAG", default=False))
-
-    # -------- env_path_optional / env_path --------
-    def test_env_path_optional_unset_returns_none_by_default(self):
-        with patch.dict(os.environ, {}, clear=True):
-            self.assertIsNone(m.env_path_optional("P"))
-
-    def test_env_path_optional_unset_returns_none_when_env_var_is_empty(self):
-        with patch.dict(os.environ, {"P": ""}, clear=True):
-            self.assertIsNone(m.env_path_optional("P"))
-
-    def test_env_path_optional_unset_returns_default_str(self):
-        # default as string; resolve=True by default -> absolute path
-        default_str = "x/y"
-        with patch.dict(os.environ, {}, clear=True):
-            p = m.env_path_optional("P", default=default_str)
-            self.assertIsInstance(p, Path)
-            self.assertIsNotNone(p)
-            if p:
-                self.assertTrue(p.is_absolute())
-                self.assertEqual(p.parts[-2:], ("x", "y"))
-
-    def test_env_path_optional_unset_returns_default_path_no_resolve(self):
-        d = Path("z")
-        with patch.dict(os.environ, {}, clear=True):
-            p = m.env_path_optional("P", default=d, resolve=False)
-            self.assertEqual(p, d)
-
-    def test_env_path_optional_respects_resolve_true(self):
-        with patch.dict(os.environ, {"P": "a/b"}, clear=True):
-            p = m.env_path_optional("P", resolve=True)
-            self.assertIsInstance(p, Path)
-            if p:
-                self.assertTrue(p.is_absolute())
-
-    def test_env_path_optional_respects_resolve_false(self):
-        with patch.dict(os.environ, {"P": "rel/dir"}, clear=True):
-            p = m.env_path_optional("P", resolve=False)
-            self.assertEqual(p, Path("rel/dir"))
-            if p:
-                self.assertFalse(p.is_absolute())
-
-    def test_env_path_raises_when_missing_and_default_none(self):
-        with patch.dict(os.environ, {}, clear=True):
-            with self.assertRaises(ValueError):
-                m.env_path("P", None, resolve=True)
-
-    def test_env_path_returns_path_when_present(self):
-        tmp = Path("./b").resolve()
-        with patch.dict(os.environ, {"P": str(tmp)}, clear=True):
-            p = m.env_path("P", None, resolve=True)
-            self.assertEqual(p, tmp)
-
-    # -------- dataclass field helpers --------
-    def test_dataclass_fields_read_env_at_instantiation(self):
-        @dataclass
-        class Cfg:
-            flag: bool = m.env_bool_field("FLAG", default=False)
-            out: Path = m.env_path_field("OUT", default="ab", resolve=True)
-            name: str = m.env_str_field("NAME", default="anon")
-
-        # First instantiation
-        with patch.dict(
-            os.environ, {"FLAG": "true", "OUT": "outdir", "NAME": "alice"}, clear=True
-        ):
-            cfg1 = Cfg()
-            self.assertTrue(cfg1.flag)
-            self.assertIsInstance(cfg1.out, Path)
-            self.assertTrue(cfg1.out.is_absolute())
-            self.assertEqual(cfg1.name, "alice")
-            cfg1.name = "bob"  # change instance value
-            self.assertEqual(cfg1.name, "bob")  # change is reflected
-
-        # Change env; new instance should reflect new values
-        with patch.dict(os.environ, {"FLAG": "false", "NAME": ""}, clear=True):
-            cfg2 = Cfg()
-            self.assertFalse(cfg2.flag)  # str2bool("false") -> False
-            self.assertTrue("ab" in str(cfg2.out))
-            self.assertIsInstance(cfg2.out, Path)
-            self.assertTrue(cfg2.out.is_absolute())
-            self.assertEqual(cfg2.name, "anon")  # empty -> fallback to default
-
-    def test_dataclass_path_field_with_default_value(self):
-        @dataclass
-        class C2:
-            out: Path = m.env_path_field("OUT", default="some/dir", resolve=False)
-
-        with patch.dict(os.environ, {}, clear=True):
-            c = C2()
-            self.assertEqual(c.out, Path("some/dir"))
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/.ci/lumen_cli/tests/test_path_helper.py
+++ b/.ci/lumen_cli/tests/test_path_helper.py
@ -1,122 +0,0 @@
-# test_path_utils.py
-# Run: pytest -q
-
-import os
-import unittest
-from pathlib import Path
-from tempfile import TemporaryDirectory
-
-from cli.lib.common.path_helper import (
-    copy,
-    ensure_dir_exists,
-    force_create_dir,
-    get_path,
-    is_path_exist,
-    remove_dir,
-)
-
-
-class TestPathHelper(unittest.TestCase):
-    def setUp(self):
-        self.tmpdir = TemporaryDirectory()
-        self.tmp_path = Path(self.tmpdir.name)
-
-    def tearDown(self):
-        self.tmpdir.cleanup()
-
-    # -------- get_path --------
-    def test_get_path_returns_path_for_str(self):
-        # Use relative path to avoid absolute-ness
-        rel_str = "sub/f.txt"
-        os.chdir(self.tmp_path)
-        p = get_path(rel_str, resolve=False)
-        self.assertIsInstance(p, Path)
-        self.assertFalse(p.is_absolute())
-        self.assertEqual(str(p), rel_str)
-
-    def test_get_path_resolves(self):
-        rel_str = "sub/f.txt"
-        p = get_path(str(self.tmp_path / rel_str), resolve=True)
-        self.assertTrue(p.is_absolute())
-        self.assertTrue(str(p).endswith(rel_str))
-
-    def test_get_path_with_path_input(self):
-        p_in = self.tmp_path / "sub/f.txt"
-        p_out = get_path(p_in, resolve=False)
-        self.assertTrue(str(p_out) == str(p_in))
-
-    def test_get_path_with_none_raises(self):
-        with self.assertRaises(ValueError):
-            get_path(None)  # type: ignore[arg-type]
-
-    def test_get_path_invalid_type_raises(self):
-        with self.assertRaises(TypeError):
-            get_path(123)  # type: ignore[arg-type]
-
-    # -------- ensure_dir_exists / force_create_dir / remove_dir --------
-    def test_ensure_dir_exists_creates_and_is_idempotent(self):
-        d = self.tmp_path / "made"
-        ensure_dir_exists(d)
-        self.assertTrue(d.exists() and d.is_dir())
-        ensure_dir_exists(d)
-
-    def test_force_create_dir_clears_existing(self):
-        d = self.tmp_path / "fresh"
-        (d / "inner").mkdir(parents=True)
-        (d / "inner" / "f.txt").write_text("x")
-        force_create_dir(d)
-        self.assertTrue(d.exists())
-        self.assertEqual(list(d.iterdir()), [])
-
-    def test_remove_dir_none_is_noop(self):
-        remove_dir(None)  # type: ignore[arg-type]
-
-    def test_remove_dir_nonexistent_is_noop(self):
-        ghost = self.tmp_path / "ghost"
-        remove_dir(ghost)
-
-    def test_remove_dir_accepts_str(self):
-        d = self.tmp_path / "to_rm"
-        d.mkdir()
-        remove_dir(str(d))
-        self.assertFalse(d.exists())
-
-    # -------- copy --------
-    def test_copy_file_to_file(self):
-        src = self.tmp_path / "src.txt"
-        dst = self.tmp_path / "out" / "dst.txt"
-        src.write_text("hello")
-        copy(src, dst)
-        self.assertEqual(dst.read_text(), "hello")
-
-    def test_copy_dir_to_new_dir(self):
-        src = self.tmp_path / "srcdir"
-        (src / "a").mkdir(parents=True)
-        (src / "a" / "f.txt").write_text("content")
-        dst = self.tmp_path / "destdir"
-        copy(src, dst)
-        self.assertEqual((dst / "a" / "f.txt").read_text(), "content")
-
-    def test_copy_dir_into_existing_dir_overwrite_true_merges(self):
-        src = self.tmp_path / "srcdir"
-        dst = self.tmp_path / "destdir"
-        (src / "x").mkdir(parents=True)
-        (src / "x" / "new.txt").write_text("new")
-        dst.mkdir()
-        (dst / "existing.txt").write_text("old")
-        copy(src, dst)
-        self.assertEqual((dst / "existing.txt").read_text(), "old")
-        self.assertEqual((dst / "x" / "new.txt").read_text(), "new")
-
-    def test_is_str_path_exist(self):
-        p = self.tmp_path / "x.txt"
-        p.write_text("1")
-        self.assertTrue(is_path_exist(str(p)))
-        self.assertTrue(is_path_exist(p))
-        self.assertFalse(is_path_exist(str(self.tmp_path / "missing")))
-        self.assertFalse(is_path_exist(self.tmp_path / "missing"))
-        self.assertFalse(is_path_exist(""))
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/.ci/lumen_cli/tests/test_run_plan.py
+++ b/.ci/lumen_cli/tests/test_run_plan.py
@ -1,185 +0,0 @@
-# tests/test_run_test_plan.py
-import importlib
-from contextlib import nullcontext
-from types import SimpleNamespace
-from unittest.mock import MagicMock
-
-import pytest
-
-
-MOD = "cli.lib.core.vllm.lib"
-
-# We import inside tests so the MOD override above applies everywhere
-run_test_plan_import_path = f"{MOD}.run_test_plan"
-
-
-def _get_cmd(c):
-    # Support both kwargs and positional args
-    return c.kwargs.get("cmd", c.args[0] if c.args else None)
-
-
-def _get_check(c):
-    if "check" in c.kwargs:
-        return c.kwargs["check"]
-    # If positional, assume second arg is 'check' when present; default False
-    return c.args[1] if len(c.args) > 1 else False
-
-
-@pytest.fixture
-def patch_module(monkeypatch):
-    """
-    Patch helpers ('pip_install_packages', 'temp_environ', 'working_directory',
-    'run_command', 'logger') inside the target module and expose them.
-    """
-    module = importlib.import_module(MOD)
-
-    # Create fakes/mocks
-    pip_install_packages = MagicMock(name="pip_install_packages")
-    run_command = MagicMock(name="run_command", return_value=0)
-
-    # temp_environ / working_directory: record calls but act as context managers
-    temp_calls: list[dict] = []
-    workdir_calls: list[str] = []
-
-    def fake_working_directory(path: str):
-        workdir_calls.append(path)
-        return nullcontext()
-
-    def fake_temp_env(map: dict[str, str]):
-        temp_calls.append(map)
-        return nullcontext()
-
-    logger = SimpleNamespace(
-        info=MagicMock(name="logger.info"),
-        error=MagicMock(name="logger.error"),
-    )
-
-    # Apply patches (raise if attribute doesn't exist)
-    monkeypatch.setattr(
-        module, "pip_install_packages", pip_install_packages, raising=True
-    )
-    monkeypatch.setattr(module, "run_command", run_command, raising=True)
-    monkeypatch.setattr(
-        module, "working_directory", fake_working_directory, raising=True
-    )
-    monkeypatch.setattr(module, "temp_environ", fake_temp_env, raising=True)
-    monkeypatch.setattr(module, "logger", logger, raising=True)
-
-    return SimpleNamespace(
-        module=module,
-        run_test_plan=module.run_test_plan,  # expose to avoid getattr("constant") (Ruff B009)
-        pip_install_packages=pip_install_packages,
-        run_command=run_command,
-        temp_calls=temp_calls,
-        workdir_calls=workdir_calls,
-        logger=logger,
-    )
-
-
-def test_success_runs_all_steps_and_uses_env_and_workdir(monkeypatch, patch_module):
-    run_test_plan = patch_module.run_test_plan
-
-    tests_map = {
-        "basic": {
-            "title": "Basic suite",
-            "package_install": [],
-            "working_directory": "tests",
-            "env_vars": {"GLOBAL_FLAG": "1"},
-            "steps": [
-                "export A=x && pytest -q",
-                "export B=y && pytest -q tests/unit",
-            ],
-        }
-    }
-
-    # One exit code per step (export + two pytest)
-    patch_module.run_command.side_effect = [0, 0, 0]
-
-    run_test_plan("basic", "cpu", tests_map)
-
-    calls = patch_module.run_command.call_args_list
-    cmds = [_get_cmd(c) for c in calls]
-    checks = [_get_check(c) for c in calls]
-
-    assert cmds == [
-        "export A=x && pytest -q",
-        "export B=y && pytest -q tests/unit",
-    ]
-    assert all(chk is False for chk in checks)
-
-    assert patch_module.workdir_calls == ["tests"]
-    assert patch_module.temp_calls == [{"GLOBAL_FLAG": "1"}]
-
-
-def test_installs_packages_when_present(monkeypatch, patch_module):
-    run_test_plan = patch_module.module.run_test_plan
-
-    tests_map = {
-        "with_pkgs": {
-            "title": "Needs deps",
-            "package_install": ["timm==1.0.0", "flash-attn"],
-            "steps": ["pytest -q"],
-        }
-    }
-
-    patch_module.run_command.return_value = 0
-
-    run_test_plan("with_pkgs", "gpu", tests_map)
-
-    patch_module.pip_install_packages.assert_called_once_with(
-        packages=["timm==1.0.0", "flash-attn"],
-        prefer_uv=True,
-    )
-
-
-def test_raises_on_missing_plan(patch_module):
-    run_test_plan = patch_module.module.run_test_plan
-    with pytest.raises(RuntimeError) as ei:
-        run_test_plan("nope", "cpu", tests_map={})
-
-    assert "test nope not found" in str(ei.value)
-
-
-def test_aggregates_failures_and_raises(monkeypatch, patch_module):
-    run_test_plan = patch_module.module.run_test_plan
-
-    tests_map = {
-        "mix": {
-            "title": "Some pass some fail",
-            "steps": [
-                "pytest test_a.py",  # 0 → pass
-                "pytest test_b.py",  # 1 → fail
-                "pytest test_c.py",  # 2 → fail
-            ],
-        }
-    }
-
-    # Simulate pass, fail, fail
-    patch_module.run_command.side_effect = [0, 1, 2]
-
-    with pytest.raises(RuntimeError) as ei:
-        run_test_plan("mix", "cpu", tests_map)
-
-    msg = str(ei.value)
-    assert "2 pytest runs failed" in msg
-    # Ensure logger captured failed tests list
-    patch_module.logger.error.assert_called_once()
-    # And we attempted all three commands
-    assert patch_module.run_command.call_count == 3
-
-
-def test_custom_working_directory_used(patch_module):
-    run_test_plan = patch_module.module.run_test_plan
-
-    tests_map = {
-        "customwd": {
-            "title": "Custom wd",
-            "working_directory": "examples/ci",
-            "steps": ["pytest -q"],
-        }
-    }
-
-    patch_module.run_command.return_value = 0
-    run_test_plan("customwd", "cpu", tests_map)
-
-    assert patch_module.workdir_calls == ["examples/ci"]
--- a/.ci/lumen_cli/tests/test_utils.py
+++ b/.ci/lumen_cli/tests/test_utils.py
@ -1,143 +0,0 @@
-import os
-import tempfile
-import unittest
-from pathlib import Path
-
-from cli.lib.common.utils import temp_environ, working_directory  # <-- replace import
-
-
-class EnvIsolatedTestCase(unittest.TestCase):
-    """Base class that snapshots os.environ and CWD for isolation."""
-
-    def setUp(self):
-        import os
-        import tempfile
-
-        self._env_backup = dict(os.environ)
-
-        # Snapshot/repair CWD if it's gone
-        try:
-            self._cwd_backup = os.getcwd()
-        except FileNotFoundError:
-            # If CWD no longer exists, switch to a safe place and record that
-            self._cwd_backup = tempfile.gettempdir()
-            os.chdir(self._cwd_backup)
-
-        # Create a temporary directory for the test to run in
-        self._temp_dir = tempfile.mkdtemp()
-        os.chdir(self._temp_dir)
-
-    def tearDown(self):
-        import os
-        import shutil
-        import tempfile
-
-        # Restore cwd first (before cleaning up temp dir)
-        try:
-            os.chdir(self._cwd_backup)
-        except OSError:
-            os.chdir(tempfile.gettempdir())
-
-        # Clean up temporary directory
-        try:
-            shutil.rmtree(self._temp_dir, ignore_errors=True)
-        except Exception:
-            pass  # Ignore cleanup errors
-
-        # Restore env
-        to_del = set(os.environ.keys()) - set(self._env_backup.keys())
-        for k in to_del:
-            os.environ.pop(k, None)
-        for k, v in self._env_backup.items():
-            os.environ[k] = v
-
-
-class TestTempEnviron(EnvIsolatedTestCase):
-    def test_sets_and_restores_new_var(self):
-        var = "TEST_TMP_ENV_NEW"
-        self.assertNotIn(var, os.environ)
-
-        with temp_environ({var: "123"}):
-            self.assertEqual(os.environ[var], "123")
-
-        self.assertNotIn(var, os.environ)  # removed after exit
-
-    def test_overwrites_and_restores_existing_var(self):
-        var = "TEST_TMP_ENV_OVERWRITE"
-        os.environ[var] = "orig"
-
-        with temp_environ({var: "override"}):
-            self.assertEqual(os.environ[var], "override")
-
-        self.assertEqual(os.environ[var], "orig")  # restored
-
-    def test_multiple_vars_and_missing_cleanup(self):
-        v1, v2 = "TEST_ENV_V1", "TEST_ENV_V2"
-        os.environ.pop(v1, None)
-        os.environ[v2] = "keep"
-
-        with temp_environ({v1: "a", v2: "b"}):
-            self.assertEqual(os.environ[v1], "a")
-            self.assertEqual(os.environ[v2], "b")
-
-        self.assertNotIn(v1, os.environ)  # newly-added -> removed
-        self.assertEqual(os.environ[v2], "keep")  # pre-existing -> restored
-
-    def test_restores_even_on_exception(self):
-        var = "TEST_TMP_ENV_EXCEPTION"
-        self.assertNotIn(var, os.environ)
-
-        with self.assertRaises(RuntimeError):
-            with temp_environ({var: "x"}):
-                self.assertEqual(os.environ[var], "x")
-                raise RuntimeError("boom")
-
-        self.assertNotIn(var, os.environ)  # removed after exception
-
-
-class TestWorkingDirectory(EnvIsolatedTestCase):
-    def test_changes_and_restores(self):
-        start = Path.cwd()
-        with tempfile.TemporaryDirectory() as td:
-            target = Path(td) / "wd"
-            target.mkdir()
-
-            with working_directory(str(target)):
-                self.assertEqual(Path.cwd().resolve(), target.resolve())
-
-        self.assertEqual(Path.cwd(), start)
-
-    def test_noop_when_empty_path(self):
-        start = Path.cwd()
-        with working_directory(""):
-            self.assertEqual(Path.cwd(), start)
-        self.assertEqual(Path.cwd(), start)
-
-    def test_restores_on_exception(self):
-        start = Path.cwd()
-
-        with tempfile.TemporaryDirectory() as td:
-            target = Path(td) / "wd_exc"
-            target.mkdir()
-
-            with self.assertRaises(ValueError):
-                with working_directory(str(target)):
-                    # Normalize both sides to handle /var -> /private/var
-                    self.assertEqual(Path.cwd().resolve(), target.resolve())
-                    raise ValueError("boom")
-
-        self.assertEqual(Path.cwd().resolve(), start.resolve())
-
-    def test_raises_for_missing_dir(self):
-        start = Path.cwd()
-        with tempfile.TemporaryDirectory() as td:
-            missing = Path(td) / "does_not_exist"
-            with self.assertRaises(FileNotFoundError):
-                # os.chdir should raise before yielding
-                with working_directory(str(missing)):
-                    pass
-        self.assertEqual(Path.cwd(), start)
-
-
-if __name__ == "__main__":
-    unittest.main(verbosity=2)
--- a/.ci/lumen_cli/tests/test_vllm.py
+++ b/.ci/lumen_cli/tests/test_vllm.py
@ -1,176 +0,0 @@
-import os
-import tempfile
-import unittest
-from pathlib import Path
-from unittest.mock import MagicMock, patch
-
-import cli.lib.core.vllm.vllm_build as vllm_build
-
-
-_VLLM_BUILD_MODULE = "cli.lib.core.vllm.vllm_build"
-
-
-class TestVllmBuildParameters(unittest.TestCase):
-    @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=True)
-    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=True)
-    @patch(
-        "cli.lib.common.envs_helper.env_path_optional",
-        side_effect=lambda name, default=None, resolve=True: {
-            "DOCKERFILE_PATH": Path("/abs/vllm/Dockerfile"),
-            "TORCH_WHEELS_PATH": Path("/abs/dist"),
-            "OUTPUT_DIR": Path("/abs/shared"),
-        }.get(name, Path(default) if default is not None else None),
-    )
-    @patch.dict(
-        os.environ,
-        {
-            "USE_TORCH_WHEEL": "1",
-            "USE_LOCAL_BASE_IMAGE": "1",
-            "USE_LOCAL_DOCKERFILE": "1",
-            "BASE_IMAGE": "my/image:tag",
-            "DOCKERFILE_PATH": "vllm/Dockerfile",
-            "TORCH_WHEELS_PATH": "dist",
-            "OUTPUT_DIR": "shared",
-        },
-        clear=True,
-    )
-    def test_params_success_normalizes_and_validates(
-        self, mock_env_path, mock_is_path, mock_local_img
-    ):
-        params = vllm_build.VllmBuildParameters()
-        self.assertEqual(params.torch_whls_path, Path("/abs/dist"))
-        self.assertEqual(params.dockerfile_path, Path("/abs/vllm/Dockerfile"))
-        self.assertEqual(params.output_dir, Path("/abs/shared"))
-        self.assertEqual(params.base_image, "my/image:tag")
-
-    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False)
-    @patch.dict(
-        os.environ, {"USE_TORCH_WHEEL": "1", "TORCH_WHEELS_PATH": "dist"}, clear=True
-    )
-    def test_params_missing_torch_whls_raises(self, _is_path):
-        with tempfile.TemporaryDirectory() as td:
-            os.chdir(td)
-            with self.assertRaises(ValueError) as cm:
-                vllm_build.VllmBuildParameters(
-                    use_local_base_image=False,
-                    use_local_dockerfile=False,
-                )
-        err = cm.exception
-        self.assertIn("TORCH_WHEELS_PATH", str(err))
-
-    @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=False)
-    @patch.dict(
-        os.environ, {"USE_LOCAL_BASE_IMAGE": "1", "BASE_IMAGE": "img:tag"}, clear=True
-    )
-    def test_params_missing_local_base_image_raises(self, _local_img):
-        with tempfile.TemporaryDirectory() as td:
-            os.chdir(td)
-            with self.assertRaises(ValueError) as cm:
-                vllm_build.VllmBuildParameters(
-                    use_torch_whl=False,
-                    use_local_dockerfile=False,
-                )
-        err = cm.exception
-        self.assertIn("BASE_IMAGE", str(err))
-
-    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False)
-    @patch.dict(
-        os.environ,
-        {"USE_LOCAL_DOCKERFILE": "1", "DOCKERFILE_PATH": "Dockerfile"},
-        clear=True,
-    )
-    def test_params_missing_dockerfile_raises(self, _is_path):
-        with tempfile.TemporaryDirectory() as td:
-            os.chdir(td)
-            with self.assertRaises(ValueError) as cm:
-                vllm_build.VllmBuildParameters(
-                    use_torch_whl=False,
-                    use_local_base_image=False,
-                )
-        err = cm.exception
-        self.assertIn("DOCKERFILE_PATH", str(err))
-
-    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False)
-    @patch.dict(
-        os.environ,
-        {"OUTPUT_DIR": ""},
-        clear=True,
-    )
-    def test_params_missing_output_dir(self, _is_path):
-        with self.assertRaises(FileNotFoundError):
-            vllm_build.VllmBuildParameters()
-
-
-class TestBuildCmdAndRun(unittest.TestCase):
-    @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=True)
-    def test_generate_docker_build_cmd_includes_bits(self, _exists):
-        runner = vllm_build.VllmBuildRunner()
-        inputs = MagicMock()
-        inputs.output_dir = Path("/abs/out")
-        inputs.use_local_base_image = True
-        inputs.base_image = "img:tag"
-        inputs.torch_whls_path = Path("./vllm/tmp")
-        inputs.max_jobs = 64
-        inputs.cuda_version = "12.8.1"
-        inputs.python_version = "3.12"
-        inputs.sccache_bucket = "my-bucket"
-        inputs.sccache_region = "us-west-2"
-        inputs.torch_cuda_arch_list = "8.0;9.0"
-        inputs.target_stage = "export-wheels"
-        inputs.tag_name = "vllm-wheels"
-
-        cmd = runner._generate_docker_build_cmd(inputs)
-        squashed = " ".join(cmd.split())
-
-        self.assertIn("--output type=local,dest=/abs/out", squashed)
-        self.assertIn("-f docker/Dockerfile.nightly_torch", squashed)
-        self.assertIn("--pull=false", squashed)
-        self.assertIn("--build-arg TORCH_WHEELS_PATH=tmp", squashed)
-        self.assertIn("--build-arg BUILD_BASE_IMAGE=img:tag", squashed)
-        self.assertIn("--build-arg FINAL_BASE_IMAGE=img:tag", squashed)
-        self.assertIn("--build-arg max_jobs=64", squashed)
-        self.assertIn("--build-arg CUDA_VERSION=12.8.1", squashed)
-        self.assertIn("--build-arg PYTHON_VERSION=3.12", squashed)
-        self.assertIn("--build-arg USE_SCCACHE=1", squashed)
-        self.assertIn("--build-arg SCCACHE_BUCKET_NAME=my-bucket", squashed)
-        self.assertIn("--build-arg SCCACHE_REGION_NAME=us-west-2", squashed)
-        self.assertIn("--build-arg torch_cuda_arch_list='8.0;9.0'", squashed)
-        self.assertIn("--target export-wheels", squashed)
-        self.assertIn("-t vllm-wheels", squashed)
-
-    @patch(f"{_VLLM_BUILD_MODULE}.run_command")
-    @patch(f"{_VLLM_BUILD_MODULE}.ensure_dir_exists")
-    @patch(f"{_VLLM_BUILD_MODULE}.clone_vllm")
-    @patch.object(
-        vllm_build.VllmBuildRunner,
-        "_generate_docker_build_cmd",
-        return_value="docker buildx ...",
-    )
-    @patch.dict(
-        os.environ,
-        {
-            "USE_TORCH_WHEEL": "0",
-            "USE_LOCAL_BASE_IMAGE": "0",
-            "USE_LOCAL_DOCKERFILE": "0",
-            "OUTPUT_DIR": "shared",
-        },
-        clear=True,
-    )
-    def test_run_calls_clone_prepare_and_build(
-        self, mock_gen, mock_clone, mock_ensure, mock_run
-    ):
-        params = MagicMock()
-        params.output_dir = Path("shared")
-        params.use_local_dockerfile = False
-        params.use_torch_whl = False
-
-        with patch(f"{_VLLM_BUILD_MODULE}.VllmBuildParameters", return_value=params):
-            runner = vllm_build.VllmBuildRunner()
-            runner.run()
-
-        mock_clone.assert_called_once()
-        mock_ensure.assert_called_once_with(Path("shared"))
-        mock_gen.assert_called_once_with(params)
-        mock_run.assert_called_once()
-        _, kwargs = mock_run.call_args
-        assert kwargs.get("cwd") == "vllm"
--- a/.ci/magma/Makefile
+++ b/.ci/magma/Makefile
@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	magma/build_magma.sh

 .PHONY: all
-all: magma-cuda130
 all: magma-cuda129
 all: magma-cuda128
 all: magma-cuda126
@ -26,12 +25,6 @@ clean:
 	$(RM) -r magma-*
 	$(RM) -r output

-.PHONY: magma-cuda130
-magma-cuda130: DESIRED_CUDA := 13.0
-magma-cuda130: CUDA_ARCH_LIST := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
-magma-cuda130:
-	$(DOCKER_RUN)
-
 .PHONY: magma-cuda129
 magma-cuda129: DESIRED_CUDA := 12.9
 magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
--- a/.ci/magma/build_magma.sh
+++ b/.ci/magma/build_magma.sh
@ -28,7 +28,6 @@ pushd ${PACKAGE_DIR}/magma-${MAGMA_VERSION}
 patch < ${PACKAGE_FILES}/CMake.patch
 patch < ${PACKAGE_FILES}/cmakelists.patch
 patch -p0 < ${PACKAGE_FILES}/thread_queue.patch
-patch -p1 < ${PACKAGE_FILES}/cuda13.patch
 patch -p1 < ${PACKAGE_FILES}/getrf_shfl.patch
 patch -p1 < ${PACKAGE_FILES}/getrf_nbparam.patch
 # The build.sh script expects to be executed from the sources root folder
@ -38,7 +37,6 @@ popd
 # Package recipe, license and tarball
 # Folder and package name are backward compatible for the build workflow
 cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh
-cp ${PACKAGE_FILES}/cuda13.patch ${PACKAGE_RECIPE}/cuda13.patch
 cp ${PACKAGE_FILES}/thread_queue.patch ${PACKAGE_RECIPE}/thread_queue.patch
 cp ${PACKAGE_FILES}/cmakelists.patch ${PACKAGE_RECIPE}/cmakelists.patch
 cp ${PACKAGE_FILES}/getrf_shfl.patch ${PACKAGE_RECIPE}/getrf_shfl.patch
--- a/.ci/magma/package_files/cuda13.patch
+++ b/.ci/magma/package_files/cuda13.patch
@ -1,26 +0,0 @@
-diff --git a/interface_cuda/interface.cpp b/interface_cuda/interface.cpp
-index 73fed1b20..e77519bfe 100644
--- a/interface_cuda/interface.cpp
-+++ b/interface_cuda/interface.cpp
-@@ -438,14 +438,20 @@ magma_print_environment()
-         cudaDeviceProp prop;
-         err = cudaGetDeviceProperties( &prop, dev );
-         check_error( err );
-+        #ifdef MAGMA_HAVE_CUDA
-+#if CUDA_VERSION < 13000
-         printf( "%% device %d: %s, %.1f MHz clock, %.1f MiB memory, capability %d.%d\n",
-                 dev,
-                 prop.name,
-                 prop.clockRate / 1000.,
-+#else
-+        printf( "%% device %d: %s, ??? MHz clock, %.1f MiB memory, capability %d.%d\n",
-+                dev,
-+                prop.name,
-+#endif
-                 prop.totalGlobalMem / (1024.*1024.),
-                 prop.major,
-                 prop.minor );
-        #ifdef MAGMA_HAVE_CUDA
-         int arch = prop.major*100 + prop.minor*10;
-         if ( arch < MAGMA_CUDA_ARCH_MIN ) {
-             printf("\n"
--- a/.ci/manywheel/build.sh
+++ b/.ci/manywheel/build.sh
@ -5,6 +5,10 @@ set -ex
 SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

 case "${GPU_ARCH_TYPE:-BLANK}" in
+    BLANK)
+        # Legacy behavior for CircleCI
+        bash "${SCRIPTPATH}/build_cuda.sh"
+        ;;
    cuda)
        bash "${SCRIPTPATH}/build_cuda.sh"
        ;;
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -66,9 +66,6 @@ case ${CUDA_VERSION} in
            TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
        fi
        ;;
-    13.0)
-        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
-        ;;
    12.6)
        TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
        ;;
@ -113,15 +110,11 @@ DEPS_SONAME=(
 )


-# CUDA_VERSION 12.*, 13.*
-if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
+# CUDA_VERSION 12.6, 12.8, 12.9
+if [[ $CUDA_VERSION == 12* ]]; then
    export USE_STATIC_CUDNN=0
    # Try parallelizing nvcc as well
-    TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
-    # Compress the fatbin with -compress-mode=size for CUDA 13
-    if [[ $CUDA_VERSION == 13* ]]; then
-        export TORCH_NVCC_FLAGS="$TORCH_NVCC_FLAGS -compress-mode=size"
-    fi
+    export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
        echo "Bundling with cudnn and cublas."
        DEPS_LIST+=(
@ -141,7 +134,6 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
            "/usr/local/cuda/lib64/libnvrtc-builtins.so"
            "/usr/local/cuda/lib64/libcufile.so.0"
            "/usr/local/cuda/lib64/libcufile_rdma.so.1"
-            "/usr/local/cuda/lib64/libnvshmem_host.so.3"
            "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12"
            "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so"
        )
@ -160,7 +152,6 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
            "libcudart.so.12"
            "libnvrtc.so.12"
            "libnvrtc-builtins.so"
-            "libnvshmem_host.so.3"
            "libcufile.so.0"
            "libcufile_rdma.so.1"
            "libcupti.so.12"
@ -174,29 +165,22 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
    else
        echo "Using nvidia libs from pypi."
        CUDA_RPATHS=(
+            '$ORIGIN/../../nvidia/cublas/lib'
+            '$ORIGIN/../../nvidia/cuda_cupti/lib'
+            '$ORIGIN/../../nvidia/cuda_nvrtc/lib'
+            '$ORIGIN/../../nvidia/cuda_runtime/lib'
            '$ORIGIN/../../nvidia/cudnn/lib'
-            '$ORIGIN/../../nvidia/nvshmem/lib'
-            '$ORIGIN/../../nvidia/nccl/lib'
+            '$ORIGIN/../../nvidia/cufft/lib'
+            '$ORIGIN/../../nvidia/curand/lib'
+            '$ORIGIN/../../nvidia/cusolver/lib'
+            '$ORIGIN/../../nvidia/cusparse/lib'
            '$ORIGIN/../../nvidia/cusparselt/lib'
+            '$ORIGIN/../../cusparselt/lib'
+            '$ORIGIN/../../nvidia/nccl/lib'
+            '$ORIGIN/../../nvidia/nvshmem/lib'
+            '$ORIGIN/../../nvidia/nvtx/lib'
+            '$ORIGIN/../../nvidia/cufile/lib'
        )
-        if [[ $CUDA_VERSION == 13* ]]; then
-            CUDA_RPATHS+=('$ORIGIN/../../nvidia/cu13/lib')
-        else
-            CUDA_RPATHS+=(
-                '$ORIGIN/../../nvidia/cublas/lib'
-                '$ORIGIN/../../nvidia/cuda_cupti/lib'
-                '$ORIGIN/../../nvidia/cuda_nvrtc/lib'
-                '$ORIGIN/../../nvidia/cuda_runtime/lib'
-                '$ORIGIN/../../nvidia/cufft/lib'
-                '$ORIGIN/../../nvidia/curand/lib'
-                '$ORIGIN/../../nvidia/cusolver/lib'
-                '$ORIGIN/../../nvidia/cusparse/lib'
-                '$ORIGIN/../../cusparselt/lib'
-                '$ORIGIN/../../nvidia/nvtx/lib'
-                '$ORIGIN/../../nvidia/cufile/lib'
-            )
-        fi
-
        CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
        export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
        export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
--- a/.ci/manywheel/build_xpu.sh
+++ b/.ci/manywheel/build_xpu.sh
@ -25,7 +25,6 @@ source /opt/intel/oneapi/mpi/latest/env/vars.sh
 export USE_STATIC_MKL=1
 export USE_ONEMKL=1
 export USE_XCCL=1
-export USE_MPI=0

 WHEELHOUSE_DIR="wheelhousexpu"
 LIBTORCH_HOUSE_DIR="libtorch_housexpu"
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -50,6 +50,9 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
  export ATEN_THREADING=NATIVE
 fi

+# Enable LLVM dependency for TensorExpr testing
+export USE_LLVM=/opt/llvm
+export LLVM_DIR=/opt/llvm/lib/cmake/llvm

 if ! which conda; then
  # In ROCm CIs, we are doing cross compilation on build machines with
@ -92,27 +95,6 @@ if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
  export ACL_ROOT_DIR=/ComputeLibrary
 fi

-if [[ "$BUILD_ENVIRONMENT" == *riscv64* ]]; then
-  if [[ -f /opt/riscv-cross-env/bin/activate ]]; then
-    # shellcheck disable=SC1091
-    source /opt/riscv-cross-env/bin/activate
-  else
-    echo "Activation file not found"
-    exit 1
-  fi
-
-  export CMAKE_CROSSCOMPILING=TRUE
-  export CMAKE_SYSTEM_NAME=Linux
-  export CMAKE_SYSTEM_PROCESSOR=riscv64
-
-  export USE_CUDA=0
-  export USE_MKLDNN=0
-
-  export SLEEF_TARGET_EXEC_USE_QEMU=ON
-  sudo chown -R jenkins /var/lib/jenkins/workspace /opt
-
-fi
-
 if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then
  POSSIBLE_JAVA_HOMES=()
  POSSIBLE_JAVA_HOMES+=(/usr/local)
@ -173,7 +155,6 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  source /opt/intel/oneapi/mpi/latest/env/vars.sh
  # Enable XCCL build
  export USE_XCCL=1
-  export USE_MPI=0
  # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
  export USE_KINETO=0
  export TORCH_XPU_ARCH_LIST=pvc
@ -195,16 +176,8 @@ fi

 # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
 # memory to build and will OOM
-
 if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then
-  J=2  # default to 2 jobs
-  case "$RUNNER" in
-    linux.12xlarge.memory|linux.24xlarge.memory)
-      J=24
-      ;;
-  esac
-  echo "Building FlashAttention with job limit $J"
-  export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j ${J}"
+  export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2"
 fi

 if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
@ -219,6 +192,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
  export USE_ASAN=1
  export REL_WITH_DEB_INFO=1
  export UBSAN_FLAGS="-fno-sanitize-recover=all"
+  unset USE_LLVM
 fi

 if [[ "${BUILD_ENVIRONMENT}" == *no-ops* ]]; then
@ -239,7 +213,7 @@ fi

 # Do not change workspace permissions for ROCm and s390x CI jobs
 # as it can leave workspace with bad permissions for cancelled jobs
-if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && -d /var/lib/jenkins/workspace ]]; then
+if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
  # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
  WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
  cleanup_workspace() {
@ -284,7 +258,8 @@ else
    # XLA test build fails when WERROR=1
    # set only when building other architectures
    # or building non-XLA tests.
-    if [[ "$BUILD_ENVIRONMENT" != *rocm*  && "$BUILD_ENVIRONMENT" != *xla* && "$BUILD_ENVIRONMENT" != *riscv64* ]]; then
+    if [[ "$BUILD_ENVIRONMENT" != *rocm*  &&
+          "$BUILD_ENVIRONMENT" != *xla* ]]; then
      # Install numpy-2.0.2 for builds which are backward compatible with 1.X
      python -mpip install numpy==2.0.2

@ -421,7 +396,7 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];
  # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
  python tools/stats/export_test_times.py
 fi
-# don't do this for bazel or s390x or riscv64 as they don't use sccache
-if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
+# don't do this for bazel or s390x as they don't use sccache
+if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
  print_sccache_stats
 fi
--- a/.ci/pytorch/check_binary.sh
+++ b/.ci/pytorch/check_binary.sh
@ -300,3 +300,24 @@ except RuntimeError as e:
    exit 1
  fi
 fi
+
+###############################################################################
+# Check for C++ ABI compatibility with GCC 11 - GCC 13
+###############################################################################
+if [[ "$(uname)" == 'Linux' &&  "$PACKAGE_TYPE" == 'manywheel' ]]; then
+  pushd /tmp
+  # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html
+  # gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19
+  # gcc 11 - CUDA 11.8, xpu, rocm
+  # gcc 13 - CUDA 12.6, 12.8 and cpu
+  # Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426
+  if [[ "$(uname -m)" == "s390x" ]]; then
+    cxx_abi="19"
+  elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then
+    cxx_abi="18"
+  else
+    cxx_abi="16"
+  fi
+  python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)"
+  popd
+fi
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -149,19 +149,6 @@ function get_pinned_commit() {
  cat .github/ci_commit_pins/"${1}".txt
 }

-function detect_cuda_arch() {
-  if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
-    if command -v nvidia-smi; then
-      TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1)
-    elif [[ "${TEST_CONFIG}" == *nogpu* ]]; then
-      # There won't be nvidia-smi in nogpu tests, so just set TORCH_CUDA_ARCH_LIST to the default
-      # minimum supported value here
-      TORCH_CUDA_ARCH_LIST=8.0
-    fi
-    export TORCH_CUDA_ARCH_LIST
-  fi
-}
-
 function install_torchaudio() {
  local commit
  commit=$(get_pinned_commit audio)
--- a/.ci/pytorch/macos-build.sh
+++ b/.ci/pytorch/macos-build.sh
@ -35,10 +35,11 @@ fi

 print_cmake_info
 if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
-  USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
+  # Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
+  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
 else
-  # NB: we always build with distributed; USE_DISTRIBUTED turns off all
-  # backends (specifically the gloo backend), so test that this case works too
+  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
+  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
 fi
 if which sccache > /dev/null; then
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -13,13 +13,9 @@ if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available(
 fi
 popd

-python -mpip install -r requirements.txt
-
 # enable debug asserts in serialization
 export TORCH_SERIALIZATION_DEBUG=1

-python -mpip install --no-input -r requirements.txt
-
 setup_test_python() {
  # The CircleCI worker hostname doesn't resolve to an address.
  # This environment variable makes ProcessGroupGloo default to
@ -178,15 +174,10 @@ checkout_install_torchbench() {
    # to install and test other models
    python install.py --continue_on_fail
  fi
-  popd
-
-  pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt
-  # https://github.com/pytorch/pytorch/issues/160689 to remove torchao because
-  # its current version 0.12.0 doesn't work with transformers 4.54.0
-  pip uninstall -y torchao

  echo "Print all dependencies after TorchBench is installed"
  python -mpip freeze
+  popd
 }

 torchbench_setup_macos() {
@ -306,47 +297,6 @@ test_torchbench_smoketest() {
    fi

  done
-  echo "Pytorch benchmark on mps device completed"
-}
-
-test_aoti_torchbench_smoketest() {
-  print_cmake_info
-
-  echo "Launching AOTInductor torchbench setup"
-  pip_benchmark_deps
-  # shellcheck disable=SC2119,SC2120
-  torchbench_setup_macos
-
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-
-  local device=mps
-  local dtypes=(undefined float16 bfloat16 notset)
-  local dtype=${dtypes[$1]}
-  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
-
-  echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"
-  local dtype_arg="--${dtype}"
-  if [ "$dtype" == notset ]; then
-      dtype_arg="--float32"
-  fi
-  touch "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv"
-  for model in "${models[@]}"; do
-    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-      --performance --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
-      --output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_performance.csv" || true
-    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-      --accuracy --only "$model" --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
-      --output "$TEST_REPORTS_DIR/aot_inductor_torchbench_${dtype}_inference_${device}_accuracy.csv" || true
-  done
-
-  echo "Launching HuggingFace inference performance run for AOT Inductor and dtype ${dtype}"
-  PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
-    --performance --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
-    --output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_performance.csv" || true
-  PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
-    --accuracy --export-aot-inductor --inference --devices "$device" "$dtype_arg" \
-    --output "$TEST_REPORTS_DIR/aot_inductor_huggingface_${dtype}_inference_${device}_accuracy.csv" || true

  echo "Pytorch benchmark on mps device completed"
 }
@ -395,8 +345,6 @@ elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then
  test_timm_perf
 elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then
  test_torchbench_smoketest "${SHARD_NUMBER}"
-elif [[ $TEST_CONFIG == *"aot_inductor_perf_smoketest"* ]]; then
-  test_aoti_torchbench_smoketest "${SHARD_NUMBER}"
 elif [[ $TEST_CONFIG == *"mps"* ]]; then
  test_python_mps
 elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then
--- a/.ci/pytorch/multigpu-test.sh
+++ b/.ci/pytorch/multigpu-test.sh
@ -45,7 +45,6 @@ if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
    # DTensor tests
    time python test/run_test.py --verbose -i distributed/tensor/test_random_ops
    time python test/run_test.py --verbose -i distributed/tensor/test_dtensor_compile
-    time python test/run_test.py --verbose -i distributed/tensor/test_utils.py

    # DeviceMesh test
    time python test/run_test.py --verbose -i distributed/test_device_mesh
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -91,7 +91,6 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  export VALGRIND=OFF
 fi

-detect_cuda_arch

 if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then
  # There are additional warnings on s390x, maybe due to newer gcc.
@ -496,14 +495,6 @@ test_inductor_cpp_wrapper_shard() {
    -k 'take' \
    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose
-
-  if [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
-    python test/run_test.py \
-      --include inductor/test_mkldnn_pattern_matcher \
-      -k 'xpu' \
-      --shard "$1" "$NUM_TEST_SHARDS" \
-      --verbose
-  fi
 }

 # "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@ -1060,10 +1051,20 @@ test_libtorch_api() {
    mkdir -p $TEST_REPORTS_DIR

    OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" "$TORCH_BIN_DIR"/test_api --gtest_filter='-IMethodTest.*' --gtest_output=xml:$TEST_REPORTS_DIR/test_api.xml
+    "$TORCH_BIN_DIR"/test_tensorexpr --gtest_output=xml:$TEST_REPORTS_DIR/test_tensorexpr.xml
  else
    # Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
    OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"

+    # On s390x, pytorch is built without llvm.
+    # Even if it were built with llvm, llvm currently doesn't support the features used on s390x, and
+    # the test fails with errors like:
+    # JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
+    # unknown file: Failure
+    # C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
+    if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
+      python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
+    fi
  fi

  # quantization is not fully supported on s390x yet
@ -1638,10 +1639,6 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then
  install_torchvision
  build_xla
  test_xla
-elif [[ "$TEST_CONFIG" == *vllm* ]]; then
-    echo "vLLM CI uses TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST"
-    (cd .ci/lumen_cli && python -m pip install -e .)
-    python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS"
 elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
  test_executorch
 elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
@ -1695,6 +1692,7 @@ elif [[ "${TEST_CONFIG}" == verify_cachebench ]]; then
 elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
  install_torchaudio
  install_torchvision
+  install_torchao
  id=$((SHARD_NUMBER-1))
  # https://github.com/opencv/opencv-python/issues/885
  pip_install opencv-python==4.8.0.74
--- a/.ci/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat
@ -61,10 +61,9 @@ if "%USE_XPU%"=="1" (
  call "C:\Program Files (x86)\Intel\oneAPI\compiler\latest\env\vars.bat"
  call "C:\Program Files (x86)\Intel\oneAPI\ocloc\latest\env\vars.bat"
  if errorlevel 1 exit /b 1
-  :: Reduce build time
-  SET TORCH_XPU_ARCH_LIST=bmg
-  :: Re-setup python env for build
-  call pip install -r requirements.txt
+  :: Reduce build time. Only have MTL self-hosted runner now
+  SET TORCH_XPU_ARCH_LIST=xe-lpg
+  SET USE_KINETO=0
 )

@echo on
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -44,7 +44,7 @@ python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==
 python -m pip install z3-solver==4.15.1.0

 # Install tlparse for test\dynamo\test_structured_trace.py UTs.
-python -m pip install tlparse==0.4.0
+python -m pip install tlparse==0.3.30

 # Install parameterized
 python -m pip install parameterized==0.8.1
--- a/.ci/pytorch/windows/cuda126.bat
+++ b/.ci/pytorch/windows/cuda126.bat
@ -37,7 +37,7 @@ IF "%CUDA_PATH_V126%"=="" (
 )

 IF "%BUILD_VISION%" == "" (
-    set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0
+    set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0
    set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
 ) ELSE (
    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90
--- a/.ci/pytorch/windows/cuda130.bat
+++ b/.ci/pytorch/windows/cuda130.bat
@ -1,59 +0,0 @@
-@echo off
-
-set MODULE_NAME=pytorch
-
-IF NOT EXIST "setup.py" IF NOT EXIST "%MODULE_NAME%" (
-    call internal\clone.bat
-    cd %~dp0
-) ELSE (
-    call internal\clean.bat
-)
-IF ERRORLEVEL 1 goto :eof
-
-call internal\check_deps.bat
-IF ERRORLEVEL 1 goto :eof
-
-REM Check for optional components
-
-set USE_CUDA=
-set CMAKE_GENERATOR=Visual Studio 15 2017 Win64
-
-IF "%NVTOOLSEXT_PATH%"=="" (
-    IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib"  (
-        set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
-    ) ELSE (
-        echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing
-        exit /b 1
-    )
-)
-
-IF "%CUDA_PATH_V130%"=="" (
-    IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\nvcc.exe" (
-        set "CUDA_PATH_V130=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0"
-    ) ELSE (
-        echo CUDA 13.0 not found, failing
-        exit /b 1
-    )
-)
-
-IF "%BUILD_VISION%" == "" (
-    set TORCH_CUDA_ARCH_LIST=7.5;8.0;8.6;9.0;10.0;12.0
-    set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
-) ELSE (
-    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
-)
-
-set "CUDA_PATH=%CUDA_PATH_V130%"
-set "PATH=%CUDA_PATH_V130%\bin;%PATH%"
-
-:optcheck
-
-call internal\check_opts.bat
-IF ERRORLEVEL 1 goto :eof
-
-if exist "%NIGHTLIES_PYTORCH_ROOT%" cd %NIGHTLIES_PYTORCH_ROOT%\..
-call  %~dp0\internal\copy.bat
-IF ERRORLEVEL 1 goto :eof
-
-call  %~dp0\internal\setup.bat
-IF ERRORLEVEL 1 goto :eof
--- a/.ci/pytorch/windows/internal/cuda_install.bat
+++ b/.ci/pytorch/windows/internal/cuda_install.bat
@ -26,7 +26,6 @@ if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%
 if %CUDA_VER% EQU 126 goto cuda126
 if %CUDA_VER% EQU 128 goto cuda128
 if %CUDA_VER% EQU 129 goto cuda129
-if %CUDA_VER% EQU 130 goto cuda130

 echo CUDA %CUDA_VERSION_STR% is not supported
 exit /b 1
@ -114,33 +113,6 @@ xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"

 goto cuda_common

-:cuda130
-
-set CUDA_INSTALL_EXE=cuda_13.0.0_windows.exe
-if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
-    curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore
-    if errorlevel 1 exit /b 1
-    set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
-    set "ARGS="
-)
-
-set CUDNN_FOLDER=cudnn-windows-x86_64-9.12.0.46_cuda13-archive
-set CUDNN_LIB_FOLDER="lib"
-set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
-if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
-    curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore
-    if errorlevel 1 exit /b 1
-    set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
-)
-
-@REM cuDNN 8.3+ required zlib to be installed on the path
-echo Installing ZLIB dlls
-curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
-7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib"
-xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"
-
-goto cuda_common
-
 :cuda_common
 :: NOTE: We only install CUDA if we don't have it installed already.
 :: With GHA runners these should be pre-installed as part of our AMI process
--- a/.ci/pytorch/windows/internal/install_python.bat
+++ b/.ci/pytorch/windows/internal/install_python.bat
@ -1,22 +1,12 @@
 set ADDITIONAL_OPTIONS=""
 set PYTHON_EXEC="python"
-
-
 if "%DESIRED_PYTHON%" == "3.13t" (
    echo Python version is set to 3.13t
    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
    set ADDITIONAL_OPTIONS="Include_freethreaded=1"
    set PYTHON_EXEC="python3.13t"
-) else if "%DESIRED_PYTHON%"=="3.14" (
-    echo Python version is set to 3.14 or 3.14t
-    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
-) else if "%DESIRED_PYTHON%"=="3.14t" (
-    echo Python version is set to 3.14 or 3.14t
-    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
-    set ADDITIONAL_OPTIONS="Include_freethreaded=1"
-    set PYTHON_EXEC="python3.14t"
 ) else (
-    echo Python version is set to %DESIRED_PYTHON%
+    echo DESIRED_PYTHON not defined, Python version is set to %DESIRED_PYTHON%
    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/%DESIRED_PYTHON%.0/python-%DESIRED_PYTHON%.0-amd64.exe" %= @lint-ignore =%
 )

--- a/.ci/pytorch/windows/internal/xpu_install.bat
+++ b/.ci/pytorch/windows/internal/xpu_install.bat
@ -13,9 +13,9 @@ if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"
 :xpu_bundle_install_start

 set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI
-set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
+set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe
 set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product
-set XPU_BUNDLE_VERSION=2025.1.3+5
+set XPU_BUNDLE_VERSION=2025.0.1+20
 set XPU_BUNDLE_INSTALLED=0
 set XPU_BUNDLE_UNINSTALL=0
 set XPU_EXTRA_URL=NULL
@ -24,9 +24,9 @@ set XPU_EXTRA_VERSION=2025.0.1+1226
 set XPU_EXTRA_INSTALLED=0
 set XPU_EXTRA_UNINSTALL=0

-if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.2] (
-    set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/24751ead-ddc5-4479-b9e6-f9fe2ff8b9f2/intel-deep-learning-essentials-2025.2.1.25_offline.exe
-    set XPU_BUNDLE_VERSION=2025.2.1+20
+if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.1] (
+    set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/75d4eb97-914a-4a95-852c-7b9733d80f74/intel-deep-learning-essentials-2025.1.3.8_offline.exe
+    set XPU_BUNDLE_VERSION=2025.1.3+5
 )

 :: Check if XPU bundle is target version or already installed
@ -90,3 +90,14 @@ if errorlevel 1 exit /b 1
 del xpu_extra.exe

 :xpu_install_end
+
+if not "%XPU_ENABLE_KINETO%"=="1" goto install_end
+:: Install Level Zero SDK
+set XPU_EXTRA_LZ_URL=https://github.com/oneapi-src/level-zero/releases/download/v1.14.0/level-zero-sdk_1.14.0.zip
+curl -k -L %XPU_EXTRA_LZ_URL% --output "%SRC_DIR%\temp_build\level_zero_sdk.zip"
+echo "Installing level zero SDK..."
+7z x "%SRC_DIR%\temp_build\level_zero_sdk.zip" -o"%SRC_DIR%\temp_build\level_zero"
+set "INCLUDE=%SRC_DIR%\temp_build\level_zero\include;%INCLUDE%"
+del "%SRC_DIR%\temp_build\level_zero_sdk.zip"
+
+:install_end
--- a/.ci/pytorch/windows/setup_build.bat
+++ b/.ci/pytorch/windows/setup_build.bat
@ -7,8 +7,6 @@ call "internal\install_python.bat"

 %PYTHON_EXEC% --version
 set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%"
-if "%DESIRED_PYTHON%" == "3.14t" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake
-if "%DESIRED_PYTHON%" == "3.14" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake
 if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake
 if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake
 if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake
--- a/.ci/wheel/build_wheel.sh
+++ b/.ci/wheel/build_wheel.sh
@ -128,35 +128,16 @@ export MACOSX_DEPLOYMENT_TARGET=10.15
 export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}

 SETUPTOOLS_PINNED_VERSION="==70.1.0"
-PYYAML_PINNED_VERSION="==5.3"
+PYYAML_PINNED_VERSION="=5.3"
 EXTRA_CONDA_INSTALL_FLAGS=""
 CONDA_ENV_CREATE_FLAGS=""
 RENAME_WHEEL=true
 case $desired_python in
-    3.14t)
-        echo "Using 3.14 deps"
-        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
-        PYYAML_PINNED_VERSION=">=6.0.1"
-        NUMPY_PINNED_VERSION="==2.1.0"
-        CONDA_ENV_CREATE_FLAGS="python-freethreading"
-        EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-        desired_python="3.14.0rc1"
-        RENAME_WHEEL=false
-        ;;
-    3.14)
-        echo "Using 3.14t deps"
-        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
-        PYYAML_PINNED_VERSION=">=6.0.1"
-        NUMPY_PINNED_VERSION="==2.1.0"
-        EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-        desired_python="3.14.0rc1"
-        RENAME_WHEEL=false
-        ;;
    3.13t)
        echo "Using 3.13 deps"
        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
        PYYAML_PINNED_VERSION=">=6.0.1"
-        NUMPY_PINNED_VERSION="==2.1.0"
+        NUMPY_PINNED_VERSION="=2.1.0"
        CONDA_ENV_CREATE_FLAGS="python-freethreading"
        EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
        desired_python="3.13"
@ -166,35 +147,35 @@ case $desired_python in
        echo "Using 3.13 deps"
        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
        PYYAML_PINNED_VERSION=">=6.0.1"
-        NUMPY_PINNED_VERSION="==2.1.0"
+        NUMPY_PINNED_VERSION="=2.1.0"
        ;;
    3.12)
        echo "Using 3.12 deps"
        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
        PYYAML_PINNED_VERSION=">=6.0.1"
-        NUMPY_PINNED_VERSION="==2.0.2"
+        NUMPY_PINNED_VERSION="=2.0.2"
        ;;
    3.11)
        echo "Using 3.11 deps"
        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
        PYYAML_PINNED_VERSION=">=5.3"
-        NUMPY_PINNED_VERSION="==2.0.2"
+        NUMPY_PINNED_VERSION="=2.0.2"
        ;;
    3.10)
        echo "Using 3.10 deps"
        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
        PYYAML_PINNED_VERSION=">=5.3"
-        NUMPY_PINNED_VERSION="==2.0.2"
+        NUMPY_PINNED_VERSION="=2.0.2"
        ;;
    3.9)
        echo "Using 3.9 deps"
        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
        PYYAML_PINNED_VERSION=">=5.3"
-        NUMPY_PINNED_VERSION="==2.0.2"
+        NUMPY_PINNED_VERSION="=2.0.2"
        ;;
    *)
        echo "Using default deps"
-        NUMPY_PINNED_VERSION="==1.11.3"
+        NUMPY_PINNED_VERSION="=1.11.3"
        ;;
 esac

@ -203,18 +184,12 @@ tmp_env_name="wheel_py$python_nodot"
 conda create ${EXTRA_CONDA_INSTALL_FLAGS} -yn "$tmp_env_name" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS}
 source activate "$tmp_env_name"

-PINNED_PACKAGES=(
-    "setuptools${SETUPTOOLS_PINNED_VERSION}"
-    "pyyaml${PYYAML_PINNED_VERSION}"
-    "numpy${NUMPY_PINNED_VERSION}"
-)
-retry pip install "${PINNED_PACKAGES[@]}" -r "${pytorch_rootdir}/requirements-build.txt"
-pip install requests ninja typing-extensions
+retry pip install -r "${pytorch_rootdir}/requirements-build.txt"
+pip install "numpy=${NUMPY_PINNED_VERSION}"  "pyyaml${PYYAML_PINNED_VERSION}" requests ninja "setuptools${SETUPTOOLS_PINNED_VERSION}" typing-extensions
 retry pip install -r "${pytorch_rootdir}/requirements.txt" || true
 retry brew install libomp

-# For USE_DISTRIBUTED=1 on macOS, this enables gloo, which needs libuv, which
-# is build as part of tensorpipe submodule
+# For USE_DISTRIBUTED=1 on macOS, need libuv, which is build as part of tensorpipe submodule
 export USE_DISTRIBUTED=1

 export USE_MKLDNN=OFF
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@ -75,8 +75,8 @@ TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
 # Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
 TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"

-# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
-if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
+# CUDA 12.9 builds have triton for Linux and Linux aarch64 binaries.
+if [[ "$DESIRED_CUDA" == "cu129" ]]; then
  TRITON_CONSTRAINT="platform_system == 'Linux'"
 fi

--- a/.circleci/scripts/binary_upload.sh
+++ b/.circleci/scripts/binary_upload.sh
@ -51,12 +51,16 @@ s3_upload() {
    s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/"
  fi
  (
+    cache_control_flag=""
+    if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then
+      cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'"
+    fi
    for pkg in ${PKG_DIR}/*.${extension}; do
      (
        set -x
        shm_id=$(sha256sum "${pkg}" | awk '{print $1}')
        ${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \
-          --metadata "checksum-sha256=${shm_id}"
+          --metadata "checksum-sha256=${shm_id}" ${cache_control_flag}
      )
    done
  )
--- a/.circleci/scripts/binary_windows_build.sh
+++ b/.circleci/scripts/binary_windows_build.sh
@ -15,7 +15,8 @@ fi
 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
    export VC_YEAR=2022
    export USE_SCCACHE=0
-    export XPU_VERSION=2025.2
+    export XPU_VERSION=2025.1
+    export XPU_ENABLE_KINETO=1
 fi

 echo "Free space on filesystem before build:"
--- a/.circleci/scripts/binary_windows_test.sh
+++ b/.circleci/scripts/binary_windows_test.sh
@ -8,7 +8,7 @@ export VC_YEAR=2022

 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
    export VC_YEAR=2022
-    export XPU_VERSION=2025.2
+    export XPU_VERSION=2025.1
 fi

 pushd "$PYTORCH_ROOT/.ci/pytorch/"
--- a/.flake8
+++ b/.flake8
@ -48,7 +48,6 @@ per-file-ignores =
    torch/__init__.py: F401,TOR901
    torch/_custom_op/impl.py: TOR901
    torch/_export/serde/upgrade.py: TOR901
-    torch/_functorch/predispatch.py: TOR901
    torch/_functorch/vmap.py: TOR901
    torch/_inductor/test_operators.py: TOR901
    torch/_library/abstract_impl.py: TOR901
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -54,7 +54,6 @@ self-hosted-runner:
    - linux.rocm.gpu.2
    - linux.rocm.gpu.4
    # gfx942 runners
-    - linux.rocm.gpu.gfx942.1
    - linux.rocm.gpu.gfx942.2
    - linux.rocm.gpu.gfx942.4
    - rocm-docker
--- a/.github/actions/build-external-packages/action.yml
+++ b/.github/actions/build-external-packages/action.yml
@ -1,81 +0,0 @@
-# .github/workflows/build-external.yml
-name: Build External packages
-
-description: build external packages for PyTorch
-
-inputs:
-  cuda-arch-list:
-    description: TORCH_CUDA_ARCH_LIST (e.g., "8.0;8.9;9.0")
-    type: string
-    required: true
-    default: ""
-  docker-image:
-    description: Base image to use
-    type: string
-    required: true
-  build-targets:
-    description: Build targets
-    type: string
-    required: true
-  torch-wheel-dir:
-    description: Directory to built torch wheel
-    type: string
-    required: false
-    default: dist
-  output-dir:
-    description: Directory to store build artifact
-    default: external
-    type: string
-    required: false
-
-outputs:
-  build_time:
-    description: "Total build time in seconds"
-    value: ${{ steps.build-external.outputs.build_time }}
-  output_dir:
-    description: "Directory where build artifact is stored"
-    value: ${{ steps.build-external.outputs.output_dir }}
-
-runs:
-  using: composite
-  steps:
-    - name: Build external packages in sequence
-      id: build-external
-      env:
-        SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
-        SCCACHE_REGION: us-east-1
-        TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
-        BASE_IMAGE: ${{ inputs.docker-image }}
-        BUILD_TARGETS: ${{ inputs.build-targets }}
-        PARENT_OUTPUT_DIR: ${{ inputs.output-dir}}
-
-      shell: bash
-      run: |
-        set -euo pipefail
-        python3 --version
-        docker images
-        START_TIME=$(date +%s)
-        (
-          cd .ci/lumen_cli
-          python3 -m pip install -e .
-        )
-        MAX_JOBS="$(nproc --ignore=6)"
-        export MAX_JOBS
-
-        # Split the comma-separated list and build each target
-        IFS=',' read -ra TARGETS <<< "$BUILD_TARGETS"
-        for target in "${TARGETS[@]}"; do
-          OUTPUT_DIR="$PARENT_OUTPUT_DIR/$target"
-          export OUTPUT_DIR
-          echo "Building external package: $target in directory $OUTPUT_DIR"
-          python3 -m cli.run build external "$target"
-
-        done
-
-        END_TIME=$(date +%s)
-        {
-          echo "build_time=$((END_TIME - START_TIME))"
-          if [ -d "$PARENT_OUTPUT_DIR" ]; then
-            echo "output_dir=$PARENT_OUTPUT_DIR"
-          fi
-        } >> "$GITHUB_OUTPUT"
--- a/.github/actions/checkout-pytorch/action.yml
+++ b/.github/actions/checkout-pytorch/action.yml
@ -57,21 +57,6 @@ runs:
        submodules: ${{ inputs.submodules }}
        show-progress: false

-    - name: Clean submodules post checkout
-      id: clean-submodules
-      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
-      shell: bash
-      env:
-        NO_SUDO: ${{ inputs.no-sudo }}
-      run: |
-        cd "${GITHUB_WORKSPACE}"
-        # Clean stale submodule dirs
-        if [ -z "${NO_SUDO}" ]; then
-          sudo git submodule foreach --recursive git clean -ffdx
-        else
-          git submodule foreach --recursive git clean -ffdx
-        fi
-
    - name: Clean workspace (try again)
      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' &&
        (steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }}
--- a/.github/actions/setup-rocm/action.yml
+++ b/.github/actions/setup-rocm/action.yml
@ -59,6 +59,11 @@ runs:
            echo "$msg"
            exit 1
        fi
+        if [[ $ngpu -eq 1 ]]; then
+            echo "Error: only 1 GPU detected, at least 2 GPUs are needed for distributed jobs"
+            echo "$msg"
+            exit 1
+        fi

    - name: Runner diskspace health check
      uses: pytorch/pytorch/.github/actions/diskspace-cleanup@main
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-0757bbb660855272f7dd8d31cc84e7c631522805
+0c22347335f4c9a5b92a2f5bad65e05e2464c184
--- a/.github/ci_commit_pins/vllm.txt
+++ b/.github/ci_commit_pins/vllm.txt
@ -1 +1 @@
-862f2ef893d9751db0a92bd2d4ae0e3d9677872f
+7e3a8dc90670fd312ce1e0d4eba9bf11c571e3ad
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-763e5b78d4fcd74a9e812256656c075f99d9a781
+b6a5b82b9948b610fa4c304d0d869c82b8f17db1
--- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm
+++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm
@ -1,439 +0,0 @@
-# TODO(elainwy): remove this file after the torch nightly dockerfile is in sync in vllm repo
-# The vLLM Dockerfile is used to construct vLLM image against torch nightly and torch main that can be directly used for testing
-
-ARG CUDA_VERSION=12.8.1
-ARG PYTHON_VERSION=3.12
-
-# BUILD_BASE_IMAGE: used to setup python build xformers, and vllm wheels, It can be replaced with a different base image from local machine,
-# by default, it uses the torch-nightly-base stage from this docker image
-ARG BUILD_BASE_IMAGE=torch-nightly-base
-
-# FINAL_BASE_IMAGE: used to set up vllm-instaled environment and build flashinfer,
-# by default, it uses devel-ubuntu22.04 official image.
-ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
-
-
-#################### TORCH NIGHTLY  BASE IMAGE ####################
-# A base image for building vLLM with devel ubuntu 22.04, this is mainly used to build vllm in vllm builtkite ci
-From nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base
-ARG CUDA_VERSION=12.8.1
-ARG PYTHON_VERSION=3.12
-ARG TARGETPLATFORM
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
-    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
-
-# Install Python and other dependencies if it does not existed
-RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \
-      echo "Installing Python ${PYTHON_VERSION}..." && \
-      echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \
-      echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \
-      apt-get update -y && \
-      apt-get install -y ccache software-properties-common git curl sudo && \
-      for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-      done && \
-      apt-get update -y && \
-      apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \
-      update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \
-      update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \
-      ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \
-      curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \
-   else \
-      echo "Python ${PYTHON_VERSION} already present, skipping setup."; \
-   fi \
-   && python3 --version && python3 -m pip --version
-
-# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
-# as it was causing spam when compiling the CUTLASS kernels
-# Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519)
-RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) && \
-    if [ "$current_gcc_version" -lt 10 ]; then \
-      echo "GCC version is $current_gcc_version, installing gcc-10..."; \
-      apt-get update && \
-      apt-get install -y gcc-10 g++-10 && \
-      update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 && \
-      update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \
-    else \
-      echo "GCC version is $current_gcc_version, no need to install gcc-10."; \
-    fi && \
-    gcc --version && g++ --version
-
-# install uv for faster pip installs
-RUN --mount=type=cache,target=/root/.cache/uv \
-    python3 -m pip install uv==0.8.4
-
-ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
-ENV UV_LINK_MODE=copy
-
-#################### TORCH NIGHTLY  BASE IMAGE ####################
-
-
-#################### BASE BUILD IMAGE ####################
-# A base image for building vLLM with torch nightly or torch wheels
-# prepare basic build environment
-FROM ${BUILD_BASE_IMAGE} AS base
-USER root
-
-# Workaround for https://github.com/openai/triton/issues/2507 and
-# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
-# this won't be needed for future versions of this docker image
-# or future versions of triton.
-RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
-
-# Install uv for faster pip installs if not existed
-RUN --mount=type=cache,target=/root/.cache/uv \
-    if ! python3 -m uv --version >/dev/null 2>&1; then \
-        python3 -m pip install uv==0.8.4; \
-    fi
-ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
-ENV UV_LINK_MODE=copy
-
-WORKDIR /workspace
-
-# install build and runtime dependencies
-COPY requirements/common.txt requirements/common.txt
-COPY use_existing_torch.py use_existing_torch.py
-COPY pyproject.toml pyproject.toml
-
-# install build and runtime dependencies without stable torch version
-RUN python3 use_existing_torch.py
-
-# default mount file as placeholder, this just avoid the mount error
-# change to a different vllm folder if this does not exist anymore
-ARG TORCH_WHEELS_PATH="./requirements"
-ARG PINNED_TORCH_VERSION
-
-# Install torch, torchaudio and torchvision based on the input
-# if TORCH_WHEELS_PATH is default "./requirements", it will pull thethe nightly versions using pip
-# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine
-RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
-    --mount=type=cache,target=/root/.cache/uv \
-    if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
-        echo "[INFO] Installing torch wheels to build vllm"; \
-        torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \
-        vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \
-        audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \
-        uv pip install --system "${torch_whl}[opt-einsum]"; \
-        uv pip install --system "${vision_whl}"; \
-        uv pip install --system "${audio_whl}"; \
-    elif [ -n "$PINNED_TORCH_VERSION" ]; then \
-        echo "[INFO] Installing pinned torch nightly version to build vllm: $PINNED_TORCH_VERSION"; \
-        uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu128; \
-    else \
-        echo "[INFO] Installing torch nightly with latest one to build vllm"; \
-        uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128; \
-    fi
-
-# Install numba 0.61.2 for cuda environment
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system numba==0.61.2
-
-# Install common dependencies from vllm common.txt
-RUN --mount=type=cache,target=/root/.cache/uv \
-uv pip install --system -r requirements/common.txt
-
-
-# Must put before installing xformers, so it can install the correct version of xfomrers.
-ARG exformer_cuda_arch_list='7.5;8.0+PTX;9.0a'
-ENV TORCH_CUDA_ARCH_LIST=${exformer_cuda_arch_list}
-
-ARG max_jobs=16
-ENV MAX_JOBS=${max_jobs}
-
-RUN echo ${TORCH_CUDA_ARCH_LIST}
-RUN echo ${MAX_JOBS}
-RUN pip freeze | grep -E 'ninja'
-
-# Build xformers with cuda and torch nightly/wheel
-# following official xformers guidance: https://github.com/facebookresearch/xformers#build
-# sha for https://github.com/facebookresearch/xformers/tree/v0.0.31
-ARG XFORMERS_COMMIT=eb0946a363464da96ea40afd1a7f72a907c25497
-ENV CCACHE_DIR=/root/.cache/ccache
-
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/uv \
-    echo 'git clone xformers...' \
-    && git clone https://github.com/facebookresearch/xformers.git --recursive \
-    && cd xformers \
-    && git checkout ${XFORMERS_COMMIT} \
-    && git submodule update --init --recursive \
-    && echo 'finish git clone xformers...' \
-    && rm -rf build \
-    && python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
-    && cd .. \
-    && rm -rf xformers
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system xformers-dist/*.whl --verbose
-
-# Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
-# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
-RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
-
-RUN cat torch_build_versions.txt
-RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'
-
-#################### BASE BUILD IMAGE ####################
-
-
-#################### WHEEL BUILD IMAGE ####################
-# Image used to build vllm wheel
-FROM base AS build
-ARG TARGETPLATFORM
-
-ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
-ENV UV_LINK_MODE=copy
-
-COPY . .
-
-RUN python3 use_existing_torch.py
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/build.txt
-
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
-
-# Max jobs used by Ninja to build extensions
-ARG max_jobs=16
-ENV MAX_JOBS=${max_jobs}
-ARG nvcc_threads=4
-ENV NVCC_THREADS=$nvcc_threads
-ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
-ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
-
-ARG USE_SCCACHE
-ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
-ARG SCCACHE_REGION_NAME=us-west-2
-ARG SCCACHE_S3_NO_CREDENTIALS=0
-
-# if USE_SCCACHE is set, use sccache to speed up compilation
-RUN --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=.git,target=.git \
-    if [ "$USE_SCCACHE" = "1" ]; then \
-        echo "Installing sccache..." \
-        && curl -L -o sccache.tar.gz https://github.com/mozilla/sccache/releases/download/v0.8.1/sccache-v0.8.1-x86_64-unknown-linux-musl.tar.gz \
-        && tar -xzf sccache.tar.gz \
-        && sudo mv sccache-v0.8.1-x86_64-unknown-linux-musl/sccache /usr/bin/sccache \
-        && rm -rf sccache.tar.gz sccache-v0.8.1-x86_64-unknown-linux-musl \
-        && export SCCACHE_BUCKET=${SCCACHE_BUCKET_NAME} \
-        && export SCCACHE_REGION=${SCCACHE_REGION_NAME} \
-        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
-        && export SCCACHE_IDLE_TIMEOUT=0 \
-        && export CMAKE_BUILD_TYPE=Release \
-        && export VLLM_DOCKER_BUILD_CONTEXT=1 \
-        && sccache --show-stats \
-        && python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38 \
-        && sccache --show-stats; \
-    fi
-
-ARG vllm_target_device="cuda"
-ENV VLLM_TARGET_DEVICE=${vllm_target_device}
-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/uv \
-    --mount=type=bind,source=.git,target=.git  \
-    if [ "$USE_SCCACHE" != "1" ]; then \
-        # Clean any existing CMake artifacts
-        rm -rf .deps && \
-        mkdir -p .deps && \
-        export VLLM_DOCKER_BUILD_CONTEXT=1 && \
-        python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \
-    fi
-
-RUN echo "[DEBUG] Listing  current directory:" && \
-    ls -al && \
-    echo "[DEBUG] Showing torch_build_versions.txt content:" && \
-    cat torch_build_versions.txt
-
-#################### WHEEL BUILD IMAGE ####################
-
-
-################### VLLM INSTALLED IMAGE ####################
-# Setup clean environment for vLLM for test and api server using ubuntu22.04 with AOT flashinfer
-FROM ${FINAL_BASE_IMAGE} AS vllm-base
-USER root
-# prepare for environment starts
-WORKDIR /workspace
-
-RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
-    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
-
-# Install Python and other dependencies if it does not existed
-RUN if ! command -v python3 >/dev/null || ! python3 --version | grep -q "${PYTHON_VERSION}"; then \
-      echo "Installing Python ${PYTHON_VERSION}..." && \
-      echo 'tzdata tzdata/Areas select America' | debconf-set-selections && \
-      echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections && \
-      apt-get update -y && \
-      apt-get install -y ccache software-properties-common git curl sudo && \
-      for i in 1 2 3; do \
-        add-apt-repository -y ppa:deadsnakes/ppa && break || \
-        { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
-      done && \
-      apt-get update -y && \
-      apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv && \
-      update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 && \
-      update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} && \
-      ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config && \
-      curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION}; \
-   else \
-      echo "Python ${PYTHON_VERSION} already present, skipping setup."; \
-   fi \
-   && python3 --version && python3 -m pip --version
-
-
-# Get the torch versions, and whls used in previous stagtes for consistency
-COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
-COPY --from=base /workspace/xformers-dist /wheels/xformers
-COPY --from=build /workspace/vllm-dist /wheels/vllm
-RUN echo "[DEBUG] Listing current directory before torch install step:" && \
-    ls -al && \
-    echo "[DEBUG] Showing torch_build_versions.txt content:" && \
-    cat torch_build_versions.txt
-
-# Workaround for https://github.com/openai/triton/issues/2507 and
-# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
-# this won't be needed for future versions of this docker image
-# or future versions of triton.
-RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
-
-
-# Install uv for faster pip installs if not existed
-RUN --mount=type=cache,target=/root/.cache/uv \
-    if ! python3 -m uv --version > /dev/null 2>&1; then \
-        python3 -m pip install uv==0.8.4; \
-    fi
-ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
-ENV UV_LINK_MODE=copy
-
-# Default mount file as placeholder, this just avoid the mount error
-ARG TORCH_WHEELS_PATH="./requirements"
-# Install torch, torchaudio and torchvision
-# if TORCH_WHEELS_PATH is default "./requirements", it will pull the nightly versions using pip using torch_build_versions.txt
-# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine
-RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
-    --mount=type=cache,target=/root/.cache/uv \
-    if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
-        torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \
-        vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \
-        audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \
-        echo "[INFO] Use wheels to build : '${torch_whl}' '${audio_whl}' '${vision_whl}'"; \
-        uv pip install --system "${torch_whl}[opt-einsum]"; \
-        uv pip install --system "${vision_whl}"; \
-        uv pip install --system "${audio_whl}"; \
-    else \
-        echo "[INFO] Installing torch versions from torch_build_versions.txt"; \
-        uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu128; \
-    fi
-
-# Install the vllm wheel from previous stage
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system /wheels/vllm/*.whl --verbose
-
-# Install xformers wheel from previous stage
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system /wheels/xformers/*.whl --verbose
-
-
-# Build flashinfer from source.
-ARG torch_cuda_arch_list='8.0;8.9;9.0a'
-# install package for build flashinfer
-# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
-
-RUN pip install build==1.3.0
-RUN pip freeze | grep -E 'setuptools|packaging|build'
-
-ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
-# Build flashinfer for torch nightly from source around 10 mins
-ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
-# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
-ARG FLASHINFER_GIT_REF="v0.2.14.post1"
-RUN --mount=type=cache,target=/root/.cache/uv \
-    git clone --depth 1 --recursive --shallow-submodules \
-        --branch ${FLASHINFER_GIT_REF} \
-        ${FLASHINFER_GIT_REPO} flashinfer \
-    && echo "Building FlashInfer with AOT for arches: ${torch_cuda_arch_list}" \
-    && cd flashinfer \
-    && python3 -m flashinfer.aot \
-    && python3 -m build --no-isolation --wheel --outdir ../wheels/flashinfer \
-    && cd .. \
-    && rm -rf flashinfer
-
-# install flashinfer python
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system wheels/flashinfer/*.whl --verbose
-
-# Logging to confirm the torch versions
-RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
-RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm\|^flashinfer' > build_summary.txt
-################### VLLM INSTALLED IMAGE ####################
-
-
-#################### UNITTEST IMAGE #############################
-FROM vllm-base as test
-
-ENV UV_HTTP_TIMEOUT=500
-ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
-ENV UV_LINK_MODE=copy
-
-COPY tests/ tests/
-COPY examples examples
-COPY benchmarks benchmarks
-COPY ./vllm/collect_env.py .
-COPY requirements/common.txt requirements/common.txt
-COPY use_existing_torch.py use_existing_torch.py
-COPY pyproject.toml pyproject.toml
-# Install build and runtime dependencies without stable torch version
-COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt
-
-RUN python3 use_existing_torch.py
-
-# install packages
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/common.txt
-# enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER 1
-
-# install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -e tests/vllm_test_utils
-
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system -r requirements/nightly_torch_test.txt
-
-# Workaround for #17068
-# pinned commit for v2.2.4
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@95d8aba8a8c75aedcaa6143713b11e745e7cd0d9#egg=mamba-ssm"
-
-# Logging to confirm the torch versions
-RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
-
-# Logging to confirm all the packages are installed
-RUN pip freeze
-
-#################### UNITTEST IMAGE #############################
-
-#################### EXPORT STAGE ####################
-FROM scratch as export-wheels
-
-# Just copy the wheels we prepared in previous stages
-COPY --from=base /workspace/xformers-dist /wheels/xformers
-COPY --from=build /workspace/vllm-dist /wheels/vllm
-COPY --from=vllm-base /workspace/build_summary.txt /wheels/build_summary.txt
-COPY --from=vllm-base /workspace/wheels/flashinfer /wheels/flashinfer-python
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -1,24 +0,0 @@
-version: 2
-updates:
-  # Update to the latest transformers version with dependabot
-  - package-ecosystem: "pip"
-    directory: "/.ci/docker/ci_commit_pins"
-    schedule:
-      interval: "daily"
-    target-branch: "main"
-    allow:
-      - dependency-name: "transformers"
-    ignore:
-      - dependency-name: "*"
-        update-types: ["version-update:semver-patch"]
-    commit-message:
-      prefix: "[Dependabot] Update"
-      include: "scope"
-    labels:
-      - "dependencies"
-      - "open source"
-      - "python"
-      - "topic: not user facing"
-      - "module: ci"
-      - "module: inductor"
-      - "ciflow/inductor"
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -22,12 +22,10 @@ ciflow_push_tags:
 - ciflow/rocm
 - ciflow/rocm-mi300
 - ciflow/s390
- ciflow/riscv64
 - ciflow/slow
 - ciflow/trunk
 - ciflow/unstable
 - ciflow/xpu
- ciflow/vllm
 - ciflow/torchbench
 - ciflow/op-benchmark
 - ciflow/pull
--- a/.github/requirements/conda-env-macOS-ARM64
+++ b/.github/requirements/conda-env-macOS-ARM64
@ -0,0 +1,5 @@
+# Not pinning certifi so that we can always get the latest certificates
+certifi
+pip=23.2.1
+pkg-config=0.29.2
+wheel=0.37.1
--- a/.github/requirements/pip-requirements-macOS.txt
+++ b/.github/requirements/pip-requirements-macOS.txt
@ -28,7 +28,7 @@ pyyaml==6.0.2
 scipy==1.12.0
 setuptools==72.1.0
 sympy==1.13.3
-tlparse==0.4.0
+tlparse==0.3.30
 tensorboard==2.13.0
 typing-extensions==4.12.2
 unittest-xml-reporting<=3.2.0,>=2.0.0
--- a/.github/scripts/amd/package_triton_wheel.sh
+++ b/.github/scripts/amd/package_triton_wheel.sh
@ -1,4 +1,3 @@
-#!/bin/bash
 set -ex

 # Set ROCM_HOME isn't available, use ROCM_PATH if set or /opt/rocm
@ -51,15 +50,29 @@ do
    cp $lib $TRITON_ROCM_DIR/lib/
 done

+# Required ROCm libraries
+if [[ "${MAJOR_VERSION}" == "6" ]]; then
+    libamdhip="libamdhip64.so.6"
+else
+    libamdhip="libamdhip64.so.5"
+fi
+
 # Required ROCm libraries - ROCm 6.0
 ROCM_SO=(
-    "libamdhip64.so"
-    "libhsa-runtime64.so"
-    "libdrm.so"
-    "libdrm_amdgpu.so"
-    "libamd_comgr.so"
-    "librocprofiler-register.so"
+    "${libamdhip}"
+    "libhsa-runtime64.so.1"
+    "libdrm.so.2"
+    "libdrm_amdgpu.so.1"
 )
+if [[ $ROCM_INT -ge 60400 ]]; then
+    ROCM_SO+=("libamd_comgr.so.3")
+else
+    ROCM_SO+=("libamd_comgr.so.2")
+fi
+
+if [[ $ROCM_INT -ge 60100 ]]; then
+    ROCM_SO+=("librocprofiler-register.so.0")
+fi

 for lib in "${ROCM_SO[@]}"
 do
@ -81,6 +94,10 @@ do
    fi

    cp $file_path $TRITON_ROCM_DIR/lib
+    # When running locally, and not building a wheel, we need to satisfy shared objects requests that don't look for versions
+    LINKNAME=$(echo $lib | sed -e 's/\.so.*/.so/g')
+    ln -sf $lib $TRITON_ROCM_DIR/lib/$LINKNAME
+
 done

 # Copy Include Files
--- a/.github/scripts/amd/patch_triton_wheel.sh
+++ b/.github/scripts/amd/patch_triton_wheel.sh
@ -19,13 +19,15 @@ replace_needed_sofiles() {
    find $1 -name '*.so*' -o -name 'ld.lld' | while read sofile; do
        origname=$2
        patchedname=$3
-        set +e
-        origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
-        ERRCODE=$?
-        set -e
-        if [ "$ERRCODE" -eq "0" ]; then
-            echo "patching $sofile entry $origname to $patchedname"
-            $PATCHELF_BIN --replace-needed $origname $patchedname $sofile
+        if [[ "$origname" != "$patchedname" ]]; then
+            set +e
+            origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
+            ERRCODE=$?
+            set -e
+            if [ "$ERRCODE" -eq "0" ]; then
+                echo "patching $sofile entry $origname to $patchedname"
+                $PATCHELF_BIN --replace-needed $origname $patchedname $sofile
+            fi
        fi
    done
 }
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -16,19 +16,17 @@ from typing import Optional


 # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
-CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
+CUDA_ARCHES = ["12.6", "12.8", "12.9"]
 CUDA_STABLE = "12.8"
 CUDA_ARCHES_FULL_VERSION = {
    "12.6": "12.6.3",
    "12.8": "12.8.1",
    "12.9": "12.9.1",
-    "13.0": "13.0.0",
 }
 CUDA_ARCHES_CUDNN_VERSION = {
    "12.6": "9",
    "12.8": "9",
    "12.9": "9",
-    "13.0": "9",
 }

 # NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
@ -40,7 +38,7 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]

 CPU_S390X_ARCH = ["cpu-s390x"]

-CUDA_AARCH64_ARCHES = ["12.9-aarch64", "13.0-aarch64"]
+CUDA_AARCH64_ARCHES = ["12.9-aarch64"]


 PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
@ -56,7 +54,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -73,7 +71,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -90,49 +88,32 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
-    "13.0": (
-        "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
-    ),
    "xpu": (
-        "intel-cmplr-lib-rt==2025.2.1 | "
-        "intel-cmplr-lib-ur==2025.2.1 | "
-        "intel-cmplr-lic-rt==2025.2.1 | "
-        "intel-sycl-rt==2025.2.1 | "
-        "oneccl-devel==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "oneccl==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "impi-rt==2021.16.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "onemkl-sycl-blas==2025.2.0 | "
-        "onemkl-sycl-dft==2025.2.0 | "
-        "onemkl-sycl-lapack==2025.2.0 | "
-        "onemkl-sycl-rng==2025.2.0 | "
-        "onemkl-sycl-sparse==2025.2.0 | "
-        "dpcpp-cpp-rt==2025.2.1 | "
-        "intel-opencl-rt==2025.2.1 | "
-        "mkl==2025.2.0 | "
-        "intel-openmp==2025.2.1 | "
-        "tbb==2022.2.0 | "
-        "tcmlib==1.4.0 | "
-        "umf==0.11.0 | "
-        "intel-pti==0.13.1"
+        "intel-cmplr-lib-rt==2025.1.1 | "
+        "intel-cmplr-lib-ur==2025.1.1 | "
+        "intel-cmplr-lic-rt==2025.1.1 | "
+        "intel-sycl-rt==2025.1.1 | "
+        "oneccl-devel==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "oneccl==2021.15.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "impi-rt==2021.15.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "onemkl-sycl-blas==2025.1.0 | "
+        "onemkl-sycl-dft==2025.1.0 | "
+        "onemkl-sycl-lapack==2025.1.0 | "
+        "onemkl-sycl-rng==2025.1.0 | "
+        "onemkl-sycl-sparse==2025.1.0 | "
+        "dpcpp-cpp-rt==2025.1.1 | "
+        "intel-opencl-rt==2025.1.1 | "
+        "mkl==2025.1.0 | "
+        "intel-openmp==2025.1.1 | "
+        "tbb==2022.1.0 | "
+        "tcmlib==1.3.0 | "
+        "umf==0.10.0 | "
+        "intel-pti==0.12.3"
    ),
 }

@ -143,7 +124,9 @@ def get_nccl_wheel_version(arch_version: str) -> str:
    requirements = map(
        str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
    )
-    return next(x for x in requirements if x.startswith("nvidia-nccl")).split("==")[1]
+    return next(x for x in requirements if x.startswith("nvidia-nccl-cu")).split("==")[
+        1
+    ]


 def read_nccl_pin(arch_version: str) -> str:
@ -210,7 +193,7 @@ LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
    "cpu": "libtorch-cxx11-builder:cpu",
 }

-FULL_PYTHON_VERSIONS = ["3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]
+FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t"]


 def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
@ -240,8 +223,6 @@ def generate_libtorch_matrix(
        if os == "linux":
            arches += CUDA_ARCHES
            arches += ROCM_ARCHES
-            if "13.0" in arches:
-                arches.remove("13.0")
        elif os == "windows":
            arches += CUDA_ARCHES
    if libtorch_variants is None:
@ -333,8 +314,8 @@ def generate_wheels_matrix(
            # TODO: Enable python 3.13t on cpu-s390x
            if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
                continue
-            # TODO: Enable python 3.14 for rest
-            if os not in ["linux", "linux-aarch64", "macos-arm64", "windows"] and (
+            # TODO: Enable python 3.14 on non linux OSes
+            if os != "linux" and (
                python_version == "3.14" or python_version == "3.14t"
            ):
                continue
@ -342,7 +323,7 @@ def generate_wheels_matrix(
            # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install

            if (
-                arch_version in ["13.0", "12.9", "12.8", "12.6"]
+                arch_version in ["12.9", "12.8", "12.6"]
                and os == "linux"
                or arch_version in CUDA_AARCH64_ARCHES
            ):
@ -375,6 +356,29 @@ def generate_wheels_matrix(
                        ),  # include special case for aarch64 build, remove the -aarch64 postfix
                    }
                )
+                # Special build building to use on Colab. Python 3.11 for 12.6 CUDA
+                if python_version == "3.11" and arch_version == CUDA_STABLE:
+                    ret.append(
+                        {
+                            "python_version": python_version,
+                            "gpu_arch_type": gpu_arch_type,
+                            "gpu_arch_version": gpu_arch_version,
+                            "desired_cuda": translate_desired_cuda(
+                                gpu_arch_type, gpu_arch_version
+                            ),
+                            "container_image": WHEEL_CONTAINER_IMAGES[
+                                arch_version
+                            ].split(":")[0],
+                            "container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[
+                                arch_version
+                            ].split(":")[1],
+                            "package_type": package_type,
+                            "pytorch_extra_install_requirements": "",
+                            "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace(  # noqa: B950
+                                ".", "_"
+                            ),
+                        }
+                    )
            else:
                ret.append(
                    {
@ -405,7 +409,6 @@ def generate_wheels_matrix(
    return ret


-validate_nccl_dep_consistency("13.0")
 validate_nccl_dep_consistency("12.9")
 validate_nccl_dep_consistency("12.8")
 validate_nccl_dep_consistency("12.6")
--- a/.github/scripts/gql_mocks.json.gz
+++ b/.github/scripts/gql_mocks.json.gz
--- a/.github/scripts/test_trymerge.py
+++ b/.github/scripts/test_trymerge.py
@ -27,7 +27,6 @@ from trymerge import (
    get_drci_classifications,
    gh_get_team_members,
    GitHubPR,
-    iter_issue_timeline_until_comment,
    JobCheckState,
    main as trymerge_main,
    MandatoryChecksMissingError,
@ -35,8 +34,6 @@ from trymerge import (
    RE_GHSTACK_DESC,
    read_merge_rules,
    remove_job_name_suffix,
-    sha_from_committed_event,
-    sha_from_force_push_after,
    validate_revert,
 )

@ -73,9 +70,6 @@ def mock_query(
    if key in mocked_queries:
        return mocked_queries[key]

-    # TODO: Remove me once https://github.com/pytorch/pytorch/issues/160489 is resolved
-    raise ValueError(f"Key {key} could not be found in gql_mocks")
-
    try:
        rc = fallback_function(*args)
    except HTTPError as err:
@ -127,7 +121,7 @@ def mock_parse_args(revert: bool = False, force: bool = False) -> Any:
            self.force = force
            self.pr_num = 76123
            self.dry_run = True
-            self.comment_id = 12345  # Set to non-zero value
+            self.comment_id = 0
            self.reason = "this is for testing"
            self.ignore_current = False
            self.check_mergeability = False
@ -155,9 +149,9 @@ def mock_revert(
 def mock_merge(
    pr: GitHubPR,
    repo: GitRepo,
-    comment_id: int,
    dry_run: bool = False,
    skip_mandatory_checks: bool = False,
+    comment_id: Optional[int] = None,
    timeout_minutes: int = 400,
    stale_pr_days: int = 3,
    ignore_current: bool = False,
@ -473,9 +467,9 @@ class TestTryMerge(TestCase):
        mock_merge.assert_called_once_with(
            mock.ANY,
            mock.ANY,
-            comment_id=mock.ANY,
            dry_run=mock.ANY,
            skip_mandatory_checks=True,
+            comment_id=mock.ANY,
            ignore_current=False,
        )

@ -488,9 +482,9 @@ class TestTryMerge(TestCase):
        mock_merge.assert_called_once_with(
            mock.ANY,
            mock.ANY,
-            comment_id=mock.ANY,
            dry_run=mock.ANY,
            skip_mandatory_checks=False,
+            comment_id=mock.ANY,
            ignore_current=False,
        )

@ -1141,176 +1135,5 @@ Pull Request resolved: https://github.com/pytorch/pytorch/pull/154394"""
        )


-@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
-@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
-@mock.patch(
-    "trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
-)
-class TestTimelineFunctions(TestCase):
-    """Tests for the new timeline-related functions"""
-
-    def test_sha_from_committed_event(self, *args: Any) -> None:
-        """Test extracting SHA from committed event"""
-        # Based on actual GitHub API format - committed events have "sha" at top level
-        event = {
-            "event": "committed",
-            "sha": "fb21ce932ded6670c918804a0d9151b773770a7c",
-        }
-        self.assertEqual(
-            sha_from_committed_event(event), "fb21ce932ded6670c918804a0d9151b773770a7c"
-        )
-
-        # Test with missing SHA
-        event_no_sha = {"event": "committed"}
-        self.assertIsNone(sha_from_committed_event(event_no_sha))
-
-    def test_sha_from_force_push_after(self, *args: Any) -> None:
-        """Test extracting SHA from force push event"""
-        # NOTE: The current function doesn't handle the actual GitHub API format
-        # Real force push events have "commit_id" at top level, but this function
-        # looks for "after", "after_commit", "after_sha", or "head_sha" fields
-
-        # Test with the legacy format the current function handles
-        event_legacy = {
-            "event": "head_ref_force_pushed",
-            "after": {"sha": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e"},
-        }
-        self.assertEqual(
-            sha_from_force_push_after(event_legacy),
-            "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
-        )
-
-        # Test with current GitHub API format (should return None with current implementation)
-        event_real_api = {
-            "event": "head_ref_force_pushed",
-            "commit_id": "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
-        }
-        self.assertEqual(
-            sha_from_force_push_after(event_real_api),
-            "ef22bcbc54bb0f787e1e4ffd3d83df18fc407f5e",
-        )  # Current function doesn't handle commit_id
-
-        # Test with missing SHA
-        event_no_sha = {"event": "head_ref_force_pushed"}
-        self.assertIsNone(sha_from_force_push_after(event_no_sha))
-
-    @mock.patch("trymerge.gh_fetch_json_list")
-    def test_iter_issue_timeline_until_comment(
-        self, mock_gh_fetch_json_list: Any, *args: Any
-    ) -> None:
-        """Test timeline iteration until target comment"""
-        # Mock timeline data based on actual GitHub API format
-        timeline_data = [
-            {"event": "commented", "id": 100, "body": "first comment"},
-            {"event": "committed", "sha": "fb21ce932ded6670c918804a0d9151b773770a7c"},
-            {"event": "commented", "id": 200, "body": "target comment"},
-            {"event": "commented", "id": 300, "body": "after target"},
-        ]
-        mock_gh_fetch_json_list.return_value = timeline_data
-
-        # Test iteration stops at target comment
-        events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 200))
-        self.assertEqual(len(events), 3)  # Should stop at target comment
-        self.assertEqual(events[0]["event"], "commented")
-        self.assertEqual(events[0]["id"], 100)
-        self.assertEqual(events[1]["event"], "committed")
-        self.assertEqual(events[1]["sha"], "fb21ce932ded6670c918804a0d9151b773770a7c")
-        self.assertEqual(events[2]["event"], "commented")
-        self.assertEqual(events[2]["id"], 200)
-
-    @mock.patch("trymerge.gh_fetch_json_list")
-    def test_iter_issue_timeline_until_comment_not_found(
-        self, mock_gh_fetch_json_list: Any, *args: Any
-    ) -> None:
-        """Test timeline iteration when target comment is not found"""
-        # Mock empty timeline
-        mock_gh_fetch_json_list.return_value = []
-
-        events = list(iter_issue_timeline_until_comment("pytorch", "pytorch", 123, 999))
-        self.assertEqual(len(events), 0)
-
-    @mock.patch("trymerge.iter_issue_timeline_until_comment")
-    def test_get_commit_sha_at_comment_commit_after_comment(
-        self, mock_iter_timeline: Any, *args: Any
-    ) -> None:
-        """Test get_commit_sha_at_comment returns correct SHA after comment"""
-        mock_iter_timeline.return_value = [
-            {"event": "committed", "sha": "commit1"},
-            {"event": "committed", "sha": "commit2"},
-            {"event": "commented", "id": 100},
-            {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
-        ]
-        pr = GitHubPR("pytorch", "pytorch", 77700)
-        sha = pr.get_commit_sha_at_comment(100)
-        self.assertEqual(sha, "commit2")
-
-    @mock.patch("trymerge.iter_issue_timeline_until_comment")
-    def test_get_commit_sha_at_comment_force_push_before_comment(
-        self, mock_iter_timeline: Any, *args: Any
-    ) -> None:
-        mock_iter_timeline.return_value = [
-            {"event": "committed", "sha": "commit1"},
-            {"event": "committed", "sha": "commit2"},
-            {"event": "head_ref_force_pushed", "commit_id": "commit3"},
-            {"event": "commented", "id": 100},
-        ]
-        pr = GitHubPR("pytorch", "pytorch", 77700)
-        sha = pr.get_commit_sha_at_comment(100)
-        self.assertEqual(sha, "commit3")
-
-    @mock.patch("trymerge.iter_issue_timeline_until_comment")
-    def test_get_commit_sha_at_comment_force_push_before_comment_legacy_mode(
-        self, mock_iter_timeline: Any, *args: Any
-    ) -> None:
-        mock_iter_timeline.return_value = [
-            {"event": "committed", "sha": "commit1"},
-            {"event": "committed", "sha": "commit2"},
-            {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
-            {"event": "commented", "id": 100},
-        ]
-        pr = GitHubPR("pytorch", "pytorch", 77700)
-        sha = pr.get_commit_sha_at_comment(100)
-        self.assertEqual(sha, "commit3")
-
-    @mock.patch("trymerge.iter_issue_timeline_until_comment")
-    def test_get_commit_sha_at_comment_multiple_comments(
-        self, mock_iter_timeline: Any, *args: Any
-    ) -> None:
-        mock_iter_timeline.return_value = [
-            {"event": "committed", "sha": "commit1"},
-            {"event": "commented", "id": 100},
-            {"event": "committed", "sha": "commit2"},
-            {"event": "commented", "id": 200},
-            {"event": "head_ref_force_pushed", "after": {"sha": "commit3"}},
-            {"event": "commented", "id": 300},
-        ]
-        pr = GitHubPR("pytorch", "pytorch", 77700)
-        sha = pr.get_commit_sha_at_comment(200)
-        self.assertEqual(sha, "commit2")
-        sha = pr.get_commit_sha_at_comment(300)
-        self.assertEqual(sha, "commit3")
-
-    @mock.patch("trymerge.iter_issue_timeline_until_comment")
-    def test_get_commit_sha_at_comment_no_events(
-        self, mock_iter_timeline: Any, *args: Any
-    ) -> None:
-        mock_iter_timeline.return_value = [
-            {"event": "commented", "id": 100},
-            {"event": "labeled", "label": {"name": "test"}},
-        ]
-        pr = GitHubPR("pytorch", "pytorch", 77700)
-        sha = pr.get_commit_sha_at_comment(100)
-        self.assertIsNone(sha)
-
-    @mock.patch("trymerge.iter_issue_timeline_until_comment")
-    def test_get_commit_sha_at_comment_exception(
-        self, mock_iter_timeline: Any, *args: Any
-    ) -> None:
-        mock_iter_timeline.side_effect = Exception("API error")
-        pr = GitHubPR("pytorch", "pytorch", 77700)
-        sha = pr.get_commit_sha_at_comment(100)
-        self.assertIsNone(sha)
-
-
 if __name__ == "__main__":
    main()
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -108,6 +108,10 @@ GH_CHECKSUITES_FRAGMENT = """
 fragment PRCheckSuites on CheckSuiteConnection {
  edges {
    node {
+      app {
+        name
+        databaseId
+      }
      workflowRun {
        workflow {
          name
@ -450,63 +454,6 @@ HAS_NO_CONNECTED_DIFF_TITLE = (
 IGNORABLE_FAILED_CHECKS_THESHOLD = 10


-def iter_issue_timeline_until_comment(
-    org: str, repo: str, issue_number: int, target_comment_id: int, max_pages: int = 200
-) -> Any:
-    """
-    Yield timeline entries in order until (and including) the entry whose id == target_comment_id
-    for a 'commented' event. Stops once the target comment is encountered.
-    """
-    page = 1
-
-    while page <= max_pages:
-        url = (
-            f"https://api.github.com/repos/{org}/{repo}/issues/{issue_number}/timeline"
-        )
-        params = {"per_page": 100, "page": page}
-
-        batch = gh_fetch_json_list(url, params)
-
-        if not batch:
-            return
-        for ev in batch:
-            # The target is the issue comment row with event == "commented" and id == issue_comment_id
-            if ev.get("event") == "commented" and ev.get("id") == target_comment_id:
-                yield ev  # nothing in the timeline after this matters, so stop early
-                return
-            yield ev
-        if len(batch) < 100:
-            return
-        page += 1
-
-    # If we got here without finding the comment, then we either hit a bug or some github PR
-    # has a _really_ long timeline.
-    # The max # of pages found on any pytorch/pytorch PR at the time of this change was 41
-    raise RuntimeError(
-        f"Could not find a merge commit in the first {max_pages} pages of the timeline at url {url}."
-        f"This is most likely a bug, please report it to the @pytorch/pytorch-dev-infra team."
-    )
-
-
-def sha_from_committed_event(ev: dict[str, Any]) -> Optional[str]:
-    """Extract SHA from committed event in timeline"""
-    return ev.get("sha")
-
-
-def sha_from_force_push_after(ev: dict[str, Any]) -> Optional[str]:
-    """Extract SHA from force push event in timeline"""
-    # The current GitHub API format
-    commit_id = ev.get("commit_id")
-    if commit_id:
-        return str(commit_id)
-
-    # Legacy format
-    after = ev.get("after") or ev.get("after_commit") or {}
-    if isinstance(after, dict):
-        return after.get("sha") or after.get("oid")
-    return ev.get("after_sha") or ev.get("head_sha")
-
-
 def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any:
    rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no)
    return rc["data"]["repository"]["pullRequest"]
@ -794,24 +741,16 @@ class GitHubPR:
    def last_commit(self) -> Any:
        return self.info["commits"]["nodes"][-1]["commit"]

-    def last_commit_sha(self, default: Optional[str] = None) -> str:
-        # for commits, the oid is the sha
-
-        if default is None:
-            return str(self.last_commit()["oid"])
-
-        return str(self.last_commit().get("oid", default))
-
    def get_merge_base(self) -> str:
        if self.merge_base:
            return self.merge_base

-        last_commit_sha = self.last_commit_sha()
+        last_commit_oid = self.last_commit()["oid"]
        # NB: We could use self.base_ref() here for regular PR, however, that doesn't
        # work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base,
        # so let's just use main instead
        self.merge_base = gh_fetch_merge_base(
-            self.org, self.project, last_commit_sha, self.default_branch()
+            self.org, self.project, last_commit_oid, self.default_branch()
        )

        # Fallback to baseRefOid if the API call fails, i.e. rate limit. Note that baseRefOid
@ -900,44 +839,6 @@ class GitHubPR:
    def get_commit_count(self) -> int:
        return int(self.info["commits_with_authors"]["totalCount"])

-    def get_commit_sha_at_comment(self, comment_id: int) -> Optional[str]:
-        """
-        Get the PR head commit SHA that was present when a specific comment was posted.
-        This ensures we only merge the state of the PR at the time the merge command was issued,
-        not any subsequent commits that may have been pushed after.
-
-        Returns None if no head-changing events found before the comment or if the comment was not found.
-        """
-        head = None
-
-        try:
-            for event in iter_issue_timeline_until_comment(
-                self.org, self.project, self.pr_num, comment_id
-            ):
-                etype = event.get("event")
-                if etype == "committed":
-                    sha = sha_from_committed_event(event)
-                    if sha:
-                        head = sha
-                        print(f"Timeline: Found commit event for SHA {sha}")
-                elif etype == "head_ref_force_pushed":
-                    sha = sha_from_force_push_after(event)
-                    if sha:
-                        head = sha
-                        print(f"Timeline: Found force push event for SHA {sha}")
-                elif etype == "commented":
-                    if event.get("id") == comment_id:
-                        print(f"Timeline: Found final comment with sha {sha}")
-                        return head
-        except Exception as e:
-            print(
-                f"Warning: Failed to reconstruct timeline for comment {comment_id}: {e}"
-            )
-            return None
-
-        print(f"Did not find comment with id {comment_id} in the PR timeline")
-        return None
-
    def get_pr_creator_login(self) -> str:
        return cast(str, self.info["author"]["login"])

@ -1254,7 +1155,7 @@ class GitHubPR:
        *,
        skip_mandatory_checks: bool = False,
        dry_run: bool = False,
-        comment_id: int,
+        comment_id: Optional[int] = None,
        ignore_current_checks: Optional[list[str]] = None,
    ) -> None:
        # Raises exception if matching rule is not found
@ -1270,7 +1171,7 @@ class GitHubPR:
            skip_internal_checks=can_skip_internal_checks(self, comment_id),
            ignore_current_checks=ignore_current_checks,
        )
-        additional_merged_prs = self.merge_changes_locally(
+        additional_merged_prs = self.merge_changes(
            repo, skip_mandatory_checks, comment_id
        )

@ -1299,7 +1200,7 @@ class GitHubPR:
                broken_trunk_checks=ignorable_checks.get("BROKEN_TRUNK", []),
                flaky_checks=ignorable_checks.get("FLAKY", []),
                unstable_checks=ignorable_checks.get("UNSTABLE", []),
-                last_commit_sha=self.last_commit_sha(default=""),
+                last_commit_sha=self.last_commit().get("oid", ""),
                merge_base_sha=self.get_merge_base(),
                merge_commit_sha=merge_commit_sha,
                is_failed=False,
@ -1320,7 +1221,7 @@ class GitHubPR:
            dry_run=dry_run,
        )

-    def merge_changes_locally(
+    def merge_changes(
        self,
        repo: GitRepo,
        skip_mandatory_checks: bool = False,
@ -1329,15 +1230,27 @@ class GitHubPR:
        skip_all_rule_checks: bool = False,
    ) -> list["GitHubPR"]:
        """
-        :param skip_all_rule_checks: If true, skips all rule checks on ghstack PRs, useful for dry-running merge locally
+        :param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally
        """
        branch_to_merge_into = self.default_branch() if branch is None else branch
        if repo.current_branch() != branch_to_merge_into:
            repo.checkout(branch_to_merge_into)
+        if not self.is_ghstack_pr():
+            msg = self.gen_commit_message()
+            pr_branch_name = f"__pull-request-{self.pr_num}__init__"
+            repo.fetch(self.last_commit()["oid"], pr_branch_name)
+            repo._run_git("merge", "--squash", pr_branch_name)
+            repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)

-        # It's okay to skip the commit SHA check for ghstack PRs since
-        # authoring requires write access to the repo.
-        if self.is_ghstack_pr():
+            # Did the PR change since we started the merge?
+            pulled_sha = repo.show_ref(pr_branch_name)
+            latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
+            if pulled_sha != latest_pr_status.last_commit()["oid"]:
+                raise RuntimeError(
+                    "PR has been updated since CI checks last passed. Please rerun the merge command."
+                )
+            return []
+        else:
            return self.merge_ghstack_into(
                repo,
                skip_mandatory_checks,
@ -1345,48 +1258,6 @@ class GitHubPR:
                skip_all_rule_checks=skip_all_rule_checks,
            )

-        msg = self.gen_commit_message()
-        pr_branch_name = f"__pull-request-{self.pr_num}__init__"
-
-        # Determine which commit SHA to merge
-        commit_to_merge = None
-        if not comment_id:
-            raise ValueError("Must provide --comment-id when merging regular PRs")
-
-        # Get the commit SHA that was present when the comment was made
-        commit_to_merge = self.get_commit_sha_at_comment(comment_id)
-        if not commit_to_merge:
-            raise RuntimeError(
-                f"Could not find commit that was pushed before comment {comment_id}"
-            )
-
-        # Validate that this commit is the latest commit on the PR
-        latest_commit = self.last_commit_sha()
-        if commit_to_merge != latest_commit:
-            raise RuntimeError(
-                f"Commit {commit_to_merge} was HEAD when comment {comment_id} was posted "
-                f"but now the latest commit on the PR is {latest_commit}. "
-                f"Please re-issue the merge command to merge the latest commit."
-            )
-
-        print(f"Merging commit {commit_to_merge} locally")
-
-        repo.fetch(commit_to_merge, pr_branch_name)
-        repo._run_git("merge", "--squash", pr_branch_name)
-        repo._run_git("commit", f'--author="{self.get_author()}"', "-m", msg)
-
-        # Did the PR change since we started the merge?
-        pulled_sha = repo.show_ref(pr_branch_name)
-        latest_pr_status = GitHubPR(self.org, self.project, self.pr_num)
-        if (
-            pulled_sha != latest_pr_status.last_commit_sha()
-            or pulled_sha != commit_to_merge
-        ):
-            raise RuntimeError(
-                "PR has been updated since CI checks last passed. Please rerun the merge command."
-            )
-        return []
-

 class MergeRuleFailedError(RuntimeError):
    def __init__(self, message: str, rule: Optional["MergeRule"] = None) -> None:
@ -1591,7 +1462,7 @@ def find_matching_merge_rule(
            pending_checks = []
            failed_checks = []

-        hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit_sha()}"
+        hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}"
        if len(failed_checks) > 0:
            if reject_reason_score < 30000:
                reject_reason_score = 30000
@ -2289,14 +2160,14 @@ def categorize_checks(
 def merge(
    pr: GitHubPR,
    repo: GitRepo,
-    comment_id: int,
    dry_run: bool = False,
    skip_mandatory_checks: bool = False,
+    comment_id: Optional[int] = None,
    timeout_minutes: int = 400,
    stale_pr_days: int = 3,
    ignore_current: bool = False,
 ) -> None:
-    initial_commit_sha = pr.last_commit_sha()
+    initial_commit_sha = pr.last_commit()["oid"]
    pr_link = f"https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num}"
    print(f"Attempting merge of {initial_commit_sha} ({pr_link})")

@ -2367,7 +2238,7 @@ def merge(
            f"Attempting merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} ({elapsed_time / 60} minutes elapsed)"
        )
        pr = GitHubPR(pr.org, pr.project, pr.pr_num)
-        if initial_commit_sha != pr.last_commit_sha():
+        if initial_commit_sha != pr.last_commit()["oid"]:
            raise RuntimeError(
                "New commits were pushed while merging. Please rerun the merge command."
            )
@ -2534,7 +2405,7 @@ def main() -> None:
    if args.check_mergeability:
        if pr.is_ghstack_pr():
            get_ghstack_prs(repo, pr)  # raises error if out of sync
-        pr.merge_changes_locally(
+        pr.merge_changes(
            repo,
            skip_mandatory_checks=True,
            skip_all_rule_checks=True,
@ -2549,18 +2420,12 @@ def main() -> None:
        gh_post_pr_comment(org, project, args.pr_num, message, dry_run=args.dry_run)
        return
    try:
-        # Ensure comment id is set, else fail
-        if not args.comment_id:
-            raise ValueError(
-                "Comment ID is required for merging PRs, please provide it using --comment-id"
-            )
-
        merge(
            pr,
            repo,
-            comment_id=args.comment_id,
            dry_run=args.dry_run,
            skip_mandatory_checks=args.force,
+            comment_id=args.comment_id,
            ignore_current=args.ignore_current,
        )
    except Exception as e:
@ -2582,7 +2447,7 @@ def main() -> None:
                broken_trunk_checks=[],
                flaky_checks=[],
                unstable_checks=[],
-                last_commit_sha=pr.last_commit_sha(default=""),
+                last_commit_sha=pr.last_commit().get("oid", ""),
                merge_base_sha=pr.get_merge_base(),
                is_failed=True,
                skip_mandatory_checks=args.force,
--- a/.github/scripts/windows/build_magma.bat
+++ b/.github/scripts/windows/build_magma.bat
@ -35,9 +35,6 @@ cd magma
 mkdir build && cd build

 set GPU_TARGET=All
-if "%CUVER_NODOT%" == "130" (
-  set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
-)
 if "%CUVER_NODOT%" == "129" (
  set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
 )
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Lucas Kabela	1207f9ab93	typing tvm.py	2025-08-11 14:57:13 -07:00
Lucas Kabela	fcfb6bab89	Type backend torchxla	2025-08-11 14:37:34 -07:00
Lucas Kabela	95bd114806	typing registry.py	2025-08-11 14:09:50 -07:00
Lucas Kabela	ec68abdc38	typing inductor and placeholder backends	2025-08-11 13:51:06 -07:00
Lucas Kabela	ee417d1806	typing distributed.py	2025-08-11 13:43:05 -07:00
Lucas Kabela	4a8afeaffb	typing debugging.py	2025-08-11 11:33:30 -07:00
Lucas Kabela	90cba401a0	Type cudagraphs.py	2025-08-11 10:24:17 -07:00
Lucas Kabela	6e4c4d9e57	Typing for common.py	2025-08-11 09:50:55 -07:00