vllmbuild

ghstack-source-id: 61c87bcac7a2b2f47c2a03b8a1da8999dd648acd Pull-Request: https://github.com/pytorch/pytorch/pull/160089
[ghstack] setup torch_cli build
2025-11-11 22:34:53 +08:00 · 2025-08-13 22:01:40 -07:00 · 2025-08-13 22:01:40 -07:00
840 changed files with 10486 additions and 34225 deletions
--- a/.bc-linter.yml
+++ b/.bc-linter.yml
@ -1,15 +0,0 @@
-version: 1
-paths:
-include:
-  - "**/*.py"
-exclude:
-  - ".*"
-  - ".*/**"
-  - "**/.*/**"
-  - "**/.*"
-  - "**/_*/**"
-  - "**/_*.py"
-  - "**/test/**"
-  - "**/benchmarks/**"
-  - "**/test_*.py"
-  - "**/*_test.py"
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -92,7 +92,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
        "/usr/local/cuda/lib64/libnccl.so.2",
        "/usr/local/cuda/lib64/libnvJitLink.so.12",
        "/usr/local/cuda/lib64/libnvrtc.so.12",
-        "/usr/local/cuda/lib64/libnvshmem_host.so.3",
        "/usr/local/cuda/lib64/libcudnn_adv.so.9",
        "/usr/local/cuda/lib64/libcudnn_cnn.so.9",
        "/usr/local/cuda/lib64/libcudnn_graph.so.9",
@ -210,6 +209,8 @@ if __name__ == "__main__":
    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
    if enable_cuda:
        build_vars += "MAX_JOBS=5 "
+        # nvshmem is broken for aarch64 see https://github.com/pytorch/pytorch/issues/160425
+        build_vars += "USE_NVSHMEM=OFF "

    override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
    desired_cuda = os.getenv("DESIRED_CUDA")
--- a/.ci/docker/almalinux/Dockerfile
+++ b/.ci/docker/almalinux/Dockerfile
@ -64,10 +64,6 @@ FROM cuda as cuda12.9
 RUN bash ./install_cuda.sh 12.9
 ENV DESIRED_CUDA=12.9

-FROM cuda as cuda13.0
-RUN bash ./install_cuda.sh 13.0
-ENV DESIRED_CUDA=13.0
-
 FROM ${ROCM_IMAGE} as rocm
 ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
 ADD ./common/install_mkl.sh install_mkl.sh
@ -80,10 +76,10 @@ ADD ./common/install_mnist.sh install_mnist.sh
 RUN bash ./install_mnist.sh

 FROM base as all_cuda
+COPY --from=cuda11.8  /usr/local/cuda-11.8 /usr/local/cuda-11.8
 COPY --from=cuda12.6  /usr/local/cuda-12.6 /usr/local/cuda-12.6
 COPY --from=cuda12.8  /usr/local/cuda-12.8 /usr/local/cuda-12.8
 COPY --from=cuda12.9  /usr/local/cuda-12.9 /usr/local/cuda-12.9
-COPY --from=cuda13.0  /usr/local/cuda-13.0 /usr/local/cuda-13.0

 # Final step
 FROM ${BASE_TARGET} as final
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -168,7 +168,7 @@ case "$tag" in
    TRITON=yes
    ;;
  pytorch-linux-jammy-py3-clang12-onnx)
-    ANACONDA_PYTHON_VERSION=3.10
+    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=12
    VISION=yes
    ONNX=yes
@ -288,6 +288,7 @@ case "$tag" in
    GCC_VERSION=11
    ACL=yes
    VISION=yes
+    CONDA_CMAKE=yes
    OPENBLAS=yes
    # snadampal: skipping llvm src build install because the current version
    # from pytorch/llvm:9.0.1 is x86 specific
@ -298,6 +299,7 @@ case "$tag" in
    GCC_VERSION=11
    ACL=yes
    VISION=yes
+    CONDA_CMAKE=yes
    OPENBLAS=yes
    # snadampal: skipping llvm src build install because the current version
    # from pytorch/llvm:9.0.1 is x86 specific
--- a/.ci/docker/ci_commit_pins/huggingface-requirements.txt
+++ b/.ci/docker/ci_commit_pins/huggingface-requirements.txt
@ -1,2 +0,0 @@
-transformers==4.54.0
-soxr==0.5.0
--- a/.ci/docker/ci_commit_pins/huggingface.txt
+++ b/.ci/docker/ci_commit_pins/huggingface.txt
@ -0,0 +1 @@
+v4.54.0
--- a/.ci/docker/ci_commit_pins/nccl-cu13.txt
+++ b/.ci/docker/ci_commit_pins/nccl-cu13.txt
@ -1 +0,0 @@
-v2.27.7-1
--- a/.ci/docker/ci_commit_pins/triton-xpu.txt
+++ b/.ci/docker/ci_commit_pins/triton-xpu.txt
@ -1 +1 @@
-0958dc9b2bb815e428f721f9da599dab0dc1c5d7
+ae324eeac8e102a2b40370e341460f3791353398
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -10,7 +10,7 @@ else
  arch_path='sbsa'
 fi

-NVSHMEM_VERSION=3.3.20
+NVSHMEM_VERSION=3.3.9

 function install_cuda {
  version=$1
@ -62,16 +62,14 @@ function install_nvshmem {
  mkdir -p "${tmpdir}" && cd "${tmpdir}"

  # nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
-  # This pattern is a lie as it is not consistent across versions, for 3.3.9 it was cuda_ver-arch-nvshhem-ver
-  filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
-  suffix=".tar.xz"
-  url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}${suffix}"
+  filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
+  url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"

  # download, unpack, install
  wget -q "${url}"
-  tar xf "${filename}${suffix}"
-  cp -a "${filename}/include/"* /usr/local/cuda/include/
-  cp -a "${filename}/lib/"*     /usr/local/cuda/lib64/
+  tar xf "${filename}.tar.gz"
+  cp -a "libnvshmem/include/"* /usr/local/cuda/include/
+  cp -a "libnvshmem/lib/"*     /usr/local/cuda/lib64/

  # cleanup
  cd ..
@ -128,6 +126,74 @@ function install_129 {
  ldconfig
 }

+function prune_124 {
+  echo "Pruning CUDA 12.4"
+  #####################################################################################
+  # CUDA 12.4 prune static libs
+  #####################################################################################
+  export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
+  export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
+
+  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+  if [[ -n "$OVERRIDE_GENCODE" ]]; then
+      export GENCODE=$OVERRIDE_GENCODE
+  fi
+  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
+      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
+  fi
+
+  # every static archive (*.a) except culibos, cudart, cudnn, cublas and metis; the pattern is anchored so names that merely contain ".a" are never handed to nvprune
+  ls $CUDA_LIB_DIR/ | grep '\.a$' | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
+      | xargs -I {} bash -c \
+                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+  # prune the two cuBLAS static archives with the GENCODE_CUDNN list (by default GENCODE plus sm_61); cuDNN archives are not touched here
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+  #####################################################################################
+  # CUDA 12.4 prune visual tools
+  #####################################################################################
+  export CUDA_BASE="/usr/local/cuda-12.4/"
+  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
+}
+
+function prune_126 {
+  echo "Pruning CUDA 12.6"
+  #####################################################################################
+  # CUDA 12.6 prune static libs
+  #####################################################################################
+  export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
+  export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
+
+  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+  if [[ -n "$OVERRIDE_GENCODE" ]]; then
+      export GENCODE=$OVERRIDE_GENCODE
+  fi
+  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
+      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
+  fi
+
+  # every static archive (*.a) except culibos, cudart, cudnn, cublas and metis; the pattern is anchored so names that merely contain ".a" are never handed to nvprune
+  ls $CUDA_LIB_DIR/ | grep '\.a$' | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
+      | xargs -I {} bash -c \
+                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+  # prune the two cuBLAS static archives with the GENCODE_CUDNN list (by default GENCODE plus sm_61); cuDNN archives are not touched here
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+  #####################################################################################
+  # CUDA 12.6 prune visual tools
+  #####################################################################################
+  export CUDA_BASE="/usr/local/cuda-12.6/"
+  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
+}
+
 function install_128 {
  CUDNN_VERSION=9.8.0.87
  echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
@ -146,39 +212,18 @@ function install_128 {
  ldconfig
 }

-function install_130 {
-  CUDNN_VERSION=9.12.0.46
-  NVSHMEM_VERSION=3.3.20
-  echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
-  # install CUDA 13.0 in the same container
-  install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
-
-  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-  install_cudnn 13 $CUDNN_VERSION
-
-  install_nvshmem 13 $NVSHMEM_VERSION
-
-  CUDA_VERSION=13.0 bash install_nccl.sh
-
-  CUDA_VERSION=13.0 bash install_cusparselt.sh
-
-  ldconfig
-}
-
 # idiomatic parameter and option handling in sh
 while test $# -gt 0
 do
    case "$1" in
-    12.4) install_124;
+    12.4) install_124; prune_124
        ;;
-    12.6|12.6.*) install_126;
+    12.6|12.6.*) install_126; prune_126
        ;;
    12.8|12.8.*) install_128;
        ;;
    12.9|12.9.*) install_129;
        ;;
-    13.0|13.0.*) install_130;
-        ;;
    *) echo "bad argument $1"; exit 1
        ;;
    esac
--- a/.ci/docker/common/install_cusparselt.sh
+++ b/.ci/docker/common/install_cusparselt.sh
@ -5,15 +5,7 @@ set -ex
 # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
 mkdir tmp_cusparselt && cd tmp_cusparselt

-if [[ ${CUDA_VERSION:0:4} =~ "13" ]]; then
-    arch_path='sbsa'
-    export TARGETARCH=${TARGETARCH:-$(uname -m)}
-    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
-        arch_path='x86_64'
-    fi
-    CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.8.0.4_cuda13-archive"
-    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
-elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
+if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
    arch_path='sbsa'
    export TARGETARCH=${TARGETARCH:-$(uname -m)}
    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
--- a/.ci/docker/common/install_inductor_benchmark_deps.sh
+++ b/.ci/docker/common/install_inductor_benchmark_deps.sh
@ -5,7 +5,9 @@ set -ex
 source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

 function install_huggingface() {
-  pip_install -r huggingface-requirements.txt
+  local commit  # scoped to this function; declared on its own line so a failing get_pinned_commit is not masked by `local`
+  commit=$(get_pinned_commit huggingface)
+  pip_install "git+https://github.com/huggingface/transformers@${commit}"
 }

 function install_timm() {
@ -24,6 +26,9 @@ function install_torchbench() {

  python install.py --continue_on_fail

+  # soxr comes from https://github.com/huggingface/transformers/pull/39429
+  pip install transformers==4.54.0 soxr==0.5.0
+
  echo "Print all dependencies after TorchBench is installed"
  python -mpip freeze
  popd
--- a/.ci/docker/common/install_nccl.sh
+++ b/.ci/docker/common/install_nccl.sh
@ -7,8 +7,6 @@ if [[ ${CUDA_VERSION:0:2} == "11" ]]; then
  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt)
 elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt)
-elif [[ ${CUDA_VERSION:0:2} == "13" ]]; then
-  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu13.txt)
 else
  echo "Unexpected CUDA_VERSION ${CUDA_VERSION}"
  exit 1
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -19,8 +19,8 @@ pip_install \
  transformers==4.36.2

 pip_install coloredlogs packaging
-pip_install onnxruntime==1.22.1
-pip_install onnxscript==0.4.0
+pip_install onnxruntime==1.18.1
+pip_install onnxscript==0.3.1

 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
 # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
--- a/.ci/docker/libtorch/Dockerfile
+++ b/.ci/docker/libtorch/Dockerfile
@ -69,11 +69,6 @@ RUN bash ./install_cuda.sh 12.9
 RUN bash ./install_magma.sh 12.9
 RUN ln -sf /usr/local/cuda-12.9 /usr/local/cuda

-FROM cuda as cuda13.0
-RUN bash ./install_cuda.sh 13.0
-RUN bash ./install_magma.sh 13.0
-RUN ln -sf /usr/local/cuda-13.0 /usr/local/cuda
-
 FROM cpu as rocm
 ARG ROCM_VERSION
 ARG PYTORCH_ROCM_ARCH
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -67,12 +67,6 @@ case ${image} in
        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"
        MANY_LINUX_VERSION="2_28"
        ;;
-    manylinux2_28-builder:cuda13*)
-        TARGET=cuda_final
-        GPU_IMAGE=amd64/almalinux:8
-        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"
-        MANY_LINUX_VERSION="2_28"
-        ;;
    manylinuxaarch64-builder:cuda*)
        TARGET=cuda_final
        GPU_IMAGE=amd64/almalinux:8
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -263,6 +263,11 @@ scipy==1.14.1 ; python_version >= "3.12"
 #Pinned versions:
 #test that import:

+tb-nightly==2.13.0a20230426
+#Description: TensorBoard
+#Pinned versions:
+#test that import:
+
 # needed by torchgen utils
 typing-extensions>=4.10.0
 #Description: type hints for python
@ -339,7 +344,7 @@ onnx==1.18.0
 #Pinned versions:
 #test that import:

-onnxscript==0.4.0
+onnxscript==0.3.1
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@ -96,11 +96,11 @@ ARG ANACONDA_PYTHON_VERSION
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
+COPY ci_commit_pins/huggingface.txt huggingface.txt
 COPY ci_commit_pins/timm.txt timm.txt
 COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt

 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
--- a/.ci/docker/ubuntu-xpu/Dockerfile
+++ b/.ci/docker/ubuntu-xpu/Dockerfile
@ -56,10 +56,10 @@ RUN rm install_openssl.sh
 ARG INDUCTOR_BENCHMARKS
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
+COPY ci_commit_pins/huggingface.txt huggingface.txt
 COPY ci_commit_pins/timm.txt timm.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

 # Install XPU Dependencies
 ARG XPU_VERSION
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -96,11 +96,11 @@ RUN rm install_openssl.sh
 ARG INDUCTOR_BENCHMARKS
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
+COPY ci_commit_pins/huggingface.txt huggingface.txt
 COPY ci_commit_pins/timm.txt timm.txt
 COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt

 ARG TRITON
 ARG TRITON_CPU
@ -181,6 +181,7 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
 RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi

 # AWS specific CUDA build guidance
+ENV TORCH_CUDA_ARCH_LIST Maxwell
 ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all"
 ENV CUDA_PATH /usr/local/cuda

--- a/.ci/libtorch/build.sh
+++ b/.ci/libtorch/build.sh
@ -7,4 +7,4 @@ set -ex

 SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

-USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
+USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
--- a/.ci/lumen_cli/cli/build_cli/register_build.py
+++ b/.ci/lumen_cli/cli/build_cli/register_build.py
@ -2,7 +2,7 @@ import argparse
 import logging

 from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec
-from cli.lib.core.vllm.vllm_build import VllmBuildRunner
+from cli.lib.core.vllm import VllmBuildRunner


 logger = logging.getLogger(__name__)
--- a/.ci/lumen_cli/cli/lib/common/pip_helper.py
+++ b/.ci/lumen_cli/cli/lib/common/pip_helper.py
@ -1,71 +0,0 @@
-import glob
-import logging
-import shlex
-import shutil
-import sys
-from collections.abc import Iterable
-from importlib.metadata import PackageNotFoundError, version
-from typing import Optional, Union
-
-from cli.lib.common.utils import run_command
-
-
-logger = logging.getLogger(__name__)
-
-
-def pip_install_packages(
-    packages: Iterable[str] = (),
-    env=None,
-    *,
-    requirements: Optional[str] = None,
-    constraints: Optional[str] = None,
-    prefer_uv: bool = False,
-) -> None:
-    use_uv = prefer_uv and shutil.which("uv") is not None
-    base = (
-        [sys.executable, "-m", "uv", "pip", "install"]
-        if use_uv
-        else [sys.executable, "-m", "pip", "install"]
-    )
-    cmd = base[:]
-    if requirements:
-        cmd += ["-r", requirements]
-    if constraints:
-        cmd += ["-c", constraints]
-    cmd += list(packages)
-    logger.info("pip installing packages: %s", " ".join(map(shlex.quote, cmd)))
-    run_command(" ".join(map(shlex.quote, cmd)), env=env)
-
-
-def pip_install_first_match(pattern: str, extras: Optional[str] = None, pref_uv=False):
-    wheel = first_matching_pkg(pattern)
-    target = f"{wheel}[{extras}]" if extras else wheel
-    logger.info("Installing %s...", target)
-    pip_install_packages([target], prefer_uv=pref_uv)
-
-
-def run_python(args: Union[str, list[str]], env=None):
-    """
-    Run the python in the current environment.
-    """
-    if isinstance(args, str):
-        args = shlex.split(args)
-    cmd = [sys.executable] + args
-    run_command(" ".join(map(shlex.quote, cmd)), env=env)
-
-
-def pkg_exists(name: str) -> bool:
-    try:
-        pkg_version = version(name)
-        logger.info("%s already exist with version: %s", name, pkg_version)
-        return True
-    except PackageNotFoundError:
-        logger.info("%s is not installed", name)
-        return False
-
-
-def first_matching_pkg(pattern: str) -> str:
-    matches = sorted(glob.glob(pattern))
-    if not matches:
-        raise FileNotFoundError(f"No wheel matching: {pattern}")
-    return matches[0]
--- a/.ci/lumen_cli/cli/lib/common/utils.py
+++ b/.ci/lumen_cli/cli/lib/common/utils.py
@ -7,7 +7,6 @@ import os
 import shlex
 import subprocess
 import sys
-from contextlib import contextmanager
 from typing import Optional


@ -78,40 +77,3 @@ def str2bool(value: Optional[str]) -> bool:
    if value in false_value_set:
        return False
    raise ValueError(f"Invalid string value for boolean conversion: {value}")
-
-
-@contextmanager
-def temp_environ(updates: dict[str, str]):
-    """
-    Temporarily set environment variables and restore them after the block.
-    Args:
-        updates: Dict of environment variables to set.
-    """
-    missing = object()
-    old: dict[str, str | object] = {k: os.environ.get(k, missing) for k in updates}
-    try:
-        os.environ.update(updates)
-        yield
-    finally:
-        for k, v in old.items():
-            if v is missing:
-                os.environ.pop(k, None)
-            else:
-                os.environ[k] = v  # type: ignore[arg-type]
-
-
-@contextmanager
-def working_directory(path: str):
-    """
-    Temporarily change the working directory inside a context.
-    """
-    if not path:
-        # No-op context
-        yield
-        return
-    prev_cwd = os.getcwd()
-    try:
-        os.chdir(path)
-        yield
-    finally:
-        os.chdir(prev_cwd)
--- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_build.py
@ -13,6 +13,7 @@ from cli.lib.common.envs_helper import (
    env_str_field,
    with_params_help,
 )
+from cli.lib.common.git_helper import clone_external_repo
 from cli.lib.common.path_helper import (
    copy,
    ensure_dir_exists,
@ -21,7 +22,6 @@ from cli.lib.common.path_helper import (
    is_path_exist,
 )
 from cli.lib.common.utils import run_command
-from cli.lib.core.vllm.lib import clone_vllm


 logger = logging.getLogger(__name__)
@ -42,7 +42,7 @@ class VllmBuildParameters:
    """

    # USE_TORCH_WHEEL: when true, use local Torch wheels; requires TORCH_WHEELS_PATH.
-    # Otherwise docker build pull torch nightly during build
+    # Otherwise docker build pulls torch nightly during the build
    # TORCH_WHEELS_PATH: directory containing local torch wheels when use_torch_whl is True
    use_torch_whl: bool = env_bool_field("USE_TORCH_WHEEL", True)
    torch_whls_path: Path = env_path_field("TORCH_WHEELS_PATH", "./dist")
@ -62,7 +62,7 @@ class VllmBuildParameters:
    )

    # OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
-    output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")
+    output_dir: Path = env_path_field("OUTPUT_DIR", "shared")

    # --- Build args ----------------------------------------------------------
    target_stage: str = env_str_field("TARGET_STAGE", "export-wheels")
@ -152,7 +152,6 @@ class VllmBuildRunner(BaseRunner):
        3. run docker build
        """
        inputs = VllmBuildParameters()
-        logger.info("Running vllm build with inputs: %s", inputs)
        clone_vllm()

        self.cp_dockerfile_if_exist(inputs)
@ -253,3 +252,12 @@ class VllmBuildRunner(BaseRunner):
                --progress=plain .
        """
        ).strip()
+
+
+def clone_vllm() -> None:  # clone vllm-project/vllm via clone_external_repo with dst="vllm", requesting a submodule update
+    clone_external_repo(
+        target="vllm",
+        repo="https://github.com/vllm-project/vllm.git",
+        dst="vllm",
+        update_submodules=True,
+    )
--- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py
@ -1,232 +0,0 @@
-import logging
-from typing import Any
-
-from cli.lib.common.git_helper import clone_external_repo
-from cli.lib.common.pip_helper import pip_install_packages
-from cli.lib.common.utils import run_command, temp_environ, working_directory
-
-
-logger = logging.getLogger(__name__)
-
-
-def sample_vllm_test_library():
-    """
-    Simple sample to unblock the vllm ci development, which is mimic to
-    https://github.com/vllm-project/vllm/blob/main/.buildkite/test-pipeline.yaml
-    see run_test_plan for more details
-    """
-    # TODO(elainewy): Read from yaml file to handle the env and tests for vllm
-    return {
-        "vllm_basic_correctness_test": {
-            "title": "Basic Correctness Test",
-            "id": "vllm_basic_correctness_test",
-            "env_vars": {
-                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
-            },
-            "steps": [
-                "pytest -v -s basic_correctness/test_cumem.py",
-                "pytest -v -s basic_correctness/test_basic_correctness.py",
-                "pytest -v -s basic_correctness/test_cpu_offload.py",
-                "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
-            ],
-        },
-        "vllm_basic_models_test": {
-            "title": "Basic models test",
-            "id": "vllm_basic_models_test",
-            "steps": [
-                "pytest -v -s models/test_transformers.py",
-                "pytest -v -s models/test_registry.py",
-                "pytest -v -s models/test_utils.py",
-                "pytest -v -s models/test_vision.py",
-                "pytest -v -s models/test_initialization.py",
-            ],
-        },
-        "vllm_entrypoints_test": {
-            "title": "Entrypoints Test ",
-            "id": "vllm_entrypoints_test",
-            "env_vars": {
-                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
-            },
-            "steps": [
-                " ".join(
-                    [
-                        "pytest",
-                        "-v",
-                        "-s",
-                        "entrypoints/llm",
-                        "--ignore=entrypoints/llm/test_lazy_outlines.py",
-                        "--ignore=entrypoints/llm/test_generate.py",
-                        "--ignore=entrypoints/llm/test_generate_multiple_loras.py",
-                        "--ignore=entrypoints/llm/test_collective_rpc.py",
-                    ]
-                ),
-                "pytest -v -s entrypoints/llm/test_lazy_outlines.py",
-                "pytest -v -s entrypoints/llm/test_generate.py ",
-                "pytest -v -s entrypoints/llm/test_generate_multiple_loras.py",
-                "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
-            ],
-        },
-        "vllm_regression_test": {
-            "title": "Regression Test",
-            "id": "vllm_regression_test",
-            "package_install": ["modelscope"],
-            "steps": [
-                "pytest -v -s test_regression.py",
-            ],
-        },
-        "vllm_lora_tp_test_distributed": {
-            "title": "LoRA TP Test (Distributed)",
-            "id": "vllm_lora_tp_test_distributed",
-            "env_vars": {
-                "VLLM_WORKER_MULTIPROC_METHOD": "spawn",
-            },
-            "num_gpus": 4,
-            "steps": [
-                "pytest -v -s -x lora/test_chatglm3_tp.py",
-                "echo $VLLM_WORKER_MULTIPROC_METHOD",
-                "pytest -v -s -x lora/test_llama_tp.py",
-                "pytest -v -s -x lora/test_multi_loras_with_tp.py",
-            ],
-        },
-        "vllm_lora_280_failure_test": {
-            "title": "LoRA 280 failure test",
-            "id": "vllm_lora_280_failure_test",
-            "steps": ["pytest -v lora/test_quant_model.py"],
-        },
-        "vllm_multi_model_processor_test": {
-            "title": "Multi-Modal Processor Test",
-            "id": "vllm_multi_model_processor_test",
-            "package_install": ["git+https://github.com/TIGER-AI-Lab/Mantis.git"],
-            "steps": [
-                "pytest -v -s models/multimodal/processing --ignore models/multimodal/processing/test_tensor_schema.py",
-            ],
-        },
-        "vllm_pytorch_compilation_unit_tests": {
-            "title": "PyTorch Compilation Unit Tests",
-            "id": "vllm_pytorch_compilation_unit_tests",
-            "steps": [
-                "pytest -v -s compile/test_pass_manager.py",
-                "pytest -v -s compile/test_fusion.py",
-                "pytest -v -s compile/test_fusion_attn.py",
-                "pytest -v -s compile/test_silu_mul_quant_fusion.py",
-                "pytest -v -s compile/test_sequence_parallelism.py",
-                "pytest -v -s compile/test_async_tp.py",
-                "pytest -v -s compile/test_fusion_all_reduce.py",
-                "pytest -v -s compile/test_decorator.py",
-            ],
-        },
-        # TODO(elainewy):need to add g6 with 4 gpus to run this test
-        "vllm_lora_test": {
-            "title": "LoRA Test %N",
-            "id": "lora_test",
-            "parallelism": 4,
-            "steps": [
-                "echo '[checking] list sharded lora tests:'",
-                " ".join(
-                    [
-                        "pytest -q --collect-only lora",
-                        "--shard-id=$$BUILDKITE_PARALLEL_JOB",
-                        "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT",
-                        "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py",
-                    ]
-                ),
-                "echo '[checking] Done. list lora tests'",
-                " ".join(
-                    [
-                        "pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB",
-                        "--num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT",
-                        "--ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py",
-                    ]
-                ),
-            ],
-        },
-    }
-
-
-def check_parallelism(tests: Any, title: str, shard_id: int = 0, num_shards: int = 0):
-    """
-    a method to check if the test plan is parallelism or not.
-    """
-    parallelism = int(tests.get("parallelism", "0"))
-    is_parallel = parallelism and parallelism > 1
-
-    if not is_parallel:
-        return False
-
-    if shard_id > num_shards:
-        raise RuntimeError(
-            f"Test {title} expects {num_shards} shards, but invalid {shard_id} is provided"
-        )
-
-    if num_shards != parallelism:
-        raise RuntimeError(
-            f"Test {title} expects {parallelism} shards, but invalid {num_shards} is provided"
-        )
-
-    return True
-
-
-def run_test_plan(
-    test_plan: str,
-    test_target: str,
-    tests_map: dict[str, Any],
-    shard_id: int = 0,
-    num_shards: int = 0,
-):
-    """
-    a method to run list of tests based on the test plan.
-    """
-    logger.info("run %s tests.....", test_target)
-    if test_plan not in tests_map:
-        raise RuntimeError(
-            f"test {test_plan} not found, please add it to test plan pool"
-        )
-    tests = tests_map[test_plan]
-    pkgs = tests.get("package_install", [])
-    title = tests.get("title", "unknown test")
-
-    is_parallel = check_parallelism(tests, title, shard_id, num_shards)
-    if is_parallel:
-        title = title.replace("%N", f"{shard_id}/{num_shards}")
-
-    logger.info("Running tests: %s", title)
-    if pkgs:
-        logger.info("Installing packages: %s", pkgs)
-        pip_install_packages(packages=pkgs, prefer_uv=True)
-    with (
-        working_directory(tests.get("working_directory", "tests")),
-        temp_environ(tests.get("env_vars", {})),
-    ):
-        failures = []
-        for step in tests["steps"]:
-            logger.info("Running step: %s", step)
-            if is_parallel:
-                step = replace_buildkite_placeholders(step, shard_id, num_shards)
-                logger.info("Running parallel step: %s", step)
-            code = run_command(cmd=step, check=False, use_shell=True)
-            if code != 0:
-                failures.append(step)
-            logger.info("Finish running step: %s", step)
-        if failures:
-            logger.error("Failed tests: %s", failures)
-            raise RuntimeError(f"{len(failures)} pytest runs failed: {failures}")
-        logger.info("Done. All tests passed")
-
-
-def clone_vllm(dst: str = "vllm"):
-    clone_external_repo(
-        target="vllm",
-        repo="https://github.com/vllm-project/vllm.git",
-        dst=dst,
-        update_submodules=True,
-    )
-
-
-def replace_buildkite_placeholders(step: str, shard_id: int, num_shards: int) -> str:
-    mapping = {
-        "$$BUILDKITE_PARALLEL_JOB_COUNT": str(num_shards),
-        "$$BUILDKITE_PARALLEL_JOB": str(shard_id),
-    }
-    for k in sorted(mapping, key=len, reverse=True):
-        step = step.replace(k, mapping[k])
-    return step
--- a/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/vllm_test.py
@ -1,257 +0,0 @@
-import logging
-import os
-import re
-import subprocess
-import sys
-from collections.abc import Iterable
-from dataclasses import dataclass
-from enum import Enum
-from pathlib import Path
-from typing import Any
-
-from cli.lib.common.cli_helper import BaseRunner
-from cli.lib.common.envs_helper import env_path_field, env_str_field, get_env
-from cli.lib.common.path_helper import copy, remove_dir
-from cli.lib.common.pip_helper import (
-    pip_install_first_match,
-    pip_install_packages,
-    pkg_exists,
-    run_python,
-)
-from cli.lib.common.utils import run_command, working_directory
-from cli.lib.core.vllm.lib import clone_vllm, run_test_plan, sample_vllm_test_library
-
-
-logger = logging.getLogger(__name__)
-
-
-@dataclass
-class VllmTestParameters:
-    """
-    Parameters defining the vllm external test input
-
-    !!!DO NOT ADD SECRETS IN THIS CLASS!!!
-    you can put environment variable name in VllmTestParameters if it's not the same as the secret one
-    fetch secrests directly from env variables during runtime
-    """
-
-    torch_whls_path: Path = env_path_field("WHEELS_PATH", "./dist")
-
-    vllm_whls_path: Path = env_path_field(
-        "VLLM_WHEELS_PATH", "./dist/external/vllm/wheels"
-    )
-
-    torch_cuda_arch_list: str = env_str_field("TORCH_CUDA_ARCH_LIST", "8.9")
-
-    def __post_init__(self):
-        if not self.torch_whls_path.exists():
-            raise ValueError("missing torch_whls_path")
-        if not self.vllm_whls_path.exists():
-            raise ValueError("missing vllm_whls_path")
-
-
-class TestInpuType(Enum):
-    TEST_PLAN = "test_plan"
-    UNKNOWN = "unknown"
-
-
-class VllmTestRunner(BaseRunner):
-    def __init__(self, args: Any):
-        self.work_directory = "vllm"
-        self.test_plan = ""
-        self.test_type = TestInpuType.UNKNOWN
-
-        self.shard_id = args.shard_id
-        self.num_shards = args.num_shards
-
-        if args.test_plan:
-            self.test_plan = args.test_plan
-            self.test_type = TestInpuType.TEST_PLAN
-
-        # Matches the structeur in the artifacts.zip from torcb build
-        self.TORCH_WHL_PATH_REGEX = "torch*.whl"
-        self.TORCH_WHL_EXTRA = "opt-einsum"
-        self.TORCH_ADDITIONAL_WHLS_REGEX = [
-            "vision/torchvision*.whl",
-            "audio/torchaudio*.whl",
-        ]
-
-        # Match the structure of the artifacts.zip from vllm external build
-        self.VLLM_TEST_WHLS_REGEX = [
-            "xformers/*.whl",
-            "vllm/vllm*.whl",
-            "flashinfer-python/flashinfer*.whl",
-        ]
-
-    def prepare(self):
-        """
-        prepare test environment for vllm. This includes clone vllm repo, install all wheels, test dependencies and set env
-        """
-        params = VllmTestParameters()
-        logger.info("Display VllmTestParameters %s", params)
-        self._set_envs(params)
-
-        clone_vllm(dst=self.work_directory)
-        with working_directory(self.work_directory):
-            remove_dir(Path("vllm"))
-            self._install_wheels(params)
-            self._install_dependencies()
-        # verify the torches are not overridden by test dependencies
-        check_versions()
-
-    def run(self):
-        """
-        main function to run vllm test
-        """
-        self.prepare()
-        with working_directory(self.work_directory):
-            if self.test_type == TestInpuType.TEST_PLAN:
-                if self.num_shards > 1:
-                    run_test_plan(
-                        self.test_plan,
-                        "vllm",
-                        sample_vllm_test_library(),
-                        self.shard_id,
-                        self.num_shards,
-                    )
-                else:
-                    run_test_plan(self.test_plan, "vllm", sample_vllm_test_library())
-            else:
-                raise ValueError(f"Unknown test type {self.test_type}")
-
-    def _install_wheels(self, params: VllmTestParameters):
-        logger.info("Running vllm test with inputs: %s", params)
-        if not pkg_exists("torch"):
-            # install torch from local whls if it's not installed yet.
-            torch_p = f"{str(params.torch_whls_path)}/{self.TORCH_WHL_PATH_REGEX}"
-            pip_install_first_match(torch_p, self.TORCH_WHL_EXTRA)
-
-        torch_whls_path = [
-            f"{str(params.torch_whls_path)}/{whl_path}"
-            for whl_path in self.TORCH_ADDITIONAL_WHLS_REGEX
-        ]
-        for torch_whl in torch_whls_path:
-            pip_install_first_match(torch_whl)
-        logger.info("Done. Installed torch and other torch-related wheels ")
-
-        logger.info("Installing vllm wheels")
-        vllm_whls_path = [
-            f"{str(params.vllm_whls_path)}/{whl_path}"
-            for whl_path in self.VLLM_TEST_WHLS_REGEX
-        ]
-        for vllm_whl in vllm_whls_path:
-            pip_install_first_match(vllm_whl)
-        logger.info("Done. Installed vllm wheels")
-
-    def _install_test_dependencies(self):
-        """
-        This method replaces torch dependencies with local torch wheel info in
-        requirements/test.in file from vllm repo. then generates the test.txt
-        in runtime
-        """
-        logger.info("generate test.txt from requirements/test.in with local torch whls")
-        preprocess_test_in()
-        copy("requirements/test.txt", "snapshot_constraint.txt")
-
-        run_command(
-            f"{sys.executable} -m uv pip compile requirements/test.in "
-            "-o test.txt "
-            "--index-strategy unsafe-best-match "
-            "--constraint snapshot_constraint.txt "
-            "--torch-backend cu128"
-        )
-        pip_install_packages(requirements="test.txt", prefer_uv=True)
-        logger.info("Done. installed requirements for test dependencies")
-
-    def _install_dependencies(self):
-        pip_install_packages(packages=["-e", "tests/vllm_test_utils"], prefer_uv=True)
-        pip_install_packages(packages=["hf_transfer"], prefer_uv=True)
-        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-
-        # using script from vllm repo to remove all torch packages from requirements txt
-        run_python("use_existing_torch.py")
-
-        # install common packages
-        for requirements in ["requirements/common.txt", "requirements/build.txt"]:
-            pip_install_packages(
-                requirements=requirements,
-                prefer_uv=True,
-            )
-        # install test packages
-        self._install_test_dependencies()
-
-    def _set_envs(self, inputs: VllmTestParameters):
-        os.environ["TORCH_CUDA_ARCH_LIST"] = inputs.torch_cuda_arch_list
-        if not validate_cuda(get_env("TORCH_CUDA_ARCH_LIST")):
-            logger.warning(
-                "Missing supported TORCH_CUDA_ARCH_LIST. "
-                "Currently support TORCH_CUDA_ARCH_LIST env var "
-                "with supported arch [8.0, 8.9, 9.0]"
-            )
-
-        os.environ["HF_TOKEN"] = os.getenv("VLLM_TEST_HUGGING_FACE_TOKEN", "")
-        if not get_env("HF_TOKEN"):
-            raise ValueError(
-                "missing required HF_TOKEN, please set VLLM_TEST_HUGGING_FACE_TOKEN env var"
-            )
-        if not get_env("TORCH_CUDA_ARCH_LIST"):
-            raise ValueError(
-                "missing required TORCH_CUDA_ARCH_LIST, please set TORCH_CUDA_ARCH_LIST env var"
-            )
-
-
-def preprocess_test_in(
-    target_file: str = "requirements/test.in", additional_packages: Iterable[str] = ()
-):
-    """
-    This modifies the target_file file in place in vllm work directory.
-    It removes torch and unwanted packages in target_file and replace with local torch whls
-    package  with format "$WHEEL_PACKAGE_NAME @ file://<LOCAL_PATH>"
-    """
-    additional_package_to_move = list(additional_packages or ())
-    pkgs_to_remove = [
-        "torch",
-        "torchvision",
-        "torchaudio",
-        "xformers",
-        "mamba_ssm",
-    ] + additional_package_to_move
-    # Read current requirements
-    target_path = Path(target_file)
-    lines = target_path.read_text().splitlines()
-
-    # Remove lines starting with the package names (==, @, >=) — case-insensitive
-    pattern = re.compile(rf"^({'|'.join(pkgs_to_remove)})\s*(==|@|>=)", re.IGNORECASE)
-    kept_lines = [line for line in lines if not pattern.match(line)]
-
-    # Get local installed torch/vision/audio from pip freeze
-    # This is hacky, but it works
-    pip_freeze = subprocess.check_output(["pip", "freeze"], text=True)
-    header_lines = [
-        line
-        for line in pip_freeze.splitlines()
-        if re.match(
-            r"^(torch|torchvision|torchaudio)\s*@\s*file://", line, re.IGNORECASE
-        )
-    ]
-
-    # Write back: header_lines + blank + kept_lines
-    out = "\n".join(header_lines + [""] + kept_lines) + "\n"
-    target_path.write_text(out)
-    logger.info("[INFO] Updated %s", target_file)
-
-
-def validate_cuda(value: str) -> bool:
-    VALID_VALUES = {"8.0", "8.9", "9.0"}
-    return all(v in VALID_VALUES for v in value.split())
-
-
-def check_versions():
-    """
-    check installed packages version
-    """
-    logger.info("Double check installed packages")
-    patterns = ["torch", "xformers", "torchvision", "torchaudio", "vllm"]
-    for pkg in patterns:
-        pkg_exists(pkg)
-    logger.info("Done. checked installed packages")
--- a/.ci/lumen_cli/cli/run.py
+++ b/.ci/lumen_cli/cli/run.py
@ -5,7 +5,6 @@ import logging

 from cli.build_cli.register_build import register_build_commands
 from cli.lib.common.logger import setup_logging
-from cli.test_cli.register_test import register_test_commands


 logger = logging.getLogger(__name__)
@ -21,7 +20,6 @@ def main():

    # registers second-level subcommands
    register_build_commands(subparsers)
-    register_test_commands(subparsers)

    # parse args after all options are registered
    args = parser.parse_args()
--- a/.ci/lumen_cli/cli/test_cli/register_test.py
+++ b/.ci/lumen_cli/cli/test_cli/register_test.py
@ -1,62 +0,0 @@
-import argparse
-import logging
-
-from cli.lib.common.cli_helper import register_targets, RichHelp, TargetSpec
-from cli.lib.core.vllm.vllm_test import VllmTestRunner
-
-
-logger = logging.getLogger(__name__)
-
-# Maps targets to their argparse configuration and runner
-# it adds new target to path python -m cli.run build external {target} with buildrunner
-_TARGETS: dict[str, TargetSpec] = {
-    "vllm": {
-        "runner": VllmTestRunner,
-        "help": "test vLLM with pytorch main",
-    }
-    # add yours ...
-}
-
-
-def common_args(parser: argparse.ArgumentParser) -> None:
-    """
-    Add common CLI arguments to the given parser.
-    """
-    parser.add_argument(
-        "--shard-id",
-        type=int,
-        default=1,
-        help="a shard id to run, e.g. '0,1,2,3'",
-    )
-    parser.add_argument(
-        "--num-shards",
-        type=int,
-        default=1,
-        help="a number of shards to run, e.g. '4'",
-    )
-    group = parser.add_mutually_exclusive_group(required=True)
-    group.add_argument(
-        "-tp",
-        "--test-plan",
-        type=str,
-        help="a pre-defined test plan to run, e.g. 'basic_correctness_test'",
-    )
-
-
-def register_test_commands(subparsers: argparse._SubParsersAction) -> None:
-    build_parser = subparsers.add_parser(
-        "test",
-        help="test related commands",
-        formatter_class=RichHelp,
-    )
-    build_subparsers = build_parser.add_subparsers(dest="test_command", required=True)
-    overview = "\n".join(
-        f"  {name:12} {spec.get('help', '')}" for name, spec in _TARGETS.items()
-    )
-    external_parser = build_subparsers.add_parser(
-        "external",
-        help="Test external targets",
-        description="Test third-party targets.\n\nAvailable targets:\n" + overview,
-        formatter_class=RichHelp,
-    )
-    register_targets(external_parser, _TARGETS, common_args=common_args)
--- a/.ci/lumen_cli/pyproject.toml
+++ b/.ci/lumen_cli/pyproject.toml
@ -6,7 +6,6 @@ dependencies = [
    "GitPython==3.1.45",
    "docker==7.1.0",
    "pytest==7.3.2",
-    "uv==0.8.6"
 ]

 [tool.setuptools]
--- a/.ci/lumen_cli/tests/test_run_plan.py
+++ b/.ci/lumen_cli/tests/test_run_plan.py
@ -1,185 +0,0 @@
-# tests/test_run_test_plan.py
-import importlib
-from contextlib import nullcontext
-from types import SimpleNamespace
-from unittest.mock import MagicMock
-
-import pytest
-
-
-MOD = "cli.lib.core.vllm.lib"
-
-# We import inside tests so the MOD override above applies everywhere
-run_test_plan_import_path = f"{MOD}.run_test_plan"
-
-
-def _get_cmd(c):
-    # Support both kwargs and positional args
-    return c.kwargs.get("cmd", c.args[0] if c.args else None)
-
-
-def _get_check(c):
-    if "check" in c.kwargs:
-        return c.kwargs["check"]
-    # If positional, assume second arg is 'check' when present; default False
-    return c.args[1] if len(c.args) > 1 else False
-
-
-@pytest.fixture
-def patch_module(monkeypatch):
-    """
-    Patch helpers ('pip_install_packages', 'temp_environ', 'working_directory',
-    'run_command', 'logger') inside the target module and expose them.
-    """
-    module = importlib.import_module(MOD)
-
-    # Create fakes/mocks
-    pip_install_packages = MagicMock(name="pip_install_packages")
-    run_command = MagicMock(name="run_command", return_value=0)
-
-    # temp_environ / working_directory: record calls but act as context managers
-    temp_calls: list[dict] = []
-    workdir_calls: list[str] = []
-
-    def fake_working_directory(path: str):
-        workdir_calls.append(path)
-        return nullcontext()
-
-    def fake_temp_env(map: dict[str, str]):
-        temp_calls.append(map)
-        return nullcontext()
-
-    logger = SimpleNamespace(
-        info=MagicMock(name="logger.info"),
-        error=MagicMock(name="logger.error"),
-    )
-
-    # Apply patches (raise if attribute doesn't exist)
-    monkeypatch.setattr(
-        module, "pip_install_packages", pip_install_packages, raising=True
-    )
-    monkeypatch.setattr(module, "run_command", run_command, raising=True)
-    monkeypatch.setattr(
-        module, "working_directory", fake_working_directory, raising=True
-    )
-    monkeypatch.setattr(module, "temp_environ", fake_temp_env, raising=True)
-    monkeypatch.setattr(module, "logger", logger, raising=True)
-
-    return SimpleNamespace(
-        module=module,
-        run_test_plan=module.run_test_plan,  # expose to avoid getattr("constant") (Ruff B009)
-        pip_install_packages=pip_install_packages,
-        run_command=run_command,
-        temp_calls=temp_calls,
-        workdir_calls=workdir_calls,
-        logger=logger,
-    )
-
-
-def test_success_runs_all_steps_and_uses_env_and_workdir(monkeypatch, patch_module):
-    run_test_plan = patch_module.run_test_plan
-
-    tests_map = {
-        "basic": {
-            "title": "Basic suite",
-            "package_install": [],
-            "working_directory": "tests",
-            "env_vars": {"GLOBAL_FLAG": "1"},
-            "steps": [
-                "export A=x && pytest -q",
-                "export B=y && pytest -q tests/unit",
-            ],
-        }
-    }
-
-    # One exit code per step (export + two pytest)
-    patch_module.run_command.side_effect = [0, 0, 0]
-
-    run_test_plan("basic", "cpu", tests_map)
-
-    calls = patch_module.run_command.call_args_list
-    cmds = [_get_cmd(c) for c in calls]
-    checks = [_get_check(c) for c in calls]
-
-    assert cmds == [
-        "export A=x && pytest -q",
-        "export B=y && pytest -q tests/unit",
-    ]
-    assert all(chk is False for chk in checks)
-
-    assert patch_module.workdir_calls == ["tests"]
-    assert patch_module.temp_calls == [{"GLOBAL_FLAG": "1"}]
-
-
-def test_installs_packages_when_present(monkeypatch, patch_module):
-    run_test_plan = patch_module.module.run_test_plan
-
-    tests_map = {
-        "with_pkgs": {
-            "title": "Needs deps",
-            "package_install": ["timm==1.0.0", "flash-attn"],
-            "steps": ["pytest -q"],
-        }
-    }
-
-    patch_module.run_command.return_value = 0
-
-    run_test_plan("with_pkgs", "gpu", tests_map)
-
-    patch_module.pip_install_packages.assert_called_once_with(
-        packages=["timm==1.0.0", "flash-attn"],
-        prefer_uv=True,
-    )
-
-
-def test_raises_on_missing_plan(patch_module):
-    run_test_plan = patch_module.module.run_test_plan
-    with pytest.raises(RuntimeError) as ei:
-        run_test_plan("nope", "cpu", tests_map={})
-
-    assert "test nope not found" in str(ei.value)
-
-
-def test_aggregates_failures_and_raises(monkeypatch, patch_module):
-    run_test_plan = patch_module.module.run_test_plan
-
-    tests_map = {
-        "mix": {
-            "title": "Some pass some fail",
-            "steps": [
-                "pytest test_a.py",  # 0 → pass
-                "pytest test_b.py",  # 1 → fail
-                "pytest test_c.py",  # 2 → fail
-            ],
-        }
-    }
-
-    # Simulate pass, fail, fail
-    patch_module.run_command.side_effect = [0, 1, 2]
-
-    with pytest.raises(RuntimeError) as ei:
-        run_test_plan("mix", "cpu", tests_map)
-
-    msg = str(ei.value)
-    assert "2 pytest runs failed" in msg
-    # Ensure logger captured failed tests list
-    patch_module.logger.error.assert_called_once()
-    # And we attempted all three commands
-    assert patch_module.run_command.call_count == 3
-
-
-def test_custom_working_directory_used(patch_module):
-    run_test_plan = patch_module.module.run_test_plan
-
-    tests_map = {
-        "customwd": {
-            "title": "Custom wd",
-            "working_directory": "examples/ci",
-            "steps": ["pytest -q"],
-        }
-    }
-
-    patch_module.run_command.return_value = 0
-    run_test_plan("customwd", "cpu", tests_map)
-
-    assert patch_module.workdir_calls == ["examples/ci"]
--- a/.ci/lumen_cli/tests/test_utils.py
+++ b/.ci/lumen_cli/tests/test_utils.py
@ -1,143 +0,0 @@
-import os
-import tempfile
-import unittest
-from pathlib import Path
-
-from cli.lib.common.utils import temp_environ, working_directory  # <-- replace import
-
-
-class EnvIsolatedTestCase(unittest.TestCase):
-    """Base class that snapshots os.environ and CWD for isolation."""
-
-    def setUp(self):
-        import os
-        import tempfile
-
-        self._env_backup = dict(os.environ)
-
-        # Snapshot/repair CWD if it's gone
-        try:
-            self._cwd_backup = os.getcwd()
-        except FileNotFoundError:
-            # If CWD no longer exists, switch to a safe place and record that
-            self._cwd_backup = tempfile.gettempdir()
-            os.chdir(self._cwd_backup)
-
-        # Create a temporary directory for the test to run in
-        self._temp_dir = tempfile.mkdtemp()
-        os.chdir(self._temp_dir)
-
-    def tearDown(self):
-        import os
-        import shutil
-        import tempfile
-
-        # Restore cwd first (before cleaning up temp dir)
-        try:
-            os.chdir(self._cwd_backup)
-        except OSError:
-            os.chdir(tempfile.gettempdir())
-
-        # Clean up temporary directory
-        try:
-            shutil.rmtree(self._temp_dir, ignore_errors=True)
-        except Exception:
-            pass  # Ignore cleanup errors
-
-        # Restore env
-        to_del = set(os.environ.keys()) - set(self._env_backup.keys())
-        for k in to_del:
-            os.environ.pop(k, None)
-        for k, v in self._env_backup.items():
-            os.environ[k] = v
-
-
-class TestTempEnviron(EnvIsolatedTestCase):
-    def test_sets_and_restores_new_var(self):
-        var = "TEST_TMP_ENV_NEW"
-        self.assertNotIn(var, os.environ)
-
-        with temp_environ({var: "123"}):
-            self.assertEqual(os.environ[var], "123")
-
-        self.assertNotIn(var, os.environ)  # removed after exit
-
-    def test_overwrites_and_restores_existing_var(self):
-        var = "TEST_TMP_ENV_OVERWRITE"
-        os.environ[var] = "orig"
-
-        with temp_environ({var: "override"}):
-            self.assertEqual(os.environ[var], "override")
-
-        self.assertEqual(os.environ[var], "orig")  # restored
-
-    def test_multiple_vars_and_missing_cleanup(self):
-        v1, v2 = "TEST_ENV_V1", "TEST_ENV_V2"
-        os.environ.pop(v1, None)
-        os.environ[v2] = "keep"
-
-        with temp_environ({v1: "a", v2: "b"}):
-            self.assertEqual(os.environ[v1], "a")
-            self.assertEqual(os.environ[v2], "b")
-
-        self.assertNotIn(v1, os.environ)  # newly-added -> removed
-        self.assertEqual(os.environ[v2], "keep")  # pre-existing -> restored
-
-    def test_restores_even_on_exception(self):
-        var = "TEST_TMP_ENV_EXCEPTION"
-        self.assertNotIn(var, os.environ)
-
-        with self.assertRaises(RuntimeError):
-            with temp_environ({var: "x"}):
-                self.assertEqual(os.environ[var], "x")
-                raise RuntimeError("boom")
-
-        self.assertNotIn(var, os.environ)  # removed after exception
-
-
-class TestWorkingDirectory(EnvIsolatedTestCase):
-    def test_changes_and_restores(self):
-        start = Path.cwd()
-        with tempfile.TemporaryDirectory() as td:
-            target = Path(td) / "wd"
-            target.mkdir()
-
-            with working_directory(str(target)):
-                self.assertEqual(Path.cwd().resolve(), target.resolve())
-
-        self.assertEqual(Path.cwd(), start)
-
-    def test_noop_when_empty_path(self):
-        start = Path.cwd()
-        with working_directory(""):
-            self.assertEqual(Path.cwd(), start)
-        self.assertEqual(Path.cwd(), start)
-
-    def test_restores_on_exception(self):
-        start = Path.cwd()
-
-        with tempfile.TemporaryDirectory() as td:
-            target = Path(td) / "wd_exc"
-            target.mkdir()
-
-            with self.assertRaises(ValueError):
-                with working_directory(str(target)):
-                    # Normalize both sides to handle /var -> /private/var
-                    self.assertEqual(Path.cwd().resolve(), target.resolve())
-                    raise ValueError("boom")
-
-        self.assertEqual(Path.cwd().resolve(), start.resolve())
-
-    def test_raises_for_missing_dir(self):
-        start = Path.cwd()
-        with tempfile.TemporaryDirectory() as td:
-            missing = Path(td) / "does_not_exist"
-            with self.assertRaises(FileNotFoundError):
-                # os.chdir should raise before yielding
-                with working_directory(str(missing)):
-                    pass
-        self.assertEqual(Path.cwd(), start)
-
-
-if __name__ == "__main__":
-    unittest.main(verbosity=2)
--- a/.ci/lumen_cli/tests/test_vllm.py
+++ b/.ci/lumen_cli/tests/test_vllm.py
@ -4,15 +4,12 @@ import unittest
 from pathlib import Path
 from unittest.mock import MagicMock, patch

-import cli.lib.core.vllm.vllm_build as vllm_build
-
-
-_VLLM_BUILD_MODULE = "cli.lib.core.vllm.vllm_build"
+import cli.lib.core.vllm as vllm


 class TestVllmBuildParameters(unittest.TestCase):
-    @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=True)
-    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=True)
+    @patch("cli.lib.core.vllm.local_image_exists", return_value=True)
+    @patch("cli.lib.core.vllm.is_path_exist", return_value=True)
    @patch(
        "cli.lib.common.envs_helper.env_path_optional",
        side_effect=lambda name, default=None, resolve=True: {
@ -37,13 +34,13 @@ class TestVllmBuildParameters(unittest.TestCase):
    def test_params_success_normalizes_and_validates(
        self, mock_env_path, mock_is_path, mock_local_img
    ):
-        params = vllm_build.VllmBuildParameters()
+        params = vllm.VllmBuildParameters()
        self.assertEqual(params.torch_whls_path, Path("/abs/dist"))
        self.assertEqual(params.dockerfile_path, Path("/abs/vllm/Dockerfile"))
        self.assertEqual(params.output_dir, Path("/abs/shared"))
        self.assertEqual(params.base_image, "my/image:tag")

-    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False)
+    @patch("cli.lib.core.vllm.is_path_exist", return_value=False)
    @patch.dict(
        os.environ, {"USE_TORCH_WHEEL": "1", "TORCH_WHEELS_PATH": "dist"}, clear=True
    )
@ -51,14 +48,14 @@ class TestVllmBuildParameters(unittest.TestCase):
        with tempfile.TemporaryDirectory() as td:
            os.chdir(td)
            with self.assertRaises(ValueError) as cm:
-                vllm_build.VllmBuildParameters(
+                vllm.VllmBuildParameters(
                    use_local_base_image=False,
                    use_local_dockerfile=False,
                )
        err = cm.exception
        self.assertIn("TORCH_WHEELS_PATH", str(err))

-    @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=False)
+    @patch("cli.lib.core.vllm.local_image_exists", return_value=False)
    @patch.dict(
        os.environ, {"USE_LOCAL_BASE_IMAGE": "1", "BASE_IMAGE": "img:tag"}, clear=True
    )
@ -66,14 +63,14 @@ class TestVllmBuildParameters(unittest.TestCase):
        with tempfile.TemporaryDirectory() as td:
            os.chdir(td)
            with self.assertRaises(ValueError) as cm:
-                vllm_build.VllmBuildParameters(
+                vllm.VllmBuildParameters(
                    use_torch_whl=False,
                    use_local_dockerfile=False,
                )
        err = cm.exception
        self.assertIn("BASE_IMAGE", str(err))

-    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False)
+    @patch("cli.lib.core.vllm.is_path_exist", return_value=False)
    @patch.dict(
        os.environ,
        {"USE_LOCAL_DOCKERFILE": "1", "DOCKERFILE_PATH": "Dockerfile"},
@ -83,14 +80,14 @@ class TestVllmBuildParameters(unittest.TestCase):
        with tempfile.TemporaryDirectory() as td:
            os.chdir(td)
            with self.assertRaises(ValueError) as cm:
-                vllm_build.VllmBuildParameters(
+                vllm.VllmBuildParameters(
                    use_torch_whl=False,
                    use_local_base_image=False,
                )
        err = cm.exception
        self.assertIn("DOCKERFILE_PATH", str(err))

-    @patch(f"{_VLLM_BUILD_MODULE}.is_path_exist", return_value=False)
+    @patch("cli.lib.core.vllm.is_path_exist", return_value=False)
    @patch.dict(
        os.environ,
        {"OUTPUT_DIR": ""},
@ -98,13 +95,14 @@ class TestVllmBuildParameters(unittest.TestCase):
    )
    def test_params_missing_output_dir(self, _is_path):
        with self.assertRaises(FileNotFoundError):
-            vllm_build.VllmBuildParameters()
+            vllm.VllmBuildParameters()


 class TestBuildCmdAndRun(unittest.TestCase):
-    @patch(f"{_VLLM_BUILD_MODULE}.local_image_exists", return_value=True)
+    @patch("cli.lib.core.vllm.local_image_exists", return_value=True)
    def test_generate_docker_build_cmd_includes_bits(self, _exists):
-        runner = vllm_build.VllmBuildRunner()
+        runner = vllm.VllmBuildRunner()
+        # Craft inputs that simulate a prepared build
        inputs = MagicMock()
        inputs.output_dir = Path("/abs/out")
        inputs.use_local_base_image = True
@ -120,7 +118,7 @@ class TestBuildCmdAndRun(unittest.TestCase):
        inputs.tag_name = "vllm-wheels"

        cmd = runner._generate_docker_build_cmd(inputs)
-        squashed = " ".join(cmd.split())
+        squashed = " ".join(cmd.split())  # normalize whitespace for matching

        self.assertIn("--output type=local,dest=/abs/out", squashed)
        self.assertIn("-f docker/Dockerfile.nightly_torch", squashed)
@ -138,17 +136,18 @@ class TestBuildCmdAndRun(unittest.TestCase):
        self.assertIn("--target export-wheels", squashed)
        self.assertIn("-t vllm-wheels", squashed)

-    @patch(f"{_VLLM_BUILD_MODULE}.run_command")
-    @patch(f"{_VLLM_BUILD_MODULE}.ensure_dir_exists")
-    @patch(f"{_VLLM_BUILD_MODULE}.clone_vllm")
+    @patch("cli.lib.core.vllm.run_command")
+    @patch("cli.lib.core.vllm.ensure_dir_exists")
+    @patch("cli.lib.core.vllm.clone_vllm")
    @patch.object(
-        vllm_build.VllmBuildRunner,
+        vllm.VllmBuildRunner,
        "_generate_docker_build_cmd",
        return_value="docker buildx ...",
    )
    @patch.dict(
        os.environ,
        {
+            # Make __post_init__ validations pass cheaply
            "USE_TORCH_WHEEL": "0",
            "USE_LOCAL_BASE_IMAGE": "0",
            "USE_LOCAL_DOCKERFILE": "0",
@ -159,18 +158,24 @@ class TestBuildCmdAndRun(unittest.TestCase):
    def test_run_calls_clone_prepare_and_build(
        self, mock_gen, mock_clone, mock_ensure, mock_run
    ):
+        # Stub parameters instance so we avoid FS/Docker accesses in run()
        params = MagicMock()
        params.output_dir = Path("shared")
        params.use_local_dockerfile = False
        params.use_torch_whl = False

-        with patch(f"{_VLLM_BUILD_MODULE}.VllmBuildParameters", return_value=params):
-            runner = vllm_build.VllmBuildRunner()
+        with patch("cli.lib.core.vllm.VllmBuildParameters", return_value=params):
+            runner = vllm.VllmBuildRunner()
            runner.run()

        mock_clone.assert_called_once()
        mock_ensure.assert_called_once_with(Path("shared"))
        mock_gen.assert_called_once_with(params)
        mock_run.assert_called_once()
+        # ensure we run in vllm workdir
        _, kwargs = mock_run.call_args
        assert kwargs.get("cwd") == "vllm"
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/.ci/magma/Makefile
+++ b/.ci/magma/Makefile
@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	magma/build_magma.sh

 .PHONY: all
-all: magma-cuda130
 all: magma-cuda129
 all: magma-cuda128
 all: magma-cuda126
@ -26,12 +25,6 @@ clean:
 	$(RM) -r magma-*
 	$(RM) -r output

-.PHONY: magma-cuda130
-magma-cuda130: DESIRED_CUDA := 13.0
-magma-cuda130: CUDA_ARCH_LIST := -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
-magma-cuda130:
-	$(DOCKER_RUN)
-
 .PHONY: magma-cuda129
 magma-cuda129: DESIRED_CUDA := 12.9
 magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
--- a/.ci/magma/build_magma.sh
+++ b/.ci/magma/build_magma.sh
@ -28,7 +28,6 @@ pushd ${PACKAGE_DIR}/magma-${MAGMA_VERSION}
 patch < ${PACKAGE_FILES}/CMake.patch
 patch < ${PACKAGE_FILES}/cmakelists.patch
 patch -p0 < ${PACKAGE_FILES}/thread_queue.patch
-patch -p1 < ${PACKAGE_FILES}/cuda13.patch
 patch -p1 < ${PACKAGE_FILES}/getrf_shfl.patch
 patch -p1 < ${PACKAGE_FILES}/getrf_nbparam.patch
 # The build.sh script expects to be executed from the sources root folder
@ -38,7 +37,6 @@ popd
 # Package recipe, license and tarball
 # Folder and package name are backward compatible for the build workflow
 cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh
-cp ${PACKAGE_FILES}/cuda13.patch ${PACKAGE_RECIPE}/cuda13.patch
 cp ${PACKAGE_FILES}/thread_queue.patch ${PACKAGE_RECIPE}/thread_queue.patch
 cp ${PACKAGE_FILES}/cmakelists.patch ${PACKAGE_RECIPE}/cmakelists.patch
 cp ${PACKAGE_FILES}/getrf_shfl.patch ${PACKAGE_RECIPE}/getrf_shfl.patch
--- a/.ci/magma/package_files/cuda13.patch
+++ b/.ci/magma/package_files/cuda13.patch
@ -1,26 +0,0 @@
-diff --git a/interface_cuda/interface.cpp b/interface_cuda/interface.cpp
-index 73fed1b20..e77519bfe 100644
--- a/interface_cuda/interface.cpp
-+++ b/interface_cuda/interface.cpp
-@@ -438,14 +438,20 @@ magma_print_environment()
-         cudaDeviceProp prop;
-         err = cudaGetDeviceProperties( &prop, dev );
-         check_error( err );
-+        #ifdef MAGMA_HAVE_CUDA
-+#if CUDA_VERSION < 13000
-         printf( "%% device %d: %s, %.1f MHz clock, %.1f MiB memory, capability %d.%d\n",
-                 dev,
-                 prop.name,
-                 prop.clockRate / 1000.,
-+#else
-+        printf( "%% device %d: %s, ??? MHz clock, %.1f MiB memory, capability %d.%d\n",
-+                dev,
-+                prop.name,
-+#endif
-                 prop.totalGlobalMem / (1024.*1024.),
-                 prop.major,
-                 prop.minor );
-        #ifdef MAGMA_HAVE_CUDA
-         int arch = prop.major*100 + prop.minor*10;
-         if ( arch < MAGMA_CUDA_ARCH_MIN ) {
-             printf("\n"
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -66,9 +66,6 @@ case ${CUDA_VERSION} in
            TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
        fi
        ;;
-    13.0)
-        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
-        ;;
    12.6)
        TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
        ;;
@ -113,15 +110,11 @@ DEPS_SONAME=(
 )


-# CUDA_VERSION 12.*, 13.*
-if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
+# CUDA_VERSION 12.6, 12.8, 12.9
+if [[ $CUDA_VERSION == 12* ]]; then
    export USE_STATIC_CUDNN=0
    # Try parallelizing nvcc as well
-    TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
-    # Compress the fatbin with -compress-mode=size for CUDA 13
-    if [[ $CUDA_VERSION == 13* ]]; then
-        export TORCH_NVCC_FLAGS="$TORCH_NVCC_FLAGS -compress-mode=size"
-    fi
+    export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
        echo "Bundling with cudnn and cublas."
        DEPS_LIST+=(
@ -141,7 +134,7 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
            "/usr/local/cuda/lib64/libnvrtc-builtins.so"
            "/usr/local/cuda/lib64/libcufile.so.0"
            "/usr/local/cuda/lib64/libcufile_rdma.so.1"
-            "/usr/local/cuda/lib64/libnvshmem_host.so.3"
+            "/usr/local/cuda/lib64/libnvshem_host.so.3"
            "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12"
            "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so"
        )
@ -174,29 +167,22 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
    else
        echo "Using nvidia libs from pypi."
        CUDA_RPATHS=(
+            '$ORIGIN/../../nvidia/cublas/lib'
+            '$ORIGIN/../../nvidia/cuda_cupti/lib'
+            '$ORIGIN/../../nvidia/cuda_nvrtc/lib'
+            '$ORIGIN/../../nvidia/cuda_runtime/lib'
            '$ORIGIN/../../nvidia/cudnn/lib'
-            '$ORIGIN/../../nvidia/nvshmem/lib'
-            '$ORIGIN/../../nvidia/nccl/lib'
+            '$ORIGIN/../../nvidia/cufft/lib'
+            '$ORIGIN/../../nvidia/curand/lib'
+            '$ORIGIN/../../nvidia/cusolver/lib'
+            '$ORIGIN/../../nvidia/cusparse/lib'
            '$ORIGIN/../../nvidia/cusparselt/lib'
+            '$ORIGIN/../../cusparselt/lib'
+            '$ORIGIN/../../nvidia/nccl/lib'
+            '$ORIGIN/../../nvidia/nvshmem/lib'
+            '$ORIGIN/../../nvidia/nvtx/lib'
+            '$ORIGIN/../../nvidia/cufile/lib'
        )
-        if [[ $CUDA_VERSION == 13* ]]; then
-            CUDA_RPATHS+=('$ORIGIN/../../nvidia/cu13/lib')
-        else
-            CUDA_RPATHS+=(
-                '$ORIGIN/../../nvidia/cublas/lib'
-                '$ORIGIN/../../nvidia/cuda_cupti/lib'
-                '$ORIGIN/../../nvidia/cuda_nvrtc/lib'
-                '$ORIGIN/../../nvidia/cuda_runtime/lib'
-                '$ORIGIN/../../nvidia/cufft/lib'
-                '$ORIGIN/../../nvidia/curand/lib'
-                '$ORIGIN/../../nvidia/cusolver/lib'
-                '$ORIGIN/../../nvidia/cusparse/lib'
-                '$ORIGIN/../../cusparselt/lib'
-                '$ORIGIN/../../nvidia/nvtx/lib'
-                '$ORIGIN/../../nvidia/cufile/lib'
-            )
-        fi
-
        CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
        export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
        export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
--- a/.ci/manywheel/build_xpu.sh
+++ b/.ci/manywheel/build_xpu.sh
@ -25,7 +25,6 @@ source /opt/intel/oneapi/mpi/latest/env/vars.sh
 export USE_STATIC_MKL=1
 export USE_ONEMKL=1
 export USE_XCCL=1
-export USE_MPI=0

 WHEELHOUSE_DIR="wheelhousexpu"
 LIBTORCH_HOUSE_DIR="libtorch_housexpu"
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -173,7 +173,6 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  source /opt/intel/oneapi/mpi/latest/env/vars.sh
  # Enable XCCL build
  export USE_XCCL=1
-  export USE_MPI=0
  # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
  export USE_KINETO=0
  export TORCH_XPU_ARCH_LIST=pvc
@ -195,16 +194,8 @@ fi

 # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
 # memory to build and will OOM
-
 if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && echo "${TORCH_CUDA_ARCH_LIST}" | tr ' ' '\n' | sed 's/$/>= 8.0/' | bc | grep -q 1; then
-  J=2  # default to 2 jobs
-  case "$RUNNER" in
-    linux.12xlarge.memory|linux.24xlarge.memory)
-      J=24
-      ;;
-  esac
-  echo "Building FlashAttention with job limit $J"
-  export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j ${J}"
+  export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2"
 fi

 if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -152,10 +152,6 @@ function get_pinned_commit() {
 function install_torchaudio() {
  local commit
  commit=$(get_pinned_commit audio)
-  if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]] && command -v nvidia-smi; then
-    TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1)
-    export TORCH_CUDA_ARCH_LIST
-  fi
  pip_build_and_install "git+https://github.com/pytorch/audio.git@${commit}" dist/audio
 }

--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -174,15 +174,13 @@ checkout_install_torchbench() {
    # to install and test other models
    python install.py --continue_on_fail
  fi
-  popd

-  pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt
-  # https://github.com/pytorch/pytorch/issues/160689 to remove torchao because
-  # its current version 0.12.0 doesn't work with transformers 4.54.0
-  pip uninstall -y torchao
+  # soxr comes from https://github.com/huggingface/transformers/pull/39429
+  pip install transformers==4.54.0 soxr==0.5.0

  echo "Print all dependencies after TorchBench is installed"
  python -mpip freeze
+  popd
 }

 torchbench_setup_macos() {
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -1629,14 +1629,6 @@ elif [[ "${TEST_CONFIG}" == *xla* ]]; then
  install_torchvision
  build_xla
  test_xla
-elif [[ "$TEST_CONFIG" == *vllm* ]]; then
-    if [[ "${BUILD_ENVIRONMENT}" == *cuda* ]]; then
-      TORCH_CUDA_ARCH_LIST=$(nvidia-smi --query-gpu=compute_cap --format=csv | tail -n 1)
-      export TORCH_CUDA_ARCH_LIST
-    fi
-    echo "VLLM CI TORCH_CUDA_ARCH_LIST: $TORCH_CUDA_ARCH_LIST"
-    (cd .ci/lumen_cli && python -m pip install -e .)
-    python -m cli.run test external vllm --test-plan "$TEST_CONFIG" --shard-id "$SHARD_NUMBER" --num-shards "$NUM_TEST_SHARDS"
 elif [[ "${TEST_CONFIG}" == *executorch* ]]; then
  test_executorch
 elif [[ "$TEST_CONFIG" == 'jit_legacy' ]]; then
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -55,9 +55,6 @@ python -m pip install pulp==2.9.0
 # Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
 python -m pip install expecttest==0.3.0

-# Install intel-openmp
-python -m pip install intel-openmp==2025.1.1
-
 run_tests() {
    # Run nvidia-smi if available
    for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
--- a/.ci/pytorch/windows/cuda126.bat
+++ b/.ci/pytorch/windows/cuda126.bat
@ -37,7 +37,7 @@ IF "%CUDA_PATH_V126%"=="" (
 )

 IF "%BUILD_VISION%" == "" (
-    set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0
+    set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0
    set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
 ) ELSE (
    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90
--- a/.ci/pytorch/windows/cuda130.bat
+++ b/.ci/pytorch/windows/cuda130.bat
@ -1,59 +0,0 @@
-@echo off
-
-set MODULE_NAME=pytorch
-
-IF NOT EXIST "setup.py" IF NOT EXIST "%MODULE_NAME%" (
-    call internal\clone.bat
-    cd %~dp0
-) ELSE (
-    call internal\clean.bat
-)
-IF ERRORLEVEL 1 goto :eof
-
-call internal\check_deps.bat
-IF ERRORLEVEL 1 goto :eof
-
-REM Check for optional components
-
-set USE_CUDA=
-set CMAKE_GENERATOR=Visual Studio 15 2017 Win64
-
-IF "%NVTOOLSEXT_PATH%"=="" (
-    IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib"  (
-        set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
-    ) ELSE (
-        echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing
-        exit /b 1
-    )
-)
-
-IF "%CUDA_PATH_V130%"=="" (
-    IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0\bin\nvcc.exe" (
-        set "CUDA_PATH_V130=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.0"
-    ) ELSE (
-        echo CUDA 13.0 not found, failing
-        exit /b 1
-    )
-)
-
-IF "%BUILD_VISION%" == "" (
-    set TORCH_CUDA_ARCH_LIST=7.5;8.0;8.6;9.0;10.0;12.0
-    set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
-) ELSE (
-    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
-)
-
-set "CUDA_PATH=%CUDA_PATH_V130%"
-set "PATH=%CUDA_PATH_V130%\bin;%PATH%"
-
-:optcheck
-
-call internal\check_opts.bat
-IF ERRORLEVEL 1 goto :eof
-
-if exist "%NIGHTLIES_PYTORCH_ROOT%" cd %NIGHTLIES_PYTORCH_ROOT%\..
-call  %~dp0\internal\copy.bat
-IF ERRORLEVEL 1 goto :eof
-
-call  %~dp0\internal\setup.bat
-IF ERRORLEVEL 1 goto :eof
--- a/.ci/pytorch/windows/internal/cuda_install.bat
+++ b/.ci/pytorch/windows/internal/cuda_install.bat
@ -26,7 +26,6 @@ if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%
 if %CUDA_VER% EQU 126 goto cuda126
 if %CUDA_VER% EQU 128 goto cuda128
 if %CUDA_VER% EQU 129 goto cuda129
-if %CUDA_VER% EQU 130 goto cuda130

 echo CUDA %CUDA_VERSION_STR% is not supported
 exit /b 1
@ -114,33 +113,6 @@ xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"

 goto cuda_common

-:cuda130
-
-set CUDA_INSTALL_EXE=cuda_13.0.0_windows.exe
-if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" (
-    curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" & REM @lint-ignore
-    if errorlevel 1 exit /b 1
-    set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
-    set "ARGS="
-)
-
-set CUDNN_FOLDER=cudnn-windows-x86_64-9.12.0.46_cuda13-archive
-set CUDNN_LIB_FOLDER="lib"
-set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
-if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" (
-    curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" & REM @lint-ignore
-    if errorlevel 1 exit /b 1
-    set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
-)
-
-@REM cuDNN 8.3+ required zlib to be installed on the path
-echo Installing ZLIB dlls
-curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
-7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib"
-xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"
-
-goto cuda_common
-
 :cuda_common
 :: NOTE: We only install CUDA if we don't have it installed already.
 :: With GHA runners these should be pre-installed as part of our AMI process
--- a/.ci/pytorch/windows/internal/install_python.bat
+++ b/.ci/pytorch/windows/internal/install_python.bat
@ -1,22 +1,12 @@
 set ADDITIONAL_OPTIONS=""
 set PYTHON_EXEC="python"
-
-
 if "%DESIRED_PYTHON%" == "3.13t" (
    echo Python version is set to 3.13t
    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
    set ADDITIONAL_OPTIONS="Include_freethreaded=1"
    set PYTHON_EXEC="python3.13t"
-) else if "%DESIRED_PYTHON%"=="3.14" (
-    echo Python version is set to 3.14 or 3.14t
-    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
-) else if "%DESIRED_PYTHON%"=="3.14t" (
-    echo Python version is set to 3.14 or 3.14t
-    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
-    set ADDITIONAL_OPTIONS="Include_freethreaded=1"
-    set PYTHON_EXEC="python3.14t"
 ) else (
-    echo Python version is set to %DESIRED_PYTHON%
+    echo DESIRED_PYTHON not defined, Python version is set to %DESIRED_PYTHON%
    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/%DESIRED_PYTHON%.0/python-%DESIRED_PYTHON%.0-amd64.exe" %= @lint-ignore =%
 )

--- a/.ci/pytorch/windows/setup_build.bat
+++ b/.ci/pytorch/windows/setup_build.bat
@ -7,8 +7,6 @@ call "internal\install_python.bat"

 %PYTHON_EXEC% --version
 set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%"
-if "%DESIRED_PYTHON%" == "3.14t" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake
-if "%DESIRED_PYTHON%" == "3.14" %PYTHON_EXEC% -m pip install numpy==2.3.2 cmake
 if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake
 if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake
 if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake
--- a/.ci/wheel/build_wheel.sh
+++ b/.ci/wheel/build_wheel.sh
@ -133,25 +133,6 @@ EXTRA_CONDA_INSTALL_FLAGS=""
 CONDA_ENV_CREATE_FLAGS=""
 RENAME_WHEEL=true
 case $desired_python in
-    3.14t)
-        echo "Using 3.14 deps"
-        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
-        PYYAML_PINNED_VERSION=">=6.0.1"
-        NUMPY_PINNED_VERSION="=2.1.0"
-        CONDA_ENV_CREATE_FLAGS="python-freethreading"
-        EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-        desired_python="3.14.0rc1"
-        RENAME_WHEEL=false
-        ;;
-    3.14)
-        echo "Using 3.14t deps"
-        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
-        PYYAML_PINNED_VERSION=">=6.0.1"
-        NUMPY_PINNED_VERSION="=2.1.0"
-        EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-        desired_python="3.14.0rc1"
-        RENAME_WHEEL=false
-        ;;
    3.13t)
        echo "Using 3.13 deps"
        SETUPTOOLS_PINNED_VERSION=">=70.1.0"
--- a/.circleci/scripts/binary_upload.sh
+++ b/.circleci/scripts/binary_upload.sh
@ -51,12 +51,16 @@ s3_upload() {
    s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/"
  fi
  (
+    cache_control_flag=""
+    if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then
+      cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'"
+    fi
    for pkg in ${PKG_DIR}/*.${extension}; do
      (
        set -x
        shm_id=$(sha256sum "${pkg}" | awk '{print $1}')
        ${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \
-          --metadata "checksum-sha256=${shm_id}"
+          --metadata "checksum-sha256=${shm_id}" ${cache_control_flag}
      )
    done
  )
--- a/.flake8
+++ b/.flake8
@ -48,7 +48,6 @@ per-file-ignores =
    torch/__init__.py: F401,TOR901
    torch/_custom_op/impl.py: TOR901
    torch/_export/serde/upgrade.py: TOR901
-    torch/_functorch/predispatch.py: TOR901
    torch/_functorch/vmap.py: TOR901
    torch/_inductor/test_operators.py: TOR901
    torch/_library/abstract_impl.py: TOR901
--- a/.github/actions/build-external-packages/action.yml
+++ b/.github/actions/build-external-packages/action.yml
@ -1,80 +0,0 @@
-# .github/workflows/build-external.yml
-name: Build External packages
-
-description: build external packages for PyTorch
-
-inputs:
-  cuda-arch-list:
-    description: TORCH_CUDA_ARCH_LIST (e.g., "8.0;8.9;9.0")
-    type: string
-    required: true
-    default: ""
-  docker-image:
-    description: Base image to use
-    type: string
-    required: true
-  build-targets:
-    description: Build targets
-    type: string
-    required: true
-  torch-wheel-dir:
-    description: Directory to built torch wheel
-    type: string
-    required: false
-    default: dist
-  output-dir:
-    description: Directory to store build artifact
-    default: external
-    type: string
-    required: false
-
-outputs:
-  build_time:
-    description: "Total build time in seconds"
-    value: ${{ steps.build-external.outputs.build_time }}
-  output_dir:
-    description: "Directory where build artifact is stored"
-    value: ${{ steps.build-external.outputs.output_dir }}
-
-runs:
-  using: composite
-  steps:
-    - name: Build external packages in sequence
-      id: build-external
-      env:
-        SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
-        SCCACHE_REGION: us-east-1
-        TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
-        BASE_IMAGE: ${{ inputs.docker-image }}
-        BUILD_TARGETS: ${{ inputs.build-targets }}
-        PARENT_OUTPUT_DIR: ${{ inputs.output-dir}}
-      shell: bash
-      run: |
-        set -euo pipefail
-        python3 --version
-        docker images
-        START_TIME=$(date +%s)
-        (
-          cd .ci/lumen_cli
-          python3 -m pip install -e .
-        )
-        MAX_JOBS="$(nproc --ignore=6)"
-        export MAX_JOBS
-
-        # Split the comma-separated list and build each target
-        IFS=',' read -ra TARGETS <<< "$BUILD_TARGETS"
-        for target in "${TARGETS[@]}"; do
-          OUTPUT_DIR="$PARENT_OUTPUT_DIR/$target"
-          export OUTPUT_DIR
-          echo "Building external package: $target in directory $OUTPUT_DIR"
-          python3 -m cli.run build external "$target"
-
-        done
-
-        END_TIME=$(date +%s)
-        {
-          echo "build_time=$((END_TIME - START_TIME))"
-          if [ -d "$PARENT_OUTPUT_DIR" ]; then
-            echo "output_dir=$PARENT_OUTPUT_DIR"
-          fi
-        } >> "$GITHUB_OUTPUT"
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-10a5002c6195bd95e34df8fe28ff8a2d55a2a922
+bdb88e1d66f272cad72156c90ac8428ca61a601c
--- a/.github/ci_commit_pins/vllm.txt
+++ b/.github/ci_commit_pins/vllm.txt
@ -1 +1 @@
-add1adfec742dfb13e614dab3372b5aafd1ff046
+e18859298d109870b22cb5b8672d1078818e268d
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-a1c6ee92c85e8b0955c20892ed68f032a6015c09
+095faec1e7b6cc47220181e74ae9cde2605f9b00
--- a/.github/ci_configs/vllm/Dockerfile.tmp_vllm
+++ b/.github/ci_configs/vllm/Dockerfile.tmp_vllm
@ -67,8 +67,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \

 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
-ENV UV_LINK_MODE=copy

 #################### TORCH NIGHTLY  BASE IMAGE ####################

@ -92,8 +90,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    fi
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
-ENV UV_LINK_MODE=copy

 WORKDIR /workspace

@ -116,7 +112,6 @@ ARG PINNED_TORCH_VERSION
 RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
    --mount=type=cache,target=/root/.cache/uv \
    if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
-        echo "[INFO] Installing torch wheels to build vllm"; \
        torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \
        vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \
        audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \
@ -124,10 +119,10 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
        uv pip install --system "${vision_whl}"; \
        uv pip install --system "${audio_whl}"; \
    elif [ -n "$PINNED_TORCH_VERSION" ]; then \
-        echo "[INFO] Installing pinned torch nightly version to build vllm: $PINNED_TORCH_VERSION"; \
+        echo "[INFO] Installing pinned torch nightly version: $PINNED_TORCH_VERSION"; \
        uv pip install --system "$PINNED_TORCH_VERSION" --index-url https://download.pytorch.org/whl/nightly/cu128; \
    else \
-        echo "[INFO] Installing torch nightly with latest one to build vllm"; \
+        echo "[INFO] Installing torch nightly with latest one"; \
        uv pip install --system torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128; \
    fi

@ -141,22 +136,15 @@ uv pip install --system -r requirements/common.txt


 # Must put before installing xformers, so it can install the correct version of xformers.
-ARG exformer_cuda_arch_list='7.5;8.0+PTX;9.0a'
-ENV TORCH_CUDA_ARCH_LIST=${exformer_cuda_arch_list}
-
+ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
+ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 ARG max_jobs=16
 ENV MAX_JOBS=${max_jobs}

-RUN echo ${TORCH_CUDA_ARCH_LIST}
-RUN echo ${MAX_JOBS}
-RUN pip freeze | grep -E 'ninja'
-
 # Build xformers with cuda and torch nightly/wheel
 # following official xformers guidance: https://github.com/facebookresearch/xformers#build
-# sha for https://github.com/facebookresearch/xformers/tree/v0.0.31
-ARG XFORMERS_COMMIT=eb0946a363464da96ea40afd1a7f72a907c25497
+ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c
 ENV CCACHE_DIR=/root/.cache/ccache
-
 RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
    echo 'git clone xformers...' \
@ -169,14 +157,14 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
    && python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
    && cd .. \
    && rm -rf xformers
-
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system xformers-dist/*.whl --verbose

 # The build can take a long time, and the torch nightly version fetched from the URL may differ in the next docker stage.
 # Track the nightly torch version used in the build so that, when we set up the runtime environment, we can make sure the version is the same.
 RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
-RUN cat torch_build_versions.txt
+RUN cat  torch_build_versions.txt
+
 RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'

 #################### BASE BUILD IMAGE ####################
@ -189,8 +177,6 @@ ARG TARGETPLATFORM

 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
-ENV UV_LINK_MODE=copy

 COPY . .

@ -206,7 +192,7 @@ RUN --mount=type=bind,source=.git,target=.git \
 # Max jobs used by Ninja to build extensions
 ARG max_jobs=16
 ENV MAX_JOBS=${max_jobs}
-ARG nvcc_threads=4
+ARG nvcc_threads=2
 ENV NVCC_THREADS=$nvcc_threads
 ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
@ -230,14 +216,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
        && export SCCACHE_S3_NO_CREDENTIALS=${SCCACHE_S3_NO_CREDENTIALS} \
        && export SCCACHE_IDLE_TIMEOUT=0 \
        && export CMAKE_BUILD_TYPE=Release \
-        && export VLLM_DOCKER_BUILD_CONTEXT=1 \
        && sccache --show-stats \
        && python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38 \
        && sccache --show-stats; \
    fi

-ARG vllm_target_device="cuda"
-ENV VLLM_TARGET_DEVICE=${vllm_target_device}
 ENV CCACHE_DIR=/root/.cache/ccache
 RUN --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=cache,target=/root/.cache/uv \
@ -246,7 +229,6 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
        # Clean any existing CMake artifacts
        rm -rf .deps && \
        mkdir -p .deps && \
-        export VLLM_DOCKER_BUILD_CONTEXT=1 && \
        python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \
    fi

@ -314,8 +296,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    fi
 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
-ENV UV_LINK_MODE=copy

 # Default mount file as a placeholder; this just avoids the mount error
 ARG TORCH_WHEELS_PATH="./requirements"
@ -328,7 +308,7 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
        torch_whl=$(find /dist -maxdepth 1 -name 'torch-*.whl' -print -quit); \
        vision_whl=$(find /dist/vision -name 'torchvision*.whl' | head -n1 | xargs); \
        audio_whl=$(find /dist/audio -name 'torchaudio*.whl' | head -n1 | xargs); \
-        echo "[INFO] Use wheels to build : '${torch_whl}' '${audio_whl}' '${vision_whl}'"; \
+        echo "Found: '${torch_whl}' '${audio_whl}' '${vision_whl}'"; \
        uv pip install --system "${torch_whl}[opt-einsum]"; \
        uv pip install --system "${vision_whl}"; \
        uv pip install --system "${audio_whl}"; \
@ -384,8 +364,6 @@ FROM vllm-base as test

 ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
-# Use copy mode to avoid hardlink failures with Docker cache mounts
-ENV UV_LINK_MODE=copy

 COPY tests/ tests/
 COPY examples examples
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -1,24 +0,0 @@
-version: 2
-updates:
-  # Update to the latest transformers version with dependabot
-  - package-ecosystem: "pip"
-    directory: "/.ci/docker/ci_commit_pins"
-    schedule:
-      interval: "daily"
-    target-branch: "main"
-    allow:
-      - dependency-name: "transformers"
-    ignore:
-      - dependency-name: "*"
-        update-types: ["version-update:semver-patch"]
-    commit-message:
-      prefix: "[Dependabot] Update"
-      include: "scope"
-    labels:
-      - "dependencies"
-      - "open source"
-      - "python"
-      - "topic: not user facing"
-      - "module: ci"
-      - "module: inductor"
-      - "ciflow/inductor"
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -27,7 +27,6 @@ ciflow_push_tags:
 - ciflow/trunk
 - ciflow/unstable
 - ciflow/xpu
- ciflow/vllm
 - ciflow/torchbench
 - ciflow/op-benchmark
 - ciflow/pull
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -16,19 +16,17 @@ from typing import Optional


 # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
-CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
+CUDA_ARCHES = ["12.6", "12.8", "12.9"]
 CUDA_STABLE = "12.8"
 CUDA_ARCHES_FULL_VERSION = {
    "12.6": "12.6.3",
    "12.8": "12.8.1",
    "12.9": "12.9.1",
-    "13.0": "13.0.0",
 }
 CUDA_ARCHES_CUDNN_VERSION = {
    "12.6": "9",
    "12.8": "9",
    "12.9": "9",
-    "13.0": "9",
 }

 # NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
@ -56,7 +54,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -73,7 +71,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -90,28 +88,11 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
-    "13.0": (
-        "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu13==9.12.0.46; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu13==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
-    ),
    "xpu": (
        "intel-cmplr-lib-rt==2025.1.1 | "
        "intel-cmplr-lib-ur==2025.1.1 | "
@ -143,7 +124,9 @@ def get_nccl_wheel_version(arch_version: str) -> str:
    requirements = map(
        str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
    )
-    return next(x for x in requirements if x.startswith("nvidia-nccl")).split("==")[1]
+    return next(x for x in requirements if x.startswith("nvidia-nccl-cu")).split("==")[
+        1
+    ]


 def read_nccl_pin(arch_version: str) -> str:
@ -240,12 +223,8 @@ def generate_libtorch_matrix(
        if os == "linux":
            arches += CUDA_ARCHES
            arches += ROCM_ARCHES
-            if "13.0" in arches:
-                arches.remove("13.0")
        elif os == "windows":
            arches += CUDA_ARCHES
-            if "13.0" in arches:
-                arches.remove("13.0")
    if libtorch_variants is None:
        libtorch_variants = [
            "shared-with-deps",
@ -310,8 +289,6 @@ def generate_wheels_matrix(
            arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
        elif os == "windows":
            arches += CUDA_ARCHES + XPU_ARCHES
-            if "13.0" in arches:
-                arches.remove("13.0")
        elif os == "linux-aarch64":
            # Separate new if as the CPU type is different and
            # uses different build/test scripts
@ -337,8 +314,8 @@ def generate_wheels_matrix(
            # TODO: Enable python 3.13t on cpu-s390x
            if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
                continue
-            # TODO: Enable python 3.14 for rest
-            if os not in ["linux", "linux-aarch64", "macos-arm64", "windows"] and (
+            # TODO: Enable python 3.14 on non linux OSes
+            if os != "linux" and (
                python_version == "3.14" or python_version == "3.14t"
            ):
                continue
@ -346,7 +323,7 @@ def generate_wheels_matrix(
            # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install

            if (
-                arch_version in ["13.0", "12.9", "12.8", "12.6"]
+                arch_version in ["12.9", "12.8", "12.6"]
                and os == "linux"
                or arch_version in CUDA_AARCH64_ARCHES
            ):
@ -379,6 +356,29 @@ def generate_wheels_matrix(
                        ),  # include special case for aarch64 build, remove the -aarch64 postfix
                    }
                )
+                # Special build to use on Colab. Python 3.11 for 12.6 CUDA
+                if python_version == "3.11" and arch_version == CUDA_STABLE:
+                    ret.append(
+                        {
+                            "python_version": python_version,
+                            "gpu_arch_type": gpu_arch_type,
+                            "gpu_arch_version": gpu_arch_version,
+                            "desired_cuda": translate_desired_cuda(
+                                gpu_arch_type, gpu_arch_version
+                            ),
+                            "container_image": WHEEL_CONTAINER_IMAGES[
+                                arch_version
+                            ].split(":")[0],
+                            "container_image_tag_prefix": WHEEL_CONTAINER_IMAGES[
+                                arch_version
+                            ].split(":")[1],
+                            "package_type": package_type,
+                            "pytorch_extra_install_requirements": "",
+                            "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace(  # noqa: B950
+                                ".", "_"
+                            ),
+                        }
+                    )
            else:
                ret.append(
                    {
@ -409,7 +409,6 @@ def generate_wheels_matrix(
    return ret


-validate_nccl_dep_consistency("13.0")
 validate_nccl_dep_consistency("12.9")
 validate_nccl_dep_consistency("12.8")
 validate_nccl_dep_consistency("12.6")
--- a/.github/scripts/windows/build_magma.bat
+++ b/.github/scripts/windows/build_magma.bat
@ -35,9 +35,6 @@ cd magma
 mkdir build && cd build

 set GPU_TARGET=All
-if "%CUVER_NODOT%" == "130" (
-  set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
-)
 if "%CUVER_NODOT%" == "129" (
  set CUDA_ARCH_LIST=-gencode=arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
 )
--- a/.github/scripts/windows/build_triton.bat
+++ b/.github/scripts/windows/build_triton.bat
@ -1,12 +1,18 @@
@echo on

-set DESIRED_PYTHON=%PY_VERS%
-call .ci/pytorch/windows/internal/install_python.bat
-
+set PYTHON_PREFIX=%PY_VERS:.=%
+set PYTHON_PREFIX=py%PYTHON_PREFIX:;=;py%
+call .ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
+:: Create a new conda environment
+if "%PY_VERS%" == "3.13t" (
+    call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python-freethreading python=3.13
+) else (
+    call conda create -n %PYTHON_PREFIX% -y -c=conda-forge python=%PY_VERS%
+)
 :: Fix cmake version for issue https://github.com/pytorch/pytorch/issues/150480
-%PYTHON_EXEC% -m pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja==1.11.1.4
+call conda run -n %PYTHON_PREFIX% pip install wheel pybind11 certifi cython cmake==3.31.6 setuptools==72.1.0 ninja

 dir "%VC_INSTALL_PATH%"

 call "%VC_INSTALL_PATH%\VC\Auxiliary\Build\vcvarsall.bat" x64
-%PYTHON_EXEC% .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE%
+call conda run -n %PYTHON_PREFIX% python .github/scripts/build_triton_wheel.py --device=%BUILD_DEVICE% %RELEASE%
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@ -114,12 +114,12 @@ jobs:
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      {%- elif config["gpu_arch_type"] == "rocm" %}
      runs_on: linux.rocm.gpu
-      {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.6"] %}
+      {%- elif config["gpu_arch_type"] == "cuda" and config["gpu_arch_version"] in ["12.8", "12.9"] %}
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu  # 12.6 build can use maxwell (sm_50) runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu  # 12.8 and 12.9 build need sm_70+ runner
      {%- elif config["gpu_arch_type"] == "cuda" %}
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner
      {%- else %}
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.4xlarge
--- a/.github/templates/macos_binary_build_workflow.yml.j2
+++ b/.github/templates/macos_binary_build_workflow.yml.j2
@ -110,33 +110,12 @@ jobs:
          # Create new "clean" conda environment for testing

          SMOKE_TEST_PARAMS=""
-
-          EXTRA_CONDA_INSTALL_FLAGS=""
-          CONDA_ENV_CREATE_FLAGS=""
-          # shellcheck disable=SC2153
-          case $DESIRED_PYTHON in
-            3.14t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.14)
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.13t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
-              desired_python="3.13"
-              ;;
-            *)
-              # shellcheck disable=SC2153
-              desired_python=${DESIRED_PYTHON}
-              ;;
-          esac
-
-          # shellcheck disable=SC2086
-          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
+          if [[ $DESIRED_PYTHON == "3.13t" ]]; then
+            conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
+            SMOKE_TEST_PARAMS="--torch-compile-check disabled"
+          else
+            conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
+          fi
          conda activate test_conda_env
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

--- a/.github/templates/upload.yml.j2
+++ b/.github/templates/upload.yml.j2
@ -15,7 +15,7 @@
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: !{{ config["desired_cuda"] }}
 {%- if config["gpu_arch_version"] %}
-      GPU_ARCH_VERSION: "!{{ config["gpu_arch_version"] }}"
+      GPU_ARCH_VERSION: !{{ config["gpu_arch_version"] }}
 {%- endif %}
      GPU_ARCH_TYPE: !{{ config["gpu_arch_type"] }}
 {%- if include_skip_tests %}
--- a/.github/workflows/_binary-test-linux.yml
+++ b/.github/workflows/_binary-test-linux.yml
@ -187,8 +187,6 @@ jobs:

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
-        with:
-          driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }}
        if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}

      - name: configure aws credentials
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@ -96,13 +96,6 @@ on:
        required: false
        type: string
        default: ""
-      build-external-packages:
-        description: |
-          If set, the build external packages and saves their wheels as artifacts
-          use command separated list of packages to build ex: 'vllm,transformers'.
-        required: false
-        type: string
-        default: ""

    secrets:
      HUGGING_FACE_HUB_TOKEN:
@ -128,7 +121,7 @@ jobs:
    # Don't run on forked repos
    if: github.repository_owner == 'pytorch'
    runs-on: ${{ inputs.runner_prefix}}${{ inputs.runner }}
-    timeout-minutes: 480
+    timeout-minutes: 240
    outputs:
      docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
      test-matrix: ${{ steps.filter.outputs.test-matrix }}
@ -269,7 +262,6 @@ jobs:
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          BUILD_ADDITIONAL_PACKAGES: ${{ inputs.build-additional-packages }}
-          RUNNER: ${{ inputs.runner }}
        run: |
          START_TIME=$(date +%s)
          if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
@ -341,7 +333,6 @@ jobs:
            -e HUGGING_FACE_HUB_TOKEN \
            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
            -e BUILD_ADDITIONAL_PACKAGES \
-            -e RUNNER \
            --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
            --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
@ -365,26 +356,6 @@ jobs:
          END_TIME=$(date +%s)
          echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"

-      - name: Build external packages
-        id: build-external-packages
-        if: inputs.build-external-packages != '' &&  steps.build.outcome != 'skipped'
-        uses: ./.github/actions/build-external-packages
-        with:
-          build-targets: ${{ inputs.build-external-packages }}
-          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
-          cuda-arch-list: ${{ inputs.cuda-arch-list }}
-          output-dir: external
-
-      - name: Move external packages to dist
-        if: steps.build-external-packages.outputs.output_dir != '' && steps.build-external-packages.outcome != 'skipped'
-        shell: bash
-        run: |
-          src="${{ steps.build-external-packages.outputs.output_dir }}"
-          if [ -d "$src" ]; then
-            mkdir -p "dist/$(dirname "$src")"
-            mv "$src" "dist/$(dirname "$src")/"
-          fi
-
      - name: Stop monitoring script
        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@ -72,10 +72,6 @@ on:
        required: false
        description: |
          HF Auth token to avoid rate limits when downloading models or datasets from hub
-      VLLM_TEST_HUGGING_FACE_TOKEN:
-        required: false
-        description: |
-          HF Auth token to test vllm
      SCRIBE_GRAPHQL_ACCESS_TOKEN:
        required: false
        description: |
@ -290,7 +286,6 @@ jobs:
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
          PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
          DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
-          VLLM_TEST_HUGGING_FACE_TOKEN: ${{ secrets.VLLM_TEST_HUGGING_FACE_TOKEN }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
@ -367,7 +362,6 @@ jobs:
            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
            -e SKIP_SCCACHE_INITIALIZATION=1 \
            -e HUGGING_FACE_HUB_TOKEN \
-            -e VLLM_TEST_HUGGING_FACE_TOKEN \
            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
            -e DASHBOARD_TAG \
            -e ARTIFACTS_FILE_SUFFIX \
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@ -136,7 +136,7 @@ jobs:
          MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
          MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
        run: |
-          "$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_json==0.6.7
+          "$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_json==0.6.7
          "$VENV_PATH/bin/python3" -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

--- a/.github/workflows/build-almalinux-images.yml
+++ b/.github/workflows/build-almalinux-images.yml
@ -36,7 +36,7 @@ jobs:
    runs-on: linux.9xlarge.ephemeral
    strategy:
      matrix:
-        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"]
+        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "rocm6.3", "rocm6.4", "cpu"]
    steps:
      - name: Build docker image
        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
--- a/.github/workflows/build-libtorch-images.yml
+++ b/.github/workflows/build-libtorch-images.yml
@ -48,7 +48,6 @@ jobs:
      fail-fast: false
      matrix:
        include: [
-          { tag: "cuda13.0" },
          { tag: "cuda12.9" },
          { tag: "cuda12.8" },
          { tag: "cuda12.6" },
--- a/.github/workflows/build-magma-linux.yml
+++ b/.github/workflows/build-magma-linux.yml
@ -34,7 +34,7 @@ jobs:
      id-token: write
    strategy:
      matrix:
-        cuda_version: ["130", "129", "128", "126"]
+        cuda_version: ["129", "128", "126"]
    steps:
      - name: Checkout PyTorch
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
--- a/.github/workflows/build-magma-windows.yml
+++ b/.github/workflows/build-magma-windows.yml
@ -22,7 +22,7 @@ jobs:
    runs-on: windows-2022
    strategy:
      matrix:
-        cuda_version: ["130", "129", "128", "126"]
+        cuda_version: ["129", "128", "126"]
        config: ["Release", "Debug"]
    env:
      CUDA_VERSION: ${{ matrix.cuda_version }}
--- a/.github/workflows/build-manywheel-images.yml
+++ b/.github/workflows/build-manywheel-images.yml
@ -46,11 +46,9 @@ jobs:
      fail-fast: false
      matrix:
        include: [
-          { name: "manylinux2_28-builder",          tag: "cuda13.0",         runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "cuda12.9",         runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "cuda12.8",          runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "cuda12.6",          runner: "linux.9xlarge.ephemeral" },
-          { name: "manylinuxaarch64-builder",       tag: "cuda13.0",          runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinuxaarch64-builder",       tag: "cuda12.9",          runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinuxaarch64-builder",       tag: "cuda12.8",          runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "rocm6.3",           runner: "linux.9xlarge.ephemeral" },
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@ -194,7 +194,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t", "3.14", "3.14t" ]
+        py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13", "3.13t" ]
        device: ["xpu"]
    timeout-minutes: 40
    env:
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@ -57,11 +57,6 @@ jobs:
          echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
      - name: Checkout optional submodules
        run: python3 tools/optional_submodules.py
-      - name: Copy docs requirements for inclusion
-        run: |
-          # Replace symlink with actual file
-          rm docs/requirements.txt || true
-          cp .ci/docker/requirements-docs.txt docs/requirements.txt
      - name: Create source distribution
        run: |
            # Create new folder with specified name so extracting the archive yields that
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -122,7 +122,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_VERSION: 12.9-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
      DOCKER_IMAGE: manylinuxaarch64-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -132,7 +132,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_9-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -148,7 +148,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_VERSION: 12.9-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
      DOCKER_IMAGE: manylinuxaarch64-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -233,7 +233,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_VERSION: 12.9-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
      DOCKER_IMAGE: manylinuxaarch64-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -243,7 +243,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -259,7 +259,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_VERSION: 12.9-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
      DOCKER_IMAGE: manylinuxaarch64-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -344,7 +344,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_VERSION: 12.9-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
      DOCKER_IMAGE: manylinuxaarch64-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -354,7 +354,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -370,7 +370,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_VERSION: 12.9-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
      DOCKER_IMAGE: manylinuxaarch64-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -455,7 +455,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_VERSION: 12.9-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
      DOCKER_IMAGE: manylinuxaarch64-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -465,7 +465,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -481,7 +481,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_VERSION: 12.9-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
      DOCKER_IMAGE: manylinuxaarch64-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -566,7 +566,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_VERSION: 12.9-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
      DOCKER_IMAGE: manylinuxaarch64-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -576,7 +576,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -592,7 +592,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_VERSION: 12.9-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
      DOCKER_IMAGE: manylinuxaarch64-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -677,7 +677,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_VERSION: 12.9-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
      DOCKER_IMAGE: manylinuxaarch64-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -687,7 +687,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -703,7 +703,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_VERSION: 12.9-aarch64
      GPU_ARCH_TYPE: cuda-aarch64
      DOCKER_IMAGE: manylinuxaarch64-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -712,225 +712,3 @@ jobs:
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml
-
-  manywheel-py3_14-cpu-aarch64-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: manylinux2_28_aarch64-builder
-      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
-      DESIRED_PYTHON: "3.14"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
-      ALPINE_IMAGE: "arm64v8/alpine"
-      build_name: manywheel-py3_14-cpu-aarch64
-      build_environment: linux-aarch64-binary-manywheel
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_14-cpu-aarch64-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - manywheel-py3_14-cpu-aarch64-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: manylinux2_28_aarch64-builder
-      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
-      DESIRED_PYTHON: "3.14"
-      build_name: manywheel-py3_14-cpu-aarch64
-      build_environment: linux-aarch64-binary-manywheel
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.2xlarge
-      ALPINE_IMAGE: "arm64v8/alpine"
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_14-cpu-aarch64-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_14-cpu-aarch64-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: manylinux2_28_aarch64-builder
-      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
-      DESIRED_PYTHON: "3.14"
-      build_name: manywheel-py3_14-cpu-aarch64
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
-  manywheel-py3_14-cuda-aarch64-12_9-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: manylinuxaarch64-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.14"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
-      ALPINE_IMAGE: "arm64v8/alpine"
-      build_name: manywheel-py3_14-cuda-aarch64-12_9
-      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
-      timeout-minutes: 420
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_14-cuda-aarch64-12_9-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_14-cuda-aarch64-12_9-build
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: manylinuxaarch64-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.14"
-      build_name: manywheel-py3_14-cuda-aarch64-12_9
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
-  manywheel-py3_14t-cpu-aarch64-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: manylinux2_28_aarch64-builder
-      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
-      DESIRED_PYTHON: "3.14t"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
-      ALPINE_IMAGE: "arm64v8/alpine"
-      build_name: manywheel-py3_14t-cpu-aarch64
-      build_environment: linux-aarch64-binary-manywheel
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_14t-cpu-aarch64-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - manywheel-py3_14t-cpu-aarch64-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: manylinux2_28_aarch64-builder
-      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
-      DESIRED_PYTHON: "3.14t"
-      build_name: manywheel-py3_14t-cpu-aarch64
-      build_environment: linux-aarch64-binary-manywheel
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.2xlarge
-      ALPINE_IMAGE: "arm64v8/alpine"
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_14t-cpu-aarch64-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_14t-cpu-aarch64-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu-aarch64
-      DOCKER_IMAGE: manylinux2_28_aarch64-builder
-      DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
-      DESIRED_PYTHON: "3.14t"
-      build_name: manywheel-py3_14t-cpu-aarch64
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
-  manywheel-py3_14t-cuda-aarch64-12_9-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: manylinuxaarch64-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.14t"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.arm64.m7g.4xlarge.ephemeral
-      ALPINE_IMAGE: "arm64v8/alpine"
-      build_name: manywheel-py3_14t-cuda-aarch64-12_9
-      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
-      timeout-minutes: 420
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_14t-cuda-aarch64-12_9-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_14t-cuda-aarch64-12_9-build
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9-aarch64"
-      GPU_ARCH_TYPE: cuda-aarch64
-      DOCKER_IMAGE: manylinuxaarch64-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
-      DESIRED_PYTHON: "3.14t"
-      build_name: manywheel-py3_14t-cuda-aarch64-12_9
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
--- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
@ -122,7 +122,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: "12.6"
+      GPU_ARCH_VERSION: 12.6
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: libtorch-cxx11-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
@ -145,7 +145,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: "12.6"
+      GPU_ARCH_VERSION: 12.6
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: libtorch-cxx11-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
@ -154,7 +154,7 @@ jobs:
      build_name: libtorch-cuda12_6-shared-with-deps-release
      build_environment: linux-binary-libtorch
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu  # 12.6 build can use maxwell (sm_50) runner
+      runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  libtorch-cuda12_6-shared-with-deps-release-upload:  # Uploading
@ -169,7 +169,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: "12.6"
+      GPU_ARCH_VERSION: 12.6
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: libtorch-cxx11-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
@ -190,7 +190,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu128
-      GPU_ARCH_VERSION: "12.8"
+      GPU_ARCH_VERSION: 12.8
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: libtorch-cxx11-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
@ -213,7 +213,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu128
-      GPU_ARCH_VERSION: "12.8"
+      GPU_ARCH_VERSION: 12.8
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: libtorch-cxx11-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
@ -222,7 +222,7 @@ jobs:
      build_name: libtorch-cuda12_8-shared-with-deps-release
      build_environment: linux-binary-libtorch
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu  # 12.8 and 12.9 build need sm_70+ runner
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  libtorch-cuda12_8-shared-with-deps-release-upload:  # Uploading
@ -237,7 +237,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu128
-      GPU_ARCH_VERSION: "12.8"
+      GPU_ARCH_VERSION: 12.8
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: libtorch-cxx11-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
@ -258,7 +258,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_VERSION: 12.9
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: libtorch-cxx11-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -281,7 +281,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_VERSION: 12.9
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: libtorch-cxx11-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -290,7 +290,7 @@ jobs:
      build_name: libtorch-cuda12_9-shared-with-deps-release
      build_environment: linux-binary-libtorch
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu  # 12.8 and 12.9 build need sm_70+ runner
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  libtorch-cuda12_9-shared-with-deps-release-upload:  # Uploading
@ -305,7 +305,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_VERSION: 12.9
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: libtorch-cxx11-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
@ -326,7 +326,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: rocm6.3
-      GPU_ARCH_VERSION: "6.3"
+      GPU_ARCH_VERSION: 6.3
      GPU_ARCH_TYPE: rocm
      DOCKER_IMAGE: libtorch-cxx11-builder
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
@ -350,7 +350,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: rocm6.3
-      GPU_ARCH_VERSION: "6.3"
+      GPU_ARCH_VERSION: 6.3
      GPU_ARCH_TYPE: rocm
      SKIP_ALL_TESTS: 1
      DOCKER_IMAGE: libtorch-cxx11-builder
@ -419,7 +419,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: rocm6.3
-      GPU_ARCH_VERSION: "6.3"
+      GPU_ARCH_VERSION: 6.3
      GPU_ARCH_TYPE: rocm
      DOCKER_IMAGE: libtorch-cxx11-builder
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
@ -440,7 +440,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: rocm6.4
-      GPU_ARCH_VERSION: "6.4"
+      GPU_ARCH_VERSION: 6.4
      GPU_ARCH_TYPE: rocm
      DOCKER_IMAGE: libtorch-cxx11-builder
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
@ -464,7 +464,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: rocm6.4
-      GPU_ARCH_VERSION: "6.4"
+      GPU_ARCH_VERSION: 6.4
      GPU_ARCH_TYPE: rocm
      SKIP_ALL_TESTS: 1
      DOCKER_IMAGE: libtorch-cxx11-builder
@ -533,7 +533,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: rocm6.4
-      GPU_ARCH_VERSION: "6.4"
+      GPU_ARCH_VERSION: 6.4
      GPU_ARCH_TYPE: rocm
      DOCKER_IMAGE: libtorch-cxx11-builder
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
--- a/.github/workflows/generated-linux-binary-manywheel-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-main.yml
@ -52,7 +52,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu128
-      GPU_ARCH_VERSION: "12.8"
+      GPU_ARCH_VERSION: 12.8
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: manylinux2_28-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
@ -60,7 +60,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_8-test:  # Testing
@ -75,7 +75,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu128
-      GPU_ARCH_VERSION: "12.8"
+      GPU_ARCH_VERSION: 12.8
      GPU_ARCH_TYPE: cuda
      DOCKER_IMAGE: manylinux2_28-builder
      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
@ -83,6 +83,6 @@ jobs:
      build_name: manywheel-py3_12-cuda12_8
      build_environment: linux-binary-manywheel
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu  # 12.8 and 12.9 build need sm_70+ runner
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
--- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
@ -54,7 +54,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: rocm6.4
-      GPU_ARCH_VERSION: "6.4"
+      GPU_ARCH_VERSION: 6.4
      GPU_ARCH_TYPE: rocm
      DOCKER_IMAGE: manylinux2_28-builder
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
@ -77,7 +77,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: rocm6.4
-      GPU_ARCH_VERSION: "6.4"
+      GPU_ARCH_VERSION: 6.4
      GPU_ARCH_TYPE: rocm
      SKIP_ALL_TESTS: 1
      DOCKER_IMAGE: manylinux2_28-builder
--- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
@ -115,33 +115,12 @@ jobs:
          # Create new "clean" conda environment for testing

          SMOKE_TEST_PARAMS=""
-
-          EXTRA_CONDA_INSTALL_FLAGS=""
-          CONDA_ENV_CREATE_FLAGS=""
-          # shellcheck disable=SC2153
-          case $DESIRED_PYTHON in
-            3.14t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.14)
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.13t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
-              desired_python="3.13"
-              ;;
-            *)
-              # shellcheck disable=SC2153
-              desired_python=${DESIRED_PYTHON}
-              ;;
-          esac
-
-          # shellcheck disable=SC2086
-          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
+          if [[ $DESIRED_PYTHON == "3.13t" ]]; then
+            conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
+            SMOKE_TEST_PARAMS="--torch-compile-check disabled"
+          else
+            conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
+          fi
          conda activate test_conda_env
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

@ -260,33 +239,12 @@ jobs:
          # Create new "clean" conda environment for testing

          SMOKE_TEST_PARAMS=""
-
-          EXTRA_CONDA_INSTALL_FLAGS=""
-          CONDA_ENV_CREATE_FLAGS=""
-          # shellcheck disable=SC2153
-          case $DESIRED_PYTHON in
-            3.14t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.14)
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.13t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
-              desired_python="3.13"
-              ;;
-            *)
-              # shellcheck disable=SC2153
-              desired_python=${DESIRED_PYTHON}
-              ;;
-          esac
-
-          # shellcheck disable=SC2086
-          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
+          if [[ $DESIRED_PYTHON == "3.13t" ]]; then
+            conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
+            SMOKE_TEST_PARAMS="--torch-compile-check disabled"
+          else
+            conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
+          fi
          conda activate test_conda_env
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

@ -405,33 +363,12 @@ jobs:
          # Create new "clean" conda environment for testing

          SMOKE_TEST_PARAMS=""
-
-          EXTRA_CONDA_INSTALL_FLAGS=""
-          CONDA_ENV_CREATE_FLAGS=""
-          # shellcheck disable=SC2153
-          case $DESIRED_PYTHON in
-            3.14t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.14)
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.13t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
-              desired_python="3.13"
-              ;;
-            *)
-              # shellcheck disable=SC2153
-              desired_python=${DESIRED_PYTHON}
-              ;;
-          esac
-
-          # shellcheck disable=SC2086
-          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
+          if [[ $DESIRED_PYTHON == "3.13t" ]]; then
+            conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
+            SMOKE_TEST_PARAMS="--torch-compile-check disabled"
+          else
+            conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
+          fi
          conda activate test_conda_env
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

@ -550,33 +487,12 @@ jobs:
          # Create new "clean" conda environment for testing

          SMOKE_TEST_PARAMS=""
-
-          EXTRA_CONDA_INSTALL_FLAGS=""
-          CONDA_ENV_CREATE_FLAGS=""
-          # shellcheck disable=SC2153
-          case $DESIRED_PYTHON in
-            3.14t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.14)
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.13t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
-              desired_python="3.13"
-              ;;
-            *)
-              # shellcheck disable=SC2153
-              desired_python=${DESIRED_PYTHON}
-              ;;
-          esac
-
-          # shellcheck disable=SC2086
-          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
+          if [[ $DESIRED_PYTHON == "3.13t" ]]; then
+            conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
+            SMOKE_TEST_PARAMS="--torch-compile-check disabled"
+          else
+            conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
+          fi
          conda activate test_conda_env
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

@ -695,33 +611,12 @@ jobs:
          # Create new "clean" conda environment for testing

          SMOKE_TEST_PARAMS=""
-
-          EXTRA_CONDA_INSTALL_FLAGS=""
-          CONDA_ENV_CREATE_FLAGS=""
-          # shellcheck disable=SC2153
-          case $DESIRED_PYTHON in
-            3.14t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.14)
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.13t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
-              desired_python="3.13"
-              ;;
-            *)
-              # shellcheck disable=SC2153
-              desired_python=${DESIRED_PYTHON}
-              ;;
-          esac
-
-          # shellcheck disable=SC2086
-          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
+          if [[ $DESIRED_PYTHON == "3.13t" ]]; then
+            conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
+            SMOKE_TEST_PARAMS="--torch-compile-check disabled"
+          else
+            conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
+          fi
          conda activate test_conda_env
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

@ -840,33 +735,12 @@ jobs:
          # Create new "clean" conda environment for testing

          SMOKE_TEST_PARAMS=""
-
-          EXTRA_CONDA_INSTALL_FLAGS=""
-          CONDA_ENV_CREATE_FLAGS=""
-          # shellcheck disable=SC2153
-          case $DESIRED_PYTHON in
-            3.14t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.14)
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.13t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
-              desired_python="3.13"
-              ;;
-            *)
-              # shellcheck disable=SC2153
-              desired_python=${DESIRED_PYTHON}
-              ;;
-          esac
-
-          # shellcheck disable=SC2086
-          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
+          if [[ $DESIRED_PYTHON == "3.13t" ]]; then
+            conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
+            SMOKE_TEST_PARAMS="--torch-compile-check disabled"
+          else
+            conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
+          fi
          conda activate test_conda_env
          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

@ -900,293 +774,3 @@ jobs:
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_14-cpu-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-14-xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.14"
-    steps:
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          # shellcheck disable=SC2129
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          # shellcheck disable=SC2129
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          # shellcheck disable=SC2129
-          echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
-      - name: Install conda and dependencies
-        run: |
-          # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
-          chmod +x "${RUNNER_TEMP}/conda.sh"
-          /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
-          echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
-          if [ -d "/Applications/Xcode_14.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          fi
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Populate binary env
-        run: |
-          # shellcheck disable=SC1091
-          source "${RUNNER_TEMP}/anaconda/bin/activate"
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        run: |
-          # shellcheck disable=SC1091
-          source "${RUNNER_TEMP}/anaconda/bin/activate"
-          set -eux -o pipefail
-          # shellcheck disable=SC1090
-          source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
-          mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
-
-          # Build
-          USE_PYTORCH_METAL_EXPORT=1
-          USE_COREML_DELEGATE=1
-          TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}"
-          export USE_PYTORCH_METAL_EXPORT
-          export USE_COREML_DELEGATE
-          export TORCH_PACKAGE_NAME
-          "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
-      - name: Test PyTorch wheel
-        run: |
-          # shellcheck disable=SC1091
-          source "${RUNNER_TEMP}/anaconda/bin/activate"
-          set -eux -o pipefail
-          # shellcheck disable=SC1090
-          source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
-          pip uninstall -y "$TORCH_PACKAGE_NAME" || true
-          pip uninstall -y "$TORCH_PACKAGE_NAME" || true
-
-          # Create new "clean" conda environment for testing
-
-          SMOKE_TEST_PARAMS=""
-
-          EXTRA_CONDA_INSTALL_FLAGS=""
-          CONDA_ENV_CREATE_FLAGS=""
-          # shellcheck disable=SC2153
-          case $DESIRED_PYTHON in
-            3.14t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.14)
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.13t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
-              desired_python="3.13"
-              ;;
-            *)
-              # shellcheck disable=SC2153
-              desired_python=${DESIRED_PYTHON}
-              ;;
-          esac
-
-          # shellcheck disable=SC2086
-          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
-          conda activate test_conda_env
-          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
-
-          # shellcheck disable=SC2086
-          python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS}
-      - uses: actions/upload-artifact@v4.4.0
-        if: always()
-        with:
-          name: wheel-py3_14-cpu
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-  wheel-py3_14-cpu-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: wheel-py3_14-cpu-build
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cpu
-      DESIRED_PYTHON: "3.14"
-      build_name: wheel-py3_14-cpu
-      use_s3: False
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-  wheel-py3_14t-cpu-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    runs-on: macos-14-xlarge
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      SKIP_ALL_TESTS: 1
-      DESIRED_PYTHON: "3.14t"
-    steps:
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          # shellcheck disable=SC2129
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          # shellcheck disable=SC2129
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          # shellcheck disable=SC2129
-          echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
-      - name: Install conda and dependencies
-        run: |
-          # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on
-          curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
-          chmod +x "${RUNNER_TEMP}/conda.sh"
-          /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
-          echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
-          if [ -d "/Applications/Xcode_14.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
-            echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
-          fi
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: Populate binary env
-        run: |
-          # shellcheck disable=SC1091
-          source "${RUNNER_TEMP}/anaconda/bin/activate"
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        run: |
-          # shellcheck disable=SC1091
-          source "${RUNNER_TEMP}/anaconda/bin/activate"
-          set -eux -o pipefail
-          # shellcheck disable=SC1090
-          source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
-          mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
-
-          # Build
-          USE_PYTORCH_METAL_EXPORT=1
-          USE_COREML_DELEGATE=1
-          TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}"
-          export USE_PYTORCH_METAL_EXPORT
-          export USE_COREML_DELEGATE
-          export TORCH_PACKAGE_NAME
-          "${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
-      - name: Test PyTorch wheel
-        run: |
-          # shellcheck disable=SC1091
-          source "${RUNNER_TEMP}/anaconda/bin/activate"
-          set -eux -o pipefail
-          # shellcheck disable=SC1090
-          source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
-          pip uninstall -y "$TORCH_PACKAGE_NAME" || true
-          pip uninstall -y "$TORCH_PACKAGE_NAME" || true
-
-          # Create new "clean" conda environment for testing
-
-          SMOKE_TEST_PARAMS=""
-
-          EXTRA_CONDA_INSTALL_FLAGS=""
-          CONDA_ENV_CREATE_FLAGS=""
-          # shellcheck disable=SC2153
-          case $DESIRED_PYTHON in
-            3.14t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.14)
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
-              desired_python="3.14.0rc1"
-              ;;
-            3.13t)
-              CONDA_ENV_CREATE_FLAGS="python-freethreading"
-              EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
-              desired_python="3.13"
-              ;;
-            *)
-              # shellcheck disable=SC2153
-              desired_python=${DESIRED_PYTHON}
-              ;;
-          esac
-
-          # shellcheck disable=SC2086
-          conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
-          conda activate test_conda_env
-          pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
-
-          # shellcheck disable=SC2086
-          python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS}
-      - uses: actions/upload-artifact@v4.4.0
-        if: always()
-        with:
-          name: wheel-py3_14t-cpu
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-  wheel-py3_14t-cpu-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: wheel-py3_14t-cpu-build
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: wheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cpu
-      GPU_ARCH_TYPE: cpu
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cpu
-      DESIRED_PYTHON: "3.14t"
-      build_name: wheel-py3_14t-cpu
-      use_s3: False
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
--- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
@ -299,7 +299,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: "12.6"
+      GPU_ARCH_VERSION: 12.6
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: debug
@ -415,7 +415,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: "12.6"
+      GPU_ARCH_VERSION: 12.6
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: debug
@ -527,7 +527,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: "12.6"
+      GPU_ARCH_VERSION: 12.6
      GPU_ARCH_TYPE: cuda
      LIBTORCH_CONFIG: debug
      LIBTORCH_VARIANT: shared-with-deps
@ -549,7 +549,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu128
-      GPU_ARCH_VERSION: "12.8"
+      GPU_ARCH_VERSION: 12.8
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: debug
@ -665,7 +665,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu128
-      GPU_ARCH_VERSION: "12.8"
+      GPU_ARCH_VERSION: 12.8
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: debug
@ -777,7 +777,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu128
-      GPU_ARCH_VERSION: "12.8"
+      GPU_ARCH_VERSION: 12.8
      GPU_ARCH_TYPE: cuda
      LIBTORCH_CONFIG: debug
      LIBTORCH_VARIANT: shared-with-deps
@ -799,7 +799,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_VERSION: 12.9
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: debug
@ -915,7 +915,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_VERSION: 12.9
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: debug
@ -1027,7 +1027,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_VERSION: 12.9
      GPU_ARCH_TYPE: cuda
      LIBTORCH_CONFIG: debug
      LIBTORCH_VARIANT: shared-with-deps
--- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml
@ -299,7 +299,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: "12.6"
+      GPU_ARCH_VERSION: 12.6
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: release
@ -415,7 +415,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: "12.6"
+      GPU_ARCH_VERSION: 12.6
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: release
@ -527,7 +527,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: "12.6"
+      GPU_ARCH_VERSION: 12.6
      GPU_ARCH_TYPE: cuda
      LIBTORCH_CONFIG: release
      LIBTORCH_VARIANT: shared-with-deps
@ -549,7 +549,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu128
-      GPU_ARCH_VERSION: "12.8"
+      GPU_ARCH_VERSION: 12.8
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: release
@ -665,7 +665,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu128
-      GPU_ARCH_VERSION: "12.8"
+      GPU_ARCH_VERSION: 12.8
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: release
@ -777,7 +777,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu128
-      GPU_ARCH_VERSION: "12.8"
+      GPU_ARCH_VERSION: 12.8
      GPU_ARCH_TYPE: cuda
      LIBTORCH_CONFIG: release
      LIBTORCH_VARIANT: shared-with-deps
@ -799,7 +799,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_VERSION: 12.9
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: release
@ -915,7 +915,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_VERSION: 12.9
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      LIBTORCH_CONFIG: release
@ -1027,7 +1027,7 @@ jobs:
      # TODO: This is a legacy variable that we eventually want to get rid of in
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cu129
-      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_VERSION: 12.9
      GPU_ARCH_TYPE: cuda
      LIBTORCH_CONFIG: release
      LIBTORCH_VARIANT: shared-with-deps
--- a/.github/workflows/generated-windows-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml
--- a/.github/workflows/h100-cutlass-backend.yml
+++ b/.github/workflows/h100-cutlass-backend.yml
@ -4,12 +4,9 @@ on:
  pull_request:
    paths:
      - .github/workflows/h100-cutlass-backend.yml
-      - torch/_inductor/codegen/cuda/**
-      - test/inductor/test_cutlass_backend.py
-      - test/inductor/test_cutlass_evt.py
  workflow_dispatch:
  schedule:
-    - cron: 22 9,21 * * *  # every 12 hours
+    - cron: 22 9 * * *  # every 24 hours about 2:22am PDT
  push:
    tags:
      - ciflow/h100-cutlass-backend/*
--- a/.github/workflows/inductor-perf-test-nightly-h100.yml
+++ b/.github/workflows/inductor-perf-test-nightly-h100.yml
@ -58,14 +58,9 @@ on:
        required: false
        type: string
        default: inductor_huggingface_perf_cuda_h100,inductor_timm_perf_cuda_h100,inductor_torchbench_perf_cuda_h100
-  pull_request:
-    # Changing these files guarantees that this workflow needs to be run
-    paths:
-      - .github/workflows/inductor-perf-test-nightly-h100.yml
-      - .ci/docker/ci_commit_pins/huggingface-requirements.txt

 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

 permissions:
@ -165,9 +160,10 @@ jobs:
    name: cuda12.8-py3.10-gcc9-sm90
    uses: ./.github/workflows/_linux-test.yml
    needs: build
+    if: github.event_name == 'workflow_dispatch'
    with:
      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm90
-      dashboard-tag: training-${{ inputs.training || 'true' }}-inference-${{ inputs.inference || 'true' }}-default-${{ inputs.default || 'true' }}-dynamic-${{ inputs.dynamic || 'true' }}-cudagraphs-${{ inputs.cudagraphs || 'true' }}-cppwrapper-${{ inputs.cppwrapper || 'false' }}-aotinductor-${{ inputs.aotinductor || 'false' }}-maxautotune-${{ inputs.maxautotune || 'false' }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs || 'false' }}-cudagraphs_low_precision-${{ inputs.cudagraphs || 'false' }}
+      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
      docker-image: ${{ needs.build.outputs.docker-image }}
      test-matrix: ${{ needs.build.outputs.test-matrix }}
      timeout-minutes: 720
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@ -93,7 +93,7 @@ jobs:
      script: |
        CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
        echo "Running mypy"
-        ADDITIONAL_LINTRUNNER_ARGS="--take MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
+        ADDITIONAL_LINTRUNNER_ARGS="--take MYPY --all-files" .github/scripts/lintrunner.sh

  lintrunner-noclang:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -111,9 +111,9 @@ jobs:
        CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
        echo "Running all other linters"
        if [ "$CHANGED_FILES" = '*' ]; then
-          ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
+          ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY --all-files" .github/scripts/lintrunner.sh
        else
-          ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh
+          ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY ${CHANGED_FILES}" .github/scripts/lintrunner.sh
        fi

  quick-checks:
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -156,13 +156,13 @@ jobs:
      sync-tag: asan-test
    secrets: inherit

-  linux-jammy-py3_10-clang12-onnx-build:
-    name: linux-jammy-py3.10-clang12-onnx
+  linux-jammy-py3_9-clang12-onnx-build:
+    name: linux-jammy-py3.9-clang12-onnx
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3.10-clang12-onnx
+      build-environment: linux-jammy-py3.9-clang12-onnx
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx
      test-matrix: |
        { include: [
@ -171,16 +171,16 @@ jobs:
        ]}
    secrets: inherit

-  linux-jammy-py3_10-clang12-onnx-test:
-    name: linux-jammy-py3.10-clang12-onnx
+  linux-jammy-py3_9-clang12-onnx-test:
+    name: linux-jammy-py3.9-clang12-onnx
    uses: ./.github/workflows/_linux-test.yml
    needs:
-      - linux-jammy-py3_10-clang12-onnx-build
+      - linux-jammy-py3_9-clang12-onnx-build
      - target-determination
    with:
-      build-environment: linux-jammy-py3.10-clang12-onnx
-      docker-image: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }}
+      build-environment: linux-jammy-py3.9-clang12-onnx
+      docker-image: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.test-matrix }}
    secrets: inherit

  linux-jammy-py3_9-clang12-build:
--- a/.github/workflows/test-h100.yml
+++ b/.github/workflows/test-h100.yml
@ -4,10 +4,6 @@ on:
  pull_request:
    paths:
      - .github/workflows/test-h100.yml
-      - test/inductor/test_max_autotune.py
-      - torch/_inductor/kernel/mm.py
-      - torch/_inductor/kernel/mm_grouped.py
-
  workflow_dispatch:
  schedule:
    - cron: 0 4,10,16,22 * * *  # every 6 hours
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@ -201,9 +201,9 @@ jobs:
      sync-tag: rocm-build
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
-          { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.4" },
+          { config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" },
+          { config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" },
+          { config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.4" },
        ]}
    secrets: inherit

--- a/.github/workflows/vllm.yml
+++ b/.github/workflows/vllm.yml
@ -1,70 +0,0 @@
-name: vllm-test
-
-on:
-  push:
-    tags:
-      - ciflow/vllm/*
-  workflow_dispatch:
-  schedule:
-    # Every 12 hours starting at 00:00 UTC (00:00 and 12:00)
-    - cron: '0 0,12 * * *'
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
-
-permissions:
-  id-token: write
-  contents: read
-
-jobs:
-  get-label-type:
-    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-      opt_out_experiments: lf
-
-  torch-build:
-    name: ci-vllm-test
-    uses: ./.github/workflows/_linux-build.yml
-    needs: get-label-type
-    with:
-      build-additional-packages: "vision audio"
-      build-external-packages: "vllm"
-      build-environment: linux-jammy-cuda12.8-py3.12-gcc11
-      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
-      cuda-arch-list: '8.0;8.9;9.0'
-      runner: linux.24xlarge.memory
-      test-matrix: |
-        { include: [
-          { config:  "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "vllm_entrypoints_test", shard: 1, num_shards: 1,runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "vllm_regression_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "vllm_lora_280_failure_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "vllm_multi_model_processor_test", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "vllm_pytorch_compilation_unit_tests", shard: 1, num_shards: 1, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "vllm_lora_test", shard: 0, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "vllm_lora_test", shard: 1, num_shards: 4, runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "vllm_lora_test", shard: 2, num_shards: 4,  runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "vllm_lora_test", shard: 3, num_shards: 4,  runner: "linux.g6.4xlarge.experimental.nvidia.gpu" },
-          { config: "vllm_lora_tp_test_distributed", shard: 1, num_shards: 1, runner: "linux.aws.h100.4"},
-        ]}
-    secrets: inherit
-
-  vllm-test-sm89:
-      name: ci-vllm-test
-      uses: ./.github/workflows/_linux-test.yml
-      needs: [
-        torch-build,
-      ]
-      with:
-        build-environment: linux-jammy-cuda12.8-py3.12-gcc11
-        docker-image: ${{ needs.torch-build.outputs.docker-image }}
-        test-matrix: ${{ needs.torch-build.outputs.test-matrix }}
-      secrets: inherit
--- a/.gitignore
+++ b/.gitignore
@ -32,7 +32,6 @@ coverage.xml
 aten/build/
 aten/src/ATen/Config.h
 aten/src/ATen/cuda/CUDAConfig.h
-aten/src/ATen/hip/HIPConfig.h
 benchmarks/.data
 caffe2/cpp_test/
 dist/
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -132,7 +132,7 @@ include_patterns = [
    'test/test_complex.py',
    'test/test_datapipe.py',
    'test/test_futures.py',
-    'test/test_numpy_interop.py',
+    # 'test/test_numpy_interop.py',
    'test/test_torch.py',
    'test/test_type_hints.py',
    'test/test_type_info.py',
@ -1454,7 +1454,7 @@ init_command = [
    '--dry-run={{DRYRUN}}',
    'usort==1.0.8.post1',
    'isort==6.0.1',
-    'ruff==0.12.9',  # sync with RUFF
+    'ruff==0.12.2',  # sync with RUFF
 ]
 is_formatter = true

@ -1589,7 +1589,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
-    'ruff==0.12.9',  # sync with PYFMT
+    'ruff==0.12.2',  # sync with PYFMT
 ]
 is_formatter = true

--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -279,7 +279,6 @@ header_template_rule(
        "@AT_BLAS_F2C@": "0",
        "@AT_BLAS_USE_CBLAS_DOT@": "1",
        "@AT_KLEIDIAI_ENABLED@": "0",
-        "@AT_USE_EIGEN_SPARSE@": "0",
    },
 )

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -289,7 +289,6 @@ option(USE_PRECOMPILED_HEADERS "Use pre-compiled headers to accelerate build."
 option(USE_PROF "Use profiling" OFF)
 option(USE_PYTORCH_QNNPACK "Use ATen/QNNPACK (quantized 8-bit operators)" ON)
 option(USE_SNPE "Use Qualcomm's SNPE library" OFF)
-option(USE_EIGEN_SPARSE "Use Eigen Sparse Matrices" OFF)
 option(USE_SYSTEM_EIGEN_INSTALL
    "Use system Eigen instead of the one under third_party" OFF)
 cmake_dependent_option(
--- a/README.md
+++ b/README.md
@ -242,6 +242,7 @@ git submodule update --init --recursive
 **Common**

 ```bash
+conda install cmake ninja
 # Run this command from the PyTorch directory after cloning the source code using the “Get the PyTorch Source“ section above
 pip install -r requirements.txt
 ```
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@ -96,8 +96,6 @@ file(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp")
 file(GLOB vulkan_cpp "vulkan/*.cpp")
 file(GLOB native_vulkan_cpp "native/vulkan/*.cpp" "native/vulkan/api/*.cpp" "native/vulkan/impl/*.cpp" "native/vulkan/ops/*.cpp")

-file(GLOB native_eigen_cpp "native/sparse/eigen/*.cpp")
-
 # Metal
 file(GLOB metal_h "metal/*.h")
 file(GLOB metal_cpp "metal/*.cpp")
@ -343,9 +341,6 @@ if(USE_VULKAN)
 else()
  set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp})
 endif()
-if(USE_EIGEN_SPARSE)
-  set(all_cpu_cpp ${all_cpu_cpp} ${native_eigen_cpp})
-endif()

 if(USE_MTIA)
    set(ATen_MTIA_SRCS ${ATen_MTIA_SRCS} ${mtia_cpp} ${mtia_h} ${native_mtia_cpp} ${native_mtia_h})
@ -475,6 +470,10 @@ if(USE_ROCM)
    exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}"
      ${native_hip_bgemm} ${native_hip_ck})
  endif()
+  if(WIN32) # Windows doesn't support Composable Kernels and Triton
+    exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}"
+      ${native_transformers_hip_hip} ${native_transformers_hip_cpp})
+  endif()

  # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources)
  list(APPEND all_hip_cpp
--- a/aten/src/ATen/Config.h.in
+++ b/aten/src/ATen/Config.h.in
@ -20,4 +20,3 @@
 #define AT_BLAS_F2C() @AT_BLAS_F2C@
 #define AT_BLAS_USE_CBLAS_DOT() @AT_BLAS_USE_CBLAS_DOT@
 #define AT_KLEIDIAI_ENABLED() @AT_KLEIDIAI_ENABLED@
-#define AT_USE_EIGEN_SPARSE() @AT_USE_EIGEN_SPARSE@
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@ -698,14 +698,6 @@ bool Context::hasLAPACK() {
 #endif
 }

-bool Context::hasEigenSparse() {
-#if AT_USE_EIGEN_SPARSE()
-  return true;
-#else
-  return false;
-#endif
-}
-
 at::QEngine Context::qEngine() const {
  static auto _quantized_engine = []() {
    at::QEngine qengine = at::kNoQEngine;
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@ -133,7 +133,6 @@ class TORCH_API Context {
  static bool hasLAPACK();
  static bool hasMKLDNN();
  static bool ckSupported();
-  static bool hasEigenSparse();
  static bool hasMAGMA() {
    return detail::getCUDAHooks().hasMAGMA();
  }
@ -616,10 +615,6 @@ inline bool hasLAPACK() {
  return globalContext().hasLAPACK();
 }

-inline bool hasEigenSparse() {
-  return globalContext().hasEigenSparse();
-}
-
 inline bool hasMAGMA() {
  return globalContext().hasMAGMA();
 }
--- a/aten/src/ATen/WrapDimUtils.h
+++ b/aten/src/ATen/WrapDimUtils.h
@ -121,7 +121,7 @@ inline int64_t legacy_cat_wrap_dim_symint(
    const std::vector<std::vector<c10::SymInt>>& tensor_sizes) {
  for (auto& sizes : tensor_sizes) {
    if (sizes.size() == 1) {
-      if (TORCH_GUARD_OR_FALSE(sizes[0].sym_eq(0))) {
+      if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[0].sym_eq(0))) {
        continue;
      }
    }
@ -135,7 +135,7 @@ inline int64_t legacy_cat_wrap_dim(
    const MaterializedITensorListRef& tensors) {
  for (const Tensor& tensor : tensors) {
    if (tensor.dim() == 1) {
-      if (TORCH_GUARD_OR_FALSE(tensor.sym_sizes()[0].sym_eq(0))) {
+      if (TORCH_GUARD_SIZE_OBLIVIOUS(tensor.sym_sizes()[0].sym_eq(0))) {
        continue;
      }
    }
--- a/Show More
+++ b/Show More