mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-12 14:54:55 +08:00
Compare commits
1 Commits
ciflow/vll
...
document-a
| Author | SHA1 | Date | |
|---|---|---|---|
| feace9648e |
@ -13,4 +13,3 @@ exclude:
|
|||||||
- "**/benchmarks/**"
|
- "**/benchmarks/**"
|
||||||
- "**/test_*.py"
|
- "**/test_*.py"
|
||||||
- "**/*_test.py"
|
- "**/*_test.py"
|
||||||
- "tools/**"
|
|
||||||
|
|||||||
@ -8,8 +8,6 @@ if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
|
|||||||
export TORCH_CUDA_ARCH_LIST="8.0;9.0"
|
export TORCH_CUDA_ARCH_LIST="8.0;9.0"
|
||||||
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
|
elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
|
||||||
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
|
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
|
||||||
elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
|
|
||||||
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
|
|
||||||
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
|
elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
|
||||||
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
|
export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
|
||||||
fi
|
fi
|
||||||
|
|||||||
@ -7,13 +7,13 @@ ENV LC_ALL en_US.UTF-8
|
|||||||
ENV LANG en_US.UTF-8
|
ENV LANG en_US.UTF-8
|
||||||
ENV LANGUAGE en_US.UTF-8
|
ENV LANGUAGE en_US.UTF-8
|
||||||
|
|
||||||
ARG DEVTOOLSET_VERSION=13
|
ARG DEVTOOLSET_VERSION=11
|
||||||
|
|
||||||
RUN yum -y update
|
RUN yum -y update
|
||||||
RUN yum -y install epel-release
|
RUN yum -y install epel-release
|
||||||
# install glibc-langpack-en make sure en_US.UTF-8 locale is available
|
# install glibc-langpack-en make sure en_US.UTF-8 locale is available
|
||||||
RUN yum -y install glibc-langpack-en
|
RUN yum -y install glibc-langpack-en
|
||||||
RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb
|
RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
|
||||||
# Just add everything as a safe.directory for git since these will be used in multiple places with git
|
# Just add everything as a safe.directory for git since these will be used in multiple places with git
|
||||||
RUN git config --global --add safe.directory '*'
|
RUN git config --global --add safe.directory '*'
|
||||||
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
||||||
@ -41,7 +41,6 @@ RUN bash ./install_conda.sh && rm install_conda.sh
|
|||||||
# Install CUDA
|
# Install CUDA
|
||||||
FROM base as cuda
|
FROM base as cuda
|
||||||
ARG CUDA_VERSION=12.6
|
ARG CUDA_VERSION=12.6
|
||||||
ARG DEVTOOLSET_VERSION=13
|
|
||||||
RUN rm -rf /usr/local/cuda-*
|
RUN rm -rf /usr/local/cuda-*
|
||||||
ADD ./common/install_cuda.sh install_cuda.sh
|
ADD ./common/install_cuda.sh install_cuda.sh
|
||||||
COPY ./common/install_nccl.sh install_nccl.sh
|
COPY ./common/install_nccl.sh install_nccl.sh
|
||||||
@ -51,8 +50,7 @@ ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
|
|||||||
# Preserve CUDA_VERSION for the builds
|
# Preserve CUDA_VERSION for the builds
|
||||||
ENV CUDA_VERSION=${CUDA_VERSION}
|
ENV CUDA_VERSION=${CUDA_VERSION}
|
||||||
# Make things in our path by default
|
# Make things in our path by default
|
||||||
ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH
|
||||||
|
|
||||||
|
|
||||||
FROM cuda as cuda12.6
|
FROM cuda as cuda12.6
|
||||||
RUN bash ./install_cuda.sh 12.6
|
RUN bash ./install_cuda.sh 12.6
|
||||||
@ -70,22 +68,8 @@ FROM cuda as cuda13.0
|
|||||||
RUN bash ./install_cuda.sh 13.0
|
RUN bash ./install_cuda.sh 13.0
|
||||||
ENV DESIRED_CUDA=13.0
|
ENV DESIRED_CUDA=13.0
|
||||||
|
|
||||||
FROM ${ROCM_IMAGE} as rocm_base
|
FROM ${ROCM_IMAGE} as rocm
|
||||||
ARG DEVTOOLSET_VERSION=13
|
|
||||||
ENV LC_ALL en_US.UTF-8
|
|
||||||
ENV LANG en_US.UTF-8
|
|
||||||
ENV LANGUAGE en_US.UTF-8
|
|
||||||
# Install devtoolset on ROCm base image
|
|
||||||
RUN yum -y update && \
|
|
||||||
yum -y install epel-release && \
|
|
||||||
yum -y install glibc-langpack-en && \
|
|
||||||
yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb
|
|
||||||
RUN git config --global --add safe.directory '*'
|
|
||||||
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
|
||||||
|
|
||||||
FROM rocm_base as rocm
|
|
||||||
ARG PYTORCH_ROCM_ARCH
|
ARG PYTORCH_ROCM_ARCH
|
||||||
ARG DEVTOOLSET_VERSION=13
|
|
||||||
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
|
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
|
||||||
ADD ./common/install_mkl.sh install_mkl.sh
|
ADD ./common/install_mkl.sh install_mkl.sh
|
||||||
RUN bash ./install_mkl.sh && rm install_mkl.sh
|
RUN bash ./install_mkl.sh && rm install_mkl.sh
|
||||||
@ -104,7 +88,6 @@ COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0
|
|||||||
|
|
||||||
# Final step
|
# Final step
|
||||||
FROM ${BASE_TARGET} as final
|
FROM ${BASE_TARGET} as final
|
||||||
ARG DEVTOOLSET_VERSION=13
|
|
||||||
COPY --from=openssl /opt/openssl /opt/openssl
|
COPY --from=openssl /opt/openssl /opt/openssl
|
||||||
COPY --from=patchelf /patchelf /usr/local/bin/patchelf
|
COPY --from=patchelf /patchelf /usr/local/bin/patchelf
|
||||||
COPY --from=conda /opt/conda /opt/conda
|
COPY --from=conda /opt/conda /opt/conda
|
||||||
|
|||||||
@ -36,7 +36,11 @@ case ${DOCKER_TAG_PREFIX} in
|
|||||||
;;
|
;;
|
||||||
rocm*)
|
rocm*)
|
||||||
BASE_TARGET=rocm
|
BASE_TARGET=rocm
|
||||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
|
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||||
|
# add gfx950 conditionally starting in ROCm 7.0
|
||||||
|
if [[ "$ROCM_VERSION" == *"7.0"* ]]; then
|
||||||
|
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
|
||||||
|
fi
|
||||||
EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
|
EXTRA_BUILD_ARGS="${EXTRA_BUILD_ARGS} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
@ -59,7 +63,7 @@ docker build \
|
|||||||
--target final \
|
--target final \
|
||||||
--progress plain \
|
--progress plain \
|
||||||
--build-arg "BASE_TARGET=${BASE_TARGET}" \
|
--build-arg "BASE_TARGET=${BASE_TARGET}" \
|
||||||
--build-arg "DEVTOOLSET_VERSION=13" \
|
--build-arg "DEVTOOLSET_VERSION=11" \
|
||||||
${EXTRA_BUILD_ARGS} \
|
${EXTRA_BUILD_ARGS} \
|
||||||
-t ${tmp_tag} \
|
-t ${tmp_tag} \
|
||||||
$@ \
|
$@ \
|
||||||
|
|||||||
@ -113,7 +113,6 @@ case "$tag" in
|
|||||||
UCX_COMMIT=${_UCX_COMMIT}
|
UCX_COMMIT=${_UCX_COMMIT}
|
||||||
UCC_COMMIT=${_UCC_COMMIT}
|
UCC_COMMIT=${_UCC_COMMIT}
|
||||||
TRITON=yes
|
TRITON=yes
|
||||||
INSTALL_MINGW=yes
|
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
|
pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
|
||||||
CUDA_VERSION=13.0.0
|
CUDA_VERSION=13.0.0
|
||||||
@ -168,18 +167,6 @@ case "$tag" in
|
|||||||
VISION=yes
|
VISION=yes
|
||||||
TRITON=yes
|
TRITON=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-py3.11-clang12)
|
|
||||||
ANACONDA_PYTHON_VERSION=3.11
|
|
||||||
CLANG_VERSION=12
|
|
||||||
VISION=no
|
|
||||||
TRITON=no
|
|
||||||
;;
|
|
||||||
pytorch-linux-jammy-py3.12-clang12)
|
|
||||||
ANACONDA_PYTHON_VERSION=3.12
|
|
||||||
CLANG_VERSION=12
|
|
||||||
VISION=no
|
|
||||||
TRITON=no
|
|
||||||
;;
|
|
||||||
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
|
pytorch-linux-jammy-rocm-n-py3 | pytorch-linux-jammy-rocm-n-py3-benchmarks | pytorch-linux-noble-rocm-n-py3)
|
||||||
if [[ $tag =~ "jammy" ]]; then
|
if [[ $tag =~ "jammy" ]]; then
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
ANACONDA_PYTHON_VERSION=3.10
|
||||||
@ -194,7 +181,7 @@ case "$tag" in
|
|||||||
KATEX=yes
|
KATEX=yes
|
||||||
UCX_COMMIT=${_UCX_COMMIT}
|
UCX_COMMIT=${_UCX_COMMIT}
|
||||||
UCC_COMMIT=${_UCC_COMMIT}
|
UCC_COMMIT=${_UCC_COMMIT}
|
||||||
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950;gfx1100"
|
PYTORCH_ROCM_ARCH="gfx90a;gfx942;gfx950"
|
||||||
if [[ $tag =~ "benchmarks" ]]; then
|
if [[ $tag =~ "benchmarks" ]]; then
|
||||||
INDUCTOR_BENCHMARKS=yes
|
INDUCTOR_BENCHMARKS=yes
|
||||||
fi
|
fi
|
||||||
@ -207,16 +194,13 @@ case "$tag" in
|
|||||||
NINJA_VERSION=1.9.0
|
NINJA_VERSION=1.9.0
|
||||||
TRITON=yes
|
TRITON=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-noble-xpu-n-py3 | pytorch-linux-noble-xpu-n-py3-inductor-benchmarks)
|
pytorch-linux-jammy-xpu-n-py3)
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
ANACONDA_PYTHON_VERSION=3.10
|
||||||
GCC_VERSION=13
|
GCC_VERSION=11
|
||||||
VISION=yes
|
VISION=yes
|
||||||
XPU_VERSION=2025.2
|
XPU_VERSION=2025.2
|
||||||
NINJA_VERSION=1.9.0
|
NINJA_VERSION=1.9.0
|
||||||
TRITON=yes
|
TRITON=yes
|
||||||
if [[ $tag =~ "benchmarks" ]]; then
|
|
||||||
INDUCTOR_BENCHMARKS=yes
|
|
||||||
fi
|
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
|
pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
ANACONDA_PYTHON_VERSION=3.10
|
||||||
@ -260,12 +244,6 @@ case "$tag" in
|
|||||||
HALIDE=yes
|
HALIDE=yes
|
||||||
TRITON=yes
|
TRITON=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-cuda12.8-py3.12-pallas)
|
|
||||||
CUDA_VERSION=12.8.1
|
|
||||||
ANACONDA_PYTHON_VERSION=3.12
|
|
||||||
GCC_VERSION=11
|
|
||||||
PALLAS=yes
|
|
||||||
;;
|
|
||||||
pytorch-linux-jammy-py3.12-triton-cpu)
|
pytorch-linux-jammy-py3.12-triton-cpu)
|
||||||
CUDA_VERSION=12.6
|
CUDA_VERSION=12.6
|
||||||
ANACONDA_PYTHON_VERSION=3.12
|
ANACONDA_PYTHON_VERSION=3.12
|
||||||
@ -279,9 +257,9 @@ case "$tag" in
|
|||||||
PYTHON_VERSION=3.10
|
PYTHON_VERSION=3.10
|
||||||
CUDA_VERSION=12.8.1
|
CUDA_VERSION=12.8.1
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-aarch64-py3.10-gcc13)
|
pytorch-linux-jammy-aarch64-py3.10-gcc11)
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
ANACONDA_PYTHON_VERSION=3.10
|
||||||
GCC_VERSION=13
|
GCC_VERSION=11
|
||||||
ACL=yes
|
ACL=yes
|
||||||
VISION=yes
|
VISION=yes
|
||||||
OPENBLAS=yes
|
OPENBLAS=yes
|
||||||
@ -289,19 +267,9 @@ case "$tag" in
|
|||||||
# from pytorch/llvm:9.0.1 is x86 specific
|
# from pytorch/llvm:9.0.1 is x86 specific
|
||||||
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-aarch64-py3.10-clang21)
|
pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks)
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
ANACONDA_PYTHON_VERSION=3.10
|
||||||
CLANG_VERSION=21
|
GCC_VERSION=11
|
||||||
ACL=yes
|
|
||||||
VISION=yes
|
|
||||||
OPENBLAS=yes
|
|
||||||
# snadampal: skipping llvm src build install because the current version
|
|
||||||
# from pytorch/llvm:9.0.1 is x86 specific
|
|
||||||
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
|
||||||
;;
|
|
||||||
pytorch-linux-jammy-aarch64-py3.10-gcc13-inductor-benchmarks)
|
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
|
||||||
GCC_VERSION=13
|
|
||||||
ACL=yes
|
ACL=yes
|
||||||
VISION=yes
|
VISION=yes
|
||||||
OPENBLAS=yes
|
OPENBLAS=yes
|
||||||
@ -376,7 +344,7 @@ docker build \
|
|||||||
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
|
--build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
|
||||||
--build-arg "KATEX=${KATEX:-}" \
|
--build-arg "KATEX=${KATEX:-}" \
|
||||||
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
|
--build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
|
||||||
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" \
|
--build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
|
||||||
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \
|
--build-arg "IMAGE_NAME=${IMAGE_NAME}" \
|
||||||
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
|
--build-arg "UCX_COMMIT=${UCX_COMMIT}" \
|
||||||
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
|
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
|
||||||
@ -387,14 +355,12 @@ docker build \
|
|||||||
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
|
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
|
||||||
--build-arg "EXECUTORCH=${EXECUTORCH}" \
|
--build-arg "EXECUTORCH=${EXECUTORCH}" \
|
||||||
--build-arg "HALIDE=${HALIDE}" \
|
--build-arg "HALIDE=${HALIDE}" \
|
||||||
--build-arg "PALLAS=${PALLAS}" \
|
|
||||||
--build-arg "XPU_VERSION=${XPU_VERSION}" \
|
--build-arg "XPU_VERSION=${XPU_VERSION}" \
|
||||||
--build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
|
--build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
|
||||||
--build-arg "ACL=${ACL:-}" \
|
--build-arg "ACL=${ACL:-}" \
|
||||||
--build-arg "OPENBLAS=${OPENBLAS:-}" \
|
--build-arg "OPENBLAS=${OPENBLAS:-}" \
|
||||||
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
|
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
|
||||||
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
|
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
|
||||||
--build-arg "INSTALL_MINGW=${INSTALL_MINGW:-}" \
|
|
||||||
-f $(dirname ${DOCKERFILE})/Dockerfile \
|
-f $(dirname ${DOCKERFILE})/Dockerfile \
|
||||||
-t "$tmp_tag" \
|
-t "$tmp_tag" \
|
||||||
"$@" \
|
"$@" \
|
||||||
|
|||||||
@ -1 +0,0 @@
|
|||||||
0.8.0
|
|
||||||
@ -1 +1 @@
|
|||||||
bfeb066872bc1e8b2d2bc0a3b295b99dd77206e7
|
27664085f804afc83df26f740bb46c365854f2c4
|
||||||
|
|||||||
@ -3,7 +3,7 @@
|
|||||||
|
|
||||||
set -eux
|
set -eux
|
||||||
|
|
||||||
ACL_VERSION=${ACL_VERSION:-"v52.6.0"}
|
ACL_VERSION=${ACL_VERSION:-"v25.02"}
|
||||||
ACL_INSTALL_DIR="/acl"
|
ACL_INSTALL_DIR="/acl"
|
||||||
|
|
||||||
# Clone ACL
|
# Clone ACL
|
||||||
|
|||||||
@ -8,8 +8,8 @@ if [ -n "$CLANG_VERSION" ]; then
|
|||||||
# work around ubuntu apt-get conflicts
|
# work around ubuntu apt-get conflicts
|
||||||
sudo apt-get -y -f install
|
sudo apt-get -y -f install
|
||||||
wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
|
wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
|
||||||
if [[ $CLANG_VERSION -ge 18 ]]; then
|
if [[ $CLANG_VERSION == 18 ]]; then
|
||||||
apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-${CLANG_VERSION} main"
|
apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main"
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|||||||
@ -49,20 +49,12 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
|
|||||||
export SYSROOT_DEP="sysroot_linux-64=2.17"
|
export SYSROOT_DEP="sysroot_linux-64=2.17"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Install correct Python version
|
|
||||||
# Also ensure sysroot is using a modern GLIBC to match system compilers
|
|
||||||
if [ "$ANACONDA_PYTHON_VERSION" = "3.14" ]; then
|
|
||||||
as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
|
|
||||||
python="3.14.0" \
|
|
||||||
${SYSROOT_DEP} \
|
|
||||||
-c conda-forge
|
|
||||||
else
|
|
||||||
# Install correct Python version
|
# Install correct Python version
|
||||||
# Also ensure sysroot is using a modern GLIBC to match system compilers
|
# Also ensure sysroot is using a modern GLIBC to match system compilers
|
||||||
as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
|
as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
|
||||||
python="$ANACONDA_PYTHON_VERSION" \
|
python="$ANACONDA_PYTHON_VERSION" \
|
||||||
${SYSROOT_DEP}
|
${SYSROOT_DEP}
|
||||||
fi
|
|
||||||
# libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30
|
# libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30
|
||||||
# which is provided in libstdcxx 12 and up.
|
# which is provided in libstdcxx 12 and up.
|
||||||
conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge
|
conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge
|
||||||
|
|||||||
@ -83,6 +83,10 @@ function build_cpython {
|
|||||||
py_suffix=${py_ver::-1}
|
py_suffix=${py_ver::-1}
|
||||||
py_folder=$py_suffix
|
py_folder=$py_suffix
|
||||||
fi
|
fi
|
||||||
|
# Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4
|
||||||
|
if [ "$py_suffix" == "3.14.0" ]; then
|
||||||
|
py_suffix="3.14.0rc2"
|
||||||
|
fi
|
||||||
wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz
|
wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz
|
||||||
do_cpython_build $py_ver Python-$py_suffix
|
do_cpython_build $py_ver Python-$py_suffix
|
||||||
|
|
||||||
|
|||||||
@ -10,7 +10,7 @@ else
|
|||||||
arch_path='sbsa'
|
arch_path='sbsa'
|
||||||
fi
|
fi
|
||||||
|
|
||||||
NVSHMEM_VERSION=3.4.5
|
NVSHMEM_VERSION=3.3.24
|
||||||
|
|
||||||
function install_cuda {
|
function install_cuda {
|
||||||
version=$1
|
version=$1
|
||||||
@ -150,7 +150,7 @@ function install_130 {
|
|||||||
CUDNN_VERSION=9.13.0.50
|
CUDNN_VERSION=9.13.0.50
|
||||||
echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
|
echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
|
||||||
# install CUDA 13.0 in the same container
|
# install CUDA 13.0 in the same container
|
||||||
install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux
|
install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
|
||||||
|
|
||||||
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
|
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
|
||||||
install_cudnn 13 $CUDNN_VERSION
|
install_cudnn 13 $CUDNN_VERSION
|
||||||
|
|||||||
@ -7,11 +7,11 @@ if [ -n "$GCC_VERSION" ]; then
|
|||||||
# Need the official toolchain repo to get alternate packages
|
# Need the official toolchain repo to get alternate packages
|
||||||
add-apt-repository ppa:ubuntu-toolchain-r/test
|
add-apt-repository ppa:ubuntu-toolchain-r/test
|
||||||
apt-get update
|
apt-get update
|
||||||
apt-get install -y g++-$GCC_VERSION gfortran-$GCC_VERSION
|
apt-get install -y g++-$GCC_VERSION
|
||||||
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50
|
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-"$GCC_VERSION" 50
|
||||||
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50
|
update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-"$GCC_VERSION" 50
|
||||||
update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50
|
update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-"$GCC_VERSION" 50
|
||||||
update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-"$GCC_VERSION" 50
|
|
||||||
|
|
||||||
# Cleanup package manager
|
# Cleanup package manager
|
||||||
apt-get autoclean && apt-get clean
|
apt-get autoclean && apt-get clean
|
||||||
|
|||||||
@ -1,40 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
|
|
||||||
|
|
||||||
# Get the pinned JAX version (same for all CUDA versions)
|
|
||||||
JAX_VERSION=$(get_pinned_commit /ci_commit_pins/jax)
|
|
||||||
|
|
||||||
function install_jax_12() {
|
|
||||||
echo "Installing JAX ${JAX_VERSION} with CUDA 12 support"
|
|
||||||
pip_install "jax[cuda12]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
|
|
||||||
|
|
||||||
# Verify installation
|
|
||||||
python -c "import jax" # check for errors
|
|
||||||
echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 12"
|
|
||||||
}
|
|
||||||
|
|
||||||
function install_jax_13() {
|
|
||||||
echo "Installing JAX ${JAX_VERSION} with CUDA 13 support"
|
|
||||||
pip_install "jax[cuda13]==${JAX_VERSION}" -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
|
|
||||||
|
|
||||||
# Verify installation
|
|
||||||
python -c "import jax" # check for errors
|
|
||||||
echo "JAX ${JAX_VERSION} installation completed successfully for CUDA 13"
|
|
||||||
}
|
|
||||||
|
|
||||||
# idiomatic parameter and option handling in sh
|
|
||||||
while test $# -gt 0
|
|
||||||
do
|
|
||||||
case "$1" in
|
|
||||||
12.4|12.6|12.6.*|12.8|12.8.*|12.9|12.9.*) install_jax_12;
|
|
||||||
;;
|
|
||||||
13.0|13.0.*) install_jax_13;
|
|
||||||
;;
|
|
||||||
*) echo "bad argument $1"; exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
shift
|
|
||||||
done
|
|
||||||
@ -1,56 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# install dependencies
|
|
||||||
dnf -y install gmp-devel libmpc-devel texinfo flex bison
|
|
||||||
|
|
||||||
cd /usr/local/src
|
|
||||||
# fetch source for gcc 13
|
|
||||||
git clone --depth 1 --single-branch -b releases/gcc-13.3.0 https://github.com/gcc-mirror/gcc.git gcc-13.3.0
|
|
||||||
|
|
||||||
mkdir -p gcc-13.3.0/build-gomp
|
|
||||||
cd gcc-13.3.0/build-gomp
|
|
||||||
|
|
||||||
# configure gcc build
|
|
||||||
# I got these flags by:
|
|
||||||
# 1. downloading the source rpm for gcc-11 on AlmaLinux 8 container
|
|
||||||
# dnf install -y dnf-plugins-core rpmdevtools
|
|
||||||
# dnf download --source libgomp
|
|
||||||
# 2. extracting the gcc.spec from the source.
|
|
||||||
# rpmdev-extract gcc-xx.src.rpm
|
|
||||||
# 3. extracting optflags and ld_flags from gcc.spec:
|
|
||||||
# rpm --eval '%{optflags}'
|
|
||||||
# rpm --eval '%{build_ldflags}'
|
|
||||||
#
|
|
||||||
# I had to remove the following flags because they didn't compile for this version of libgomp:
|
|
||||||
# -Werror=format-security
|
|
||||||
# -specs=/usr/lib/rpm/redhat/redhat-hardened-cc1
|
|
||||||
# -specs=/usr/lib/rpm/redhat/redhat-annobin-cc1
|
|
||||||
#
|
|
||||||
# I added -march=armv8-a -mtune=generic to make them explicit. I don't think they're strictly needed.
|
|
||||||
|
|
||||||
OPT_FLAGS='-O2 -march=armv8-a -mtune=generic'\
|
|
||||||
' -fexceptions -g -grecord-gcc-switches -pipe -Wall'\
|
|
||||||
' -Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS'\
|
|
||||||
' -fstack-protector-strong -fasynchronous-unwind-tables'\
|
|
||||||
' -fstack-clash-protection'
|
|
||||||
|
|
||||||
LDFLAGS='-Wl,-z,relro -Wl,--as-needed -Wl,-z,now'
|
|
||||||
|
|
||||||
CFLAGS="$OPT_FLAGS" \
|
|
||||||
CXXFLAGS="$OPT_FLAGS" \
|
|
||||||
LDFLAGS="$LDFLAGS" \
|
|
||||||
../configure \
|
|
||||||
--prefix=/usr \
|
|
||||||
--libdir=/usr/lib64 \
|
|
||||||
--enable-languages=c,c++ \
|
|
||||||
--disable-multilib \
|
|
||||||
--disable-bootstrap \
|
|
||||||
--enable-libgomp
|
|
||||||
|
|
||||||
# only build libgomp
|
|
||||||
make -j$(nproc) all-target-libgomp
|
|
||||||
|
|
||||||
make install-target-libgomp
|
|
||||||
@ -1,10 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Install MinGW-w64 for Windows cross-compilation
|
|
||||||
apt-get update
|
|
||||||
apt-get install -y g++-mingw-w64-x86-64-posix
|
|
||||||
|
|
||||||
echo "MinGW-w64 installed successfully"
|
|
||||||
x86_64-w64-mingw32-g++ --version
|
|
||||||
@ -19,8 +19,8 @@ pip_install \
|
|||||||
transformers==4.36.2
|
transformers==4.36.2
|
||||||
|
|
||||||
pip_install coloredlogs packaging
|
pip_install coloredlogs packaging
|
||||||
pip_install onnxruntime==1.23.1
|
pip_install onnxruntime==1.23.0
|
||||||
pip_install onnxscript==0.5.4
|
pip_install onnxscript==0.5.3
|
||||||
|
|
||||||
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
|
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
|
||||||
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
|
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
|
||||||
|
|||||||
@ -10,7 +10,6 @@ git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION}" -
|
|||||||
|
|
||||||
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
|
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
|
||||||
OPENBLAS_BUILD_FLAGS="
|
OPENBLAS_BUILD_FLAGS="
|
||||||
CC=gcc
|
|
||||||
NUM_THREADS=128
|
NUM_THREADS=128
|
||||||
USE_OPENMP=1
|
USE_OPENMP=1
|
||||||
NO_SHARED=0
|
NO_SHARED=0
|
||||||
|
|||||||
@ -40,7 +40,11 @@ EOF
|
|||||||
|
|
||||||
# Default url values
|
# Default url values
|
||||||
rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
|
rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
|
||||||
|
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
|
||||||
|
|
||||||
|
# Add amdgpu repository
|
||||||
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
|
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
|
||||||
|
echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
|
||||||
|
|
||||||
# Add rocm repository
|
# Add rocm repository
|
||||||
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
|
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
|
||||||
|
|||||||
@ -12,8 +12,8 @@ function do_install() {
|
|||||||
|
|
||||||
rocm_version_nodot=${rocm_version//./}
|
rocm_version_nodot=${rocm_version//./}
|
||||||
|
|
||||||
# post merge of https://github.com/icl-utk-edu/magma/pull/65
|
# https://github.com/icl-utk-edu/magma/pull/65
|
||||||
MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
|
MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
|
||||||
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
|
magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
|
||||||
|
|
||||||
rocm_dir="/opt/rocm"
|
rocm_dir="/opt/rocm"
|
||||||
|
|||||||
@ -9,7 +9,7 @@ set -xe
|
|||||||
|
|
||||||
function install_ubuntu() {
|
function install_ubuntu() {
|
||||||
. /etc/os-release
|
. /etc/os-release
|
||||||
if [[ ! " jammy noble " =~ " ${VERSION_CODENAME} " ]]; then
|
if [[ ! " jammy " =~ " ${VERSION_CODENAME} " ]]; then
|
||||||
echo "Ubuntu version ${VERSION_CODENAME} not supported"
|
echo "Ubuntu version ${VERSION_CODENAME} not supported"
|
||||||
exit
|
exit
|
||||||
fi
|
fi
|
||||||
@ -35,24 +35,25 @@ function install_ubuntu() {
|
|||||||
# The xpu-smi packages
|
# The xpu-smi packages
|
||||||
apt-get install -y flex bison xpu-smi
|
apt-get install -y flex bison xpu-smi
|
||||||
|
|
||||||
# Compute and Media Runtimes
|
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
|
||||||
if [[ " ${VERSION_CODENAME} " =~ " noble " ]]; then
|
# Compute and Media Runtimes
|
||||||
apt-get install -y \
|
apt-get install -y \
|
||||||
intel-opencl-icd libze-intel-gpu1 libze1 \
|
intel-opencl-icd intel-level-zero-gpu level-zero \
|
||||||
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
|
intel-media-va-driver-non-free libmfx1 libmfxgen1 libvpl2 \
|
||||||
libegl-mesa0 libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
|
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
|
||||||
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
|
libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
|
||||||
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
|
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
|
||||||
else # jammy
|
# Development Packages
|
||||||
|
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
|
||||||
|
else # rolling driver
|
||||||
apt-get install -y \
|
apt-get install -y \
|
||||||
intel-opencl-icd libze-intel-gpu1 libze1 \
|
intel-opencl-icd libze-intel-gpu1 libze1 \
|
||||||
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
|
intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
|
||||||
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
|
libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
|
||||||
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
|
libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
|
||||||
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
|
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc
|
||||||
|
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
|
||||||
fi
|
fi
|
||||||
# Development Packages
|
|
||||||
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev
|
|
||||||
|
|
||||||
# Install Intel Support Packages
|
# Install Intel Support Packages
|
||||||
apt-get install -y ${XPU_PACKAGES}
|
apt-get install -y ${XPU_PACKAGES}
|
||||||
@ -65,7 +66,7 @@ function install_ubuntu() {
|
|||||||
function install_rhel() {
|
function install_rhel() {
|
||||||
. /etc/os-release
|
. /etc/os-release
|
||||||
if [[ "${ID}" == "rhel" ]]; then
|
if [[ "${ID}" == "rhel" ]]; then
|
||||||
if [[ ! " 8.8 8.10 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
|
if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
|
||||||
echo "RHEL version ${VERSION_ID} not supported"
|
echo "RHEL version ${VERSION_ID} not supported"
|
||||||
exit
|
exit
|
||||||
fi
|
fi
|
||||||
@ -146,7 +147,7 @@ function install_sles() {
|
|||||||
XPU_DRIVER_VERSION=""
|
XPU_DRIVER_VERSION=""
|
||||||
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
|
if [[ "${XPU_DRIVER_TYPE,,}" == "lts" ]]; then
|
||||||
# Use GPU driver LTS releases
|
# Use GPU driver LTS releases
|
||||||
XPU_DRIVER_VERSION="/lts/2523"
|
XPU_DRIVER_VERSION="/lts/2350"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Default use Intel® oneAPI Deep Learning Essentials 2025.1
|
# Default use Intel® oneAPI Deep Learning Essentials 2025.1
|
||||||
|
|||||||
@ -39,17 +39,17 @@ case ${DOCKER_TAG_PREFIX} in
|
|||||||
DOCKER_GPU_BUILD_ARG=""
|
DOCKER_GPU_BUILD_ARG=""
|
||||||
;;
|
;;
|
||||||
rocm*)
|
rocm*)
|
||||||
# we want the patch version of 7.0 instead
|
|
||||||
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
|
|
||||||
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
|
|
||||||
fi
|
|
||||||
# we want the patch version of 6.4 instead
|
# we want the patch version of 6.4 instead
|
||||||
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
|
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
|
||||||
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4"
|
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
|
||||||
fi
|
fi
|
||||||
BASE_TARGET=rocm
|
BASE_TARGET=rocm
|
||||||
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
|
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
|
||||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
|
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||||
|
# add gfx950 conditionally starting in ROCm 7.0
|
||||||
|
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
|
||||||
|
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
|
||||||
|
fi
|
||||||
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
|
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
|
|||||||
@ -149,7 +149,7 @@ FROM cpu_final as rocm_final
|
|||||||
ARG ROCM_VERSION=6.0
|
ARG ROCM_VERSION=6.0
|
||||||
ARG PYTORCH_ROCM_ARCH
|
ARG PYTORCH_ROCM_ARCH
|
||||||
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
|
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
|
||||||
ARG DEVTOOLSET_VERSION=13
|
ARG DEVTOOLSET_VERSION=11
|
||||||
ENV LDFLAGS="-Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64 -Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib"
|
ENV LDFLAGS="-Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64 -Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib"
|
||||||
# Somewhere in ROCm stack, we still use non-existing /opt/rocm/hip path,
|
# Somewhere in ROCm stack, we still use non-existing /opt/rocm/hip path,
|
||||||
# below workaround helps avoid error
|
# below workaround helps avoid error
|
||||||
|
|||||||
@ -50,10 +50,6 @@ RUN rm install_ninja.sh
|
|||||||
ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
|
ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
|
||||||
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
|
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
|
||||||
|
|
||||||
# Build a newer version of libgomp than that supported in in Almalinux 8.
|
|
||||||
COPY ./common/install_libgomp.sh install_libgomp.sh
|
|
||||||
RUN bash ./install_libgomp.sh && rm install_libgomp.sh
|
|
||||||
|
|
||||||
# git236+ would refuse to run git commands in repos owned by other users
|
# git236+ would refuse to run git commands in repos owned by other users
|
||||||
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
|
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
|
||||||
# Override this behaviour by treating every folder as safe
|
# Override this behaviour by treating every folder as safe
|
||||||
|
|||||||
@ -115,9 +115,6 @@ RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
|
|||||||
# cmake-3.28.0 from pip for onnxruntime
|
# cmake-3.28.0 from pip for onnxruntime
|
||||||
RUN python3 -mpip install cmake==3.28.0
|
RUN python3 -mpip install cmake==3.28.0
|
||||||
|
|
||||||
ADD ./common/patch_libstdc.sh patch_libstdc.sh
|
|
||||||
RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
|
|
||||||
|
|
||||||
# build onnxruntime 1.21.0 from sources.
|
# build onnxruntime 1.21.0 from sources.
|
||||||
# it is not possible to build it from sources using pip,
|
# it is not possible to build it from sources using pip,
|
||||||
# so just build it from upstream repository.
|
# so just build it from upstream repository.
|
||||||
|
|||||||
@ -75,25 +75,25 @@ case ${image} in
|
|||||||
DOCKERFILE_SUFFIX="_cuda_aarch64"
|
DOCKERFILE_SUFFIX="_cuda_aarch64"
|
||||||
;;
|
;;
|
||||||
manylinux2_28-builder:rocm*)
|
manylinux2_28-builder:rocm*)
|
||||||
# we want the patch version of 7.0 instead
|
|
||||||
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
|
|
||||||
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
|
|
||||||
fi
|
|
||||||
# we want the patch version of 6.4 instead
|
# we want the patch version of 6.4 instead
|
||||||
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
|
if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
|
||||||
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4"
|
GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
|
||||||
fi
|
fi
|
||||||
TARGET=rocm_final
|
TARGET=rocm_final
|
||||||
MANY_LINUX_VERSION="2_28"
|
MANY_LINUX_VERSION="2_28"
|
||||||
DEVTOOLSET_VERSION="11"
|
DEVTOOLSET_VERSION="11"
|
||||||
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
|
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
|
||||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201;gfx950;gfx1150;gfx1151"
|
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
|
||||||
|
# add gfx950 conditionally starting in ROCm 7.0
|
||||||
|
if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
|
||||||
|
PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH};gfx950"
|
||||||
|
fi
|
||||||
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
|
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
|
||||||
;;
|
;;
|
||||||
manylinux2_28-builder:xpu)
|
manylinux2_28-builder:xpu)
|
||||||
TARGET=xpu_final
|
TARGET=xpu_final
|
||||||
GPU_IMAGE=amd64/almalinux:8
|
GPU_IMAGE=amd64/almalinux:8
|
||||||
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13"
|
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
|
||||||
MANY_LINUX_VERSION="2_28"
|
MANY_LINUX_VERSION="2_28"
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
|
|||||||
@ -10,6 +10,11 @@ BAD_SSL = "https://self-signed.badssl.com"
|
|||||||
|
|
||||||
print("Testing SSL certificate checking for Python:", sys.version)
|
print("Testing SSL certificate checking for Python:", sys.version)
|
||||||
|
|
||||||
|
if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4):
|
||||||
|
print("This version never checks SSL certs; skipping tests")
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
|
||||||
EXC = OSError
|
EXC = OSError
|
||||||
|
|
||||||
print(f"Connecting to {GOOD_SSL} should work")
|
print(f"Connecting to {GOOD_SSL} should work")
|
||||||
|
|||||||
@ -120,8 +120,9 @@ ninja==1.11.1.4
|
|||||||
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
|
numba==0.55.2 ; python_version == "3.10" and platform_machine != "s390x"
|
||||||
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
|
numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
|
||||||
#Description: Just-In-Time Compiler for Numerical Functions
|
#Description: Just-In-Time Compiler for Numerical Functions
|
||||||
#Pinned versions: 0.55.2, 0.60.0
|
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
|
||||||
#test that import: test_numba_integration.py
|
#test that import: test_numba_integration.py
|
||||||
|
#For numba issue see https://github.com/pytorch/pytorch/issues/51511
|
||||||
#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
|
#Need release > 0.61.2 for s390x due to https://github.com/numba/numba/pull/10073
|
||||||
|
|
||||||
#numpy
|
#numpy
|
||||||
@ -138,12 +139,10 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
|
|||||||
#test_binary_ufuncs.py
|
#test_binary_ufuncs.py
|
||||||
numpy==1.22.4; python_version == "3.10"
|
numpy==1.22.4; python_version == "3.10"
|
||||||
numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
|
numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
|
||||||
numpy==2.1.2; python_version >= "3.13" and python_version < "3.14"
|
numpy==2.1.2; python_version >= "3.13"
|
||||||
numpy==2.3.4; python_version >= "3.14"
|
|
||||||
|
|
||||||
pandas==2.0.3; python_version < "3.13"
|
pandas==2.0.3; python_version < "3.13"
|
||||||
pandas==2.2.3; python_version >= "3.13" and python_version < "3.14"
|
pandas==2.2.3; python_version >= "3.13"
|
||||||
pandas==2.3.3; python_version >= "3.14"
|
|
||||||
|
|
||||||
#onnxruntime
|
#onnxruntime
|
||||||
#Description: scoring engine for Open Neural Network Exchange (ONNX) models
|
#Description: scoring engine for Open Neural Network Exchange (ONNX) models
|
||||||
@ -155,8 +154,7 @@ opt-einsum==3.3
|
|||||||
#Pinned versions: 3.3
|
#Pinned versions: 3.3
|
||||||
#test that import: test_linalg.py
|
#test that import: test_linalg.py
|
||||||
|
|
||||||
optree==0.13.0 ; python_version < "3.14"
|
optree==0.13.0
|
||||||
optree==0.17.0 ; python_version >= "3.14"
|
|
||||||
#Description: A library for tree manipulation
|
#Description: A library for tree manipulation
|
||||||
#Pinned versions: 0.13.0
|
#Pinned versions: 0.13.0
|
||||||
#test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
|
#test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
|
||||||
@ -244,9 +242,10 @@ pygments==2.15.0
|
|||||||
#Pinned versions: 14.1.0
|
#Pinned versions: 14.1.0
|
||||||
#test that import:
|
#test that import:
|
||||||
|
|
||||||
scikit-image==0.22.0
|
scikit-image==0.19.3 ; python_version < "3.10"
|
||||||
|
scikit-image==0.22.0 ; python_version >= "3.10"
|
||||||
#Description: image processing routines
|
#Description: image processing routines
|
||||||
#Pinned versions: 0.22.0
|
#Pinned versions:
|
||||||
#test that import: test_nn.py
|
#test that import: test_nn.py
|
||||||
|
|
||||||
#scikit-learn
|
#scikit-learn
|
||||||
@ -255,8 +254,7 @@ scikit-image==0.22.0
|
|||||||
#test that import:
|
#test that import:
|
||||||
|
|
||||||
scipy==1.10.1 ; python_version <= "3.11"
|
scipy==1.10.1 ; python_version <= "3.11"
|
||||||
scipy==1.14.1 ; python_version > "3.11" and python_version < "3.14"
|
scipy==1.14.1 ; python_version >= "3.12"
|
||||||
scipy==1.16.2 ; python_version >= "3.14"
|
|
||||||
# Pin SciPy because of failing distribution tests (see #60347)
|
# Pin SciPy because of failing distribution tests (see #60347)
|
||||||
#Description: scientific python
|
#Description: scientific python
|
||||||
#Pinned versions: 1.10.1
|
#Pinned versions: 1.10.1
|
||||||
@ -328,8 +326,7 @@ pywavelets==1.7.0 ; python_version >= "3.12"
|
|||||||
#Pinned versions: 1.4.1
|
#Pinned versions: 1.4.1
|
||||||
#test that import:
|
#test that import:
|
||||||
|
|
||||||
lxml==5.3.0 ; python_version < "3.14"
|
lxml==5.3.0
|
||||||
lxml==6.0.2 ; python_version >= "3.14"
|
|
||||||
#Description: This is a requirement of unittest-xml-reporting
|
#Description: This is a requirement of unittest-xml-reporting
|
||||||
|
|
||||||
PyGithub==2.3.0
|
PyGithub==2.3.0
|
||||||
@ -339,14 +336,12 @@ sympy==1.13.3
|
|||||||
#Pinned versions:
|
#Pinned versions:
|
||||||
#test that import:
|
#test that import:
|
||||||
|
|
||||||
onnx==1.19.1 ; python_version < "3.14"
|
onnx==1.18.0
|
||||||
# Unpin once Python 3.14 is supported. See onnxruntime issue 26309.
|
|
||||||
onnx==1.18.0 ; python_version == "3.14"
|
|
||||||
#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
|
#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
|
||||||
#Pinned versions:
|
#Pinned versions:
|
||||||
#test that import:
|
#test that import:
|
||||||
|
|
||||||
onnxscript==0.5.4
|
onnxscript==0.5.3
|
||||||
#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
|
#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
|
||||||
#Pinned versions:
|
#Pinned versions:
|
||||||
#test that import:
|
#test that import:
|
||||||
@ -366,7 +361,7 @@ pwlf==2.2.1
|
|||||||
#test that import: test_sac_estimator.py
|
#test that import: test_sac_estimator.py
|
||||||
|
|
||||||
# To build PyTorch itself
|
# To build PyTorch itself
|
||||||
pyyaml==6.0.3
|
pyyaml==6.0.2
|
||||||
pyzstd
|
pyzstd
|
||||||
setuptools==78.1.1
|
setuptools==78.1.1
|
||||||
packaging==23.1
|
packaging==23.1
|
||||||
|
|||||||
@ -1,11 +1,15 @@
|
|||||||
sphinx==7.2.6
|
sphinx==5.3.0
|
||||||
#Description: This is used to generate PyTorch docs
|
#Description: This is used to generate PyTorch docs
|
||||||
#Pinned versions: 7.2.6
|
#Pinned versions: 5.3.0
|
||||||
|
|
||||||
pytorch_sphinx_theme2==0.2.0
|
standard-imghdr==3.13.0; python_version >= "3.13"
|
||||||
#Description: This is needed to generate PyTorch docs
|
#Description: This is needed by Sphinx, so it needs to be added here.
|
||||||
#Pinned versions: 0.2.0
|
# The reasons are as follows:
|
||||||
|
# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
|
||||||
|
# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
|
||||||
|
# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
|
||||||
|
|
||||||
|
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
|
||||||
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
|
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
|
||||||
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
|
# but it doesn't seem to work and hangs around idly. The initial thought that it is probably
|
||||||
# something related to Docker setup. We can investigate this later.
|
# something related to Docker setup. We can investigate this later.
|
||||||
@ -32,17 +36,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
|
|||||||
#Description: This is used to generate PyTorch docs
|
#Description: This is used to generate PyTorch docs
|
||||||
#Pinned versions: 2.13.0
|
#Pinned versions: 2.13.0
|
||||||
|
|
||||||
breathe==4.36.0
|
breathe==4.34.0
|
||||||
#Description: This is used to generate PyTorch C++ docs
|
#Description: This is used to generate PyTorch C++ docs
|
||||||
#Pinned versions: 4.36.0
|
#Pinned versions: 4.34.0
|
||||||
|
|
||||||
exhale==0.3.7
|
exhale==0.2.3
|
||||||
#Description: This is used to generate PyTorch C++ docs
|
#Description: This is used to generate PyTorch C++ docs
|
||||||
#Pinned versions: 0.3.7
|
#Pinned versions: 0.2.3
|
||||||
|
|
||||||
docutils==0.20
|
docutils==0.16
|
||||||
#Description: This is used to generate PyTorch C++ docs
|
#Description: This is used to generate PyTorch C++ docs
|
||||||
#Pinned versions: 0.20
|
#Pinned versions: 0.16
|
||||||
|
|
||||||
bs4==0.0.1
|
bs4==0.0.1
|
||||||
#Description: This is used to generate PyTorch C++ docs
|
#Description: This is used to generate PyTorch C++ docs
|
||||||
@ -52,13 +56,13 @@ IPython==8.12.0
|
|||||||
#Description: This is used to generate PyTorch functorch docs
|
#Description: This is used to generate PyTorch functorch docs
|
||||||
#Pinned versions: 8.12.0
|
#Pinned versions: 8.12.0
|
||||||
|
|
||||||
myst-nb==1.3.0
|
myst-nb==0.17.2
|
||||||
#Description: This is used to generate PyTorch functorch and torch.compile docs.
|
#Description: This is used to generate PyTorch functorch and torch.compile docs.
|
||||||
#Pinned versions: 1.3.0
|
#Pinned versions: 0.17.2
|
||||||
|
|
||||||
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
|
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
|
||||||
python-etcd==0.4.5
|
python-etcd==0.4.5
|
||||||
sphinx-copybutton==0.5.0
|
sphinx-copybutton==0.5.0
|
||||||
sphinx-design==0.6.1
|
sphinx-design==0.4.0
|
||||||
sphinxcontrib-mermaid==1.0.0
|
sphinxcontrib-mermaid==1.0.0
|
||||||
myst-parser==4.0.1
|
myst-parser==0.18.1
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
3.5.1
|
3.5.0
|
||||||
|
|||||||
@ -54,15 +54,12 @@ ENV OPENSSL_DIR /opt/openssl
|
|||||||
RUN rm install_openssl.sh
|
RUN rm install_openssl.sh
|
||||||
|
|
||||||
ARG INDUCTOR_BENCHMARKS
|
ARG INDUCTOR_BENCHMARKS
|
||||||
ARG ANACONDA_PYTHON_VERSION
|
|
||||||
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
|
|
||||||
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
|
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
|
||||||
COPY ./common/common_utils.sh common_utils.sh
|
COPY ./common/common_utils.sh common_utils.sh
|
||||||
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
|
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
|
||||||
COPY ci_commit_pins/timm.txt timm.txt
|
COPY ci_commit_pins/timm.txt timm.txt
|
||||||
COPY ci_commit_pins/torchbench.txt torchbench.txt
|
|
||||||
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
|
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
|
||||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
|
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
|
||||||
|
|
||||||
# Install XPU Dependencies
|
# Install XPU Dependencies
|
||||||
ARG XPU_VERSION
|
ARG XPU_VERSION
|
||||||
|
|||||||
@ -100,16 +100,9 @@ COPY ./common/common_utils.sh common_utils.sh
|
|||||||
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
|
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
|
||||||
COPY ci_commit_pins/timm.txt timm.txt
|
COPY ci_commit_pins/timm.txt timm.txt
|
||||||
COPY ci_commit_pins/torchbench.txt torchbench.txt
|
COPY ci_commit_pins/torchbench.txt torchbench.txt
|
||||||
# Only build aoti cpp tests when INDUCTOR_BENCHMARKS is set to True
|
|
||||||
ENV BUILD_AOT_INDUCTOR_TEST ${INDUCTOR_BENCHMARKS}
|
|
||||||
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
|
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
|
||||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
|
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
|
||||||
|
|
||||||
ARG INSTALL_MINGW
|
|
||||||
COPY ./common/install_mingw.sh install_mingw.sh
|
|
||||||
RUN if [ -n "${INSTALL_MINGW}" ]; then bash ./install_mingw.sh; fi
|
|
||||||
RUN rm install_mingw.sh
|
|
||||||
|
|
||||||
ARG TRITON
|
ARG TRITON
|
||||||
ARG TRITON_CPU
|
ARG TRITON_CPU
|
||||||
|
|
||||||
@ -143,15 +136,6 @@ COPY ci_commit_pins/halide.txt halide.txt
|
|||||||
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
|
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
|
||||||
RUN rm install_halide.sh common_utils.sh halide.txt
|
RUN rm install_halide.sh common_utils.sh halide.txt
|
||||||
|
|
||||||
ARG PALLAS
|
|
||||||
ARG CUDA_VERSION
|
|
||||||
# Install JAX with CUDA support (for Pallas)
|
|
||||||
COPY ./common/install_jax.sh install_jax.sh
|
|
||||||
COPY ./common/common_utils.sh common_utils.sh
|
|
||||||
COPY ./ci_commit_pins/jax.txt /ci_commit_pins/jax.txt
|
|
||||||
RUN if [ -n "${PALLAS}" ]; then bash ./install_jax.sh ${CUDA_VERSION}; fi
|
|
||||||
RUN rm -f install_jax.sh common_utils.sh /ci_commit_pins/jax.txt
|
|
||||||
|
|
||||||
ARG ONNX
|
ARG ONNX
|
||||||
# Install ONNX dependencies
|
# Install ONNX dependencies
|
||||||
COPY ./common/install_onnx.sh ./common/common_utils.sh ./
|
COPY ./common/install_onnx.sh ./common/common_utils.sh ./
|
||||||
|
|||||||
@ -8,11 +8,9 @@ from abc import ABC, abstractmethod
|
|||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from collections.abc import Callable # Python 3.11+
|
from typing import Any, Callable, Required, TypedDict # Python 3.11+
|
||||||
from typing import Any, Required, TypedDict
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
from collections.abc import Callable
|
from typing import Any, Callable, TypedDict
|
||||||
from typing import Any, TypedDict
|
|
||||||
|
|
||||||
from typing_extensions import Required # Fallback for Python <3.11
|
from typing_extensions import Required # Fallback for Python <3.11
|
||||||
|
|
||||||
|
|||||||
@ -57,8 +57,8 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules
|
|||||||
logger.info("Successfully cloned %s", target)
|
logger.info("Successfully cloned %s", target)
|
||||||
return r, commit
|
return r, commit
|
||||||
|
|
||||||
except GitCommandError:
|
except GitCommandError as e:
|
||||||
logger.exception("Git operation failed")
|
logger.error("Git operation failed: %s", e)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -143,7 +143,7 @@ def sample_vllm_test_library():
|
|||||||
"pytest -v -s compile/test_decorator.py",
|
"pytest -v -s compile/test_decorator.py",
|
||||||
],
|
],
|
||||||
},
|
},
|
||||||
"vllm_language_model_test_extended_generation_28_failure_test": {
|
"vllm_languagde_model_test_extended_generation_28_failure_test": {
|
||||||
"title": "Language Models Test (Extended Generation) 2.8 release failure",
|
"title": "Language Models Test (Extended Generation) 2.8 release failure",
|
||||||
"id": "vllm_languagde_model_test_extended_generation_28_failure_test",
|
"id": "vllm_languagde_model_test_extended_generation_28_failure_test",
|
||||||
"package_install": [
|
"package_install": [
|
||||||
|
|||||||
@ -63,7 +63,7 @@ class VllmBuildParameters:
|
|||||||
# DOCKERFILE_PATH: path to Dockerfile used when use_local_dockerfile is True"
|
# DOCKERFILE_PATH: path to Dockerfile used when use_local_dockerfile is True"
|
||||||
use_local_dockerfile: bool = env_bool_field("USE_LOCAL_DOCKERFILE", True)
|
use_local_dockerfile: bool = env_bool_field("USE_LOCAL_DOCKERFILE", True)
|
||||||
dockerfile_path: Path = env_path_field(
|
dockerfile_path: Path = env_path_field(
|
||||||
"DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile"
|
"DOCKERFILE_PATH", ".github/ci_configs/vllm/Dockerfile.tmp_vllm"
|
||||||
)
|
)
|
||||||
|
|
||||||
# the cleaning script to remove torch dependencies from pip
|
# the cleaning script to remove torch dependencies from pip
|
||||||
|
|||||||
@ -6,7 +6,7 @@ dependencies = [
|
|||||||
"GitPython==3.1.45",
|
"GitPython==3.1.45",
|
||||||
"docker==7.1.0",
|
"docker==7.1.0",
|
||||||
"pytest==7.3.2",
|
"pytest==7.3.2",
|
||||||
"uv==0.9.6"
|
"uv==0.8.6"
|
||||||
]
|
]
|
||||||
|
|
||||||
[tool.setuptools]
|
[tool.setuptools]
|
||||||
|
|||||||
@ -1,11 +1,11 @@
|
|||||||
SHELL=/usr/bin/env bash
|
SHELL=/usr/bin/env bash
|
||||||
|
|
||||||
DOCKER_CMD ?= docker
|
DOCKER_CMD ?= docker
|
||||||
DESIRED_ROCM ?= 7.1
|
DESIRED_ROCM ?= 7.0
|
||||||
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
|
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
|
||||||
PACKAGE_NAME = magma-rocm
|
PACKAGE_NAME = magma-rocm
|
||||||
# inherit this from underlying docker image, do not pass this env var to docker
|
# inherit this from underlying docker image, do not pass this env var to docker
|
||||||
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1150;gfx1151;gfx1200;gfx1201
|
#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
|
||||||
|
|
||||||
DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
|
DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
|
||||||
-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
|
-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
|
||||||
@ -16,20 +16,15 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
|
|||||||
magma-rocm/build_magma.sh
|
magma-rocm/build_magma.sh
|
||||||
|
|
||||||
.PHONY: all
|
.PHONY: all
|
||||||
all: magma-rocm71
|
|
||||||
all: magma-rocm70
|
all: magma-rocm70
|
||||||
all: magma-rocm64
|
all: magma-rocm64
|
||||||
|
all: magma-rocm63
|
||||||
|
|
||||||
.PHONY:
|
.PHONY:
|
||||||
clean:
|
clean:
|
||||||
$(RM) -r magma-*
|
$(RM) -r magma-*
|
||||||
$(RM) -r output
|
$(RM) -r output
|
||||||
|
|
||||||
.PHONY: magma-rocm71
|
|
||||||
magma-rocm71: DESIRED_ROCM := 7.1
|
|
||||||
magma-rocm71:
|
|
||||||
$(DOCKER_RUN)
|
|
||||||
|
|
||||||
.PHONY: magma-rocm70
|
.PHONY: magma-rocm70
|
||||||
magma-rocm70: DESIRED_ROCM := 7.0
|
magma-rocm70: DESIRED_ROCM := 7.0
|
||||||
magma-rocm70:
|
magma-rocm70:
|
||||||
@ -39,3 +34,8 @@ magma-rocm70:
|
|||||||
magma-rocm64: DESIRED_ROCM := 6.4
|
magma-rocm64: DESIRED_ROCM := 6.4
|
||||||
magma-rocm64:
|
magma-rocm64:
|
||||||
$(DOCKER_RUN)
|
$(DOCKER_RUN)
|
||||||
|
|
||||||
|
.PHONY: magma-rocm63
|
||||||
|
magma-rocm63: DESIRED_ROCM := 6.3
|
||||||
|
magma-rocm63:
|
||||||
|
$(DOCKER_RUN)
|
||||||
|
|||||||
@ -187,22 +187,19 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
|
|||||||
export USE_CUFILE=0
|
export USE_CUFILE=0
|
||||||
else
|
else
|
||||||
DEPS_LIST+=(
|
DEPS_LIST+=(
|
||||||
|
"/usr/local/cuda/lib64/libnvToolsExt.so.1"
|
||||||
"/usr/local/cuda/lib64/libcublas.so.12"
|
"/usr/local/cuda/lib64/libcublas.so.12"
|
||||||
"/usr/local/cuda/lib64/libcublasLt.so.12"
|
"/usr/local/cuda/lib64/libcublasLt.so.12"
|
||||||
"/usr/local/cuda/lib64/libcudart.so.12"
|
"/usr/local/cuda/lib64/libcudart.so.12"
|
||||||
"/usr/local/cuda/lib64/libnvrtc.so.12"
|
"/usr/local/cuda/lib64/libnvrtc.so.12"
|
||||||
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12")
|
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12")
|
||||||
DEPS_SONAME+=(
|
DEPS_SONAME+=(
|
||||||
|
"libnvToolsExt.so.1"
|
||||||
"libcublas.so.12"
|
"libcublas.so.12"
|
||||||
"libcublasLt.so.12"
|
"libcublasLt.so.12"
|
||||||
"libcudart.so.12"
|
"libcudart.so.12"
|
||||||
"libnvrtc.so.12"
|
"libnvrtc.so.12"
|
||||||
"libcupti.so.12")
|
"libcupti.so.12")
|
||||||
|
|
||||||
if [[ $CUDA_VERSION != 12.9* ]]; then
|
|
||||||
DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")
|
|
||||||
DEPS_SONAME+=("libnvToolsExt.so.1")
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
echo "Using nvidia libs from pypi."
|
echo "Using nvidia libs from pypi."
|
||||||
|
|||||||
@ -168,16 +168,14 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
|||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source /opt/intel/oneapi/compiler/latest/env/vars.sh
|
source /opt/intel/oneapi/compiler/latest/env/vars.sh
|
||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source /opt/intel/oneapi/umf/latest/env/vars.sh
|
|
||||||
# shellcheck disable=SC1091
|
|
||||||
source /opt/intel/oneapi/ccl/latest/env/vars.sh
|
source /opt/intel/oneapi/ccl/latest/env/vars.sh
|
||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source /opt/intel/oneapi/mpi/latest/env/vars.sh
|
source /opt/intel/oneapi/mpi/latest/env/vars.sh
|
||||||
# shellcheck disable=SC1091
|
|
||||||
source /opt/intel/oneapi/pti/latest/env/vars.sh
|
|
||||||
# Enable XCCL build
|
# Enable XCCL build
|
||||||
export USE_XCCL=1
|
export USE_XCCL=1
|
||||||
export USE_MPI=0
|
export USE_MPI=0
|
||||||
|
# XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
|
||||||
|
export USE_KINETO=0
|
||||||
export TORCH_XPU_ARCH_LIST=pvc
|
export TORCH_XPU_ARCH_LIST=pvc
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -235,9 +233,7 @@ if [[ "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
|
|||||||
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
|
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "$BUILD_ENVIRONMENT" == *-full-debug* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
|
||||||
export CMAKE_BUILD_TYPE=Debug
|
|
||||||
elif [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
|
|
||||||
export CMAKE_BUILD_TYPE=RelWithAssert
|
export CMAKE_BUILD_TYPE=RelWithAssert
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -303,11 +299,6 @@ else
|
|||||||
python -m build --wheel --no-isolation
|
python -m build --wheel --no-isolation
|
||||||
fi
|
fi
|
||||||
pip_install_whl "$(echo dist/*.whl)"
|
pip_install_whl "$(echo dist/*.whl)"
|
||||||
if [[ "$BUILD_ENVIRONMENT" == *full-debug* ]]; then
|
|
||||||
# Regression test for https://github.com/pytorch/pytorch/issues/164297
|
|
||||||
# Torch should be importable and that's about it
|
|
||||||
pushd /; python -c "import torch;print(torch.__config__.show(), torch.randn(5) + 1.7)"; popd
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *vision* ]]; then
|
if [[ "${BUILD_ADDITIONAL_PACKAGES:-}" == *vision* ]]; then
|
||||||
install_torchvision
|
install_torchvision
|
||||||
@ -428,7 +419,7 @@ fi
|
|||||||
if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then
|
||||||
# export test times so that potential sharded tests that'll branch off this build will use consistent data
|
# export test times so that potential sharded tests that'll branch off this build will use consistent data
|
||||||
# don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
|
# don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
|
||||||
PYTHONPATH=. python tools/stats/export_test_times.py
|
python tools/stats/export_test_times.py
|
||||||
fi
|
fi
|
||||||
# don't do this for bazel or s390x or riscv64 as they don't use sccache
|
# don't do this for bazel or s390x or riscv64 as they don't use sccache
|
||||||
if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
|
||||||
|
|||||||
@ -67,7 +67,7 @@ fi
|
|||||||
# wheels with cxx11-abi
|
# wheels with cxx11-abi
|
||||||
|
|
||||||
echo "Checking that the gcc ABI is what we expect"
|
echo "Checking that the gcc ABI is what we expect"
|
||||||
if [[ "$(uname)" != 'Darwin' ]]; then
|
if [[ "$(uname)" != 'Darwin' && "$(uname -m)" != "s390x" ]]; then
|
||||||
# We also check that there are cxx11 symbols in libtorch
|
# We also check that there are cxx11 symbols in libtorch
|
||||||
#
|
#
|
||||||
echo "Checking that symbols in libtorch.so have the right gcc abi"
|
echo "Checking that symbols in libtorch.so have the right gcc abi"
|
||||||
|
|||||||
@ -256,7 +256,7 @@ test_torchbench_smoketest() {
|
|||||||
local device=mps
|
local device=mps
|
||||||
local dtypes=(undefined float16 bfloat16 notset)
|
local dtypes=(undefined float16 bfloat16 notset)
|
||||||
local dtype=${dtypes[$1]}
|
local dtype=${dtypes[$1]}
|
||||||
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
|
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
|
||||||
|
|
||||||
for backend in eager inductor; do
|
for backend in eager inductor; do
|
||||||
|
|
||||||
@ -319,7 +319,7 @@ test_aoti_torchbench_smoketest() {
|
|||||||
local device=mps
|
local device=mps
|
||||||
local dtypes=(undefined float16 bfloat16 notset)
|
local dtypes=(undefined float16 bfloat16 notset)
|
||||||
local dtype=${dtypes[$1]}
|
local dtype=${dtypes[$1]}
|
||||||
local models=(llama BERT_pytorch dcgan yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor vgg16)
|
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)
|
||||||
|
|
||||||
echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"
|
echo "Launching torchbench inference performance run for AOT Inductor and dtype ${dtype}"
|
||||||
local dtype_arg="--${dtype}"
|
local dtype_arg="--${dtype}"
|
||||||
|
|||||||
@ -89,41 +89,23 @@ if [ "$is_main_doc" = true ]; then
|
|||||||
|
|
||||||
make coverage
|
make coverage
|
||||||
# Now we have the coverage report, we need to make sure it is empty.
|
# Now we have the coverage report, we need to make sure it is empty.
|
||||||
# Sphinx 7.2.6+ format: python.txt contains a statistics table with a TOTAL row
|
# Count the number of lines in the file and turn that number into a variable
|
||||||
# showing the undocumented count in the third column.
|
# $lines. The `cut -f1 ...` is to only parse the number, not the filename
|
||||||
# Example: | TOTAL | 99.83% | 2 |
|
# Skip the report header by subtracting 2: the header will be output even if
|
||||||
|
# there are no undocumented items.
|
||||||
#
|
#
|
||||||
# Also: see docs/source/conf.py for "coverage_ignore*" items, which should
|
# Also: see docs/source/conf.py for "coverage_ignore*" items, which should
|
||||||
# be documented then removed from there.
|
# be documented then removed from there.
|
||||||
|
lines=$(wc -l build/coverage/python.txt 2>/dev/null |cut -f1 -d' ')
|
||||||
# Extract undocumented count from TOTAL row in Sphinx 7.2.6 statistics table
|
undocumented=$((lines - 2))
|
||||||
# The table format is: | Module | Coverage | Undocumented |
|
if [ $undocumented -lt 0 ]; then
|
||||||
# Extract the third column (undocumented count) from the TOTAL row
|
|
||||||
undocumented=$(grep "| TOTAL" build/coverage/python.txt | awk -F'|' '{print $4}' | tr -d ' ')
|
|
||||||
|
|
||||||
if [ -z "$undocumented" ] || ! [[ "$undocumented" =~ ^[0-9]+$ ]]; then
|
|
||||||
echo coverage output not found
|
echo coverage output not found
|
||||||
exit 1
|
exit 1
|
||||||
elif [ "$undocumented" -gt 0 ]; then
|
elif [ $undocumented -gt 0 ]; then
|
||||||
set +x # Disable command echoing for cleaner output
|
echo undocumented objects found:
|
||||||
echo ""
|
cat build/coverage/python.txt
|
||||||
echo "====================="
|
|
||||||
echo "UNDOCUMENTED OBJECTS:"
|
|
||||||
echo "====================="
|
|
||||||
echo ""
|
|
||||||
# Find the line number of the TOTAL row and print only what comes after it
|
|
||||||
total_line=$(grep -n "| TOTAL" build/coverage/python.txt | cut -d: -f1)
|
|
||||||
if [ -n "$total_line" ]; then
|
|
||||||
# Print only the detailed list (skip the statistics table)
|
|
||||||
tail -n +$((total_line + 2)) build/coverage/python.txt
|
|
||||||
else
|
|
||||||
# Fallback to showing entire file if TOTAL line not found
|
|
||||||
cat build/coverage/python.txt
|
|
||||||
fi
|
|
||||||
echo ""
|
|
||||||
echo "Make sure you've updated relevant .rsts in docs/source!"
|
echo "Make sure you've updated relevant .rsts in docs/source!"
|
||||||
echo "You can reproduce locally by running 'cd docs && make coverage && tail -n +\$((grep -n \"| TOTAL\" build/coverage/python.txt | cut -d: -f1) + 2)) build/coverage/python.txt'"
|
echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
|
||||||
set -x # Re-enable command echoing
|
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
|
|||||||
@ -208,8 +208,6 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
|||||||
source /opt/intel/oneapi/ccl/latest/env/vars.sh
|
source /opt/intel/oneapi/ccl/latest/env/vars.sh
|
||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source /opt/intel/oneapi/mpi/latest/env/vars.sh
|
source /opt/intel/oneapi/mpi/latest/env/vars.sh
|
||||||
# shellcheck disable=SC1091
|
|
||||||
source /opt/intel/oneapi/pti/latest/env/vars.sh
|
|
||||||
# Check XPU status before testing
|
# Check XPU status before testing
|
||||||
timeout 30 xpu-smi discovery || true
|
timeout 30 xpu-smi discovery || true
|
||||||
fi
|
fi
|
||||||
@ -339,13 +337,13 @@ test_python() {
|
|||||||
|
|
||||||
test_python_smoke() {
|
test_python_smoke() {
|
||||||
# Smoke tests for H100/B200
|
# Smoke tests for H100/B200
|
||||||
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune inductor/test_cutedsl_grouped_mm $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
}
|
}
|
||||||
|
|
||||||
test_python_smoke_b200() {
|
test_python_smoke_b200() {
|
||||||
# Targeted smoke tests for B200 - staged approach to avoid too many failures
|
# Targeted smoke tests for B200 - staged approach to avoid too many failures
|
||||||
time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
|
||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -462,37 +460,31 @@ test_inductor_shard() {
|
|||||||
--verbose
|
--verbose
|
||||||
}
|
}
|
||||||
|
|
||||||
test_inductor_aoti_cpp() {
|
test_inductor_aoti() {
|
||||||
|
# docker build uses bdist_wheel which does not work with test_aot_inductor
|
||||||
|
# TODO: need a faster way to build
|
||||||
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
|
||||||
# We need to hipify before building again
|
# We need to hipify before building again
|
||||||
python3 tools/amd_build/build_amd.py
|
python3 tools/amd_build/build_amd.py
|
||||||
fi
|
fi
|
||||||
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
|
||||||
|
BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
|
||||||
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
|
# TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
|
||||||
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
|
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
|
||||||
else
|
else
|
||||||
|
BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
|
||||||
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
|
TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# aoti cmake custom command requires `torch` to be installed
|
||||||
|
# initialize the cmake build cache and install torch
|
||||||
|
/usr/bin/env "${BUILD_COMMAND[@]}"
|
||||||
|
# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
|
||||||
|
/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
|
||||||
|
|
||||||
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
|
/usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
|
||||||
}
|
}
|
||||||
|
|
||||||
test_inductor_aoti_cross_compile_for_windows() {
|
|
||||||
|
|
||||||
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
|
||||||
mkdir -p "$TEST_REPORTS_DIR"
|
|
||||||
|
|
||||||
# Set WINDOWS_CUDA_HOME environment variable
|
|
||||||
WINDOWS_CUDA_HOME="$(pwd)/win-torch-wheel-extracted"
|
|
||||||
export WINDOWS_CUDA_HOME
|
|
||||||
|
|
||||||
echo "WINDOWS_CUDA_HOME is set to: $WINDOWS_CUDA_HOME"
|
|
||||||
echo "Contents:"
|
|
||||||
ls -lah "$(pwd)/win-torch-wheel-extracted/lib/x64/" || true
|
|
||||||
|
|
||||||
python test/inductor/test_aoti_cross_compile_windows.py -k compile --package-dir "$TEST_REPORTS_DIR" --win-torch-lib-dir "$(pwd)/win-torch-wheel-extracted/torch/lib"
|
|
||||||
}
|
|
||||||
|
|
||||||
test_inductor_cpp_wrapper_shard() {
|
test_inductor_cpp_wrapper_shard() {
|
||||||
if [[ -z "$NUM_TEST_SHARDS" ]]; then
|
if [[ -z "$NUM_TEST_SHARDS" ]]; then
|
||||||
echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
|
echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
|
||||||
@ -574,8 +566,6 @@ fi
|
|||||||
|
|
||||||
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
|
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
|
||||||
DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
|
DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
|
||||||
elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
|
|
||||||
DYNAMO_BENCHMARK_FLAGS+=(--device xpu)
|
|
||||||
else
|
else
|
||||||
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
|
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
|
||||||
fi
|
fi
|
||||||
@ -669,8 +659,6 @@ test_perf_for_dashboard() {
|
|||||||
device=cuda_b200
|
device=cuda_b200
|
||||||
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
|
elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
|
||||||
device=rocm
|
device=rocm
|
||||||
elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
|
|
||||||
device=xpu
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
for mode in "${modes[@]}"; do
|
for mode in "${modes[@]}"; do
|
||||||
@ -826,11 +814,6 @@ test_inductor_halide() {
|
|||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
}
|
}
|
||||||
|
|
||||||
test_inductor_pallas() {
|
|
||||||
python test/run_test.py --include inductor/test_pallas.py --verbose
|
|
||||||
assert_git_not_dirty
|
|
||||||
}
|
|
||||||
|
|
||||||
test_inductor_triton_cpu() {
|
test_inductor_triton_cpu() {
|
||||||
python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose
|
python test/run_test.py --include inductor/test_triton_cpu_backend.py inductor/test_torchinductor_strided_blocks.py --verbose
|
||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
@ -855,7 +838,7 @@ test_dynamo_benchmark() {
|
|||||||
elif [[ "${suite}" == "timm_models" ]]; then
|
elif [[ "${suite}" == "timm_models" ]]; then
|
||||||
export TORCHBENCH_ONLY_MODELS="inception_v3"
|
export TORCHBENCH_ONLY_MODELS="inception_v3"
|
||||||
elif [[ "${suite}" == "torchbench" ]]; then
|
elif [[ "${suite}" == "torchbench" ]]; then
|
||||||
export TORCHBENCH_ONLY_MODELS="BERT_pytorch"
|
export TORCHBENCH_ONLY_MODELS="hf_Bert"
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
|
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
|
||||||
@ -886,13 +869,13 @@ test_inductor_torchbench_smoketest_perf() {
|
|||||||
mkdir -p "$TEST_REPORTS_DIR"
|
mkdir -p "$TEST_REPORTS_DIR"
|
||||||
|
|
||||||
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
|
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
|
||||||
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only BERT_pytorch \
|
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
|
||||||
--output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
|
--output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
|
||||||
# The threshold value needs to be actively maintained to make this check useful
|
# The threshold value needs to be actively maintained to make this check useful
|
||||||
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
|
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
|
||||||
|
|
||||||
# Check memory compression ratio for a few models
|
# Check memory compression ratio for a few models
|
||||||
for test in BERT_pytorch yolov3; do
|
for test in hf_Albert timm_vision_transformer; do
|
||||||
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
|
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
|
||||||
--disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
|
--disable-cudagraphs --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" \
|
||||||
--only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
|
--only $test --output "$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv"
|
||||||
@ -903,7 +886,7 @@ test_inductor_torchbench_smoketest_perf() {
|
|||||||
done
|
done
|
||||||
|
|
||||||
# Perform some "warm-start" runs for a few huggingface models.
|
# Perform some "warm-start" runs for a few huggingface models.
|
||||||
for test in AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
|
for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
|
||||||
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
|
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
|
||||||
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
|
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
|
||||||
python benchmarks/dynamo/check_accuracy.py \
|
python benchmarks/dynamo/check_accuracy.py \
|
||||||
@ -917,7 +900,7 @@ test_inductor_set_cpu_affinity(){
|
|||||||
export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD"
|
export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD"
|
||||||
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
|
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
|
||||||
|
|
||||||
if [[ "$(uname -m)" != "aarch64" ]]; then
|
if [[ "${TEST_CONFIG}" != *aarch64* ]]; then
|
||||||
# Use Intel OpenMP for x86
|
# Use Intel OpenMP for x86
|
||||||
IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
|
IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
|
||||||
export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD"
|
export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD"
|
||||||
@ -931,7 +914,7 @@ test_inductor_set_cpu_affinity(){
|
|||||||
cores=$((cpus / thread_per_core))
|
cores=$((cpus / thread_per_core))
|
||||||
|
|
||||||
# Set number of cores to 16 on aarch64 for performance runs
|
# Set number of cores to 16 on aarch64 for performance runs
|
||||||
if [[ "$(uname -m)" == "aarch64" && $cores -gt 16 ]]; then
|
if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then
|
||||||
cores=16
|
cores=16
|
||||||
fi
|
fi
|
||||||
export OMP_NUM_THREADS=$cores
|
export OMP_NUM_THREADS=$cores
|
||||||
@ -1632,7 +1615,6 @@ test_operator_benchmark() {
|
|||||||
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
||||||
mkdir -p "$TEST_REPORTS_DIR"
|
mkdir -p "$TEST_REPORTS_DIR"
|
||||||
TEST_DIR=$(pwd)
|
TEST_DIR=$(pwd)
|
||||||
ARCH=$(uname -m)
|
|
||||||
|
|
||||||
test_inductor_set_cpu_affinity
|
test_inductor_set_cpu_affinity
|
||||||
|
|
||||||
@ -1647,7 +1629,7 @@ test_operator_benchmark() {
|
|||||||
pip_install pandas
|
pip_install pandas
|
||||||
python check_perf_csv.py \
|
python check_perf_csv.py \
|
||||||
--actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \
|
--actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \
|
||||||
--expected "${ARCH}_expected_ci_operator_benchmark_eager_float32_cpu.csv"
|
--expected "expected_ci_operator_benchmark_eager_float32_cpu.csv"
|
||||||
}
|
}
|
||||||
|
|
||||||
test_operator_microbenchmark() {
|
test_operator_microbenchmark() {
|
||||||
@ -1660,7 +1642,7 @@ test_operator_microbenchmark() {
|
|||||||
|
|
||||||
cd "${TEST_DIR}"/benchmarks/operator_benchmark
|
cd "${TEST_DIR}"/benchmarks/operator_benchmark
|
||||||
|
|
||||||
for OP_BENCHMARK_TESTS in matmul mm addmm bmm conv; do
|
for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
|
||||||
$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
|
$TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
|
||||||
--output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
|
--output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
|
||||||
--benchmark-name "PyTorch operator microbenchmark" --use-compile
|
--benchmark-name "PyTorch operator microbenchmark" --use-compile
|
||||||
@ -1684,7 +1666,7 @@ if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then
|
|||||||
python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0
|
python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0
|
||||||
fi
|
fi
|
||||||
python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py
|
python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py
|
||||||
elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]]; then
|
elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then
|
||||||
test_linux_aarch64
|
test_linux_aarch64
|
||||||
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
|
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
|
||||||
test_forward_backward_compatibility
|
test_forward_backward_compatibility
|
||||||
@ -1731,14 +1713,10 @@ elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
|
|||||||
test_inductor_distributed
|
test_inductor_distributed
|
||||||
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
|
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
|
||||||
test_inductor_halide
|
test_inductor_halide
|
||||||
elif [[ "${TEST_CONFIG}" == *inductor-pallas* ]]; then
|
|
||||||
test_inductor_pallas
|
|
||||||
elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
|
elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
|
||||||
test_inductor_triton_cpu
|
test_inductor_triton_cpu
|
||||||
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
|
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
|
||||||
test_inductor_micro_benchmark
|
test_inductor_micro_benchmark
|
||||||
elif [[ "${TEST_CONFIG}" == *aoti_cross_compile_for_windows* ]]; then
|
|
||||||
test_inductor_aoti_cross_compile_for_windows
|
|
||||||
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
|
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
|
||||||
install_torchvision
|
install_torchvision
|
||||||
id=$((SHARD_NUMBER-1))
|
id=$((SHARD_NUMBER-1))
|
||||||
@ -1770,7 +1748,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
|
|||||||
else
|
else
|
||||||
# Do this after checkout_install_torchbench to ensure we clobber any
|
# Do this after checkout_install_torchbench to ensure we clobber any
|
||||||
# nightlies that torchbench may pull in
|
# nightlies that torchbench may pull in
|
||||||
if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then
|
if [[ "${TEST_CONFIG}" != *cpu* ]]; then
|
||||||
install_torchrec_and_fbgemm
|
install_torchrec_and_fbgemm
|
||||||
fi
|
fi
|
||||||
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
|
PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
|
||||||
@ -1779,7 +1757,7 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
|
|||||||
install_torchvision
|
install_torchvision
|
||||||
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
|
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
|
||||||
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
|
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
|
||||||
test_inductor_aoti_cpp
|
test_inductor_aoti
|
||||||
fi
|
fi
|
||||||
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
|
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
|
||||||
install_torchvision
|
install_torchvision
|
||||||
|
|||||||
@ -70,7 +70,7 @@ sccache --zero-stats
|
|||||||
sccache --show-stats
|
sccache --show-stats
|
||||||
|
|
||||||
# Build the wheel
|
# Build the wheel
|
||||||
python -m build --wheel --no-isolation
|
python -m build --wheel --no-build-isolation
|
||||||
if ($LASTEXITCODE -ne 0) { exit 1 }
|
if ($LASTEXITCODE -ne 0) { exit 1 }
|
||||||
|
|
||||||
# Install the wheel locally
|
# Install the wheel locally
|
||||||
|
|||||||
@ -15,35 +15,37 @@ if errorlevel 1 exit /b 1
|
|||||||
if not errorlevel 0 exit /b 1
|
if not errorlevel 0 exit /b 1
|
||||||
|
|
||||||
cd %TMP_DIR_WIN%\build\torch\test
|
cd %TMP_DIR_WIN%\build\torch\test
|
||||||
|
|
||||||
:: Enable delayed variable expansion to make the list
|
|
||||||
setlocal enabledelayedexpansion
|
|
||||||
set EXE_LIST=
|
|
||||||
for /r "." %%a in (*.exe) do (
|
for /r "." %%a in (*.exe) do (
|
||||||
if "%%~na" == "c10_intrusive_ptr_benchmark" (
|
call :libtorch_check "%%~na" "%%~fa"
|
||||||
@REM NB: This is not a gtest executable file, thus couldn't be handled by
|
|
||||||
@REM pytest-cpp and is excluded from test discovery by run_test
|
|
||||||
call "%%~fa"
|
|
||||||
if errorlevel 1 goto fail
|
if errorlevel 1 goto fail
|
||||||
if not errorlevel 0 goto fail
|
|
||||||
) else (
|
|
||||||
if "%%~na" == "verify_api_visibility" (
|
|
||||||
@REM Skip verify_api_visibility as it is a compile-level test
|
|
||||||
) else (
|
|
||||||
set EXE_LIST=!EXE_LIST! cpp/%%~na
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
goto :eof
|
||||||
|
|
||||||
|
:libtorch_check
|
||||||
|
|
||||||
cd %CWD%
|
cd %CWD%
|
||||||
set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test
|
set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\test
|
||||||
|
|
||||||
:: Run python test\run_test.py on the list
|
:: Skip verify_api_visibility as it a compile level test
|
||||||
set NO_TD=True && python test\run_test.py --cpp --verbose -i !EXE_LIST!
|
if "%~1" == "verify_api_visibility" goto :eof
|
||||||
if errorlevel 1 goto fail
|
|
||||||
if not errorlevel 0 goto fail
|
|
||||||
|
|
||||||
goto :eof
|
echo Running "%~2"
|
||||||
|
if "%~1" == "c10_intrusive_ptr_benchmark" (
|
||||||
|
:: NB: This is not a gtest executable file, thus couldn't be handled by pytest-cpp
|
||||||
|
call "%~2"
|
||||||
|
goto :eof
|
||||||
|
)
|
||||||
|
|
||||||
|
python test\run_test.py --cpp --verbose -i "cpp/%~1"
|
||||||
|
if errorlevel 1 (
|
||||||
|
echo %1 failed with exit code %errorlevel%
|
||||||
|
goto fail
|
||||||
|
)
|
||||||
|
if not errorlevel 0 (
|
||||||
|
echo %1 failed with exit code %errorlevel%
|
||||||
|
goto fail
|
||||||
|
)
|
||||||
|
|
||||||
:eof
|
:eof
|
||||||
exit /b 0
|
exit /b 0
|
||||||
|
|||||||
@ -38,7 +38,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# TODO: Move this to .ci/docker/requirements-ci.txt
|
# TODO: Move this to .ci/docker/requirements-ci.txt
|
||||||
python -m pip install "psutil==5.9.1" nvidia-ml-py "pytest-shard==0.1.2"
|
python -m pip install "psutil==5.9.1" "pynvml==11.4.1" "pytest-shard==0.1.2"
|
||||||
|
|
||||||
run_tests() {
|
run_tests() {
|
||||||
# Run nvidia-smi if available
|
# Run nvidia-smi if available
|
||||||
|
|||||||
@ -7,9 +7,12 @@ if "%DESIRED_PYTHON%" == "3.13t" (
|
|||||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
|
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
|
||||||
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
|
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
|
||||||
set PYTHON_EXEC="python3.13t"
|
set PYTHON_EXEC="python3.13t"
|
||||||
|
) else if "%DESIRED_PYTHON%"=="3.14" (
|
||||||
|
echo Python version is set to 3.14 or 3.14t
|
||||||
|
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
|
||||||
) else if "%DESIRED_PYTHON%"=="3.14t" (
|
) else if "%DESIRED_PYTHON%"=="3.14t" (
|
||||||
echo Python version is set to 3.14 or 3.14t
|
echo Python version is set to 3.14 or 3.14t
|
||||||
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0-amd64.exe"
|
set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
|
||||||
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
|
set ADDITIONAL_OPTIONS="Include_freethreaded=1"
|
||||||
set PYTHON_EXEC="python3.14t"
|
set PYTHON_EXEC="python3.14t"
|
||||||
) else (
|
) else (
|
||||||
|
|||||||
@ -71,7 +71,14 @@ export PYTORCH_BUILD_NUMBER=1
|
|||||||
|
|
||||||
# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
|
# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
|
||||||
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
|
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
|
||||||
TRITON_CONSTRAINT="platform_system == 'Linux'"
|
|
||||||
|
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
|
||||||
|
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
|
||||||
|
|
||||||
|
# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
|
||||||
|
if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
|
||||||
|
TRITON_CONSTRAINT="platform_system == 'Linux'"
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
|
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
|
||||||
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
|
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
|
||||||
@ -163,13 +170,8 @@ if [[ "$(uname)" != Darwin ]]; then
|
|||||||
MEMORY_LIMIT_MAX_JOBS=12
|
MEMORY_LIMIT_MAX_JOBS=12
|
||||||
NUM_CPUS=$(( $(nproc) - 2 ))
|
NUM_CPUS=$(( $(nproc) - 2 ))
|
||||||
|
|
||||||
if [[ "$(uname)" == Linux ]]; then
|
# Defaults here for **binary** linux builds so they can be changed in one place
|
||||||
# Defaults here for **binary** linux builds so they can be changed in one place
|
export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
|
||||||
export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
|
|
||||||
else
|
|
||||||
# For other builds
|
|
||||||
export MAX_JOBS=${NUM_CPUS}
|
|
||||||
fi
|
|
||||||
|
|
||||||
cat >>"$envfile" <<EOL
|
cat >>"$envfile" <<EOL
|
||||||
export MAX_JOBS="${MAX_JOBS}"
|
export MAX_JOBS="${MAX_JOBS}"
|
||||||
|
|||||||
@ -60,11 +60,9 @@ performance-*,
|
|||||||
readability-container-size-empty,
|
readability-container-size-empty,
|
||||||
readability-delete-null-pointer,
|
readability-delete-null-pointer,
|
||||||
readability-duplicate-include,
|
readability-duplicate-include,
|
||||||
readability-named-parameter,
|
|
||||||
readability-misplaced-array-index,
|
readability-misplaced-array-index,
|
||||||
readability-redundant*,
|
readability-redundant*,
|
||||||
readability-simplify-subscript-expr,
|
readability-simplify-subscript-expr,
|
||||||
readability-static-definition-in-anonymous-namespace
|
|
||||||
readability-string-compare,
|
readability-string-compare,
|
||||||
-readability-redundant-access-specifiers,
|
-readability-redundant-access-specifiers,
|
||||||
-readability-redundant-control-flow,
|
-readability-redundant-control-flow,
|
||||||
|
|||||||
@ -1,319 +0,0 @@
|
|||||||
---
|
|
||||||
name: add-uint-support
|
|
||||||
description: Add unsigned integer (uint) type support to PyTorch operators by updating AT_DISPATCH macros. Use when adding support for uint16, uint32, uint64 types to operators, kernels, or when user mentions enabling unsigned types, barebones unsigned types, or uint support.
|
|
||||||
---
|
|
||||||
|
|
||||||
# Add Unsigned Integer (uint) Support to Operators
|
|
||||||
|
|
||||||
This skill helps add support for unsigned integer types (uint16, uint32, uint64) to PyTorch operators by updating their AT_DISPATCH macros.
|
|
||||||
|
|
||||||
## When to use this skill
|
|
||||||
|
|
||||||
Use this skill when:
|
|
||||||
- Adding uint16, uint32, or uint64 support to an operator
|
|
||||||
- User mentions "unsigned types", "uint support", "barebones unsigned types"
|
|
||||||
- Enabling support for kUInt16, kUInt32, kUInt64 in kernels
|
|
||||||
- Working with operator implementations that need expanded type coverage
|
|
||||||
|
|
||||||
## Quick reference
|
|
||||||
|
|
||||||
**Add unsigned types to existing dispatch:**
|
|
||||||
```cpp
|
|
||||||
// Before
|
|
||||||
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}), AT_EXPAND(AT_ALL_TYPES));
|
|
||||||
|
|
||||||
// After (method 1: add unsigned types explicitly)
|
|
||||||
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
|
|
||||||
|
|
||||||
// After (method 2: use V2 integral types if AT_INTEGRAL_TYPES present)
|
|
||||||
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}), AT_EXPAND(AT_INTEGRAL_TYPES_V2), AT_EXPAND(AT_FLOATING_TYPES));
|
|
||||||
```
|
|
||||||
|
|
||||||
## Type group reference
|
|
||||||
|
|
||||||
**Unsigned type groups:**
|
|
||||||
- `AT_BAREBONES_UNSIGNED_TYPES`: kUInt16, kUInt32, kUInt64
|
|
||||||
- `AT_INTEGRAL_TYPES_V2`: AT_INTEGRAL_TYPES + AT_BAREBONES_UNSIGNED_TYPES
|
|
||||||
|
|
||||||
**Relationship:**
|
|
||||||
```cpp
|
|
||||||
AT_INTEGRAL_TYPES // kByte, kChar, kInt, kLong, kShort
|
|
||||||
AT_BAREBONES_UNSIGNED_TYPES // kUInt16, kUInt32, kUInt64
|
|
||||||
AT_INTEGRAL_TYPES_V2 // INTEGRAL_TYPES + BAREBONES_UNSIGNED_TYPES
|
|
||||||
```
|
|
||||||
|
|
||||||
## Instructions
|
|
||||||
|
|
||||||
### Step 1: Determine if conversion to V2 is needed
|
|
||||||
|
|
||||||
Check if the file uses AT_DISPATCH_V2:
|
|
||||||
|
|
||||||
**If using old AT_DISPATCH:**
|
|
||||||
- First convert to AT_DISPATCH_V2 using the at-dispatch-v2 skill
|
|
||||||
- Then proceed with adding uint support
|
|
||||||
|
|
||||||
**If already using AT_DISPATCH_V2:**
|
|
||||||
- Proceed directly to Step 2
|
|
||||||
|
|
||||||
### Step 2: Analyze the current dispatch macro
|
|
||||||
|
|
||||||
Identify what type groups are currently in use:
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
|
|
||||||
// body
|
|
||||||
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
Current type coverage
|
|
||||||
```
|
|
||||||
|
|
||||||
Common patterns:
|
|
||||||
- `AT_EXPAND(AT_ALL_TYPES)` → includes AT_INTEGRAL_TYPES + AT_FLOATING_TYPES
|
|
||||||
- `AT_EXPAND(AT_INTEGRAL_TYPES)` → signed integers only
|
|
||||||
- `AT_EXPAND(AT_FLOATING_TYPES)` → floating point types
|
|
||||||
|
|
||||||
### Step 3: Choose the uint addition method
|
|
||||||
|
|
||||||
Two approaches:
|
|
||||||
|
|
||||||
**Method 1: Add AT_BAREBONES_UNSIGNED_TYPES explicitly**
|
|
||||||
- Use when: You want to be explicit about adding uint support
|
|
||||||
- Add `AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)` to the type list
|
|
||||||
|
|
||||||
**Method 2: Substitute AT_INTEGRAL_TYPES with AT_INTEGRAL_TYPES_V2**
|
|
||||||
- Use when: The dispatch already uses `AT_EXPAND(AT_INTEGRAL_TYPES)`
|
|
||||||
- More concise: replaces one type group with its superset
|
|
||||||
- Only applicable if AT_INTEGRAL_TYPES is present
|
|
||||||
|
|
||||||
### Step 4: Apply the transformation
|
|
||||||
|
|
||||||
**Method 1 example:**
|
|
||||||
```cpp
|
|
||||||
// Before
|
|
||||||
AT_DISPATCH_V2(
|
|
||||||
dtype,
|
|
||||||
"min_values_cuda",
|
|
||||||
AT_WRAP([&]() {
|
|
||||||
kernel_impl<scalar_t>(iter);
|
|
||||||
}),
|
|
||||||
AT_EXPAND(AT_ALL_TYPES),
|
|
||||||
kBFloat16, kHalf, kBool
|
|
||||||
);
|
|
||||||
|
|
||||||
// After (add unsigned types)
|
|
||||||
AT_DISPATCH_V2(
|
|
||||||
dtype,
|
|
||||||
"min_values_cuda",
|
|
||||||
AT_WRAP([&]() {
|
|
||||||
kernel_impl<scalar_t>(iter);
|
|
||||||
}),
|
|
||||||
AT_EXPAND(AT_ALL_TYPES),
|
|
||||||
AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
|
|
||||||
kBFloat16, kHalf, kBool
|
|
||||||
);
|
|
||||||
```
|
|
||||||
|
|
||||||
**Method 2 example:**
|
|
||||||
```cpp
|
|
||||||
// Before
|
|
||||||
AT_DISPATCH_V2(
|
|
||||||
dtype,
|
|
||||||
"integral_op",
|
|
||||||
AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}),
|
|
||||||
AT_EXPAND(AT_INTEGRAL_TYPES)
|
|
||||||
);
|
|
||||||
|
|
||||||
// After (substitute with V2)
|
|
||||||
AT_DISPATCH_V2(
|
|
||||||
dtype,
|
|
||||||
"integral_op",
|
|
||||||
AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}),
|
|
||||||
AT_EXPAND(AT_INTEGRAL_TYPES_V2)
|
|
||||||
);
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 5: Handle AT_ALL_TYPES vs individual type groups
|
|
||||||
|
|
||||||
If the dispatch uses `AT_EXPAND(AT_ALL_TYPES)`:
|
|
||||||
- `AT_ALL_TYPES` = `AT_INTEGRAL_TYPES` + `AT_FLOATING_TYPES`
|
|
||||||
- To add uint: add `AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)` to the list
|
|
||||||
|
|
||||||
If the dispatch separately lists INTEGRAL and FLOATING:
|
|
||||||
```cpp
|
|
||||||
// Before
|
|
||||||
AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES)
|
|
||||||
|
|
||||||
// After (Method 2 preferred)
|
|
||||||
AT_EXPAND(AT_INTEGRAL_TYPES_V2), AT_EXPAND(AT_FLOATING_TYPES)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 6: Verify all dispatch sites
|
|
||||||
|
|
||||||
Check the file for ALL dispatch macros that need uint support:
|
|
||||||
- Some operators have multiple dispatch sites (CPU, CUDA, different functions)
|
|
||||||
- Apply the transformation consistently across all sites
|
|
||||||
- Ensure each gets the same type coverage updates
|
|
||||||
|
|
||||||
### Step 7: Validate the changes
|
|
||||||
|
|
||||||
Check that:
|
|
||||||
- [ ] AT_DISPATCH_V2 format is used (not old AT_DISPATCH)
|
|
||||||
- [ ] Unsigned types are added via one of the two methods
|
|
||||||
- [ ] All relevant dispatch sites in the file are updated
|
|
||||||
- [ ] Type groups use `AT_EXPAND()`
|
|
||||||
- [ ] Arguments are properly formatted and comma-separated
|
|
||||||
|
|
||||||
## Common patterns
|
|
||||||
|
|
||||||
### Pattern 1: AT_ALL_TYPES + extras
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
// Before
|
|
||||||
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
|
|
||||||
|
|
||||||
// After
|
|
||||||
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
|
|
||||||
```
|
|
||||||
|
|
||||||
### Pattern 2: Separate INTEGRAL + FLOATING
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
// Before
|
|
||||||
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}), AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES));
|
|
||||||
|
|
||||||
// After
|
|
||||||
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}), AT_EXPAND(AT_INTEGRAL_TYPES_V2), AT_EXPAND(AT_FLOATING_TYPES));
|
|
||||||
```
|
|
||||||
|
|
||||||
### Pattern 3: Old dispatch needs conversion first
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
// Before (needs v2 conversion first)
|
|
||||||
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "op", [&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
});
|
|
||||||
|
|
||||||
// After v2 conversion
|
|
||||||
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
|
|
||||||
|
|
||||||
// After adding uint support
|
|
||||||
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
|
|
||||||
```
|
|
||||||
|
|
||||||
## Multiple dispatch sites example
|
|
||||||
|
|
||||||
For a file with multiple functions:
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
void min_values_kernel_cuda(TensorIterator& iter) {
|
|
||||||
AT_DISPATCH_V2(iter.dtype(), "min_values_cuda", AT_WRAP([&]() {
|
|
||||||
impl<scalar_t>(iter);
|
|
||||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf);
|
|
||||||
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
// Added uint support
|
|
||||||
}
|
|
||||||
|
|
||||||
void min_launch_kernel(TensorIterator &iter) {
|
|
||||||
AT_DISPATCH_V2(iter.input_dtype(), "min_cuda", AT_WRAP([&]() {
|
|
||||||
gpu_reduce_kernel<scalar_t>(iter);
|
|
||||||
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf);
|
|
||||||
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
// Added uint support here too
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
## Decision tree
|
|
||||||
|
|
||||||
Use this decision tree to determine the approach:
|
|
||||||
|
|
||||||
```
|
|
||||||
Is the file using AT_DISPATCH_V2?
|
|
||||||
├─ No → Use at-dispatch-v2 skill first, then continue
|
|
||||||
└─ Yes
|
|
||||||
└─ Does it use AT_EXPAND(AT_INTEGRAL_TYPES)?
|
|
||||||
├─ Yes → Replace with AT_EXPAND(AT_INTEGRAL_TYPES_V2)
|
|
||||||
└─ No → Add AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES) to type list
|
|
||||||
```
|
|
||||||
|
|
||||||
## Edge cases
|
|
||||||
|
|
||||||
### Case 1: Dispatch with only floating types
|
|
||||||
|
|
||||||
If the operator only supports floating-point types, don't add uint support:
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
// Leave as-is - floating point only operator
|
|
||||||
AT_DISPATCH_V2(dtype, "float_op", AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}), AT_EXPAND(AT_FLOATING_TYPES), kHalf);
|
|
||||||
```
|
|
||||||
|
|
||||||
### Case 2: Complex types present
|
|
||||||
|
|
||||||
Unsigned types work alongside complex types:
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}), AT_EXPAND(AT_ALL_TYPES),
|
|
||||||
AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
|
|
||||||
AT_EXPAND(AT_COMPLEX_TYPES),
|
|
||||||
kHalf, kBFloat16);
|
|
||||||
```
|
|
||||||
|
|
||||||
### Case 3: Already has uint support
|
|
||||||
|
|
||||||
Check if uint types are already present:
|
|
||||||
- If `AT_INTEGRAL_TYPES_V2` is used → already has uint support
|
|
||||||
- If `AT_BAREBONES_UNSIGNED_TYPES` is already in list → already has uint support
|
|
||||||
- Skip the file if uint support is already present
|
|
||||||
|
|
||||||
## Workflow
|
|
||||||
|
|
||||||
When asked to add uint support:
|
|
||||||
|
|
||||||
1. Read the target file
|
|
||||||
2. Check if using AT_DISPATCH_V2:
|
|
||||||
- If not → use at-dispatch-v2 skill first
|
|
||||||
3. Identify all dispatch macro sites
|
|
||||||
4. For each dispatch:
|
|
||||||
- Analyze current type groups
|
|
||||||
- Choose method (add BAREBONES_UNSIGNED or upgrade to V2)
|
|
||||||
- Apply transformation with Edit tool
|
|
||||||
5. Show the user the changes
|
|
||||||
6. Explain what was modified
|
|
||||||
|
|
||||||
## Important notes
|
|
||||||
|
|
||||||
- Always check if v2 conversion is needed first
|
|
||||||
- Apply changes consistently across all dispatch sites in the file
|
|
||||||
- Method 2 (AT_INTEGRAL_TYPES_V2) is cleaner when applicable
|
|
||||||
- Method 1 (explicit AT_BAREBONES_UNSIGNED_TYPES) is more explicit
|
|
||||||
- Unsigned types are: kUInt16, kUInt32, kUInt64 (not kByte, which is uint8)
|
|
||||||
- Some operators may not semantically support unsigned types - use judgment
|
|
||||||
|
|
||||||
## Testing
|
|
||||||
|
|
||||||
After adding uint support, the operator should accept uint16, uint32, and uint64 tensors. The user is responsible for functional testing.
|
|
||||||
@ -1,305 +0,0 @@
|
|||||||
---
|
|
||||||
name: at-dispatch-v2
|
|
||||||
description: Convert PyTorch AT_DISPATCH macros to AT_DISPATCH_V2 format in ATen C++ code. Use when porting AT_DISPATCH_ALL_TYPES_AND*, AT_DISPATCH_FLOATING_TYPES*, or other dispatch macros to the new v2 API. For ATen kernel files, CUDA kernels, and native operator implementations.
|
|
||||||
---
|
|
||||||
|
|
||||||
# AT_DISPATCH to AT_DISPATCH_V2 Converter
|
|
||||||
|
|
||||||
This skill helps convert PyTorch's legacy AT_DISPATCH macros to the new AT_DISPATCH_V2 format, as defined in `aten/src/ATen/Dispatch_v2.h`.
|
|
||||||
|
|
||||||
## When to use this skill
|
|
||||||
|
|
||||||
Use this skill when:
|
|
||||||
- Converting AT_DISPATCH_* macros to AT_DISPATCH_V2
|
|
||||||
- Porting ATen kernels to use the new dispatch API
|
|
||||||
- Working with files in `aten/src/ATen/native/` that use dispatch macros
|
|
||||||
- User mentions "AT_DISPATCH", "dispatch v2", "Dispatch_v2.h", or macro conversion
|
|
||||||
|
|
||||||
## Quick reference
|
|
||||||
|
|
||||||
**Old format:**
|
|
||||||
```cpp
|
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, dtype, "kernel_name", [&]() {
|
|
||||||
// lambda body
|
|
||||||
});
|
|
||||||
```
|
|
||||||
|
|
||||||
**New format:**
|
|
||||||
```cpp
|
|
||||||
AT_DISPATCH_V2(dtype, "kernel_name", AT_WRAP([&]() {
|
|
||||||
// lambda body
|
|
||||||
}), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, kBool);
|
|
||||||
```
|
|
||||||
|
|
||||||
## Key transformations
|
|
||||||
|
|
||||||
1. **Reorder arguments**: `scalar_type` and `name` come first, then lambda, then types
|
|
||||||
2. **Wrap the lambda**: Use `AT_WRAP(lambda)` to handle internal commas
|
|
||||||
3. **Expand type groups**: Use `AT_EXPAND(AT_ALL_TYPES)` instead of implicit expansion
|
|
||||||
4. **List individual types**: Add extra types (kHalf, kBFloat16, etc.) after expanded groups
|
|
||||||
5. **Add include**: `#include <ATen/Dispatch_v2.h>` near other Dispatch includes
|
|
||||||
|
|
||||||
## Instructions
|
|
||||||
|
|
||||||
### Step 1: Add the Dispatch_v2.h include
|
|
||||||
|
|
||||||
Add the v2 header near the existing `#include <ATen/Dispatch.h>`:
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
#include <ATen/Dispatch.h>
|
|
||||||
#include <ATen/Dispatch_v2.h>
|
|
||||||
```
|
|
||||||
|
|
||||||
Keep the old Dispatch.h include for now (other code may still need it).
|
|
||||||
|
|
||||||
### Step 2: Identify the old dispatch pattern
|
|
||||||
|
|
||||||
Common patterns to convert:
|
|
||||||
|
|
||||||
- `AT_DISPATCH_ALL_TYPES_AND{2,3,4}(type1, type2, ..., scalar_type, name, lambda)`
|
|
||||||
- `AT_DISPATCH_FLOATING_TYPES_AND{2,3}(type1, type2, ..., scalar_type, name, lambda)`
|
|
||||||
- `AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND{2,3}(type1, ..., scalar_type, name, lambda)`
|
|
||||||
- `AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND{2,3}(type1, ..., scalar_type, name, lambda)`
|
|
||||||
|
|
||||||
### Step 3: Map the old macro to type groups
|
|
||||||
|
|
||||||
Identify which type group macro corresponds to the base types:
|
|
||||||
|
|
||||||
| Old macro base | AT_DISPATCH_V2 type group |
|
|
||||||
|----------------|---------------------------|
|
|
||||||
| `ALL_TYPES` | `AT_EXPAND(AT_ALL_TYPES)` |
|
|
||||||
| `FLOATING_TYPES` | `AT_EXPAND(AT_FLOATING_TYPES)` |
|
|
||||||
| `INTEGRAL_TYPES` | `AT_EXPAND(AT_INTEGRAL_TYPES)` |
|
|
||||||
| `COMPLEX_TYPES` | `AT_EXPAND(AT_COMPLEX_TYPES)` |
|
|
||||||
| `ALL_TYPES_AND_COMPLEX` | `AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX)` |
|
|
||||||
|
|
||||||
For combined patterns, use multiple `AT_EXPAND()` entries:
|
|
||||||
```cpp
|
|
||||||
// Old: AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(...)
|
|
||||||
// New: AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES), type1, type2
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 4: Extract the individual types
|
|
||||||
|
|
||||||
From `AT_DISPATCH_*_AND2(type1, type2, ...)` or `AT_DISPATCH_*_AND3(type1, type2, type3, ...)`, extract the individual types (type1, type2, etc.).
|
|
||||||
|
|
||||||
These become the trailing arguments after the type group:
|
|
||||||
```cpp
|
|
||||||
AT_DISPATCH_V2(..., AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, kBool)
|
|
||||||
^^^^^^^^^^^^^^^^^^^^^^^^
|
|
||||||
Individual types from AND3
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 5: Transform to AT_DISPATCH_V2
|
|
||||||
|
|
||||||
Apply the transformation:
|
|
||||||
|
|
||||||
**Pattern:**
|
|
||||||
```cpp
|
|
||||||
AT_DISPATCH_V2(
|
|
||||||
scalar_type, // 1st: The dtype expression
|
|
||||||
"name", // 2nd: The debug string
|
|
||||||
AT_WRAP(lambda), // 3rd: The lambda wrapped in AT_WRAP
|
|
||||||
type_groups, // 4th+: Type groups with AT_EXPAND()
|
|
||||||
individual_types // Last: Individual types
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Example transformation:**
|
|
||||||
```cpp
|
|
||||||
// BEFORE
|
|
||||||
AT_DISPATCH_ALL_TYPES_AND3(
|
|
||||||
kBFloat16, kHalf, kBool,
|
|
||||||
iter.dtype(),
|
|
||||||
"min_values_cuda",
|
|
||||||
[&]() {
|
|
||||||
min_values_kernel_cuda_impl<scalar_t>(iter);
|
|
||||||
}
|
|
||||||
);
|
|
||||||
|
|
||||||
// AFTER
|
|
||||||
AT_DISPATCH_V2(
|
|
||||||
iter.dtype(),
|
|
||||||
"min_values_cuda",
|
|
||||||
AT_WRAP([&]() {
|
|
||||||
min_values_kernel_cuda_impl<scalar_t>(iter);
|
|
||||||
}),
|
|
||||||
AT_EXPAND(AT_ALL_TYPES),
|
|
||||||
kBFloat16, kHalf, kBool
|
|
||||||
);
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 6: Handle multi-line lambdas
|
|
||||||
|
|
||||||
For lambdas with internal commas or complex expressions, AT_WRAP is essential:
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
AT_DISPATCH_V2(
|
|
||||||
dtype,
|
|
||||||
"complex_kernel",
|
|
||||||
AT_WRAP([&]() {
|
|
||||||
gpu_reduce_kernel<scalar_t, scalar_t>(
|
|
||||||
iter,
|
|
||||||
MinOps<scalar_t>{},
|
|
||||||
thrust::pair<scalar_t, int64_t>(upper_bound(), 0) // Commas inside!
|
|
||||||
);
|
|
||||||
}),
|
|
||||||
AT_EXPAND(AT_ALL_TYPES)
|
|
||||||
);
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 7: Verify the conversion
|
|
||||||
|
|
||||||
Check that:
|
|
||||||
- [ ] `AT_WRAP()` wraps the entire lambda
|
|
||||||
- [ ] Type groups use `AT_EXPAND()`
|
|
||||||
- [ ] Individual types don't have `AT_EXPAND()` (just `kBFloat16`, not `AT_EXPAND(kBFloat16)`)
|
|
||||||
- [ ] Argument order is: scalar_type, name, lambda, types
|
|
||||||
- [ ] Include added: `#include <ATen/Dispatch_v2.h>`
|
|
||||||
|
|
||||||
## Type group reference
|
|
||||||
|
|
||||||
Available type group macros (use with `AT_EXPAND()`):
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
AT_INTEGRAL_TYPES // kByte, kChar, kInt, kLong, kShort
|
|
||||||
AT_FLOATING_TYPES // kDouble, kFloat
|
|
||||||
AT_COMPLEX_TYPES // kComplexDouble, kComplexFloat
|
|
||||||
AT_QINT_TYPES // kQInt8, kQUInt8, kQInt32
|
|
||||||
AT_ALL_TYPES // INTEGRAL_TYPES + FLOATING_TYPES
|
|
||||||
AT_ALL_TYPES_AND_COMPLEX // ALL_TYPES + COMPLEX_TYPES
|
|
||||||
AT_INTEGRAL_TYPES_V2 // INTEGRAL_TYPES + unsigned types
|
|
||||||
AT_BAREBONES_UNSIGNED_TYPES // kUInt16, kUInt32, kUInt64
|
|
||||||
AT_FLOAT8_TYPES // Float8 variants
|
|
||||||
```
|
|
||||||
|
|
||||||
## Common patterns
|
|
||||||
|
|
||||||
### Pattern: AT_DISPATCH_ALL_TYPES_AND2
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
// Before
|
|
||||||
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "op", [&]() {
|
|
||||||
kernel<scalar_t>(data);
|
|
||||||
});
|
|
||||||
|
|
||||||
// After
|
|
||||||
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>(data);
|
|
||||||
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
|
|
||||||
```
|
|
||||||
|
|
||||||
### Pattern: AT_DISPATCH_FLOATING_TYPES_AND3
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
// Before
|
|
||||||
AT_DISPATCH_FLOATING_TYPES_AND3(kHalf, kBFloat16, kFloat8_e4m3fn,
|
|
||||||
tensor.scalar_type(), "float_op", [&] {
|
|
||||||
process<scalar_t>(tensor);
|
|
||||||
});
|
|
||||||
|
|
||||||
// After
|
|
||||||
AT_DISPATCH_V2(tensor.scalar_type(), "float_op", AT_WRAP([&] {
|
|
||||||
process<scalar_t>(tensor);
|
|
||||||
}), AT_EXPAND(AT_FLOATING_TYPES), kHalf, kBFloat16, kFloat8_e4m3fn);
|
|
||||||
```
|
|
||||||
|
|
||||||
### Pattern: AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
// Before
|
|
||||||
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(
|
|
||||||
kComplexHalf, kHalf,
|
|
||||||
self.scalar_type(),
|
|
||||||
"complex_op",
|
|
||||||
[&] {
|
|
||||||
result = compute<scalar_t>(self);
|
|
||||||
}
|
|
||||||
);
|
|
||||||
|
|
||||||
// After
|
|
||||||
AT_DISPATCH_V2(
|
|
||||||
self.scalar_type(),
|
|
||||||
"complex_op",
|
|
||||||
AT_WRAP([&] {
|
|
||||||
result = compute<scalar_t>(self);
|
|
||||||
}),
|
|
||||||
AT_EXPAND(AT_ALL_TYPES),
|
|
||||||
AT_EXPAND(AT_COMPLEX_TYPES),
|
|
||||||
kComplexHalf,
|
|
||||||
kHalf
|
|
||||||
);
|
|
||||||
```
|
|
||||||
|
|
||||||
## Edge cases
|
|
||||||
|
|
||||||
### Case 1: No extra types (rare)
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
// Before
|
|
||||||
AT_DISPATCH_ALL_TYPES(dtype, "op", [&]() { kernel<scalar_t>(); });
|
|
||||||
|
|
||||||
// After
|
|
||||||
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}), AT_EXPAND(AT_ALL_TYPES));
|
|
||||||
```
|
|
||||||
|
|
||||||
### Case 2: Many individual types (AND4, AND5, etc.)
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
// Before
|
|
||||||
AT_DISPATCH_FLOATING_TYPES_AND4(kHalf, kBFloat16, kFloat8_e4m3fn, kFloat8_e5m2,
|
|
||||||
dtype, "float8_op", [&]() { kernel<scalar_t>(); });
|
|
||||||
|
|
||||||
// After
|
|
||||||
AT_DISPATCH_V2(dtype, "float8_op", AT_WRAP([&]() {
|
|
||||||
kernel<scalar_t>();
|
|
||||||
}), AT_EXPAND(AT_FLOATING_TYPES), kHalf, kBFloat16, kFloat8_e4m3fn, kFloat8_e5m2);
|
|
||||||
```
|
|
||||||
|
|
||||||
### Case 3: Lambda with no captures
|
|
||||||
|
|
||||||
```cpp
|
|
||||||
// Before
|
|
||||||
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, dtype, "op", []() {
|
|
||||||
static_kernel<scalar_t>();
|
|
||||||
});
|
|
||||||
|
|
||||||
// After
|
|
||||||
AT_DISPATCH_V2(dtype, "op", AT_WRAP([]() {
|
|
||||||
static_kernel<scalar_t>();
|
|
||||||
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBool);
|
|
||||||
```
|
|
||||||
|
|
||||||
## Benefits of AT_DISPATCH_V2
|
|
||||||
|
|
||||||
1. **No arity in macro name**: Don't need different macros for AND2, AND3, AND4
|
|
||||||
2. **Composable type sets**: Mix and match type groups with `AT_EXPAND()`
|
|
||||||
3. **Extensible**: Easy to add more types without hitting macro limits
|
|
||||||
4. **Clearer**: Type groups are explicit, not implicit in macro name
|
|
||||||
|
|
||||||
## Important notes
|
|
||||||
|
|
||||||
- Keep `#include <ATen/Dispatch.h>` - other code may need it
|
|
||||||
- The `AT_WRAP()` is mandatory - prevents comma parsing issues in the lambda
|
|
||||||
- Type groups need `AT_EXPAND()`, individual types don't
|
|
||||||
- The v2 API is in `aten/src/ATen/Dispatch_v2.h` - refer to it for full docs
|
|
||||||
- See the header file for the Python script to regenerate the macro implementation
|
|
||||||
|
|
||||||
## Workflow
|
|
||||||
|
|
||||||
When asked to convert AT_DISPATCH macros:
|
|
||||||
|
|
||||||
1. Read the file to identify all AT_DISPATCH uses
|
|
||||||
2. Add `#include <ATen/Dispatch_v2.h>` if not present
|
|
||||||
3. For each dispatch macro:
|
|
||||||
- Identify the pattern and extract components
|
|
||||||
- Map the base type group
|
|
||||||
- Extract individual types
|
|
||||||
- Construct the AT_DISPATCH_V2 call
|
|
||||||
- Apply with Edit tool
|
|
||||||
4. Show the user the complete converted file
|
|
||||||
5. Explain what was changed
|
|
||||||
|
|
||||||
Do NOT compile or test the code - focus on accurate conversion only.
|
|
||||||
@ -1,359 +0,0 @@
|
|||||||
---
|
|
||||||
name: docstring
|
|
||||||
description: Write docstrings for PyTorch functions and methods following PyTorch conventions. Use when writing or updating docstrings in PyTorch code.
|
|
||||||
---
|
|
||||||
|
|
||||||
# PyTorch Docstring Writing Guide
|
|
||||||
|
|
||||||
This skill describes how to write docstrings for functions and methods in the PyTorch project, following the conventions in `torch/_tensor_docs.py` and `torch/nn/functional.py`.
|
|
||||||
|
|
||||||
## General Principles
|
|
||||||
|
|
||||||
- Use **raw strings** (`r"""..."""`) for all docstrings to avoid issues with LaTeX/math backslashes
|
|
||||||
- Follow **Sphinx/reStructuredText** (reST) format for documentation
|
|
||||||
- Be **concise but complete** - include all essential information
|
|
||||||
- Always include **examples** when possible
|
|
||||||
- Use **cross-references** to related functions/classes
|
|
||||||
|
|
||||||
## Docstring Structure
|
|
||||||
|
|
||||||
### 1. Function Signature (First Line)
|
|
||||||
|
|
||||||
Start with the function signature showing all parameters:
|
|
||||||
|
|
||||||
```python
|
|
||||||
r"""function_name(param1, param2, *, kwarg1=default1, kwarg2=default2) -> ReturnType
|
|
||||||
```
|
|
||||||
|
|
||||||
**Notes:**
|
|
||||||
- Include the function name
|
|
||||||
- Show positional and keyword-only arguments (use `*` separator)
|
|
||||||
- Include default values
|
|
||||||
- Show return type annotation
|
|
||||||
- This line should NOT end with a period
|
|
||||||
|
|
||||||
### 2. Brief Description
|
|
||||||
|
|
||||||
Provide a one-line description of what the function does:
|
|
||||||
|
|
||||||
```python
|
|
||||||
r"""conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) -> Tensor
|
|
||||||
|
|
||||||
Applies a 2D convolution over an input image composed of several input
|
|
||||||
planes.
|
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Mathematical Formulas (if applicable)
|
|
||||||
|
|
||||||
Use Sphinx math directives for mathematical expressions:
|
|
||||||
|
|
||||||
```python
|
|
||||||
.. math::
|
|
||||||
\text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}
|
|
||||||
```
|
|
||||||
|
|
||||||
Or inline math: `:math:\`x^2\``
|
|
||||||
|
|
||||||
### 4. Cross-References
|
|
||||||
|
|
||||||
Link to related classes and functions using Sphinx roles:
|
|
||||||
|
|
||||||
- `:class:\`~torch.nn.ModuleName\`` - Link to a class
|
|
||||||
- `:func:\`torch.function_name\`` - Link to a function
|
|
||||||
- `:meth:\`~Tensor.method_name\`` - Link to a method
|
|
||||||
- `:attr:\`attribute_name\`` - Reference an attribute
|
|
||||||
- The `~` prefix shows only the last component (e.g., `Conv2d` instead of `torch.nn.Conv2d`)
|
|
||||||
|
|
||||||
**Example:**
|
|
||||||
```python
|
|
||||||
See :class:`~torch.nn.Conv2d` for details and output shape.
|
|
||||||
```
|
|
||||||
|
|
||||||
### 5. Notes and Warnings
|
|
||||||
|
|
||||||
Use admonitions for important information:
|
|
||||||
|
|
||||||
```python
|
|
||||||
.. note::
|
|
||||||
This function doesn't work directly with NLLLoss,
|
|
||||||
which expects the Log to be computed between the Softmax and itself.
|
|
||||||
Use log_softmax instead (it's faster and has better numerical properties).
|
|
||||||
|
|
||||||
.. warning::
|
|
||||||
:func:`new_tensor` always copies :attr:`data`. If you have a Tensor
|
|
||||||
``data`` and want to avoid a copy, use :func:`torch.Tensor.requires_grad_`
|
|
||||||
or :func:`torch.Tensor.detach`.
|
|
||||||
```
|
|
||||||
|
|
||||||
### 6. Args Section
|
|
||||||
|
|
||||||
Document all parameters with type annotations and descriptions:
|
|
||||||
|
|
||||||
```python
|
|
||||||
Args:
|
|
||||||
input (Tensor): input tensor of shape :math:`(\text{minibatch} , \text{in\_channels} , iH , iW)`
|
|
||||||
weight (Tensor): filters of shape :math:`(\text{out\_channels} , kH , kW)`
|
|
||||||
bias (Tensor, optional): optional bias tensor of shape :math:`(\text{out\_channels})`. Default: ``None``
|
|
||||||
stride (int or tuple): the stride of the convolving kernel. Can be a single number or a
|
|
||||||
tuple `(sH, sW)`. Default: 1
|
|
||||||
```
|
|
||||||
|
|
||||||
**Formatting rules:**
|
|
||||||
- Parameter name in **lowercase**
|
|
||||||
- Type in parentheses: `(Type)`, `(Type, optional)` for optional parameters
|
|
||||||
- Description follows the type
|
|
||||||
- For optional parameters, include "Default: ``value``" at the end
|
|
||||||
- Use double backticks for inline code: ``` ``None`` ```
|
|
||||||
- Indent continuation lines by 2 spaces
|
|
||||||
|
|
||||||
### 7. Keyword Args Section (if applicable)
|
|
||||||
|
|
||||||
Sometimes keyword arguments are documented separately:
|
|
||||||
|
|
||||||
```python
|
|
||||||
Keyword args:
|
|
||||||
dtype (:class:`torch.dtype`, optional): the desired type of returned tensor.
|
|
||||||
Default: if None, same :class:`torch.dtype` as this tensor.
|
|
||||||
device (:class:`torch.device`, optional): the desired device of returned tensor.
|
|
||||||
Default: if None, same :class:`torch.device` as this tensor.
|
|
||||||
requires_grad (bool, optional): If autograd should record operations on the
|
|
||||||
returned tensor. Default: ``False``.
|
|
||||||
```
|
|
||||||
|
|
||||||
### 8. Returns Section (if needed)
|
|
||||||
|
|
||||||
Document the return value:
|
|
||||||
|
|
||||||
```python
|
|
||||||
Returns:
|
|
||||||
Tensor: Sampled tensor of same shape as `logits` from the Gumbel-Softmax distribution.
|
|
||||||
If ``hard=True``, the returned samples will be one-hot, otherwise they will
|
|
||||||
be probability distributions that sum to 1 across `dim`.
|
|
||||||
```
|
|
||||||
|
|
||||||
Or simply include it in the function signature line if obvious from context.
|
|
||||||
|
|
||||||
### 9. Examples Section
|
|
||||||
|
|
||||||
Always include examples when possible:
|
|
||||||
|
|
||||||
```python
|
|
||||||
Examples::
|
|
||||||
|
|
||||||
>>> inputs = torch.randn(33, 16, 30)
|
|
||||||
>>> filters = torch.randn(20, 16, 5)
|
|
||||||
>>> F.conv1d(inputs, filters)
|
|
||||||
|
|
||||||
>>> # With square kernels and equal stride
|
|
||||||
>>> filters = torch.randn(8, 4, 3, 3)
|
|
||||||
>>> inputs = torch.randn(1, 4, 5, 5)
|
|
||||||
>>> F.conv2d(inputs, filters, padding=1)
|
|
||||||
```
|
|
||||||
|
|
||||||
**Formatting rules:**
|
|
||||||
- Use `Examples::` with double colon
|
|
||||||
- Use `>>>` prompt for Python code
|
|
||||||
- Include comments with `#` when helpful
|
|
||||||
- Show actual output when it helps understanding (indent without `>>>`)
|
|
||||||
|
|
||||||
### 10. External References
|
|
||||||
|
|
||||||
Link to papers or external documentation:
|
|
||||||
|
|
||||||
```python
|
|
||||||
.. _Link Name:
|
|
||||||
https://arxiv.org/abs/1611.00712
|
|
||||||
```
|
|
||||||
|
|
||||||
Reference them in text: ```See `Link Name`_```
|
|
||||||
|
|
||||||
## Method Types
|
|
||||||
|
|
||||||
### Native Python Functions
|
|
||||||
|
|
||||||
For regular Python functions, use a standard docstring:
|
|
||||||
|
|
||||||
```python
|
|
||||||
def relu(input: Tensor, inplace: bool = False) -> Tensor:
|
|
||||||
r"""relu(input, inplace=False) -> Tensor
|
|
||||||
|
|
||||||
Applies the rectified linear unit function element-wise. See
|
|
||||||
:class:`~torch.nn.ReLU` for more details.
|
|
||||||
"""
|
|
||||||
# implementation
|
|
||||||
```
|
|
||||||
|
|
||||||
### C-Bound Functions (using add_docstr)
|
|
||||||
|
|
||||||
For C-bound functions, use `_add_docstr`:
|
|
||||||
|
|
||||||
```python
|
|
||||||
conv1d = _add_docstr(
|
|
||||||
torch.conv1d,
|
|
||||||
r"""
|
|
||||||
conv1d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) -> Tensor
|
|
||||||
|
|
||||||
Applies a 1D convolution over an input signal composed of several input
|
|
||||||
planes.
|
|
||||||
|
|
||||||
See :class:`~torch.nn.Conv1d` for details and output shape.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
input: input tensor of shape :math:`(\text{minibatch} , \text{in\_channels} , iW)`
|
|
||||||
weight: filters of shape :math:`(\text{out\_channels} , kW)`
|
|
||||||
...
|
|
||||||
""",
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### In-Place Variants
|
|
||||||
|
|
||||||
For in-place operations (ending with `_`), reference the original:
|
|
||||||
|
|
||||||
```python
|
|
||||||
add_docstr_all(
|
|
||||||
"abs_",
|
|
||||||
r"""
|
|
||||||
abs_() -> Tensor
|
|
||||||
|
|
||||||
In-place version of :meth:`~Tensor.abs`
|
|
||||||
""",
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Alias Functions
|
|
||||||
|
|
||||||
For aliases, simply reference the original:
|
|
||||||
|
|
||||||
```python
|
|
||||||
add_docstr_all(
|
|
||||||
"absolute",
|
|
||||||
r"""
|
|
||||||
absolute() -> Tensor
|
|
||||||
|
|
||||||
Alias for :func:`abs`
|
|
||||||
""",
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Common Patterns
|
|
||||||
|
|
||||||
### Shape Documentation
|
|
||||||
|
|
||||||
Use LaTeX math notation for tensor shapes:
|
|
||||||
|
|
||||||
```python
|
|
||||||
:math:`(\text{minibatch} , \text{in\_channels} , iH , iW)`
|
|
||||||
```
|
|
||||||
|
|
||||||
### Reusable Argument Definitions
|
|
||||||
|
|
||||||
For commonly used arguments, define them once and reuse:
|
|
||||||
|
|
||||||
```python
|
|
||||||
common_args = parse_kwargs(
|
|
||||||
"""
|
|
||||||
dtype (:class:`torch.dtype`, optional): the desired type of returned tensor.
|
|
||||||
Default: if None, same as this tensor.
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
|
|
||||||
# Then use with .format():
|
|
||||||
r"""
|
|
||||||
...
|
|
||||||
|
|
||||||
Keyword args:
|
|
||||||
{dtype}
|
|
||||||
{device}
|
|
||||||
""".format(**common_args)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Template Insertion
|
|
||||||
|
|
||||||
Insert reproducibility notes or other common text:
|
|
||||||
|
|
||||||
```python
|
|
||||||
r"""
|
|
||||||
{tf32_note}
|
|
||||||
|
|
||||||
{cudnn_reproducibility_note}
|
|
||||||
""".format(**reproducibility_notes, **tf32_notes)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Complete Example
|
|
||||||
|
|
||||||
Here's a complete example showing all elements:
|
|
||||||
|
|
||||||
```python
|
|
||||||
def gumbel_softmax(
|
|
||||||
logits: Tensor,
|
|
||||||
tau: float = 1,
|
|
||||||
hard: bool = False,
|
|
||||||
eps: float = 1e-10,
|
|
||||||
dim: int = -1,
|
|
||||||
) -> Tensor:
|
|
||||||
r"""
|
|
||||||
Sample from the Gumbel-Softmax distribution and optionally discretize.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
logits (Tensor): `[..., num_features]` unnormalized log probabilities
|
|
||||||
tau (float): non-negative scalar temperature
|
|
||||||
hard (bool): if ``True``, the returned samples will be discretized as one-hot vectors,
|
|
||||||
but will be differentiated as if it is the soft sample in autograd. Default: ``False``
|
|
||||||
dim (int): A dimension along which softmax will be computed. Default: -1
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tensor: Sampled tensor of same shape as `logits` from the Gumbel-Softmax distribution.
|
|
||||||
If ``hard=True``, the returned samples will be one-hot, otherwise they will
|
|
||||||
be probability distributions that sum to 1 across `dim`.
|
|
||||||
|
|
||||||
.. note::
|
|
||||||
This function is here for legacy reasons, may be removed from nn.Functional in the future.
|
|
||||||
|
|
||||||
Examples::
|
|
||||||
>>> logits = torch.randn(20, 32)
|
|
||||||
>>> # Sample soft categorical using reparametrization trick:
|
|
||||||
>>> F.gumbel_softmax(logits, tau=1, hard=False)
|
|
||||||
>>> # Sample hard categorical using "Straight-through" trick:
|
|
||||||
>>> F.gumbel_softmax(logits, tau=1, hard=True)
|
|
||||||
|
|
||||||
.. _Link 1:
|
|
||||||
https://arxiv.org/abs/1611.00712
|
|
||||||
"""
|
|
||||||
# implementation
|
|
||||||
```
|
|
||||||
|
|
||||||
## Quick Checklist
|
|
||||||
|
|
||||||
When writing a PyTorch docstring, ensure:
|
|
||||||
|
|
||||||
- [ ] Use raw string (`r"""`)
|
|
||||||
- [ ] Include function signature on first line
|
|
||||||
- [ ] Provide brief description
|
|
||||||
- [ ] Document all parameters in Args section with types
|
|
||||||
- [ ] Include default values for optional parameters
|
|
||||||
- [ ] Use Sphinx cross-references (`:func:`, `:class:`, `:meth:`)
|
|
||||||
- [ ] Add mathematical formulas if applicable
|
|
||||||
- [ ] Include at least one example in Examples section
|
|
||||||
- [ ] Add warnings/notes for important caveats
|
|
||||||
- [ ] Link to related module class with `:class:`
|
|
||||||
- [ ] Use proper math notation for tensor shapes
|
|
||||||
- [ ] Follow consistent formatting and indentation
|
|
||||||
|
|
||||||
## Common Sphinx Roles Reference
|
|
||||||
|
|
||||||
- `:class:\`~torch.nn.Module\`` - Class reference
|
|
||||||
- `:func:\`torch.function\`` - Function reference
|
|
||||||
- `:meth:\`~Tensor.method\`` - Method reference
|
|
||||||
- `:attr:\`attribute\`` - Attribute reference
|
|
||||||
- `:math:\`equation\`` - Inline math
|
|
||||||
- `:ref:\`label\`` - Internal reference
|
|
||||||
- ``` ``code`` ``` - Inline code (use double backticks)
|
|
||||||
|
|
||||||
## Additional Notes
|
|
||||||
|
|
||||||
- **Indentation**: Use 4 spaces for code, 2 spaces for continuation of parameter descriptions
|
|
||||||
- **Line length**: Try to keep lines under 100 characters when possible
|
|
||||||
- **Periods**: End sentences with periods, but not the signature line
|
|
||||||
- **Backticks**: Use double backticks for code: ``` ``True`` ``None`` ``False`` ```
|
|
||||||
- **Types**: Common types are `Tensor`, `int`, `float`, `bool`, `str`, `tuple`, `list`, etc.
|
|
||||||
@ -1,385 +0,0 @@
|
|||||||
---
|
|
||||||
name: skill-writer
|
|
||||||
description: Guide users through creating Agent Skills for Claude Code. Use when the user wants to create, write, author, or design a new Skill, or needs help with SKILL.md files, frontmatter, or skill structure.
|
|
||||||
---
|
|
||||||
|
|
||||||
# Skill Writer
|
|
||||||
|
|
||||||
This Skill helps you create well-structured Agent Skills for Claude Code that follow best practices and validation requirements.
|
|
||||||
|
|
||||||
## When to use this Skill
|
|
||||||
|
|
||||||
Use this Skill when:
|
|
||||||
- Creating a new Agent Skill
|
|
||||||
- Writing or updating SKILL.md files
|
|
||||||
- Designing skill structure and frontmatter
|
|
||||||
- Troubleshooting skill discovery issues
|
|
||||||
- Converting existing prompts or workflows into Skills
|
|
||||||
|
|
||||||
## Instructions
|
|
||||||
|
|
||||||
### Step 1: Determine Skill scope
|
|
||||||
|
|
||||||
First, understand what the Skill should do:
|
|
||||||
|
|
||||||
1. **Ask clarifying questions**:
|
|
||||||
- What specific capability should this Skill provide?
|
|
||||||
- When should Claude use this Skill?
|
|
||||||
- What tools or resources does it need?
|
|
||||||
- Is this for personal use or team sharing?
|
|
||||||
|
|
||||||
2. **Keep it focused**: One Skill = one capability
|
|
||||||
- Good: "PDF form filling", "Excel data analysis"
|
|
||||||
- Too broad: "Document processing", "Data tools"
|
|
||||||
|
|
||||||
### Step 2: Choose Skill location
|
|
||||||
|
|
||||||
Determine where to create the Skill:
|
|
||||||
|
|
||||||
**Personal Skills** (`~/.claude/skills/`):
|
|
||||||
- Individual workflows and preferences
|
|
||||||
- Experimental Skills
|
|
||||||
- Personal productivity tools
|
|
||||||
|
|
||||||
**Project Skills** (`.claude/skills/`):
|
|
||||||
- Team workflows and conventions
|
|
||||||
- Project-specific expertise
|
|
||||||
- Shared utilities (committed to git)
|
|
||||||
|
|
||||||
### Step 3: Create Skill structure
|
|
||||||
|
|
||||||
Create the directory and files:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Personal
|
|
||||||
mkdir -p ~/.claude/skills/skill-name
|
|
||||||
|
|
||||||
# Project
|
|
||||||
mkdir -p .claude/skills/skill-name
|
|
||||||
```
|
|
||||||
|
|
||||||
For multi-file Skills:
|
|
||||||
```
|
|
||||||
skill-name/
|
|
||||||
├── SKILL.md (required)
|
|
||||||
├── reference.md (optional)
|
|
||||||
├── examples.md (optional)
|
|
||||||
├── scripts/
|
|
||||||
│ └── helper.py (optional)
|
|
||||||
└── templates/
|
|
||||||
└── template.txt (optional)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 4: Write SKILL.md frontmatter
|
|
||||||
|
|
||||||
Create YAML frontmatter with required fields:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
---
|
|
||||||
name: skill-name
|
|
||||||
description: Brief description of what this does and when to use it
|
|
||||||
---
|
|
||||||
```
|
|
||||||
|
|
||||||
**Field requirements**:
|
|
||||||
|
|
||||||
- **name**:
|
|
||||||
- Lowercase letters, numbers, hyphens only
|
|
||||||
- Max 64 characters
|
|
||||||
- Must match directory name
|
|
||||||
- Good: `pdf-processor`, `git-commit-helper`
|
|
||||||
- Bad: `PDF_Processor`, `Git Commits!`
|
|
||||||
|
|
||||||
- **description**:
|
|
||||||
- Max 1024 characters
|
|
||||||
- Include BOTH what it does AND when to use it
|
|
||||||
- Use specific trigger words users would say
|
|
||||||
- Mention file types, operations, and context
|
|
||||||
|
|
||||||
**Optional frontmatter fields**:
|
|
||||||
|
|
||||||
- **allowed-tools**: Restrict tool access (comma-separated list)
|
|
||||||
```yaml
|
|
||||||
allowed-tools: Read, Grep, Glob
|
|
||||||
```
|
|
||||||
Use for:
|
|
||||||
- Read-only Skills
|
|
||||||
- Security-sensitive workflows
|
|
||||||
- Limited-scope operations
|
|
||||||
|
|
||||||
### Step 5: Write effective descriptions
|
|
||||||
|
|
||||||
The description is critical for Claude to discover your Skill.
|
|
||||||
|
|
||||||
**Formula**: `[What it does] + [When to use it] + [Key triggers]`
|
|
||||||
|
|
||||||
**Examples**:
|
|
||||||
|
|
||||||
✅ **Good**:
|
|
||||||
```yaml
|
|
||||||
description: Extract text and tables from PDF files, fill forms, merge documents. Use when working with PDF files or when the user mentions PDFs, forms, or document extraction.
|
|
||||||
```
|
|
||||||
|
|
||||||
✅ **Good**:
|
|
||||||
```yaml
|
|
||||||
description: Analyze Excel spreadsheets, create pivot tables, and generate charts. Use when working with Excel files, spreadsheets, or analyzing tabular data in .xlsx format.
|
|
||||||
```
|
|
||||||
|
|
||||||
❌ **Too vague**:
|
|
||||||
```yaml
|
|
||||||
description: Helps with documents
|
|
||||||
description: For data analysis
|
|
||||||
```
|
|
||||||
|
|
||||||
**Tips**:
|
|
||||||
- Include specific file extensions (.pdf, .xlsx, .json)
|
|
||||||
- Mention common user phrases ("analyze", "extract", "generate")
|
|
||||||
- List concrete operations (not generic verbs)
|
|
||||||
- Add context clues ("Use when...", "For...")
|
|
||||||
|
|
||||||
### Step 6: Structure the Skill content
|
|
||||||
|
|
||||||
Use clear Markdown sections:
|
|
||||||
|
|
||||||
```markdown
|
|
||||||
# Skill Name
|
|
||||||
|
|
||||||
Brief overview of what this Skill does.
|
|
||||||
|
|
||||||
## Quick start
|
|
||||||
|
|
||||||
Provide a simple example to get started immediately.
|
|
||||||
|
|
||||||
## Instructions
|
|
||||||
|
|
||||||
Step-by-step guidance for Claude:
|
|
||||||
1. First step with clear action
|
|
||||||
2. Second step with expected outcome
|
|
||||||
3. Handle edge cases
|
|
||||||
|
|
||||||
## Examples
|
|
||||||
|
|
||||||
Show concrete usage examples with code or commands.
|
|
||||||
|
|
||||||
## Best practices
|
|
||||||
|
|
||||||
- Key conventions to follow
|
|
||||||
- Common pitfalls to avoid
|
|
||||||
- When to use vs. not use
|
|
||||||
|
|
||||||
## Requirements
|
|
||||||
|
|
||||||
List any dependencies or prerequisites:
|
|
||||||
```bash
|
|
||||||
pip install package-name
|
|
||||||
```
|
|
||||||
|
|
||||||
## Advanced usage
|
|
||||||
|
|
||||||
For complex scenarios, see [reference.md](reference.md).
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 7: Add supporting files (optional)
|
|
||||||
|
|
||||||
Create additional files for progressive disclosure:
|
|
||||||
|
|
||||||
**reference.md**: Detailed API docs, advanced options
|
|
||||||
**examples.md**: Extended examples and use cases
|
|
||||||
**scripts/**: Helper scripts and utilities
|
|
||||||
**templates/**: File templates or boilerplate
|
|
||||||
|
|
||||||
Reference them from SKILL.md:
|
|
||||||
```markdown
|
|
||||||
For advanced usage, see [reference.md](reference.md).
|
|
||||||
|
|
||||||
Run the helper script:
|
|
||||||
\`\`\`bash
|
|
||||||
python scripts/helper.py input.txt
|
|
||||||
\`\`\`
|
|
||||||
```
|
|
||||||
|
|
||||||
### Step 8: Validate the Skill
|
|
||||||
|
|
||||||
Check these requirements:
|
|
||||||
|
|
||||||
✅ **File structure**:
|
|
||||||
- [ ] SKILL.md exists in correct location
|
|
||||||
- [ ] Directory name matches frontmatter `name`
|
|
||||||
|
|
||||||
✅ **YAML frontmatter**:
|
|
||||||
- [ ] Opening `---` on line 1
|
|
||||||
- [ ] Closing `---` before content
|
|
||||||
- [ ] Valid YAML (no tabs, correct indentation)
|
|
||||||
- [ ] `name` follows naming rules
|
|
||||||
- [ ] `description` is specific and < 1024 chars
|
|
||||||
|
|
||||||
✅ **Content quality**:
|
|
||||||
- [ ] Clear instructions for Claude
|
|
||||||
- [ ] Concrete examples provided
|
|
||||||
- [ ] Edge cases handled
|
|
||||||
- [ ] Dependencies listed (if any)
|
|
||||||
|
|
||||||
✅ **Testing**:
|
|
||||||
- [ ] Description matches user questions
|
|
||||||
- [ ] Skill activates on relevant queries
|
|
||||||
- [ ] Instructions are clear and actionable
|
|
||||||
|
|
||||||
### Step 9: Test the Skill
|
|
||||||
|
|
||||||
1. **Restart Claude Code** (if running) to load the Skill
|
|
||||||
|
|
||||||
2. **Ask relevant questions** that match the description:
|
|
||||||
```
|
|
||||||
Can you help me extract text from this PDF?
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **Verify activation**: Claude should use the Skill automatically
|
|
||||||
|
|
||||||
4. **Check behavior**: Confirm Claude follows the instructions correctly
|
|
||||||
|
|
||||||
### Step 10: Debug if needed
|
|
||||||
|
|
||||||
If Claude doesn't use the Skill:
|
|
||||||
|
|
||||||
1. **Make description more specific**:
|
|
||||||
- Add trigger words
|
|
||||||
- Include file types
|
|
||||||
- Mention common user phrases
|
|
||||||
|
|
||||||
2. **Check file location**:
|
|
||||||
```bash
|
|
||||||
ls ~/.claude/skills/skill-name/SKILL.md
|
|
||||||
ls .claude/skills/skill-name/SKILL.md
|
|
||||||
```
|
|
||||||
|
|
||||||
3. **Validate YAML**:
|
|
||||||
```bash
|
|
||||||
cat SKILL.md | head -n 10
|
|
||||||
```
|
|
||||||
|
|
||||||
4. **Run debug mode**:
|
|
||||||
```bash
|
|
||||||
claude --debug
|
|
||||||
```
|
|
||||||
|
|
||||||
## Common patterns
|
|
||||||
|
|
||||||
### Read-only Skill
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
---
|
|
||||||
name: code-reader
|
|
||||||
description: Read and analyze code without making changes. Use for code review, understanding codebases, or documentation.
|
|
||||||
allowed-tools: Read, Grep, Glob
|
|
||||||
---
|
|
||||||
```
|
|
||||||
|
|
||||||
### Script-based Skill
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
---
|
|
||||||
name: data-processor
|
|
||||||
description: Process CSV and JSON data files with Python scripts. Use when analyzing data files or transforming datasets.
|
|
||||||
---
|
|
||||||
|
|
||||||
# Data Processor
|
|
||||||
|
|
||||||
## Instructions
|
|
||||||
|
|
||||||
1. Use the processing script:
|
|
||||||
\`\`\`bash
|
|
||||||
python scripts/process.py input.csv --output results.json
|
|
||||||
\`\`\`
|
|
||||||
|
|
||||||
2. Validate output with:
|
|
||||||
\`\`\`bash
|
|
||||||
python scripts/validate.py results.json
|
|
||||||
\`\`\`
|
|
||||||
```
|
|
||||||
|
|
||||||
### Multi-file Skill with progressive disclosure
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
---
|
|
||||||
name: api-designer
|
|
||||||
description: Design REST APIs following best practices. Use when creating API endpoints, designing routes, or planning API architecture.
|
|
||||||
---
|
|
||||||
|
|
||||||
# API Designer
|
|
||||||
|
|
||||||
Quick start: See [examples.md](examples.md)
|
|
||||||
|
|
||||||
Detailed reference: See [reference.md](reference.md)
|
|
||||||
|
|
||||||
## Instructions
|
|
||||||
|
|
||||||
1. Gather requirements
|
|
||||||
2. Design endpoints (see examples.md)
|
|
||||||
3. Document with OpenAPI spec
|
|
||||||
4. Review against best practices (see reference.md)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Best practices for Skill authors
|
|
||||||
|
|
||||||
1. **One Skill, one purpose**: Don't create mega-Skills
|
|
||||||
2. **Specific descriptions**: Include trigger words users will say
|
|
||||||
3. **Clear instructions**: Write for Claude, not humans
|
|
||||||
4. **Concrete examples**: Show real code, not pseudocode
|
|
||||||
5. **List dependencies**: Mention required packages in description
|
|
||||||
6. **Test with teammates**: Verify activation and clarity
|
|
||||||
7. **Version your Skills**: Document changes in content
|
|
||||||
8. **Use progressive disclosure**: Put advanced details in separate files
|
|
||||||
|
|
||||||
## Validation checklist
|
|
||||||
|
|
||||||
Before finalizing a Skill, verify:
|
|
||||||
|
|
||||||
- [ ] Name is lowercase, hyphens only, max 64 chars
|
|
||||||
- [ ] Description is specific and < 1024 chars
|
|
||||||
- [ ] Description includes "what" and "when"
|
|
||||||
- [ ] YAML frontmatter is valid
|
|
||||||
- [ ] Instructions are step-by-step
|
|
||||||
- [ ] Examples are concrete and realistic
|
|
||||||
- [ ] Dependencies are documented
|
|
||||||
- [ ] File paths use forward slashes
|
|
||||||
- [ ] Skill activates on relevant queries
|
|
||||||
- [ ] Claude follows instructions correctly
|
|
||||||
|
|
||||||
## Troubleshooting
|
|
||||||
|
|
||||||
**Skill doesn't activate**:
|
|
||||||
- Make description more specific with trigger words
|
|
||||||
- Include file types and operations in description
|
|
||||||
- Add "Use when..." clause with user phrases
|
|
||||||
|
|
||||||
**Multiple Skills conflict**:
|
|
||||||
- Make descriptions more distinct
|
|
||||||
- Use different trigger words
|
|
||||||
- Narrow the scope of each Skill
|
|
||||||
|
|
||||||
**Skill has errors**:
|
|
||||||
- Check YAML syntax (no tabs, proper indentation)
|
|
||||||
- Verify file paths (use forward slashes)
|
|
||||||
- Ensure scripts have execute permissions
|
|
||||||
- List all dependencies
|
|
||||||
|
|
||||||
## Examples
|
|
||||||
|
|
||||||
See the documentation for complete examples:
|
|
||||||
- Simple single-file Skill (commit-helper)
|
|
||||||
- Skill with tool permissions (code-reviewer)
|
|
||||||
- Multi-file Skill (pdf-processing)
|
|
||||||
|
|
||||||
## Output format
|
|
||||||
|
|
||||||
When creating a Skill, I will:
|
|
||||||
|
|
||||||
1. Ask clarifying questions about scope and requirements
|
|
||||||
2. Suggest a Skill name and location
|
|
||||||
3. Create the SKILL.md file with proper frontmatter
|
|
||||||
4. Include clear instructions and examples
|
|
||||||
5. Add supporting files if needed
|
|
||||||
6. Provide testing instructions
|
|
||||||
7. Validate against all requirements
|
|
||||||
|
|
||||||
The result will be a complete, working Skill that follows all best practices and validation rules.
|
|
||||||
8
.flake8
8
.flake8
@ -7,12 +7,16 @@ max-line-length = 120
|
|||||||
# C408 ignored because we like the dict keyword argument syntax
|
# C408 ignored because we like the dict keyword argument syntax
|
||||||
# E501 is not flexible enough, we're using B950 instead
|
# E501 is not flexible enough, we're using B950 instead
|
||||||
ignore =
|
ignore =
|
||||||
E203,E305,E402,E501,E704,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
|
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
|
||||||
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
|
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
|
||||||
# to line this up with executable bit
|
# to line this up with executable bit
|
||||||
EXE001,
|
EXE001,
|
||||||
# these ignores are from flake8-bugbear; please fix!
|
# these ignores are from flake8-bugbear; please fix!
|
||||||
B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910
|
B007,B008,B017,B019,B023,B028,B903,B904,B905,B906,B907,B908,B910
|
||||||
|
# these ignores are from flake8-comprehensions; please fix!
|
||||||
|
C407,
|
||||||
|
# these ignores are from flake8-logging-format; please fix!
|
||||||
|
G100,G101,G200
|
||||||
# these ignores are from flake8-simplify. please fix or ignore with commented reason
|
# these ignores are from flake8-simplify. please fix or ignore with commented reason
|
||||||
SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
|
SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
|
||||||
# SIM104 is already covered by pyupgrade ruff
|
# SIM104 is already covered by pyupgrade ruff
|
||||||
|
|||||||
1
.github/ISSUE_TEMPLATE/ci-sev.md
vendored
1
.github/ISSUE_TEMPLATE/ci-sev.md
vendored
@ -8,7 +8,6 @@ assignees: ''
|
|||||||
---
|
---
|
||||||
|
|
||||||
> NOTE: Remember to label this issue with "`ci: sev`"
|
> NOTE: Remember to label this issue with "`ci: sev`"
|
||||||
> If you want autorevert to be disabled, keep the ci: disable-autorevert label
|
|
||||||
|
|
||||||
<!-- Add the `merge blocking` label to this PR to prevent PRs from being merged while this issue is open -->
|
<!-- Add the `merge blocking` label to this PR to prevent PRs from being merged while this issue is open -->
|
||||||
|
|
||||||
|
|||||||
4
.github/ISSUE_TEMPLATE/disable-autorevert.md
vendored
4
.github/ISSUE_TEMPLATE/disable-autorevert.md
vendored
@ -1,7 +1,7 @@
|
|||||||
---
|
---
|
||||||
name: "D❌\U0001F519 ISABLE AUTOREVERT"
|
name: DISABLE AUTOREVERT
|
||||||
about: Disables autorevert when open
|
about: Disables autorevert when open
|
||||||
title: "[DISABLE AUTOREVERT]"
|
title: "❌\U0001F519 [DISABLE AUTOREVERT]"
|
||||||
labels: 'ci: disable-autorevert'
|
labels: 'ci: disable-autorevert'
|
||||||
assignees: ''
|
assignees: ''
|
||||||
|
|
||||||
|
|||||||
@ -1,11 +1,11 @@
|
|||||||
name: 🚀 New Feature for Release
|
name: 🚀 Release highlight for proposed Feature
|
||||||
description: Submit a Release highlight for proposed Feature
|
description: Submit a Release highlight for proposed Feature
|
||||||
labels: ["release-feature-request"]
|
labels: ["release-feature-request"]
|
||||||
|
|
||||||
body:
|
body:
|
||||||
- type: textarea
|
- type: textarea
|
||||||
attributes:
|
attributes:
|
||||||
label: New Feature for Release
|
label: Release highlight for proposed Feature
|
||||||
description: >
|
description: >
|
||||||
Example: “A torch.special module, analogous to SciPy's special module.”
|
Example: “A torch.special module, analogous to SciPy's special module.”
|
||||||
- type: input
|
- type: input
|
||||||
|
|||||||
@ -65,7 +65,7 @@ runs:
|
|||||||
cd .ci/lumen_cli
|
cd .ci/lumen_cli
|
||||||
python3 -m pip install -e .
|
python3 -m pip install -e .
|
||||||
)
|
)
|
||||||
MAX_JOBS="$(nproc --ignore=10)"
|
MAX_JOBS="$(nproc --ignore=6)"
|
||||||
export MAX_JOBS
|
export MAX_JOBS
|
||||||
|
|
||||||
# Split the comma-separated list and build each target
|
# Split the comma-separated list and build each target
|
||||||
|
|||||||
4
.github/actions/diskspace-cleanup/action.yml
vendored
4
.github/actions/diskspace-cleanup/action.yml
vendored
@ -27,9 +27,7 @@ runs:
|
|||||||
docker system prune -af
|
docker system prune -af
|
||||||
diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
|
diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
|
||||||
if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
|
if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
|
||||||
diskspace_cutoff_int=$((diskspace_cutoff + 0))
|
echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
|
||||||
difference=$((100 - diskspace_cutoff_int))
|
|
||||||
echo "Error: Available diskspace is less than $difference percent. Not enough diskspace."
|
|
||||||
echo "$msg"
|
echo "$msg"
|
||||||
exit 1
|
exit 1
|
||||||
else
|
else
|
||||||
|
|||||||
2
.github/actions/linux-test/action.yml
vendored
2
.github/actions/linux-test/action.yml
vendored
@ -274,6 +274,8 @@ runs:
|
|||||||
-w /var/lib/jenkins/workspace \
|
-w /var/lib/jenkins/workspace \
|
||||||
"${DOCKER_IMAGE}"
|
"${DOCKER_IMAGE}"
|
||||||
)
|
)
|
||||||
|
# Propagate download.pytorch.org IP to container
|
||||||
|
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
|
||||||
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
|
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
|
||||||
docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
|
docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
|
||||||
|
|
||||||
|
|||||||
12
.github/actions/pytest-cache-download/action.yml
vendored
12
.github/actions/pytest-cache-download/action.yml
vendored
@ -38,9 +38,9 @@ runs:
|
|||||||
run: |
|
run: |
|
||||||
python3 .github/scripts/pytest_cache.py \
|
python3 .github/scripts/pytest_cache.py \
|
||||||
--download \
|
--download \
|
||||||
--cache_dir "$GITHUB_WORKSPACE/$CACHE_DIR" \
|
--cache_dir $GITHUB_WORKSPACE/$CACHE_DIR \
|
||||||
--pr_identifier "$GITHUB_REF" \
|
--pr_identifier $GITHUB_REF \
|
||||||
--job_identifier "$JOB_IDENTIFIER" \
|
--job_identifier $JOB_IDENTIFIER \
|
||||||
--temp_dir "$RUNNER_TEMP" \
|
--temp_dir $RUNNER_TEMP \
|
||||||
--repo "$REPO" \
|
--repo $REPO \
|
||||||
--bucket "$BUCKET" \
|
--bucket $BUCKET \
|
||||||
|
|||||||
16
.github/actions/pytest-cache-upload/action.yml
vendored
16
.github/actions/pytest-cache-upload/action.yml
vendored
@ -47,11 +47,11 @@ runs:
|
|||||||
run: |
|
run: |
|
||||||
python3 .github/scripts/pytest_cache.py \
|
python3 .github/scripts/pytest_cache.py \
|
||||||
--upload \
|
--upload \
|
||||||
--cache_dir "$GITHUB_WORKSPACE/$CACHE_DIR" \
|
--cache_dir $GITHUB_WORKSPACE/$CACHE_DIR \
|
||||||
--pr_identifier "$GITHUB_REF" \
|
--pr_identifier $GITHUB_REF \
|
||||||
--job_identifier "$JOB_IDENTIFIER" \
|
--job_identifier $JOB_IDENTIFIER \
|
||||||
--sha "$SHA" \
|
--sha $SHA \
|
||||||
--test_config "$TEST_CONFIG" \
|
--test_config $TEST_CONFIG \
|
||||||
--shard "$SHARD" \
|
--shard $SHARD \
|
||||||
--repo "$REPO" \
|
--repo $REPO \
|
||||||
--temp_dir "$RUNNER_TEMP" \
|
--temp_dir $RUNNER_TEMP \
|
||||||
|
|||||||
35
.github/actions/setup-linux/action.yml
vendored
35
.github/actions/setup-linux/action.yml
vendored
@ -28,10 +28,6 @@ runs:
|
|||||||
echo "instance-type: $(get_ec2_metadata instance-type)"
|
echo "instance-type: $(get_ec2_metadata instance-type)"
|
||||||
echo "system info $(uname -a)"
|
echo "system info $(uname -a)"
|
||||||
|
|
||||||
- name: Print GPU info (if present)
|
|
||||||
shell: bash
|
|
||||||
run: if [ -f /usr/bin/nvidia-smi ]; then nvidia-smi; fi
|
|
||||||
|
|
||||||
- name: Check if in a container runner
|
- name: Check if in a container runner
|
||||||
shell: bash
|
shell: bash
|
||||||
id: check_container_runner
|
id: check_container_runner
|
||||||
@ -86,6 +82,37 @@ runs:
|
|||||||
# Prune all of the docker images
|
# Prune all of the docker images
|
||||||
docker system prune -af
|
docker system prune -af
|
||||||
|
|
||||||
|
- name: Manually resolve download.pytorch.org
|
||||||
|
shell: bash
|
||||||
|
continue-on-error: true
|
||||||
|
run: |
|
||||||
|
set +e
|
||||||
|
set -x
|
||||||
|
|
||||||
|
PT_DOMAIN=download.pytorch.org
|
||||||
|
# TODO: Flaky access to download.pytorch.org https://github.com/pytorch/pytorch/issues/100400,
|
||||||
|
# cleaning this up once the issue is fixed. There are more than one resolved IP here, the last
|
||||||
|
# one is returned at random
|
||||||
|
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" | tail -n1)
|
||||||
|
|
||||||
|
if [ -z "${RESOLVED_IP}" ]; then
|
||||||
|
echo "Couldn't resolve ${PT_DOMAIN}, retrying with Google DNS..."
|
||||||
|
RESOLVED_IP=$(dig -4 +short "${PT_DOMAIN}" @8.8.8.8 | tail -n1)
|
||||||
|
|
||||||
|
if [ -z "${RESOLVED_IP}" ]; then
|
||||||
|
echo "Couldn't resolve ${PT_DOMAIN}, exiting..."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
if grep -r "${PT_DOMAIN}" /etc/hosts; then
|
||||||
|
# Clean up any old records first
|
||||||
|
sudo sed -i "/${PT_DOMAIN}/d" /etc/hosts
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
|
||||||
|
cat /etc/hosts
|
||||||
|
|
||||||
- name: Check that the docker daemon is running
|
- name: Check that the docker daemon is running
|
||||||
shell: bash
|
shell: bash
|
||||||
continue-on-error: true
|
continue-on-error: true
|
||||||
|
|||||||
20
.github/actions/setup-rocm/action.yml
vendored
20
.github/actions/setup-rocm/action.yml
vendored
@ -111,23 +111,3 @@ runs:
|
|||||||
# This video group ID maps to subgid 1 inside the docker image due to the /etc/subgid entries.
|
# This video group ID maps to subgid 1 inside the docker image due to the /etc/subgid entries.
|
||||||
# The group name corresponding to group ID 1 can change depending on the OS, so both are necessary.
|
# The group name corresponding to group ID 1 can change depending on the OS, so both are necessary.
|
||||||
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd $DEVICE_FLAG --group-add video --group-add $render_gid --group-add daemon --group-add bin --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --network=host" >> "${GITHUB_ENV}"
|
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd $DEVICE_FLAG --group-add video --group-add $render_gid --group-add daemon --group-add bin --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --network=host" >> "${GITHUB_ENV}"
|
||||||
|
|
||||||
- name: configure aws credentials
|
|
||||||
id: aws_creds
|
|
||||||
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
|
|
||||||
with:
|
|
||||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
|
||||||
aws-region: us-east-1
|
|
||||||
role-duration-seconds: 18000
|
|
||||||
|
|
||||||
- name: Login to Amazon ECR
|
|
||||||
id: login-ecr
|
|
||||||
continue-on-error: true
|
|
||||||
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
|
|
||||||
|
|
||||||
- name: Preserve github env variables for use in docker
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
env | grep '^GITHUB' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}"
|
|
||||||
env | grep '^CI' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}"
|
|
||||||
env | grep '^RUNNER' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}"
|
|
||||||
|
|||||||
@ -33,6 +33,10 @@ runs:
|
|||||||
)
|
)
|
||||||
|
|
||||||
echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
|
echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
|
||||||
|
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then
|
||||||
|
# Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner
|
||||||
|
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts"
|
||||||
|
fi
|
||||||
|
|
||||||
docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
|
docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
|
||||||
# Generate test script
|
# Generate test script
|
||||||
|
|||||||
2
.github/ci_commit_pins/audio.txt
vendored
2
.github/ci_commit_pins/audio.txt
vendored
@ -1 +1 @@
|
|||||||
ad5816f0eee1c873df1b7d371c69f1f811a89387
|
87ff22e49ed0e92576c4935ccb8c143daac4a3cd
|
||||||
|
|||||||
2
.github/ci_commit_pins/vision.txt
vendored
2
.github/ci_commit_pins/vision.txt
vendored
@ -1 +1 @@
|
|||||||
ccb801b88af136454798b945175c4c87e636ac33
|
966da7e46f65d6d49df3e31214470a4fe5cc8e66
|
||||||
|
|||||||
2
.github/ci_commit_pins/vllm.txt
vendored
2
.github/ci_commit_pins/vllm.txt
vendored
@ -1 +1 @@
|
|||||||
e5192819208c4d68194844b7dfafbc00020d0dea
|
0ad9951c416d33c5da4f7a504fb162cbe62386f5
|
||||||
|
|||||||
2
.github/ci_commit_pins/xla.txt
vendored
2
.github/ci_commit_pins/xla.txt
vendored
@ -1 +1 @@
|
|||||||
e4d25697f9dc5eedaf8f0a5bf085c62c5455a53a
|
2a9138a26ee257fef05310ad3fecf7c55fe80d73
|
||||||
|
|||||||
@ -1,41 +1,59 @@
|
|||||||
|
# TODO(elainwy): remove this file after the torch nightly dockerfile is in sync in vllm repo
|
||||||
|
# The vLLM Dockerfile is used to construct vLLM image against torch nightly and torch main that can be directly used for testing
|
||||||
|
|
||||||
ARG CUDA_VERSION=12.8.1
|
ARG CUDA_VERSION=12.8.1
|
||||||
ARG PYTHON_VERSION=3.12
|
ARG PYTHON_VERSION=3.12
|
||||||
|
|
||||||
# BUILD_BASE_IMAGE: used to setup python build xformers, and vllm wheels, It can be replaced with a different base image from local machine,
|
# BUILD_BASE_IMAGE: used to setup python build xformers, and vllm wheels, It can be replaced with a different base image from local machine,
|
||||||
# by default, it uses the torch-nightly-base stage from this docker image
|
# by default, it uses the torch-nightly-base stage from this docker image
|
||||||
ARG BUILD_BASE_IMAGE=torch-nightly-base
|
ARG BUILD_BASE_IMAGE=torch-nightly-base
|
||||||
|
|
||||||
|
# FINAL_BASE_IMAGE: used to set up vllm-instaled environment and build flashinfer,
|
||||||
|
# by default, it uses devel-ubuntu22.04 official image.
|
||||||
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04
|
||||||
|
|
||||||
# The logic is copied from https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile
|
# The logic is copied from https://github.com/vllm-project/vllm/blob/main/docker/Dockerfile
|
||||||
ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"
|
ARG GET_PIP_URL="https://bootstrap.pypa.io/get-pip.py"
|
||||||
|
|
||||||
|
|
||||||
#################### TORCH NIGHTLY BASE IMAGE ####################
|
#################### TORCH NIGHTLY BASE IMAGE ####################
|
||||||
|
# A base image for building vLLM with devel ubuntu 22.04, this is mainly used to build vllm in vllm builtkite ci
|
||||||
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base
|
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 as torch-nightly-base
|
||||||
|
|
||||||
ARG CUDA_VERSION
|
ARG CUDA_VERSION
|
||||||
ARG PYTHON_VERSION
|
ARG PYTHON_VERSION
|
||||||
ARG GET_PIP_URL
|
ARG GET_PIP_URL
|
||||||
|
|
||||||
# Install system dependencies and uv, then create Python virtual environment
|
# Install Python and other dependencies
|
||||||
RUN apt-get update -y \
|
RUN apt-get update -y \
|
||||||
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
|
&& apt-get install -y ccache software-properties-common git curl wget sudo vim \
|
||||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
|
&& add-apt-repository -y ppa:deadsnakes/ppa \
|
||||||
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
|
&& apt-get update -y \
|
||||||
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
|
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||||
&& ln -s /opt/venv/bin/python3 /usr/bin/python3 \
|
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||||
&& ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
|
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
|
||||||
&& ln -s /opt/venv/bin/pip /usr/bin/pip \
|
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||||
|
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
|
||||||
&& python3 --version && python3 -m pip --version
|
&& python3 --version && python3 -m pip --version
|
||||||
|
|
||||||
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
|
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
|
||||||
# as it was causing spam when compiling the CUTLASS kernels
|
# as it was causing spam when compiling the CUTLASS kernels
|
||||||
RUN apt-get install -y gcc-10 g++-10
|
# Ensure gcc >= 10 to avoid CUTLASS issues (bug 92519)
|
||||||
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
|
RUN current_gcc_version=$(gcc -dumpversion | cut -f1 -d.) && \
|
||||||
RUN <<EOF
|
if command -v apt-get >/dev/null; then \
|
||||||
gcc --version
|
if [ "$current_gcc_version" -lt 10 ]; then \
|
||||||
EOF
|
echo "GCC version is $current_gcc_version, installing gcc-10..."; \
|
||||||
|
apt-get update \
|
||||||
|
&& apt-get install -y gcc-10 g++-10 \
|
||||||
|
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 100 \
|
||||||
|
&& update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-10 100; \
|
||||||
|
else \
|
||||||
|
echo "GCC version is $current_gcc_version, no need to install gcc-10."; \
|
||||||
|
fi \
|
||||||
|
fi \
|
||||||
|
&& gcc --version && g++ --version
|
||||||
|
|
||||||
# Install uv for faster pip installs
|
# install uv for faster pip installs
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
python3 -m pip install uv==0.8.4
|
python3 -m pip install uv==0.8.4
|
||||||
|
|
||||||
@ -43,32 +61,36 @@ ENV UV_HTTP_TIMEOUT=500
|
|||||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||||
ENV UV_LINK_MODE=copy
|
ENV UV_LINK_MODE=copy
|
||||||
|
|
||||||
#################### TORCH NIGHTLY BASE IMAGE ####################
|
#################### TORCH NIGHTLY BASE IMAGE ####################
|
||||||
|
|
||||||
|
|
||||||
#################### BASE BUILD IMAGE ####################
|
#################### BASE BUILD IMAGE ####################
|
||||||
|
# A base image for building vLLM with torch nightly or torch wheels
|
||||||
|
# prepare basic build environment
|
||||||
FROM ${BUILD_BASE_IMAGE} AS base
|
FROM ${BUILD_BASE_IMAGE} AS base
|
||||||
USER root
|
USER root
|
||||||
|
|
||||||
ARG CUDA_VERSION
|
ARG CUDA_VERSION
|
||||||
ARG PYTHON_VERSION
|
ARG PYTHON_VERSION
|
||||||
|
|
||||||
# Only work with PyTorch manylinux builder
|
# TODO (huydhn): Only work with PyTorch manylinux builder
|
||||||
ENV PATH="/opt/python/cp312-cp312/bin:${PATH}"
|
ENV PATH="/opt/python/cp312-cp312/bin:${PATH}"
|
||||||
|
|
||||||
# Install some system dependencies and double check python version
|
# Install some system dependencies and double check python version
|
||||||
RUN if command -v apt-get >/dev/null; then \
|
RUN if command -v apt-get >/dev/null; then \
|
||||||
apt-get update -y \
|
apt-get update -y \
|
||||||
&& apt-get install -y ccache software-properties-common git wget sudo vim; \
|
&& apt-get install -y ccache software-properties-common git curl wget sudo vim; \
|
||||||
else \
|
else \
|
||||||
dnf install -y git wget sudo; \
|
dnf install -y git curl wget sudo; \
|
||||||
fi \
|
fi \
|
||||||
&& python3 --version && python3 -m pip --version
|
&& python3 --version && python3 -m pip --version
|
||||||
|
|
||||||
# Install uv for faster pip installs if not existed
|
# Install uv for faster pip installs if not existed
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
python3 -m pip install uv==0.8.4
|
if ! python3 -m uv --version >/dev/null 2>&1; then \
|
||||||
|
python3 -m pip install uv==0.8.4; \
|
||||||
|
fi
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
ENV UV_HTTP_TIMEOUT=500
|
||||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||||
@ -76,15 +98,15 @@ ENV UV_LINK_MODE=copy
|
|||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
# Install build and runtime dependencies
|
# install build and runtime dependencies
|
||||||
COPY requirements/common.txt requirements/common.txt
|
COPY requirements/common.txt requirements/common.txt
|
||||||
COPY use_existing_torch.py use_existing_torch.py
|
COPY use_existing_torch.py use_existing_torch.py
|
||||||
COPY pyproject.toml pyproject.toml
|
COPY pyproject.toml pyproject.toml
|
||||||
|
|
||||||
# Install build and runtime dependencies without stable torch version
|
# install build and runtime dependencies without stable torch version
|
||||||
RUN python3 use_existing_torch.py
|
RUN python3 use_existing_torch.py
|
||||||
|
|
||||||
# Default mount file as placeholder, this just avoid the mount error
|
# default mount file as placeholder, this just avoid the mount error
|
||||||
# change to a different vllm folder if this does not exist anymore
|
# change to a different vllm folder if this does not exist anymore
|
||||||
ARG TORCH_WHEELS_PATH="./requirements"
|
ARG TORCH_WHEELS_PATH="./requirements"
|
||||||
ARG PINNED_TORCH_VERSION
|
ARG PINNED_TORCH_VERSION
|
||||||
@ -116,36 +138,56 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system -r requirements/common.txt
|
uv pip install --system -r requirements/common.txt
|
||||||
|
|
||||||
|
# Must put before installing xformers, so it can install the correct version of xfomrers.
|
||||||
|
ARG xformers_cuda_arch_list='7.5;8.0+PTX;9.0a'
|
||||||
|
ENV TORCH_CUDA_ARCH_LIST=${xformers_cuda_arch_list}
|
||||||
|
|
||||||
ARG max_jobs=16
|
ARG max_jobs=16
|
||||||
ENV MAX_JOBS=${max_jobs}
|
ENV MAX_JOBS=${max_jobs}
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
|
RUN echo ${TORCH_CUDA_ARCH_LIST}
|
||||||
export TORCH_CUDA_ARCH_LIST='7.5 8.0+PTX 9.0a'
|
RUN echo ${MAX_JOBS}
|
||||||
git clone https://github.com/facebookresearch/xformers.git
|
RUN pip freeze | grep -E 'ninja'
|
||||||
|
|
||||||
pushd xformers
|
# Build xformers with cuda and torch nightly/wheel
|
||||||
git checkout v0.0.32.post2
|
# following official xformers guidance: https://github.com/facebookresearch/xformers#build
|
||||||
git submodule update --init --recursive
|
# sha for https://github.com/facebookresearch/xformers/tree/v0.0.32.post2
|
||||||
python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose
|
ARG XFORMERS_COMMIT=5d4b92a5e5a9c6c6d4878283f47d82e17995b468
|
||||||
popd
|
ENV CCACHE_DIR=/root/.cache/ccache
|
||||||
|
|
||||||
rm -rf xformers
|
RUN --mount=type=cache,target=/root/.cache/ccache \
|
||||||
BASH
|
--mount=type=cache,target=/root/.cache/uv \
|
||||||
|
echo 'git clone xformers...' \
|
||||||
|
&& git clone https://github.com/facebookresearch/xformers.git --recursive \
|
||||||
|
&& cd xformers \
|
||||||
|
&& git checkout ${XFORMERS_COMMIT} \
|
||||||
|
&& git submodule update --init --recursive \
|
||||||
|
&& echo 'finish git clone xformers...' \
|
||||||
|
&& rm -rf build \
|
||||||
|
&& python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
|
||||||
|
&& cd .. \
|
||||||
|
&& rm -rf xformers
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system xformers-dist/*.whl
|
uv pip install --system xformers-dist/*.whl --verbose
|
||||||
|
|
||||||
|
# Build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
|
||||||
|
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
|
||||||
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
|
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
|
||||||
|
|
||||||
RUN cat torch_build_versions.txt
|
RUN cat torch_build_versions.txt
|
||||||
RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'
|
RUN pip freeze | grep -E 'torch|xformers|torchvision|torchaudio'
|
||||||
|
|
||||||
#################### BASE BUILD IMAGE ####################
|
#################### BASE BUILD IMAGE ####################
|
||||||
|
|
||||||
|
|
||||||
#################### WHEEL BUILD IMAGE ####################
|
#################### WHEEL BUILD IMAGE ####################
|
||||||
|
# Image used to build vllm wheel
|
||||||
FROM base AS build
|
FROM base AS build
|
||||||
ARG TARGETPLATFORM
|
ARG TARGETPLATFORM
|
||||||
|
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
RUN python3 use_existing_torch.py
|
RUN python3 use_existing_torch.py
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
@ -155,17 +197,20 @@ ARG GIT_REPO_CHECK=0
|
|||||||
RUN --mount=type=bind,source=.git,target=.git \
|
RUN --mount=type=bind,source=.git,target=.git \
|
||||||
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
|
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
|
||||||
|
|
||||||
|
# Max jobs used by Ninja to build extensions
|
||||||
ARG max_jobs=16
|
ARG max_jobs=16
|
||||||
ENV MAX_JOBS=${max_jobs}
|
ENV MAX_JOBS=${max_jobs}
|
||||||
ARG nvcc_threads=8
|
ARG nvcc_threads=4
|
||||||
ENV NVCC_THREADS=$nvcc_threads
|
ENV NVCC_THREADS=$nvcc_threads
|
||||||
|
ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0'
|
||||||
|
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||||
|
|
||||||
ARG USE_SCCACHE
|
ARG USE_SCCACHE
|
||||||
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
|
||||||
ARG SCCACHE_REGION_NAME=us-west-2
|
ARG SCCACHE_REGION_NAME=us-west-2
|
||||||
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
ARG SCCACHE_S3_NO_CREDENTIALS=0
|
||||||
|
|
||||||
# Use sccache to speed up compilation
|
# if USE_SCCACHE is set, use sccache to speed up compilation
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
--mount=type=bind,source=.git,target=.git \
|
--mount=type=bind,source=.git,target=.git \
|
||||||
if [ "$USE_SCCACHE" = "1" ]; then \
|
if [ "$USE_SCCACHE" = "1" ]; then \
|
||||||
@ -190,9 +235,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
&& sccache --show-stats; \
|
&& sccache --show-stats; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
ARG torch_cuda_arch_list='8.0 8.6 8.9 9.0'
|
|
||||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
|
||||||
|
|
||||||
ARG vllm_target_device="cuda"
|
ARG vllm_target_device="cuda"
|
||||||
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
|
ENV VLLM_TARGET_DEVICE=${vllm_target_device}
|
||||||
ENV CCACHE_DIR=/root/.cache/ccache
|
ENV CCACHE_DIR=/root/.cache/ccache
|
||||||
@ -206,10 +248,17 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
|
|||||||
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
|
export VLLM_DOCKER_BUILD_CONTEXT=1 && \
|
||||||
python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \
|
python3 setup.py bdist_wheel --dist-dir=vllm-dist --py-limited-api=cp38; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
RUN echo "[INFO] Listing current directory:" && \
|
||||||
|
ls -al && \
|
||||||
|
echo "[INFO] Showing torch_build_versions.txt content:" && \
|
||||||
|
cat torch_build_versions.txt
|
||||||
|
|
||||||
#################### WHEEL BUILD IMAGE ####################
|
#################### WHEEL BUILD IMAGE ####################
|
||||||
|
|
||||||
|
|
||||||
################### VLLM INSTALLED IMAGE ####################
|
################### VLLM INSTALLED IMAGE ####################
|
||||||
|
# Setup clean environment for vLLM for test and api server using ubuntu22.04 with AOT flashinfer
|
||||||
FROM ${FINAL_BASE_IMAGE} AS vllm-base
|
FROM ${FINAL_BASE_IMAGE} AS vllm-base
|
||||||
USER root
|
USER root
|
||||||
|
|
||||||
@ -217,7 +266,7 @@ ARG CUDA_VERSION
|
|||||||
ARG PYTHON_VERSION
|
ARG PYTHON_VERSION
|
||||||
ARG GET_PIP_URL
|
ARG GET_PIP_URL
|
||||||
|
|
||||||
# Only work with PyTorch manylinux builder
|
# TODO (huydhn): Only work with PyTorch manylinux builder
|
||||||
ENV PATH="/opt/python/cp312-cp312/bin:${PATH}"
|
ENV PATH="/opt/python/cp312-cp312/bin:${PATH}"
|
||||||
|
|
||||||
# prepare for environment starts
|
# prepare for environment starts
|
||||||
@ -226,19 +275,20 @@ WORKDIR /workspace
|
|||||||
# Install Python and other dependencies
|
# Install Python and other dependencies
|
||||||
RUN if command -v apt-get >/dev/null; then \
|
RUN if command -v apt-get >/dev/null; then \
|
||||||
apt-get update -y \
|
apt-get update -y \
|
||||||
&& apt-get install -y ccache software-properties-common git sudo vim python3-pip; \
|
&& apt-get install -y ccache software-properties-common git curl wget sudo vim \
|
||||||
|
&& add-apt-repository -y ppa:deadsnakes/ppa \
|
||||||
|
&& apt-get update -y \
|
||||||
|
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
|
||||||
|
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
|
||||||
|
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
|
||||||
|
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
|
||||||
|
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION}; \
|
||||||
else \
|
else \
|
||||||
dnf install -y git wget sudo; \
|
dnf install -y git curl wget sudo; \
|
||||||
fi \
|
fi \
|
||||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh \
|
|
||||||
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
|
|
||||||
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
|
|
||||||
&& ln -s /opt/venv/bin/python3 /usr/bin/python3 \
|
|
||||||
&& ln -s /opt/venv/bin/python3-config /usr/bin/python3-config \
|
|
||||||
&& ln -s /opt/venv/bin/pip /usr/bin/pip \
|
|
||||||
&& python3 --version && python3 -m pip --version
|
&& python3 --version && python3 -m pip --version
|
||||||
|
|
||||||
# Get the torch versions, and whls used in previous stage
|
# Get the torch versions, and whls used in previous stagtes for consistency
|
||||||
COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
|
COPY --from=base /workspace/torch_build_versions.txt ./torch_build_versions.txt
|
||||||
COPY --from=base /workspace/xformers-dist /wheels/xformers
|
COPY --from=base /workspace/xformers-dist /wheels/xformers
|
||||||
COPY --from=build /workspace/vllm-dist /wheels/vllm
|
COPY --from=build /workspace/vllm-dist /wheels/vllm
|
||||||
@ -247,29 +297,33 @@ RUN echo "[INFO] Listing current directory before torch install step:" && \
|
|||||||
echo "[INFO] Showing torch_build_versions.txt content:" && \
|
echo "[INFO] Showing torch_build_versions.txt content:" && \
|
||||||
cat torch_build_versions.txt
|
cat torch_build_versions.txt
|
||||||
|
|
||||||
# Install uv for faster pip installs if not existed
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
||||||
python3 -m pip install uv==0.8.4
|
|
||||||
|
|
||||||
ENV UV_HTTP_TIMEOUT=500
|
|
||||||
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
|
||||||
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
|
||||||
ENV UV_LINK_MODE=copy
|
|
||||||
|
|
||||||
# Install build and runtime dependencies, this is needed for flashinfer install
|
# Install build and runtime dependencies, this is needed for flashinfer install
|
||||||
COPY requirements/build.txt requirements/build.txt
|
COPY requirements/build.txt requirements/build.txt
|
||||||
COPY use_existing_torch.py use_existing_torch.py
|
COPY use_existing_torch.py use_existing_torch.py
|
||||||
RUN python3 use_existing_torch.py
|
RUN python3 use_existing_torch.py
|
||||||
RUN cat requirements/build.txt
|
RUN cat requirements/build.txt
|
||||||
|
|
||||||
|
# Install uv for faster pip installs if not existed
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
|
if ! python3 -m uv --version > /dev/null 2>&1; then \
|
||||||
|
python3 -m pip install uv==0.8.4; \
|
||||||
|
fi
|
||||||
|
|
||||||
|
ENV UV_HTTP_TIMEOUT=500
|
||||||
|
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||||
|
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||||
|
ENV UV_LINK_MODE=copy
|
||||||
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system -r requirements/build.txt
|
uv pip install --system -r requirements/build.txt
|
||||||
|
|
||||||
|
|
||||||
# Default mount file as placeholder, this just avoid the mount error
|
# Default mount file as placeholder, this just avoid the mount error
|
||||||
ARG TORCH_WHEELS_PATH="./requirements"
|
ARG TORCH_WHEELS_PATH="./requirements"
|
||||||
# Install torch, torchaudio and torchvision. If TORCH_WHEELS_PATH is default
|
# Install torch, torchaudio and torchvision
|
||||||
# to ./requirements, it will pull the nightly versions using pip. Otherwise,
|
# if TORCH_WHEELS_PATH is default "./requirements", it will pull the nightly versions using pip using torch_build_versions.txt
|
||||||
# it will use the local wheels from TORCH_WHEELS_PATH
|
# otherwise, it will use the whls from TORCH_WHEELS_PATH from the host machine
|
||||||
RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
|
RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
|
||||||
--mount=type=cache,target=/root/.cache/uv \
|
--mount=type=cache,target=/root/.cache/uv \
|
||||||
if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
|
if [ -n "$TORCH_WHEELS_PATH" ] && [ "$TORCH_WHEELS_PATH" != "./requirements" ] && [ -d "/dist" ] && ls /dist/torch*.whl >/dev/null 2>&1; then \
|
||||||
@ -283,9 +337,6 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
|
|||||||
uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
|
||||||
uv pip install --system --pre apache-tvm-ffi==0.1.0b15
|
|
||||||
|
|
||||||
# Install the vllm wheel from previous stage
|
# Install the vllm wheel from previous stage
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system /wheels/vllm/*.whl --verbose
|
uv pip install --system /wheels/vllm/*.whl --verbose
|
||||||
@ -293,16 +344,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
# Install xformers wheel from previous stage
|
# Install xformers wheel from previous stage
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system /wheels/xformers/*.whl --verbose
|
uv pip install --system /wheels/xformers/*.whl --verbose
|
||||||
|
# Build flashinfer from source.
|
||||||
# Build FlashInfer from source
|
|
||||||
ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0'
|
ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0'
|
||||||
|
# install package for build flashinfer
|
||||||
|
# see issue: https://github.com/flashinfer-ai/flashinfer/issues/738
|
||||||
|
|
||||||
|
RUN pip freeze | grep -E 'setuptools|packaging|build'
|
||||||
|
|
||||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||||
|
# Build flashinfer for torch nightly from source around 10 mins
|
||||||
# TODO(elainewy): remove this once vllm commit is updated, and install flashinfer from pip
|
|
||||||
# see https://github.com/pytorch/pytorch/pull/165274#issuecomment-3408531784
|
|
||||||
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
|
||||||
|
# Keep this in sync with https://github.com/vllm-project/vllm/blob/main/requirements/cuda.txt
|
||||||
ARG FLASHINFER_GIT_REF="v0.2.14.post1"
|
ARG FLASHINFER_GIT_REF="v0.2.14.post1"
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
git clone --depth 1 --recursive --shallow-submodules \
|
git clone --depth 1 --recursive --shallow-submodules \
|
||||||
--branch ${FLASHINFER_GIT_REF} \
|
--branch ${FLASHINFER_GIT_REF} \
|
||||||
@ -314,7 +367,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
|||||||
&& cd .. \
|
&& cd .. \
|
||||||
&& rm -rf flashinfer
|
&& rm -rf flashinfer
|
||||||
|
|
||||||
# Install FlashInfer
|
# install flashinfer python
|
||||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
uv pip install --system wheels/flashinfer/*.whl --verbose
|
uv pip install --system wheels/flashinfer/*.whl --verbose
|
||||||
|
|
||||||
@ -324,6 +377,49 @@ RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio\|^xformers\|^vllm
|
|||||||
################### VLLM INSTALLED IMAGE ####################
|
################### VLLM INSTALLED IMAGE ####################
|
||||||
|
|
||||||
|
|
||||||
|
#################### UNITTEST IMAGE #############################
|
||||||
|
FROM vllm-base as test
|
||||||
|
|
||||||
|
ENV UV_HTTP_TIMEOUT=500
|
||||||
|
ENV UV_INDEX_STRATEGY="unsafe-best-match"
|
||||||
|
# Use copy mode to avoid hardlink failures with Docker cache mounts
|
||||||
|
ENV UV_LINK_MODE=copy
|
||||||
|
|
||||||
|
COPY tests/ tests/
|
||||||
|
COPY examples examples
|
||||||
|
COPY benchmarks benchmarks
|
||||||
|
COPY ./vllm/collect_env.py .
|
||||||
|
COPY requirements/common.txt requirements/common.txt
|
||||||
|
COPY use_existing_torch.py use_existing_torch.py
|
||||||
|
COPY pyproject.toml pyproject.toml
|
||||||
|
# Install build and runtime dependencies without stable torch version
|
||||||
|
COPY requirements/nightly_torch_test.txt requirements/nightly_torch_test.txt
|
||||||
|
|
||||||
|
RUN python3 use_existing_torch.py
|
||||||
|
|
||||||
|
# install packages
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
|
uv pip install --system -r requirements/common.txt
|
||||||
|
# enable fast downloads from hf (for testing)
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
|
uv pip install --system hf_transfer
|
||||||
|
ENV HF_HUB_ENABLE_HF_TRANSFER 1
|
||||||
|
|
||||||
|
# install development dependencies (for testing)
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
|
uv pip install --system -e tests/vllm_test_utils
|
||||||
|
|
||||||
|
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||||
|
uv pip install --system -r requirements/nightly_torch_test.txt
|
||||||
|
|
||||||
|
# Logging to confirm the torch versions
|
||||||
|
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
|
||||||
|
|
||||||
|
# Logging to confirm all the packages are installed
|
||||||
|
RUN pip freeze
|
||||||
|
|
||||||
|
#################### UNITTEST IMAGE #############################
|
||||||
|
|
||||||
#################### EXPORT STAGE ####################
|
#################### EXPORT STAGE ####################
|
||||||
FROM scratch as export-wheels
|
FROM scratch as export-wheels
|
||||||
|
|
||||||
125
.github/copilot-instructions.md
vendored
125
.github/copilot-instructions.md
vendored
@ -1,125 +0,0 @@
|
|||||||
# PyTorch Copilot Instructions
|
|
||||||
|
|
||||||
This is the PyTorch machine learning framework codebase. These instructions help AI agents navigate and contribute effectively.
|
|
||||||
|
|
||||||
## Architecture Overview
|
|
||||||
|
|
||||||
### Core Components
|
|
||||||
|
|
||||||
- **c10/** - Core library (C++-10 compatible) for essential, binary-size-conscious functionality
|
|
||||||
- **aten/** - ATen tensor library (C++), PyTorch's foundation without autograd
|
|
||||||
- `aten/src/ATen/native/` - Modern operator implementations (CPU/CUDA/MPS/sparse)
|
|
||||||
- `aten/src/ATen/native/native_functions.yaml` - **Critical**: Declarative operator registry
|
|
||||||
- **torch/** - Python bindings and public API
|
|
||||||
- `torch/csrc/` - C++ Python bindings (hand-written and generated)
|
|
||||||
- `torch/csrc/autograd/` - Reverse-mode automatic differentiation
|
|
||||||
- `torch/csrc/jit/` - TorchScript JIT compiler
|
|
||||||
- **torchgen/** - Code generation tooling that reads `native_functions.yaml`
|
|
||||||
- **tools/** - Build scripts, autograd derivatives, code generation
|
|
||||||
|
|
||||||
### The Code Generation Workflow
|
|
||||||
|
|
||||||
**Most operator changes require editing `native_functions.yaml`**, not direct C++ files. This YAML file:
|
|
||||||
1. Declares operator signatures, variants (function/method), and dispatch behavior
|
|
||||||
2. Gets processed by `torchgen/` to generate C++/Python bindings
|
|
||||||
3. Produces headers in `build/aten/src/ATen/` during compilation
|
|
||||||
|
|
||||||
Example entry structure:
|
|
||||||
```yaml
|
|
||||||
- func: my_op(Tensor self, Scalar alpha=1) -> Tensor
|
|
||||||
variants: function, method
|
|
||||||
dispatch:
|
|
||||||
CPU: my_op_cpu
|
|
||||||
CUDA: my_op_cuda
|
|
||||||
```
|
|
||||||
|
|
||||||
After editing `native_functions.yaml`, implement kernels in `aten/src/ATen/native/` (see `aten/src/ATen/native/README.md`).
|
|
||||||
|
|
||||||
## Development Workflows
|
|
||||||
|
|
||||||
### Building from Source
|
|
||||||
|
|
||||||
**Never run `setup.py` directly** - use pip with editable install:
|
|
||||||
```bash
|
|
||||||
python -m pip install --no-build-isolation -v -e .
|
|
||||||
```
|
|
||||||
|
|
||||||
Speed up builds:
|
|
||||||
- `DEBUG=1` - Debug symbols with `-g -O0`
|
|
||||||
- `USE_CUDA=0` - Skip CUDA compilation
|
|
||||||
- `BUILD_TEST=0` - Skip C++ test binaries
|
|
||||||
- Install `ninja` (`pip install ninja`) for faster builds
|
|
||||||
- Use `ccache` for incremental compilation caching
|
|
||||||
|
|
||||||
Rebuild specific targets: `(cd build && ninja <target>)`
|
|
||||||
|
|
||||||
### Testing
|
|
||||||
|
|
||||||
**Critical**: DO NOT run entire test suites. Run specific tests only:
|
|
||||||
```bash
|
|
||||||
python test/test_torch.py TestTorch.test_specific_case
|
|
||||||
```
|
|
||||||
|
|
||||||
**Test structure**: All tests use `torch.testing._internal.common_utils`:
|
|
||||||
```python
|
|
||||||
from torch.testing._internal.common_utils import run_tests, TestCase
|
|
||||||
|
|
||||||
class TestFeature(TestCase):
|
|
||||||
def test_something(self):
|
|
||||||
# Use self.assertEqual for tensor comparisons
|
|
||||||
pass
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
run_tests()
|
|
||||||
```
|
|
||||||
|
|
||||||
**For bug fixes**: Create a standalone reproduction script first, verify it fails, then fix and add to appropriate test file.
|
|
||||||
|
|
||||||
### Linting
|
|
||||||
|
|
||||||
Run linter (not pre-commit): `lintrunner -a` (auto-applies fixes)
|
|
||||||
|
|
||||||
## Project-Specific Conventions
|
|
||||||
|
|
||||||
### Memory and Storage
|
|
||||||
- **Storage is never nullptr** (but `StorageImpl.data` may be nullptr for unallocated outputs)
|
|
||||||
- CUDA device info lives in storage objects
|
|
||||||
|
|
||||||
### Python-C++ Integration (`torch/csrc/`)
|
|
||||||
- Always include `Python.h` **first** to avoid `_XOPEN_SOURCE` redefinition errors
|
|
||||||
- Use `pybind11::gil_scoped_acquire` before calling Python API or using `THPObjectPtr`
|
|
||||||
- Wrap entry points with `HANDLE_TH_ERRORS` / `END_HANDLE_TH_ERRORS` for exception conversion
|
|
||||||
|
|
||||||
### Dispatch System
|
|
||||||
- PyTorch uses operator dispatch to route calls to backend-specific kernels
|
|
||||||
- Prefer `CompositeExplicitAutograd` dispatch when writing device-agnostic compound ops
|
|
||||||
- See `aten/src/ATen/native/README.md` for dispatch keyword guidance
|
|
||||||
|
|
||||||
## Git Workflow (AI Agent Specific)
|
|
||||||
|
|
||||||
When preparing PRs from this environment:
|
|
||||||
```bash
|
|
||||||
git stash -u
|
|
||||||
git reset --hard $(cat /tmp/orig_work.txt) # Reset to LOCAL branch
|
|
||||||
git stash pop
|
|
||||||
# Resolve conflicts if necessary
|
|
||||||
```
|
|
||||||
|
|
||||||
## Common Gotchas
|
|
||||||
|
|
||||||
1. **Editing generated files** - If it's in `build/`, don't edit it. Edit the source template or `native_functions.yaml`
|
|
||||||
2. **NVCC template compilation** - NVCC is stricter about C++ than gcc/clang; code working on Linux may fail Windows CI
|
|
||||||
3. **Windows symbol visibility** - Use `TORCH_API` macros for exported symbols (required on Windows, optional on Linux)
|
|
||||||
4. **No internet access** - DO NOT attempt to install dependencies during development
|
|
||||||
|
|
||||||
## Key Files Reference
|
|
||||||
|
|
||||||
- `AGENTS.md` - Instructions specific to AI coding agents
|
|
||||||
- `CONTRIBUTING.md` - Comprehensive human contributor guide
|
|
||||||
- `GLOSSARY.md` - Terminology (ATen, kernels, operations, JIT, TorchScript)
|
|
||||||
- `aten/src/ATen/native/README.md` - Operator implementation guide
|
|
||||||
- `tools/autograd/derivatives.yaml` - Gradient definitions for autograd
|
|
||||||
|
|
||||||
## Performance Debugging
|
|
||||||
|
|
||||||
Use `TORCH_SHOW_CPP_STACKTRACES=1` for C++ traces in Python errors. For profiling, prefer `py-spy` over manual instrumentation.
|
|
||||||
9
.github/label_to_label.yml
vendored
9
.github/label_to_label.yml
vendored
@ -15,11 +15,6 @@
|
|||||||
- "module: reinplacing"
|
- "module: reinplacing"
|
||||||
then:
|
then:
|
||||||
- "module: pt2-dispatcher"
|
- "module: pt2-dispatcher"
|
||||||
- any:
|
|
||||||
- "vllm-compile"
|
|
||||||
then:
|
|
||||||
- "module: vllm"
|
|
||||||
- "oncall: pt2"
|
|
||||||
- any:
|
- any:
|
||||||
- "module: vmap"
|
- "module: vmap"
|
||||||
then:
|
then:
|
||||||
@ -32,6 +27,10 @@
|
|||||||
- "module: pt2 optimizer"
|
- "module: pt2 optimizer"
|
||||||
then:
|
then:
|
||||||
- "module: dynamo"
|
- "module: dynamo"
|
||||||
|
- any:
|
||||||
|
- "module: flex attention"
|
||||||
|
then:
|
||||||
|
- "module: higher order operators"
|
||||||
- any:
|
- any:
|
||||||
- "module: aotinductor"
|
- "module: aotinductor"
|
||||||
then:
|
then:
|
||||||
|
|||||||
45
.github/labeler.yml
vendored
45
.github/labeler.yml
vendored
@ -133,48 +133,3 @@
|
|||||||
|
|
||||||
"ciflow/vllm":
|
"ciflow/vllm":
|
||||||
- .github/ci_commit_pins/vllm.txt
|
- .github/ci_commit_pins/vllm.txt
|
||||||
|
|
||||||
"ciflow/b200":
|
|
||||||
- test/test_matmul_cuda.py
|
|
||||||
- test/test_scaled_matmul_cuda.py
|
|
||||||
- test/inductor/test_fp8.py
|
|
||||||
- aten/src/ATen/native/cuda/*Blas.cpp
|
|
||||||
- aten/src/ATen/cuda/CUDA*Blas.*
|
|
||||||
- torch/**/*cublas*
|
|
||||||
- torch/_inductor/kernel/mm.py
|
|
||||||
- test/inductor/test_max_autotune.py
|
|
||||||
- third_party/fbgemm
|
|
||||||
|
|
||||||
"ciflow/h100":
|
|
||||||
- test/test_matmul_cuda.py
|
|
||||||
- test/test_scaled_matmul_cuda.py
|
|
||||||
- test/inductor/test_fp8.py
|
|
||||||
- aten/src/ATen/native/cuda/*Blas.cpp
|
|
||||||
- aten/src/ATen/cuda/CUDA*Blas.*
|
|
||||||
- torch/**/*cublas*
|
|
||||||
- torch/_inductor/kernel/mm.py
|
|
||||||
- test/inductor/test_max_autotune.py
|
|
||||||
- third_party/fbgemm
|
|
||||||
|
|
||||||
"ciflow/rocm":
|
|
||||||
- test/test_matmul_cuda.py
|
|
||||||
- test/test_scaled_matmul_cuda.py
|
|
||||||
- test/inductor/test_fp8.py
|
|
||||||
- aten/src/ATen/native/cuda/*Blas.cpp
|
|
||||||
- aten/src/ATen/cuda/CUDA*Blas.*
|
|
||||||
- torch/_inductor/kernel/mm.py
|
|
||||||
- test/inductor/test_max_autotune.py
|
|
||||||
- third_party/fbgemm
|
|
||||||
|
|
||||||
"ciflow/mps":
|
|
||||||
- aten/src/ATen/mps/**
|
|
||||||
- aten/src/ATen/native/mps/**
|
|
||||||
- torch/_inductor/codegen/mps.py
|
|
||||||
- test/test_mps.py
|
|
||||||
- test/inductor/test_mps_basic.py
|
|
||||||
|
|
||||||
"ciflow/h100-symm-mem":
|
|
||||||
- torch/csrc/distributed/c10d/symm_mem/**
|
|
||||||
- torch/distributed/_symmetric_memory/**
|
|
||||||
- test/distributed/**/*mem*
|
|
||||||
- test/distributed/**/*mem*/**
|
|
||||||
|
|||||||
20
.github/merge_rules.yaml
vendored
20
.github/merge_rules.yaml
vendored
@ -540,26 +540,6 @@
|
|||||||
- Lint
|
- Lint
|
||||||
- pull
|
- pull
|
||||||
|
|
||||||
- name: PrivateUse1
|
|
||||||
patterns:
|
|
||||||
- torch/accelerator/**
|
|
||||||
- torch/utils/backend_registration.py
|
|
||||||
- torch/csrc/acc/**
|
|
||||||
- torch/csrc/DeviceAccelerator.*
|
|
||||||
- torch/csrc/profiler/standalone/privateuse1_observer.*
|
|
||||||
- aten/src/ATen/DeviceAccelerator.*
|
|
||||||
- aten/src/ATen/core/GeneratorForPrivateuseone.*
|
|
||||||
- aten/src/ATen/detail/PrivateUse1HooksInterface.*
|
|
||||||
- docs/source/accelerator/**
|
|
||||||
- test/cpp_extensions/open_registration_extension/torch_openreg/**
|
|
||||||
approved_by:
|
|
||||||
- albanD
|
|
||||||
- fffrog
|
|
||||||
mandatory_checks_name:
|
|
||||||
- EasyCLA
|
|
||||||
- Lint
|
|
||||||
- pull
|
|
||||||
|
|
||||||
- name: superuser
|
- name: superuser
|
||||||
patterns:
|
patterns:
|
||||||
- '*'
|
- '*'
|
||||||
|
|||||||
1
.github/nitpicks.yml
vendored
1
.github/nitpicks.yml
vendored
@ -10,4 +10,3 @@
|
|||||||
pathFilter:
|
pathFilter:
|
||||||
- 'torch/csrc/inductor/aoti_torch/c/*'
|
- 'torch/csrc/inductor/aoti_torch/c/*'
|
||||||
- 'torch/csrc/inductor/aoti_torch/generated/*'
|
- 'torch/csrc/inductor/aoti_torch/generated/*'
|
||||||
- 'torch/csrc/stable/c/*'
|
|
||||||
|
|||||||
12
.github/pytorch-probot.yml
vendored
12
.github/pytorch-probot.yml
vendored
@ -2,7 +2,6 @@ tracking_issue: 24422
|
|||||||
ciflow_tracking_issue: 64124
|
ciflow_tracking_issue: 64124
|
||||||
ciflow_push_tags:
|
ciflow_push_tags:
|
||||||
- ciflow/b200
|
- ciflow/b200
|
||||||
- ciflow/b200-distributed
|
|
||||||
- ciflow/b200-symm-mem
|
- ciflow/b200-symm-mem
|
||||||
- ciflow/binaries
|
- ciflow/binaries
|
||||||
- ciflow/binaries_libtorch
|
- ciflow/binaries_libtorch
|
||||||
@ -16,32 +15,23 @@ ciflow_push_tags:
|
|||||||
- ciflow/inductor-micro-benchmark
|
- ciflow/inductor-micro-benchmark
|
||||||
- ciflow/inductor-micro-benchmark-cpu-x86
|
- ciflow/inductor-micro-benchmark-cpu-x86
|
||||||
- ciflow/inductor-perf-compare
|
- ciflow/inductor-perf-compare
|
||||||
- ciflow/inductor-perf-test-nightly-rocm-mi300
|
- ciflow/inductor-perf-test-nightly-rocm
|
||||||
- ciflow/inductor-perf-test-nightly-rocm-mi355
|
|
||||||
- ciflow/inductor-perf-test-nightly-x86-zen
|
- ciflow/inductor-perf-test-nightly-x86-zen
|
||||||
- ciflow/inductor-perf-test-nightly-xpu
|
|
||||||
- ciflow/inductor-periodic
|
- ciflow/inductor-periodic
|
||||||
- ciflow/inductor-rocm
|
- ciflow/inductor-rocm
|
||||||
- ciflow/inductor-rocm-mi200
|
|
||||||
- ciflow/inductor-rocm-mi300
|
|
||||||
- ciflow/linux-aarch64
|
- ciflow/linux-aarch64
|
||||||
- ciflow/mps
|
- ciflow/mps
|
||||||
- ciflow/nightly
|
- ciflow/nightly
|
||||||
- ciflow/op-benchmark
|
- ciflow/op-benchmark
|
||||||
- ciflow/periodic
|
- ciflow/periodic
|
||||||
- ciflow/periodic-rocm-mi200
|
|
||||||
- ciflow/periodic-rocm-mi300
|
- ciflow/periodic-rocm-mi300
|
||||||
- ciflow/pull
|
- ciflow/pull
|
||||||
- ciflow/quantization-periodic
|
- ciflow/quantization-periodic
|
||||||
- ciflow/riscv64
|
- ciflow/riscv64
|
||||||
- ciflow/rocm
|
- ciflow/rocm
|
||||||
- ciflow/rocm-mi200
|
|
||||||
- ciflow/rocm-mi300
|
- ciflow/rocm-mi300
|
||||||
- ciflow/rocm-mi355
|
|
||||||
- ciflow/rocm-navi31
|
|
||||||
- ciflow/s390
|
- ciflow/s390
|
||||||
- ciflow/slow
|
- ciflow/slow
|
||||||
- ciflow/slow-rocm-mi200
|
|
||||||
- ciflow/torchbench
|
- ciflow/torchbench
|
||||||
- ciflow/triton_binaries
|
- ciflow/triton_binaries
|
||||||
- ciflow/trunk
|
- ciflow/trunk
|
||||||
|
|||||||
3
.github/scripts/delete_old_branches.py
vendored
3
.github/scripts/delete_old_branches.py
vendored
@ -1,11 +1,10 @@
|
|||||||
# Delete old branches
|
# Delete old branches
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
from collections.abc import Callable
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any
|
from typing import Any, Callable
|
||||||
|
|
||||||
from github_utils import gh_fetch_json_dict, gh_graphql
|
from github_utils import gh_fetch_json_dict, gh_graphql
|
||||||
from gitutils import GitRepo
|
from gitutils import GitRepo
|
||||||
|
|||||||
BIN
.github/scripts/drci_mocks.json.gz
vendored
BIN
.github/scripts/drci_mocks.json.gz
vendored
Binary file not shown.
5
.github/scripts/filter_test_configs.py
vendored
5
.github/scripts/filter_test_configs.py
vendored
@ -8,11 +8,10 @@ import re
|
|||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
import warnings
|
import warnings
|
||||||
from collections.abc import Callable
|
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from functools import cache
|
from functools import cache
|
||||||
from logging import info
|
from logging import info
|
||||||
from typing import Any, Optional
|
from typing import Any, Callable, Optional
|
||||||
from urllib.request import Request, urlopen
|
from urllib.request import Request, urlopen
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
@ -513,8 +512,6 @@ def perform_misc_tasks(
|
|||||||
"keep-going",
|
"keep-going",
|
||||||
branch == MAIN_BRANCH
|
branch == MAIN_BRANCH
|
||||||
or bool(tag and re.match(r"^trunk/[a-f0-9]{40}$", tag))
|
or bool(tag and re.match(r"^trunk/[a-f0-9]{40}$", tag))
|
||||||
# Pattern for tags created via manual run on HUD
|
|
||||||
or bool(tag and re.match(r"^ciflow/[^/]+/[a-f0-9]{40}$", tag))
|
|
||||||
or check_for_setting(labels, pr_body, "keep-going"),
|
or check_for_setting(labels, pr_body, "keep-going"),
|
||||||
)
|
)
|
||||||
set_output(
|
set_output(
|
||||||
|
|||||||
151
.github/scripts/generate_binary_build_matrix.py
vendored
151
.github/scripts/generate_binary_build_matrix.py
vendored
@ -11,33 +11,26 @@ architectures:
|
|||||||
* Latest XPU
|
* Latest XPU
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
SCRIPT_DIR = Path(__file__).absolute().parent
|
# NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
|
||||||
REPO_ROOT = SCRIPT_DIR.parent.parent
|
CUDA_ARCHES = ["12.6", "12.8", "13.0"]
|
||||||
|
|
||||||
|
|
||||||
CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
|
|
||||||
CUDA_STABLE = "12.8"
|
CUDA_STABLE = "12.8"
|
||||||
CUDA_ARCHES_FULL_VERSION = {
|
CUDA_ARCHES_FULL_VERSION = {
|
||||||
"12.6": "12.6.3",
|
"12.6": "12.6.3",
|
||||||
"12.8": "12.8.1",
|
"12.8": "12.8.1",
|
||||||
"12.9": "12.9.1",
|
|
||||||
"13.0": "13.0.0",
|
"13.0": "13.0.0",
|
||||||
}
|
}
|
||||||
CUDA_ARCHES_CUDNN_VERSION = {
|
CUDA_ARCHES_CUDNN_VERSION = {
|
||||||
"12.6": "9",
|
"12.6": "9",
|
||||||
"12.8": "9",
|
"12.8": "9",
|
||||||
"12.9": "9",
|
|
||||||
"13.0": "9",
|
"13.0": "9",
|
||||||
}
|
}
|
||||||
|
|
||||||
ROCM_ARCHES = ["7.0", "7.1"]
|
# NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
|
||||||
|
ROCM_ARCHES = ["6.4", "7.0"]
|
||||||
|
|
||||||
XPU_ARCHES = ["xpu"]
|
XPU_ARCHES = ["xpu"]
|
||||||
|
|
||||||
@ -45,7 +38,7 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]
|
|||||||
|
|
||||||
CPU_S390X_ARCH = ["cpu-s390x"]
|
CPU_S390X_ARCH = ["cpu-s390x"]
|
||||||
|
|
||||||
CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "12.9-aarch64", "13.0-aarch64"]
|
CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"]
|
||||||
|
|
||||||
|
|
||||||
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
||||||
@ -61,7 +54,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
|||||||
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
|
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
|
||||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||||
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
|
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
|
||||||
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
|
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
|
||||||
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
|
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
|
||||||
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
|
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
|
||||||
@ -78,44 +71,27 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
|||||||
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
|
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
|
||||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
||||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
||||||
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
|
"nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
|
||||||
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
|
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
|
||||||
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
|
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
|
||||||
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
|
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
|
||||||
),
|
),
|
||||||
"12.9": (
|
|
||||||
"nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | "
|
|
||||||
"nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | "
|
|
||||||
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | "
|
|
||||||
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
|
|
||||||
"nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | "
|
|
||||||
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | "
|
|
||||||
"nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | "
|
|
||||||
"nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | "
|
|
||||||
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
|
|
||||||
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
|
|
||||||
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
|
|
||||||
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | "
|
|
||||||
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
|
|
||||||
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
|
|
||||||
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
|
|
||||||
),
|
|
||||||
"13.0": (
|
"13.0": (
|
||||||
"nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | "
|
"nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
|
||||||
"nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | "
|
"nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
|
||||||
"nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | "
|
"nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
|
||||||
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
|
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
|
||||||
"nvidia-cublas==13.1.0.3; platform_system == 'Linux' | "
|
"nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
|
||||||
"nvidia-cufft==12.0.0.61; platform_system == 'Linux' | "
|
"nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
|
||||||
"nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
|
"nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
|
||||||
"nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | "
|
"nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
|
||||||
"nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | "
|
"nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
|
||||||
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
|
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
|
||||||
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
|
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
|
||||||
"nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | "
|
"nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
|
||||||
"nvidia-nvtx==13.0.85; platform_system == 'Linux' | "
|
"nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
|
||||||
"nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | "
|
"nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
|
||||||
"nvidia-cufile==1.15.1.6; platform_system == 'Linux'"
|
"nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
|
||||||
),
|
),
|
||||||
"xpu": (
|
"xpu": (
|
||||||
"intel-cmplr-lib-rt==2025.2.1 | "
|
"intel-cmplr-lib-rt==2025.2.1 | "
|
||||||
@ -142,48 +118,9 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# Used by tools/nightly.py
|
|
||||||
PYTORCH_NIGHTLY_PIP_INDEX_URL = "https://download.pytorch.org/whl/nightly"
|
|
||||||
NIGHTLY_SOURCE_MATRIX = {
|
|
||||||
"cpu": dict(
|
|
||||||
name="cpu",
|
|
||||||
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cpu",
|
|
||||||
supported_platforms=["Linux", "macOS", "Windows"],
|
|
||||||
accelerator="cpu",
|
|
||||||
)
|
|
||||||
}
|
|
||||||
CUDA_NIGHTLY_SOURCE_MATRIX = {
|
|
||||||
f"cuda-{major}.{minor}": dict(
|
|
||||||
name=f"cuda-{major}.{minor}",
|
|
||||||
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cu{major}{minor}",
|
|
||||||
supported_platforms=["Linux", "Windows"],
|
|
||||||
accelerator="cuda",
|
|
||||||
)
|
|
||||||
for major, minor in (map(int, version.split(".")) for version in CUDA_ARCHES)
|
|
||||||
}
|
|
||||||
ROCM_NIGHTLY_SOURCE_MATRIX = {
|
|
||||||
f"rocm-{major}.{minor}": dict(
|
|
||||||
name=f"rocm-{major}.{minor}",
|
|
||||||
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/rocm{major}.{minor}",
|
|
||||||
supported_platforms=["Linux"],
|
|
||||||
accelerator="rocm",
|
|
||||||
)
|
|
||||||
for major, minor in (map(int, version.split(".")) for version in ROCM_ARCHES)
|
|
||||||
}
|
|
||||||
XPU_NIGHTLY_SOURCE_MATRIX = {
|
|
||||||
"xpu": dict(
|
|
||||||
name="xpu",
|
|
||||||
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/xpu",
|
|
||||||
supported_platforms=["Linux"],
|
|
||||||
accelerator="xpu",
|
|
||||||
)
|
|
||||||
}
|
|
||||||
NIGHTLY_SOURCE_MATRIX.update(CUDA_NIGHTLY_SOURCE_MATRIX)
|
|
||||||
NIGHTLY_SOURCE_MATRIX.update(ROCM_NIGHTLY_SOURCE_MATRIX)
|
|
||||||
NIGHTLY_SOURCE_MATRIX.update(XPU_NIGHTLY_SOURCE_MATRIX)
|
|
||||||
|
|
||||||
|
|
||||||
def get_nccl_wheel_version(arch_version: str) -> str:
|
def get_nccl_wheel_version(arch_version: str) -> str:
|
||||||
|
import re
|
||||||
|
|
||||||
requirements = map(
|
requirements = map(
|
||||||
str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
|
str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
|
||||||
)
|
)
|
||||||
@ -191,14 +128,17 @@ def get_nccl_wheel_version(arch_version: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def read_nccl_pin(arch_version: str) -> str:
|
def read_nccl_pin(arch_version: str) -> str:
|
||||||
nccl_pin_path = (
|
from pathlib import Path
|
||||||
REPO_ROOT
|
|
||||||
/ ".ci"
|
nccl_pin_path = os.path.join(
|
||||||
/ "docker"
|
Path(__file__).absolute().parents[2],
|
||||||
/ "ci_commit_pins"
|
".ci",
|
||||||
/ f"nccl-cu{arch_version[:2]}.txt"
|
"docker",
|
||||||
|
"ci_commit_pins",
|
||||||
|
f"nccl-cu{arch_version[:2]}.txt",
|
||||||
)
|
)
|
||||||
return nccl_pin_path.read_text().strip()
|
with open(nccl_pin_path) as f:
|
||||||
|
return f.read().strip()
|
||||||
|
|
||||||
|
|
||||||
def validate_nccl_dep_consistency(arch_version: str) -> None:
|
def validate_nccl_dep_consistency(arch_version: str) -> None:
|
||||||
@ -206,8 +146,7 @@ def validate_nccl_dep_consistency(arch_version: str) -> None:
|
|||||||
wheel_ver = get_nccl_wheel_version(arch_version)
|
wheel_ver = get_nccl_wheel_version(arch_version)
|
||||||
if not nccl_release_tag.startswith(f"v{wheel_ver}"):
|
if not nccl_release_tag.startswith(f"v{wheel_ver}"):
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"{arch_version} NCCL release tag version {nccl_release_tag} "
|
f"{arch_version} NCCL release tag version {nccl_release_tag} does not correspond to wheel version {wheel_ver}"
|
||||||
f"does not correspond to wheel version {wheel_ver}"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@ -283,11 +222,7 @@ def generate_libtorch_matrix(
|
|||||||
arches += CUDA_ARCHES
|
arches += CUDA_ARCHES
|
||||||
arches += ROCM_ARCHES
|
arches += ROCM_ARCHES
|
||||||
elif os == "windows":
|
elif os == "windows":
|
||||||
# TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up
|
arches += CUDA_ARCHES
|
||||||
# in 2.10
|
|
||||||
windows_cuda_arches = CUDA_ARCHES.copy()
|
|
||||||
windows_cuda_arches.remove("12.9")
|
|
||||||
arches += windows_cuda_arches
|
|
||||||
if libtorch_variants is None:
|
if libtorch_variants is None:
|
||||||
libtorch_variants = [
|
libtorch_variants = [
|
||||||
"shared-with-deps",
|
"shared-with-deps",
|
||||||
@ -351,11 +286,7 @@ def generate_wheels_matrix(
|
|||||||
if os == "linux":
|
if os == "linux":
|
||||||
arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
|
arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
|
||||||
elif os == "windows":
|
elif os == "windows":
|
||||||
# TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up
|
arches += CUDA_ARCHES + XPU_ARCHES
|
||||||
# in 2.10
|
|
||||||
windows_cuda_arches = CUDA_ARCHES.copy()
|
|
||||||
windows_cuda_arches.remove("12.9")
|
|
||||||
arches += windows_cuda_arches + XPU_ARCHES
|
|
||||||
elif os == "linux-aarch64":
|
elif os == "linux-aarch64":
|
||||||
# Separate new if as the CPU type is different and
|
# Separate new if as the CPU type is different and
|
||||||
# uses different build/test scripts
|
# uses different build/test scripts
|
||||||
@ -391,7 +322,7 @@ def generate_wheels_matrix(
|
|||||||
# cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
|
# cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
|
||||||
|
|
||||||
if (
|
if (
|
||||||
arch_version in ["13.0", "12.9", "12.8", "12.6"]
|
arch_version in ["13.0", "12.8", "12.6"]
|
||||||
and os == "linux"
|
and os == "linux"
|
||||||
or arch_version in CUDA_AARCH64_ARCHES
|
or arch_version in CUDA_AARCH64_ARCHES
|
||||||
):
|
):
|
||||||
@ -454,14 +385,6 @@ def generate_wheels_matrix(
|
|||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
||||||
arch_version = ""
|
validate_nccl_dep_consistency("13.0")
|
||||||
for arch_version in CUDA_ARCHES:
|
validate_nccl_dep_consistency("12.8")
|
||||||
validate_nccl_dep_consistency(arch_version)
|
validate_nccl_dep_consistency("12.6")
|
||||||
del arch_version
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
# Used by tools/nightly.py
|
|
||||||
(SCRIPT_DIR / "nightly_source_matrix.json").write_text(
|
|
||||||
json.dumps(NIGHTLY_SOURCE_MATRIX, indent=4) + "\n"
|
|
||||||
)
|
|
||||||
|
|||||||
3
.github/scripts/get_workflow_job_id.py
vendored
3
.github/scripts/get_workflow_job_id.py
vendored
@ -11,8 +11,7 @@ import sys
|
|||||||
import time
|
import time
|
||||||
import urllib
|
import urllib
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
from collections.abc import Callable
|
from typing import Any, Callable, Optional
|
||||||
from typing import Any, Optional
|
|
||||||
from urllib.request import Request, urlopen
|
from urllib.request import Request, urlopen
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
4
.github/scripts/github_utils.py
vendored
4
.github/scripts/github_utils.py
vendored
@ -3,9 +3,8 @@
|
|||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import warnings
|
import warnings
|
||||||
from collections.abc import Callable
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Any, cast, Optional, Union
|
from typing import Any, Callable, cast, Optional, Union
|
||||||
from urllib.error import HTTPError
|
from urllib.error import HTTPError
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
from urllib.request import Request, urlopen
|
from urllib.request import Request, urlopen
|
||||||
@ -19,7 +18,6 @@ class GitHubComment:
|
|||||||
body_text: str
|
body_text: str
|
||||||
created_at: str
|
created_at: str
|
||||||
author_login: str
|
author_login: str
|
||||||
author_url: Optional[str]
|
|
||||||
author_association: str
|
author_association: str
|
||||||
editor_login: Optional[str]
|
editor_login: Optional[str]
|
||||||
database_id: int
|
database_id: int
|
||||||
|
|||||||
4
.github/scripts/gitutils.py
vendored
4
.github/scripts/gitutils.py
vendored
@ -4,10 +4,10 @@ import os
|
|||||||
import re
|
import re
|
||||||
import tempfile
|
import tempfile
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from collections.abc import Callable, Iterator
|
from collections.abc import Iterator
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
from typing import Any, cast, Optional, TypeVar, Union
|
from typing import Any, Callable, cast, Optional, TypeVar, Union
|
||||||
|
|
||||||
|
|
||||||
T = TypeVar("T")
|
T = TypeVar("T")
|
||||||
|
|||||||
BIN
.github/scripts/gql_mocks.json.gz
vendored
BIN
.github/scripts/gql_mocks.json.gz
vendored
Binary file not shown.
2
.github/scripts/test_check_labels.py
vendored
2
.github/scripts/test_check_labels.py
vendored
@ -38,7 +38,6 @@ def mock_get_comments() -> list[GitHubComment]:
|
|||||||
body_text="mock_body_text",
|
body_text="mock_body_text",
|
||||||
created_at="",
|
created_at="",
|
||||||
author_login="",
|
author_login="",
|
||||||
author_url=None,
|
|
||||||
author_association="",
|
author_association="",
|
||||||
editor_login=None,
|
editor_login=None,
|
||||||
database_id=1,
|
database_id=1,
|
||||||
@ -49,7 +48,6 @@ def mock_get_comments() -> list[GitHubComment]:
|
|||||||
body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""),
|
body_text=" #" + LABEL_ERR_MSG_TITLE.replace("`", ""),
|
||||||
created_at="",
|
created_at="",
|
||||||
author_login=BOT_AUTHORS[1],
|
author_login=BOT_AUTHORS[1],
|
||||||
author_url=None,
|
|
||||||
author_association="",
|
author_association="",
|
||||||
editor_login=None,
|
editor_login=None,
|
||||||
database_id=2,
|
database_id=2,
|
||||||
|
|||||||
18
.github/scripts/test_trymerge.py
vendored
18
.github/scripts/test_trymerge.py
vendored
@ -32,7 +32,6 @@ from trymerge import (
|
|||||||
main as trymerge_main,
|
main as trymerge_main,
|
||||||
MandatoryChecksMissingError,
|
MandatoryChecksMissingError,
|
||||||
MergeRule,
|
MergeRule,
|
||||||
PostCommentError,
|
|
||||||
RE_GHSTACK_DESC,
|
RE_GHSTACK_DESC,
|
||||||
read_merge_rules,
|
read_merge_rules,
|
||||||
remove_job_name_suffix,
|
remove_job_name_suffix,
|
||||||
@ -589,23 +588,6 @@ class TestTryMerge(TestCase):
|
|||||||
self.assertEqual(mock_merge_base, pr.get_merge_base())
|
self.assertEqual(mock_merge_base, pr.get_merge_base())
|
||||||
mocked_gh_fetch_merge_base.assert_called_once()
|
mocked_gh_fetch_merge_base.assert_called_once()
|
||||||
|
|
||||||
def test_app_can_revert(self, *args: Any) -> None:
|
|
||||||
pr = GitHubPR("pytorch", "pytorch", 164660)
|
|
||||||
repo = DummyGitRepo()
|
|
||||||
app_comment_id, impostor_comment_id = 3375785595, 3377647892
|
|
||||||
# Check that app can revert
|
|
||||||
self.assertIsNotNone(validate_revert(repo, pr, comment_id=app_comment_id))
|
|
||||||
# But impostor can not
|
|
||||||
self.assertRaises(
|
|
||||||
PostCommentError,
|
|
||||||
lambda: validate_revert(repo, pr, comment_id=impostor_comment_id),
|
|
||||||
)
|
|
||||||
# Despite it's name being the name of the bot
|
|
||||||
self.assertEqual(
|
|
||||||
pr.get_comment_by_id(impostor_comment_id).author_login,
|
|
||||||
"pytorch-auto-revert",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
|
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
|
||||||
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
|
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
|
||||||
|
|||||||
17
.github/scripts/trymerge.py
vendored
17
.github/scripts/trymerge.py
vendored
@ -17,12 +17,12 @@ import re
|
|||||||
import time
|
import time
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from collections.abc import Callable, Iterable
|
from collections.abc import Iterable
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from functools import cache
|
from functools import cache
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from re import Pattern
|
from re import Pattern
|
||||||
from typing import Any, cast, NamedTuple, Optional
|
from typing import Any, Callable, cast, NamedTuple, Optional
|
||||||
from warnings import warn
|
from warnings import warn
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
@ -234,7 +234,6 @@ query ($owner: String!, $name: String!, $number: Int!) {
|
|||||||
createdAt
|
createdAt
|
||||||
author {
|
author {
|
||||||
login
|
login
|
||||||
url
|
|
||||||
}
|
}
|
||||||
authorAssociation
|
authorAssociation
|
||||||
editor {
|
editor {
|
||||||
@ -1092,9 +1091,8 @@ class GitHubPR:
|
|||||||
editor = node["editor"]
|
editor = node["editor"]
|
||||||
return GitHubComment(
|
return GitHubComment(
|
||||||
body_text=node["bodyText"],
|
body_text=node["bodyText"],
|
||||||
created_at=node.get("createdAt", ""),
|
created_at=node["createdAt"] if "createdAt" in node else "",
|
||||||
author_login=node["author"]["login"],
|
author_login=node["author"]["login"],
|
||||||
author_url=node["author"].get("url", None),
|
|
||||||
author_association=node["authorAssociation"],
|
author_association=node["authorAssociation"],
|
||||||
editor_login=editor["login"] if editor else None,
|
editor_login=editor["login"] if editor else None,
|
||||||
database_id=node["databaseId"],
|
database_id=node["databaseId"],
|
||||||
@ -2031,17 +2029,16 @@ def validate_revert(
|
|||||||
# For some reason, one can not be a member of private repo, only CONTRIBUTOR
|
# For some reason, one can not be a member of private repo, only CONTRIBUTOR
|
||||||
if pr.is_base_repo_private():
|
if pr.is_base_repo_private():
|
||||||
allowed_reverters.append("CONTRIBUTOR")
|
allowed_reverters.append("CONTRIBUTOR")
|
||||||
# Special case the pytorch-auto-revert app, whose does not have association
|
|
||||||
# But should be able to issue revert command
|
|
||||||
if comment.author_url == "https://github.com/apps/pytorch-auto-revert":
|
|
||||||
allowed_reverters.append("NONE")
|
|
||||||
|
|
||||||
if author_association not in allowed_reverters:
|
if author_association not in allowed_reverters:
|
||||||
raise PostCommentError(
|
raise PostCommentError(
|
||||||
f"Will not revert as @{author_login} is not one of "
|
f"Will not revert as @{author_login} is not one of "
|
||||||
f"[{', '.join(allowed_reverters)}], but instead is {author_association}."
|
f"[{', '.join(allowed_reverters)}], but instead is {author_association}."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Raises exception if matching rule is not found, but ignores all status checks
|
||||||
|
find_matching_merge_rule(
|
||||||
|
pr, repo, skip_mandatory_checks=True, skip_internal_checks=True
|
||||||
|
)
|
||||||
commit_sha = get_pr_commit_sha(repo, pr)
|
commit_sha = get_pr_commit_sha(repo, pr)
|
||||||
return (author_login, commit_sha)
|
return (author_login, commit_sha)
|
||||||
|
|
||||||
|
|||||||
@ -177,9 +177,6 @@ jobs:
|
|||||||
runs-on: linux.rocm.gpu.mi250
|
runs-on: linux.rocm.gpu.mi250
|
||||||
timeout-minutes: !{{ common.timeout_minutes }}
|
timeout-minutes: !{{ common.timeout_minutes }}
|
||||||
!{{ upload.binary_env(config) }}
|
!{{ upload.binary_env(config) }}
|
||||||
permissions:
|
|
||||||
id-token: write
|
|
||||||
contents: read
|
|
||||||
steps:
|
steps:
|
||||||
- name: Setup ROCm
|
- name: Setup ROCm
|
||||||
uses: ./.github/actions/setup-rocm
|
uses: ./.github/actions/setup-rocm
|
||||||
|
|||||||
@ -26,8 +26,9 @@ name: !{{ build_environment }}
|
|||||||
- name: Setup Python
|
- name: Setup Python
|
||||||
uses: actions/setup-python@v6
|
uses: actions/setup-python@v6
|
||||||
with:
|
with:
|
||||||
|
# TODO: Removeme once 3.14 is out
|
||||||
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
|
||||||
python-version: "!{{ py_ver.strip('t') + ('.4' if '3.14' not in py_ver else '.0') }}"
|
python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
|
||||||
freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
|
freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
|
||||||
{%- endmacro %}
|
{%- endmacro %}
|
||||||
|
|
||||||
|
|||||||
@ -79,9 +79,9 @@ jobs:
|
|||||||
runs-on: "windows-11-arm64-preview"
|
runs-on: "windows-11-arm64-preview"
|
||||||
{%- else %}
|
{%- else %}
|
||||||
{%- if branches == "nightly" %}
|
{%- if branches == "nightly" %}
|
||||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
|
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
|
||||||
{%- else %}
|
{%- else %}
|
||||||
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge.nonephemeral"
|
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
|
||||||
{%- endif %}
|
{%- endif %}
|
||||||
{%- endif %}
|
{%- endif %}
|
||||||
timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
|
timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
|
||||||
|
|||||||
2
.github/workflows/_docs.yml
vendored
2
.github/workflows/_docs.yml
vendored
@ -72,7 +72,7 @@ jobs:
|
|||||||
# Let's try to figure out how this can be improved
|
# Let's try to figure out how this can be improved
|
||||||
timeout-minutes: 360
|
timeout-minutes: 360
|
||||||
- docs_type: python
|
- docs_type: python
|
||||||
runner: ${{ inputs.runner_prefix }}linux.c7i.2xlarge
|
runner: ${{ inputs.runner_prefix }}linux.2xlarge
|
||||||
# It takes less than 30m to finish python docs unless there are issues
|
# It takes less than 30m to finish python docs unless there are issues
|
||||||
timeout-minutes: 30
|
timeout-minutes: 30
|
||||||
# Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
|
# Set a fixed name for this job instead of using the current matrix-generated name, i.e. build-docs (cpp, linux.12xlarge, 180)
|
||||||
|
|||||||
2
.github/workflows/_linux-build.yml
vendored
2
.github/workflows/_linux-build.yml
vendored
@ -37,7 +37,7 @@ on:
|
|||||||
runner:
|
runner:
|
||||||
required: false
|
required: false
|
||||||
type: string
|
type: string
|
||||||
default: "linux.c7i.2xlarge"
|
default: "linux.2xlarge"
|
||||||
description: |
|
description: |
|
||||||
Label of the runner this job should run on.
|
Label of the runner this job should run on.
|
||||||
test-matrix:
|
test-matrix:
|
||||||
|
|||||||
42
.github/workflows/_linux-test.yml
vendored
42
.github/workflows/_linux-test.yml
vendored
@ -224,46 +224,6 @@ jobs:
|
|||||||
continue-on-error: true
|
continue-on-error: true
|
||||||
uses: ./.github/actions/download-td-artifacts
|
uses: ./.github/actions/download-td-artifacts
|
||||||
|
|
||||||
- name: Download Windows torch wheel for cross-compilation
|
|
||||||
if: matrix.win_torch_wheel_artifact != ''
|
|
||||||
uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # v4.2.0
|
|
||||||
with:
|
|
||||||
name: ${{ matrix.win_torch_wheel_artifact }}
|
|
||||||
path: win-torch-wheel
|
|
||||||
|
|
||||||
- name: Extract Windows wheel and setup CUDA libraries
|
|
||||||
if: matrix.win_torch_wheel_artifact != ''
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
set -x
|
|
||||||
|
|
||||||
# Find the wheel file
|
|
||||||
WHEEL_FILE=$(find win-torch-wheel -name "*.whl" -type f | head -n 1)
|
|
||||||
if [ -z "$WHEEL_FILE" ]; then
|
|
||||||
echo "Error: No wheel file found in win-torch-wheel directory"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
echo "Found wheel file: $WHEEL_FILE"
|
|
||||||
|
|
||||||
# Unzip the wheel file
|
|
||||||
unzip -q "$WHEEL_FILE" -d win-torch-wheel-extracted
|
|
||||||
echo "Extracted wheel contents"
|
|
||||||
|
|
||||||
# Setup CUDA libraries (cuda.lib and cudart.lib) directory
|
|
||||||
mkdir -p win-torch-wheel-extracted/lib/x64
|
|
||||||
if [ -f "win-torch-wheel/cuda.lib" ]; then
|
|
||||||
mv win-torch-wheel/cuda.lib win-torch-wheel-extracted/lib/x64/
|
|
||||||
echo "Moved cuda.lib to win-torch-wheel-extracted/lib/x64/"
|
|
||||||
fi
|
|
||||||
if [ -f "win-torch-wheel/cudart.lib" ]; then
|
|
||||||
mv win-torch-wheel/cudart.lib win-torch-wheel-extracted/lib/x64/
|
|
||||||
echo "Moved cudart.lib to win-torch-wheel-extracted/lib/x64/"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Verify CUDA libraries are present
|
|
||||||
echo "CUDA libraries:"
|
|
||||||
ls -la win-torch-wheel-extracted/lib/x64/ || echo "No CUDA libraries found"
|
|
||||||
|
|
||||||
- name: Parse ref
|
- name: Parse ref
|
||||||
id: parse-ref
|
id: parse-ref
|
||||||
run: .github/scripts/parse_ref.py
|
run: .github/scripts/parse_ref.py
|
||||||
@ -429,6 +389,8 @@ jobs:
|
|||||||
"${DOCKER_IMAGE}" \
|
"${DOCKER_IMAGE}" \
|
||||||
${DOCKER_SHELL_CMD}
|
${DOCKER_SHELL_CMD}
|
||||||
)
|
)
|
||||||
|
# Propagate download.pytorch.org IP to container
|
||||||
|
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
|
||||||
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
|
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
|
||||||
|
|
||||||
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
|
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
|
||||||
|
|||||||
17
.github/workflows/_rocm-test.yml
vendored
17
.github/workflows/_rocm-test.yml
vendored
@ -97,11 +97,24 @@ jobs:
|
|||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
|
ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx')
|
||||||
if [[ $ngpu -lt 2 ]]; then #We are temporarily reducing this down to 2 from 4 so that we can run tests on nodes with less gpus.
|
if [[ $ngpu -lt 4 ]]; then
|
||||||
echo "Error: only $ngpu GPU(s) detected, at least 2 GPUs are needed for distributed jobs"
|
echo "Error: only $ngpu GPU(s) detected, at least 4 GPUs are needed for distributed jobs"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
- name: configure aws credentials
|
||||||
|
id: aws_creds
|
||||||
|
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
|
||||||
|
with:
|
||||||
|
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
||||||
|
aws-region: us-east-1
|
||||||
|
role-duration-seconds: 18000
|
||||||
|
|
||||||
|
- name: Login to Amazon ECR
|
||||||
|
id: login-ecr
|
||||||
|
continue-on-error: true
|
||||||
|
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
|
||||||
|
|
||||||
- name: Calculate docker image
|
- name: Calculate docker image
|
||||||
id: calculate-docker-image
|
id: calculate-docker-image
|
||||||
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
||||||
|
|||||||
25
.github/workflows/_win-build.yml
vendored
25
.github/workflows/_win-build.yml
vendored
@ -168,31 +168,6 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
.ci/pytorch/win-build.sh
|
.ci/pytorch/win-build.sh
|
||||||
|
|
||||||
# Collect Windows torch libs and CUDA libs for cross-compilation
|
|
||||||
- name: Collect Windows CUDA libs for cross-compilation
|
|
||||||
if: steps.build.outcome != 'skipped' && inputs.cuda-version != 'cpu'
|
|
||||||
shell: bash
|
|
||||||
run: |
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Create directory structure if does not exist
|
|
||||||
mkdir -p /c/${{ github.run_id }}/build-results
|
|
||||||
|
|
||||||
# Copy CUDA libs
|
|
||||||
CUDA_PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${{ inputs.cuda-version }}"
|
|
||||||
|
|
||||||
if [ -f "${CUDA_PATH}/lib/x64/cuda.lib" ]; then
|
|
||||||
cp "${CUDA_PATH}/lib/x64/cuda.lib" /c/${{ github.run_id }}/build-results/
|
|
||||||
fi
|
|
||||||
|
|
||||||
if [ -f "${CUDA_PATH}/lib/x64/cudart.lib" ]; then
|
|
||||||
cp "${CUDA_PATH}/lib/x64/cudart.lib" /c/${{ github.run_id }}/build-results/
|
|
||||||
fi
|
|
||||||
|
|
||||||
# List collected files
|
|
||||||
echo "Collected CUDA libs:"
|
|
||||||
ls -lah /c/${{ github.run_id }}/build-results/*.lib
|
|
||||||
|
|
||||||
# Upload to github so that people can click and download artifacts
|
# Upload to github so that people can click and download artifacts
|
||||||
- name: Upload artifacts to s3
|
- name: Upload artifacts to s3
|
||||||
if: steps.build.outcome != 'skipped'
|
if: steps.build.outcome != 'skipped'
|
||||||
|
|||||||
29
.github/workflows/_xpu-test.yml
vendored
29
.github/workflows/_xpu-test.yml
vendored
@ -38,10 +38,6 @@ on:
|
|||||||
default: ""
|
default: ""
|
||||||
description: |
|
description: |
|
||||||
List of tests to include (empty string implies default list)
|
List of tests to include (empty string implies default list)
|
||||||
dashboard-tag:
|
|
||||||
required: false
|
|
||||||
type: string
|
|
||||||
default: ""
|
|
||||||
disable-monitor:
|
disable-monitor:
|
||||||
description: |
|
description: |
|
||||||
[Experimental] Disable utilization monitoring for tests.
|
[Experimental] Disable utilization monitoring for tests.
|
||||||
@ -62,11 +58,6 @@ on:
|
|||||||
required: false
|
required: false
|
||||||
type: number
|
type: number
|
||||||
default: 1
|
default: 1
|
||||||
secrets:
|
|
||||||
HUGGING_FACE_HUB_TOKEN:
|
|
||||||
required: false
|
|
||||||
description: |
|
|
||||||
HF Auth token to avoid rate limits when downloading models or datasets from hub
|
|
||||||
permissions:
|
permissions:
|
||||||
id-token: write
|
id-token: write
|
||||||
contents: read
|
contents: read
|
||||||
@ -205,8 +196,6 @@ jobs:
|
|||||||
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
|
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
|
||||||
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
|
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
|
||||||
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
|
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
|
||||||
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
|
|
||||||
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
|
|
||||||
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
|
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
|
||||||
run: |
|
run: |
|
||||||
# Fetch aws credential from IMDs
|
# Fetch aws credential from IMDs
|
||||||
@ -257,8 +246,6 @@ jobs:
|
|||||||
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
|
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
|
||||||
-e TESTS_TO_INCLUDE \
|
-e TESTS_TO_INCLUDE \
|
||||||
-e ZE_AFFINITY_MASK \
|
-e ZE_AFFINITY_MASK \
|
||||||
-e HUGGING_FACE_HUB_TOKEN \
|
|
||||||
-e DASHBOARD_TAG \
|
|
||||||
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
|
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
|
||||||
--ulimit stack=10485760:83886080 \
|
--ulimit stack=10485760:83886080 \
|
||||||
--ulimit core=0 \
|
--ulimit core=0 \
|
||||||
@ -344,21 +331,5 @@ jobs:
|
|||||||
if-no-files-found: ignore
|
if-no-files-found: ignore
|
||||||
path: ./**/core.[1-9]*
|
path: ./**/core.[1-9]*
|
||||||
|
|
||||||
- name: Authenticate with AWS
|
|
||||||
uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
|
|
||||||
with:
|
|
||||||
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
|
|
||||||
# The max duration enforced by the server side
|
|
||||||
role-duration-seconds: 18000
|
|
||||||
aws-region: us-east-1
|
|
||||||
|
|
||||||
- name: Upload the benchmark results
|
|
||||||
uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
|
|
||||||
with:
|
|
||||||
benchmark-results-dir: test/test-reports
|
|
||||||
dry-run: false
|
|
||||||
schema-version: v3
|
|
||||||
github-token: ${{ secrets.GITHUB_TOKEN }}
|
|
||||||
|
|
||||||
- name: Teardown XPU
|
- name: Teardown XPU
|
||||||
uses: ./.github/actions/teardown-xpu
|
uses: ./.github/actions/teardown-xpu
|
||||||
|
|||||||
61
.github/workflows/b200-distributed.yml
vendored
61
.github/workflows/b200-distributed.yml
vendored
@ -1,61 +0,0 @@
|
|||||||
name: CI for distributed tests on B200
|
|
||||||
|
|
||||||
on:
|
|
||||||
pull_request:
|
|
||||||
paths:
|
|
||||||
- .github/workflows/b200-distributed.yml
|
|
||||||
workflow_dispatch:
|
|
||||||
push:
|
|
||||||
tags:
|
|
||||||
- ciflow/b200-distributed/*
|
|
||||||
schedule:
|
|
||||||
- cron: 46 8 * * * # about 1:46am PDT
|
|
||||||
|
|
||||||
concurrency:
|
|
||||||
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
|
|
||||||
cancel-in-progress: true
|
|
||||||
|
|
||||||
permissions:
|
|
||||||
id-token: write
|
|
||||||
contents: read
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
|
|
||||||
get-label-type:
|
|
||||||
if: github.repository_owner == 'pytorch'
|
|
||||||
name: get-label-type
|
|
||||||
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
|
|
||||||
with:
|
|
||||||
triggering_actor: ${{ github.triggering_actor }}
|
|
||||||
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
|
|
||||||
curr_branch: ${{ github.head_ref || github.ref_name }}
|
|
||||||
curr_ref_type: ${{ github.ref_type }}
|
|
||||||
|
|
||||||
linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200:
|
|
||||||
name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed-b200
|
|
||||||
uses: ./.github/workflows/_linux-build.yml
|
|
||||||
needs: get-label-type
|
|
||||||
with:
|
|
||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
|
||||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
|
|
||||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
|
||||||
cuda-arch-list: '10.0'
|
|
||||||
test-matrix: |
|
|
||||||
{ include: [
|
|
||||||
{ config: "distributed", shard: 1, num_shards: 2, runner: "linux.dgx.b200.8" },
|
|
||||||
{ config: "distributed", shard: 2, num_shards: 2, runner: "linux.dgx.b200.8" },
|
|
||||||
]}
|
|
||||||
secrets: inherit
|
|
||||||
|
|
||||||
linux-jammy-cuda12_8-py3_10-gcc11-test-distributed-b200:
|
|
||||||
name: linux-jammy-cuda12.8-py3.10-gcc11-test-b200
|
|
||||||
uses: ./.github/workflows/_linux-test.yml
|
|
||||||
needs:
|
|
||||||
- linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200
|
|
||||||
with:
|
|
||||||
timeout-minutes: 1200
|
|
||||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
|
|
||||||
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.docker-image }}
|
|
||||||
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.test-matrix }}
|
|
||||||
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
|
|
||||||
secrets: inherit
|
|
||||||
1
.github/workflows/b200-symm-mem.yml
vendored
1
.github/workflows/b200-symm-mem.yml
vendored
@ -37,6 +37,7 @@ jobs:
|
|||||||
needs: get-label-type
|
needs: get-label-type
|
||||||
with:
|
with:
|
||||||
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
|
||||||
|
runner: linux.12xlarge.memory
|
||||||
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
|
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm100-symm
|
||||||
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
|
||||||
cuda-arch-list: '10.0'
|
cuda-arch-list: '10.0'
|
||||||
|
|||||||
2
.github/workflows/build-almalinux-images.yml
vendored
2
.github/workflows/build-almalinux-images.yml
vendored
@ -36,7 +36,7 @@ jobs:
|
|||||||
runs-on: linux.9xlarge.ephemeral
|
runs-on: linux.9xlarge.ephemeral
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm7.0", "rocm7.1", "cpu"]
|
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]
|
||||||
steps:
|
steps:
|
||||||
- name: Build docker image
|
- name: Build docker image
|
||||||
uses: pytorch/pytorch/.github/actions/binary-docker-build@main
|
uses: pytorch/pytorch/.github/actions/binary-docker-build@main
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user