Halves time spent in generating the key strings

Adds a direct MPS kernel path to the linear op and an MPS kernel caching mechanism for improved performance.
2025-10-23 23:04:52 +08:00 · 2025-04-22 11:32:36 -07:00 · 2025-04-22 11:32:34 -07:00
1346 changed files with 24401 additions and 55279 deletions
--- a/.ci/caffe2/test.sh
+++ b/.ci/caffe2/test.sh
@ -13,6 +13,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
  echo 'Skipping tests'
  exit 0
 fi
+if [[ "${BUILD_ENVIRONMENT}" == *-rocm* ]]; then
+  # temporary to locate some kernel issues on the CI nodes
+  export HSAKMT_DEBUG_LEVEL=4
+fi
 # These additional packages are needed for circleci ROCm builds.
 if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then
    # Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by
--- a/.ci/docker/README.md
+++ b/.ci/docker/README.md
@ -34,5 +34,5 @@ See `build.sh` for valid build environments (it's the giant switch).
 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest

 # Set flags (see build.sh) and build image
-sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
+sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
 ```
--- a/.ci/docker/almalinux/Dockerfile
+++ b/.ci/docker/almalinux/Dockerfile
@ -1,6 +1,5 @@
 ARG CUDA_VERSION=12.4
 ARG BASE_TARGET=cuda${CUDA_VERSION}
-ARG ROCM_IMAGE=rocm/dev-almalinux-8:6.3-complete
 FROM amd64/almalinux:8 as base

 ENV LC_ALL en_US.UTF-8
@ -9,6 +8,10 @@ ENV LANGUAGE en_US.UTF-8

 ARG DEVTOOLSET_VERSION=11

+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+
 RUN yum -y update
 RUN yum -y install epel-release
 RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
@ -38,12 +41,11 @@ RUN bash ./install_conda.sh && rm install_conda.sh

 # Install CUDA
 FROM base as cuda
-ARG CUDA_VERSION=12.6
+ARG CUDA_VERSION=12.4
 RUN rm -rf /usr/local/cuda-*
 ADD ./common/install_cuda.sh install_cuda.sh
 COPY ./common/install_nccl.sh install_nccl.sh
 COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
 ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
 # Preserve CUDA_VERSION for the builds
 ENV CUDA_VERSION=${CUDA_VERSION}
@ -54,20 +56,18 @@ FROM cuda as cuda11.8
 RUN bash ./install_cuda.sh 11.8
 ENV DESIRED_CUDA=11.8

+FROM cuda as cuda12.1
+RUN bash ./install_cuda.sh 12.1
+ENV DESIRED_CUDA=12.1
+
+FROM cuda as cuda12.4
+RUN bash ./install_cuda.sh 12.4
+ENV DESIRED_CUDA=12.4
+
 FROM cuda as cuda12.6
 RUN bash ./install_cuda.sh 12.6
 ENV DESIRED_CUDA=12.6

-FROM cuda as cuda12.8
-RUN bash ./install_cuda.sh 12.8
-ENV DESIRED_CUDA=12.8
-
-FROM ${ROCM_IMAGE} as rocm
-ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-ADD ./common/install_mkl.sh install_mkl.sh
-RUN bash ./install_mkl.sh && rm install_mkl.sh
-ENV MKLROOT /opt/intel
-
 # Install MNIST test data
 FROM base as mnist
 ADD ./common/install_mnist.sh install_mnist.sh
@ -75,8 +75,9 @@ RUN bash ./install_mnist.sh

 FROM base as all_cuda
 COPY --from=cuda11.8  /usr/local/cuda-11.8 /usr/local/cuda-11.8
+COPY --from=cuda12.1  /usr/local/cuda-12.1 /usr/local/cuda-12.1
+COPY --from=cuda12.4  /usr/local/cuda-12.4 /usr/local/cuda-12.4
 COPY --from=cuda12.6  /usr/local/cuda-12.6 /usr/local/cuda-12.6
-COPY --from=cuda12.4  /usr/local/cuda-12.8 /usr/local/cuda-12.8

 # Final step
 FROM ${BASE_TARGET} as final
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -15,16 +15,9 @@ fi
 DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')

 CUDA_VERSION=""
-ROCM_VERSION=""
-EXTRA_BUILD_ARGS=""
 if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
    # extract cuda version from image name and tag.  e.g. manylinux2_28-builder:cuda12.8 returns 12.8
    CUDA_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
-    EXTRA_BUILD_ARGS="--build-arg CUDA_VERSION=${CUDA_VERSION}"
-elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
-    # extract rocm version from image name and tag.  e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
-    ROCM_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
-    EXTRA_BUILD_ARGS="--build-arg ROCM_IMAGE=rocm/dev-almalinux-8:${ROCM_VERSION}-complete"
 fi

 case ${DOCKER_TAG_PREFIX} in
@ -34,9 +27,6 @@ case ${DOCKER_TAG_PREFIX} in
  cuda*)
    BASE_TARGET=cuda${CUDA_VERSION}
    ;;
-  rocm*)
-    BASE_TARGET=rocm
-    ;;
  *)
    echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"
    exit 1
@ -57,8 +47,8 @@ docker build \
  --target final \
  --progress plain \
  --build-arg "BASE_TARGET=${BASE_TARGET}" \
+  --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
  --build-arg "DEVTOOLSET_VERSION=11" \
-  ${EXTRA_BUILD_ARGS} \
  -t ${tmp_tag} \
  $@ \
  -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -85,6 +85,9 @@ elif [[ "$image" == *linter* ]]; then
  DOCKERFILE="linter/Dockerfile"
 fi

+# CMake 3.18 is needed to support CUDA17 language variant
+CMAKE_VERSION=3.18.5
+
 _UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
 _UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
 if [[ "$image" == *rocm* ]]; then
@ -92,21 +95,21 @@ if [[ "$image" == *rocm* ]]; then
  _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
 fi

-tag=$(echo $image | awk -F':' '{print $2}')
-
 # It's annoying to rename jobs every time you want to rewrite a
 # configuration, so we hardcode everything here rather than do it
 # from scratch
-case "$tag" in
+case "$image" in
  pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11)
    CUDA_VERSION=12.6.3
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
+    PROTOBUF=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks)
@ -114,10 +117,12 @@ case "$tag" in
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
+    PROTOBUF=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
@ -126,10 +131,12 @@ case "$tag" in
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=9
+    PROTOBUF=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
@ -138,10 +145,12 @@ case "$tag" in
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.13
    GCC_VERSION=9
+    PROTOBUF=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
@ -150,10 +159,12 @@ case "$tag" in
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
+    PROTOBUF=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
  pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
@ -161,10 +172,12 @@ case "$tag" in
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
+    PROTOBUF=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
@ -173,10 +186,12 @@ case "$tag" in
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=9
+    PROTOBUF=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
@ -185,10 +200,12 @@ case "$tag" in
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.13
    GCC_VERSION=9
+    PROTOBUF=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
@ -197,54 +214,68 @@ case "$tag" in
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
+    PROTOBUF=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
  pytorch-linux-focal-py3-clang10-onnx)
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=10
+    PROTOBUF=yes
    VISION=yes
+    CONDA_CMAKE=yes
    ONNX=yes
    ;;
  pytorch-linux-focal-py3.9-clang10)
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=10
+    PROTOBUF=yes
    VISION=yes
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
  pytorch-linux-focal-py3.11-clang10)
    ANACONDA_PYTHON_VERSION=3.11
    CLANG_VERSION=10
+    PROTOBUF=yes
    VISION=yes
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
  pytorch-linux-focal-py3.9-gcc9)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=9
+    PROTOBUF=yes
    VISION=yes
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-rocm-n-1-py3)
+  pytorch-linux-focal-rocm-n-1-py3)
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
+    PROTOBUF=yes
    VISION=yes
-    ROCM_VERSION=6.3
+    ROCM_VERSION=6.2.4
    NINJA_VERSION=1.9.0
+    CONDA_CMAKE=yes
    TRITON=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-jammy-rocm-n-py3)
+  pytorch-linux-focal-rocm-n-py3)
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
+    PROTOBUF=yes
    VISION=yes
-    ROCM_VERSION=6.4
+    ROCM_VERSION=6.3
    NINJA_VERSION=1.9.0
+    CONDA_CMAKE=yes
    TRITON=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@ -254,24 +285,30 @@ case "$tag" in
  pytorch-linux-jammy-xpu-2024.0-py3)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
+    PROTOBUF=yes
    VISION=yes
    XPU_VERSION=0.5
    NINJA_VERSION=1.9.0
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
  pytorch-linux-jammy-xpu-2025.0-py3)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
+    PROTOBUF=yes
    VISION=yes
    XPU_VERSION=2025.0
    NINJA_VERSION=1.9.0
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
    pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
+    PROTOBUF=yes
    VISION=yes
    KATEX=yes
+    CONDA_CMAKE=yes
    TRITON=yes
    DOCS=yes
    INDUCTOR_BENCHMARKS=yes
@ -281,30 +318,37 @@ case "$tag" in
    CUDA_VERSION=11.8
    CUDNN_VERSION=9
    CLANG_VERSION=12
+    PROTOBUF=yes
    VISION=yes
    TRITON=yes
    ;;
  pytorch-linux-jammy-py3-clang12-asan)
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=12
+    PROTOBUF=yes
    VISION=yes
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
  pytorch-linux-jammy-py3-clang15-asan)
    ANACONDA_PYTHON_VERSION=3.10
    CLANG_VERSION=15
+    CONDA_CMAKE=yes
    VISION=yes
    ;;
  pytorch-linux-jammy-py3-clang18-asan)
    ANACONDA_PYTHON_VERSION=3.10
    CLANG_VERSION=18
+    CONDA_CMAKE=yes
    VISION=yes
    ;;
  pytorch-linux-jammy-py3.9-gcc11)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
+    PROTOBUF=yes
    VISION=yes
    KATEX=yes
+    CONDA_CMAKE=yes
    TRITON=yes
    DOCS=yes
    UNINSTALL_DILL=yes
@ -312,12 +356,14 @@ case "$tag" in
  pytorch-linux-jammy-py3-clang12-executorch)
    ANACONDA_PYTHON_VERSION=3.10
    CLANG_VERSION=12
+    CONDA_CMAKE=yes
    EXECUTORCH=yes
    ;;
  pytorch-linux-jammy-py3.12-halide)
    CUDA_VERSION=12.6
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=11
+    CONDA_CMAKE=yes
    HALIDE=yes
    TRITON=yes
    ;;
@ -325,6 +371,7 @@ case "$tag" in
    CUDA_VERSION=12.6
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=11
+    CONDA_CMAKE=yes
    TRITON_CPU=yes
    ;;
  pytorch-linux-focal-linter)
@ -332,16 +379,20 @@ case "$tag" in
    # We will need to update mypy version eventually, but that's for another day. The task
    # would be to upgrade mypy to 1.0.0 with Python 3.11
    PYTHON_VERSION=3.9
+    PIP_CMAKE=yes
    ;;
  pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
    PYTHON_VERSION=3.9
    CUDA_VERSION=11.8
+    PIP_CMAKE=yes
    ;;
  pytorch-linux-jammy-aarch64-py3.10-gcc11)
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    ACL=yes
+    PROTOBUF=yes
    VISION=yes
+    CONDA_CMAKE=yes
    # snadampal: skipping llvm src build install because the current version
    # from pytorch/llvm:9.0.1 is x86 specific
    SKIP_LLVM_SRC_BUILD_INSTALL=yes
@ -350,7 +401,9 @@ case "$tag" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    ACL=yes
+    PROTOBUF=yes
    VISION=yes
+    CONDA_CMAKE=yes
    # snadampal: skipping llvm src build install because the current version
    # from pytorch/llvm:9.0.1 is x86 specific
    SKIP_LLVM_SRC_BUILD_INSTALL=yes
@ -358,6 +411,7 @@ case "$tag" in
    ;;
  *)
    # Catch-all for builds that are not hardcoded.
+    PROTOBUF=yes
    VISION=yes
    echo "image '$image' did not match an existing build configuration"
    if [[ "$image" == *py* ]]; then
@ -373,7 +427,8 @@ case "$tag" in
      TRITON=yes
      # To ensure that any ROCm config will build using conda cmake
      # and thus have LAPACK/MKL enabled
-      fi
+      CONDA_CMAKE=yes
+    fi
    if [[ "$image" == *centos7* ]]; then
      NINJA_VERSION=1.10.2
    fi
@ -389,6 +444,9 @@ case "$tag" in
    if [[ "$image" == *glibc* ]]; then
      extract_version_from_image_name glibc GLIBC_VERSION
    fi
+    if [[ "$image" == *cmake* ]]; then
+      extract_version_from_image_name cmake CMAKE_VERSION
+    fi
  ;;
 esac

@ -415,6 +473,7 @@ docker build \
       ${no_cache_flag} \
       ${progress_flag} \
       --build-arg "BUILD_ENVIRONMENT=${image}" \
+       --build-arg "PROTOBUF=${PROTOBUF:-}" \
       --build-arg "LLVMDEV=${LLVMDEV:-}" \
       --build-arg "VISION=${VISION:-}" \
       --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \
@ -429,6 +488,7 @@ docker build \
       --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
       --build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \
       --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
+       --build-arg "CMAKE_VERSION=${CMAKE_VERSION:-}" \
       --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
       --build-arg "KATEX=${KATEX:-}" \
       --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
@ -436,6 +496,8 @@ docker build \
       --build-arg "IMAGE_NAME=${IMAGE_NAME}" \
       --build-arg "UCX_COMMIT=${UCX_COMMIT}" \
       --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
+       --build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \
+       --build-arg "PIP_CMAKE=${PIP_CMAKE}" \
       --build-arg "TRITON=${TRITON}" \
       --build-arg "TRITON_CPU=${TRITON_CPU}" \
       --build-arg "ONNX=${ONNX}" \
@ -444,7 +506,6 @@ docker build \
       --build-arg "EXECUTORCH=${EXECUTORCH}" \
       --build-arg "HALIDE=${HALIDE}" \
       --build-arg "XPU_VERSION=${XPU_VERSION}" \
-       --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
       --build-arg "ACL=${ACL:-}" \
       --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
       --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
@ -521,12 +582,3 @@ elif [ "$HAS_TRITON" = "yes" ]; then
  echo "expecting triton to not be installed, but it is"
  exit 1
 fi
-
-# Sanity check cmake version.  Executorch reinstalls cmake and I'm not sure if
-# they support 4.0.0 yet, so exclude them from this check.
-CMAKE_VERSION=$(drun cmake --version)
-if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then
-  echo "CMake version is not 4.0.0:"
-  drun cmake --version
-  exit 1
-fi
--- a/.ci/docker/centos-rocm/Dockerfile
+++ b/.ci/docker/centos-rocm/Dockerfile
@ -40,6 +40,7 @@ RUN bash ./install_user.sh && rm install_user.sh

 # Install conda and other packages (e.g., numpy, pytest)
 ARG ANACONDA_PYTHON_VERSION
+ARG CONDA_CMAKE
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
@ -47,6 +48,13 @@ COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
 RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt

+# (optional) Install protobuf for ONNX
+ARG PROTOBUF
+COPY ./common/install_protobuf.sh install_protobuf.sh
+RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
+RUN rm install_protobuf.sh
+ENV INSTALLED_PROTOBUF ${PROTOBUF}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -74,6 +82,12 @@ ENV MAGMA_HOME /opt/rocm/magma
 ENV LANG en_US.utf8
 ENV LC_ALL en_US.utf8

+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
 COPY ./common/install_ninja.sh install_ninja.sh
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-b173722085b3f555d6ba4533d6bbaddfd7c71144
+381ae5d57d35c165d98df728380b20fbde350392
--- a/.ci/docker/common/install_base.sh
+++ b/.ci/docker/common/install_base.sh
@ -99,6 +99,9 @@ install_centos() {

  ccache_deps="asciidoc docbook-dtds docbook-style-xsl libxslt"
  numpy_deps="gcc-gfortran"
+  # Note: protobuf-c-{compiler,devel} on CentOS are too old to be used
+  # for Caffe2. That said, we still install them to make sure the build
+  # system opts to build/use protoc and libprotobuf from third-party.
  yum install -y \
    $ccache_deps \
    $numpy_deps \
--- a/.ci/docker/common/install_cache.sh
+++ b/.ci/docker/common/install_cache.sh
@ -9,7 +9,7 @@ install_ubuntu() {
  # Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh``
  apt-get install -y cargo
  echo "Checking out sccache repo"
-  git clone https://github.com/mozilla/sccache -b v0.10.0
+  git clone https://github.com/mozilla/sccache -b v0.9.1
  cd sccache
  echo "Building sccache"
  cargo build --release
--- a/.ci/docker/common/install_cmake.sh
+++ b/.ci/docker/common/install_cmake.sh
@ -0,0 +1,45 @@
+#!/bin/bash
+#
+# Install a pinned CMake release from cmake.org into /usr/local.
+#
+# Required environment:
+#   CMAKE_VERSION  full release version to install, e.g. 3.18.5
+
+set -ex
+
+# Fail early with a readable message instead of a bare non-zero test.
+if [ -z "$CMAKE_VERSION" ]; then
+  echo "CMAKE_VERSION must be set" >&2
+  exit 1
+fi
+
+# Remove system cmake install so it won't get used instead
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  ubuntu)
+    apt-get remove cmake -y
+    ;;
+  centos)
+    yum remove cmake -y
+    ;;
+  *)
+    echo "Unable to determine OS..."
+    exit 1
+    ;;
+esac
+
+# Turn 3.6.3 into v3.6. The dot is escaped and the major version may have
+# several digits, so the match cannot start in the middle of a number.
+path=$(echo "${CMAKE_VERSION}" | sed -e 's/^\([0-9]\+\.[0-9]\+\).*/v\1/')
+file="cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz"
+
+# Download and install specific CMake version in /usr/local
+pushd /tmp
+# -f makes curl exit non-zero on an HTTP error instead of saving the error
+# page as the tarball; -S keeps the error visible despite -s.
+curl -fsSO --retry 3 "https://cmake.org/files/${path}/${file}"
+# Operate on the exact file that was downloaded rather than a glob, so stale
+# cmake-*.tar.gz files in /tmp are neither extracted nor deleted.
+tar -C /usr/local --strip-components 1 --no-same-owner -zxf "${file}"
+rm -f "${file}"
+popd
--- a/.ci/docker/common/install_conda.sh
+++ b/.ci/docker/common/install_conda.sh
@ -75,11 +75,19 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
  # and libpython-static for torch deploy
  conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}"

+  # Use conda cmake in some cases. Conda cmake will be newer than our supported
+  # min version (3.5 for xenial and 3.10 for bionic), so we only do it in the
+  # builds that we know should use conda. Specifically, Ubuntu bionic and focal
+  # cannot find conda mkl with stock cmake, so we need a cmake from conda
+  if [ -n "${CONDA_CMAKE}" ]; then
+    conda_install cmake
+  fi
+
  # Magma package names are concatenation of CUDA major and minor ignoring revision
  # I.e. magma-cuda102 package corresponds to CUDA_VERSION=10.2 and CUDA_VERSION=10.2.89
  # Magma is installed from a tarball in the ossci-linux bucket into the conda env
  if [ -n "$CUDA_VERSION" ]; then
-    conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION})
+    ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION}) ${ANACONDA_PYTHON_VERSION}
  fi

  # Install some other packages, including those needed for Python test reporting
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -2,54 +2,64 @@

 set -ex

-arch_path=''
-targetarch=${TARGETARCH:-$(uname -m)}
-if [ ${targetarch} = 'amd64' ] || [ "${targetarch}" = 'x86_64' ]; then
-  arch_path='x86_64'
-else
-  arch_path='sbsa'
-fi
+CUDNN_VERSION=9.5.1.17

-function install_cuda {
-  version=$1
-  runfile=$2
-  major_minor=${version%.*}
-  rm -rf /usr/local/cuda-${major_minor} /usr/local/cuda
-  if [[ ${arch_path} == 'sbsa' ]]; then
-      runfile="${runfile}_sbsa"
-  fi
-  runfile="${runfile}.run"
-  wget -q https://developer.download.nvidia.com/compute/cuda/${version}/local_installers/${runfile} -O ${runfile}
-  chmod +x ${runfile}
-  ./${runfile} --toolkit --silent
-  rm -f ${runfile}
-  rm -f /usr/local/cuda && ln -s /usr/local/cuda-${major_minor} /usr/local/cuda
+function install_cusparselt_040 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
+    tar xf libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
+    cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
 }

-function install_cudnn {
-  cuda_major_version=$1
-  cudnn_version=$2
-  mkdir tmp_cudnn && cd tmp_cudnn
-  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-  filepath="cudnn-linux-${arch_path}-${cudnn_version}_cuda${cuda_major_version}-archive"
-  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-${arch_path}/${filepath}.tar.xz
-  tar xf ${filepath}.tar.xz
-  cp -a ${filepath}/include/* /usr/local/cuda/include/
-  cp -a ${filepath}/lib/* /usr/local/cuda/lib64/
-  cd ..
-  rm -rf tmp_cudnn
+function install_cusparselt_062 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz
+    tar xf libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz
+    cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+function install_cusparselt_063 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz
+    tar xf libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz
+    cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
 }

 function install_118 {
    CUDNN_VERSION=9.1.0.70
    echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.4.0"
-    install_cuda 11.8.0 cuda_11.8.0_520.61.05_linux
+    rm -rf /usr/local/cuda-11.8 /usr/local/cuda
+    # install CUDA 11.8.0 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
+    chmod +x cuda_11.8.0_520.61.05_linux.run
+    ./cuda_11.8.0_520.61.05_linux.run --toolkit --silent
+    rm -f cuda_11.8.0_520.61.05_linux.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.8 /usr/local/cuda

-    install_cudnn 11 $CUDNN_VERSION
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
+    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf tmp_cudnn

    CUDA_VERSION=11.8 bash install_nccl.sh

-    CUDA_VERSION=11.8 bash install_cusparselt.sh
+    install_cusparselt_040

    ldconfig
 }
@ -57,27 +67,52 @@ function install_118 {
 function install_124 {
  CUDNN_VERSION=9.1.0.70
  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.2"
-  install_cuda 12.4.1 cuda_12.4.1_550.54.15_linux
+  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
+  # install CUDA 12.4.1 in the same container
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
+  chmod +x cuda_12.4.1_550.54.15_linux.run
+  ./cuda_12.4.1_550.54.15_linux.run --toolkit --silent
+  rm -f cuda_12.4.1_550.54.15_linux.run
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda

-  install_cudnn 12 $CUDNN_VERSION
+  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+  mkdir tmp_cudnn && cd tmp_cudnn
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf tmp_cudnn

  CUDA_VERSION=12.4 bash install_nccl.sh

-  CUDA_VERSION=12.4 bash install_cusparselt.sh
+  install_cusparselt_062

  ldconfig
 }

 function install_126 {
-  CUDNN_VERSION=9.5.1.17
  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
-  install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux
+  rm -rf /usr/local/cuda-12.6 /usr/local/cuda
+  # install CUDA 12.6.3 in the same container
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run
+  chmod +x cuda_12.6.3_560.35.05_linux.run
+  ./cuda_12.6.3_560.35.05_linux.run --toolkit --silent
+  rm -f cuda_12.6.3_560.35.05_linux.run
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda

-  install_cudnn 12 $CUDNN_VERSION
+  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+  mkdir tmp_cudnn && cd tmp_cudnn
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf tmp_cudnn

  CUDA_VERSION=12.6 bash install_nccl.sh

-  CUDA_VERSION=12.6 bash install_cusparselt.sh
+  install_cusparselt_063

  ldconfig
 }
@ -184,15 +219,26 @@ function prune_126 {
 function install_128 {
  CUDNN_VERSION=9.8.0.87
  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
+  rm -rf /usr/local/cuda-12.8 /usr/local/cuda
  # install CUDA 12.8.0 in the same container
-  install_cuda 12.8.0 cuda_12.8.0_570.86.10_linux
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux.run
+  chmod +x cuda_12.8.0_570.86.10_linux.run
+  ./cuda_12.8.0_570.86.10_linux.run --toolkit --silent
+  rm -f cuda_12.8.0_570.86.10_linux.run
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.8 /usr/local/cuda

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-  install_cudnn 12 $CUDNN_VERSION
+  mkdir tmp_cudnn && cd tmp_cudnn
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf tmp_cudnn

  CUDA_VERSION=12.8 bash install_nccl.sh

-  CUDA_VERSION=12.8 bash install_cusparselt.sh
+  install_cusparselt_063

  ldconfig
 }
--- a/.ci/docker/common/install_cuda_aarch64.sh
+++ b/.ci/docker/common/install_cuda_aarch64.sh
@ -0,0 +1,65 @@
+#!/bin/bash
+# Script used only in CD pipeline
+#
+# Usage: install_cuda_aarch64.sh <cuda-version>...
+# Each argument selects a CUDA toolkit to install; only 12.8 is recognised.
+
+set -ex
+
+CUDNN_VERSION=9.8.0.87
+
+# Fetch the cuSparseLt 0.6.3.2 sbsa archive and copy its headers and
+# libraries into /usr/local/cuda.
+function install_cusparselt_063 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    local pkg="libcusparse_lt-linux-sbsa-0.6.3.2-archive"
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q "https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/${pkg}.tar.xz"
+    tar xf "${pkg}.tar.xz"
+    cp -a "${pkg}"/include/* /usr/local/cuda/include/
+    cp -a "${pkg}"/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+# Install the CUDA 12.8.0 sbsa toolkit and cuDNN ${CUDNN_VERSION}, run
+# install_nccl.sh, add cuSparseLt, then run ldconfig.
+function install_128 {
+  local runfile="cuda_12.8.0_570.86.10_linux_sbsa.run"
+  local cudnn_pkg="cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive"
+
+  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
+  rm -rf /usr/local/cuda-12.8 /usr/local/cuda
+  # install CUDA 12.8.0 in the same container
+  wget -q "https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/${runfile}"
+  chmod +x "${runfile}"
+  "./${runfile}" --toolkit --silent
+  rm -f "${runfile}"
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.8 /usr/local/cuda
+
+  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+  mkdir tmp_cudnn && cd tmp_cudnn
+  wget -q "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/${cudnn_pkg}.tar.xz" -O "${cudnn_pkg}.tar.xz"
+  tar xf "${cudnn_pkg}.tar.xz"
+  cp -a "${cudnn_pkg}"/include/* /usr/local/cuda/include/
+  cp -a "${cudnn_pkg}"/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf tmp_cudnn
+
+  CUDA_VERSION=12.8 bash install_nccl.sh
+
+  install_cusparselt_063
+
+  ldconfig
+}
+
+# Dispatch every command-line argument to the matching installer.
+for requested in "$@"
+do
+    case "${requested}" in
+    12.8) install_128
+        ;;
+    *) echo "bad argument ${requested}"; exit 1
+        ;;
+    esac
+done
--- a/.ci/docker/common/install_executorch.sh
+++ b/.ci/docker/common/install_executorch.sh
@ -13,7 +13,7 @@ clone_executorch() {
  # and fetch the target commit
  pushd executorch
  git checkout "${EXECUTORCH_PINNED_COMMIT}"
-  git submodule update --init --recursive
+  git submodule update --init
  popd

  chown -R jenkins executorch
--- a/.ci/docker/common/install_halide.sh
+++ b/.ci/docker/common/install_halide.sh
@ -17,7 +17,7 @@ if [ -n "${UBUNTU_VERSION}" ];then
                  libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev
 fi

-pip_install numpy scipy imageio cmake ninja
+conda_install numpy scipy imageio cmake ninja

 git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git
 cmake -DCMAKE_BUILD_TYPE=Release \
--- a/.ci/docker/common/install_inductor_benchmark_deps.sh
+++ b/.ci/docker/common/install_inductor_benchmark_deps.sh
@ -14,9 +14,16 @@ function install_timm() {
  local commit
  commit=$(get_pinned_commit timm)

+  # TODO (huydhn): There is no torchvision release on 3.13 when I write this, so
+  # I'm using nightly here instead. We just need the package to be able to install
+  # TIMM. Remove this once vision has a release on 3.13
+  if [[ "${ANACONDA_PYTHON_VERSION}" == "3.13" ]]; then
+    pip_install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
+  fi
+
  pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
  # Clean up
-  conda_run pip uninstall -y torch torchvision triton
+  conda_run pip uninstall -y cmake torch torchvision triton
 }

 # Pango is needed for weasyprint which is needed for doctr
--- a/.ci/docker/common/install_magma_conda.sh
+++ b/.ci/docker/common/install_magma_conda.sh
@ -1,23 +1,31 @@
 #!/usr/bin/env bash
-# Script that installs magma from tarball inside conda environment.
-# It replaces anaconda magma-cuda package which is no longer published.
-# Execute it inside active conda environment.
-# See issue: https://github.com/pytorch/pytorch/issues/138506
+# Script that replaces the magma install from a conda package

 set -eou pipefail

-cuda_version_nodot=${1/./}
-anaconda_dir=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
+# Downloads the prebuilt magma tarball and moves its headers and libraries into a conda env.
+#   $1: CUDA version (e.g. 12.4); the first "." is stripped to form the archive name
+#   $2: Python version selecting the target env /opt/conda/envs/py_<version>
+function do_install() {
+    cuda_version_nodot=${1/./}
+    anaconda_python_version=$2

-MAGMA_VERSION="2.6.1"
-magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
-(
-    set -x
-    tmp_dir=$(mktemp -d)
-    pushd ${tmp_dir}
-    curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
-    tar -xvf "${magma_archive}"
-    mv include/* "${anaconda_dir}/include/"
-    mv lib/* "${anaconda_dir}/lib"
-    popd
-)
+    MAGMA_VERSION="2.6.1"
+    magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
+
+    anaconda_dir="/opt/conda/envs/py_${anaconda_python_version}"
+    (
+        set -x
+        tmp_dir=$(mktemp -d)
+        pushd "${tmp_dir}"
+        curl -OLs "https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}"
+        tar -xvf "${magma_archive}"
+        mv include/* "${anaconda_dir}/include/"
+        mv lib/* "${anaconda_dir}/lib"
+        popd
+        # Drop the downloaded archive and leftovers so they do not stay in the image layer
+        rm -rf "${tmp_dir}"
+    )
+}
+
+do_install "$1" "$2"
--- a/.ci/docker/common/install_protobuf.sh
+++ b/.ci/docker/common/install_protobuf.sh
@ -0,0 +1,22 @@
+#!/bin/bash
+# Builds protobuf 3.17.3 from the upstream release tarball and installs it with `make install`.
+
+set -ex
+
+pb_dir="/usr/temp_pb_install_dir"
+mkdir -p $pb_dir
+
+# On the nvidia/cuda:9-cudnn7-devel-centos7 image we need this symlink or
+# else it will fail with
+#   g++: error: ./../lib64/crti.o: No such file or directory
+ln -s /usr/lib64 "$pb_dir/lib64"
+
+curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3
+
+tar -xvz --no-same-owner -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz
+# Leave two cores free, but never go below one job: `make -j` rejects zero or negative counts.
+NPROC=$(( $(nproc) - 2 ))
+if [ "${NPROC}" -lt 1 ]; then NPROC=1; fi
+pushd "$pb_dir" && ./configure && make -j${NPROC} && make -j${NPROC} check && sudo make -j${NPROC} install && sudo ldconfig
+popd
+rm -rf $pb_dir
--- a/.ci/docker/common/install_python.sh
+++ b/.ci/docker/common/install_python.sh
@ -13,3 +13,6 @@ source /var/lib/jenkins/ci_env/bin/activate

 python -mpip install --upgrade pip
 python -mpip install -r /opt/requirements-ci.txt
+if [ -n "${PIP_CMAKE}" ]; then
+  python -mpip install cmake==3.31.6
+fi
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -66,25 +66,17 @@ EOF
    done

    # ROCm 6.3 had a regression where initializing static code objects had significant overhead
-    # ROCm 6.4 did not yet fix the regression, also HIP branch names are different
-    if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]] || [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
-        if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then
-            HIP_BRANCH=rocm-6.3.x
-            VER_STR=6.3
-        elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
-            HIP_BRANCH=release/rocm-rel-6.4
-            VER_STR=6.4
-        fi
+    if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then
        # clr build needs CppHeaderParser but can only find it using conda's python
        /opt/conda/bin/python -m pip install CppHeaderParser
-        git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
+        git clone https://github.com/ROCm/HIP -b rocm-6.3.x
        HIP_COMMON_DIR=$(readlink -f HIP)
-        git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}-statco-hotfix
+        git clone https://github.com/jeffdaily/clr -b release/rocm-rel-6.3-statco-hotfix
        mkdir -p clr/build
        pushd clr/build
        cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
        make -j
-        cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.*
+        cp hipamd/lib/libamdhip64.so.6.3.* /opt/rocm/lib/libamdhip64.so.6.3.*
        popd
        rm -rf HIP clr
    fi
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@ -10,8 +10,12 @@ fi

 source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

-get_pip_version() {
-  conda_run pip list | grep -w $* | head -n 1 | awk '{print $2}'
+get_conda_version() {
+  as_jenkins conda list -n py_$ANACONDA_PYTHON_VERSION | grep -w $* | head -n 1 | awk '{print $2}'
+}
+
+conda_reinstall() {
+  as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $*
 }

 if [ -n "${XPU_VERSION}" ]; then
@ -33,9 +37,11 @@ if [ -n "${UBUNTU_VERSION}" ];then
    apt-get install -y gpg-agent
 fi

-# Keep the current cmake and numpy version here, so we can reinstall them later
-CMAKE_VERSION=$(get_pip_version cmake)
-NUMPY_VERSION=$(get_pip_version numpy)
+if [ -n "${CONDA_CMAKE}" ]; then
+  # Keep the current cmake and numpy version here, so we can reinstall them later
+  CMAKE_VERSION=$(get_conda_version cmake)
+  NUMPY_VERSION=$(get_conda_version numpy)
+fi

 if [ -z "${MAX_JOBS}" ]; then
    export MAX_JOBS=$(nproc)
@ -77,19 +83,17 @@ cp dist/*.whl /opt/triton
 # Install the wheel for docker builds that don't use multi stage
 pip_install dist/*.whl

-# TODO: This is to make sure that the same cmake and numpy version from install conda
-# script is used. Without this step, the newer cmake version (3.25.2) downloaded by
-# triton build step via pip will fail to detect conda MKL. Once that issue is fixed,
-# this can be removed.
-#
-# The correct numpy version also needs to be set here because conda claims that it
-# causes inconsistent environment.  Without this, conda will attempt to install the
-# latest numpy version, which fails ASAN tests with the following import error: Numba
-# needs NumPy 1.20 or less.
-# Note that we install numpy with pip as conda might not have the version we want
-if [ -n "${CMAKE_VERSION}" ]; then
-  pip_install "cmake==${CMAKE_VERSION}"
-fi
-if [ -n "${NUMPY_VERSION}" ]; then
-  pip_install "numpy==${NUMPY_VERSION}"
+if [ -n "${CONDA_CMAKE}" ]; then
+  # TODO: This is to make sure that the same cmake and numpy version from install conda
+  # script is used. Without this step, the newer cmake version (3.25.2) downloaded by
+  # triton build step via pip will fail to detect conda MKL. Once that issue is fixed,
+  # this can be removed.
+  #
+  # The correct numpy version also needs to be set here because conda claims that it
+  # causes inconsistent environment.  Without this, conda will attempt to install the
+  # latest numpy version, which fails ASAN tests with the following import error: Numba
+  # needs NumPy 1.20 or less.
+  conda_reinstall cmake="${CMAKE_VERSION}"
+  # Note that we install numpy with pip as conda might not have the version we want
+  pip_install --force-reinstall numpy=="${NUMPY_VERSION}"
 fi
--- a/.ci/docker/libtorch/Dockerfile
+++ b/.ci/docker/libtorch/Dockerfile
@ -51,7 +51,6 @@ ADD ./common/install_cuda.sh install_cuda.sh
 ADD ./common/install_magma.sh install_magma.sh
 COPY ./common/install_nccl.sh install_nccl.sh
 COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
 ENV CUDA_HOME /usr/local/cuda

 FROM cuda as cuda11.8
--- a/.ci/docker/linter-cuda/Dockerfile
+++ b/.ci/docker/linter-cuda/Dockerfile
@ -32,8 +32,7 @@ ARG CUDA_VERSION
 COPY ./common/install_cuda.sh install_cuda.sh
 COPY ./common/install_nccl.sh install_nccl.sh
 COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
-RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu* install_cusparselt.sh
+RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
 ENV DESIRED_CUDA ${CUDA_VERSION}
 ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

--- a/.ci/docker/linter/Dockerfile
+++ b/.ci/docker/linter/Dockerfile
@ -16,6 +16,7 @@ RUN bash ./install_user.sh && rm install_user.sh

 # Install conda and other packages (e.g., numpy, pytest)
 ARG PYTHON_VERSION
+ARG PIP_CMAKE
 ENV PATH /var/lib/jenkins/ci_env/bin:$PATH
 ENV VIRTUAL_ENV /var/lib/jenkins/ci_env
 COPY requirements-ci.txt /opt/requirements-ci.txt
--- a/.ci/docker/manywheel/Dockerfile
+++ b/.ci/docker/manywheel/Dockerfile
@ -0,0 +1,202 @@
+# syntax = docker/dockerfile:experimental
+ARG ROCM_VERSION=3.7
+ARG BASE_CUDA_VERSION=11.8
+
+ARG GPU_IMAGE=centos:7
+FROM centos:7 as base
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+
+ARG DEVTOOLSET_VERSION=9
+
+# Note: This patch is required since CentOS has reached EOL;
+# otherwise any yum install step will fail
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
+# Just add everything as a safe.directory for git since these will be used in multiple places with git
+RUN git config --global --add safe.directory '*'
+RUN yum install -y yum-utils centos-release-scl
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+# Note: After running yum-config-manager --enable rhel-server-rhscl-7-rpms
+# the patch is required once again. Somehow this step adds mirror.centos.org
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+RUN yum --enablerepo=extras install -y epel-release
+
+# cmake-3.18.4 from pip
+RUN yum install -y python3-pip && \
+    python3 -mpip install cmake==3.18.4 && \
+    ln -s /usr/local/bin/cmake /usr/bin/cmake
+
+RUN yum install -y autoconf aclocal automake make sudo
+
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+# EPEL for cmake
+FROM base as patchelf
+# Install patchelf
+ADD ./common/install_patchelf.sh install_patchelf.sh
+RUN bash ./install_patchelf.sh && rm install_patchelf.sh
+RUN cp $(which patchelf) /patchelf
+
+FROM patchelf as python
+# build python
+COPY manywheel/build_scripts /build_scripts
+ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh
+RUN bash build_scripts/build.sh && rm -r build_scripts
+
+FROM base as cuda
+ARG BASE_CUDA_VERSION=10.2
+# Install CUDA
+ADD ./common/install_cuda.sh install_cuda.sh
+COPY ./common/install_nccl.sh install_nccl.sh
+COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
+RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
+
+FROM base as intel
+# MKL
+ADD ./common/install_mkl.sh install_mkl.sh
+RUN bash ./install_mkl.sh && rm install_mkl.sh
+
+FROM base as magma
+ARG BASE_CUDA_VERSION=10.2
+# Install magma
+ADD ./common/install_magma.sh install_magma.sh
+RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
+
+FROM base as jni
+# Install java jni header
+ADD ./common/install_jni.sh install_jni.sh
+ADD ./java/jni.h jni.h
+RUN bash ./install_jni.sh && rm install_jni.sh
+
+FROM base as libpng
+# Install libpng
+ADD ./common/install_libpng.sh install_libpng.sh
+RUN bash ./install_libpng.sh && rm install_libpng.sh
+
+FROM ${GPU_IMAGE} as common
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+RUN yum install -y \
+        aclocal \
+        autoconf \
+        automake \
+        bison \
+        bzip2 \
+        curl \
+        diffutils \
+        file \
+        git \
+        make \
+        patch \
+        perl \
+        unzip \
+        util-linux \
+        wget \
+        which \
+        xz \
+        yasm
+RUN yum install -y \
+    https://repo.ius.io/ius-release-el7.rpm \
+    https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm
+
+RUN yum swap -y git git236-core
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+# Copy the artifacts built in the earlier stages
+COPY --from=openssl            /opt/openssl                          /opt/openssl
+COPY --from=python             /opt/python                           /opt/python
+COPY --from=python             /opt/_internal                        /opt/_internal
+COPY --from=python             /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel
+COPY --from=intel              /opt/intel                            /opt/intel
+COPY --from=patchelf           /usr/local/bin/patchelf               /usr/local/bin/patchelf
+COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h
+COPY --from=libpng             /usr/local/bin/png*                   /usr/local/bin/
+COPY --from=libpng             /usr/local/bin/libpng*                /usr/local/bin/
+COPY --from=libpng             /usr/local/include/png*               /usr/local/include/
+COPY --from=libpng             /usr/local/include/libpng*            /usr/local/include/
+COPY --from=libpng             /usr/local/lib/libpng*                /usr/local/lib/
+COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/lib/pkgconfig
+
+FROM common as cpu_final
+ARG BASE_CUDA_VERSION=10.1
+ARG DEVTOOLSET_VERSION=9
+# Install Anaconda
+ADD ./common/install_conda_docker.sh install_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh
+ENV PATH /opt/conda/bin:$PATH
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+
+RUN yum install -y yum-utils centos-release-scl
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+# cmake is already installed inside the rocm base image, so remove if present
+RUN rpm -e cmake || true
+# cmake-3.18.4 from pip
+RUN yum install -y python3-pip && \
+    python3 -mpip install cmake==3.18.4 && \
+    ln -s /usr/local/bin/cmake /usr/bin/cmake
+
+# ninja
+RUN yum install -y ninja-build
+
+FROM cpu_final as cuda_final
+RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
+RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
+ENV PATH=/usr/local/cuda/bin:$PATH
+
+FROM cpu_final as rocm_final
+ARG ROCM_VERSION=3.7
+ARG PYTORCH_ROCM_ARCH
+ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
+# Adding ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0)
+# find HIP works for ROCm5.7. Not needed for ROCm6.0 and above.
+# Remove below when ROCm5.7 is not in support matrix anymore.
+ENV ROCM_PATH /opt/rocm
+ENV MKLROOT /opt/intel
+# No need to install ROCm as base docker image should have full ROCm install
+#ADD ./common/install_rocm.sh install_rocm.sh
+#RUN ROCM_VERSION=${ROCM_VERSION} bash ./install_rocm.sh && rm install_rocm.sh
+ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
+RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
+# cmake3 is needed for the MIOpen build
+RUN ln -sf /usr/local/bin/cmake /usr/bin/cmake3
+ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
+RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh
+ADD ./common/install_miopen.sh install_miopen.sh
+RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
--- a/.ci/docker/manywheel/Dockerfile_2_28
+++ b/.ci/docker/manywheel/Dockerfile_2_28
@ -7,8 +7,8 @@ ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8

-ARG DEVTOOLSET_VERSION=13
-RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb
+ARG DEVTOOLSET_VERSION=11
+RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
 ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH

@ -33,13 +33,12 @@ RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6

 FROM base as cuda
-ARG BASE_CUDA_VERSION=12.6
+ARG BASE_CUDA_VERSION=11.8
 # Install CUDA
 ADD ./common/install_cuda.sh install_cuda.sh
 COPY ./common/install_nccl.sh install_nccl.sh
 COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
-RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu* install_cusparselt.sh
+RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu*

 FROM base as intel
 # MKL
@ -47,7 +46,7 @@ ADD ./common/install_mkl.sh install_mkl.sh
 RUN bash ./install_mkl.sh && rm install_mkl.sh

 FROM base as magma
-ARG BASE_CUDA_VERSION=12.6
+ARG BASE_CUDA_VERSION=10.2
 # Install magma
 ADD ./common/install_magma.sh install_magma.sh
 RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
@ -64,7 +63,7 @@ ADD ./common/install_libpng.sh install_libpng.sh
 RUN bash ./install_libpng.sh && rm install_libpng.sh

 FROM ${GPU_IMAGE} as common
-ARG DEVTOOLSET_VERSION=13
+ARG DEVTOOLSET_VERSION=11
 ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8
@ -87,12 +86,13 @@ RUN yum install -y \
        wget \
        which \
        xz \
-        glibc-langpack-en \
-        gcc-toolset-${DEVTOOLSET_VERSION}-gcc \
-        gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \
-        gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
-        gcc-toolset-${DEVTOOLSET_VERSION}-gdb
+        gcc-toolset-${DEVTOOLSET_VERSION}-toolchain \
+        glibc-langpack-en
+RUN yum install -y \
+    https://repo.ius.io/ius-release-el7.rpm \
+    https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm

+RUN yum swap -y git git236-core
 # git236+ would refuse to run git commands in repos owned by other users
 # Which causes version check to fail, as pytorch repo is bind-mounted into the image
 # Override this behaviour by treating every folder as safe
@ -116,8 +116,8 @@ COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/
 COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h

 FROM common as cpu_final
-ARG BASE_CUDA_VERSION=12.6
-ARG DEVTOOLSET_VERSION=13
+ARG BASE_CUDA_VERSION=11.8
+ARG DEVTOOLSET_VERSION=11
 # Install Anaconda
 ADD ./common/install_conda_docker.sh install_conda.sh
 RUN bash ./install_conda.sh && rm install_conda.sh
@ -156,11 +156,8 @@ ENV ROCM_PATH /opt/rocm
 # and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker
 RUN python3 -m pip install --upgrade pip && \
    python3 -mpip install cmake==3.28.4
-# replace the libdrm in /opt/amdgpu with custom amdgpu.ids lookup path
 ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
 RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
-# ROCm 6.4 rocm-smi depends on system drm.h header
-RUN yum install -y libdrm-devel
 ENV MKLROOT /opt/intel
 ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
 RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh
--- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
@ -1,6 +1,7 @@
 FROM quay.io/pypa/manylinux_2_28_aarch64 as base

-ARG GCCTOOLSET_VERSION=13
+# Graviton needs GCC 10 or above for the build. GCC12 is the default version in almalinux-8.
+ARG GCCTOOLSET_VERSION=11

 # Language variabes
 ENV LC_ALL=en_US.UTF-8
@ -35,10 +36,7 @@ RUN yum install -y \
  yasm \
  zstd \
  sudo \
-  gcc-toolset-${GCCTOOLSET_VERSION}-gcc \
-  gcc-toolset-${GCCTOOLSET_VERSION}-gcc-c++ \
-  gcc-toolset-${GCCTOOLSET_VERSION}-gcc-gfortran \
-  gcc-toolset-${GCCTOOLSET_VERSION}-gdb
+  gcc-toolset-${GCCTOOLSET_VERSION}-toolchain

 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
--- a/.ci/docker/manywheel/Dockerfile_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_aarch64
@ -0,0 +1,94 @@
+FROM quay.io/pypa/manylinux2014_aarch64 as base
+
+
+# Graviton needs GCC 10 for the build
+ARG DEVTOOLSET_VERSION=10
+
+# Language variables
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+
+# Installed needed OS packages. This is to support all
+# the binary builds (torch, vision, audio, text, data)
+RUN yum -y install epel-release
+RUN yum -y update
+RUN yum install -y \
+  autoconf \
+  automake \
+  bison \
+  bzip2 \
+  curl \
+  diffutils \
+  file \
+  git \
+  make \
+  patch \
+  perl \
+  unzip \
+  util-linux \
+  wget \
+  which \
+  xz \
+  yasm \
+  less \
+  zstd \
+  libgomp \
+  sudo \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
+  devtoolset-${DEVTOOLSET_VERSION}-binutils
+
+# Ensure the expected devtoolset is used
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+
+###############################################################################
+# libgfortran.a hack
+#
+# libgfortran.a from quay.io/pypa/manylinux2014_aarch64 is not compiled with -fPIC.
+# This causes __stack_chk_guard@@GLIBC_2.17 on pytorch build. To solve, get
+# ubuntu's libgfortran.a which is compiled with -fPIC
+# NOTE: Need a better way to get this library as Ubuntu's package can be removed by the vendor, or changed
+###############################################################################
+RUN cd ~/ \
+  && curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-4ubuntu2_arm64.deb \
+  && ar x ~/libgfortran-10-dev.deb \
+  && tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ \
+  && cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/
+
+# install cmake
+RUN yum install -y cmake3 && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+
+FROM base as openblas
+# Install openblas
+ADD ./common/install_openblas.sh install_openblas.sh
+RUN bash ./install_openblas.sh && rm install_openblas.sh
+
+FROM openssl as final
+# remove unnecessary python versions
+RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
+RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
+RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
+RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
+COPY --from=openblas     /opt/OpenBLAS/  /opt/OpenBLAS/
+ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
--- a/.ci/docker/manywheel/Dockerfile_cuda_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64
@ -1,7 +1,7 @@
 FROM quay.io/pypa/manylinux_2_28_aarch64 as base

 # Cuda ARM build needs gcc 11
-ARG DEVTOOLSET_VERSION=13
+ARG DEVTOOLSET_VERSION=11

 # Language variables
 ENV LC_ALL=en_US.UTF-8
@ -34,10 +34,7 @@ RUN yum install -y \
  zstd \
  libgomp \
  sudo \
-  gcc-toolset-${DEVTOOLSET_VERSION}-gcc \
-  gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \
-  gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
-  gcc-toolset-${DEVTOOLSET_VERSION}-gdb
+  gcc-toolset-${DEVTOOLSET_VERSION}-toolchain

 # Ensure the expected devtoolset is used
 ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
@ -69,11 +66,10 @@ RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
 FROM base as cuda
 ARG BASE_CUDA_VERSION
 # Install CUDA
-ADD ./common/install_cuda.sh install_cuda.sh
+ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh
 COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
 COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu* install_cusparselt.sh
+RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh install_nccl.sh ci_commit_pins/nccl-cu*

 FROM base as magma
 ARG BASE_CUDA_VERSION
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -32,13 +32,19 @@ case ${image} in
    manylinux2_28-builder:cpu)
        TARGET=cpu_final
        GPU_IMAGE=amd64/almalinux:8
-        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13"
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="2_28"
        ;;
+    manylinuxaarch64-builder:cpu-aarch64)
+        TARGET=final
+        GPU_IMAGE=arm64v8/centos:7
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10"
+        MANY_LINUX_VERSION="aarch64"
+        ;;
    manylinux2_28_aarch64-builder:cpu-aarch64)
        TARGET=final
        GPU_IMAGE=arm64v8/almalinux:8
-        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11 --build-arg NINJA_VERSION=1.12.1"
        MANY_LINUX_VERSION="2_28_aarch64"
        ;;
    manylinuxcxx11-abi-builder:cpu-cxx11-abi)
@ -53,27 +59,23 @@ case ${image} in
        DOCKER_GPU_BUILD_ARG=""
        MANY_LINUX_VERSION="s390x"
        ;;
-    manylinux2_28-builder:cuda11*)
+    manylinux2_28-builder:cuda*)
        TARGET=cuda_final
        GPU_IMAGE=amd64/almalinux:8
        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="2_28"
        ;;
-    manylinux2_28-builder:cuda12*)
-        TARGET=cuda_final
-        GPU_IMAGE=amd64/almalinux:8
-        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"
-        MANY_LINUX_VERSION="2_28"
-        ;;
    manylinuxaarch64-builder:cuda*)
        TARGET=cuda_final
-        GPU_IMAGE=amd64/almalinux:8
-        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"
+        GPU_IMAGE=arm64v8/centos:7
+        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="aarch64"
        DOCKERFILE_SUFFIX="_cuda_aarch64"
        ;;
    manylinux2_28-builder:rocm*)
        TARGET=rocm_final
+        GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete
+        DEVTOOLSET_VERSION="9"
        MANY_LINUX_VERSION="2_28"
        DEVTOOLSET_VERSION="11"
        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -379,6 +379,3 @@ dataclasses_json==0.6.7
 #Description: required for data pipeline and scripts under tools/stats
 #Pinned versions: 0.6.7
 #test that import:
-
-cmake==4.0.0
-#Description: required for building
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -1,7 +1,7 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@a98ffecb792d50df495be401becbf5c414421423#egg=pytorch_sphinx_theme2

 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought is probably
--- a/.ci/docker/ubuntu-cuda/Dockerfile
+++ b/.ci/docker/ubuntu-cuda/Dockerfile
@ -26,6 +26,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
 ARG ANACONDA_PYTHON_VERSION
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+ARG CONDA_CMAKE
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
@ -42,6 +43,13 @@ ARG CLANG_VERSION
 COPY ./common/install_clang.sh install_clang.sh
 RUN bash ./install_clang.sh && rm install_clang.sh

+# (optional) Install protobuf for ONNX
+ARG PROTOBUF
+COPY ./common/install_protobuf.sh install_protobuf.sh
+RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
+RUN rm install_protobuf.sh
+ENV INSTALLED_PROTOBUF ${PROTOBUF}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -75,6 +83,12 @@ COPY ci_commit_pins/timm.txt timm.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 ARG TRITON

 FROM base as triton-builder
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@ -27,6 +27,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
 ARG ANACONDA_PYTHON_VERSION
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+ARG CONDA_CMAKE
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
@ -42,6 +43,13 @@ ARG CLANG_VERSION
 COPY ./common/install_clang.sh install_clang.sh
 RUN bash ./install_clang.sh && rm install_clang.sh

+# (optional) Install protobuf for ONNX
+ARG PROTOBUF
+COPY ./common/install_protobuf.sh install_protobuf.sh
+RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
+RUN rm install_protobuf.sh
+ENV INSTALLED_PROTOBUF ${PROTOBUF}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -100,6 +108,12 @@ COPY ci_commit_pins/timm.txt timm.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
 COPY ./common/install_ninja.sh install_ninja.sh
--- a/.ci/docker/ubuntu-xpu/Dockerfile
+++ b/.ci/docker/ubuntu-xpu/Dockerfile
@ -28,6 +28,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh

 # Install conda and other packages (e.g., numpy, pytest)
 ARG ANACONDA_PYTHON_VERSION
+ARG CONDA_CMAKE
 ARG DOCS
 ARG BUILD_ENVIRONMENT
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
@ -83,6 +84,12 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
 RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
 ENV INSTALLED_VISION ${VISION}

+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
 COPY ./common/install_ninja.sh install_ninja.sh
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -28,6 +28,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh

 # Install conda and other packages (e.g., numpy, pytest)
 ARG ANACONDA_PYTHON_VERSION
+ARG CONDA_CMAKE
 ARG DOCS
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
@ -53,8 +54,7 @@ ARG CUDA_VERSION
 COPY ./common/install_cuda.sh install_cuda.sh
 COPY ./common/install_nccl.sh install_nccl.sh
 COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
-RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu* install_cusparselt.sh
+RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
 ENV DESIRED_CUDA ${CUDA_VERSION}
 ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
 # No effect if cuda not installed
@ -74,6 +74,13 @@ ADD ./common/install_ucc.sh install_ucc.sh
 RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi
 RUN rm install_ucc.sh

+# (optional) Install protobuf for ONNX
+ARG PROTOBUF
+COPY ./common/install_protobuf.sh install_protobuf.sh
+RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
+RUN rm install_protobuf.sh
+ENV INSTALLED_PROTOBUF ${PROTOBUF}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -81,6 +88,12 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
 RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
 ENV INSTALLED_VISION ${VISION}

+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
 COPY ./common/install_ninja.sh install_ninja.sh
--- a/.ci/magma-rocm/Makefile
+++ b/.ci/magma-rocm/Makefile
@ -12,12 +12,13 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	-w /builder \
 	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_ROCM_SHORT} \
 	-e DESIRED_ROCM=${DESIRED_ROCM} \
-	"pytorch/almalinux-builder:rocm${DESIRED_ROCM}" \
+	"pytorch/manylinux2_28-builder:rocm${DESIRED_ROCM}-main" \
 	magma-rocm/build_magma.sh

 .PHONY: all
 all: magma-rocm64
 all: magma-rocm63
+all: magma-rocm624

 .PHONY:
 clean:
@ -33,3 +34,8 @@ magma-rocm64:
 magma-rocm63: DESIRED_ROCM := 6.3
 magma-rocm63:
 	$(DOCKER_RUN)
+
+.PHONY: magma-rocm624
+magma-rocm624: DESIRED_ROCM := 6.2.4
+magma-rocm624:
+	$(DOCKER_RUN)
--- a/.ci/magma/Makefile
+++ b/.ci/magma/Makefile
@ -12,12 +12,13 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \
 	-e DESIRED_CUDA=${DESIRED_CUDA} \
 	-e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \
-	"pytorch/almalinux-builder:cuda${DESIRED_CUDA}-main" \
+	"pytorch/manylinux2_28-builder:cuda${DESIRED_CUDA}-main" \
 	magma/build_magma.sh

 .PHONY: all
 all: magma-cuda128
 all: magma-cuda126
+all: magma-cuda124
 all: magma-cuda118

 .PHONY:
@ -36,6 +37,11 @@ magma-cuda126: DESIRED_CUDA := 12.6
 magma-cuda126:
 	$(DOCKER_RUN)

+.PHONY: magma-cuda124
+magma-cuda124: DESIRED_CUDA := 12.4
+magma-cuda124:
+	$(DOCKER_RUN)
+
 .PHONY: magma-cuda118
 magma-cuda118: DESIRED_CUDA := 11.8
 magma-cuda118: CUDA_ARCH_LIST += -gencode arch=compute_37,code=sm_37
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -171,12 +171,6 @@ fi
 if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/compiler/latest/env/vars.sh
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/ccl/latest/env/vars.sh
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/mpi/latest/env/vars.sh
-  # Enable XCCL build
-  export USE_XCCL=1
  # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
  export USE_KINETO=0
  export TORCH_XPU_ARCH_LIST=pvc
--- a/.ci/pytorch/check_binary.sh
+++ b/.ci/pytorch/check_binary.sh
@ -302,22 +302,19 @@ except RuntimeError as e:
 fi

 ###############################################################################
-# Check for C++ ABI compatibility to GCC-11 - GCC 13
+# Check for C++ ABI compatibility to GCC-11
 ###############################################################################
 if [[ "$(uname)" == 'Linux' &&  "$PACKAGE_TYPE" == 'manywheel' ]]; then
  pushd /tmp
-  # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html
-  # gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19
-  # gcc 11 - CUDA 11.8, xpu, rocm
-  # gcc 13 - CUDA 12.6, 12.8 and cpu
-  # Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426
-  if [[ "$(uname -m)" == "s390x" ]]; then
-    cxx_abi="19"
-  elif [[ "$DESIRED_CUDA" != 'cu118' && "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then
-    cxx_abi="18"
-  else
-    cxx_abi="16"
+  # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html gcc-11 is ABI16
+  # Though manylinux_2.28 should have been built with gcc-14, per
+  # https://github.com/pypa/manylinux?tab=readme-ov-file#manylinux_2_28-almalinux-8-based
+  # On s390x gcc 14 is used because it contains a fix for the interaction
+  # between precompiled headers and vectorization builtins.
+  # This fix is not available in earlier gcc versions.
+  # gcc-14 uses ABI19.
+  if [[ "$(uname -m)" != "s390x" ]]; then
+    python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi1016' else 1)"
  fi
-  python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)"
  popd
 fi
--- a/.ci/pytorch/common.sh
+++ b/.ci/pytorch/common.sh
@ -13,6 +13,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
  # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
  unset HIP_PLATFORM
  export PYTORCH_TEST_WITH_ROCM=1
+  # temporary to locate some kernel issues on the CI nodes
+  export HSAKMT_DEBUG_LEVEL=4
+  # improve rccl performance for distributed tests
+  export HSA_FORCE_FINE_GRAIN_PCIE=1
 fi

 # TODO: Renable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598
--- a/.ci/pytorch/install_cache_xla.sh
+++ b/.ci/pytorch/install_cache_xla.sh
@ -1,50 +1,31 @@
 #!/bin/bash

 # Script for installing sccache on the xla build job, which uses xla's docker
-# image, which has sccache installed but doesn't write the stubs.  This is
-# mostly copied from .ci/docker/install_cache.sh.  Changes are: removing checks
-# that will always return the same thing, ex checks for for rocm, CUDA, changing
-# the path where sccache is installed, not changing /etc/environment, and not
-# installing/downloading sccache as it is already in the docker image.
+# image and doesn't have sccache installed on it.  This is mostly copied from
+# .ci/docker/install_cache.sh.  Changes are: removing checks that will always
+# return the same thing, e.g. checks for rocm, CUDA, and changing the path
+# where sccache is installed, and not changing /etc/environment.

 set -ex -o pipefail

+# Download the prebuilt sccache binary to /tmp/cache/bin/sccache.
+# The target directory must already exist when this function is called.
+install_binary() {
+  echo "Downloading sccache binary from S3 repo"
+  # --fail makes curl exit non-zero on an HTTP error response instead of
+  # saving the error body as the "binary", so the script stops here rather
+  # than a broken sccache surfacing later in the build.
+  curl --fail --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /tmp/cache/bin/sccache
+}
+
 mkdir -p /tmp/cache/bin
+mkdir -p /tmp/cache/lib
 export PATH="/tmp/cache/bin:$PATH"

+install_binary
+chmod a+x /tmp/cache/bin/sccache
+
 function write_sccache_stub() {
  # Unset LD_PRELOAD for ps because of asan + ps issues
  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
-  if [ "$1" == "gcc" ]; then
-    # Do not call sccache recursively when dumping preprocessor argument
-    # For some reason it's very important for the first cached nvcc invocation
-    cat >"/tmp/cache/bin/$1" <<EOF
-#!/bin/sh
-
-# sccache does not support -E flag, so we need to call the original compiler directly in order to avoid calling this wrapper recursively
-for arg in "\$@"; do
-  if [ "\$arg" = "-E" ]; then
-    exec $(which "$1") "\$@"
-  fi
-done
-
-if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
-  exec sccache $(which "$1") "\$@"
-else
-  exec $(which "$1") "\$@"
-fi
-EOF
-  else
-    cat >"/tmp/cache/bin/$1" <<EOF
-#!/bin/sh
-
-if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
-  exec sccache $(which "$1") "\$@"
-else
-  exec $(which "$1") "\$@"
-fi
-EOF
-  fi
+  # shellcheck disable=SC2086
+  # shellcheck disable=SC2059
+  printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n  exec sccache $(which $1) \"\$@\"\nelse\n  exec $(which $1) \"\$@\"\nfi" > "/tmp/cache/bin/$1"
  chmod a+x "/tmp/cache/bin/$1"
 }

--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -42,16 +42,6 @@ test_python_all() {
  assert_git_not_dirty
 }

-test_python_mps() {
-  setup_test_python
-
-  time python test/run_test.py --verbose --mps
-  MTL_CAPTURE_ENABLED=1 ${CONDA_RUN} python3 test/test_mps.py --verbose -k test_metal_capture
-
-  assert_git_not_dirty
-}
-
-
 test_python_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
@ -247,11 +237,6 @@ test_torchbench_smoketest() {
        PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
          --performance --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
          --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" || true
-        if [ "$backend" == "inductor" ]; then
-          PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-            --accuracy --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
-            --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_accuracy.csv" || true
-        fi
      done
    done

@ -320,8 +305,6 @@ elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then
  test_timm_perf
 elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then
  test_torchbench_smoketest
-elif [[ $TEST_CONFIG == *"mps"* ]]; then
-  test_python_mps
 elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then
  test_python_shard "${SHARD_NUMBER}"
  if [[ "${SHARD_NUMBER}" == 1 ]]; then
--- a/.ci/pytorch/perf_test/common.sh
+++ b/.ci/pytorch/perf_test/common.sh
@ -0,0 +1,22 @@
+#!/bin/bash
+set -e
+
+# Run the given command (typically a test function plus its arguments) inside
+# a freshly created ./test_tmp scratch directory, then delete that directory.
+run_test () {
+  # Start from a clean scratch directory and make it the working directory.
+  rm -rf test_tmp/ && mkdir test_tmp/ && cd test_tmp/
+  "$@"
+  # Relies on the command having returned to test_tmp/ before it finished.
+  cd .. && rm -rf test_tmp/
+}
+
+# Run the given command and print its elapsed real time in seconds.
+# Calls `exit 1` if the captured output contains the text "Error".
+get_runtime_of_command () {
+  # %R makes the `time` keyword report only the elapsed real time, in seconds.
+  TIMEFORMAT=%R
+
+  # runtime=$( { time ($@ &> /dev/null); } 2>&1 1>/dev/null)
+  # Capture stderr (where `time` writes its report, together with anything the
+  # command itself prints there) and discard the command's stdout.
+  runtime=$( { time "$@"; } 2>&1 1>/dev/null)
+  if [[ $runtime == *"Error"* ]]; then
+    exit 1
+  fi
+  # Drop a leading "+++ <command line>" prefix if one is present.
+  # NOTE(review): presumably this is the xtrace echo of the command - confirm.
+  runtime=${runtime#+++ $@}
+  # Evaluate the remaining text as a Python expression and print the result.
+  runtime=$(python -c "print($runtime)")
+
+  echo "$runtime"
+}
--- a/.ci/pytorch/perf_test/compare_with_baseline.py
+++ b/.ci/pytorch/perf_test/compare_with_baseline.py
@ -0,0 +1,91 @@
+"""Check a perf-test runtime sample against the stored baseline.
+
+The backend ("cpu" or "gpu") is taken from the test name and selects the
+baseline file ``../{backend}_runtime.json``.  The script raises when the
+sample mean is 3 or more baseline sigmas above the baseline mean; with
+``--update`` a passing sample is recorded in ``../new_{backend}_runtime.json``.
+"""
+
+import argparse
+import json
+import math
+import sys
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--test-name", dest="test_name", action="store", required=True, help="test name"
+)
+parser.add_argument(
+    "--sample-stats",
+    dest="sample_stats",
+    action="store",
+    required=True,
+    help="stats from sample",
+)
+parser.add_argument(
+    "--update",
+    action="store_true",
+    help="whether to update baseline using stats from sample",
+)
+args = parser.parse_args()
+
+test_name = args.test_name
+
+# The backend named in the test name selects the baseline file.  Fail with a
+# clear message (instead of a NameError further down) when neither is present.
+if "cpu" in test_name:
+    backend = "cpu"
+elif "gpu" in test_name:
+    backend = "gpu"
+else:
+    raise ValueError(
+        f"cannot infer backend from test name {test_name!r}: "
+        "expected it to contain 'cpu' or 'gpu'"
+    )
+
+data_file_path = f"../{backend}_runtime.json"
+
+with open(data_file_path) as data_file:
+    data = json.load(data_file)
+
+if test_name in data:
+    mean = float(data[test_name]["mean"])
+    sigma = float(data[test_name]["sigma"])
+else:
+    # Let the test pass if baseline number doesn't exist
+    mean = sys.maxsize
+    sigma = 0.001
+
+print("population mean: ", mean)
+print("population sigma: ", sigma)
+
+# Let the test pass if baseline number is NaN (which happened in
+# the past when we didn't have logic for catching NaN numbers)
+if math.isnan(mean) or math.isnan(sigma):
+    mean = sys.maxsize
+    sigma = 0.001
+
+# --sample-stats is a JSON object with "mean" and "sigma" keys.
+sample_stats_data = json.loads(args.sample_stats)
+
+sample_mean = float(sample_stats_data["mean"])
+sample_sigma = float(sample_stats_data["sigma"])
+
+print("sample mean: ", sample_mean)
+print("sample sigma: ", sample_sigma)
+
+# A NaN sample is an error, unlike a NaN baseline which is tolerated above.
+if math.isnan(sample_mean):
+    raise Exception("""Error: sample mean is NaN""")  # noqa: TRY002
+elif math.isnan(sample_sigma):
+    raise Exception("""Error: sample sigma is NaN""")  # noqa: TRY002
+
+# Distance of the sample mean from the baseline mean, in baseline sigmas.
+z_value = (sample_mean - mean) / sigma
+
+print("z-value: ", z_value)
+
+if z_value >= 3:
+    raise Exception(  # noqa: TRY002
+        f"""\n
+z-value >= 3, there is high chance of perf regression.\n
+To reproduce this regression, run
+`cd .ci/pytorch/perf_test/ && bash {test_name}.sh` on your local machine
+and compare the runtime before/after your code change.
+"""
+    )
+else:
+    print("z-value < 3, no perf regression detected.")
+    if args.update:
+        print("We will use these numbers as new baseline.")
+        # The new-baseline file must already exist; it is read, amended with
+        # this test's entry and written back.
+        new_data_file_path = f"../new_{backend}_runtime.json"
+        with open(new_data_file_path) as new_data_file:
+            new_data = json.load(new_data_file)
+        new_data[test_name] = {}
+        new_data[test_name]["mean"] = sample_mean
+        # Recorded sigma is never smaller than 10% of the sample mean.
+        new_data[test_name]["sigma"] = max(sample_sigma, sample_mean * 0.1)
+        with open(new_data_file_path, "w") as new_data_file:
+            json.dump(new_data, new_data_file, indent=4)
--- a/.ci/pytorch/perf_test/get_stats.py
+++ b/.ci/pytorch/perf_test/get_stats.py
@ -0,0 +1,18 @@
+import json
+import sys
+
+import numpy
+
+
+# Every command-line argument is one runtime sample; surrounding whitespace
+# is ignored.
+samples = [float(raw.strip()) for raw in sys.argv[1:]]
+
+# Emit the sample mean and standard deviation as a single JSON object.
+print(
+    json.dumps(
+        {
+            "mean": numpy.mean(samples),
+            "sigma": numpy.std(samples),
+        }
+    )
+)
--- a/.ci/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh
+++ b/.ci/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh
@ -0,0 +1,43 @@
+#!/bin/bash
+set -e
+
+. ./common.sh
+
+# Time the mini sequence labeler script from pytorch/benchmark.
+#   $1 - number of timed runs
+#   $2 - optional mode: "compare_with_baseline" or "compare_and_update"
+test_cpu_speed_mini_sequence_labeler () {
+  echo "Testing: mini sequence labeler, CPU"
+
+  export OMP_NUM_THREADS=4 MKL_NUM_THREADS=4
+
+  # Fetch the benchmark suite, pin it to a fixed commit and enter the script dir.
+  git clone https://github.com/pytorch/benchmark.git
+  git -C benchmark/ checkout 726567a455edbfda6199445922a8cfee82535664
+  cd benchmark/scripts/mini_sequence_labeler
+
+  SAMPLE_ARRAY=()
+  NUM_RUNS=$1
+
+  # Collect one runtime sample per run.
+  i=1
+  while (( i <= NUM_RUNS )); do
+    runtime=$(get_runtime_of_command python main.py)
+    SAMPLE_ARRAY+=("${runtime}")
+    i=$(( i + 1 ))
+  done
+
+  # Return to the directory the function was started in.
+  cd ../../..
+
+  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
+  echo "Runtime stats in seconds:"
+  echo "$stats"
+
+  # Any other (or missing) mode only prints the stats.
+  case "$2" in
+    compare_with_baseline)
+      python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
+      ;;
+    compare_and_update)
+      python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
+      ;;
+  esac
+}
+
+# Run the test only when this file is executed directly, not when sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_cpu_speed_mini_sequence_labeler "$@"
+fi
--- a/.ci/pytorch/perf_test/test_cpu_speed_mnist.sh
+++ b/.ci/pytorch/perf_test/test_cpu_speed_mnist.sh
@ -0,0 +1,45 @@
+#!/bin/bash
+set -e
+
+. ./common.sh
+
+# Time one-epoch runs of the MNIST example from pytorch/examples (perftests branch).
+#   $1 - number of timed runs
+#   $2 - optional mode: "compare_with_baseline" or "compare_and_update"
+test_cpu_speed_mnist () {
+  echo "Testing: MNIST, CPU"
+
+  export OMP_NUM_THREADS=4
+  export MKL_NUM_THREADS=4
+
+  git clone https://github.com/pytorch/examples.git -b perftests
+
+  cd examples/mnist
+
+  # --yes: never wait for an interactive confirmation in an unattended run.
+  conda install --yes -c pytorch torchvision-cpu
+
+  # Download data
+  python main.py --epochs 0
+
+  SAMPLE_ARRAY=()
+  NUM_RUNS=$1
+
+  # Collect one runtime sample per run.
+  for (( i=1; i<=NUM_RUNS; i++ )) do
+    runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log)
+    echo "$runtime"
+    SAMPLE_ARRAY+=("${runtime}")
+  done
+
+  # Return to the directory the function was started in.
+  cd ../..
+
+  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
+  echo "Runtime stats in seconds:"
+  echo "$stats"
+
+  # Any other (or missing) mode only prints the stats.
+  if [ "$2" == "compare_with_baseline" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
+  elif [ "$2" == "compare_and_update" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
+  fi
+}
+
+# Run the test only when this file is executed directly, not when sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_cpu_speed_mnist "$@"
+fi
--- a/.ci/pytorch/perf_test/test_cpu_speed_torch.sh
+++ b/.ci/pytorch/perf_test/test_cpu_speed_torch.sh
@ -0,0 +1,29 @@
+#!/bin/bash
+
+. ./common.sh
+
+# Run the torch.* operator benchmark from the perf-tests repository.
+#   $1 - optional mode: "compare_with_baseline", "compare_and_update" or
+#        "update_only"; any other value runs the benchmark with no extra flags
+# Exits with status 1 when the benchmark script fails.
+test_cpu_speed_torch () {
+  echo "Testing: torch.*, CPU"
+
+  export OMP_NUM_THREADS=4
+  export MKL_NUM_THREADS=4
+
+  git clone https://github.com/yf225/perf-tests.git
+
+  # Extra flags for the benchmark script.  The array is function-local and
+  # always starts empty, so an unrecognised mode can never pick up an ARGS
+  # value inherited from the environment or left behind by an earlier test.
+  local -a args=()
+  case "$1" in
+    compare_with_baseline)
+      args=(--compare ../cpu_runtime.json)
+      ;;
+    compare_and_update)
+      args=(--compare ../cpu_runtime.json --update ../new_cpu_runtime.json)
+      ;;
+    update_only)
+      args=(--update ../new_cpu_runtime.json)
+      ;;
+  esac
+
+  if ! python perf-tests/modules/test_cpu_torch.py "${args[@]}"; then
+    echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
+    exit 1
+  fi
+}
+
+# Run the test only when this file is executed directly, not when sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_cpu_speed_torch "$@"
+fi
--- a/.ci/pytorch/perf_test/test_cpu_speed_torch_tensor.sh
+++ b/.ci/pytorch/perf_test/test_cpu_speed_torch_tensor.sh
@ -0,0 +1,29 @@
+#!/bin/bash
+
+. ./common.sh
+
+# Run the torch.Tensor.* operator benchmark from the perf-tests repository.
+#   $1 - optional mode: "compare_with_baseline", "compare_and_update" or
+#        "update_only"; any other value runs the benchmark with no extra flags
+# Exits with status 1 when the benchmark script fails.
+test_cpu_speed_torch_tensor () {
+  echo "Testing: torch.Tensor.*, CPU"
+
+  export OMP_NUM_THREADS=4
+  export MKL_NUM_THREADS=4
+
+  git clone https://github.com/yf225/perf-tests.git
+
+  # Extra flags for the benchmark script.  The array is function-local and
+  # always starts empty, so an unrecognised mode can never pick up an ARGS
+  # value inherited from the environment or left behind by an earlier test.
+  local -a args=()
+  case "$1" in
+    compare_with_baseline)
+      args=(--compare ../cpu_runtime.json)
+      ;;
+    compare_and_update)
+      args=(--compare ../cpu_runtime.json --update ../new_cpu_runtime.json)
+      ;;
+    update_only)
+      args=(--update ../new_cpu_runtime.json)
+      ;;
+  esac
+
+  if ! python perf-tests/modules/test_cpu_torch_tensor.py "${args[@]}"; then
+    echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
+    exit 1
+  fi
+}
+
+# Run the test only when this file is executed directly, not when sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_cpu_speed_torch_tensor "$@"
+fi
--- a/.ci/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh
+++ b/.ci/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh
@ -0,0 +1,44 @@
+#!/bin/bash
+set -e
+
+. ./common.sh
+
+# Time the cudnn_lstm.py script from pytorch/benchmark.
+#   $1 - number of timed runs
+#   $2 - optional mode: "compare_with_baseline" or "compare_and_update"
+test_gpu_speed_cudnn_lstm () {
+  echo "Testing: CuDNN LSTM, GPU"
+
+  export OMP_NUM_THREADS=4 MKL_NUM_THREADS=4
+
+  # Fetch the benchmark suite, pin it to a fixed commit and enter scripts/.
+  git clone https://github.com/pytorch/benchmark.git
+  git -C benchmark/ checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0
+  cd benchmark/scripts/
+
+  SAMPLE_ARRAY=()
+  NUM_RUNS=$1
+
+  # Collect (and echo) one runtime sample per run.
+  i=1
+  while (( i <= NUM_RUNS )); do
+    runtime=$(get_runtime_of_command python cudnn_lstm.py --skip-cpu-governor-check)
+    echo "$runtime"
+    SAMPLE_ARRAY+=("${runtime}")
+    i=$(( i + 1 ))
+  done
+
+  # Return to the directory the function was started in.
+  cd ../..
+
+  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
+  echo "Runtime stats in seconds:"
+  echo "$stats"
+
+  # Any other (or missing) mode only prints the stats.
+  case "$2" in
+    compare_with_baseline)
+      python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
+      ;;
+    compare_and_update)
+      python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
+      ;;
+  esac
+}
+
+# Run the test only when this file is executed directly, not when sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_gpu_speed_cudnn_lstm "$@"
+fi
--- a/.ci/pytorch/perf_test/test_gpu_speed_lstm.sh
+++ b/.ci/pytorch/perf_test/test_gpu_speed_lstm.sh
@ -0,0 +1,44 @@
+#!/bin/bash
+set -e
+
+. ./common.sh
+
+# Time the lstm.py script from pytorch/benchmark.
+#   $1 - number of timed runs
+#   $2 - optional mode: "compare_with_baseline" or "compare_and_update"
+test_gpu_speed_lstm () {
+  echo "Testing: LSTM, GPU"
+
+  export OMP_NUM_THREADS=4 MKL_NUM_THREADS=4
+
+  # Fetch the benchmark suite, pin it to a fixed commit and enter scripts/.
+  git clone https://github.com/pytorch/benchmark.git
+  git -C benchmark/ checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0
+  cd benchmark/scripts/
+
+  SAMPLE_ARRAY=()
+  NUM_RUNS=$1
+
+  # Collect (and echo) one runtime sample per run.
+  i=1
+  while (( i <= NUM_RUNS )); do
+    runtime=$(get_runtime_of_command python lstm.py --skip-cpu-governor-check)
+    echo "$runtime"
+    SAMPLE_ARRAY+=("${runtime}")
+    i=$(( i + 1 ))
+  done
+
+  # Return to the directory the function was started in.
+  cd ../..
+
+  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
+  echo "Runtime stats in seconds:"
+  echo "$stats"
+
+  # Any other (or missing) mode only prints the stats.
+  case "$2" in
+    compare_with_baseline)
+      python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
+      ;;
+    compare_and_update)
+      python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
+      ;;
+  esac
+}
+
+# Run the test only when this file is executed directly, not when sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_gpu_speed_lstm "$@"
+fi
--- a/.ci/pytorch/perf_test/test_gpu_speed_mlstm.sh
+++ b/.ci/pytorch/perf_test/test_gpu_speed_mlstm.sh
@ -0,0 +1,44 @@
+#!/bin/bash
+set -e
+
+. ./common.sh
+
+# Time the mlstm.py script from pytorch/benchmark.
+#   $1 - number of timed runs
+#   $2 - optional mode: "compare_with_baseline" or "compare_and_update"
+test_gpu_speed_mlstm () {
+  echo "Testing: MLSTM, GPU"
+
+  export OMP_NUM_THREADS=4 MKL_NUM_THREADS=4
+
+  # Fetch the benchmark suite, pin it to a fixed commit and enter scripts/.
+  git clone https://github.com/pytorch/benchmark.git
+  git -C benchmark/ checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0
+  cd benchmark/scripts/
+
+  SAMPLE_ARRAY=()
+  NUM_RUNS=$1
+
+  # Collect (and echo) one runtime sample per run.
+  i=1
+  while (( i <= NUM_RUNS )); do
+    runtime=$(get_runtime_of_command python mlstm.py --skip-cpu-governor-check)
+    echo "$runtime"
+    SAMPLE_ARRAY+=("${runtime}")
+    i=$(( i + 1 ))
+  done
+
+  # Return to the directory the function was started in.
+  cd ../..
+
+  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
+  echo "Runtime stats in seconds:"
+  echo "$stats"
+
+  # Any other (or missing) mode only prints the stats.
+  case "$2" in
+    compare_with_baseline)
+      python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
+      ;;
+    compare_and_update)
+      python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
+      ;;
+  esac
+}
+
+# Run the test only when this file is executed directly, not when sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_gpu_speed_mlstm "$@"
+fi
--- a/.ci/pytorch/perf_test/test_gpu_speed_mnist.sh
+++ b/.ci/pytorch/perf_test/test_gpu_speed_mnist.sh
@ -0,0 +1,48 @@
+#!/bin/bash
+set -e
+
+. ./common.sh
+
+# Time one-epoch runs of the MNIST example from pytorch/examples (perftests branch).
+#   $1 - number of timed runs
+#   $2 - optional mode: "compare_with_baseline" or "compare_and_update"
+test_gpu_speed_mnist () {
+  echo "Testing: MNIST, GPU"
+
+  export OMP_NUM_THREADS=4
+  export MKL_NUM_THREADS=4
+
+  git clone https://github.com/pytorch/examples.git -b perftests
+
+  cd examples/mnist
+
+  # --yes: never wait for an interactive confirmation in an unattended run.
+  conda install --yes -c pytorch torchvision
+
+  # Download data
+  python main.py --epochs 0
+
+  SAMPLE_ARRAY=()
+  NUM_RUNS=$1
+
+  # Needs warm up to get accurate number
+  python main.py --epochs 1 --no-log
+
+  # Collect (and echo) one runtime sample per run.
+  for (( i=1; i<=NUM_RUNS; i++ )) do
+    runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log)
+    echo "$runtime"
+    SAMPLE_ARRAY+=("${runtime}")
+  done
+
+  # Return to the directory the function was started in.
+  cd ../..
+
+  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
+  echo "Runtime stats in seconds:"
+  echo "$stats"
+
+  # Any other (or missing) mode only prints the stats.
+  if [ "$2" == "compare_with_baseline" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
+  elif [ "$2" == "compare_and_update" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
+  fi
+}
+
+# Run the test only when this file is executed directly, not when sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_gpu_speed_mnist "$@"
+fi
--- a/.ci/pytorch/perf_test/test_gpu_speed_word_language_model.sh
+++ b/.ci/pytorch/perf_test/test_gpu_speed_word_language_model.sh
@ -0,0 +1,53 @@
+#!/bin/bash
+set -e
+
+. ./common.sh
+
+# Time one-epoch runs of the word language model example from
+# pytorch/examples (perftests branch) on a truncated Wikitext-2 dataset.
+#   $1 - number of timed runs
+#   $2 - optional mode: "compare_with_baseline" or "compare_and_update"
+test_gpu_speed_word_language_model () {
+  echo "Testing: word language model on Wikitext-2, GPU"
+
+  export OMP_NUM_THREADS=4 MKL_NUM_THREADS=4
+
+  git clone https://github.com/pytorch/examples.git -b perftests
+
+  cd examples/word_language_model/data/wikitext-2
+
+  # Reduce dataset size, so that we can have more runs per test:
+  # keep only the leading lines of each split.
+  sed -n '1,200p' test.txt > test_tmp.txt
+  sed -n '1,1000p' train.txt > train_tmp.txt
+  sed -n '1,200p' valid.txt > valid_tmp.txt
+
+  # Replace each split with its truncated copy.
+  mv test_tmp.txt test.txt
+  mv train_tmp.txt train.txt
+  mv valid_tmp.txt valid.txt
+
+  # Back to examples/word_language_model, where main.py is run from.
+  cd ../..
+
+  SAMPLE_ARRAY=()
+  NUM_RUNS=$1
+
+  # Collect (and echo) one runtime sample per run.
+  i=1
+  while (( i <= NUM_RUNS )); do
+    runtime=$(get_runtime_of_command python main.py --cuda --epochs 1)
+    echo "$runtime"
+    SAMPLE_ARRAY+=("${runtime}")
+    i=$(( i + 1 ))
+  done
+
+  # Return to the directory the function was started in.
+  cd ../..
+
+  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
+  echo "Runtime stats in seconds:"
+  echo "$stats"
+
+  # Any other (or missing) mode only prints the stats.
+  case "$2" in
+    compare_with_baseline)
+      python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
+      ;;
+    compare_and_update)
+      python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
+      ;;
+  esac
+}
+
+# Run the test only when this file is executed directly, not when sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_gpu_speed_word_language_model "$@"
+fi
--- a/.ci/pytorch/perf_test/update_commit_hash.py
+++ b/.ci/pytorch/perf_test/update_commit_hash.py
@ -0,0 +1,14 @@
+import json
+import sys
+
+
+# Usage: update_commit_hash.py <json-file> <commit-hash>
+# Rewrites <json-file> in place with its "commit" entry set to <commit-hash>.
+json_path, new_commit = sys.argv[1], sys.argv[2]
+
+with open(json_path) as src:
+    payload = json.load(src)
+
+payload["commit"] = new_commit
+
+with open(json_path, "w") as dst:
+    json.dump(payload, dst)
--- a/.ci/pytorch/run_tests.sh
+++ b/.ci/pytorch/run_tests.sh
@ -76,7 +76,7 @@ fi
 # Environment initialization
 if [[ "$(uname)" == Darwin ]]; then
    # Install the testing dependencies
-    retry pip install -q future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml
+    retry conda install -yq future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml
 else
    retry pip install -qr requirements.txt || true
    retry pip install -q hypothesis protobuf pytest setuptools || true
@ -91,6 +91,7 @@ fi

 echo "Testing with:"
 pip freeze
+conda list || true

 ##############################################################################
 # Smoke tests
--- a/.ci/pytorch/short-perf-test-cpu.sh
+++ b/.ci/pytorch/short-perf-test-cpu.sh
@ -0,0 +1,71 @@
+#!/bin/bash
+
+# CPU perf-test driver: downloads the most recent stored baseline, runs the
+# CPU perf tests against it and, when testing the upstream default branch,
+# uploads the refreshed baseline for the current commit.
+SCRIPT_PARENT_DIR=$(dirname "${BASH_SOURCE[0]}")
+
+# shellcheck source=.ci/pytorch/common.sh
+source "$SCRIPT_PARENT_DIR/common.sh"
+
+cd .ci/pytorch/perf_test
+
+echo "Running CPU perf test for PyTorch..."
+
+pip install -q awscli
+
+# Set multipart_threshold to be sufficiently high, so that `aws s3 cp` is not a multipart read
+# More info at https://github.com/aws/aws-cli/issues/2321
+aws configure set default.s3.multipart_threshold 5GB
+UPSTREAM_DEFAULT_BRANCH="$(git remote show https://github.com/pytorch/pytorch.git | awk '/HEAD branch/ {print $NF}')"
+
+if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
+    # Get current default branch commit hash
+    DEFAULT_BRANCH_COMMIT_ID=$(git log --format="%H" -n 1)
+    export DEFAULT_BRANCH_COMMIT_ID
+fi
+
+# Find the default branch commit to test against: walk the upstream history
+# and stop at the first commit that has a stored baseline.
+git remote add upstream https://github.com/pytorch/pytorch.git
+git fetch upstream
+IFS=$'\n'
+LATEST_TESTED_COMMIT=""
+while IFS='' read -r commit_id; do
+    if aws s3 ls s3://ossci-perf-test/pytorch/cpu_runtime/"${commit_id}".json; then
+        LATEST_TESTED_COMMIT=${commit_id}
+        break
+    fi
+done < <(git rev-list upstream/"$UPSTREAM_DEFAULT_BRANCH")
+if [[ -z "$LATEST_TESTED_COMMIT" ]]; then
+    # Without this guard the download below would request ".json" and fail
+    # with an error that hides the real cause.
+    echo "No commit on upstream/${UPSTREAM_DEFAULT_BRANCH} has a baseline under s3://ossci-perf-test/pytorch/cpu_runtime/" >&2
+    exit 1
+fi
+aws s3 cp s3://ossci-perf-test/pytorch/cpu_runtime/"${LATEST_TESTED_COMMIT}".json cpu_runtime.json
+
+if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
+    # Prepare new baseline file
+    cp cpu_runtime.json new_cpu_runtime.json
+    python update_commit_hash.py new_cpu_runtime.json "${DEFAULT_BRANCH_COMMIT_ID}"
+fi
+
+# Include tests
+# shellcheck source=./perf_test/test_cpu_speed_mini_sequence_labeler.sh
+. ./test_cpu_speed_mini_sequence_labeler.sh
+# shellcheck source=./perf_test/test_cpu_speed_mnist.sh
+. ./test_cpu_speed_mnist.sh
+# shellcheck source=./perf_test/test_cpu_speed_torch.sh
+. ./test_cpu_speed_torch.sh
+# shellcheck source=./perf_test/test_cpu_speed_torch_tensor.sh
+. ./test_cpu_speed_torch_tensor.sh
+
+# Run tests
+export TEST_MODE="compare_with_baseline"
+if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
+    export TEST_MODE="compare_and_update"
+fi
+
+# Operator tests
+run_test test_cpu_speed_torch "${TEST_MODE}"
+run_test test_cpu_speed_torch_tensor "${TEST_MODE}"
+
+# Sample model tests
+run_test test_cpu_speed_mini_sequence_labeler 20 "${TEST_MODE}"
+run_test test_cpu_speed_mnist 20 "${TEST_MODE}"
+
+if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
+    # This could cause race condition if we are testing the same default branch commit twice,
+    # but the chance of them executing this line at the same time is low.
+    aws s3 cp new_cpu_runtime.json s3://ossci-perf-test/pytorch/cpu_runtime/"${DEFAULT_BRANCH_COMMIT_ID}".json --acl public-read
+fi
--- a/.ci/pytorch/short-perf-test-gpu.sh
+++ b/.ci/pytorch/short-perf-test-gpu.sh
@ -0,0 +1,83 @
+#!/bin/bash
+
+# Short GPU perf test. Downloads the GPU runtime baseline stored in S3 for the
+# newest upstream default-branch commit that has one, runs the GPU speed tests
+# against it and, when COMMIT_SOURCE is the upstream default branch, uploads a
+# new baseline keyed by the current commit.
+
+# shellcheck source=./common.sh
+source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
+
+pushd .ci/pytorch/perf_test
+
+echo "Running GPU perf test for PyTorch..."
+
+# Trying to uninstall PyYAML can cause problem. Workaround according to:
+# https://github.com/pypa/pip/issues/5247#issuecomment-415571153
+pip install -q awscli --ignore-installed PyYAML
+
+# Set multipart_threshold to be sufficiently high, so that `aws s3 cp` is not a multipart read
+# More info at https://github.com/aws/aws-cli/issues/2321
+aws configure set default.s3.multipart_threshold 5GB
+UPSTREAM_DEFAULT_BRANCH="$(git remote show https://github.com/pytorch/pytorch.git | awk '/HEAD branch/ {print $NF}')"
+
+if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
+    # Get current default branch commit hash
+    DEFAULT_BRANCH_COMMIT_ID=$(git log --format="%H" -n 1)
+    export DEFAULT_BRANCH_COMMIT_ID
+fi
+
+# Find the default branch commit to test against
+git remote add upstream https://github.com/pytorch/pytorch.git
+git fetch upstream
+IFS=$'\n'
+while IFS='' read -r commit_id; do
+    if aws s3 ls s3://ossci-perf-test/pytorch/gpu_runtime/"${commit_id}".json; then
+        LATEST_TESTED_COMMIT=${commit_id}
+        break
+    fi
+done < <(git rev-list upstream/"$UPSTREAM_DEFAULT_BRANCH")
+if [[ -z "${LATEST_TESTED_COMMIT:-}" ]]; then
+    # No commit had a stored baseline; stop here rather than asking S3 for
+    # a nameless ".json" object and failing later with a confusing error.
+    echo "No GPU runtime baseline found on upstream/${UPSTREAM_DEFAULT_BRANCH}" >&2
+    exit 1
+fi
+aws s3 cp s3://ossci-perf-test/pytorch/gpu_runtime/"${LATEST_TESTED_COMMIT}".json gpu_runtime.json
+
+if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
+    # Prepare new baseline file
+    cp gpu_runtime.json new_gpu_runtime.json
+    python update_commit_hash.py new_gpu_runtime.json "${DEFAULT_BRANCH_COMMIT_ID}"
+fi
+
+# Include tests
+# shellcheck source=./perf_test/test_gpu_speed_mnist.sh
+. ./test_gpu_speed_mnist.sh
+# shellcheck source=./perf_test/test_gpu_speed_word_language_model.sh
+. ./test_gpu_speed_word_language_model.sh
+# shellcheck source=./perf_test/test_gpu_speed_cudnn_lstm.sh
+. ./test_gpu_speed_cudnn_lstm.sh
+# shellcheck source=./perf_test/test_gpu_speed_lstm.sh
+. ./test_gpu_speed_lstm.sh
+# shellcheck source=./perf_test/test_gpu_speed_mlstm.sh
+. ./test_gpu_speed_mlstm.sh
+
+# Run tests: every test gets the same mode, chosen once from COMMIT_SOURCE
+perf_test_mode="compare_with_baseline"
+if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
+    perf_test_mode="compare_and_update"
+fi
+run_test test_gpu_speed_mnist 20 "${perf_test_mode}"
+run_test test_gpu_speed_word_language_model 20 "${perf_test_mode}"
+run_test test_gpu_speed_cudnn_lstm 20 "${perf_test_mode}"
+run_test test_gpu_speed_lstm 20 "${perf_test_mode}"
+run_test test_gpu_speed_mlstm 20 "${perf_test_mode}"
+
+if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
+    # This could cause race condition if we are testing the same default branch commit twice,
+    # but the chance of them executing this line at the same time is low.
+    aws s3 cp new_gpu_runtime.json s3://ossci-perf-test/pytorch/gpu_runtime/"${DEFAULT_BRANCH_COMMIT_ID}".json --acl public-read
+fi
+
+popd
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -191,10 +191,6 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
    # shellcheck disable=SC1091
    source /opt/intel/oneapi/umf/latest/env/vars.sh
  fi
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/ccl/latest/env/vars.sh
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/mpi/latest/env/vars.sh
  # Check XPU status before testing
  xpu-smi discovery
 fi
@ -318,12 +314,6 @@ test_python() {
  assert_git_not_dirty
 }

-test_python_smoke() {
-  # Smoke tests for H100
-  time python test/run_test.py --include test_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
-  assert_git_not_dirty
-}
-
 test_lazy_tensor_meta_reference_disabled() {
  export TORCH_DISABLE_FUNCTIONALIZATION_META_REFERENCE=1
  echo "Testing lazy tensor operations without meta reference"
@ -408,15 +398,8 @@ test_inductor_aoti() {
    # We need to hipify before building again
    python3 tools/amd_build/build_amd.py
  fi
-  if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
-    BUILD_AOT_INDUCTOR_TEST=1 TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop
-    # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
-    LD_LIBRARY_PATH=/opt/conda/envs/py_3.10/lib/:${TORCH_LIB_DIR}:$LD_LIBRARY_PATH
-    CPP_TESTS_DIR="${BUILD_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
-  else
-    BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
-    CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
-  fi
+  BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
+  CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
 }

 test_inductor_cpp_wrapper_shard() {
@ -1493,6 +1476,8 @@ test_executorch() {
  export PYTHON_EXECUTABLE=python
  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

+  # For llama3
+  bash examples/models/llama3_2_vision/install_requirements.sh
  # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
  # from the PR
  bash .ci/scripts/setup-linux.sh --build-tool cmake
@ -1535,7 +1520,7 @@ test_linux_aarch64() {
       inductor/test_inplacing_pass inductor/test_kernel_benchmark inductor/test_layout_optim \
       inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \
       inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \
-       inductor/test_split_cat_fx_passes inductor/test_compile inductor/test_torchinductor \
+       inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \
       inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes inductor/test_memory \
       inductor/test_triton_cpu_backend inductor/test_triton_extension_backend inductor/test_mkldnn_pattern_matcher inductor/test_cpu_cpp_wrapper \
       --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
@ -1721,8 +1706,6 @@ elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
  test_python
  test_aten
  test_xpu_bin
-elif [[ "${TEST_CONFIG}" == smoke ]]; then
-  test_python_smoke
 else
  install_torchvision
  install_monkeytype
--- a/.ci/pytorch/windows/build_pytorch.bat
+++ b/.ci/pytorch/windows/build_pytorch.bat
@ -1,7 +1,7 @@
@echo off

-:: This script parses args, installs required libraries (MKL, Magma, libuv)
-:: and then delegates to cpu.bat, cuda80.bat, etc.
+:: This script parses args, installs required libraries (miniconda, MKL,
+:: Magma), and then delegates to cpu.bat, cuda80.bat, etc.

 if not "%CUDA_VERSION%" == "" if not "%PYTORCH_BUILD_VERSION%" == "" if not "%PYTORCH_BUILD_NUMBER%" == "" goto env_end
 if "%~1"=="" goto arg_error
@ -36,18 +36,32 @ set DESIRED_PYTHON_PREFIX=py%DESIRED_PYTHON_PREFIX:;=;py%
 set SRC_DIR=%~dp0
 pushd %SRC_DIR%

+:: Install Miniconda3
+set "CONDA_HOME=%CD%\conda"
+set "tmp_conda=%CONDA_HOME%"
+set "miniconda_exe=%CD%\miniconda.exe"
+rmdir /s /q conda
+del miniconda.exe
+curl --retry 3 -k https://repo.anaconda.com/miniconda/Miniconda3-py311_23.9.0-0-Windows-x86_64.exe -o "%miniconda_exe%"
+:: Stop here if the download failed instead of launching a missing or partial installer
+if ERRORLEVEL 1 exit /b 1
+start /wait "" "%miniconda_exe%" /S /InstallationType=JustMe /RegisterPython=0 /AddToPath=0 /D=%tmp_conda%
+if ERRORLEVEL 1 exit /b 1
 set "ORIG_PATH=%PATH%"
+set "PATH=%CONDA_HOME%;%CONDA_HOME%\scripts;%CONDA_HOME%\Library\bin;%PATH%"

-:: setup build environment
+:: create a new conda environment and install packages
 :try
 SET /A tries=3
 :loop
 IF %tries% LEQ 0 GOTO :exception
-call setup_build.bat
+call condaenv.bat
 IF %ERRORLEVEL% EQU 0 GOTO :done
 SET /A "tries=%tries%-1"
+:: Jump back so the remaining attempts are actually used before giving up
+GOTO :loop
 :exception
-echo "Failed to setup build environment"
+echo "Failed to create conda env"
 exit /B 1
 :done

@ -63,7 +73,7 @@ if "%DEBUG%" == "1" (
 if not "%CUDA_VERSION%" == "cpu" if not "%CUDA_VERSION%" == "xpu" (
    rmdir /s /q magma_%CUDA_PREFIX%_%BUILD_TYPE%
    del magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z
-    curl -k https://s3.amazonaws.com/ossci-windows/magma_%MAGMA_VERSION%_%CUDA_PREFIX%_%BUILD_TYPE%.7z -o magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z %= @lint-ignore =%
+    curl -k https://s3.amazonaws.com/ossci-windows/magma_%MAGMA_VERSION%_%CUDA_PREFIX%_%BUILD_TYPE%.7z -o magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z
    7z x -aoa magma_%CUDA_PREFIX%_%BUILD_TYPE%.7z -omagma_%CUDA_PREFIX%_%BUILD_TYPE%
 )

@ -97,20 +107,19 @@ set TH_BINARY_BUILD=1
 set INSTALL_TEST=0

 for %%v in (%DESIRED_PYTHON_PREFIX%) do (
-
-    :: Set Environment vars for the build
-    set "CMAKE_PREFIX_PATH=%CD%\Python\Library\;%PATH%"
-    set "PYTHON_LIB_PATH=%CD%\Python\Library\bin"
-
+    :: Activate Python Environment
+    set PYTHON_PREFIX=%%v
+    set "CONDA_LIB_PATH=%CONDA_HOME%\envs\%%v\Library\bin"
    if not "%ADDITIONAL_PATH%" == "" (
-        set "PATH=%ADDITIONAL_PATH%;%PATH%"
+        set "PATH=%ADDITIONAL_PATH%;%CONDA_HOME%\envs\%%v;%CONDA_HOME%\envs\%%v\scripts;%CONDA_HOME%\envs\%%v\Library\bin;%ORIG_PATH%"
+    ) else (
+        set "PATH=%CONDA_HOME%\envs\%%v;%CONDA_HOME%\envs\%%v\scripts;%CONDA_HOME%\envs\%%v\Library\bin;%ORIG_PATH%"
    )
-
    pip install ninja
    @setlocal
    :: Set Flags
    if not "%CUDA_VERSION%"=="cpu" if not "%CUDA_VERSION%" == "xpu" (
-        set "MAGMA_HOME=%cd%\magma_%CUDA_PREFIX%_%BUILD_TYPE%"
+        set MAGMA_HOME=%cd%\magma_%CUDA_PREFIX%_%BUILD_TYPE%
    )
    echo "Calling arch build script"
    call %CUDA_PREFIX%.bat
--- a/.ci/pytorch/windows/condaenv.bat
+++ b/.ci/pytorch/windows/condaenv.bat
@ -0,0 +1,37 @
+:: Creates one conda environment per version listed in DESIRED_PYTHON, named
+:: "py" plus the version with its dots removed, and installs the build packages
+:: into it. Afterwards installs libuv with conda and sets libuv_ROOT to the
+:: Library folder under CONDA_HOME. Any failed step ends the script with exit
+:: code 1 so that a caller checking ERRORLEVEL can react.
+IF "%DESIRED_PYTHON%"=="" (
+    echo DESIRED_PYTHON is NOT defined.
+    exit /b 1
+)
+
+:: Create a new conda environment
+setlocal EnableDelayedExpansion
+FOR %%v IN (%DESIRED_PYTHON%) DO (
+    set PYTHON_VERSION_STR=%%v
+    set PYTHON_VERSION_STR=!PYTHON_VERSION_STR:.=!
+    call conda remove -n py!PYTHON_VERSION_STR! --all -y || rmdir /s /q "%CONDA_HOME%\envs\py!PYTHON_VERSION_STR!"
+    if "%%v" == "3.9" call conda create -n py!PYTHON_VERSION_STR! -y numpy=2.0.1 boto3 cmake ninja typing_extensions setuptools=72.1.0 python=%%v
+    if "%%v" == "3.10" call conda create -n py!PYTHON_VERSION_STR! -y -c=conda-forge numpy=2.0.1  boto3 cmake ninja typing_extensions setuptools=72.1.0 python=%%v
+    if "%%v" == "3.11" call conda create -n py!PYTHON_VERSION_STR! -y -c=conda-forge numpy=2.0.1  boto3 cmake ninja typing_extensions setuptools=72.1.0 python=%%v
+    if "%%v" == "3.12" call conda create -n py!PYTHON_VERSION_STR! -y -c=conda-forge numpy=2.0.1  boto3 cmake ninja typing_extensions setuptools=72.1.0 python=%%v
+    if "%%v" == "3.13" call conda create -n py!PYTHON_VERSION_STR! -y -c=conda-forge numpy=2.1.2  boto3 cmake ninja typing_extensions setuptools=72.1.0 python=%%v
+    if "%%v" == "3.13t" call conda create -n py!PYTHON_VERSION_STR! -y -c=conda-forge numpy=2.1.2 boto3 cmake ninja typing_extensions setuptools=72.1.0 python-freethreading python=3.13
+    if errorlevel 1 exit /b 1
+    call conda run -n py!PYTHON_VERSION_STR! pip install pyyaml
+    if errorlevel 1 exit /b 1
+    call conda run -n py!PYTHON_VERSION_STR! pip install mkl-include
+    if errorlevel 1 exit /b 1
+    call conda run -n py!PYTHON_VERSION_STR! pip install mkl-static
+    if errorlevel 1 exit /b 1
+)
+endlocal
+
+:: Install libuv
+call conda install -y -q -c conda-forge libuv=1.39
+if errorlevel 1 exit /b 1
+set libuv_ROOT=%CONDA_HOME%\Library
+echo libuv_ROOT=%libuv_ROOT%
--- a/.ci/pytorch/windows/cuda124.bat
+++ b/.ci/pytorch/windows/cuda124.bat
@ -37,7 +37,7 @@ IF "%CUDA_PATH_V124%"=="" (
 )

 IF "%BUILD_VISION%" == "" (
-    set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0
+    set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0
    set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
 ) ELSE (
    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90
--- a/.ci/pytorch/windows/cuda126.bat
+++ b/.ci/pytorch/windows/cuda126.bat
@ -37,7 +37,7 @@ IF "%CUDA_PATH_V126%"=="" (
 )

 IF "%BUILD_VISION%" == "" (
-    set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0
+    set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0
    set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
 ) ELSE (
    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90
--- a/.ci/pytorch/windows/cuda128.bat
+++ b/.ci/pytorch/windows/cuda128.bat
@ -37,7 +37,7 @@ IF "%CUDA_PATH_V128%"=="" (
 )

 IF "%BUILD_VISION%" == "" (
-    set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0
+    set TORCH_CUDA_ARCH_LIST=5.0;6.0;6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0
    set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
 ) ELSE (
    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
--- a/.ci/pytorch/windows/internal/copy.bat
+++ b/.ci/pytorch/windows/internal/copy.bat
@ -10,7 +10,7 @@ copy "%CUDA_PATH%\bin\nvrtc*64_*.dll*" pytorch\torch\lib
 copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib

 copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" pytorch\torch\lib
-copy "%PYTHON_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib
+copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib

 :: Should be set in build_pytorch.bat
 copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib
--- a/.ci/pytorch/windows/internal/copy_cpu.bat
+++ b/.ci/pytorch/windows/internal/copy_cpu.bat
@ -1,3 +1,3 @@
-copy "%PYTHON_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib
+copy "%CONDA_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib
 :: Should be set in build_pytorch.bat
-copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib
+copy "%libuv_ROOT%\bin\uv.dll" pytorch\torch\lib
--- a/.ci/pytorch/windows/internal/env_fix.bat
+++ b/.ci/pytorch/windows/internal/env_fix.bat
@ -0,0 +1,38 @@
+@echo off
+
+:: Locates Visual Studio with vswhere, calls its vcvarsall.bat for x86_amd64 and then renames every link.exe found by "where" to link.exe.bak.
+:: Caution: Please don't use this script locally. It may destroy your build environment.
+
+setlocal
+
+if not exist "%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" (
+    echo Visual Studio %VC_YEAR% C++ BuildTools is required to compile PyTorch on Windows
+    exit /b 1
+)
+
+set VC_VERSION_LOWER=17
+set VC_VERSION_UPPER=18
+if "%VC_YEAR%" == "2019" (
+    set VC_VERSION_LOWER=16
+    set VC_VERSION_UPPER=17
+)
+
+for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do (
+    if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
+        set "VS15INSTALLDIR=%%i"
+        set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat"
+        goto vswhere
+    )
+)
+
+:vswhere
+
+if "%VS15VCVARSALL%"=="" (
+    echo Visual Studio %VC_YEAR% C++ BuildTools is required to compile PyTorch on Windows
+    exit /b 1
+)
+
+call "%VS15VCVARSALL%" x86_amd64
+for /f "usebackq tokens=*" %%i in (`where link.exe`) do move "%%i" "%%i.bak"
+
+endlocal
--- a/.ci/pytorch/windows/internal/install_python.bat
+++ b/.ci/pytorch/windows/internal/install_python.bat
@ -1,20 +0,0 @@
-set ADDITIONAL_OPTIONS=""
-set PYTHON_EXEC="python"
-if "%DESIRED_PYTHON%" == "3.13t" (
-    echo Python version is set to 3.13t
-    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
-    set ADDITIONAL_OPTIONS="Include_freethreaded=1"
-    set PYTHON_EXEC="python3.13t"
-) else (
-    echo DESIRED_PYTHON not defined, Python version is set to %DESIRED_PYTHON%
-    set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/%DESIRED_PYTHON%.0/python-%DESIRED_PYTHON%.0-amd64.exe" %= @lint-ignore =%
-)
-
-del python-amd64.exe
-curl --retry 3 -kL "%PYTHON_INSTALLER_URL%" --output python-amd64.exe
-if errorlevel 1 exit /b 1
-
-start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_test=0 %ADDITIONAL_OPTIONS% TargetDir=%CD%\Python
-if errorlevel 1 exit /b 1
-
-set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%"
--- a/.ci/pytorch/windows/internal/setup.bat
+++ b/.ci/pytorch/windows/internal/setup.bat
@ -51,7 +51,7 @@ mkdir libtorch\test

 mkdir build
 pushd build
-%PYTHON_EXEC% ../tools/build_libtorch.py
+python ../tools/build_libtorch.py
 popd

 IF ERRORLEVEL 1 exit /b 1
@ -86,7 +86,7 @@ copy /Y "%LIBTORCH_PREFIX%-%PYTORCH_BUILD_VERSION%.zip" "%PYTORCH_FINAL_PACKAGE_
 goto build_end

 :pytorch
-%PYTHON_EXEC% setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%"
+python setup.py bdist_wheel -d "%PYTORCH_FINAL_PACKAGE_DIR%"

 :build_end
 IF ERRORLEVEL 1 exit /b 1
--- a/.ci/pytorch/windows/internal/smoke_test.bat
+++ b/.ci/pytorch/windows/internal/smoke_test.bat
@ -35,8 +35,39 @ exit /b 1
 :wheel
 echo "install wheel package"

-call "internal\install_python.bat"
+:: Pick the python.org installer that matches DESIRED_PYTHON
+set PYTHON_INSTALLER_URL=
+if "%DESIRED_PYTHON%" == "3.13t" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
+if "%DESIRED_PYTHON%" == "3.13" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
+if "%DESIRED_PYTHON%" == "3.12" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.12.0/python-3.12.0-amd64.exe"
+if "%DESIRED_PYTHON%" == "3.11" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.11.0/python-3.11.0-amd64.exe"
+if "%DESIRED_PYTHON%" == "3.10" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.10.0/python-3.10.0-amd64.exe"
+if "%DESIRED_PYTHON%" == "3.9" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.9.0/python-3.9.0-amd64.exe"
+if "%PYTHON_INSTALLER_URL%" == "" (
+    echo Python %DESIRED_PYTHON% not supported yet
+    exit /b 1
+)

+set ADDITIONAL_OPTIONS=""
+set PYTHON_EXEC="python"
+if "%DESIRED_PYTHON%" == "3.13t" (
+    set ADDITIONAL_OPTIONS="Include_freethreaded=1"
+    set PYTHON_EXEC="python3.13t"
+)
+
+del python-amd64.exe
+curl --retry 3 -kL "%PYTHON_INSTALLER_URL%" --output python-amd64.exe
+if errorlevel 1 exit /b 1
+
+:: According to https://docs.python.org/3/using/windows.html, setting PrependPath to 1 will prepend
+:: the installed Python to PATH system-wide. Even calling set PATH=%ORIG_PATH% later on won't make
+:: a change. As the builder directory will be removed after the smoke test, all subsequent non-binary
+:: jobs will fail to find any Python executable there
+start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_test=0 %ADDITIONAL_OPTIONS% TargetDir=%CD%\Python
+if errorlevel 1 exit /b 1
+
+:: The installer targets the Python folder under the current directory, so expose that folder and its Scripts folder
+set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%"
 if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install --pre numpy==2.2.1 protobuf
 if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install --pre numpy==2.1.2 protobuf
 if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install --pre numpy==2.0.2 protobuf
@ -53,7 +81,7 @@ if "%PYTORCH_BUILD_VERSION:dev=%" NEQ "%PYTORCH_BUILD_VERSION%" (
 )

 set "EXTRA_INDEX= "
-if "%CUDA_VERSION%" == "xpu" set "EXTRA_INDEX=--index-url https://download.pytorch.org/whl/%CHANNEL%/xpu"  %= @lint-ignore =%
+if "%CUDA_VERSION%" == "xpu" set "EXTRA_INDEX=--index-url https://download.pytorch.org/whl/%CHANNEL%/xpu"

 for /F "delims=" %%i in ('where /R "%PYTORCH_FINAL_PACKAGE_DIR:/=\%" *.whl') do %PYTHON_EXEC% -m pip install "%%i" %EXTRA_INDEX%
 if errorlevel 1 exit /b 1
@ -99,7 +127,6 @@ goto end
 :libtorch
 echo "install and test libtorch"

-if "%VC_YEAR%" == "2019" powershell internal\vs2019_install.ps1
 if "%VC_YEAR%" == "2022" powershell internal\vs2022_install.ps1

 if ERRORLEVEL 1 exit /b 1
@ -111,10 +138,6 @@ pushd tmp\libtorch

 set VC_VERSION_LOWER=17
 set VC_VERSION_UPPER=18
-IF "%VC_YEAR%" == "2019" (
-    set VC_VERSION_LOWER=16
-    set VC_VERSION_UPPER=17
-)

 for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do (
    if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
--- a/.ci/pytorch/windows/internal/static_lib_test.bat
+++ b/.ci/pytorch/windows/internal/static_lib_test.bat
@ -70,7 +70,6 @@ echo "install and test libtorch"
 pip install cmake
 echo "installing cmake"

-if "%VC_YEAR%" == "2019" powershell internal\vs2019_install.ps1
 if "%VC_YEAR%" == "2022" powershell internal\vs2022_install.ps1

 if ERRORLEVEL 1 exit /b 1
@ -83,10 +82,6 @@ pushd tmp\libtorch

 set VC_VERSION_LOWER=17
 set VC_VERSION_UPPER=18
-IF "%VC_YEAR%" == "2019" (
-    set VC_VERSION_LOWER=16
-    set VC_VERSION_UPPER=17
-)

 for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do (
    if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
--- a/.ci/pytorch/windows/internal/vc_install_helper.bat
+++ b/.ci/pytorch/windows/internal/vc_install_helper.bat
@ -1,12 +1,8 @@
-if "%VC_YEAR%" == "2019" powershell windows/internal/vs2019_install.ps1
 if "%VC_YEAR%" == "2022" powershell windows/internal/vs2022_install.ps1

 set VC_VERSION_LOWER=17
 set VC_VERSION_UPPER=18
-if "%VC_YEAR%" == "2019" (
-    set VC_VERSION_LOWER=16
-    set VC_VERSION_UPPER=17
-)
+

 for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe"  -products Microsoft.VisualStudio.Product.BuildTools -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do (
    if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
--- a/.ci/pytorch/windows/internal/vs2019_install.ps1
+++ b/.ci/pytorch/windows/internal/vs2019_install.ps1
@ -1,48 +0,0 @@
-# https://developercommunity.visualstudio.com/t/install-specific-version-of-vs-component/1142479
-# https://docs.microsoft.com/en-us/visualstudio/releases/2019/history#release-dates-and-build-numbers
-
-# 16.8.6 BuildTools
-$VS_DOWNLOAD_LINK = "https://ossci-windows.s3.us-east-1.amazonaws.com/vs16.8.6_BuildTools.exe"
-$COLLECT_DOWNLOAD_LINK = "https://aka.ms/vscollect.exe"
-$VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools",
-                                                     "--add Microsoft.Component.MSBuild",
-                                                     "--add Microsoft.VisualStudio.Component.Roslyn.Compiler",
-                                                     "--add Microsoft.VisualStudio.Component.TextTemplating",
-                                                     "--add Microsoft.VisualStudio.Component.VC.CoreIde",
-                                                     "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest",
-                                                     "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core",
-                                                     "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64",
-                                                     "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Win81")
-
-curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe
-if ($LASTEXITCODE -ne 0) {
-    echo "Download of the VS 2019 Version 16.8.5 installer failed"
-    exit 1
-}
-
-if (Test-Path "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe") {
-    $existingPath = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -products "Microsoft.VisualStudio.Product.BuildTools" -version "[16, 17)" -property installationPath
-    if ($existingPath -ne $null) {
-        if (!${env:CIRCLECI}) {
-            echo "Found correctly versioned existing BuildTools installation in $existingPath"
-            exit 0
-        }
-        echo "Found existing BuildTools installation in $existingPath, keeping it"
-    }
-}
-
-$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru
-Remove-Item -Path vs_installer.exe -Force
-$exitCode = $process.ExitCode
-if (($exitCode -ne 0) -and ($exitCode -ne 3010)) {
-    echo "VS 2019 installer exited with code $exitCode, which should be one of [0, 3010]."
-    curl.exe --retry 3 -kL $COLLECT_DOWNLOAD_LINK --output Collect.exe
-    if ($LASTEXITCODE -ne 0) {
-        echo "Download of the VS Collect tool failed."
-        exit 1
-    }
-    Start-Process "${PWD}\Collect.exe" -NoNewWindow -Wait -PassThru
-    New-Item -Path "C:\w\build-results" -ItemType "directory" -Force
-    Copy-Item -Path "C:\Users\${env:USERNAME}\AppData\Local\Temp\vslogs.zip" -Destination "C:\w\build-results\"
-    exit 1
-}
--- a/.ci/pytorch/windows/internal/vs_install.bat
+++ b/.ci/pytorch/windows/internal/vs_install.bat
@ -0,0 +1,33 @
+@echo off
+
+:: Downloads the Visual Studio Build Tools bootstrapper and runs it unattended.
+:: With VS_LATEST set to 1 only the VCTools workload is requested; otherwise the
+:: 14.34 toolset and the components listed below are added as well, and
+:: VSDEVCMD_ARGS is set to -vcvars_ver=14.34.
+set VS_DOWNLOAD_LINK=https://download.visualstudio.microsoft.com/download/pr/8f480125-28b8-4a2c-847c-c2b02a8cdd1b/64be21d4ada005d7d07896ed0b004c322409bd04d6e8eba4c03c9fa39c928e7a/vs_BuildTools.exe
+IF "%VS_LATEST%" == "1" (
+   set VS_INSTALL_ARGS= --nocache --norestart --quiet --wait --add Microsoft.VisualStudio.Workload.VCTools
+   set VSDEVCMD_ARGS=
+) ELSE (
+   set VS_INSTALL_ARGS=--nocache --quiet --wait --add Microsoft.VisualStudio.Workload.VCTools ^
+                                                --add Microsoft.VisualStudio.Component.VC.Tools.14.34 ^
+                                                --add Microsoft.Component.MSBuild ^
+                                                --add Microsoft.VisualStudio.Component.Roslyn.Compiler ^
+                                                --add Microsoft.VisualStudio.Component.TextTemplating ^
+                                                --add Microsoft.VisualStudio.Component.VC.CoreIde ^
+                                                --add Microsoft.VisualStudio.Component.VC.Redist.14.Latest ^
+                                                --add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core ^
+                                                --add Microsoft.VisualStudio.Component.VC.Tools.x86.x64 ^
+                                                --add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Win81
+   set VSDEVCMD_ARGS=-vcvars_ver=14.34
+)
+
+:: --fail makes curl report HTTP errors, so an error page is never saved and launched as the installer
+curl --fail --retry 3 -k -L %VS_DOWNLOAD_LINK% --output vs_installer.exe
+if errorlevel 1 exit /b 1
+
+:: Only exit codes 0 and 3010 are accepted; negative codes, 1 to 3009, and 3011 or above abort with 1
+start /wait .\vs_installer.exe %VS_INSTALL_ARGS%
+if not errorlevel 0 exit /b 1
+if errorlevel 1 if not errorlevel 3010 exit /b 1
+if errorlevel 3011 exit /b 1
--- a/.ci/pytorch/windows/setup_build.bat
+++ b/.ci/pytorch/windows/setup_build.bat
@ -1,27 +0,0 @@
-IF "%DESIRED_PYTHON%"=="" (
-    echo DESIRED_PYTHON is NOT defined.
-    exit /b 1
-)
-
-call "internal\install_python.bat"
-
-%PYTHON_EXEC% --version
-set "PATH=%CD%\Python\Lib\site-packages\cmake\data\bin;%CD%\Python\Scripts;%CD%\Python;%PATH%"
-if "%DESIRED_PYTHON%" == "3.13t" %PYTHON_EXEC% -m pip install numpy==2.2.1 cmake
-if "%DESIRED_PYTHON%" == "3.13" %PYTHON_EXEC% -m pip install numpy==2.1.2 cmake
-if "%DESIRED_PYTHON%" == "3.12" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake
-if "%DESIRED_PYTHON%" == "3.11" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake
-if "%DESIRED_PYTHON%" == "3.10" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake
-if "%DESIRED_PYTHON%" == "3.9" %PYTHON_EXEC% -m pip install numpy==2.0.2 cmake
-
-%PYTHON_EXEC% -m pip install pyyaml
-%PYTHON_EXEC% -m pip install mkl-include mkl-static
-%PYTHON_EXEC% -m pip install boto3 ninja typing_extensions setuptools==72.1.0
-
-where cmake.exe
-
-:: Install libuv
-curl -k https://s3.amazonaws.com/ossci-windows/libuv-1.40.0-h8ffe710_0.tar.bz2 -o libuv-1.40.0-h8ffe710_0.tar.bz2
-7z x -aoa libuv-1.40.0-h8ffe710_0.tar.bz2
-tar -xvf libuv-1.40.0-h8ffe710_0.tar -C %CD%\Python\
-set libuv_ROOT=%CD%\Python\Library
--- a/.circleci/scripts/binary_ios_build.sh
+++ b/.circleci/scripts/binary_ios_build.sh
@ -0,0 +1,52 @
+#!/bin/bash
+set -ex -o pipefail
+
+# Builds the iOS libraries: installs Miniconda and the build dependencies,
+# syncs submodules, runs scripts/build_ios.sh and copies build_ios/install
+# to ${WORKSPACE}/ios/${IOS_ARCH}.
+echo ""
+echo "DIR: $(pwd)"
+WORKSPACE=/Users/distiller/workspace
+PROJ_ROOT=/Users/distiller/project
+export TCLLIBPATH="/usr/local/lib"
+
+# Install conda
+curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-MacOSX-x86_64.sh
+chmod +x ~/conda.sh
+/bin/bash ~/conda.sh -b -p ~/anaconda
+# Use $HOME here: a tilde inside double quotes is not expanded, which would
+# leave a literal "~" entry in PATH.
+export PATH="${HOME}/anaconda/bin:${PATH}"
+source ~/anaconda/bin/activate
+
+# Install dependencies
+conda install numpy ninja pyyaml mkl mkl-include setuptools cmake requests typing-extensions --yes
+conda install -c conda-forge valgrind --yes
+export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname "$(which conda)")/../"}
+
+# sync submodules
+cd "${PROJ_ROOT}"
+git submodule sync
+git submodule update --init --recursive
+
+# run build script
+chmod a+x "${PROJ_ROOT}/scripts/build_ios.sh"
+echo "########################################################"
+cat "${PROJ_ROOT}/scripts/build_ios.sh"
+echo "########################################################"
+echo "IOS_ARCH: ${IOS_ARCH}"
+echo "IOS_PLATFORM: ${IOS_PLATFORM}"
+echo "USE_PYTORCH_METAL: ${USE_PYTORCH_METAL}"
+echo "USE_COREML_DELEGATE: ${USE_COREML_DELEGATE}"
+export IOS_ARCH=${IOS_ARCH}
+export IOS_PLATFORM=${IOS_PLATFORM}
+export USE_PYTORCH_METAL=${USE_PYTORCH_METAL}
+export USE_COREML_DELEGATE=${USE_COREML_DELEGATE}
+unbuffer "${PROJ_ROOT}/scripts/build_ios.sh" 2>&1 | ts
+
+#store the binary
+cd "${WORKSPACE}"
+DEST_DIR=${WORKSPACE}/ios
+mkdir -p "${DEST_DIR}"
+cp -R "${PROJ_ROOT}/build_ios/install" "${DEST_DIR}"
+mv "${DEST_DIR}/install" "${DEST_DIR}/${IOS_ARCH}"
--- a/.circleci/scripts/binary_ios_test.sh
+++ b/.circleci/scripts/binary_ios_test.sh
@ -0,0 +1,19 @@
+#!/bin/bash
+set -ex -o pipefail
+
+if ! [ "$IOS_PLATFORM" == "SIMULATOR" ]; then
+    exit 0
+fi
+
+echo ""
+echo "DIR: $(pwd)"
+PROJ_ROOT=/Users/distiller/project
+cd ${PROJ_ROOT}/ios/TestApp
+# install fastlane
+sudo gem install bundler && bundle install
+# run the ruby build script
+if ! [ -x "$(command -v xcodebuild)" ]; then
+    echo 'Error: xcodebuild is not installed.'
+    exit 1
+fi
+ruby ${PROJ_ROOT}/scripts/xcode_build.rb -i ${PROJ_ROOT}/build_ios/install -x ${PROJ_ROOT}/ios/TestApp/TestApp.xcodeproj -p ${IOS_PLATFORM}
--- a/.circleci/scripts/binary_ios_upload.sh
+++ b/.circleci/scripts/binary_ios_upload.sh
@ -0,0 +1,75 @@
+#!/bin/bash
+set -ex -o pipefail
+
+# Merges the per-architecture iOS static libraries into fat archives, zips them
+# with the headers and license, uploads the zip to S3 and, for lite-interpreter
+# builds, pushes a freshly generated nightly podspec to CocoaPods.
+echo ""
+echo "DIR: $(pwd)"
+WORKSPACE=/Users/distiller/workspace
+PROJ_ROOT=/Users/distiller/project
+ARTIFACTS_DIR=${WORKSPACE}/ios
+ls "${ARTIFACTS_DIR}"
+ZIP_DIR=${WORKSPACE}/zip
+mkdir -p "${ZIP_DIR}/install/lib"
+mkdir -p "${ZIP_DIR}/src"
+# copy header files
+cp -R "${ARTIFACTS_DIR}/arm64/include" "${ZIP_DIR}/install/"
+# build a FAT binary: only libraries present for both architectures are merged
+cd "${ZIP_DIR}/install/lib"
+target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a libmicrokernels-prod.a)
+for lib in "${target_libs[@]}"
+do
+    if [ -f "${ARTIFACTS_DIR}/x86_64/lib/${lib}" ] && [ -f "${ARTIFACTS_DIR}/arm64/lib/${lib}" ]; then
+        libs=("${ARTIFACTS_DIR}/x86_64/lib/${lib}" "${ARTIFACTS_DIR}/arm64/lib/${lib}")
+        lipo -create "${libs[@]}" -o "${ZIP_DIR}/install/lib/${lib}"
+    fi
+done
+lipo -i "${ZIP_DIR}/install/lib/"*.a
+echo "BUILD_LITE_INTERPRETER: ${BUILD_LITE_INTERPRETER}"
+# copy the umbrella header and license
+if [ "${BUILD_LITE_INTERPRETER}" == "1" ]; then
+    cp "${PROJ_ROOT}/ios/LibTorch-Lite.h" "${ZIP_DIR}/src/"
+else
+    cp "${PROJ_ROOT}/ios/LibTorch.h" "${ZIP_DIR}/src/"
+fi
+cp "${PROJ_ROOT}/LICENSE" "${ZIP_DIR}/"
+# zip the library
+export DATE="$(date -u +%Y%m%d)"
+export IOS_NIGHTLY_BUILD_VERSION="2.2.0.${DATE}"
+if [ "${BUILD_LITE_INTERPRETER}" == "1" ]; then
+    # e.g. libtorch_lite_ios_nightly_2.2.0.20210810.zip
+    ZIPFILE="libtorch_lite_ios_nightly_${IOS_NIGHTLY_BUILD_VERSION}.zip"
+else
+    ZIPFILE="libtorch_ios_nightly_build.zip"
+fi
+cd "${ZIP_DIR}"
+# version.txt is written for testing; the redirection creates the file, so no touch is needed
+echo "${IOS_NIGHTLY_BUILD_VERSION}" > version.txt
+zip -r "${ZIPFILE}" install src version.txt LICENSE
+# upload to aws
+# Install conda then 'conda install' awscli (PATH uses ${HOME}: "~" is not expanded inside double quotes)
+curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-MacOSX-x86_64.sh
+chmod +x ~/conda.sh
+/bin/bash ~/conda.sh -b -p ~/anaconda
+export PATH="${HOME}/anaconda/bin:${PATH}"
+source ~/anaconda/bin/activate
+conda install -c conda-forge awscli --yes
+# Tracing stays off from here on so the credentials below are never written to the job log.
+set +x
+export AWS_ACCESS_KEY_ID=${AWS_S3_ACCESS_KEY_FOR_PYTORCH_BINARY_UPLOAD}
+export AWS_SECRET_ACCESS_KEY=${AWS_S3_ACCESS_SECRET_FOR_PYTORCH_BINARY_UPLOAD}
+aws s3 cp "${ZIPFILE}" s3://ossci-ios-build/ --acl public-read
+
+if [ "${BUILD_LITE_INTERPRETER}" == "1" ]; then
+    # create a new LibTorch-Lite-Nightly.podspec from the template
+    echo "cp ${PROJ_ROOT}/ios/LibTorch-Lite-Nightly.podspec.template ${PROJ_ROOT}/ios/LibTorch-Lite-Nightly.podspec"
+    cp "${PROJ_ROOT}/ios/LibTorch-Lite-Nightly.podspec.template" "${PROJ_ROOT}/ios/LibTorch-Lite-Nightly.podspec"
+
+    # update pod version (BSD sed: -i takes an explicit, here empty, backup suffix)
+    sed -i '' -e "s/IOS_NIGHTLY_BUILD_VERSION/${IOS_NIGHTLY_BUILD_VERSION}/g" "${PROJ_ROOT}/ios/LibTorch-Lite-Nightly.podspec"
+    cat "${PROJ_ROOT}/ios/LibTorch-Lite-Nightly.podspec"
+
+    # push the new LibTorch-Lite-Nightly.podspec to CocoaPods
+    pod trunk push --verbose --allow-warnings --use-libraries --skip-import-validation "${PROJ_ROOT}/ios/LibTorch-Lite-Nightly.podspec"
+fi
--- a/.circleci/scripts/binary_windows_build.sh
+++ b/.circleci/scripts/binary_windows_build.sh
@ -9,11 +9,10 @@ if [[ "$OS" != "windows-arm64" ]]; then
    export USE_SCCACHE=1
    export SCCACHE_BUCKET=ossci-compiler-cache
    export SCCACHE_IGNORE_SERVER_IO_ERROR=1
-    export VC_YEAR=2019
+    export VC_YEAR=2022
 fi

 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
-    export VC_YEAR=2022
    export USE_SCCACHE=0
    export XPU_VERSION=2025.0
    export XPU_ENABLE_KINETO=1
--- a/.circleci/scripts/binary_windows_test.sh
+++ b/.circleci/scripts/binary_windows_test.sh
@ -4,10 +4,9 @@ set -eux -o pipefail
 source "${BINARY_ENV_FILE:-/c/w/env}"

 export CUDA_VERSION="${DESIRED_CUDA/cu/}"
-export VC_YEAR=2019
+export VC_YEAR=2022

 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
-    export VC_YEAR=2022
    export XPU_VERSION=2025.0
 fi

--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@ -1,51 +1,34 @@
-FROM mcr.microsoft.com/vscode/devcontainers/base:ubuntu-22.04
+FROM mcr.microsoft.com/vscode/devcontainers/miniconda:0-3

-# Tools needed for development
-RUN apt-get -y update && \
-    apt-get install -y \
-    build-essential \
-    cmake \
-    ninja-build \
-    git \
-    python3 \
-    python3-pip \
-    python3-dev \
-    python3-venv \
-    libopenblas-dev
+#  I am surprised this is needed
+RUN conda init
+
+# Copy environment.yml (if found) to a temp location so we update the environment. The
+# trailing "*" keeps COPY from failing when no environment.yml exists; "noop.txt" ensures at least one file is copied.
+COPY .devcontainer/cuda/environment.yml* .devcontainer/noop.txt /tmp/conda-tmp/
+RUN if [ -f "/tmp/conda-tmp/environment.yml" ]; then umask 0002 && /opt/conda/bin/conda env update -n base -f /tmp/conda-tmp/environment.yml; fi \
+    && sudo rm -rf /tmp/conda-tmp

 # Tools needed for llvm
-RUN apt-get install --no-install-recommends -y lsb-release wget software-properties-common gnupg && \
-    sudo apt-get clean -y
-
-# Create Python virtual environment
-# RUN python3 -m venv /opt/venv
-# ENV PATH="/opt/venv/bin:$PATH"
-RUN pip3 install --upgrade pip
+# update and install share one layer so a cached package index is never paired with a changed package list
+RUN sudo apt-get -y update && \
+    sudo apt-get install -y lsb-release wget software-properties-common gnupg

 # Install CLANG if version is specified
+# The apt steps are chained with && so a failed update or clang install is not silently skipped over.
 ARG CLANG_VERSION
 RUN if [ -n "$CLANG_VERSION" ]; then \
-    wget https://apt.llvm.org/llvm.sh; \
+    sudo wget https://apt.llvm.org/llvm.sh; \
    chmod +x llvm.sh; \
-    ./llvm.sh "${CLANG_VERSION}"; \
+    sudo ./llvm.sh "${CLANG_VERSION}"; \
    echo 'export CC=clang' >> ~/.bashrc; \
    echo 'export CXX=clang++' >> ~/.bashrc; \
-    apt-get install --no-install-recommends -y clang libomp-dev && \
-    apt-get clean -y; \
+    sudo apt-get update && \
+    sudo apt-get install -y clang && \
+    sudo apt-get install -y libomp-dev; \
    fi


-# Install CUDA if version is specified
+# Install cuda if version is specified
 ARG CUDA_VERSION
 RUN if [ -n "$CUDA_VERSION" ]; then \
-    CUDA_REPO_VERSION=$(echo ${CUDA_VERSION} | sed 's/\./\-/g'); \
-    wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb && \
-    dpkg -i cuda-keyring_1.0-1_all.deb && \
-    apt-get install --no-install-recommends -y cuda-toolkit-${CUDA_VERSION} && \
-    apt-get clean -y; \
+       conda install -y cuda -c "nvidia/label/cuda-${CUDA_VERSION}"; \
    fi
-
-# Set PATH for CUDA
-ENV PATH="/usr/local/cuda/bin:${PATH}"
-ENV LD_LIBRARY_PATH="/usr/local/cuda/lib64:${LD_LIBRARY_PATH}"
-ENV PIP_BREAK_SYSTEM_PACKAGES=1
--- a/.devcontainer/cpu/devcontainer.json
+++ b/.devcontainer/cpu/devcontainer.json
@ -3,7 +3,7 @@
 {
  "name": "PyTorch - CPU",
  "build": {
-    "context": "./",
+    "context": "../..",
    "dockerfile": "../Dockerfile",
    "args": {
      "USERNAME": "vscode",
@ -11,12 +11,6 @@
      "CLANG_VERSION": ""
    }
  },
-  // Mount the full repo only after the container starts
-  "workspaceMount": "source=${localWorkspaceFolder},target=/workspace/pytorch,type=bind,consistency=cached",
-  "workspaceFolder": "/workspace/pytorch",
-  "containerEnv": {
-    "PIP_USER": "0" // <‑‑ disable implicit --user
-  },

  // Features to add to the dev container. More info: https://containers.dev/features.
  "features": {
--- a/.devcontainer/cpu/environment.yml
+++ b/.devcontainer/cpu/environment.yml
@ -0,0 +1,6 @@
+# This environment is specific to Debian
+name: PyTorch
+dependencies:
+  - cmake
+  - ninja
+  - libopenblas
--- a/.devcontainer/cuda/devcontainer.json
+++ b/.devcontainer/cuda/devcontainer.json
@ -3,22 +3,16 @@
 {
  "name": "PyTorch - CUDA",
  "build": {
-    "context": "./",
+    "context": "../..",
    "dockerfile": "../Dockerfile",
    "args": {
      "USERNAME": "vscode",
      "BUILDKIT_INLINE_CACHE": "0",
-      "CUDA_VERSION": "12.8.0",
+      "CUDA_VERSION": "11.8.0",
      "CLANG_VERSION": ""
    }
  },
-  "runArgs": ["--runtime", "nvidia", "--gpus", "all"],
-  // Mount the full repo only after the container starts
-  "workspaceMount": "source=${localWorkspaceFolder},target=/workspace/pytorch,type=bind,consistency=cached",
-  "workspaceFolder": "/workspace/pytorch",
-  "containerEnv": {
-    "PIP_USER": "0" // <‑‑ disable implicit --user
-  },
+  "runArgs": ["--gpus", "all"],
 // Use 'forwardPorts' to make a list of ports inside the container available locally.
  // "forwardPorts": [],

--- a/.devcontainer/cuda/environment.yml
+++ b/.devcontainer/cuda/environment.yml
@ -0,0 +1,6 @@
+# This environment is specific to Debian
+name: PyTorch
+dependencies:
+  - cmake
+  - ninja
+  - libopenblas
--- a/.devcontainer/cuda/requirements.txt
+++ b/.devcontainer/cuda/requirements.txt
@ -1,2 +0,0 @@
-cmake
-ninja
--- a/.devcontainer/noop.txt
+++ b/.devcontainer/noop.txt
@ -0,0 +1,3 @@
+This file is copied into the container along with environment.yml* from the parent
+folder. This file is included to prevent the Dockerfile COPY instruction from
+failing if no environment.yml is found.
--- a/.devcontainer/scripts/install-dev-tools.sh
+++ b/.devcontainer/scripts/install-dev-tools.sh
@ -8,6 +8,6 @@ git submodule update --init --recursive
 make setup-lint

 # Add CMAKE_PREFIX_PATH to bashrc
-echo 'export CMAKE_PREFIX_PATH=/usr/local' >> ~/.bashrc
+echo 'export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}' >> ~/.bashrc
 # Add linker path so that cuda-related libraries can be found
-echo 'export LDFLAGS="-L/usr/local/cuda/lib64/ $LDFLAGS"' >> ~/.bashrc
+echo 'export LDFLAGS="-L${CONDA_PREFIX}/lib/ $LDFLAGS"' >> ~/.bashrc
--- a/.flake8
+++ b/.flake8
@ -19,8 +19,6 @@ ignore =
    G100,G101,G200
    # these ignores are from flake8-simplify. please fix or ignore with commented reason
    SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
-    # SIM104 is already covered by pyupgrade ruff
-    SIM104,
    # flake8-simplify code styles
    SIM102,SIM103,SIM106,SIM112,
    # TorchFix codes that don't make sense for PyTorch itself:
--- a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md
+++ b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md
@ -5,7 +5,7 @@ title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]"
 labels: "module: ci"
 ---

-> For example, DISABLED pull / win-vs2019-cpu-py3 / test (default). Once
+> For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once
 > created, the job will be disabled within 15 minutes. You can check the
 > list of disabled jobs at https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json

--- a/.github/actions/check-old-whl/action.yml
+++ b/.github/actions/check-old-whl/action.yml
@ -1,31 +0,0 @@
-name: Reuse old wheel if possible
-
-description:
-  Reuse old wheel if possible
-
-inputs:
-  build-environment:
-    description: Build environment
-    required: true
-  workflow-id:
-    description: Workflow ID
-    required: true
-
-outputs:
-  changes:
-    description: Whether the wheel is reused or not
-    value: ${{ steps.check-file-changes.outputs.changes }}
-
-runs:
-  using: composite
-
-  steps:
-    # Check out pytorch with fetch depth 0
-    - name: Check file changes
-      id: check-file-changes
-      run: |
-        set -x
-        python .github/actions/check-old-whl/check_old_whl.py --build-environment "${{ inputs.build-environment }}" --workflow-id "${{ inputs.workflow-id }}"
-        if [ $? -ne 0 ]; then
-          echo "changes=true" >> "$GITHUB_OUTPUT"
-        fi
--- a/.github/actions/check-old-whl/reuse_old_whl.py
+++ b/.github/actions/check-old-whl/reuse_old_whl.py
@ -1,145 +0,0 @@
-from functools import lru_cache
-import os
-import subprocess
-import sys
-from pathlib import Path
-from typing import Optional
-import argparse
-import zipfile
-
-import requests
-
-@lru_cache
-def get_merge_base() -> str:
-    merge_base = subprocess.check_output(
-        ["git", "merge-base", "HEAD", "origin/main"],
-        text=True,
-        stderr=subprocess.DEVNULL,
-    ).strip()
-    return merge_base
-
-def ok_changed_file(file: str) -> bool:
-    if file.startswith("torch/") and file.endswith(".py") and not file.startswith("torch/csrc/"):
-        return True
-    if file.startswith("test/") and file.endswith(".py"):
-        return True
-    return False
-
-def check_changed_files():
-    merge_base = get_merge_base()
-    changed_files = subprocess.check_output(
-        ["git", "diff", "--name-only", merge_base, "HEAD"],
-        text=True,
-        stderr=subprocess.DEVNULL,
-    ).strip().split()
-
-    for file in changed_files:
-        if not ok_changed_file(file):
-            print(f"File {file} is not allowed to be changed.")
-            return False
-        else:
-            print(f"File {file} is allowed to be changed.")
-    return True
-
-def query_github_api(url: str) -> dict:
-    headers = {
-        "Accept": "application/vnd.github.v3+json",
-        "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
-    }
-    response = requests.get(url, headers=headers)
-    return response.json()
-
-def find_old_whl(workflow_id: str, build_environment: str) -> bool:
-    if build_environment is None:
-        print("BUILD_ENVIRONMENT is not set.")
-        return False
-    merge_base = get_merge_base()
-
-    workflow_runs = query_github_api(
-        f"https://api.github.com/repos/pytorch/pytorch/actions/workflows/{workflow_id}/runs?head_sha={merge_base}&branch=main&status=completed&per_page=100"
-    )
-    if workflow_runs.get("total_count", 0) == 0:
-        print("No workflow runs found.")
-        return False
-    for run in workflow_runs.get("workflow_runs", []):
-        # Look in s3 for the old whl
-        run_id = run["id"]
-        try:
-            url = f"https://gha-artifacts.s3.amazonaws.com/pytorch/pytorch/{run_id}/{build_environment}/artifacts.zip"
-            response = requests.get(
-                url,
-            )
-            if response.status_code == 200:
-                os.makedirs("/tmp", exist_ok=True)
-                with open("/tmp/artifacts.zip", "wb") as f:
-                    f.write(response.content)
-                    print(f"Found old whl file from s3: {url}")
-                    return True
-        except requests.RequestException as e:
-            print(f"Error checking for old whl: {e}")
-            continue
-    return False
-
-
-def unzip_artifact_and_replace_files():
-    # Unzip the artifact and replace files
-    with zipfile.ZipFile("/tmp/artifacts.zip", "r") as zip_ref:
-        zip_ref.extractall("/tmp/artifacts")
-
-    # Rename wheel into zip
-    wheel_path = Path("/tmp/artifacts/dist").glob("*.whl")
-    print(wheel_path)
-    for path in wheel_path:
-        new_path = path.with_suffix(".zip")
-        os.rename(path, new_path)
-        print(f"Renamed {path} to {new_path}")
-        # Unzip the wheel
-        with zipfile.ZipFile(new_path, "r") as zip_ref:
-            print(f"Extracting {new_path} to /tmp/artifacts/dist/{new_path.stem}")
-            zip_ref.extractall(f"/tmp/artifacts/dist/{new_path.stem}")
-
-        # Copy python files into the artifact
-        subprocess.check_output(
-            ["rsync", "-avz", "torch", f"/tmp/artifacts/dist/{new_path.stem}/torch"],
-        )
-
-        # Zip the wheel back
-        with zipfile.ZipFile(new_path, "w") as zip_ref:
-            for root, _, files in os.walk(f"/tmp/artifacts/dist/{new_path.stem}"):
-                for file in files:
-                    file_path = os.path.join(root, file)
-                    zip_ref.write(file_path, os.path.relpath(file_path, f"/tmp/artifacts/dist/{new_path.stem}"))
-
-        # Reame back to whl
-        os.rename(new_path, path)
-
-        # Remove the extracted folder
-        subprocess.check_output(
-            ["rm", "-rf", f"/tmp/artifacts/dist/{new_path.stem}"],
-        )
-
-    # Rezip the artifact
-    with zipfile.ZipFile("/tmp/artifacts.zip", "w") as zip_ref:
-        for root, _, files in os.walk("/tmp/artifacts"):
-            for file in files:
-                file_path = os.path.join(root, file)
-                zip_ref.write(file_path, os.path.relpath(file_path, "/tmp/artifacts"))
-    # move artifact to the current directory
-    os.rename("/tmp/artifacts.zip", "artifacts.zip")
-
-    return None
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Check for old whl files.")
-    parser.add_argument("--workflow-id", type=str, required=True, help="Workflow ID")
-    parser.add_argument("--build-environment", type=str, required=True, help="Build environment")
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = parse_args()
-    can_use_old_whl = check_changed_files()
-    if not find_old_whl(args.workflow_id, args.build_environment):
-        exit(1)
-    unzip_artifact_and_replace_files()
--- a/.github/actions/setup-rocm/action.yml
+++ b/.github/actions/setup-rocm/action.yml
@ -5,12 +5,6 @@ description: Set up ROCm host for CI
 runs:
  using: composite
  steps:
-    - name: Runner ROCm version
-      if: always()
-      shell: bash
-      run: |
-        dpkg -l | grep -E "  rocm"
-
    - name: Stop all running docker containers
      if: always()
      shell: bash
--- a/.github/actions/upload-sccache-stats/action.yml
+++ b/.github/actions/upload-sccache-stats/action.yml
@ -22,3 +22,18 @@ runs:
        retention-days: 14
        if-no-files-found: warn
        path: sccache-stats-*.json
+
+    - name: Format sccache stats
+      shell: bash
+      run: |
+        python3 -m tools.stats.sccache_stats_to_benchmark_format
+      env:
+        BUILD_TIME: ${{ inputs.build-time }}
+
+    - name: Upload sccache stats as benchmark
+      uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+      with:
+        benchmark-results-dir: test/test-reports
+        dry-run: false
+        schema-version: v3
+        github-token: ${{ inputs.github-token }}
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-2f78c953bb4a81d269c7d4b7b36e218a1f090fab
+bccaa454a54c3c648697cc2f46a4fb0500b1f01b
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-8d9e34b352af09c81ff8df448fd27f9c4aae1382
+ac9a39f4b768cef09b9d2be8e074be496d7783b6
--- a/.github/label_to_label.yml
+++ b/.github/label_to_label.yml
@ -42,7 +42,7 @@
  - "module: aotinductor"
  - "module: cudagraphs"
  - "oncall: export"
-  - "module: compile-time"
+  - "module: startup-tracing-compile"
  - "module: compiled autograd"
  - "module: flex attention"
  - "module: dynamic shapes"
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -27,8 +27,6 @@ ciflow_push_tags:
 - ciflow/torchbench
 - ciflow/autoformat
 - ciflow/op-benchmark
- ciflow/pull
- ciflow/h100
 retryable_workflows:
 - pull
 - trunk
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-# ruff: noqa: LOG015

 import json
 import logging
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Joona Havukainen	918fe1d358	Halves time spent in generating the key strings	2025-04-22 11:32:36 -07:00
Joona Havukainen	2f6940cc55	Adding a direct MPS kernel path to linear op and MPS kernel caching mechanism for improved perf.	2025-04-22 11:32:34 -07:00