mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-28 10:34:54 +08:00
Compare commits
4 Commits
cslpull92
...
xmfan/ca_a
| Author | SHA1 | Date | |
|---|---|---|---|
| bbf5ebbd4b | |||
| 463913e679 | |||
| 643f57a782 | |||
| f13bfd8d87 |
@ -1,4 +1,4 @@
|
|||||||
# Docker images for GitHub CI and CD
|
# Docker images for GitHub CI
|
||||||
|
|
||||||
This directory contains everything needed to build the Docker images
|
This directory contains everything needed to build the Docker images
|
||||||
that are used in our CI.
|
that are used in our CI.
|
||||||
@ -12,7 +12,7 @@ each image as the `BUILD_ENVIRONMENT` environment variable.
|
|||||||
|
|
||||||
See `build.sh` for valid build environments (it's the giant switch).
|
See `build.sh` for valid build environments (it's the giant switch).
|
||||||
|
|
||||||
## Docker CI builds
|
## Contents
|
||||||
|
|
||||||
* `build.sh` -- dispatch script to launch all builds
|
* `build.sh` -- dispatch script to launch all builds
|
||||||
* `common` -- scripts used to execute individual Docker build stages
|
* `common` -- scripts used to execute individual Docker build stages
|
||||||
@ -21,12 +21,6 @@ See `build.sh` for valid build environments (it's the giant switch).
|
|||||||
* `ubuntu-rocm` -- Dockerfile for Ubuntu image with ROCm support
|
* `ubuntu-rocm` -- Dockerfile for Ubuntu image with ROCm support
|
||||||
* `ubuntu-xpu` -- Dockerfile for Ubuntu image with XPU support
|
* `ubuntu-xpu` -- Dockerfile for Ubuntu image with XPU support
|
||||||
|
|
||||||
### Docker CD builds
|
|
||||||
|
|
||||||
* `conda` - Dockerfile and build.sh to build Docker images used in nightly conda builds
|
|
||||||
* `manywheel` - Dockerfile and build.sh to build Docker images used in nightly manywheel builds
|
|
||||||
* `libtorch` - Dockerfile and build.sh to build Docker images used in nightly libtorch builds
|
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
|
|||||||
@ -1,5 +0,0 @@
|
|||||||
0.7b
|
|
||||||
manylinux_2_17
|
|
||||||
rocm6.2
|
|
||||||
9be04068c3c0857a4cfd17d7e39e71d0423ebac2
|
|
||||||
3e9e1959d23b93d78a08fcc5f868125dc3854dece32fd9458be9ef4467982291
|
|
||||||
@ -84,30 +84,16 @@ fi
|
|||||||
# CMake 3.18 is needed to support CUDA17 language variant
|
# CMake 3.18 is needed to support CUDA17 language variant
|
||||||
CMAKE_VERSION=3.18.5
|
CMAKE_VERSION=3.18.5
|
||||||
|
|
||||||
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
|
_UCX_COMMIT=00bcc6bb18fc282eb160623b4c0d300147f579af
|
||||||
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
|
_UCC_COMMIT=7cb07a76ccedad7e56ceb136b865eb9319c258ea
|
||||||
|
|
||||||
# It's annoying to rename jobs every time you want to rewrite a
|
# It's annoying to rename jobs every time you want to rewrite a
|
||||||
# configuration, so we hardcode everything here rather than do it
|
# configuration, so we hardcode everything here rather than do it
|
||||||
# from scratch
|
# from scratch
|
||||||
case "$image" in
|
case "$image" in
|
||||||
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
|
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
|
||||||
CUDA_VERSION=12.4.1
|
|
||||||
CUDNN_VERSION=9
|
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
|
||||||
GCC_VERSION=9
|
|
||||||
PROTOBUF=yes
|
|
||||||
DB=yes
|
|
||||||
VISION=yes
|
|
||||||
KATEX=yes
|
|
||||||
UCX_COMMIT=${_UCX_COMMIT}
|
|
||||||
UCC_COMMIT=${_UCC_COMMIT}
|
|
||||||
CONDA_CMAKE=yes
|
|
||||||
TRITON=yes
|
|
||||||
;;
|
|
||||||
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
|
|
||||||
CUDA_VERSION=12.1.1
|
CUDA_VERSION=12.1.1
|
||||||
CUDNN_VERSION=9
|
CUDNN_VERSION=8
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
ANACONDA_PYTHON_VERSION=3.10
|
||||||
GCC_VERSION=9
|
GCC_VERSION=9
|
||||||
PROTOBUF=yes
|
PROTOBUF=yes
|
||||||
@ -119,24 +105,9 @@ case "$image" in
|
|||||||
CONDA_CMAKE=yes
|
CONDA_CMAKE=yes
|
||||||
TRITON=yes
|
TRITON=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks)
|
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks)
|
||||||
CUDA_VERSION=12.4.1
|
|
||||||
CUDNN_VERSION=9
|
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
|
||||||
GCC_VERSION=9
|
|
||||||
PROTOBUF=yes
|
|
||||||
DB=yes
|
|
||||||
VISION=yes
|
|
||||||
KATEX=yes
|
|
||||||
UCX_COMMIT=${_UCX_COMMIT}
|
|
||||||
UCC_COMMIT=${_UCC_COMMIT}
|
|
||||||
CONDA_CMAKE=yes
|
|
||||||
TRITON=yes
|
|
||||||
INDUCTOR_BENCHMARKS=yes
|
|
||||||
;;
|
|
||||||
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks)
|
|
||||||
CUDA_VERSION=12.1.1
|
CUDA_VERSION=12.1.1
|
||||||
CUDNN_VERSION=9
|
CUDNN_VERSION=8
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
ANACONDA_PYTHON_VERSION=3.10
|
||||||
GCC_VERSION=9
|
GCC_VERSION=9
|
||||||
PROTOBUF=yes
|
PROTOBUF=yes
|
||||||
@ -149,39 +120,9 @@ case "$image" in
|
|||||||
TRITON=yes
|
TRITON=yes
|
||||||
INDUCTOR_BENCHMARKS=yes
|
INDUCTOR_BENCHMARKS=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks)
|
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9)
|
||||||
CUDA_VERSION=12.1.1
|
|
||||||
CUDNN_VERSION=9
|
|
||||||
ANACONDA_PYTHON_VERSION=3.12
|
|
||||||
GCC_VERSION=9
|
|
||||||
PROTOBUF=yes
|
|
||||||
DB=yes
|
|
||||||
VISION=yes
|
|
||||||
KATEX=yes
|
|
||||||
UCX_COMMIT=${_UCX_COMMIT}
|
|
||||||
UCC_COMMIT=${_UCC_COMMIT}
|
|
||||||
CONDA_CMAKE=yes
|
|
||||||
TRITON=yes
|
|
||||||
INDUCTOR_BENCHMARKS=yes
|
|
||||||
;;
|
|
||||||
pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks)
|
|
||||||
CUDA_VERSION=12.4.1
|
|
||||||
CUDNN_VERSION=9
|
|
||||||
ANACONDA_PYTHON_VERSION=3.12
|
|
||||||
GCC_VERSION=9
|
|
||||||
PROTOBUF=yes
|
|
||||||
DB=yes
|
|
||||||
VISION=yes
|
|
||||||
KATEX=yes
|
|
||||||
UCX_COMMIT=${_UCX_COMMIT}
|
|
||||||
UCC_COMMIT=${_UCC_COMMIT}
|
|
||||||
CONDA_CMAKE=yes
|
|
||||||
TRITON=yes
|
|
||||||
INDUCTOR_BENCHMARKS=yes
|
|
||||||
;;
|
|
||||||
pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
|
|
||||||
CUDA_VERSION=11.8.0
|
CUDA_VERSION=11.8.0
|
||||||
CUDNN_VERSION=9
|
CUDNN_VERSION=8
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
ANACONDA_PYTHON_VERSION=3.10
|
||||||
GCC_VERSION=9
|
GCC_VERSION=9
|
||||||
PROTOBUF=yes
|
PROTOBUF=yes
|
||||||
@ -193,37 +134,9 @@ case "$image" in
|
|||||||
CONDA_CMAKE=yes
|
CONDA_CMAKE=yes
|
||||||
TRITON=yes
|
TRITON=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
|
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
|
||||||
CUDA_VERSION=12.4.1
|
|
||||||
CUDNN_VERSION=9
|
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
|
||||||
GCC_VERSION=9
|
|
||||||
PROTOBUF=yes
|
|
||||||
DB=yes
|
|
||||||
VISION=yes
|
|
||||||
KATEX=yes
|
|
||||||
UCX_COMMIT=${_UCX_COMMIT}
|
|
||||||
UCC_COMMIT=${_UCC_COMMIT}
|
|
||||||
CONDA_CMAKE=yes
|
|
||||||
TRITON=yes
|
|
||||||
;;
|
|
||||||
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
|
|
||||||
CUDA_VERSION=12.1.1
|
CUDA_VERSION=12.1.1
|
||||||
CUDNN_VERSION=9
|
CUDNN_VERSION=8
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
|
||||||
GCC_VERSION=9
|
|
||||||
PROTOBUF=yes
|
|
||||||
DB=yes
|
|
||||||
VISION=yes
|
|
||||||
KATEX=yes
|
|
||||||
UCX_COMMIT=${_UCX_COMMIT}
|
|
||||||
UCC_COMMIT=${_UCC_COMMIT}
|
|
||||||
CONDA_CMAKE=yes
|
|
||||||
TRITON=yes
|
|
||||||
;;
|
|
||||||
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
|
|
||||||
CUDA_VERSION=12.4.1
|
|
||||||
CUDNN_VERSION=9
|
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
ANACONDA_PYTHON_VERSION=3.10
|
||||||
GCC_VERSION=9
|
GCC_VERSION=9
|
||||||
PROTOBUF=yes
|
PROTOBUF=yes
|
||||||
@ -236,7 +149,7 @@ case "$image" in
|
|||||||
TRITON=yes
|
TRITON=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-focal-py3-clang10-onnx)
|
pytorch-linux-focal-py3-clang10-onnx)
|
||||||
ANACONDA_PYTHON_VERSION=3.9
|
ANACONDA_PYTHON_VERSION=3.8
|
||||||
CLANG_VERSION=10
|
CLANG_VERSION=10
|
||||||
PROTOBUF=yes
|
PROTOBUF=yes
|
||||||
DB=yes
|
DB=yes
|
||||||
@ -245,7 +158,7 @@ case "$image" in
|
|||||||
ONNX=yes
|
ONNX=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-focal-py3-clang9-android-ndk-r21e)
|
pytorch-linux-focal-py3-clang9-android-ndk-r21e)
|
||||||
ANACONDA_PYTHON_VERSION=3.9
|
ANACONDA_PYTHON_VERSION=3.8
|
||||||
CLANG_VERSION=9
|
CLANG_VERSION=9
|
||||||
LLVMDEV=yes
|
LLVMDEV=yes
|
||||||
PROTOBUF=yes
|
PROTOBUF=yes
|
||||||
@ -254,8 +167,8 @@ case "$image" in
|
|||||||
GRADLE_VERSION=6.8.3
|
GRADLE_VERSION=6.8.3
|
||||||
NINJA_VERSION=1.9.0
|
NINJA_VERSION=1.9.0
|
||||||
;;
|
;;
|
||||||
pytorch-linux-focal-py3.9-clang10)
|
pytorch-linux-focal-py3.8-clang10)
|
||||||
ANACONDA_PYTHON_VERSION=3.9
|
ANACONDA_PYTHON_VERSION=3.8
|
||||||
CLANG_VERSION=10
|
CLANG_VERSION=10
|
||||||
PROTOBUF=yes
|
PROTOBUF=yes
|
||||||
DB=yes
|
DB=yes
|
||||||
@ -276,8 +189,8 @@ case "$image" in
|
|||||||
CONDA_CMAKE=yes
|
CONDA_CMAKE=yes
|
||||||
TRITON=yes
|
TRITON=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-focal-py3.9-gcc9)
|
pytorch-linux-focal-py3.8-gcc9)
|
||||||
ANACONDA_PYTHON_VERSION=3.9
|
ANACONDA_PYTHON_VERSION=3.8
|
||||||
GCC_VERSION=9
|
GCC_VERSION=9
|
||||||
PROTOBUF=yes
|
PROTOBUF=yes
|
||||||
DB=yes
|
DB=yes
|
||||||
@ -286,7 +199,18 @@ case "$image" in
|
|||||||
TRITON=yes
|
TRITON=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-focal-rocm-n-1-py3)
|
pytorch-linux-focal-rocm-n-1-py3)
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
ANACONDA_PYTHON_VERSION=3.8
|
||||||
|
GCC_VERSION=9
|
||||||
|
PROTOBUF=yes
|
||||||
|
DB=yes
|
||||||
|
VISION=yes
|
||||||
|
ROCM_VERSION=6.0
|
||||||
|
NINJA_VERSION=1.9.0
|
||||||
|
CONDA_CMAKE=yes
|
||||||
|
TRITON=yes
|
||||||
|
;;
|
||||||
|
pytorch-linux-focal-rocm-n-py3)
|
||||||
|
ANACONDA_PYTHON_VERSION=3.8
|
||||||
GCC_VERSION=9
|
GCC_VERSION=9
|
||||||
PROTOBUF=yes
|
PROTOBUF=yes
|
||||||
DB=yes
|
DB=yes
|
||||||
@ -296,30 +220,19 @@ case "$image" in
|
|||||||
CONDA_CMAKE=yes
|
CONDA_CMAKE=yes
|
||||||
TRITON=yes
|
TRITON=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-focal-rocm-n-py3)
|
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
|
||||||
GCC_VERSION=9
|
|
||||||
PROTOBUF=yes
|
|
||||||
DB=yes
|
|
||||||
VISION=yes
|
|
||||||
ROCM_VERSION=6.2
|
|
||||||
NINJA_VERSION=1.9.0
|
|
||||||
CONDA_CMAKE=yes
|
|
||||||
TRITON=yes
|
|
||||||
;;
|
|
||||||
pytorch-linux-jammy-xpu-2024.0-py3)
|
pytorch-linux-jammy-xpu-2024.0-py3)
|
||||||
ANACONDA_PYTHON_VERSION=3.9
|
ANACONDA_PYTHON_VERSION=3.8
|
||||||
GCC_VERSION=11
|
GCC_VERSION=11
|
||||||
PROTOBUF=yes
|
PROTOBUF=yes
|
||||||
DB=yes
|
DB=yes
|
||||||
VISION=yes
|
VISION=yes
|
||||||
XPU_VERSION=0.5
|
BASEKIT_VERSION=2024.0.0-49522
|
||||||
NINJA_VERSION=1.9.0
|
NINJA_VERSION=1.9.0
|
||||||
CONDA_CMAKE=yes
|
CONDA_CMAKE=yes
|
||||||
TRITON=yes
|
TRITON=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
|
pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks)
|
||||||
ANACONDA_PYTHON_VERSION=3.9
|
ANACONDA_PYTHON_VERSION=3.8
|
||||||
GCC_VERSION=11
|
GCC_VERSION=11
|
||||||
PROTOBUF=yes
|
PROTOBUF=yes
|
||||||
DB=yes
|
DB=yes
|
||||||
@ -330,10 +243,10 @@ case "$image" in
|
|||||||
DOCS=yes
|
DOCS=yes
|
||||||
INDUCTOR_BENCHMARKS=yes
|
INDUCTOR_BENCHMARKS=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-clang12)
|
pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12)
|
||||||
ANACONDA_PYTHON_VERSION=3.9
|
ANACONDA_PYTHON_VERSION=3.8
|
||||||
CUDA_VERSION=11.8
|
CUDA_VERSION=11.8
|
||||||
CUDNN_VERSION=9
|
CUDNN_VERSION=8
|
||||||
CLANG_VERSION=12
|
CLANG_VERSION=12
|
||||||
PROTOBUF=yes
|
PROTOBUF=yes
|
||||||
DB=yes
|
DB=yes
|
||||||
@ -355,8 +268,8 @@ case "$image" in
|
|||||||
CONDA_CMAKE=yes
|
CONDA_CMAKE=yes
|
||||||
VISION=yes
|
VISION=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-py3.9-gcc11)
|
pytorch-linux-jammy-py3.8-gcc11)
|
||||||
ANACONDA_PYTHON_VERSION=3.9
|
ANACONDA_PYTHON_VERSION=3.8
|
||||||
GCC_VERSION=11
|
GCC_VERSION=11
|
||||||
PROTOBUF=yes
|
PROTOBUF=yes
|
||||||
DB=yes
|
DB=yes
|
||||||
@ -373,14 +286,6 @@ case "$image" in
|
|||||||
CONDA_CMAKE=yes
|
CONDA_CMAKE=yes
|
||||||
EXECUTORCH=yes
|
EXECUTORCH=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-py3.12-halide)
|
|
||||||
CUDA_VERSION=12.4
|
|
||||||
ANACONDA_PYTHON_VERSION=3.12
|
|
||||||
GCC_VERSION=11
|
|
||||||
CONDA_CMAKE=yes
|
|
||||||
HALIDE=yes
|
|
||||||
TRITON=yes
|
|
||||||
;;
|
|
||||||
pytorch-linux-focal-linter)
|
pytorch-linux-focal-linter)
|
||||||
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
|
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
|
||||||
# We will need to update mypy version eventually, but that's for another day. The task
|
# We will need to update mypy version eventually, but that's for another day. The task
|
||||||
@ -388,7 +293,7 @@ case "$image" in
|
|||||||
ANACONDA_PYTHON_VERSION=3.9
|
ANACONDA_PYTHON_VERSION=3.9
|
||||||
CONDA_CMAKE=yes
|
CONDA_CMAKE=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
|
pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter)
|
||||||
ANACONDA_PYTHON_VERSION=3.9
|
ANACONDA_PYTHON_VERSION=3.9
|
||||||
CUDA_VERSION=11.8
|
CUDA_VERSION=11.8
|
||||||
CONDA_CMAKE=yes
|
CONDA_CMAKE=yes
|
||||||
@ -408,22 +313,6 @@ case "$image" in
|
|||||||
# from pytorch/llvm:9.0.1 is x86 specific
|
# from pytorch/llvm:9.0.1 is x86 specific
|
||||||
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
||||||
;;
|
;;
|
||||||
pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks)
|
|
||||||
ANACONDA_PYTHON_VERSION=3.10
|
|
||||||
GCC_VERSION=11
|
|
||||||
ACL=yes
|
|
||||||
PROTOBUF=yes
|
|
||||||
DB=yes
|
|
||||||
VISION=yes
|
|
||||||
CONDA_CMAKE=yes
|
|
||||||
# snadampal: skipping sccache due to the following issue
|
|
||||||
# https://github.com/pytorch/pytorch/issues/121559
|
|
||||||
SKIP_SCCACHE_INSTALL=yes
|
|
||||||
# snadampal: skipping llvm src build install because the current version
|
|
||||||
# from pytorch/llvm:9.0.1 is x86 specific
|
|
||||||
SKIP_LLVM_SRC_BUILD_INSTALL=yes
|
|
||||||
INDUCTOR_BENCHMARKS=yes
|
|
||||||
;;
|
|
||||||
*)
|
*)
|
||||||
# Catch-all for builds that are not hardcoded.
|
# Catch-all for builds that are not hardcoded.
|
||||||
PROTOBUF=yes
|
PROTOBUF=yes
|
||||||
@ -471,7 +360,7 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
|
|||||||
#when using cudnn version 8 install it separately from cuda
|
#when using cudnn version 8 install it separately from cuda
|
||||||
if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
|
if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
|
||||||
IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
|
IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
|
||||||
if [[ ${CUDNN_VERSION} == 9 ]]; then
|
if [[ ${CUDNN_VERSION} == 8 ]]; then
|
||||||
IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
|
IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
@ -514,8 +403,7 @@ docker build \
|
|||||||
--build-arg "DOCS=${DOCS}" \
|
--build-arg "DOCS=${DOCS}" \
|
||||||
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
|
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
|
||||||
--build-arg "EXECUTORCH=${EXECUTORCH}" \
|
--build-arg "EXECUTORCH=${EXECUTORCH}" \
|
||||||
--build-arg "HALIDE=${HALIDE}" \
|
--build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \
|
||||||
--build-arg "XPU_VERSION=${XPU_VERSION}" \
|
|
||||||
--build-arg "ACL=${ACL:-}" \
|
--build-arg "ACL=${ACL:-}" \
|
||||||
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
|
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
|
||||||
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
|
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
|
||||||
@ -524,7 +412,7 @@ docker build \
|
|||||||
"$@" \
|
"$@" \
|
||||||
.
|
.
|
||||||
|
|
||||||
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
|
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
|
||||||
# for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
|
# for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
|
||||||
# find the correct image. As a result, here we have to replace the
|
# find the correct image. As a result, here we have to replace the
|
||||||
# "$UBUNTU_VERSION" == "18.04-rc"
|
# "$UBUNTU_VERSION" == "18.04-rc"
|
||||||
|
|||||||
@ -77,9 +77,6 @@ RUN rm install_rocm.sh
|
|||||||
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
|
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
|
||||||
RUN bash ./install_rocm_magma.sh
|
RUN bash ./install_rocm_magma.sh
|
||||||
RUN rm install_rocm_magma.sh
|
RUN rm install_rocm_magma.sh
|
||||||
COPY ./common/install_amdsmi.sh install_amdsmi.sh
|
|
||||||
RUN bash ./install_amdsmi.sh
|
|
||||||
RUN rm install_amdsmi.sh
|
|
||||||
ENV PATH /opt/rocm/bin:$PATH
|
ENV PATH /opt/rocm/bin:$PATH
|
||||||
ENV PATH /opt/rocm/hcc/bin:$PATH
|
ENV PATH /opt/rocm/hcc/bin:$PATH
|
||||||
ENV PATH /opt/rocm/hip/bin:$PATH
|
ENV PATH /opt/rocm/hip/bin:$PATH
|
||||||
@ -108,17 +105,10 @@ ENV CMAKE_C_COMPILER cc
|
|||||||
ENV CMAKE_CXX_COMPILER c++
|
ENV CMAKE_CXX_COMPILER c++
|
||||||
COPY ./common/install_triton.sh install_triton.sh
|
COPY ./common/install_triton.sh install_triton.sh
|
||||||
COPY ./common/common_utils.sh common_utils.sh
|
COPY ./common/common_utils.sh common_utils.sh
|
||||||
COPY ci_commit_pins/triton.txt triton.txt
|
COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
|
||||||
COPY triton_version.txt triton_version.txt
|
COPY triton_version.txt triton_version.txt
|
||||||
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
|
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
|
||||||
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
|
RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
|
||||||
|
|
||||||
# Install AOTriton (Early fail)
|
|
||||||
COPY ./aotriton_version.txt aotriton_version.txt
|
|
||||||
COPY ./common/common_utils.sh common_utils.sh
|
|
||||||
COPY ./common/install_aotriton.sh install_aotriton.sh
|
|
||||||
RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
|
|
||||||
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
|
|
||||||
|
|
||||||
# Install ccache/sccache (do this last, so we get priority in PATH)
|
# Install ccache/sccache (do this last, so we get priority in PATH)
|
||||||
COPY ./common/install_cache.sh install_cache.sh
|
COPY ./common/install_cache.sh install_cache.sh
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
cd1c833b079adb324871dcbbe75b43d42ffc0ade
|
d4b3e5cc607e97afdba79dc90f8ef968142f347c
|
||||||
|
|||||||
@ -1 +0,0 @@
|
|||||||
461c12871f336fe6f57b55d6a297f13ef209161b
|
|
||||||
@ -1 +1 @@
|
|||||||
ac3470188b914c5d7a5058a7e28b9eb685a62427
|
730b907b4d45a4713cbc425cbf224c46089fd514
|
||||||
|
|||||||
1
.ci/docker/ci_commit_pins/triton-rocm.txt
Normal file
1
.ci/docker/ci_commit_pins/triton-rocm.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
bbe6246e37d8aa791c67daaf9d9d61b26c9ccfdc
|
||||||
@ -1 +1 @@
|
|||||||
91b14bf5593cf58a8541f3e6b9125600a867d4ef
|
b8c64f64c18d8cac598b3adb355c21e7439c21de
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
5fe38ffd73c2ac6ed6323b554205186696631c6f
|
45fff310c891f5a92d55445adf8cc9d29df5841e
|
||||||
|
|||||||
@ -1,6 +1,6 @@
|
|||||||
set -euo pipefail
|
set -euo pipefail
|
||||||
|
|
||||||
readonly version=v24.04
|
readonly version=v23.08
|
||||||
readonly src_host=https://review.mlplatform.org/ml
|
readonly src_host=https://review.mlplatform.org/ml
|
||||||
readonly src_repo=ComputeLibrary
|
readonly src_repo=ComputeLibrary
|
||||||
|
|
||||||
|
|||||||
@ -1,5 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
cd /opt/rocm/share/amd_smi && pip install .
|
|
||||||
@ -1,23 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
|
|
||||||
|
|
||||||
TARBALL='aotriton.tar.gz'
|
|
||||||
# This read command alwasy returns with exit code 1
|
|
||||||
read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
|
|
||||||
ARCH=$(uname -m)
|
|
||||||
AOTRITON_INSTALL_PREFIX="$1"
|
|
||||||
AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.gz"
|
|
||||||
|
|
||||||
cd "${AOTRITON_INSTALL_PREFIX}"
|
|
||||||
# Must use -L to follow redirects
|
|
||||||
curl -L --retry 3 -o "${TARBALL}" "${AOTRITON_URL}"
|
|
||||||
ACTUAL_SHA256=$(sha256sum "${TARBALL}" | cut -d " " -f 1)
|
|
||||||
if [ "${SHA256}" != "${ACTUAL_SHA256}" ]; then
|
|
||||||
echo -n "Error: The SHA256 of downloaded tarball is ${ACTUAL_SHA256},"
|
|
||||||
echo " which does not match the expected value ${SHA256}."
|
|
||||||
exit
|
|
||||||
fi
|
|
||||||
tar xf "${TARBALL}" && rm -rf "${TARBALL}"
|
|
||||||
@ -3,7 +3,7 @@
|
|||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
install_ubuntu() {
|
install_ubuntu() {
|
||||||
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
|
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
|
||||||
# for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
|
# for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
|
||||||
# find the correct image. As a result, here we have to check for
|
# find the correct image. As a result, here we have to check for
|
||||||
# "$UBUNTU_VERSION" == "18.04"*
|
# "$UBUNTU_VERSION" == "18.04"*
|
||||||
|
|||||||
@ -5,22 +5,32 @@ set -ex
|
|||||||
# Optionally install conda
|
# Optionally install conda
|
||||||
if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
|
if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
|
||||||
BASE_URL="https://repo.anaconda.com/miniconda"
|
BASE_URL="https://repo.anaconda.com/miniconda"
|
||||||
CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh"
|
|
||||||
if [[ $(uname -m) == "aarch64" ]] || [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
|
||||||
BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download"
|
|
||||||
CONDA_FILE="Miniforge3-Linux-$(uname -m).sh"
|
|
||||||
fi
|
|
||||||
|
|
||||||
MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1)
|
MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1)
|
||||||
MINOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 2)
|
MINOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 2)
|
||||||
|
|
||||||
|
if [[ $(uname -m) == "aarch64" ]]; then
|
||||||
|
BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download"
|
||||||
case "$MAJOR_PYTHON_VERSION" in
|
case "$MAJOR_PYTHON_VERSION" in
|
||||||
3);;
|
3)
|
||||||
|
CONDA_FILE="Miniforge3-Linux-aarch64.sh"
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Unsupported ANACONDA_PYTHON_VERSION: $ANACONDA_PYTHON_VERSION"
|
echo "Unsupported ANACONDA_PYTHON_VERSION: $ANACONDA_PYTHON_VERSION"
|
||||||
exit 1
|
exit 1
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
else
|
||||||
|
case "$MAJOR_PYTHON_VERSION" in
|
||||||
|
3)
|
||||||
|
CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh"
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo "Unsupported ANACONDA_PYTHON_VERSION: $ANACONDA_PYTHON_VERSION"
|
||||||
|
exit 1
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
fi
|
||||||
|
|
||||||
mkdir -p /opt/conda
|
mkdir -p /opt/conda
|
||||||
chown jenkins:jenkins /opt/conda
|
chown jenkins:jenkins /opt/conda
|
||||||
@ -68,20 +78,19 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
|
|||||||
CONDA_COMMON_DEPS="astunparse pyyaml setuptools openblas==0.3.25=*openmp* ninja==1.11.1 scons==4.5.2"
|
CONDA_COMMON_DEPS="astunparse pyyaml setuptools openblas==0.3.25=*openmp* ninja==1.11.1 scons==4.5.2"
|
||||||
|
|
||||||
if [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then
|
if [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then
|
||||||
NUMPY_VERSION=1.24.4
|
conda_install numpy=1.24.4 ${CONDA_COMMON_DEPS}
|
||||||
else
|
else
|
||||||
NUMPY_VERSION=1.26.2
|
conda_install numpy=1.26.2 ${CONDA_COMMON_DEPS}
|
||||||
fi
|
fi
|
||||||
else
|
else
|
||||||
CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
|
CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
|
||||||
|
|
||||||
if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.13" ]; then
|
if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then
|
||||||
NUMPY_VERSION=1.26.0
|
conda_install numpy=1.26.0 ${CONDA_COMMON_DEPS}
|
||||||
else
|
else
|
||||||
NUMPY_VERSION=1.21.2
|
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
conda_install ${CONDA_COMMON_DEPS}
|
|
||||||
|
|
||||||
# Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
|
# Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
|
||||||
# and libpython-static for torch deploy
|
# and libpython-static for torch deploy
|
||||||
@ -103,7 +112,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
|
|||||||
|
|
||||||
# Install some other packages, including those needed for Python test reporting
|
# Install some other packages, including those needed for Python test reporting
|
||||||
pip_install -r /opt/conda/requirements-ci.txt
|
pip_install -r /opt/conda/requirements-ci.txt
|
||||||
pip_install numpy=="$NUMPY_VERSION"
|
|
||||||
pip_install -U scikit-learn
|
pip_install -U scikit-learn
|
||||||
|
|
||||||
if [ -n "$DOCS" ]; then
|
if [ -n "$DOCS" ]; then
|
||||||
|
|||||||
@ -1,20 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Anaconda
|
|
||||||
# Latest anaconda is using openssl-3 which is incompatible with all currently published versions of git
|
|
||||||
# Which are using openssl-1.1.1, see https://anaconda.org/anaconda/git/files?version=2.40.1 for example
|
|
||||||
MINICONDA_URL=https://repo.anaconda.com/miniconda/Miniconda3-py311_23.5.2-0-Linux-x86_64.sh
|
|
||||||
wget -q $MINICONDA_URL
|
|
||||||
# NB: Manually invoke bash per https://github.com/conda/conda/issues/10431
|
|
||||||
bash $(basename "$MINICONDA_URL") -b -p /opt/conda
|
|
||||||
rm $(basename "$MINICONDA_URL")
|
|
||||||
export PATH=/opt/conda/bin:$PATH
|
|
||||||
# See https://github.com/pytorch/builder/issues/1473
|
|
||||||
# Pin conda to 23.5.2 as it's the last one compatible with openssl-1.1.1
|
|
||||||
conda install -y conda=23.5.2 conda-build anaconda-client git ninja
|
|
||||||
# The cmake version here needs to match with the minimum version of cmake
|
|
||||||
# supported by PyTorch (3.18). There is only 3.18.2 on anaconda
|
|
||||||
/opt/conda/bin/pip3 install cmake==3.18.2
|
|
||||||
conda remove -y --force patchelf
|
|
||||||
@ -1,112 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
set -uex -o pipefail
|
|
||||||
|
|
||||||
PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python
|
|
||||||
PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads
|
|
||||||
GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py
|
|
||||||
|
|
||||||
# Python versions to be installed in /opt/$VERSION_NO
|
|
||||||
CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"}
|
|
||||||
|
|
||||||
function check_var {
|
|
||||||
if [ -z "$1" ]; then
|
|
||||||
echo "required variable not defined"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
function do_cpython_build {
|
|
||||||
local py_ver=$1
|
|
||||||
local py_folder=$2
|
|
||||||
check_var $py_ver
|
|
||||||
check_var $py_folder
|
|
||||||
tar -xzf Python-$py_ver.tgz
|
|
||||||
|
|
||||||
local additional_flags=""
|
|
||||||
if [ "$py_ver" == "3.13.0t" ]; then
|
|
||||||
additional_flags=" --disable-gil"
|
|
||||||
mv cpython-3.13/ cpython-3.13t/
|
|
||||||
fi
|
|
||||||
|
|
||||||
pushd $py_folder
|
|
||||||
|
|
||||||
local prefix="/opt/_internal/cpython-${py_ver}"
|
|
||||||
mkdir -p ${prefix}/lib
|
|
||||||
if [[ -n $(which patchelf) ]]; then
|
|
||||||
local shared_flags="--enable-shared"
|
|
||||||
else
|
|
||||||
local shared_flags="--disable-shared"
|
|
||||||
fi
|
|
||||||
if [[ -z "${WITH_OPENSSL+x}" ]]; then
|
|
||||||
local openssl_flags=""
|
|
||||||
else
|
|
||||||
local openssl_flags="--with-openssl=${WITH_OPENSSL} --with-openssl-rpath=auto"
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
|
|
||||||
CFLAGS="-Wformat" ./configure --prefix=${prefix} ${openssl_flags} ${shared_flags} ${additional_flags} > /dev/null
|
|
||||||
|
|
||||||
make -j40 > /dev/null
|
|
||||||
make install > /dev/null
|
|
||||||
|
|
||||||
if [[ "${shared_flags}" == "--enable-shared" ]]; then
|
|
||||||
patchelf --set-rpath '$ORIGIN/../lib' ${prefix}/bin/python3
|
|
||||||
fi
|
|
||||||
|
|
||||||
popd
|
|
||||||
rm -rf $py_folder
|
|
||||||
# Some python's install as bin/python3. Make them available as
|
|
||||||
# bin/python.
|
|
||||||
if [ -e ${prefix}/bin/python3 ]; then
|
|
||||||
ln -s python3 ${prefix}/bin/python
|
|
||||||
fi
|
|
||||||
${prefix}/bin/python get-pip.py
|
|
||||||
if [ -e ${prefix}/bin/pip3 ] && [ ! -e ${prefix}/bin/pip ]; then
|
|
||||||
ln -s pip3 ${prefix}/bin/pip
|
|
||||||
fi
|
|
||||||
# install setuptools since python 3.12 is required to use distutils
|
|
||||||
${prefix}/bin/pip install wheel==0.34.2 setuptools==68.2.2
|
|
||||||
local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")
|
|
||||||
ln -s ${prefix} /opt/python/${abi_tag}
|
|
||||||
}
|
|
||||||
|
|
||||||
function build_cpython {
|
|
||||||
local py_ver=$1
|
|
||||||
check_var $py_ver
|
|
||||||
check_var $PYTHON_DOWNLOAD_URL
|
|
||||||
local py_ver_folder=$py_ver
|
|
||||||
|
|
||||||
if [ "$py_ver" = "3.13.0t" ]; then
|
|
||||||
PY_VER_SHORT="3.13"
|
|
||||||
PYT_VER_SHORT="3.13t"
|
|
||||||
check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH
|
|
||||||
wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz
|
|
||||||
do_cpython_build $py_ver cpython-$PYT_VER_SHORT
|
|
||||||
elif [ "$py_ver" = "3.13.0" ]; then
|
|
||||||
PY_VER_SHORT="3.13"
|
|
||||||
check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH
|
|
||||||
wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz
|
|
||||||
do_cpython_build $py_ver cpython-$PY_VER_SHORT
|
|
||||||
else
|
|
||||||
wget -q $PYTHON_DOWNLOAD_URL/$py_ver_folder/Python-$py_ver.tgz
|
|
||||||
do_cpython_build $py_ver Python-$py_ver
|
|
||||||
fi
|
|
||||||
|
|
||||||
rm -f Python-$py_ver.tgz
|
|
||||||
}
|
|
||||||
|
|
||||||
function build_cpythons {
|
|
||||||
check_var $GET_PIP_URL
|
|
||||||
curl -sLO $GET_PIP_URL
|
|
||||||
for py_ver in $@; do
|
|
||||||
build_cpython $py_ver
|
|
||||||
done
|
|
||||||
rm -f get-pip.py
|
|
||||||
}
|
|
||||||
|
|
||||||
mkdir -p /opt/python
|
|
||||||
mkdir -p /opt/_internal
|
|
||||||
build_cpythons $CPYTHON_VERSIONS
|
|
||||||
@ -1,250 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
NCCL_VERSION=v2.21.5-1
|
|
||||||
CUDNN_VERSION=9.1.0.70
|
|
||||||
|
|
||||||
function install_cusparselt_040 {
|
|
||||||
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
|
|
||||||
mkdir tmp_cusparselt && pushd tmp_cusparselt
|
|
||||||
wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
|
|
||||||
tar xf libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
|
|
||||||
cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/include/* /usr/local/cuda/include/
|
|
||||||
cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/lib/* /usr/local/cuda/lib64/
|
|
||||||
popd
|
|
||||||
rm -rf tmp_cusparselt
|
|
||||||
}
|
|
||||||
|
|
||||||
function install_cusparselt_052 {
|
|
||||||
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
|
|
||||||
mkdir tmp_cusparselt && pushd tmp_cusparselt
|
|
||||||
wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.5.2.1-archive.tar.xz
|
|
||||||
tar xf libcusparse_lt-linux-x86_64-0.5.2.1-archive.tar.xz
|
|
||||||
cp -a libcusparse_lt-linux-x86_64-0.5.2.1-archive/include/* /usr/local/cuda/include/
|
|
||||||
cp -a libcusparse_lt-linux-x86_64-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
|
|
||||||
popd
|
|
||||||
rm -rf tmp_cusparselt
|
|
||||||
}
|
|
||||||
|
|
||||||
function install_cusparselt_062 {
|
|
||||||
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
|
|
||||||
mkdir tmp_cusparselt && pushd tmp_cusparselt
|
|
||||||
wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz
|
|
||||||
tar xf libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz
|
|
||||||
cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/include/* /usr/local/cuda/include/
|
|
||||||
cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/
|
|
||||||
popd
|
|
||||||
rm -rf tmp_cusparselt
|
|
||||||
}
|
|
||||||
|
|
||||||
function install_118 {
|
|
||||||
echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
|
|
||||||
rm -rf /usr/local/cuda-11.8 /usr/local/cuda
|
|
||||||
# install CUDA 11.8.0 in the same container
|
|
||||||
wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
|
|
||||||
chmod +x cuda_11.8.0_520.61.05_linux.run
|
|
||||||
./cuda_11.8.0_520.61.05_linux.run --toolkit --silent
|
|
||||||
rm -f cuda_11.8.0_520.61.05_linux.run
|
|
||||||
rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.8 /usr/local/cuda
|
|
||||||
|
|
||||||
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
|
|
||||||
mkdir tmp_cudnn && cd tmp_cudnn
|
|
||||||
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
|
|
||||||
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
|
|
||||||
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/include/* /usr/local/cuda/include/
|
|
||||||
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/lib/* /usr/local/cuda/lib64/
|
|
||||||
cd ..
|
|
||||||
rm -rf tmp_cudnn
|
|
||||||
|
|
||||||
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
|
|
||||||
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
|
|
||||||
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
|
|
||||||
cd nccl && make -j src.build
|
|
||||||
cp -a build/include/* /usr/local/cuda/include/
|
|
||||||
cp -a build/lib/* /usr/local/cuda/lib64/
|
|
||||||
cd ..
|
|
||||||
rm -rf nccl
|
|
||||||
|
|
||||||
install_cusparselt_040
|
|
||||||
|
|
||||||
ldconfig
|
|
||||||
}
|
|
||||||
|
|
||||||
function install_121 {
|
|
||||||
echo "Installing CUDA 12.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
|
|
||||||
rm -rf /usr/local/cuda-12.1 /usr/local/cuda
|
|
||||||
# install CUDA 12.1.0 in the same container
|
|
||||||
wget -q https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run
|
|
||||||
chmod +x cuda_12.1.1_530.30.02_linux.run
|
|
||||||
./cuda_12.1.1_530.30.02_linux.run --toolkit --silent
|
|
||||||
rm -f cuda_12.1.1_530.30.02_linux.run
|
|
||||||
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.1 /usr/local/cuda
|
|
||||||
|
|
||||||
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
|
|
||||||
mkdir tmp_cudnn && cd tmp_cudnn
|
|
||||||
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
|
|
||||||
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
|
|
||||||
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
|
|
||||||
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
|
|
||||||
cd ..
|
|
||||||
rm -rf tmp_cudnn
|
|
||||||
|
|
||||||
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
|
|
||||||
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
|
|
||||||
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
|
|
||||||
cd nccl && make -j src.build
|
|
||||||
cp -a build/include/* /usr/local/cuda/include/
|
|
||||||
cp -a build/lib/* /usr/local/cuda/lib64/
|
|
||||||
cd ..
|
|
||||||
rm -rf nccl
|
|
||||||
|
|
||||||
install_cusparselt_052
|
|
||||||
|
|
||||||
ldconfig
|
|
||||||
}
|
|
||||||
|
|
||||||
function install_124 {
|
|
||||||
echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
|
|
||||||
rm -rf /usr/local/cuda-12.4 /usr/local/cuda
|
|
||||||
# install CUDA 12.4.1 in the same container
|
|
||||||
wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
|
|
||||||
chmod +x cuda_12.4.1_550.54.15_linux.run
|
|
||||||
./cuda_12.4.1_550.54.15_linux.run --toolkit --silent
|
|
||||||
rm -f cuda_12.4.1_550.54.15_linux.run
|
|
||||||
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
|
|
||||||
|
|
||||||
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
|
|
||||||
mkdir tmp_cudnn && cd tmp_cudnn
|
|
||||||
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
|
|
||||||
tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
|
|
||||||
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
|
|
||||||
cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
|
|
||||||
cd ..
|
|
||||||
rm -rf tmp_cudnn
|
|
||||||
|
|
||||||
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
|
|
||||||
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
|
|
||||||
git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
|
|
||||||
cd nccl && make -j src.build
|
|
||||||
cp -a build/include/* /usr/local/cuda/include/
|
|
||||||
cp -a build/lib/* /usr/local/cuda/lib64/
|
|
||||||
cd ..
|
|
||||||
rm -rf nccl
|
|
||||||
|
|
||||||
install_cusparselt_062
|
|
||||||
|
|
||||||
ldconfig
|
|
||||||
}
|
|
||||||
|
|
||||||
function prune_118 {
|
|
||||||
echo "Pruning CUDA 11.8 and cuDNN"
|
|
||||||
#####################################################################################
|
|
||||||
# CUDA 11.8 prune static libs
|
|
||||||
#####################################################################################
|
|
||||||
export NVPRUNE="/usr/local/cuda-11.8/bin/nvprune"
|
|
||||||
export CUDA_LIB_DIR="/usr/local/cuda-11.8/lib64"
|
|
||||||
|
|
||||||
export GENCODE="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
|
|
||||||
export GENCODE_CUDNN="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
|
|
||||||
|
|
||||||
if [[ -n "$OVERRIDE_GENCODE" ]]; then
|
|
||||||
export GENCODE=$OVERRIDE_GENCODE
|
|
||||||
fi
|
|
||||||
|
|
||||||
# all CUDA libs except CuDNN and CuBLAS (cudnn and cublas need arch 3.7 included)
|
|
||||||
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
|
|
||||||
| xargs -I {} bash -c \
|
|
||||||
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
|
|
||||||
|
|
||||||
# prune CuDNN and CuBLAS
|
|
||||||
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
|
|
||||||
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
|
|
||||||
|
|
||||||
#####################################################################################
|
|
||||||
# CUDA 11.8 prune visual tools
|
|
||||||
#####################################################################################
|
|
||||||
export CUDA_BASE="/usr/local/cuda-11.8/"
|
|
||||||
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2022.3.0 $CUDA_BASE/nsight-systems-2022.4.2/
|
|
||||||
}
|
|
||||||
|
|
||||||
function prune_121 {
|
|
||||||
echo "Pruning CUDA 12.1"
|
|
||||||
#####################################################################################
|
|
||||||
# CUDA 12.1 prune static libs
|
|
||||||
#####################################################################################
|
|
||||||
export NVPRUNE="/usr/local/cuda-12.1/bin/nvprune"
|
|
||||||
export CUDA_LIB_DIR="/usr/local/cuda-12.1/lib64"
|
|
||||||
|
|
||||||
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
|
|
||||||
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
|
|
||||||
|
|
||||||
if [[ -n "$OVERRIDE_GENCODE" ]]; then
|
|
||||||
export GENCODE=$OVERRIDE_GENCODE
|
|
||||||
fi
|
|
||||||
|
|
||||||
# all CUDA libs except CuDNN and CuBLAS
|
|
||||||
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
|
|
||||||
| xargs -I {} bash -c \
|
|
||||||
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
|
|
||||||
|
|
||||||
# prune CuDNN and CuBLAS
|
|
||||||
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
|
|
||||||
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
|
|
||||||
|
|
||||||
#####################################################################################
|
|
||||||
# CUDA 12.1 prune visual tools
|
|
||||||
#####################################################################################
|
|
||||||
export CUDA_BASE="/usr/local/cuda-12.1/"
|
|
||||||
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2023.1.0 $CUDA_BASE/nsight-systems-2023.1.2/
|
|
||||||
}
|
|
||||||
|
|
||||||
function prune_124 {
|
|
||||||
echo "Pruning CUDA 12.4"
|
|
||||||
#####################################################################################
|
|
||||||
# CUDA 12.4 prune static libs
|
|
||||||
#####################################################################################
|
|
||||||
export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
|
|
||||||
export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
|
|
||||||
|
|
||||||
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
|
|
||||||
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
|
|
||||||
|
|
||||||
if [[ -n "$OVERRIDE_GENCODE" ]]; then
|
|
||||||
export GENCODE=$OVERRIDE_GENCODE
|
|
||||||
fi
|
|
||||||
if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
|
|
||||||
export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
|
|
||||||
fi
|
|
||||||
|
|
||||||
# all CUDA libs except CuDNN and CuBLAS
|
|
||||||
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
|
|
||||||
| xargs -I {} bash -c \
|
|
||||||
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
|
|
||||||
|
|
||||||
# prune CuDNN and CuBLAS
|
|
||||||
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
|
|
||||||
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
|
|
||||||
|
|
||||||
#####################################################################################
|
|
||||||
# CUDA 12.1 prune visual tools
|
|
||||||
#####################################################################################
|
|
||||||
export CUDA_BASE="/usr/local/cuda-12.4/"
|
|
||||||
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
|
|
||||||
}
|
|
||||||
|
|
||||||
# idiomatic parameter and option handling in sh
|
|
||||||
while test $# -gt 0
|
|
||||||
do
|
|
||||||
case "$1" in
|
|
||||||
11.8) install_118; prune_118
|
|
||||||
;;
|
|
||||||
12.1) install_121; prune_121
|
|
||||||
;;
|
|
||||||
12.4) install_124; prune_124
|
|
||||||
;;
|
|
||||||
*) echo "bad argument $1"; exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
shift
|
|
||||||
done
|
|
||||||
@ -1,93 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
NCCL_VERSION=v2.21.5-1
|
|
||||||
|
|
||||||
function install_cusparselt_052 {
|
|
||||||
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
|
|
||||||
mkdir tmp_cusparselt && pushd tmp_cusparselt
|
|
||||||
wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
|
|
||||||
tar xf libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
|
|
||||||
cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/include/* /usr/local/cuda/include/
|
|
||||||
cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
|
|
||||||
popd
|
|
||||||
rm -rf tmp_cusparselt
|
|
||||||
}
|
|
||||||
|
|
||||||
function install_124 {
|
|
||||||
echo "Installing CUDA 12.4.1 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
|
|
||||||
rm -rf /usr/local/cuda-12.4 /usr/local/cuda
|
|
||||||
# install CUDA 12.4.1 in the same container
|
|
||||||
wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux_sbsa.run
|
|
||||||
chmod +x cuda_12.4.1_550.54.15_linux_sbsa.run
|
|
||||||
./cuda_12.4.1_550.54.15_linux_sbsa.run --toolkit --silent
|
|
||||||
rm -f cuda_12.4.1_550.54.15_linux_sbsa.run
|
|
||||||
rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
|
|
||||||
|
|
||||||
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
|
|
||||||
mkdir tmp_cudnn && cd tmp_cudnn
|
|
||||||
wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz -O cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
|
|
||||||
tar xf cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
|
|
||||||
cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/include/* /usr/local/cuda/include/
|
|
||||||
cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/lib/* /usr/local/cuda/lib64/
|
|
||||||
cd ..
|
|
||||||
rm -rf tmp_cudnn
|
|
||||||
|
|
||||||
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
|
|
||||||
# Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
|
|
||||||
git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
|
|
||||||
cd nccl && make -j src.build
|
|
||||||
cp -a build/include/* /usr/local/cuda/include/
|
|
||||||
cp -a build/lib/* /usr/local/cuda/lib64/
|
|
||||||
cd ..
|
|
||||||
rm -rf nccl
|
|
||||||
|
|
||||||
install_cusparselt_052
|
|
||||||
|
|
||||||
ldconfig
|
|
||||||
}
|
|
||||||
|
|
||||||
function prune_124 {
|
|
||||||
echo "Pruning CUDA 12.4"
|
|
||||||
#####################################################################################
|
|
||||||
# CUDA 12.4 prune static libs
|
|
||||||
#####################################################################################
|
|
||||||
export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
|
|
||||||
export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
|
|
||||||
|
|
||||||
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
|
|
||||||
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
|
|
||||||
|
|
||||||
if [[ -n "$OVERRIDE_GENCODE" ]]; then
|
|
||||||
export GENCODE=$OVERRIDE_GENCODE
|
|
||||||
fi
|
|
||||||
|
|
||||||
# all CUDA libs except CuDNN and CuBLAS
|
|
||||||
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
|
|
||||||
| xargs -I {} bash -c \
|
|
||||||
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
|
|
||||||
|
|
||||||
# prune CuDNN and CuBLAS
|
|
||||||
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
|
|
||||||
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
|
|
||||||
|
|
||||||
#####################################################################################
|
|
||||||
# CUDA 12.1 prune visual tools
|
|
||||||
#####################################################################################
|
|
||||||
export CUDA_BASE="/usr/local/cuda-12.4/"
|
|
||||||
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
|
|
||||||
}
|
|
||||||
|
|
||||||
# idiomatic parameter and option handling in sh
|
|
||||||
while test $# -gt 0
|
|
||||||
do
|
|
||||||
case "$1" in
|
|
||||||
12.4) install_124; prune_124
|
|
||||||
;;
|
|
||||||
*) echo "bad argument $1"; exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
shift
|
|
||||||
done
|
|
||||||
@ -1,18 +1,20 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
if [[ -n "${CUDNN_VERSION}" ]]; then
|
if [[ ${CUDNN_VERSION} == 8 ]]; then
|
||||||
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
|
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
|
||||||
mkdir tmp_cudnn
|
mkdir tmp_cudnn
|
||||||
pushd tmp_cudnn
|
pushd tmp_cudnn
|
||||||
if [[ ${CUDA_VERSION:0:2} == "12" ]]; then
|
if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
|
||||||
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive"
|
CUDNN_NAME="cudnn-linux-x86_64-8.9.2.26_cuda12-archive"
|
||||||
elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
|
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
|
||||||
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
|
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
|
||||||
|
CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive"
|
||||||
|
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz
|
||||||
else
|
else
|
||||||
print "Unsupported CUDA version ${CUDA_VERSION}"
|
print "Unsupported CUDA version ${CUDA_VERSION}"
|
||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
|
|
||||||
tar xf ${CUDNN_NAME}.tar.xz
|
tar xf ${CUDNN_NAME}.tar.xz
|
||||||
cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
|
cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
|
||||||
cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/
|
cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/
|
||||||
|
|||||||
@ -1,25 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# cudss license: https://docs.nvidia.com/cuda/cudss/license.html
|
|
||||||
mkdir tmp_cudss && cd tmp_cudss
|
|
||||||
|
|
||||||
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[1-4]$ ]]; then
|
|
||||||
arch_path='sbsa'
|
|
||||||
export TARGETARCH=${TARGETARCH:-$(uname -m)}
|
|
||||||
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
|
|
||||||
arch_path='x86_64'
|
|
||||||
fi
|
|
||||||
CUDSS_NAME="libcudss-linux-${arch_path}-0.3.0.9_cuda12-archive"
|
|
||||||
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudss/redist/libcudss/linux-${arch_path}/${CUDSS_NAME}.tar.xz
|
|
||||||
|
|
||||||
# only for cuda 12
|
|
||||||
tar xf ${CUDSS_NAME}.tar.xz
|
|
||||||
cp -a ${CUDSS_NAME}/include/* /usr/local/cuda/include/
|
|
||||||
cp -a ${CUDSS_NAME}/lib/* /usr/local/cuda/lib64/
|
|
||||||
fi
|
|
||||||
|
|
||||||
cd ..
|
|
||||||
rm -rf tmp_cudss
|
|
||||||
ldconfig
|
|
||||||
@ -5,22 +5,9 @@ set -ex
|
|||||||
# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
|
# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
|
||||||
mkdir tmp_cusparselt && cd tmp_cusparselt
|
mkdir tmp_cusparselt && cd tmp_cusparselt
|
||||||
|
|
||||||
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[2-6]$ ]]; then
|
if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
|
||||||
arch_path='sbsa'
|
CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.5.2.1-archive"
|
||||||
export TARGETARCH=${TARGETARCH:-$(uname -m)}
|
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
|
||||||
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
|
|
||||||
arch_path='x86_64'
|
|
||||||
fi
|
|
||||||
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.6.2.3-archive"
|
|
||||||
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
|
|
||||||
elif [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
|
|
||||||
arch_path='sbsa'
|
|
||||||
export TARGETARCH=${TARGETARCH:-$(uname -m)}
|
|
||||||
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
|
|
||||||
arch_path='x86_64'
|
|
||||||
fi
|
|
||||||
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.5.2.1-archive"
|
|
||||||
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
|
|
||||||
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
|
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
|
||||||
CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
|
CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
|
||||||
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
|
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
|
||||||
|
|||||||
@ -37,9 +37,6 @@ install_conda_dependencies() {
|
|||||||
|
|
||||||
install_pip_dependencies() {
|
install_pip_dependencies() {
|
||||||
pushd executorch/.ci/docker
|
pushd executorch/.ci/docker
|
||||||
# Install PyTorch CPU build beforehand to avoid installing the much bigger CUDA
|
|
||||||
# binaries later, ExecuTorch only needs CPU
|
|
||||||
pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
|
|
||||||
# Install all Python dependencies
|
# Install all Python dependencies
|
||||||
pip_install -r requirements-ci.txt
|
pip_install -r requirements-ci.txt
|
||||||
popd
|
popd
|
||||||
@ -47,14 +44,13 @@ install_pip_dependencies() {
|
|||||||
|
|
||||||
setup_executorch() {
|
setup_executorch() {
|
||||||
pushd executorch
|
pushd executorch
|
||||||
# Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
|
source .ci/scripts/utils.sh
|
||||||
as_jenkins bash .ci/scripts/setup-vulkan-linux-deps.sh
|
|
||||||
|
|
||||||
export PYTHON_EXECUTABLE=python
|
install_flatc_from_source
|
||||||
export EXECUTORCH_BUILD_PYBIND=ON
|
pip_install .
|
||||||
export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
|
|
||||||
|
|
||||||
as_jenkins .ci/scripts/setup-linux.sh cmake
|
# Make sure that all the newly generate files are owned by Jenkins
|
||||||
|
chown -R jenkins .
|
||||||
popd
|
popd
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,46 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
|
|
||||||
|
|
||||||
COMMIT=$(get_pinned_commit halide)
|
|
||||||
test -n "$COMMIT"
|
|
||||||
|
|
||||||
# activate conda to populate CONDA_PREFIX
|
|
||||||
test -n "$ANACONDA_PYTHON_VERSION"
|
|
||||||
eval "$(conda shell.bash hook)"
|
|
||||||
conda activate py_$ANACONDA_PYTHON_VERSION
|
|
||||||
|
|
||||||
if [ -n "${UBUNTU_VERSION}" ];then
|
|
||||||
apt update
|
|
||||||
apt-get install -y lld liblld-15-dev libpng-dev libjpeg-dev libgl-dev \
|
|
||||||
libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev
|
|
||||||
fi
|
|
||||||
|
|
||||||
conda_install numpy scipy imageio cmake ninja
|
|
||||||
|
|
||||||
git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git
|
|
||||||
cmake -DCMAKE_BUILD_TYPE=Release \
|
|
||||||
-DLLVM_ENABLE_PROJECTS="clang" \
|
|
||||||
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
|
|
||||||
-DLLVM_ENABLE_TERMINFO=OFF -DLLVM_ENABLE_ASSERTIONS=ON \
|
|
||||||
-DLLVM_ENABLE_EH=ON -DLLVM_ENABLE_RTTI=ON -DLLVM_BUILD_32_BITS=OFF \
|
|
||||||
-S llvm-project/llvm -B llvm-build -G Ninja
|
|
||||||
cmake --build llvm-build
|
|
||||||
cmake --install llvm-build --prefix llvm-install
|
|
||||||
export LLVM_ROOT=`pwd`/llvm-install
|
|
||||||
export LLVM_CONFIG=$LLVM_ROOT/bin/llvm-config
|
|
||||||
|
|
||||||
git clone https://github.com/halide/Halide.git
|
|
||||||
pushd Halide
|
|
||||||
git checkout ${COMMIT} && git submodule update --init --recursive
|
|
||||||
pip_install -r requirements.txt
|
|
||||||
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
|
|
||||||
cmake --build build
|
|
||||||
test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
|
|
||||||
cmake --install build --prefix ${CONDA_PREFIX}
|
|
||||||
chown -R jenkins ${CONDA_PREFIX}
|
|
||||||
popd
|
|
||||||
rm -rf Halide llvm-build llvm-project llvm-install
|
|
||||||
|
|
||||||
python -c "import halide" # check for errors
|
|
||||||
@ -1,23 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
LIBPNG_VERSION=1.6.37
|
|
||||||
|
|
||||||
mkdir -p libpng
|
|
||||||
pushd libpng
|
|
||||||
|
|
||||||
wget http://download.sourceforge.net/libpng/libpng-$LIBPNG_VERSION.tar.gz
|
|
||||||
tar -xvzf libpng-$LIBPNG_VERSION.tar.gz
|
|
||||||
|
|
||||||
pushd libpng-$LIBPNG_VERSION
|
|
||||||
|
|
||||||
./configure
|
|
||||||
make
|
|
||||||
make install
|
|
||||||
|
|
||||||
popd
|
|
||||||
|
|
||||||
popd
|
|
||||||
rm -rf libpng
|
|
||||||
@ -1,29 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
|
|
||||||
set -eou pipefail
|
|
||||||
|
|
||||||
MAGMA_VERSION="2.5.2"
|
|
||||||
|
|
||||||
function do_install() {
|
|
||||||
cuda_version=$1
|
|
||||||
cuda_version_nodot=${1/./}
|
|
||||||
|
|
||||||
MAGMA_VERSION="2.6.1"
|
|
||||||
magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
|
|
||||||
|
|
||||||
cuda_dir="/usr/local/cuda-${cuda_version}"
|
|
||||||
(
|
|
||||||
set -x
|
|
||||||
tmp_dir=$(mktemp -d)
|
|
||||||
pushd ${tmp_dir}
|
|
||||||
curl -OLs https://anaconda.org/pytorch/magma-cuda${cuda_version_nodot}/${MAGMA_VERSION}/download/linux-64/${magma_archive}
|
|
||||||
tar -xvf "${magma_archive}"
|
|
||||||
mkdir -p "${cuda_dir}/magma"
|
|
||||||
mv include "${cuda_dir}/magma/include"
|
|
||||||
mv lib "${cuda_dir}/magma/lib"
|
|
||||||
popd
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
do_install $1
|
|
||||||
@ -1,172 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
ROCM_VERSION=$1
|
|
||||||
|
|
||||||
if [[ -z $ROCM_VERSION ]]; then
|
|
||||||
echo "missing ROCM_VERSION"
|
|
||||||
exit 1;
|
|
||||||
fi
|
|
||||||
|
|
||||||
IS_UBUNTU=0
|
|
||||||
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
|
|
||||||
case "$ID" in
|
|
||||||
ubuntu)
|
|
||||||
IS_UBUNTU=1
|
|
||||||
;;
|
|
||||||
centos)
|
|
||||||
IS_UBUNTU=0
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "Unable to determine OS..."
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
# To make version comparison easier, create an integer representation.
|
|
||||||
save_IFS="$IFS"
|
|
||||||
IFS=. ROCM_VERSION_ARRAY=(${ROCM_VERSION})
|
|
||||||
IFS="$save_IFS"
|
|
||||||
if [[ ${#ROCM_VERSION_ARRAY[@]} == 2 ]]; then
|
|
||||||
ROCM_VERSION_MAJOR=${ROCM_VERSION_ARRAY[0]}
|
|
||||||
ROCM_VERSION_MINOR=${ROCM_VERSION_ARRAY[1]}
|
|
||||||
ROCM_VERSION_PATCH=0
|
|
||||||
elif [[ ${#ROCM_VERSION_ARRAY[@]} == 3 ]]; then
|
|
||||||
ROCM_VERSION_MAJOR=${ROCM_VERSION_ARRAY[0]}
|
|
||||||
ROCM_VERSION_MINOR=${ROCM_VERSION_ARRAY[1]}
|
|
||||||
ROCM_VERSION_PATCH=${ROCM_VERSION_ARRAY[2]}
|
|
||||||
else
|
|
||||||
echo "Unhandled ROCM_VERSION ${ROCM_VERSION}"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
ROCM_INT=$(($ROCM_VERSION_MAJOR * 10000 + $ROCM_VERSION_MINOR * 100 + $ROCM_VERSION_PATCH))
|
|
||||||
|
|
||||||
# Install custom MIOpen + COMgr for ROCm >= 4.0.1
|
|
||||||
if [[ $ROCM_INT -lt 40001 ]]; then
|
|
||||||
echo "ROCm version < 4.0.1; will not install custom MIOpen"
|
|
||||||
exit 0
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Function to retry functions that sometimes timeout or have flaky failures
|
|
||||||
retry () {
|
|
||||||
$* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
|
|
||||||
}
|
|
||||||
|
|
||||||
# Build custom MIOpen to use comgr for offline compilation.
|
|
||||||
|
|
||||||
## Need a sanitized ROCM_VERSION without patchlevel; patchlevel version 0 must be added to paths.
|
|
||||||
ROCM_DOTS=$(echo ${ROCM_VERSION} | tr -d -c '.' | wc -c)
|
|
||||||
if [[ ${ROCM_DOTS} == 1 ]]; then
|
|
||||||
ROCM_VERSION_NOPATCH="${ROCM_VERSION}"
|
|
||||||
ROCM_INSTALL_PATH="/opt/rocm-${ROCM_VERSION}.0"
|
|
||||||
else
|
|
||||||
ROCM_VERSION_NOPATCH="${ROCM_VERSION%.*}"
|
|
||||||
ROCM_INSTALL_PATH="/opt/rocm-${ROCM_VERSION}"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# MIOPEN_USE_HIP_KERNELS is a Workaround for COMgr issues
|
|
||||||
MIOPEN_CMAKE_COMMON_FLAGS="
|
|
||||||
-DMIOPEN_USE_COMGR=ON
|
|
||||||
-DMIOPEN_BUILD_DRIVER=OFF
|
|
||||||
"
|
|
||||||
# Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version
|
|
||||||
if [[ $ROCM_INT -ge 60300 ]]; then
|
|
||||||
echo "ROCm 6.3+ MIOpen does not need any patches, do not build from source"
|
|
||||||
exit 0
|
|
||||||
elif [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60300 ]]; then
|
|
||||||
MIOPEN_BRANCH="release/rocm-rel-6.2-staging"
|
|
||||||
elif [[ $ROCM_INT -ge 60100 ]] && [[ $ROCM_INT -lt 60200 ]]; then
|
|
||||||
echo "ROCm 6.1 MIOpen does not need any patches, do not build from source"
|
|
||||||
exit 0
|
|
||||||
elif [[ $ROCM_INT -ge 60000 ]] && [[ $ROCM_INT -lt 60100 ]]; then
|
|
||||||
echo "ROCm 6.0 MIOpen does not need any patches, do not build from source"
|
|
||||||
exit 0
|
|
||||||
elif [[ $ROCM_INT -ge 50700 ]] && [[ $ROCM_INT -lt 60000 ]]; then
|
|
||||||
echo "ROCm 5.7 MIOpen does not need any patches, do not build from source"
|
|
||||||
exit 0
|
|
||||||
elif [[ $ROCM_INT -ge 50600 ]] && [[ $ROCM_INT -lt 50700 ]]; then
|
|
||||||
MIOPEN_BRANCH="release/rocm-rel-5.6-staging"
|
|
||||||
elif [[ $ROCM_INT -ge 50500 ]] && [[ $ROCM_INT -lt 50600 ]]; then
|
|
||||||
MIOPEN_BRANCH="release/rocm-rel-5.5-gfx11"
|
|
||||||
elif [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
|
|
||||||
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
|
|
||||||
MIOPEN_BRANCH="release/rocm-rel-5.4-staging"
|
|
||||||
elif [[ $ROCM_INT -ge 50300 ]] && [[ $ROCM_INT -lt 50400 ]]; then
|
|
||||||
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
|
|
||||||
MIOPEN_BRANCH="release/rocm-rel-5.3-staging"
|
|
||||||
elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50300 ]]; then
|
|
||||||
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
|
|
||||||
MIOPEN_BRANCH="release/rocm-rel-5.2-staging"
|
|
||||||
elif [[ $ROCM_INT -ge 50100 ]] && [[ $ROCM_INT -lt 50200 ]]; then
|
|
||||||
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
|
|
||||||
MIOPEN_BRANCH="release/rocm-rel-5.1-staging"
|
|
||||||
elif [[ $ROCM_INT -ge 50000 ]] && [[ $ROCM_INT -lt 50100 ]]; then
|
|
||||||
MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
|
|
||||||
MIOPEN_BRANCH="release/rocm-rel-5.0-staging"
|
|
||||||
else
|
|
||||||
echo "Unhandled ROCM_VERSION ${ROCM_VERSION}"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
|
|
||||||
if [[ ${IS_UBUNTU} == 1 ]]; then
|
|
||||||
apt-get remove -y miopen-hip
|
|
||||||
else
|
|
||||||
yum remove -y miopen-hip
|
|
||||||
fi
|
|
||||||
|
|
||||||
git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH}
|
|
||||||
pushd MIOpen
|
|
||||||
# remove .git to save disk space since CI runner was running out
|
|
||||||
rm -rf .git
|
|
||||||
# Don't build CK to save docker build time
|
|
||||||
if [[ $ROCM_INT -ge 60200 ]]; then
|
|
||||||
sed -i '/composable_kernel/d' requirements.txt
|
|
||||||
fi
|
|
||||||
# Don't build MLIR to save docker build time
|
|
||||||
# since we are disabling MLIR backend for MIOpen anyway
|
|
||||||
if [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
|
|
||||||
sed -i '/rocMLIR/d' requirements.txt
|
|
||||||
elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50400 ]]; then
|
|
||||||
sed -i '/llvm-project-mlir/d' requirements.txt
|
|
||||||
fi
|
|
||||||
## MIOpen minimum requirements
|
|
||||||
cmake -P install_deps.cmake --minimum
|
|
||||||
|
|
||||||
# clean up since CI runner was running out of disk space
|
|
||||||
rm -rf /tmp/*
|
|
||||||
if [[ ${IS_UBUNTU} == 1 ]]; then
|
|
||||||
apt-get autoclean && apt-get clean
|
|
||||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
|
||||||
else
|
|
||||||
yum clean all
|
|
||||||
rm -rf /var/cache/yum
|
|
||||||
rm -rf /var/lib/yum/yumdb
|
|
||||||
rm -rf /var/lib/yum/history
|
|
||||||
fi
|
|
||||||
|
|
||||||
## Build MIOpen
|
|
||||||
mkdir -p build
|
|
||||||
cd build
|
|
||||||
PKG_CONFIG_PATH=/usr/local/lib/pkgconfig CXX=${ROCM_INSTALL_PATH}/llvm/bin/clang++ cmake .. \
|
|
||||||
${MIOPEN_CMAKE_COMMON_FLAGS} \
|
|
||||||
${MIOPEN_CMAKE_DB_FLAGS} \
|
|
||||||
-DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}/hip;${ROCM_INSTALL_PATH}"
|
|
||||||
make MIOpen -j $(nproc)
|
|
||||||
|
|
||||||
# Build MIOpen package
|
|
||||||
make -j $(nproc) package
|
|
||||||
|
|
||||||
# clean up since CI runner was running out of disk space
|
|
||||||
rm -rf /usr/local/cget
|
|
||||||
|
|
||||||
if [[ ${IS_UBUNTU} == 1 ]]; then
|
|
||||||
sudo dpkg -i miopen-hip*.deb
|
|
||||||
else
|
|
||||||
yum install -y miopen-*.rpm
|
|
||||||
fi
|
|
||||||
|
|
||||||
popd
|
|
||||||
rm -rf MIOpen
|
|
||||||
@ -1,16 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# MKL
|
|
||||||
MKL_VERSION=2024.2.0
|
|
||||||
|
|
||||||
MKLROOT=/opt/intel
|
|
||||||
mkdir -p ${MKLROOT}
|
|
||||||
pushd /tmp
|
|
||||||
|
|
||||||
python3 -mpip install wheel
|
|
||||||
python3 -mpip download -d . mkl-static==${MKL_VERSION}
|
|
||||||
python3 -m wheel unpack mkl_static-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
|
|
||||||
python3 -m wheel unpack mkl_include-${MKL_VERSION}-py2.py3-none-manylinux1_x86_64.whl
|
|
||||||
mv mkl_static-${MKL_VERSION}/mkl_static-${MKL_VERSION}.data/data/lib ${MKLROOT}
|
|
||||||
mv mkl_include-${MKL_VERSION}/mkl_include-${MKL_VERSION}.data/data/include ${MKLROOT}
|
|
||||||
@ -1,13 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
mkdir -p /usr/local/mnist/
|
|
||||||
|
|
||||||
cd /usr/local/mnist
|
|
||||||
|
|
||||||
for img in train-images-idx3-ubyte.gz train-labels-idx1-ubyte.gz t10k-images-idx3-ubyte.gz t10k-labels-idx1-ubyte.gz; do
|
|
||||||
wget -q https://ossci-datasets.s3.amazonaws.com/mnist/$img
|
|
||||||
gzip -d $img
|
|
||||||
done
|
|
||||||
@ -1,20 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
function install_nvpl {
|
|
||||||
|
|
||||||
mkdir -p /opt/nvpl/lib /opt/nvpl/include
|
|
||||||
|
|
||||||
wget https://developer.download.nvidia.com/compute/nvpl/redist/nvpl_blas/linux-sbsa/nvpl_blas-linux-sbsa-0.3.0-archive.tar.xz
|
|
||||||
tar xf nvpl_blas-linux-sbsa-0.3.0-archive.tar.xz
|
|
||||||
cp -r nvpl_blas-linux-sbsa-0.3.0-archive/lib/* /opt/nvpl/lib/
|
|
||||||
cp -r nvpl_blas-linux-sbsa-0.3.0-archive/include/* /opt/nvpl/include/
|
|
||||||
|
|
||||||
wget https://developer.download.nvidia.com/compute/nvpl/redist/nvpl_lapack/linux-sbsa/nvpl_lapack-linux-sbsa-0.2.3.1-archive.tar.xz
|
|
||||||
tar xf nvpl_lapack-linux-sbsa-0.2.3.1-archive.tar.xz
|
|
||||||
cp -r nvpl_lapack-linux-sbsa-0.2.3.1-archive/lib/* /opt/nvpl/lib/
|
|
||||||
cp -r nvpl_lapack-linux-sbsa-0.2.3.1-archive/include/* /opt/nvpl/include/
|
|
||||||
}
|
|
||||||
|
|
||||||
install_nvpl
|
|
||||||
@ -15,7 +15,7 @@ pip_install \
|
|||||||
flatbuffers==2.0 \
|
flatbuffers==2.0 \
|
||||||
mock==5.0.1 \
|
mock==5.0.1 \
|
||||||
ninja==1.10.2 \
|
ninja==1.10.2 \
|
||||||
networkx==2.5 \
|
networkx==2.0 \
|
||||||
numpy==1.24.2
|
numpy==1.24.2
|
||||||
|
|
||||||
# ONNXRuntime should be installed before installing
|
# ONNXRuntime should be installed before installing
|
||||||
@ -30,11 +30,10 @@ pip_install \
|
|||||||
|
|
||||||
pip_install coloredlogs packaging
|
pip_install coloredlogs packaging
|
||||||
|
|
||||||
pip_install onnxruntime==1.18.1
|
pip_install onnxruntime==1.17.0
|
||||||
pip_install onnx==1.16.2
|
pip_install onnx==1.15.0
|
||||||
pip_install onnxscript==0.1.0.dev20240831 --no-deps
|
# pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps
|
||||||
# required by onnxscript
|
pip_install onnxscript==0.1.0.dev20240315 --no-deps
|
||||||
pip_install ml_dtypes
|
|
||||||
|
|
||||||
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
|
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
|
||||||
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
|
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
|
||||||
|
|||||||
@ -1,22 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
cd /
|
|
||||||
git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.25 --depth 1 --shallow-submodules
|
|
||||||
|
|
||||||
|
|
||||||
OPENBLAS_BUILD_FLAGS="
|
|
||||||
NUM_THREADS=128
|
|
||||||
USE_OPENMP=1
|
|
||||||
NO_SHARED=0
|
|
||||||
DYNAMIC_ARCH=1
|
|
||||||
TARGET=ARMV8
|
|
||||||
CFLAGS=-O3
|
|
||||||
"
|
|
||||||
|
|
||||||
OPENBLAS_CHECKOUT_DIR="OpenBLAS"
|
|
||||||
|
|
||||||
make -j8 ${OPENBLAS_BUILD_FLAGS} -C ${OPENBLAS_CHECKOUT_DIR}
|
|
||||||
make -j8 ${OPENBLAS_BUILD_FLAGS} install -C ${OPENBLAS_CHECKOUT_DIR}
|
|
||||||
@ -1,16 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# Pin the version to latest release 0.17.2, building newer commit starts
|
|
||||||
# to fail on the current image
|
|
||||||
git clone -b 0.17.2 --single-branch https://github.com/NixOS/patchelf
|
|
||||||
cd patchelf
|
|
||||||
sed -i 's/serial/parallel/g' configure.ac
|
|
||||||
./bootstrap.sh
|
|
||||||
./configure
|
|
||||||
make
|
|
||||||
make install
|
|
||||||
cd ..
|
|
||||||
rm -rf patchelf
|
|
||||||
@ -39,8 +39,7 @@ install_ubuntu() {
|
|||||||
rocm-libs \
|
rocm-libs \
|
||||||
rccl \
|
rccl \
|
||||||
rocprofiler-dev \
|
rocprofiler-dev \
|
||||||
roctracer-dev \
|
roctracer-dev
|
||||||
amd-smi-lib
|
|
||||||
|
|
||||||
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.1) ]]; then
|
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.1) ]]; then
|
||||||
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
|
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
|
||||||
@ -107,8 +106,7 @@ install_centos() {
|
|||||||
rocm-libs \
|
rocm-libs \
|
||||||
rccl \
|
rccl \
|
||||||
rocprofiler-dev \
|
rocprofiler-dev \
|
||||||
roctracer-dev \
|
roctracer-dev
|
||||||
amd-smi-lib
|
|
||||||
|
|
||||||
# precompiled miopen kernels; search for all unversioned packages
|
# precompiled miopen kernels; search for all unversioned packages
|
||||||
# if search fails it will abort this script; use true to avoid case where search fails
|
# if search fails it will abort this script; use true to avoid case where search fails
|
||||||
|
|||||||
@ -1,150 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
|
|
||||||
###########################
|
|
||||||
### prereqs
|
|
||||||
###########################
|
|
||||||
# Install Python packages depending on the base OS
|
|
||||||
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
|
|
||||||
case "$ID" in
|
|
||||||
ubuntu)
|
|
||||||
apt-get update -y
|
|
||||||
apt-get install -y libpciaccess-dev pkg-config
|
|
||||||
apt-get clean
|
|
||||||
;;
|
|
||||||
centos)
|
|
||||||
yum install -y libpciaccess-devel pkgconfig
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "Unable to determine OS..."
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
python3 -m pip install meson ninja
|
|
||||||
|
|
||||||
###########################
|
|
||||||
### clone repo
|
|
||||||
###########################
|
|
||||||
GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
|
|
||||||
pushd drm
|
|
||||||
|
|
||||||
###########################
|
|
||||||
### patch
|
|
||||||
###########################
|
|
||||||
patch -p1 <<'EOF'
|
|
||||||
diff --git a/amdgpu/amdgpu_asic_id.c b/amdgpu/amdgpu_asic_id.c
|
|
||||||
index a5007ffc..13fa07fc 100644
|
|
||||||
--- a/amdgpu/amdgpu_asic_id.c
|
|
||||||
+++ b/amdgpu/amdgpu_asic_id.c
|
|
||||||
@@ -22,6 +22,13 @@
|
|
||||||
*
|
|
||||||
*/
|
|
||||||
|
|
||||||
+#define _XOPEN_SOURCE 700
|
|
||||||
+#define _LARGEFILE64_SOURCE
|
|
||||||
+#define _FILE_OFFSET_BITS 64
|
|
||||||
+#include <ftw.h>
|
|
||||||
+#include <link.h>
|
|
||||||
+#include <limits.h>
|
|
||||||
+
|
|
||||||
#include <ctype.h>
|
|
||||||
#include <stdio.h>
|
|
||||||
#include <stdlib.h>
|
|
||||||
@@ -34,6 +41,19 @@
|
|
||||||
#include "amdgpu_drm.h"
|
|
||||||
#include "amdgpu_internal.h"
|
|
||||||
|
|
||||||
+static char *amdgpuids_path = NULL;
|
|
||||||
+static const char* amdgpuids_path_msg = NULL;
|
|
||||||
+
|
|
||||||
+static int check_for_location_of_amdgpuids(const char *filepath, const struct stat *info, const int typeflag, struct FTW *pathinfo)
|
|
||||||
+{
|
|
||||||
+ if (typeflag == FTW_F && strstr(filepath, "amdgpu.ids")) {
|
|
||||||
+ amdgpuids_path = strdup(filepath);
|
|
||||||
+ return 1;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ return 0;
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
static int parse_one_line(struct amdgpu_device *dev, const char *line)
|
|
||||||
{
|
|
||||||
char *buf, *saveptr;
|
|
||||||
@@ -113,10 +133,46 @@ void amdgpu_parse_asic_ids(struct amdgpu_device *dev)
|
|
||||||
int line_num = 1;
|
|
||||||
int r = 0;
|
|
||||||
|
|
||||||
+ // attempt to find typical location for amdgpu.ids file
|
|
||||||
fp = fopen(AMDGPU_ASIC_ID_TABLE, "r");
|
|
||||||
+
|
|
||||||
+ // if it doesn't exist, search
|
|
||||||
+ if (!fp) {
|
|
||||||
+
|
|
||||||
+ char self_path[ PATH_MAX ];
|
|
||||||
+ ssize_t count;
|
|
||||||
+ ssize_t i;
|
|
||||||
+
|
|
||||||
+ count = readlink( "/proc/self/exe", self_path, PATH_MAX );
|
|
||||||
+ if (count > 0) {
|
|
||||||
+ self_path[count] = '\0';
|
|
||||||
+
|
|
||||||
+ // remove '/bin/python' from self_path
|
|
||||||
+ for (i=count; i>0; --i) {
|
|
||||||
+ if (self_path[i] == '/') break;
|
|
||||||
+ self_path[i] = '\0';
|
|
||||||
+ }
|
|
||||||
+ self_path[i] = '\0';
|
|
||||||
+ for (; i>0; --i) {
|
|
||||||
+ if (self_path[i] == '/') break;
|
|
||||||
+ self_path[i] = '\0';
|
|
||||||
+ }
|
|
||||||
+ self_path[i] = '\0';
|
|
||||||
+
|
|
||||||
+ if (1 == nftw(self_path, check_for_location_of_amdgpuids, 5, FTW_PHYS)) {
|
|
||||||
+ fp = fopen(amdgpuids_path, "r");
|
|
||||||
+ amdgpuids_path_msg = amdgpuids_path;
|
|
||||||
+ }
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ }
|
|
||||||
+ else {
|
|
||||||
+ amdgpuids_path_msg = AMDGPU_ASIC_ID_TABLE;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
+ // both hard-coded location and search have failed
|
|
||||||
if (!fp) {
|
|
||||||
- fprintf(stderr, "%s: %s\n", AMDGPU_ASIC_ID_TABLE,
|
|
||||||
- strerror(errno));
|
|
||||||
+ fprintf(stderr, "amdgpu.ids: No such file or directory\n");
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -132,7 +188,7 @@ void amdgpu_parse_asic_ids(struct amdgpu_device *dev)
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
- drmMsg("%s version: %s\n", AMDGPU_ASIC_ID_TABLE, line);
|
|
||||||
+ drmMsg("%s version: %s\n", amdgpuids_path_msg, line);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
@@ -150,7 +206,7 @@ void amdgpu_parse_asic_ids(struct amdgpu_device *dev)
|
|
||||||
|
|
||||||
if (r == -EINVAL) {
|
|
||||||
fprintf(stderr, "Invalid format: %s: line %d: %s\n",
|
|
||||||
- AMDGPU_ASIC_ID_TABLE, line_num, line);
|
|
||||||
+ amdgpuids_path_msg, line_num, line);
|
|
||||||
} else if (r && r != -EAGAIN) {
|
|
||||||
fprintf(stderr, "%s: Cannot parse ASIC IDs: %s\n",
|
|
||||||
__func__, strerror(-r));
|
|
||||||
EOF
|
|
||||||
|
|
||||||
###########################
|
|
||||||
### build
|
|
||||||
###########################
|
|
||||||
meson builddir --prefix=/opt/amdgpu
|
|
||||||
pushd builddir
|
|
||||||
ninja install
|
|
||||||
|
|
||||||
popd
|
|
||||||
popd
|
|
||||||
@ -1,11 +1,7 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# Script used in CI and CD pipeline
|
|
||||||
|
|
||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
|
|
||||||
MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}
|
|
||||||
|
|
||||||
# "install" hipMAGMA into /opt/rocm/magma by copying after build
|
# "install" hipMAGMA into /opt/rocm/magma by copying after build
|
||||||
git clone https://bitbucket.org/icl/magma.git
|
git clone https://bitbucket.org/icl/magma.git
|
||||||
pushd magma
|
pushd magma
|
||||||
@ -15,10 +11,7 @@ git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6
|
|||||||
|
|
||||||
cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
|
cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
|
||||||
echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
|
echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
|
||||||
if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then
|
echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib' >> make.inc
|
||||||
echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc
|
|
||||||
fi
|
|
||||||
echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc
|
|
||||||
echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
|
echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
|
||||||
export PATH="${PATH}:/opt/rocm/bin"
|
export PATH="${PATH}:/opt/rocm/bin"
|
||||||
if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
|
if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
|
||||||
@ -32,7 +25,7 @@ done
|
|||||||
# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
|
# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
|
||||||
sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
|
sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
|
||||||
make -f make.gen.hipMAGMA -j $(nproc)
|
make -f make.gen.hipMAGMA -j $(nproc)
|
||||||
LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}"
|
LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT=/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION
|
||||||
make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}"
|
make testing/testing_dgemm -j $(nproc) MKLROOT=/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION
|
||||||
popd
|
popd
|
||||||
mv magma /opt/rocm
|
mv magma /opt/rocm
|
||||||
|
|||||||
@ -12,7 +12,10 @@ conda_reinstall() {
|
|||||||
as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $*
|
as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $*
|
||||||
}
|
}
|
||||||
|
|
||||||
if [ -n "${XPU_VERSION}" ]; then
|
if [ -n "${ROCM_VERSION}" ]; then
|
||||||
|
TRITON_REPO="https://github.com/openai/triton"
|
||||||
|
TRITON_TEXT_FILE="triton-rocm"
|
||||||
|
elif [ -n "${BASEKIT_VERSION}" ]; then
|
||||||
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
|
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
|
||||||
TRITON_TEXT_FILE="triton-xpu"
|
TRITON_TEXT_FILE="triton-xpu"
|
||||||
else
|
else
|
||||||
@ -38,33 +41,19 @@ if [ -z "${MAX_JOBS}" ]; then
|
|||||||
export MAX_JOBS=$(nproc)
|
export MAX_JOBS=$(nproc)
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Git checkout triton
|
|
||||||
mkdir /var/lib/jenkins/triton
|
|
||||||
chown -R jenkins /var/lib/jenkins/triton
|
|
||||||
chgrp -R jenkins /var/lib/jenkins/triton
|
|
||||||
pushd /var/lib/jenkins/
|
|
||||||
|
|
||||||
as_jenkins git clone ${TRITON_REPO} triton
|
|
||||||
cd triton
|
|
||||||
as_jenkins git checkout ${TRITON_PINNED_COMMIT}
|
|
||||||
cd python
|
|
||||||
|
|
||||||
# TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
|
|
||||||
as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py
|
|
||||||
|
|
||||||
if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" == "7" ]]; then
|
if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" == "7" ]]; then
|
||||||
# Triton needs at least gcc-9 to build
|
# Triton needs at least gcc-9 to build
|
||||||
apt-get install -y g++-9
|
apt-get install -y g++-9
|
||||||
|
|
||||||
CXX=g++-9 pip_install -e .
|
CXX=g++-9 pip_install "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python"
|
||||||
elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
|
elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
|
||||||
# Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
|
# Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
|
||||||
add-apt-repository -y ppa:ubuntu-toolchain-r/test
|
add-apt-repository -y ppa:ubuntu-toolchain-r/test
|
||||||
apt-get install -y g++-9
|
apt-get install -y g++-9
|
||||||
|
|
||||||
CXX=g++-9 pip_install -e .
|
CXX=g++-9 pip_install "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python"
|
||||||
else
|
else
|
||||||
pip_install -e .
|
pip_install "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -n "${CONDA_CMAKE}" ]; then
|
if [ -n "${CONDA_CMAKE}" ]; then
|
||||||
|
|||||||
@ -1,33 +1,30 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
set -xe
|
set -xe
|
||||||
# Script used in CI and CD pipeline
|
|
||||||
|
|
||||||
# Intel® software for general purpose GPU capabilities.
|
# Intel® software for general purpose GPU capabilities.
|
||||||
# Refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
|
# Refer to https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html
|
||||||
|
|
||||||
|
# Intel® oneAPI Base Toolkit (version 2024.0.0) has been updated to include functional and security updates.
|
||||||
|
# Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html
|
||||||
|
|
||||||
# Users should update to the latest version as it becomes available
|
# Users should update to the latest version as it becomes available
|
||||||
|
|
||||||
function install_ubuntu() {
|
function install_ubuntu() {
|
||||||
. /etc/os-release
|
|
||||||
if [[ ! " jammy " =~ " ${VERSION_CODENAME} " ]]; then
|
|
||||||
echo "Ubuntu version ${VERSION_CODENAME} not supported"
|
|
||||||
exit
|
|
||||||
fi
|
|
||||||
|
|
||||||
apt-get update -y
|
apt-get update -y
|
||||||
apt-get install -y gpg-agent wget
|
apt-get install -y gpg-agent wget
|
||||||
# To add the online network package repository for the GPU Driver
|
|
||||||
|
# Set up the repository. To do this, download the key to the system keyring
|
||||||
wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
|
wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
|
||||||
| gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg
|
| gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
|
||||||
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \
|
|
||||||
https://repositories.intel.com/gpu/ubuntu ${VERSION_CODENAME}${XPU_DRIVER_VERSION} unified" \
|
|
||||||
| tee /etc/apt/sources.list.d/intel-gpu-${VERSION_CODENAME}.list
|
|
||||||
# To add the online network network package repository for the Intel Support Packages
|
|
||||||
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
|
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
|
||||||
| gpg --dearmor > /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
|
| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
|
||||||
echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
|
|
||||||
https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
|
# Add the signed entry to APT sources and configure the APT client to use the Intel repository
|
||||||
| tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list
|
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
|
||||||
|
| tee /etc/apt/sources.list.d/intel-gpu-jammy.list
|
||||||
|
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
|
||||||
|
| tee /etc/apt/sources.list.d/oneAPI.list
|
||||||
|
|
||||||
# Update the packages list and repository index
|
# Update the packages list and repository index
|
||||||
apt-get update
|
apt-get update
|
||||||
@ -43,11 +40,11 @@ function install_ubuntu() {
|
|||||||
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
|
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
|
||||||
# Development Packages
|
# Development Packages
|
||||||
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
|
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
|
||||||
# Install Intel Support Packages
|
# Install Intel® oneAPI Base Toolkit
|
||||||
if [ -n "$XPU_VERSION" ]; then
|
if [ -n "$BASEKIT_VERSION" ]; then
|
||||||
apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION} intel-pti-dev
|
apt-get install intel-basekit=$BASEKIT_VERSION -y
|
||||||
else
|
else
|
||||||
apt-get install -y intel-for-pytorch-gpu-dev intel-pti-dev
|
apt-get install intel-basekit -y
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Cleanup
|
# Cleanup
|
||||||
@ -55,49 +52,44 @@ function install_ubuntu() {
|
|||||||
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
|
||||||
}
|
}
|
||||||
|
|
||||||
function install_rhel() {
|
function install_centos() {
|
||||||
. /etc/os-release
|
|
||||||
if [[ "${ID}" == "rhel" ]]; then
|
|
||||||
if [[ ! " 8.6 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
|
|
||||||
echo "RHEL version ${VERSION_ID} not supported"
|
|
||||||
exit
|
|
||||||
fi
|
|
||||||
elif [[ "${ID}" == "almalinux" ]]; then
|
|
||||||
# Workaround for almalinux8 which used by quay.io/pypa/manylinux_2_28_x86_64
|
|
||||||
VERSION_ID="8.6"
|
|
||||||
fi
|
|
||||||
|
|
||||||
dnf install -y 'dnf-command(config-manager)'
|
dnf install -y 'dnf-command(config-manager)'
|
||||||
# To add the online network package repository for the GPU Driver
|
|
||||||
dnf config-manager --add-repo \
|
dnf config-manager --add-repo \
|
||||||
https://repositories.intel.com/gpu/rhel/${VERSION_ID}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_ID}.repo
|
https://repositories.intel.com/gpu/rhel/8.6/production/2328/unified/intel-gpu-8.6.repo
|
||||||
# To add the online network network package repository for the Intel Support Packages
|
# To add the EPEL repository needed for DKMS
|
||||||
tee > /etc/yum.repos.d/intel-for-pytorch-gpu-dev.repo << EOF
|
dnf -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
|
||||||
[intel-for-pytorch-gpu-dev]
|
# https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
|
||||||
name=Intel for Pytorch GPU dev repository
|
|
||||||
baseurl=https://yum.repos.intel.com/intel-for-pytorch-gpu-dev
|
# Create the YUM repository file in the /temp directory as a normal user
|
||||||
|
tee > /tmp/oneAPI.repo << EOF
|
||||||
|
[oneAPI]
|
||||||
|
name=Intel® oneAPI repository
|
||||||
|
baseurl=https://yum.repos.intel.com/oneapi
|
||||||
enabled=1
|
enabled=1
|
||||||
gpgcheck=1
|
gpgcheck=1
|
||||||
repo_gpgcheck=1
|
repo_gpgcheck=1
|
||||||
gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
# Move the newly created oneAPI.repo file to the YUM configuration directory /etc/yum.repos.d
|
||||||
|
mv /tmp/oneAPI.repo /etc/yum.repos.d
|
||||||
|
|
||||||
# The xpu-smi packages
|
# The xpu-smi packages
|
||||||
dnf install -y xpu-smi
|
dnf install -y flex bison xpu-smi
|
||||||
# Compute and Media Runtimes
|
# Compute and Media Runtimes
|
||||||
dnf install --skip-broken -y \
|
dnf install -y \
|
||||||
intel-opencl intel-media intel-mediasdk libmfxgen1 libvpl2\
|
intel-opencl intel-media intel-mediasdk libmfxgen1 libvpl2\
|
||||||
level-zero intel-level-zero-gpu mesa-dri-drivers mesa-vulkan-drivers \
|
level-zero intel-level-zero-gpu mesa-dri-drivers mesa-vulkan-drivers \
|
||||||
mesa-vdpau-drivers libdrm mesa-libEGL mesa-libgbm mesa-libGL \
|
mesa-vdpau-drivers libdrm mesa-libEGL mesa-libgbm mesa-libGL \
|
||||||
mesa-libxatracker libvpl-tools intel-metrics-discovery \
|
mesa-libxatracker libvpl-tools intel-metrics-discovery \
|
||||||
intel-metrics-library intel-igc-core intel-igc-cm \
|
intel-metrics-library intel-igc-core intel-igc-cm \
|
||||||
libva libva-utils intel-gmmlib libmetee intel-gsc intel-ocloc
|
libva libva-utils intel-gmmlib libmetee intel-gsc intel-ocloc hwinfo clinfo
|
||||||
# Development packages
|
# Development packages
|
||||||
dnf install -y --refresh \
|
dnf install -y --refresh \
|
||||||
intel-igc-opencl-devel level-zero-devel intel-gsc-devel libmetee-devel \
|
intel-igc-opencl-devel level-zero-devel intel-gsc-devel libmetee-devel \
|
||||||
level-zero-devel
|
level-zero-devel
|
||||||
# Install Intel Support Packages
|
# Install Intel® oneAPI Base Toolkit
|
||||||
yum install -y intel-for-pytorch-gpu-dev intel-pti-dev
|
dnf install intel-basekit -y
|
||||||
|
|
||||||
# Cleanup
|
# Cleanup
|
||||||
dnf clean all
|
dnf clean all
|
||||||
@ -106,41 +98,6 @@ EOF
|
|||||||
rm -rf /var/lib/yum/history
|
rm -rf /var/lib/yum/history
|
||||||
}
|
}
|
||||||
|
|
||||||
function install_sles() {
|
|
||||||
. /etc/os-release
|
|
||||||
VERSION_SP=${VERSION_ID//./sp}
|
|
||||||
if [[ ! " 15sp4 15sp5 " =~ " ${VERSION_SP} " ]]; then
|
|
||||||
echo "SLES version ${VERSION_ID} not supported"
|
|
||||||
exit
|
|
||||||
fi
|
|
||||||
|
|
||||||
# To add the online network package repository for the GPU Driver
|
|
||||||
zypper addrepo -f -r \
|
|
||||||
https://repositories.intel.com/gpu/sles/${VERSION_SP}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_SP}.repo
|
|
||||||
rpm --import https://repositories.intel.com/gpu/intel-graphics.key
|
|
||||||
# To add the online network network package repository for the Intel Support Packages
|
|
||||||
zypper addrepo https://yum.repos.intel.com/intel-for-pytorch-gpu-dev intel-for-pytorch-gpu-dev
|
|
||||||
rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
|
|
||||||
|
|
||||||
# The xpu-smi packages
|
|
||||||
zypper install -y lsb-release flex bison xpu-smi
|
|
||||||
# Compute and Media Runtimes
|
|
||||||
zypper install -y intel-level-zero-gpu level-zero intel-gsc intel-opencl intel-ocloc \
|
|
||||||
intel-media-driver libigfxcmrt7 libvpl2 libvpl-tools libmfxgen1 libmfx1
|
|
||||||
# Development packages
|
|
||||||
zypper install -y libigdfcl-devel intel-igc-cm libigfxcmrt-devel level-zero-devel
|
|
||||||
|
|
||||||
# Install Intel Support Packages
|
|
||||||
zypper install -y intel-for-pytorch-gpu-dev intel-pti-dev
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
# Default use GPU driver LTS releases
|
|
||||||
XPU_DRIVER_VERSION="/lts/2350"
|
|
||||||
if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
|
|
||||||
# Use GPU driver rolling releases
|
|
||||||
XPU_DRIVER_VERSION=""
|
|
||||||
fi
|
|
||||||
|
|
||||||
# The installation depends on the base OS
|
# The installation depends on the base OS
|
||||||
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
|
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
|
||||||
@ -148,11 +105,8 @@ case "$ID" in
|
|||||||
ubuntu)
|
ubuntu)
|
||||||
install_ubuntu
|
install_ubuntu
|
||||||
;;
|
;;
|
||||||
rhel|almalinux)
|
centos)
|
||||||
install_rhel
|
install_centos
|
||||||
;;
|
|
||||||
sles)
|
|
||||||
install_sles
|
|
||||||
;;
|
;;
|
||||||
*)
|
*)
|
||||||
echo "Unable to determine OS..."
|
echo "Unable to determine OS..."
|
||||||
|
|||||||
@ -1,100 +0,0 @@
|
|||||||
ARG CUDA_VERSION=10.2
|
|
||||||
ARG BASE_TARGET=cuda${CUDA_VERSION}
|
|
||||||
FROM centos:7 as base
|
|
||||||
|
|
||||||
ENV LC_ALL en_US.UTF-8
|
|
||||||
ENV LANG en_US.UTF-8
|
|
||||||
ENV LANGUAGE en_US.UTF-8
|
|
||||||
|
|
||||||
ARG DEVTOOLSET_VERSION=9
|
|
||||||
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN yum update -y
|
|
||||||
RUN yum install -y wget curl perl util-linux xz bzip2 git patch which unzip
|
|
||||||
# Just add everything as a safe.directory for git since these will be used in multiple places with git
|
|
||||||
RUN git config --global --add safe.directory '*'
|
|
||||||
RUN yum install -y yum-utils centos-release-scl
|
|
||||||
RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
|
|
||||||
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
|
|
||||||
# EPEL for cmake
|
|
||||||
RUN yum --enablerepo=extras install -y epel-release
|
|
||||||
|
|
||||||
# cmake
|
|
||||||
RUN yum install -y cmake3 && \
|
|
||||||
ln -s /usr/bin/cmake3 /usr/bin/cmake
|
|
||||||
ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
|
||||||
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
|
|
||||||
|
|
||||||
RUN yum install -y autoconf aclocal automake make sudo
|
|
||||||
RUN rm -rf /usr/local/cuda-*
|
|
||||||
|
|
||||||
FROM base as patchelf
|
|
||||||
# Install patchelf
|
|
||||||
ADD ./common/install_patchelf.sh install_patchelf.sh
|
|
||||||
RUN bash ./install_patchelf.sh && rm install_patchelf.sh && cp $(which patchelf) /patchelf
|
|
||||||
|
|
||||||
FROM base as openssl
|
|
||||||
# Install openssl
|
|
||||||
ADD ./common/install_openssl.sh install_openssl.sh
|
|
||||||
RUN bash ./install_openssl.sh && rm install_openssl.sh
|
|
||||||
|
|
||||||
FROM base as conda
|
|
||||||
# Install Anaconda
|
|
||||||
ADD ./common/install_conda_docker.sh install_conda.sh
|
|
||||||
RUN bash ./install_conda.sh && rm install_conda.sh
|
|
||||||
|
|
||||||
# Install CUDA
|
|
||||||
FROM base as cuda
|
|
||||||
ARG CUDA_VERSION=10.2
|
|
||||||
RUN rm -rf /usr/local/cuda-*
|
|
||||||
ADD ./common/install_cuda.sh install_cuda.sh
|
|
||||||
ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
|
|
||||||
# Preserve CUDA_VERSION for the builds
|
|
||||||
ENV CUDA_VERSION=${CUDA_VERSION}
|
|
||||||
# Make things in our path by default
|
|
||||||
ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH
|
|
||||||
|
|
||||||
FROM cuda as cuda11.8
|
|
||||||
RUN bash ./install_cuda.sh 11.8
|
|
||||||
ENV DESIRED_CUDA=11.8
|
|
||||||
|
|
||||||
FROM cuda as cuda12.1
|
|
||||||
RUN bash ./install_cuda.sh 12.1
|
|
||||||
ENV DESIRED_CUDA=12.1
|
|
||||||
|
|
||||||
FROM cuda as cuda12.4
|
|
||||||
RUN bash ./install_cuda.sh 12.4
|
|
||||||
ENV DESIRED_CUDA=12.4
|
|
||||||
|
|
||||||
# Install MNIST test data
|
|
||||||
FROM base as mnist
|
|
||||||
ADD ./common/install_mnist.sh install_mnist.sh
|
|
||||||
RUN bash ./install_mnist.sh
|
|
||||||
|
|
||||||
FROM base as all_cuda
|
|
||||||
COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8
|
|
||||||
COPY --from=cuda12.1 /usr/local/cuda-12.1 /usr/local/cuda-12.1
|
|
||||||
COPY --from=cuda12.4 /usr/local/cuda-12.4 /usr/local/cuda-12.4
|
|
||||||
|
|
||||||
# Final step
|
|
||||||
FROM ${BASE_TARGET} as final
|
|
||||||
COPY --from=openssl /opt/openssl /opt/openssl
|
|
||||||
COPY --from=patchelf /patchelf /usr/local/bin/patchelf
|
|
||||||
COPY --from=conda /opt/conda /opt/conda
|
|
||||||
|
|
||||||
# Add jni.h for java host build.
|
|
||||||
COPY ./common/install_jni.sh install_jni.sh
|
|
||||||
COPY ./java/jni.h jni.h
|
|
||||||
RUN bash ./install_jni.sh && rm install_jni.sh
|
|
||||||
|
|
||||||
ENV PATH /opt/conda/bin:$PATH
|
|
||||||
COPY --from=mnist /usr/local/mnist /usr/local/mnist
|
|
||||||
RUN rm -rf /usr/local/cuda
|
|
||||||
RUN chmod o+rw /usr/local
|
|
||||||
RUN touch /.condarc && \
|
|
||||||
chmod o+rw /.condarc && \
|
|
||||||
chmod -R o+rw /opt/conda
|
|
||||||
@ -1,82 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
|
|
||||||
set -eou pipefail
|
|
||||||
|
|
||||||
image="$1"
|
|
||||||
shift
|
|
||||||
|
|
||||||
if [ -z "${image}" ]; then
|
|
||||||
echo "Usage: $0 IMAGE"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
DOCKER_IMAGE_NAME="pytorch/${image}"
|
|
||||||
|
|
||||||
|
|
||||||
export DOCKER_BUILDKIT=1
|
|
||||||
TOPDIR=$(git rev-parse --show-toplevel)
|
|
||||||
|
|
||||||
CUDA_VERSION=${CUDA_VERSION:-12.1}
|
|
||||||
|
|
||||||
case ${CUDA_VERSION} in
|
|
||||||
cpu)
|
|
||||||
BASE_TARGET=base
|
|
||||||
DOCKER_TAG=cpu
|
|
||||||
;;
|
|
||||||
all)
|
|
||||||
BASE_TARGET=all_cuda
|
|
||||||
DOCKER_TAG=latest
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
BASE_TARGET=cuda${CUDA_VERSION}
|
|
||||||
DOCKER_TAG=cuda${CUDA_VERSION}
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
|
|
||||||
(
|
|
||||||
set -x
|
|
||||||
# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
|
|
||||||
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
|
|
||||||
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
|
|
||||||
sudo systemctl daemon-reload
|
|
||||||
sudo systemctl restart docker
|
|
||||||
|
|
||||||
docker build \
|
|
||||||
--target final \
|
|
||||||
--progress plain \
|
|
||||||
--build-arg "BASE_TARGET=${BASE_TARGET}" \
|
|
||||||
--build-arg "CUDA_VERSION=${CUDA_VERSION}" \
|
|
||||||
--build-arg "DEVTOOLSET_VERSION=9" \
|
|
||||||
-t ${DOCKER_IMAGE_NAME} \
|
|
||||||
$@ \
|
|
||||||
-f "${TOPDIR}/.ci/docker/conda/Dockerfile" \
|
|
||||||
${TOPDIR}/.ci/docker/
|
|
||||||
)
|
|
||||||
|
|
||||||
if [[ "${DOCKER_TAG}" =~ ^cuda* ]]; then
|
|
||||||
# Test that we're using the right CUDA compiler
|
|
||||||
(
|
|
||||||
set -x
|
|
||||||
docker run --rm "${DOCKER_IMAGE_NAME}" nvcc --version | grep "cuda_${CUDA_VERSION}"
|
|
||||||
)
|
|
||||||
fi
|
|
||||||
|
|
||||||
GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
|
|
||||||
GIT_BRANCH_NAME=${GITHUB_REF##*/}
|
|
||||||
GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
|
|
||||||
DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE_NAME}-${GIT_BRANCH_NAME}
|
|
||||||
DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE_NAME}-${GIT_COMMIT_SHA}
|
|
||||||
if [[ "${WITH_PUSH:-}" == true ]]; then
|
|
||||||
(
|
|
||||||
set -x
|
|
||||||
docker push "${DOCKER_IMAGE_NAME}"
|
|
||||||
if [[ -n ${GITHUB_REF} ]]; then
|
|
||||||
docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_BRANCH_TAG}
|
|
||||||
docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_SHA_TAG}
|
|
||||||
docker push "${DOCKER_IMAGE_BRANCH_TAG}"
|
|
||||||
docker push "${DOCKER_IMAGE_SHA_TAG}"
|
|
||||||
fi
|
|
||||||
)
|
|
||||||
fi
|
|
||||||
@ -1,107 +0,0 @@
|
|||||||
ARG BASE_TARGET=base
|
|
||||||
ARG GPU_IMAGE=ubuntu:20.04
|
|
||||||
FROM ${GPU_IMAGE} as base
|
|
||||||
|
|
||||||
ENV DEBIAN_FRONTEND=noninteractive
|
|
||||||
|
|
||||||
RUN apt-get clean && apt-get update
|
|
||||||
RUN apt-get install -y curl locales g++ git-all autoconf automake make cmake wget unzip sudo
|
|
||||||
# Just add everything as a safe.directory for git since these will be used in multiple places with git
|
|
||||||
RUN git config --global --add safe.directory '*'
|
|
||||||
|
|
||||||
RUN locale-gen en_US.UTF-8
|
|
||||||
|
|
||||||
ENV LC_ALL en_US.UTF-8
|
|
||||||
ENV LANG en_US.UTF-8
|
|
||||||
ENV LANGUAGE en_US.UTF-8
|
|
||||||
|
|
||||||
# Install openssl
|
|
||||||
FROM base as openssl
|
|
||||||
ADD ./common/install_openssl.sh install_openssl.sh
|
|
||||||
RUN bash ./install_openssl.sh && rm install_openssl.sh
|
|
||||||
|
|
||||||
# Install python
|
|
||||||
FROM base as python
|
|
||||||
ADD common/install_cpython.sh install_cpython.sh
|
|
||||||
RUN apt-get update -y && \
|
|
||||||
apt-get install build-essential gdb lcov libbz2-dev libffi-dev \
|
|
||||||
libgdbm-dev liblzma-dev libncurses5-dev libreadline6-dev \
|
|
||||||
libsqlite3-dev libssl-dev lzma lzma-dev tk-dev uuid-dev zlib1g-dev -y && \
|
|
||||||
bash ./install_cpython.sh && \
|
|
||||||
rm install_cpython.sh && \
|
|
||||||
apt-get clean
|
|
||||||
|
|
||||||
FROM base as conda
|
|
||||||
ADD ./common/install_conda_docker.sh install_conda.sh
|
|
||||||
RUN bash ./install_conda.sh && rm install_conda.sh
|
|
||||||
|
|
||||||
FROM base as cpu
|
|
||||||
# Install Anaconda
|
|
||||||
COPY --from=conda /opt/conda /opt/conda
|
|
||||||
# Install python
|
|
||||||
COPY --from=python /opt/python /opt/python
|
|
||||||
COPY --from=python /opt/_internal /opt/_internal
|
|
||||||
ENV PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
|
|
||||||
# Install MKL
|
|
||||||
ADD ./common/install_mkl.sh install_mkl.sh
|
|
||||||
RUN bash ./install_mkl.sh && rm install_mkl.sh
|
|
||||||
|
|
||||||
FROM cpu as cuda
|
|
||||||
ADD ./common/install_cuda.sh install_cuda.sh
|
|
||||||
ADD ./common/install_magma.sh install_magma.sh
|
|
||||||
ENV CUDA_HOME /usr/local/cuda
|
|
||||||
|
|
||||||
FROM cuda as cuda11.8
|
|
||||||
RUN bash ./install_cuda.sh 11.8
|
|
||||||
RUN bash ./install_magma.sh 11.8
|
|
||||||
RUN ln -sf /usr/local/cuda-11.8 /usr/local/cuda
|
|
||||||
|
|
||||||
FROM cuda as cuda12.1
|
|
||||||
RUN bash ./install_cuda.sh 12.1
|
|
||||||
RUN bash ./install_magma.sh 12.1
|
|
||||||
RUN ln -sf /usr/local/cuda-12.1 /usr/local/cuda
|
|
||||||
|
|
||||||
FROM cuda as cuda12.4
|
|
||||||
RUN bash ./install_cuda.sh 12.4
|
|
||||||
RUN bash ./install_magma.sh 12.4
|
|
||||||
RUN ln -sf /usr/local/cuda-12.4 /usr/local/cuda
|
|
||||||
|
|
||||||
FROM cpu as rocm
|
|
||||||
ARG PYTORCH_ROCM_ARCH
|
|
||||||
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
|
|
||||||
ENV MKLROOT /opt/intel
|
|
||||||
# Adding ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0)
|
|
||||||
# find HIP works for ROCm5.7. Not needed for ROCm6.0 and above.
|
|
||||||
# Remove below when ROCm5.7 is not in support matrix anymore.
|
|
||||||
ENV ROCM_PATH /opt/rocm
|
|
||||||
# No need to install ROCm as base docker image should have full ROCm install
|
|
||||||
#ADD ./common/install_rocm.sh install_rocm.sh
|
|
||||||
ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
|
|
||||||
ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
|
|
||||||
# gfortran and python needed for building magma from source for ROCm
|
|
||||||
RUN apt-get update -y && \
|
|
||||||
apt-get install gfortran -y && \
|
|
||||||
apt-get install python -y && \
|
|
||||||
apt-get clean
|
|
||||||
|
|
||||||
RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
|
|
||||||
RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
|
|
||||||
|
|
||||||
# Install AOTriton
|
|
||||||
COPY ./common/common_utils.sh common_utils.sh
|
|
||||||
COPY ./aotriton_version.txt aotriton_version.txt
|
|
||||||
COPY ./common/install_aotriton.sh install_aotriton.sh
|
|
||||||
RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt
|
|
||||||
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
|
|
||||||
|
|
||||||
FROM ${BASE_TARGET} as final
|
|
||||||
COPY --from=openssl /opt/openssl /opt/openssl
|
|
||||||
# Install patchelf
|
|
||||||
ADD ./common/install_patchelf.sh install_patchelf.sh
|
|
||||||
RUN bash ./install_patchelf.sh && rm install_patchelf.sh
|
|
||||||
# Install Anaconda
|
|
||||||
COPY --from=conda /opt/conda /opt/conda
|
|
||||||
# Install python
|
|
||||||
COPY --from=python /opt/python /opt/python
|
|
||||||
COPY --from=python /opt/_internal /opt/_internal
|
|
||||||
ENV PATH=/opt/conda/bin:/usr/local/cuda/bin:$PATH
|
|
||||||
@ -1,93 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
|
|
||||||
set -eou pipefail
|
|
||||||
|
|
||||||
image="$1"
|
|
||||||
shift
|
|
||||||
|
|
||||||
if [ -z "${image}" ]; then
|
|
||||||
echo "Usage: $0 IMAGE"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
DOCKER_IMAGE="pytorch/${image}"
|
|
||||||
|
|
||||||
TOPDIR=$(git rev-parse --show-toplevel)
|
|
||||||
|
|
||||||
GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
|
|
||||||
GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
|
|
||||||
|
|
||||||
WITH_PUSH=${WITH_PUSH:-}
|
|
||||||
|
|
||||||
DOCKER=${DOCKER:-docker}
|
|
||||||
|
|
||||||
case ${GPU_ARCH_TYPE} in
|
|
||||||
cpu)
|
|
||||||
BASE_TARGET=cpu
|
|
||||||
DOCKER_TAG=cpu
|
|
||||||
GPU_IMAGE=ubuntu:20.04
|
|
||||||
DOCKER_GPU_BUILD_ARG=""
|
|
||||||
;;
|
|
||||||
cuda)
|
|
||||||
BASE_TARGET=cuda${GPU_ARCH_VERSION}
|
|
||||||
DOCKER_TAG=cuda${GPU_ARCH_VERSION}
|
|
||||||
GPU_IMAGE=ubuntu:20.04
|
|
||||||
DOCKER_GPU_BUILD_ARG=""
|
|
||||||
;;
|
|
||||||
rocm)
|
|
||||||
BASE_TARGET=rocm
|
|
||||||
DOCKER_TAG=rocm${GPU_ARCH_VERSION}
|
|
||||||
GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
|
|
||||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100"
|
|
||||||
ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)"
|
|
||||||
if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then
|
|
||||||
ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0}))
|
|
||||||
else
|
|
||||||
echo "ERROR: rocm regex failed"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
if [[ $ROCM_VERSION_INT -ge 60000 ]]; then
|
|
||||||
PYTORCH_ROCM_ARCH+=";gfx942"
|
|
||||||
fi
|
|
||||||
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
|
|
||||||
(
|
|
||||||
set -x
|
|
||||||
DOCKER_BUILDKIT=1 ${DOCKER} build \
|
|
||||||
--target final \
|
|
||||||
${DOCKER_GPU_BUILD_ARG} \
|
|
||||||
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \
|
|
||||||
--build-arg "BASE_TARGET=${BASE_TARGET}" \
|
|
||||||
-t "${DOCKER_IMAGE}" \
|
|
||||||
$@ \
|
|
||||||
-f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
|
|
||||||
"${TOPDIR}/.ci/docker/"
|
|
||||||
|
|
||||||
)
|
|
||||||
|
|
||||||
GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
|
|
||||||
GIT_BRANCH_NAME=${GITHUB_REF##*/}
|
|
||||||
GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
|
|
||||||
DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
|
|
||||||
DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
|
|
||||||
|
|
||||||
if [[ "${WITH_PUSH}" == true ]]; then
|
|
||||||
(
|
|
||||||
set -x
|
|
||||||
${DOCKER} push "${DOCKER_IMAGE}"
|
|
||||||
if [[ -n ${GITHUB_REF} ]]; then
|
|
||||||
${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
|
|
||||||
${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
|
|
||||||
${DOCKER} push "${DOCKER_IMAGE_BRANCH_TAG}"
|
|
||||||
${DOCKER} push "${DOCKER_IMAGE_SHA_TAG}"
|
|
||||||
fi
|
|
||||||
)
|
|
||||||
fi
|
|
||||||
@ -29,7 +29,7 @@ RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/re
|
|||||||
|
|
||||||
# Install cuda and cudnn
|
# Install cuda and cudnn
|
||||||
ARG CUDA_VERSION
|
ARG CUDA_VERSION
|
||||||
COPY ./common/install_cuda.sh install_cuda.sh
|
RUN wget -q https://raw.githubusercontent.com/pytorch/builder/main/common/install_cuda.sh -O install_cuda.sh
|
||||||
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
|
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
|
||||||
ENV DESIRED_CUDA ${CUDA_VERSION}
|
ENV DESIRED_CUDA ${CUDA_VERSION}
|
||||||
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
|
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
|
||||||
|
|||||||
@ -1,203 +0,0 @@
|
|||||||
# syntax = docker/dockerfile:experimental
|
|
||||||
ARG ROCM_VERSION=3.7
|
|
||||||
ARG BASE_CUDA_VERSION=11.8
|
|
||||||
|
|
||||||
ARG GPU_IMAGE=centos:7
|
|
||||||
FROM centos:7 as base
|
|
||||||
|
|
||||||
ENV LC_ALL en_US.UTF-8
|
|
||||||
ENV LANG en_US.UTF-8
|
|
||||||
ENV LANGUAGE en_US.UTF-8
|
|
||||||
|
|
||||||
ARG DEVTOOLSET_VERSION=9
|
|
||||||
|
|
||||||
# Note: This is required patch since CentOS have reached EOL
|
|
||||||
# otherwise any yum install setp will fail
|
|
||||||
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
|
|
||||||
# Just add everything as a safe.directory for git since these will be used in multiple places with git
|
|
||||||
RUN git config --global --add safe.directory '*'
|
|
||||||
RUN yum install -y yum-utils centos-release-scl
|
|
||||||
RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
|
|
||||||
# Note: After running yum-config-manager --enable rhel-server-rhscl-7-rpms
|
|
||||||
# patch is required once again. Somehow this steps adds mirror.centos.org
|
|
||||||
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
|
|
||||||
ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
|
||||||
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
|
|
||||||
|
|
||||||
RUN yum --enablerepo=extras install -y epel-release
|
|
||||||
|
|
||||||
# cmake-3.18.4 from pip
|
|
||||||
RUN yum install -y python3-pip && \
|
|
||||||
python3 -mpip install cmake==3.18.4 && \
|
|
||||||
ln -s /usr/local/bin/cmake /usr/bin/cmake
|
|
||||||
|
|
||||||
RUN yum install -y autoconf aclocal automake make sudo
|
|
||||||
|
|
||||||
FROM base as openssl
|
|
||||||
# Install openssl (this must precede `build python` step)
|
|
||||||
# (In order to have a proper SSL module, Python is compiled
|
|
||||||
# against a recent openssl [see env vars above], which is linked
|
|
||||||
# statically. We delete openssl afterwards.)
|
|
||||||
ADD ./common/install_openssl.sh install_openssl.sh
|
|
||||||
RUN bash ./install_openssl.sh && rm install_openssl.sh
|
|
||||||
|
|
||||||
# EPEL for cmake
|
|
||||||
FROM base as patchelf
|
|
||||||
# Install patchelf
|
|
||||||
ADD ./common/install_patchelf.sh install_patchelf.sh
|
|
||||||
RUN bash ./install_patchelf.sh && rm install_patchelf.sh
|
|
||||||
RUN cp $(which patchelf) /patchelf
|
|
||||||
|
|
||||||
FROM patchelf as python
|
|
||||||
# build python
|
|
||||||
COPY manywheel/build_scripts /build_scripts
|
|
||||||
ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh
|
|
||||||
RUN bash build_scripts/build.sh && rm -r build_scripts
|
|
||||||
|
|
||||||
FROM base as cuda
|
|
||||||
ARG BASE_CUDA_VERSION=10.2
|
|
||||||
# Install CUDA
|
|
||||||
ADD ./common/install_cuda.sh install_cuda.sh
|
|
||||||
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
|
|
||||||
|
|
||||||
FROM base as intel
|
|
||||||
# MKL
|
|
||||||
ADD ./common/install_mkl.sh install_mkl.sh
|
|
||||||
RUN bash ./install_mkl.sh && rm install_mkl.sh
|
|
||||||
|
|
||||||
FROM base as magma
|
|
||||||
ARG BASE_CUDA_VERSION=10.2
|
|
||||||
# Install magma
|
|
||||||
ADD ./common/install_magma.sh install_magma.sh
|
|
||||||
RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
|
|
||||||
|
|
||||||
FROM base as jni
|
|
||||||
# Install java jni header
|
|
||||||
ADD ./common/install_jni.sh install_jni.sh
|
|
||||||
ADD ./java/jni.h jni.h
|
|
||||||
RUN bash ./install_jni.sh && rm install_jni.sh
|
|
||||||
|
|
||||||
FROM base as libpng
|
|
||||||
# Install libpng
|
|
||||||
ADD ./common/install_libpng.sh install_libpng.sh
|
|
||||||
RUN bash ./install_libpng.sh && rm install_libpng.sh
|
|
||||||
|
|
||||||
FROM ${GPU_IMAGE} as common
|
|
||||||
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
|
|
||||||
ENV LC_ALL en_US.UTF-8
|
|
||||||
ENV LANG en_US.UTF-8
|
|
||||||
ENV LANGUAGE en_US.UTF-8
|
|
||||||
RUN yum install -y \
|
|
||||||
aclocal \
|
|
||||||
autoconf \
|
|
||||||
automake \
|
|
||||||
bison \
|
|
||||||
bzip2 \
|
|
||||||
curl \
|
|
||||||
diffutils \
|
|
||||||
file \
|
|
||||||
git \
|
|
||||||
make \
|
|
||||||
patch \
|
|
||||||
perl \
|
|
||||||
unzip \
|
|
||||||
util-linux \
|
|
||||||
wget \
|
|
||||||
which \
|
|
||||||
xz \
|
|
||||||
yasm
|
|
||||||
RUN yum install -y \
|
|
||||||
https://repo.ius.io/ius-release-el7.rpm \
|
|
||||||
https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm
|
|
||||||
|
|
||||||
RUN yum swap -y git git236-core
|
|
||||||
# git236+ would refuse to run git commands in repos owned by other users
|
|
||||||
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
|
|
||||||
# Override this behaviour by treating every folder as safe
|
|
||||||
# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
|
|
||||||
RUN git config --global --add safe.directory "*"
|
|
||||||
|
|
||||||
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
|
|
||||||
# Install LLVM version
|
|
||||||
COPY --from=openssl /opt/openssl /opt/openssl
|
|
||||||
COPY --from=python /opt/python /opt/python
|
|
||||||
COPY --from=python /opt/_internal /opt/_internal
|
|
||||||
COPY --from=python /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel
|
|
||||||
COPY --from=intel /opt/intel /opt/intel
|
|
||||||
COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf
|
|
||||||
COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h
|
|
||||||
COPY --from=libpng /usr/local/bin/png* /usr/local/bin/
|
|
||||||
COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/
|
|
||||||
COPY --from=libpng /usr/local/include/png* /usr/local/include/
|
|
||||||
COPY --from=libpng /usr/local/include/libpng* /usr/local/include/
|
|
||||||
COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/
|
|
||||||
COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig
|
|
||||||
|
|
||||||
FROM common as cpu_final
|
|
||||||
ARG BASE_CUDA_VERSION=10.1
|
|
||||||
ARG DEVTOOLSET_VERSION=9
|
|
||||||
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
|
|
||||||
|
|
||||||
RUN yum install -y yum-utils centos-release-scl
|
|
||||||
RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
|
|
||||||
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
|
|
||||||
ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
|
||||||
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
|
|
||||||
|
|
||||||
# cmake is already installed inside the rocm base image, so remove if present
|
|
||||||
RUN rpm -e cmake || true
|
|
||||||
# cmake-3.18.4 from pip
|
|
||||||
RUN yum install -y python3-pip && \
|
|
||||||
python3 -mpip install cmake==3.18.4 && \
|
|
||||||
ln -s /usr/local/bin/cmake /usr/bin/cmake
|
|
||||||
|
|
||||||
# ninja
|
|
||||||
RUN yum install -y ninja-build
|
|
||||||
|
|
||||||
FROM cpu_final as cuda_final
|
|
||||||
RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
|
|
||||||
COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
|
|
||||||
COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
|
|
||||||
RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
|
|
||||||
ENV PATH=/usr/local/cuda/bin:$PATH
|
|
||||||
|
|
||||||
FROM cpu_final as rocm_final
|
|
||||||
ARG ROCM_VERSION=3.7
|
|
||||||
ARG PYTORCH_ROCM_ARCH
|
|
||||||
ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
|
|
||||||
# Adding ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0)
|
|
||||||
# find HIP works for ROCm5.7. Not needed for ROCm6.0 and above.
|
|
||||||
# Remove below when ROCm5.7 is not in support matrix anymore.
|
|
||||||
ENV ROCM_PATH /opt/rocm
|
|
||||||
ENV MKLROOT /opt/intel
|
|
||||||
# No need to install ROCm as base docker image should have full ROCm install
|
|
||||||
#ADD ./common/install_rocm.sh install_rocm.sh
|
|
||||||
#RUN ROCM_VERSION=${ROCM_VERSION} bash ./install_rocm.sh && rm install_rocm.sh
|
|
||||||
ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
|
|
||||||
RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
|
|
||||||
# cmake3 is needed for the MIOpen build
|
|
||||||
RUN ln -sf /usr/local/bin/cmake /usr/bin/cmake3
|
|
||||||
ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
|
|
||||||
RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
|
|
||||||
ADD ./common/install_miopen.sh install_miopen.sh
|
|
||||||
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
|
|
||||||
|
|
||||||
# Install AOTriton
|
|
||||||
COPY ./common/common_utils.sh common_utils.sh
|
|
||||||
COPY ./aotriton_version.txt aotriton_version.txt
|
|
||||||
COPY ./common/install_aotriton.sh install_aotriton.sh
|
|
||||||
RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt
|
|
||||||
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
|
|
||||||
@ -1,153 +0,0 @@
|
|||||||
# syntax = docker/dockerfile:experimental
|
|
||||||
ARG ROCM_VERSION=3.7
|
|
||||||
ARG BASE_CUDA_VERSION=10.2
|
|
||||||
ARG GPU_IMAGE=nvidia/cuda:${BASE_CUDA_VERSION}-devel-centos7
|
|
||||||
FROM quay.io/pypa/manylinux2014_x86_64 as base
|
|
||||||
|
|
||||||
ENV LC_ALL en_US.UTF-8
|
|
||||||
ENV LANG en_US.UTF-8
|
|
||||||
ENV LANGUAGE en_US.UTF-8
|
|
||||||
|
|
||||||
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
|
|
||||||
RUN yum install -y yum-utils centos-release-scl sudo
|
|
||||||
RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
|
|
||||||
RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils
|
|
||||||
ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH
|
|
||||||
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH
|
|
||||||
|
|
||||||
# cmake
|
|
||||||
RUN yum install -y cmake3 && \
|
|
||||||
ln -s /usr/bin/cmake3 /usr/bin/cmake
|
|
||||||
FROM base as openssl
|
|
||||||
# Install openssl (this must precede `build python` step)
|
|
||||||
# (In order to have a proper SSL module, Python is compiled
|
|
||||||
# against a recent openssl [see env vars above], which is linked
|
|
||||||
# statically. We delete openssl afterwards.)
|
|
||||||
ADD ./common/install_openssl.sh install_openssl.sh
|
|
||||||
RUN bash ./install_openssl.sh && rm install_openssl.sh
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# remove unncessary python versions
|
|
||||||
RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
|
|
||||||
RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
|
|
||||||
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
|
|
||||||
RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
|
|
||||||
|
|
||||||
FROM base as cuda
|
|
||||||
ARG BASE_CUDA_VERSION=10.2
|
|
||||||
# Install CUDA
|
|
||||||
ADD ./common/install_cuda.sh install_cuda.sh
|
|
||||||
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
|
|
||||||
|
|
||||||
FROM base as intel
|
|
||||||
# MKL
|
|
||||||
ADD ./common/install_mkl.sh install_mkl.sh
|
|
||||||
RUN bash ./install_mkl.sh && rm install_mkl.sh
|
|
||||||
|
|
||||||
FROM base as magma
|
|
||||||
ARG BASE_CUDA_VERSION=10.2
|
|
||||||
# Install magma
|
|
||||||
ADD ./common/install_magma.sh install_magma.sh
|
|
||||||
RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
|
|
||||||
|
|
||||||
FROM base as jni
|
|
||||||
# Install java jni header
|
|
||||||
ADD ./common/install_jni.sh install_jni.sh
|
|
||||||
ADD ./java/jni.h jni.h
|
|
||||||
RUN bash ./install_jni.sh && rm install_jni.sh
|
|
||||||
|
|
||||||
FROM base as libpng
|
|
||||||
# Install libpng
|
|
||||||
ADD ./common/install_libpng.sh install_libpng.sh
|
|
||||||
RUN bash ./install_libpng.sh && rm install_libpng.sh
|
|
||||||
|
|
||||||
FROM ${GPU_IMAGE} as common
|
|
||||||
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
|
|
||||||
RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
|
|
||||||
ENV LC_ALL en_US.UTF-8
|
|
||||||
ENV LANG en_US.UTF-8
|
|
||||||
ENV LANGUAGE en_US.UTF-8
|
|
||||||
RUN yum install -y \
|
|
||||||
aclocal \
|
|
||||||
autoconf \
|
|
||||||
automake \
|
|
||||||
bison \
|
|
||||||
bzip2 \
|
|
||||||
curl \
|
|
||||||
diffutils \
|
|
||||||
file \
|
|
||||||
git \
|
|
||||||
make \
|
|
||||||
patch \
|
|
||||||
perl \
|
|
||||||
unzip \
|
|
||||||
util-linux \
|
|
||||||
wget \
|
|
||||||
which \
|
|
||||||
xz \
|
|
||||||
yasm
|
|
||||||
RUN yum install -y \
|
|
||||||
https://repo.ius.io/ius-release-el7.rpm \
|
|
||||||
https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm
|
|
||||||
|
|
||||||
RUN yum swap -y git git236-core
|
|
||||||
# git236+ would refuse to run git commands in repos owned by other users
|
|
||||||
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
|
|
||||||
# Override this behaviour by treating every folder as safe
|
|
||||||
# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
|
|
||||||
RUN git config --global --add safe.directory "*"
|
|
||||||
|
|
||||||
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
|
|
||||||
# Install LLVM version
|
|
||||||
COPY --from=openssl /opt/openssl /opt/openssl
|
|
||||||
COPY --from=base /opt/python /opt/python
|
|
||||||
COPY --from=base /opt/_internal /opt/_internal
|
|
||||||
COPY --from=base /usr/local/bin/auditwheel /usr/local/bin/auditwheel
|
|
||||||
COPY --from=intel /opt/intel /opt/intel
|
|
||||||
COPY --from=base /usr/local/bin/patchelf /usr/local/bin/patchelf
|
|
||||||
COPY --from=libpng /usr/local/bin/png* /usr/local/bin/
|
|
||||||
COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/
|
|
||||||
COPY --from=libpng /usr/local/include/png* /usr/local/include/
|
|
||||||
COPY --from=libpng /usr/local/include/libpng* /usr/local/include/
|
|
||||||
COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/
|
|
||||||
COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig
|
|
||||||
COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h
|
|
||||||
|
|
||||||
FROM common as cpu_final
|
|
||||||
ARG BASE_CUDA_VERSION=10.2
|
|
||||||
RUN yum install -y yum-utils centos-release-scl
|
|
||||||
RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
|
|
||||||
RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils
|
|
||||||
ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH
|
|
||||||
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH
|
|
||||||
|
|
||||||
# cmake
|
|
||||||
RUN yum install -y cmake3 && \
|
|
||||||
ln -s /usr/bin/cmake3 /usr/bin/cmake
|
|
||||||
|
|
||||||
# ninja
|
|
||||||
RUN yum install -y http://repo.okay.com.mx/centos/7/x86_64/release/okay-release-1-1.noarch.rpm
|
|
||||||
RUN yum install -y ninja-build
|
|
||||||
|
|
||||||
FROM cpu_final as cuda_final
|
|
||||||
RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
|
|
||||||
COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
|
|
||||||
COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
|
|
||||||
|
|
||||||
FROM common as rocm_final
|
|
||||||
ARG ROCM_VERSION=3.7
|
|
||||||
# Install ROCm
|
|
||||||
ADD ./common/install_rocm.sh install_rocm.sh
|
|
||||||
RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh
|
|
||||||
# cmake is already installed inside the rocm base image, but both 2 and 3 exist
|
|
||||||
# cmake3 is needed for the later MIOpen custom build, so that step is last.
|
|
||||||
RUN yum install -y cmake3 && \
|
|
||||||
rm -f /usr/bin/cmake && \
|
|
||||||
ln -s /usr/bin/cmake3 /usr/bin/cmake
|
|
||||||
ADD ./common/install_miopen.sh install_miopen.sh
|
|
||||||
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
|
|
||||||
@ -1,157 +0,0 @@
|
|||||||
# syntax = docker/dockerfile:experimental
|
|
||||||
ARG ROCM_VERSION=3.7
|
|
||||||
ARG BASE_CUDA_VERSION=11.8
|
|
||||||
ARG GPU_IMAGE=amd64/almalinux:8
|
|
||||||
FROM quay.io/pypa/manylinux_2_28_x86_64 as base
|
|
||||||
|
|
||||||
ENV LC_ALL en_US.UTF-8
|
|
||||||
ENV LANG en_US.UTF-8
|
|
||||||
ENV LANGUAGE en_US.UTF-8
|
|
||||||
|
|
||||||
ARG DEVTOOLSET_VERSION=11
|
|
||||||
RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
|
|
||||||
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
|
||||||
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
|
|
||||||
|
|
||||||
# cmake-3.18.4 from pip
|
|
||||||
RUN yum install -y python3-pip && \
|
|
||||||
python3 -mpip install cmake==3.18.4 && \
|
|
||||||
ln -s /usr/local/bin/cmake /usr/bin/cmake3
|
|
||||||
|
|
||||||
FROM base as openssl
|
|
||||||
# Install openssl (this must precede `build python` step)
|
|
||||||
# (In order to have a proper SSL module, Python is compiled
|
|
||||||
# against a recent openssl [see env vars above], which is linked
|
|
||||||
# statically. We delete openssl afterwards.)
|
|
||||||
ADD ./common/install_openssl.sh install_openssl.sh
|
|
||||||
RUN bash ./install_openssl.sh && rm install_openssl.sh
|
|
||||||
|
|
||||||
|
|
||||||
# remove unncessary python versions
|
|
||||||
RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
|
|
||||||
RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
|
|
||||||
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
|
|
||||||
RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
|
|
||||||
|
|
||||||
FROM base as cuda
|
|
||||||
ARG BASE_CUDA_VERSION=11.8
|
|
||||||
# Install CUDA
|
|
||||||
ADD ./common/install_cuda.sh install_cuda.sh
|
|
||||||
RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
|
|
||||||
|
|
||||||
FROM base as intel
|
|
||||||
# MKL
|
|
||||||
ADD ./common/install_mkl.sh install_mkl.sh
|
|
||||||
RUN bash ./install_mkl.sh && rm install_mkl.sh
|
|
||||||
|
|
||||||
FROM base as magma
|
|
||||||
ARG BASE_CUDA_VERSION=10.2
|
|
||||||
# Install magma
|
|
||||||
ADD ./common/install_magma.sh install_magma.sh
|
|
||||||
RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
|
|
||||||
|
|
||||||
FROM base as jni
|
|
||||||
# Install java jni header
|
|
||||||
ADD ./common/install_jni.sh install_jni.sh
|
|
||||||
ADD ./java/jni.h jni.h
|
|
||||||
RUN bash ./install_jni.sh && rm install_jni.sh
|
|
||||||
|
|
||||||
FROM base as libpng
|
|
||||||
# Install libpng
|
|
||||||
ADD ./common/install_libpng.sh install_libpng.sh
|
|
||||||
RUN bash ./install_libpng.sh && rm install_libpng.sh
|
|
||||||
|
|
||||||
FROM ${GPU_IMAGE} as common
|
|
||||||
ARG DEVTOOLSET_VERSION=11
|
|
||||||
ENV LC_ALL en_US.UTF-8
|
|
||||||
ENV LANG en_US.UTF-8
|
|
||||||
ENV LANGUAGE en_US.UTF-8
|
|
||||||
RUN yum -y install epel-release
|
|
||||||
RUN yum -y update
|
|
||||||
RUN yum install -y \
|
|
||||||
autoconf \
|
|
||||||
automake \
|
|
||||||
bison \
|
|
||||||
bzip2 \
|
|
||||||
curl \
|
|
||||||
diffutils \
|
|
||||||
file \
|
|
||||||
git \
|
|
||||||
make \
|
|
||||||
patch \
|
|
||||||
perl \
|
|
||||||
unzip \
|
|
||||||
util-linux \
|
|
||||||
wget \
|
|
||||||
which \
|
|
||||||
xz \
|
|
||||||
gcc-toolset-${DEVTOOLSET_VERSION}-toolchain \
|
|
||||||
glibc-langpack-en
|
|
||||||
RUN yum install -y \
|
|
||||||
https://repo.ius.io/ius-release-el7.rpm \
|
|
||||||
https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm
|
|
||||||
|
|
||||||
RUN yum swap -y git git236-core
|
|
||||||
# git236+ would refuse to run git commands in repos owned by other users
|
|
||||||
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
|
|
||||||
# Override this behaviour by treating every folder as safe
|
|
||||||
# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
|
|
||||||
RUN git config --global --add safe.directory "*"
|
|
||||||
|
|
||||||
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
|
|
||||||
# Install LLVM version
|
|
||||||
COPY --from=openssl /opt/openssl /opt/openssl
|
|
||||||
COPY --from=base /opt/python /opt/python
|
|
||||||
COPY --from=base /opt/_internal /opt/_internal
|
|
||||||
COPY --from=base /usr/local/bin/auditwheel /usr/local/bin/auditwheel
|
|
||||||
COPY --from=intel /opt/intel /opt/intel
|
|
||||||
COPY --from=base /usr/local/bin/patchelf /usr/local/bin/patchelf
|
|
||||||
COPY --from=libpng /usr/local/bin/png* /usr/local/bin/
|
|
||||||
COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/
|
|
||||||
COPY --from=libpng /usr/local/include/png* /usr/local/include/
|
|
||||||
COPY --from=libpng /usr/local/include/libpng* /usr/local/include/
|
|
||||||
COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/
|
|
||||||
COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig
|
|
||||||
COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h
|
|
||||||
|
|
||||||
FROM common as cpu_final
|
|
||||||
ARG BASE_CUDA_VERSION=11.8
|
|
||||||
ARG DEVTOOLSET_VERSION=11
|
|
||||||
# Ensure the expected devtoolset is used
|
|
||||||
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
|
||||||
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
|
|
||||||
|
|
||||||
# cmake-3.18.4 from pip
|
|
||||||
RUN yum install -y python3-pip && \
|
|
||||||
python3 -mpip install cmake==3.18.4 && \
|
|
||||||
ln -s /usr/local/bin/cmake /usr/bin/cmake3
|
|
||||||
|
|
||||||
FROM cpu_final as cuda_final
|
|
||||||
RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
|
|
||||||
COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
|
|
||||||
COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
|
|
||||||
|
|
||||||
FROM common as rocm_final
|
|
||||||
ARG ROCM_VERSION=3.7
|
|
||||||
# Install ROCm
|
|
||||||
ADD ./common/install_rocm.sh install_rocm.sh
|
|
||||||
RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh
|
|
||||||
# cmake is already installed inside the rocm base image, but both 2 and 3 exist
|
|
||||||
# cmake3 is needed for the later MIOpen custom build, so that step is last.
|
|
||||||
RUN yum install -y cmake3 && \
|
|
||||||
rm -f /usr/bin/cmake && \
|
|
||||||
ln -s /usr/bin/cmake3 /usr/bin/cmake
|
|
||||||
ADD ./common/install_miopen.sh install_miopen.sh
|
|
||||||
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
|
|
||||||
|
|
||||||
FROM cpu_final as xpu_final
|
|
||||||
# XPU CD use rolling driver
|
|
||||||
ENV XPU_DRIVER_TYPE ROLLING
|
|
||||||
# cmake-3.28.4 from pip
|
|
||||||
RUN python3 -m pip install --upgrade pip && \
|
|
||||||
python3 -mpip install cmake==3.28.4
|
|
||||||
# Install setuptools and wheel for python 3.13
|
|
||||||
RUN /opt/python/cp313-cp313/bin/python -m pip install setuptools wheel
|
|
||||||
ADD ./common/install_xpu.sh install_xpu.sh
|
|
||||||
RUN bash ./install_xpu.sh && rm install_xpu.sh
|
|
||||||
RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd
|
|
||||||
@ -1,57 +0,0 @@
|
|||||||
FROM quay.io/pypa/manylinux_2_28_aarch64 as base
|
|
||||||
|
|
||||||
# Graviton needs GCC 10 or above for the build. GCC12 is the default version in almalinux-8.
|
|
||||||
ARG GCCTOOLSET_VERSION=11
|
|
||||||
|
|
||||||
# Language variabes
|
|
||||||
ENV LC_ALL=en_US.UTF-8
|
|
||||||
ENV LANG=en_US.UTF-8
|
|
||||||
ENV LANGUAGE=en_US.UTF-8
|
|
||||||
|
|
||||||
# Installed needed OS packages. This is to support all
|
|
||||||
# the binary builds (torch, vision, audio, text, data)
|
|
||||||
RUN yum -y install epel-release
|
|
||||||
RUN yum -y update
|
|
||||||
RUN yum install -y \
|
|
||||||
autoconf \
|
|
||||||
automake \
|
|
||||||
bison \
|
|
||||||
bzip2 \
|
|
||||||
curl \
|
|
||||||
diffutils \
|
|
||||||
file \
|
|
||||||
git \
|
|
||||||
less \
|
|
||||||
libffi-devel \
|
|
||||||
libgomp \
|
|
||||||
make \
|
|
||||||
openssl-devel \
|
|
||||||
patch \
|
|
||||||
perl \
|
|
||||||
unzip \
|
|
||||||
util-linux \
|
|
||||||
wget \
|
|
||||||
which \
|
|
||||||
xz \
|
|
||||||
yasm \
|
|
||||||
zstd \
|
|
||||||
sudo \
|
|
||||||
gcc-toolset-${GCCTOOLSET_VERSION}-toolchain
|
|
||||||
|
|
||||||
# Ensure the expected devtoolset is used
|
|
||||||
ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
|
|
||||||
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
|
|
||||||
|
|
||||||
# git236+ would refuse to run git commands in repos owned by other users
|
|
||||||
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
|
|
||||||
# Override this behaviour by treating every folder as safe
|
|
||||||
# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
|
|
||||||
RUN git config --global --add safe.directory "*"
|
|
||||||
|
|
||||||
FROM base as final
|
|
||||||
|
|
||||||
# remove unncessary python versions
|
|
||||||
RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
|
|
||||||
RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
|
|
||||||
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
|
|
||||||
RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
|
|
||||||
@ -1,94 +0,0 @@
|
|||||||
FROM quay.io/pypa/manylinux2014_aarch64 as base
|
|
||||||
|
|
||||||
|
|
||||||
# Graviton needs GCC 10 for the build
|
|
||||||
ARG DEVTOOLSET_VERSION=10
|
|
||||||
|
|
||||||
# Language variabes
|
|
||||||
ENV LC_ALL=en_US.UTF-8
|
|
||||||
ENV LANG=en_US.UTF-8
|
|
||||||
ENV LANGUAGE=en_US.UTF-8
|
|
||||||
|
|
||||||
# Installed needed OS packages. This is to support all
|
|
||||||
# the binary builds (torch, vision, audio, text, data)
|
|
||||||
RUN yum -y install epel-release
|
|
||||||
RUN yum -y update
|
|
||||||
RUN yum install -y \
|
|
||||||
autoconf \
|
|
||||||
automake \
|
|
||||||
bison \
|
|
||||||
bzip2 \
|
|
||||||
curl \
|
|
||||||
diffutils \
|
|
||||||
file \
|
|
||||||
git \
|
|
||||||
make \
|
|
||||||
patch \
|
|
||||||
perl \
|
|
||||||
unzip \
|
|
||||||
util-linux \
|
|
||||||
wget \
|
|
||||||
which \
|
|
||||||
xz \
|
|
||||||
yasm \
|
|
||||||
less \
|
|
||||||
zstd \
|
|
||||||
libgomp \
|
|
||||||
sudo \
|
|
||||||
devtoolset-${DEVTOOLSET_VERSION}-gcc \
|
|
||||||
devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ \
|
|
||||||
devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
|
|
||||||
devtoolset-${DEVTOOLSET_VERSION}-binutils
|
|
||||||
|
|
||||||
# Ensure the expected devtoolset is used
|
|
||||||
ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
|
||||||
ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
|
|
||||||
|
|
||||||
|
|
||||||
# git236+ would refuse to run git commands in repos owned by other users
|
|
||||||
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
|
|
||||||
# Override this behaviour by treating every folder as safe
|
|
||||||
# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
|
|
||||||
RUN git config --global --add safe.directory "*"
|
|
||||||
|
|
||||||
|
|
||||||
###############################################################################
|
|
||||||
# libglfortran.a hack
|
|
||||||
#
|
|
||||||
# libgfortran.a from quay.io/pypa/manylinux2014_aarch64 is not compiled with -fPIC.
|
|
||||||
# This causes __stack_chk_guard@@GLIBC_2.17 on pytorch build. To solve, get
|
|
||||||
# ubuntu's libgfortran.a which is compiled with -fPIC
|
|
||||||
# NOTE: Need a better way to get this library as Ubuntu's package can be removed by the vender, or changed
|
|
||||||
###############################################################################
|
|
||||||
RUN cd ~/ \
|
|
||||||
&& curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-1ubuntu1_arm64.deb \
|
|
||||||
&& ar x ~/libgfortran-10-dev.deb \
|
|
||||||
&& tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ \
|
|
||||||
&& cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/
|
|
||||||
|
|
||||||
# install cmake
|
|
||||||
RUN yum install -y cmake3 && \
|
|
||||||
ln -s /usr/bin/cmake3 /usr/bin/cmake
|
|
||||||
|
|
||||||
FROM base as openssl
|
|
||||||
# Install openssl (this must precede `build python` step)
|
|
||||||
# (In order to have a proper SSL module, Python is compiled
|
|
||||||
# against a recent openssl [see env vars above], which is linked
|
|
||||||
# statically. We delete openssl afterwards.)
|
|
||||||
ADD ./common/install_openssl.sh install_openssl.sh
|
|
||||||
RUN bash ./install_openssl.sh && rm install_openssl.sh
|
|
||||||
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
|
|
||||||
|
|
||||||
FROM base as openblas
|
|
||||||
# Install openblas
|
|
||||||
ADD ./common/install_openblas.sh install_openblas.sh
|
|
||||||
RUN bash ./install_openblas.sh && rm install_openblas.sh
|
|
||||||
|
|
||||||
FROM openssl as final
|
|
||||||
# remove unncessary python versions
|
|
||||||
RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
|
|
||||||
RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
|
|
||||||
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
|
|
||||||
RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
|
|
||||||
COPY --from=openblas /opt/OpenBLAS/ /opt/OpenBLAS/
|
|
||||||
ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
|
|
||||||
@ -1,91 +0,0 @@
|
|||||||
FROM quay.io/pypa/manylinux_2_28_aarch64 as base
|
|
||||||
|
|
||||||
# Cuda ARM build needs gcc 11
|
|
||||||
ARG DEVTOOLSET_VERSION=11
|
|
||||||
|
|
||||||
# Language variables
|
|
||||||
ENV LC_ALL=en_US.UTF-8
|
|
||||||
ENV LANG=en_US.UTF-8
|
|
||||||
ENV LANGUAGE=en_US.UTF-8
|
|
||||||
|
|
||||||
# Installed needed OS packages. This is to support all
|
|
||||||
# the binary builds (torch, vision, audio, text, data)
|
|
||||||
RUN yum -y install epel-release
|
|
||||||
RUN yum -y update
|
|
||||||
RUN yum install -y \
|
|
||||||
autoconf \
|
|
||||||
automake \
|
|
||||||
bison \
|
|
||||||
bzip2 \
|
|
||||||
curl \
|
|
||||||
diffutils \
|
|
||||||
file \
|
|
||||||
git \
|
|
||||||
make \
|
|
||||||
patch \
|
|
||||||
perl \
|
|
||||||
unzip \
|
|
||||||
util-linux \
|
|
||||||
wget \
|
|
||||||
which \
|
|
||||||
xz \
|
|
||||||
yasm \
|
|
||||||
less \
|
|
||||||
zstd \
|
|
||||||
libgomp \
|
|
||||||
sudo \
|
|
||||||
gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
|
|
||||||
|
|
||||||
# Ensure the expected devtoolset is used
|
|
||||||
ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
|
|
||||||
ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
|
|
||||||
|
|
||||||
# git236+ would refuse to run git commands in repos owned by other users
|
|
||||||
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
|
|
||||||
# Override this behaviour by treating every folder as safe
|
|
||||||
# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
|
|
||||||
RUN git config --global --add safe.directory "*"
|
|
||||||
|
|
||||||
|
|
||||||
FROM base as openssl
|
|
||||||
# Install openssl (this must precede `build python` step)
|
|
||||||
# (In order to have a proper SSL module, Python is compiled
|
|
||||||
# against a recent openssl [see env vars above], which is linked
|
|
||||||
# statically. We delete openssl afterwards.)
|
|
||||||
ADD ./common/install_openssl.sh install_openssl.sh
|
|
||||||
RUN bash ./install_openssl.sh && rm install_openssl.sh
|
|
||||||
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
|
|
||||||
|
|
||||||
FROM openssl as final
|
|
||||||
# remove unncessary python versions
|
|
||||||
RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
|
|
||||||
RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
|
|
||||||
RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
|
|
||||||
RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
|
|
||||||
|
|
||||||
FROM base as cuda
|
|
||||||
ARG BASE_CUDA_VERSION
|
|
||||||
# Install CUDA
|
|
||||||
ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh
|
|
||||||
RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh
|
|
||||||
|
|
||||||
FROM base as magma
|
|
||||||
ARG BASE_CUDA_VERSION
|
|
||||||
# Install magma
|
|
||||||
ADD ./common/install_magma.sh install_magma.sh
|
|
||||||
RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
|
|
||||||
|
|
||||||
FROM base as nvpl
|
|
||||||
# Install nvpl
|
|
||||||
ADD ./common/install_nvpl.sh install_nvpl.sh
|
|
||||||
RUN bash ./install_nvpl.sh && rm install_nvpl.sh
|
|
||||||
|
|
||||||
FROM final as cuda_final
|
|
||||||
ARG BASE_CUDA_VERSION
|
|
||||||
RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
|
|
||||||
COPY --from=cuda /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
|
|
||||||
COPY --from=magma /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda-${BASE_CUDA_VERSION}
|
|
||||||
COPY --from=nvpl /opt/nvpl/lib/ /usr/local/lib/
|
|
||||||
COPY --from=nvpl /opt/nvpl/include/ /usr/local/include/
|
|
||||||
RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
|
|
||||||
ENV PATH=/usr/local/cuda/bin:$PATH
|
|
||||||
@ -1,71 +0,0 @@
|
|||||||
FROM centos:8 as base
|
|
||||||
|
|
||||||
ENV LC_ALL en_US.UTF-8
|
|
||||||
ENV LANG en_US.UTF-8
|
|
||||||
ENV LANGUAGE en_US.UTF-8
|
|
||||||
ENV PATH /opt/rh/gcc-toolset-11/root/bin/:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
|
|
||||||
|
|
||||||
# change to a valid repo
|
|
||||||
RUN sed -i 's|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g' /etc/yum.repos.d/CentOS-Linux-*.repo
|
|
||||||
# enable to install ninja-build
|
|
||||||
RUN sed -i 's|enabled=0|enabled=1|g' /etc/yum.repos.d/CentOS-Linux-PowerTools.repo
|
|
||||||
|
|
||||||
RUN yum -y update
|
|
||||||
RUN yum install -y wget curl perl util-linux xz bzip2 git patch which zlib-devel sudo
|
|
||||||
RUN yum install -y autoconf automake make cmake gdb gcc-toolset-11-gcc-c++
|
|
||||||
|
|
||||||
|
|
||||||
FROM base as openssl
|
|
||||||
ADD ./common/install_openssl.sh install_openssl.sh
|
|
||||||
RUN bash ./install_openssl.sh && rm install_openssl.sh
|
|
||||||
|
|
||||||
# Install python
|
|
||||||
FROM base as python
|
|
||||||
RUN yum install -y openssl-devel zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel
|
|
||||||
ADD common/install_cpython.sh install_cpython.sh
|
|
||||||
RUN bash ./install_cpython.sh && rm install_cpython.sh
|
|
||||||
|
|
||||||
FROM base as conda
|
|
||||||
ADD ./common/install_conda_docker.sh install_conda.sh
|
|
||||||
RUN bash ./install_conda.sh && rm install_conda.sh
|
|
||||||
RUN /opt/conda/bin/conda install -y cmake
|
|
||||||
|
|
||||||
FROM base as intel
|
|
||||||
# Install MKL
|
|
||||||
COPY --from=python /opt/python /opt/python
|
|
||||||
COPY --from=python /opt/_internal /opt/_internal
|
|
||||||
COPY --from=conda /opt/conda /opt/conda
|
|
||||||
ENV PATH=/opt/conda/bin:$PATH
|
|
||||||
ADD ./common/install_mkl.sh install_mkl.sh
|
|
||||||
RUN bash ./install_mkl.sh && rm install_mkl.sh
|
|
||||||
|
|
||||||
FROM base as patchelf
|
|
||||||
ADD ./common/install_patchelf.sh install_patchelf.sh
|
|
||||||
RUN bash ./install_patchelf.sh && rm install_patchelf.sh
|
|
||||||
RUN cp $(which patchelf) /patchelf
|
|
||||||
|
|
||||||
FROM base as jni
|
|
||||||
ADD ./common/install_jni.sh install_jni.sh
|
|
||||||
ADD ./java/jni.h jni.h
|
|
||||||
RUN bash ./install_jni.sh && rm install_jni.sh
|
|
||||||
|
|
||||||
FROM base as libpng
|
|
||||||
ADD ./common/install_libpng.sh install_libpng.sh
|
|
||||||
RUN bash ./install_libpng.sh && rm install_libpng.sh
|
|
||||||
|
|
||||||
FROM base as final
|
|
||||||
COPY --from=openssl /opt/openssl /opt/openssl
|
|
||||||
COPY --from=python /opt/python /opt/python
|
|
||||||
COPY --from=python /opt/_internal /opt/_internal
|
|
||||||
COPY --from=intel /opt/intel /opt/intel
|
|
||||||
COPY --from=conda /opt/conda /opt/conda
|
|
||||||
COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf
|
|
||||||
COPY --from=jni /usr/local/include/jni.h /usr/local/include/jni.h
|
|
||||||
COPY --from=libpng /usr/local/bin/png* /usr/local/bin/
|
|
||||||
COPY --from=libpng /usr/local/bin/libpng* /usr/local/bin/
|
|
||||||
COPY --from=libpng /usr/local/include/png* /usr/local/include/
|
|
||||||
COPY --from=libpng /usr/local/include/libpng* /usr/local/include/
|
|
||||||
COPY --from=libpng /usr/local/lib/libpng* /usr/local/lib/
|
|
||||||
COPY --from=libpng /usr/local/lib/pkgconfig /usr/local/lib/pkgconfig
|
|
||||||
|
|
||||||
RUN yum install -y ninja-build
|
|
||||||
@ -1,73 +0,0 @@
|
|||||||
FROM --platform=linux/s390x docker.io/ubuntu:24.04 as base
|
|
||||||
|
|
||||||
# Language variables
|
|
||||||
ENV LC_ALL=C.UTF-8
|
|
||||||
ENV LANG=C.UTF-8
|
|
||||||
ENV LANGUAGE=C.UTF-8
|
|
||||||
|
|
||||||
# Installed needed OS packages. This is to support all
|
|
||||||
# the binary builds (torch, vision, audio, text, data)
|
|
||||||
RUN apt update ; apt upgrade -y
|
|
||||||
RUN apt install -y \
|
|
||||||
build-essential \
|
|
||||||
autoconf \
|
|
||||||
automake \
|
|
||||||
bzip2 \
|
|
||||||
curl \
|
|
||||||
diffutils \
|
|
||||||
file \
|
|
||||||
git \
|
|
||||||
make \
|
|
||||||
patch \
|
|
||||||
perl \
|
|
||||||
unzip \
|
|
||||||
util-linux \
|
|
||||||
wget \
|
|
||||||
which \
|
|
||||||
xz-utils \
|
|
||||||
less \
|
|
||||||
zstd \
|
|
||||||
cmake \
|
|
||||||
python3 \
|
|
||||||
python3-dev \
|
|
||||||
python3-setuptools \
|
|
||||||
python3-yaml \
|
|
||||||
python3-typing-extensions \
|
|
||||||
libblas-dev \
|
|
||||||
libopenblas-dev \
|
|
||||||
liblapack-dev \
|
|
||||||
libatlas-base-dev
|
|
||||||
|
|
||||||
# git236+ would refuse to run git commands in repos owned by other users
|
|
||||||
# Which causes version check to fail, as pytorch repo is bind-mounted into the image
|
|
||||||
# Override this behaviour by treating every folder as safe
|
|
||||||
# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
|
|
||||||
RUN git config --global --add safe.directory "*"
|
|
||||||
|
|
||||||
FROM base as openssl
|
|
||||||
# Install openssl (this must precede `build python` step)
|
|
||||||
# (In order to have a proper SSL module, Python is compiled
|
|
||||||
# against a recent openssl [see env vars above], which is linked
|
|
||||||
# statically. We delete openssl afterwards.)
|
|
||||||
ADD ./common/install_openssl.sh install_openssl.sh
|
|
||||||
RUN bash ./install_openssl.sh && rm install_openssl.sh
|
|
||||||
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
|
|
||||||
|
|
||||||
# EPEL for cmake
|
|
||||||
FROM base as patchelf
|
|
||||||
# Install patchelf
|
|
||||||
ADD ./common/install_patchelf.sh install_patchelf.sh
|
|
||||||
RUN bash ./install_patchelf.sh && rm install_patchelf.sh
|
|
||||||
RUN cp $(which patchelf) /patchelf
|
|
||||||
|
|
||||||
FROM patchelf as python
|
|
||||||
# build python
|
|
||||||
COPY manywheel/build_scripts /build_scripts
|
|
||||||
ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh
|
|
||||||
RUN bash build_scripts/build.sh && rm -r build_scripts
|
|
||||||
|
|
||||||
FROM openssl as final
|
|
||||||
COPY --from=python /opt/python /opt/python
|
|
||||||
COPY --from=python /opt/_internal /opt/_internal
|
|
||||||
COPY --from=python /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel
|
|
||||||
COPY --from=patchelf /usr/local/bin/patchelf /usr/local/bin/patchelf
|
|
||||||
@ -1,161 +0,0 @@
|
|||||||
#!/usr/bin/env bash
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
|
|
||||||
set -eou pipefail
|
|
||||||
|
|
||||||
TOPDIR=$(git rev-parse --show-toplevel)
|
|
||||||
|
|
||||||
image="$1"
|
|
||||||
shift
|
|
||||||
|
|
||||||
if [ -z "${image}" ]; then
|
|
||||||
echo "Usage: $0 IMAGE"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
DOCKER_IMAGE="pytorch/${image}"
|
|
||||||
|
|
||||||
DOCKER_REGISTRY="${DOCKER_REGISTRY:-docker.io}"
|
|
||||||
|
|
||||||
GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
|
|
||||||
GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
|
|
||||||
MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}
|
|
||||||
DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}
|
|
||||||
WITH_PUSH=${WITH_PUSH:-}
|
|
||||||
|
|
||||||
case ${GPU_ARCH_TYPE} in
|
|
||||||
cpu)
|
|
||||||
TARGET=cpu_final
|
|
||||||
DOCKER_TAG=cpu
|
|
||||||
GPU_IMAGE=centos:7
|
|
||||||
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
|
|
||||||
;;
|
|
||||||
cpu-manylinux_2_28)
|
|
||||||
TARGET=cpu_final
|
|
||||||
DOCKER_TAG=cpu
|
|
||||||
GPU_IMAGE=amd64/almalinux:8
|
|
||||||
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
|
|
||||||
MANY_LINUX_VERSION="2_28"
|
|
||||||
;;
|
|
||||||
cpu-aarch64)
|
|
||||||
TARGET=final
|
|
||||||
DOCKER_TAG=cpu-aarch64
|
|
||||||
GPU_IMAGE=arm64v8/centos:7
|
|
||||||
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10"
|
|
||||||
MANY_LINUX_VERSION="aarch64"
|
|
||||||
;;
|
|
||||||
cpu-aarch64-2_28)
|
|
||||||
TARGET=final
|
|
||||||
DOCKER_TAG=cpu-aarch64
|
|
||||||
GPU_IMAGE=arm64v8/almalinux:8
|
|
||||||
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
|
|
||||||
MANY_LINUX_VERSION="2_28_aarch64"
|
|
||||||
;;
|
|
||||||
cpu-cxx11-abi)
|
|
||||||
TARGET=final
|
|
||||||
DOCKER_TAG=cpu-cxx11-abi
|
|
||||||
GPU_IMAGE=""
|
|
||||||
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
|
|
||||||
MANY_LINUX_VERSION="cxx11-abi"
|
|
||||||
;;
|
|
||||||
cpu-s390x)
|
|
||||||
TARGET=final
|
|
||||||
DOCKER_TAG=cpu-s390x
|
|
||||||
GPU_IMAGE=redhat/ubi9
|
|
||||||
DOCKER_GPU_BUILD_ARG=""
|
|
||||||
MANY_LINUX_VERSION="s390x"
|
|
||||||
;;
|
|
||||||
cuda)
|
|
||||||
TARGET=cuda_final
|
|
||||||
DOCKER_TAG=cuda${GPU_ARCH_VERSION}
|
|
||||||
# Keep this up to date with the minimum version of CUDA we currently support
|
|
||||||
GPU_IMAGE=centos:7
|
|
||||||
DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=9"
|
|
||||||
;;
|
|
||||||
cuda-manylinux_2_28)
|
|
||||||
TARGET=cuda_final
|
|
||||||
DOCKER_TAG=cuda${GPU_ARCH_VERSION}
|
|
||||||
GPU_IMAGE=amd64/almalinux:8
|
|
||||||
DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
|
|
||||||
MANY_LINUX_VERSION="2_28"
|
|
||||||
;;
|
|
||||||
cuda-aarch64)
|
|
||||||
TARGET=cuda_final
|
|
||||||
DOCKER_TAG=cuda${GPU_ARCH_VERSION}
|
|
||||||
GPU_IMAGE=arm64v8/centos:7
|
|
||||||
DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
|
|
||||||
MANY_LINUX_VERSION="aarch64"
|
|
||||||
DOCKERFILE_SUFFIX="_cuda_aarch64"
|
|
||||||
;;
|
|
||||||
rocm)
|
|
||||||
TARGET=rocm_final
|
|
||||||
DOCKER_TAG=rocm${GPU_ARCH_VERSION}
|
|
||||||
GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete
|
|
||||||
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100"
|
|
||||||
ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)"
|
|
||||||
if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then
|
|
||||||
ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0}))
|
|
||||||
else
|
|
||||||
echo "ERROR: rocm regex failed"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
if [[ $ROCM_VERSION_INT -ge 60000 ]]; then
|
|
||||||
PYTORCH_ROCM_ARCH+=";gfx942"
|
|
||||||
fi
|
|
||||||
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=9"
|
|
||||||
;;
|
|
||||||
xpu)
|
|
||||||
TARGET=xpu_final
|
|
||||||
DOCKER_TAG=xpu
|
|
||||||
GPU_IMAGE=amd64/almalinux:8
|
|
||||||
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
|
|
||||||
MANY_LINUX_VERSION="2_28"
|
|
||||||
;;
|
|
||||||
*)
|
|
||||||
echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
|
|
||||||
exit 1
|
|
||||||
;;
|
|
||||||
esac
|
|
||||||
|
|
||||||
IMAGES=''
|
|
||||||
|
|
||||||
if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then
|
|
||||||
DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION}
|
|
||||||
fi
|
|
||||||
(
|
|
||||||
set -x
|
|
||||||
|
|
||||||
# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
|
|
||||||
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
|
|
||||||
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
|
|
||||||
sudo systemctl daemon-reload
|
|
||||||
sudo systemctl restart docker
|
|
||||||
|
|
||||||
DOCKER_BUILDKIT=1 docker build \
|
|
||||||
${DOCKER_GPU_BUILD_ARG} \
|
|
||||||
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \
|
|
||||||
--target "${TARGET}" \
|
|
||||||
-t "${DOCKER_IMAGE}" \
|
|
||||||
$@ \
|
|
||||||
-f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
|
|
||||||
"${TOPDIR}/.ci/docker/"
|
|
||||||
)
|
|
||||||
|
|
||||||
GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
|
|
||||||
GIT_BRANCH_NAME=${GITHUB_REF##*/}
|
|
||||||
GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
|
|
||||||
DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
|
|
||||||
DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
|
|
||||||
|
|
||||||
if [[ "${WITH_PUSH}" == true ]]; then
|
|
||||||
(
|
|
||||||
set -x
|
|
||||||
docker push "${DOCKER_IMAGE}"
|
|
||||||
if [[ -n ${GITHUB_REF} ]]; then
|
|
||||||
docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
|
|
||||||
docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
|
|
||||||
docker push "${DOCKER_IMAGE_BRANCH_TAG}"
|
|
||||||
docker push "${DOCKER_IMAGE_SHA_TAG}"
|
|
||||||
fi
|
|
||||||
)
|
|
||||||
fi
|
|
||||||
@ -1,131 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Top-level build script called from Dockerfile
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
|
|
||||||
# Stop at any error, show all commands
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# openssl version to build, with expected sha256 hash of .tar.gz
|
|
||||||
# archive
|
|
||||||
OPENSSL_ROOT=openssl-1.1.1l
|
|
||||||
OPENSSL_HASH=0b7a3e5e59c34827fe0c3a74b7ec8baef302b98fa80088d7f9153aa16fa76bd1
|
|
||||||
DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc
|
|
||||||
PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb
|
|
||||||
CURL_ROOT=curl-7.73.0
|
|
||||||
CURL_HASH=cf34fe0b07b800f1c01a499a6e8b2af548f6d0e044dca4a29d88a4bee146d131
|
|
||||||
AUTOCONF_ROOT=autoconf-2.69
|
|
||||||
AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969
|
|
||||||
|
|
||||||
# Get build utilities
|
|
||||||
MY_DIR=$(dirname "${BASH_SOURCE[0]}")
|
|
||||||
source $MY_DIR/build_utils.sh
|
|
||||||
|
|
||||||
if [ "$(uname -m)" != "s390x" ] ; then
|
|
||||||
# Dependencies for compiling Python that we want to remove from
|
|
||||||
# the final image after compiling Python
|
|
||||||
PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel"
|
|
||||||
|
|
||||||
# Libraries that are allowed as part of the manylinux1 profile
|
|
||||||
MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel mesa-libGL-devel libICE-devel libSM-devel ncurses-devel"
|
|
||||||
|
|
||||||
# Development tools and libraries
|
|
||||||
yum -y install bzip2 make git patch unzip bison yasm diffutils \
|
|
||||||
automake which file cmake28 \
|
|
||||||
kernel-devel-`uname -r` \
|
|
||||||
${PYTHON_COMPILE_DEPS}
|
|
||||||
else
|
|
||||||
# Dependencies for compiling Python that we want to remove from
|
|
||||||
# the final image after compiling Python
|
|
||||||
PYTHON_COMPILE_DEPS="zlib1g-dev libbz2-dev libncurses-dev libsqlite3-dev libdb-dev libpcap-dev liblzma-dev libffi-dev"
|
|
||||||
|
|
||||||
# Libraries that are allowed as part of the manylinux1 profile
|
|
||||||
MANYLINUX1_DEPS="libglib2.0-dev libX11-dev libncurses-dev"
|
|
||||||
|
|
||||||
# Development tools and libraries
|
|
||||||
apt install -y bzip2 make git patch unzip diffutils \
|
|
||||||
automake which file cmake \
|
|
||||||
linux-headers-virtual \
|
|
||||||
${PYTHON_COMPILE_DEPS}
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Install newest autoconf
|
|
||||||
build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH
|
|
||||||
autoconf --version
|
|
||||||
|
|
||||||
# Compile the latest Python releases.
|
|
||||||
# (In order to have a proper SSL module, Python is compiled
|
|
||||||
# against a recent openssl [see env vars above], which is linked
|
|
||||||
# statically. We delete openssl afterwards.)
|
|
||||||
build_openssl $OPENSSL_ROOT $OPENSSL_HASH
|
|
||||||
/build_scripts/install_cpython.sh
|
|
||||||
|
|
||||||
PY39_BIN=/opt/python/cp39-cp39/bin
|
|
||||||
|
|
||||||
# Our openssl doesn't know how to find the system CA trust store
|
|
||||||
# (https://github.com/pypa/manylinux/issues/53)
|
|
||||||
# And it's not clear how up-to-date that is anyway
|
|
||||||
# So let's just use the same one pip and everyone uses
|
|
||||||
$PY39_BIN/pip install certifi
|
|
||||||
ln -s $($PY39_BIN/python -c 'import certifi; print(certifi.where())') \
|
|
||||||
/opt/_internal/certs.pem
|
|
||||||
# If you modify this line you also have to modify the versions in the
|
|
||||||
# Dockerfiles:
|
|
||||||
export SSL_CERT_FILE=/opt/_internal/certs.pem
|
|
||||||
|
|
||||||
# Install newest curl
|
|
||||||
build_curl $CURL_ROOT $CURL_HASH
|
|
||||||
rm -rf /usr/local/include/curl /usr/local/lib/libcurl* /usr/local/lib/pkgconfig/libcurl.pc
|
|
||||||
hash -r
|
|
||||||
curl --version
|
|
||||||
curl-config --features
|
|
||||||
|
|
||||||
# Install patchelf (latest with unreleased bug fixes)
|
|
||||||
curl -sLOk https://nixos.org/releases/patchelf/patchelf-0.10/patchelf-0.10.tar.gz
|
|
||||||
# check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
|
|
||||||
tar -xzf patchelf-0.10.tar.gz
|
|
||||||
(cd patchelf-0.10 && ./configure && make && make install)
|
|
||||||
rm -rf patchelf-0.10.tar.gz patchelf-0.10
|
|
||||||
|
|
||||||
# Install latest pypi release of auditwheel
|
|
||||||
$PY39_BIN/pip install auditwheel
|
|
||||||
ln -s $PY39_BIN/auditwheel /usr/local/bin/auditwheel
|
|
||||||
|
|
||||||
# Clean up development headers and other unnecessary stuff for
|
|
||||||
# final image
|
|
||||||
if [ "$(uname -m)" != "s390x" ] ; then
|
|
||||||
yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
|
|
||||||
avahi freetype bitstream-vera-fonts \
|
|
||||||
${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1
|
|
||||||
yum -y install ${MANYLINUX1_DEPS}
|
|
||||||
yum -y clean all > /dev/null 2>&1
|
|
||||||
yum list installed
|
|
||||||
else
|
|
||||||
apt purge -y ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1
|
|
||||||
fi
|
|
||||||
# we don't need libpython*.a, and they're many megabytes
|
|
||||||
find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f
|
|
||||||
# Strip what we can -- and ignore errors, because this just attempts to strip
|
|
||||||
# *everything*, including non-ELF files:
|
|
||||||
find /opt/_internal -type f -print0 \
|
|
||||||
| xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true
|
|
||||||
# We do not need the Python test suites, or indeed the precompiled .pyc and
|
|
||||||
# .pyo files. Partially cribbed from:
|
|
||||||
# https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile
|
|
||||||
find /opt/_internal \
|
|
||||||
\( -type d -a -name test -o -name tests \) \
|
|
||||||
-o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \
|
|
||||||
-print0 | xargs -0 rm -f
|
|
||||||
|
|
||||||
for PYTHON in /opt/python/*/bin/python; do
|
|
||||||
# Smoke test to make sure that our Pythons work, and do indeed detect as
|
|
||||||
# being manylinux compatible:
|
|
||||||
$PYTHON $MY_DIR/manylinux1-check.py
|
|
||||||
# Make sure that SSL cert checking works
|
|
||||||
$PYTHON $MY_DIR/ssl-check.py
|
|
||||||
done
|
|
||||||
|
|
||||||
# Fix libc headers to remain compatible with C99 compilers.
|
|
||||||
find /usr/include/ -type f -exec sed -i 's/\bextern _*inline_*\b/extern __inline __attribute__ ((__gnu_inline__))/g' {} +
|
|
||||||
|
|
||||||
# Now we can delete our built SSL
|
|
||||||
rm -rf /usr/local/ssl
|
|
||||||
@ -1,91 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
# Helper utilities for build
|
|
||||||
# Script used only in CD pipeline
|
|
||||||
|
|
||||||
OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/
|
|
||||||
CURL_DOWNLOAD_URL=https://curl.askapache.com/download
|
|
||||||
|
|
||||||
AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf
|
|
||||||
|
|
||||||
|
|
||||||
function check_var {
|
|
||||||
if [ -z "$1" ]; then
|
|
||||||
echo "required variable not defined"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function do_openssl_build {
|
|
||||||
./config no-ssl2 no-shared -fPIC --prefix=/usr/local/ssl > /dev/null
|
|
||||||
make > /dev/null
|
|
||||||
make install > /dev/null
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function check_sha256sum {
|
|
||||||
local fname=$1
|
|
||||||
check_var ${fname}
|
|
||||||
local sha256=$2
|
|
||||||
check_var ${sha256}
|
|
||||||
|
|
||||||
echo "${sha256} ${fname}" > ${fname}.sha256
|
|
||||||
sha256sum -c ${fname}.sha256
|
|
||||||
rm -f ${fname}.sha256
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function build_openssl {
|
|
||||||
local openssl_fname=$1
|
|
||||||
check_var ${openssl_fname}
|
|
||||||
local openssl_sha256=$2
|
|
||||||
check_var ${openssl_sha256}
|
|
||||||
check_var ${OPENSSL_DOWNLOAD_URL}
|
|
||||||
curl -sLO ${OPENSSL_DOWNLOAD_URL}/${openssl_fname}.tar.gz
|
|
||||||
check_sha256sum ${openssl_fname}.tar.gz ${openssl_sha256}
|
|
||||||
tar -xzf ${openssl_fname}.tar.gz
|
|
||||||
(cd ${openssl_fname} && do_openssl_build)
|
|
||||||
rm -rf ${openssl_fname} ${openssl_fname}.tar.gz
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function do_curl_build {
|
|
||||||
LIBS=-ldl ./configure --with-ssl --disable-shared > /dev/null
|
|
||||||
make > /dev/null
|
|
||||||
make install > /dev/null
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function build_curl {
|
|
||||||
local curl_fname=$1
|
|
||||||
check_var ${curl_fname}
|
|
||||||
local curl_sha256=$2
|
|
||||||
check_var ${curl_sha256}
|
|
||||||
check_var ${CURL_DOWNLOAD_URL}
|
|
||||||
curl -sLO ${CURL_DOWNLOAD_URL}/${curl_fname}.tar.bz2
|
|
||||||
check_sha256sum ${curl_fname}.tar.bz2 ${curl_sha256}
|
|
||||||
tar -jxf ${curl_fname}.tar.bz2
|
|
||||||
(cd ${curl_fname} && do_curl_build)
|
|
||||||
rm -rf ${curl_fname} ${curl_fname}.tar.bz2
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function do_standard_install {
|
|
||||||
./configure > /dev/null
|
|
||||||
make > /dev/null
|
|
||||||
make install > /dev/null
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function build_autoconf {
|
|
||||||
local autoconf_fname=$1
|
|
||||||
check_var ${autoconf_fname}
|
|
||||||
local autoconf_sha256=$2
|
|
||||||
check_var ${autoconf_sha256}
|
|
||||||
check_var ${AUTOCONF_DOWNLOAD_URL}
|
|
||||||
curl -sLO ${AUTOCONF_DOWNLOAD_URL}/${autoconf_fname}.tar.gz
|
|
||||||
check_sha256sum ${autoconf_fname}.tar.gz ${autoconf_sha256}
|
|
||||||
tar -zxf ${autoconf_fname}.tar.gz
|
|
||||||
(cd ${autoconf_fname} && do_standard_install)
|
|
||||||
rm -rf ${autoconf_fname} ${autoconf_fname}.tar.gz
|
|
||||||
}
|
|
||||||
@ -1,60 +0,0 @@
|
|||||||
# Logic copied from PEP 513
|
|
||||||
|
|
||||||
|
|
||||||
def is_manylinux1_compatible():
|
|
||||||
# Only Linux, and only x86-64 / i686
|
|
||||||
from distutils.util import get_platform
|
|
||||||
|
|
||||||
if get_platform() not in ["linux-x86_64", "linux-i686", "linux-s390x"]:
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Check for presence of _manylinux module
|
|
||||||
try:
|
|
||||||
import _manylinux
|
|
||||||
|
|
||||||
return bool(_manylinux.manylinux1_compatible)
|
|
||||||
except (ImportError, AttributeError):
|
|
||||||
# Fall through to heuristic check below
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Check glibc version. CentOS 5 uses glibc 2.5.
|
|
||||||
return have_compatible_glibc(2, 5)
|
|
||||||
|
|
||||||
|
|
||||||
def have_compatible_glibc(major, minimum_minor):
|
|
||||||
import ctypes
|
|
||||||
|
|
||||||
process_namespace = ctypes.CDLL(None)
|
|
||||||
try:
|
|
||||||
gnu_get_libc_version = process_namespace.gnu_get_libc_version
|
|
||||||
except AttributeError:
|
|
||||||
# Symbol doesn't exist -> therefore, we are not linked to
|
|
||||||
# glibc.
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Call gnu_get_libc_version, which returns a string like "2.5".
|
|
||||||
gnu_get_libc_version.restype = ctypes.c_char_p
|
|
||||||
version_str = gnu_get_libc_version()
|
|
||||||
# py2 / py3 compatibility:
|
|
||||||
if not isinstance(version_str, str):
|
|
||||||
version_str = version_str.decode("ascii")
|
|
||||||
|
|
||||||
# Parse string and check against requested version.
|
|
||||||
version = [int(piece) for piece in version_str.split(".")]
|
|
||||||
assert len(version) == 2
|
|
||||||
if major != version[0]:
|
|
||||||
return False
|
|
||||||
if minimum_minor > version[1]:
|
|
||||||
return False
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
import sys
|
|
||||||
|
|
||||||
|
|
||||||
if is_manylinux1_compatible():
|
|
||||||
print(f"{sys.executable} is manylinux1 compatible")
|
|
||||||
sys.exit(0)
|
|
||||||
else:
|
|
||||||
print(f"{sys.executable} is NOT manylinux1 compatible")
|
|
||||||
sys.exit(1)
|
|
||||||
@ -1,35 +0,0 @@
|
|||||||
# cf. https://github.com/pypa/manylinux/issues/53
|
|
||||||
|
|
||||||
GOOD_SSL = "https://google.com"
|
|
||||||
BAD_SSL = "https://self-signed.badssl.com"
|
|
||||||
|
|
||||||
import sys
|
|
||||||
|
|
||||||
|
|
||||||
print("Testing SSL certificate checking for Python:", sys.version)
|
|
||||||
|
|
||||||
if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4):
|
|
||||||
print("This version never checks SSL certs; skipping tests")
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
if sys.version_info[0] >= 3:
|
|
||||||
from urllib.request import urlopen
|
|
||||||
|
|
||||||
EXC = OSError
|
|
||||||
else:
|
|
||||||
from urllib import urlopen
|
|
||||||
|
|
||||||
EXC = IOError
|
|
||||||
|
|
||||||
print(f"Connecting to {GOOD_SSL} should work")
|
|
||||||
urlopen(GOOD_SSL)
|
|
||||||
print("...it did, yay.")
|
|
||||||
|
|
||||||
print(f"Connecting to {BAD_SSL} should fail")
|
|
||||||
try:
|
|
||||||
urlopen(BAD_SSL)
|
|
||||||
# If we get here then we failed:
|
|
||||||
print("...it DIDN'T!!!!!11!!1one!")
|
|
||||||
sys.exit(1)
|
|
||||||
except EXC:
|
|
||||||
print("...it did, yay.")
|
|
||||||
@ -30,14 +30,9 @@ dill==0.3.7
|
|||||||
#Pinned versions: 0.3.7
|
#Pinned versions: 0.3.7
|
||||||
#test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py
|
#test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py
|
||||||
|
|
||||||
expecttest==0.2.1
|
expecttest==0.1.6
|
||||||
#Description: method for writing tests where test framework auto populates
|
#Description: method for writing tests where test framework auto populates
|
||||||
# the expected output based on previous runs
|
# the expected output based on previous runs
|
||||||
#Pinned versions: 0.2.1
|
|
||||||
#test that import:
|
|
||||||
|
|
||||||
fbscribelogger==0.1.6
|
|
||||||
#Description: write to scribe from authenticated jobs on CI
|
|
||||||
#Pinned versions: 0.1.6
|
#Pinned versions: 0.1.6
|
||||||
#test that import:
|
#test that import:
|
||||||
|
|
||||||
@ -90,10 +85,10 @@ librosa>=0.6.2 ; python_version < "3.11"
|
|||||||
#Pinned versions:
|
#Pinned versions:
|
||||||
#test that import:
|
#test that import:
|
||||||
|
|
||||||
mypy==1.11.2
|
mypy==1.9.0
|
||||||
# Pin MyPy version because new errors are likely to appear with each release
|
# Pin MyPy version because new errors are likely to appear with each release
|
||||||
#Description: linter
|
#Description: linter
|
||||||
#Pinned versions: 1.10.0
|
#Pinned versions: 1.9.0
|
||||||
#test that import: test_typing.py, test_type_hints.py
|
#test that import: test_typing.py, test_type_hints.py
|
||||||
|
|
||||||
networkx==2.8.8
|
networkx==2.8.8
|
||||||
@ -109,7 +104,7 @@ networkx==2.8.8
|
|||||||
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
|
#test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
|
||||||
|
|
||||||
numba==0.49.0 ; python_version < "3.9"
|
numba==0.49.0 ; python_version < "3.9"
|
||||||
numba==0.55.2 ; python_version == "3.9"
|
numba==0.54.1 ; python_version == "3.9"
|
||||||
numba==0.55.2 ; python_version == "3.10"
|
numba==0.55.2 ; python_version == "3.10"
|
||||||
#Description: Just-In-Time Compiler for Numerical Functions
|
#Description: Just-In-Time Compiler for Numerical Functions
|
||||||
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
|
#Pinned versions: 0.54.1, 0.49.0, <=0.49.1
|
||||||
@ -139,9 +134,9 @@ opt-einsum==3.3
|
|||||||
#Pinned versions: 3.3
|
#Pinned versions: 3.3
|
||||||
#test that import: test_linalg.py
|
#test that import: test_linalg.py
|
||||||
|
|
||||||
optree==0.12.1
|
optree==0.11.0
|
||||||
#Description: A library for tree manipulation
|
#Description: A library for tree manipulation
|
||||||
#Pinned versions: 0.12.1
|
#Pinned versions: 0.11.0
|
||||||
#test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
|
#test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
|
||||||
#test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py,
|
#test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py,
|
||||||
#common_utils.py, test_eager_transforms.py, test_python_dispatch.py,
|
#common_utils.py, test_eager_transforms.py, test_python_dispatch.py,
|
||||||
@ -223,7 +218,7 @@ pygments==2.15.0
|
|||||||
#test that import:
|
#test that import:
|
||||||
|
|
||||||
scikit-image==0.19.3 ; python_version < "3.10"
|
scikit-image==0.19.3 ; python_version < "3.10"
|
||||||
scikit-image==0.22.0 ; python_version >= "3.10"
|
scikit-image==0.20.0 ; python_version >= "3.10"
|
||||||
#Description: image processing routines
|
#Description: image processing routines
|
||||||
#Pinned versions:
|
#Pinned versions:
|
||||||
#test that import: test_nn.py
|
#test that import: test_nn.py
|
||||||
@ -274,10 +269,6 @@ lintrunner==0.12.5
|
|||||||
#Pinned versions: 0.12.5
|
#Pinned versions: 0.12.5
|
||||||
#test that import:
|
#test that import:
|
||||||
|
|
||||||
redis>=4.0.0
|
|
||||||
#Description: redis database
|
|
||||||
#test that import: anything that tests OSS caching/mocking (inductor/test_codecache.py, inductor/test_max_autotune.py)
|
|
||||||
|
|
||||||
rockset==1.0.3
|
rockset==1.0.3
|
||||||
#Description: queries Rockset
|
#Description: queries Rockset
|
||||||
#Pinned versions: 1.0.3
|
#Pinned versions: 1.0.3
|
||||||
@ -315,30 +306,9 @@ pywavelets==1.5.0 ; python_version >= "3.12"
|
|||||||
#Pinned versions: 1.4.1
|
#Pinned versions: 1.4.1
|
||||||
#test that import:
|
#test that import:
|
||||||
|
|
||||||
lxml==5.0.0
|
lxml==5.0.0.
|
||||||
#Description: This is a requirement of unittest-xml-reporting
|
#Description: This is a requirement of unittest-xml-reporting
|
||||||
|
|
||||||
# Python-3.9 binaries
|
# Python-3.9 binaries
|
||||||
|
|
||||||
PyGithub==2.3.0
|
PyGithub==2.3.0
|
||||||
|
|
||||||
sympy==1.12.1 ; python_version == "3.8"
|
|
||||||
sympy==1.13.1 ; python_version >= "3.9"
|
|
||||||
#Description: Required by coremltools, also pinned in .github/requirements/pip-requirements-macOS.txt
|
|
||||||
#Pinned versions:
|
|
||||||
#test that import:
|
|
||||||
|
|
||||||
onnx==1.16.1
|
|
||||||
#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
|
|
||||||
#Pinned versions:
|
|
||||||
#test that import:
|
|
||||||
|
|
||||||
onnxscript==0.1.0.dev20240817
|
|
||||||
#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
|
|
||||||
#Pinned versions:
|
|
||||||
#test that import:
|
|
||||||
|
|
||||||
parameterized==0.8.1
|
|
||||||
#Description: Parameterizes unittests, both the tests themselves and the entire testing class
|
|
||||||
#Pinned versions:
|
|
||||||
#test that import:
|
|
||||||
|
|||||||
@ -1 +1 @@
|
|||||||
3.1.0
|
3.0.0
|
||||||
|
|||||||
@ -103,14 +103,6 @@ COPY triton_version.txt triton_version.txt
|
|||||||
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
|
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
|
||||||
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
|
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
|
||||||
|
|
||||||
ARG HALIDE
|
|
||||||
# Build and install halide
|
|
||||||
COPY ./common/install_halide.sh install_halide.sh
|
|
||||||
COPY ./common/common_utils.sh common_utils.sh
|
|
||||||
COPY ci_commit_pins/halide.txt halide.txt
|
|
||||||
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
|
|
||||||
RUN rm install_halide.sh common_utils.sh halide.txt
|
|
||||||
|
|
||||||
# Install ccache/sccache (do this last, so we get priority in PATH)
|
# Install ccache/sccache (do this last, so we get priority in PATH)
|
||||||
COPY ./common/install_cache.sh install_cache.sh
|
COPY ./common/install_cache.sh install_cache.sh
|
||||||
ENV PATH /opt/cache/bin:$PATH
|
ENV PATH /opt/cache/bin:$PATH
|
||||||
@ -147,7 +139,7 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
|
|||||||
ARG CUDNN_VERSION
|
ARG CUDNN_VERSION
|
||||||
ARG CUDA_VERSION
|
ARG CUDA_VERSION
|
||||||
COPY ./common/install_cudnn.sh install_cudnn.sh
|
COPY ./common/install_cudnn.sh install_cudnn.sh
|
||||||
RUN if [ -n "${CUDNN_VERSION}" ]; then bash install_cudnn.sh; fi
|
RUN if [ "${CUDNN_VERSION}" -eq 8 ]; then bash install_cudnn.sh; fi
|
||||||
RUN rm install_cudnn.sh
|
RUN rm install_cudnn.sh
|
||||||
|
|
||||||
# Install CUSPARSELT
|
# Install CUSPARSELT
|
||||||
@ -156,17 +148,10 @@ COPY ./common/install_cusparselt.sh install_cusparselt.sh
|
|||||||
RUN bash install_cusparselt.sh
|
RUN bash install_cusparselt.sh
|
||||||
RUN rm install_cusparselt.sh
|
RUN rm install_cusparselt.sh
|
||||||
|
|
||||||
# Install CUDSS
|
|
||||||
ARG CUDA_VERSION
|
|
||||||
COPY ./common/install_cudss.sh install_cudss.sh
|
|
||||||
RUN bash install_cudss.sh
|
|
||||||
RUN rm install_cudss.sh
|
|
||||||
|
|
||||||
# Delete /usr/local/cuda-11.X/cuda-11.X symlinks
|
# Delete /usr/local/cuda-11.X/cuda-11.X symlinks
|
||||||
RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi
|
RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi
|
||||||
RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi
|
RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi
|
||||||
RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi
|
RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi
|
||||||
RUN if [ -h /usr/local/cuda-12.4/cuda-12.4 ]; then rm /usr/local/cuda-12.4/cuda-12.4; fi
|
|
||||||
|
|
||||||
USER jenkins
|
USER jenkins
|
||||||
CMD ["bash"]
|
CMD ["bash"]
|
||||||
|
|||||||
@ -68,8 +68,6 @@ RUN rm install_rocm.sh
|
|||||||
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
|
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
|
||||||
RUN bash ./install_rocm_magma.sh
|
RUN bash ./install_rocm_magma.sh
|
||||||
RUN rm install_rocm_magma.sh
|
RUN rm install_rocm_magma.sh
|
||||||
ADD ./common/install_miopen.sh install_miopen.sh
|
|
||||||
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
|
|
||||||
ENV ROCM_PATH /opt/rocm
|
ENV ROCM_PATH /opt/rocm
|
||||||
ENV PATH /opt/rocm/bin:$PATH
|
ENV PATH /opt/rocm/bin:$PATH
|
||||||
ENV PATH /opt/rocm/hcc/bin:$PATH
|
ENV PATH /opt/rocm/hcc/bin:$PATH
|
||||||
@ -80,11 +78,6 @@ ENV MAGMA_HOME /opt/rocm/magma
|
|||||||
ENV LANG C.UTF-8
|
ENV LANG C.UTF-8
|
||||||
ENV LC_ALL C.UTF-8
|
ENV LC_ALL C.UTF-8
|
||||||
|
|
||||||
# Install amdsmi
|
|
||||||
COPY ./common/install_amdsmi.sh install_amdsmi.sh
|
|
||||||
RUN bash ./install_amdsmi.sh
|
|
||||||
RUN rm install_amdsmi.sh
|
|
||||||
|
|
||||||
# (optional) Install non-default CMake version
|
# (optional) Install non-default CMake version
|
||||||
ARG CMAKE_VERSION
|
ARG CMAKE_VERSION
|
||||||
COPY ./common/install_cmake.sh install_cmake.sh
|
COPY ./common/install_cmake.sh install_cmake.sh
|
||||||
@ -102,17 +95,10 @@ ARG TRITON
|
|||||||
# try to reach out to S3, which docker build runners don't have access
|
# try to reach out to S3, which docker build runners don't have access
|
||||||
COPY ./common/install_triton.sh install_triton.sh
|
COPY ./common/install_triton.sh install_triton.sh
|
||||||
COPY ./common/common_utils.sh common_utils.sh
|
COPY ./common/common_utils.sh common_utils.sh
|
||||||
COPY ci_commit_pins/triton.txt triton.txt
|
COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
|
||||||
COPY triton_version.txt triton_version.txt
|
COPY triton_version.txt triton_version.txt
|
||||||
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
|
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
|
||||||
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
|
RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
|
||||||
|
|
||||||
# Install AOTriton
|
|
||||||
COPY ./aotriton_version.txt aotriton_version.txt
|
|
||||||
COPY ./common/common_utils.sh common_utils.sh
|
|
||||||
COPY ./common/install_aotriton.sh install_aotriton.sh
|
|
||||||
RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
|
|
||||||
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
|
|
||||||
|
|
||||||
# Install ccache/sccache (do this last, so we get priority in PATH)
|
# Install ccache/sccache (do this last, so we get priority in PATH)
|
||||||
COPY ./common/install_cache.sh install_cache.sh
|
COPY ./common/install_cache.sh install_cache.sh
|
||||||
@ -123,8 +109,5 @@ RUN bash ./install_cache.sh && rm install_cache.sh
|
|||||||
ARG BUILD_ENVIRONMENT
|
ARG BUILD_ENVIRONMENT
|
||||||
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
|
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
|
||||||
|
|
||||||
# Install LLVM dev version (Defined in the pytorch/builder github repository)
|
|
||||||
COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
|
|
||||||
|
|
||||||
USER jenkins
|
USER jenkins
|
||||||
CMD ["bash"]
|
CMD ["bash"]
|
||||||
|
|||||||
@ -30,7 +30,6 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
|
|||||||
ARG ANACONDA_PYTHON_VERSION
|
ARG ANACONDA_PYTHON_VERSION
|
||||||
ARG CONDA_CMAKE
|
ARG CONDA_CMAKE
|
||||||
ARG DOCS
|
ARG DOCS
|
||||||
ARG BUILD_ENVIRONMENT
|
|
||||||
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
|
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
|
||||||
ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
|
ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
|
||||||
ENV DOCS=$DOCS
|
ENV DOCS=$DOCS
|
||||||
@ -63,7 +62,7 @@ RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_d
|
|||||||
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
|
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
|
||||||
|
|
||||||
# Install XPU Dependencies
|
# Install XPU Dependencies
|
||||||
ARG XPU_VERSION
|
ARG BASEKIT_VERSION
|
||||||
COPY ./common/install_xpu.sh install_xpu.sh
|
COPY ./common/install_xpu.sh install_xpu.sh
|
||||||
RUN bash ./install_xpu.sh && rm install_xpu.sh
|
RUN bash ./install_xpu.sh && rm install_xpu.sh
|
||||||
|
|
||||||
|
|||||||
@ -50,7 +50,7 @@ RUN bash ./install_lcov.sh && rm install_lcov.sh
|
|||||||
|
|
||||||
# Install cuda and cudnn
|
# Install cuda and cudnn
|
||||||
ARG CUDA_VERSION
|
ARG CUDA_VERSION
|
||||||
COPY ./common/install_cuda.sh install_cuda.sh
|
RUN wget -q https://raw.githubusercontent.com/pytorch/builder/main/common/install_cuda.sh -O install_cuda.sh
|
||||||
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
|
RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
|
||||||
ENV DESIRED_CUDA ${CUDA_VERSION}
|
ENV DESIRED_CUDA ${CUDA_VERSION}
|
||||||
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
|
ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
|
||||||
@ -155,14 +155,6 @@ COPY ci_commit_pins/executorch.txt executorch.txt
|
|||||||
RUN if [ -n "${EXECUTORCH}" ]; then bash ./install_executorch.sh; fi
|
RUN if [ -n "${EXECUTORCH}" ]; then bash ./install_executorch.sh; fi
|
||||||
RUN rm install_executorch.sh common_utils.sh executorch.txt
|
RUN rm install_executorch.sh common_utils.sh executorch.txt
|
||||||
|
|
||||||
ARG HALIDE
|
|
||||||
# Build and install halide
|
|
||||||
COPY ./common/install_halide.sh install_halide.sh
|
|
||||||
COPY ./common/common_utils.sh common_utils.sh
|
|
||||||
COPY ci_commit_pins/halide.txt halide.txt
|
|
||||||
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
|
|
||||||
RUN rm install_halide.sh common_utils.sh halide.txt
|
|
||||||
|
|
||||||
ARG ONNX
|
ARG ONNX
|
||||||
# Install ONNX dependencies
|
# Install ONNX dependencies
|
||||||
COPY ./common/install_onnx.sh ./common/common_utils.sh ./
|
COPY ./common/install_onnx.sh ./common/common_utils.sh ./
|
||||||
|
|||||||
@ -1 +1,42 @@
|
|||||||
This directory contains scripts for our continuous integration.
|
This directory contains scripts for our continuous integration.
|
||||||
|
|
||||||
|
One important thing to keep in mind when reading the scripts here is
|
||||||
|
that they are all based off of Docker images, which we build for each of
|
||||||
|
the various system configurations we want to run on Jenkins. This means
|
||||||
|
it is very easy to run these tests yourself:
|
||||||
|
|
||||||
|
1. Figure out what Docker image you want. The general template for our
|
||||||
|
images look like:
|
||||||
|
``registry.pytorch.org/pytorch/pytorch-$BUILD_ENVIRONMENT:$DOCKER_VERSION``,
|
||||||
|
where ``$BUILD_ENVIRONMENT`` is one of the build environments
|
||||||
|
enumerated in
|
||||||
|
[pytorch-dockerfiles](https://github.com/pytorch/pytorch/blob/master/.ci/docker/build.sh). The dockerfile used by jenkins can be found under the `.ci` [directory](https://github.com/pytorch/pytorch/blob/master/.ci/docker)
|
||||||
|
|
||||||
|
2. Run ``docker run -it -u jenkins $DOCKER_IMAGE``, clone PyTorch and
|
||||||
|
run one of the scripts in this directory.
|
||||||
|
|
||||||
|
The Docker images are designed so that any "reasonable" build commands
|
||||||
|
will work; if you look in [build.sh](build.sh) you will see that it is a
|
||||||
|
very simple script. This is intentional. Idiomatic build instructions
|
||||||
|
should work inside all of our Docker images. You can tweak the commands
|
||||||
|
however you need (e.g., in case you want to rebuild with DEBUG, or rerun
|
||||||
|
the build with higher verbosity, etc.).
|
||||||
|
|
||||||
|
We have to do some work to make this so. Here is a summary of the
|
||||||
|
mechanisms we use:
|
||||||
|
|
||||||
|
- We install binaries to directories like `/usr/local/bin` which
|
||||||
|
are automatically part of your PATH.
|
||||||
|
|
||||||
|
- We add entries to the PATH using Docker ENV variables (so
|
||||||
|
they apply when you enter Docker) and `/etc/environment` (so they
|
||||||
|
continue to apply even if you sudo), instead of modifying
|
||||||
|
`PATH` in our build scripts.
|
||||||
|
|
||||||
|
- We use `/etc/ld.so.conf.d` to register directories containing
|
||||||
|
shared libraries, instead of modifying `LD_LIBRARY_PATH` in our
|
||||||
|
build scripts.
|
||||||
|
|
||||||
|
- We reroute well known paths like `/usr/bin/gcc` to alternate
|
||||||
|
implementations with `update-alternatives`, instead of setting
|
||||||
|
`CC` and `CXX` in our implementations.
|
||||||
|
|||||||
@ -44,13 +44,21 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
|
if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
|
||||||
|
export ATEN_THREADING=TBB
|
||||||
|
export USE_TBB=1
|
||||||
|
elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
|
||||||
export ATEN_THREADING=NATIVE
|
export ATEN_THREADING=NATIVE
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Enable LLVM dependency for TensorExpr testing
|
# Enable LLVM dependency for TensorExpr testing
|
||||||
export USE_LLVM=/opt/llvm
|
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
|
||||||
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
|
export USE_LLVM=/opt/rocm/llvm
|
||||||
|
export LLVM_DIR=/opt/rocm/llvm/lib/cmake/llvm
|
||||||
|
else
|
||||||
|
export USE_LLVM=/opt/llvm
|
||||||
|
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
|
||||||
|
fi
|
||||||
|
|
||||||
if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then
|
||||||
# To build test_edge_op_registration
|
# To build test_edge_op_registration
|
||||||
@ -171,8 +179,7 @@ fi
|
|||||||
if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source /opt/intel/oneapi/compiler/latest/env/vars.sh
|
source /opt/intel/oneapi/compiler/latest/env/vars.sh
|
||||||
# XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
|
export USE_XPU=1
|
||||||
export USE_KINETO=0
|
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# sccache will fail for CUDA builds if all cores are used for compiling
|
# sccache will fail for CUDA builds if all cores are used for compiling
|
||||||
@ -226,13 +233,9 @@ if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]
|
|||||||
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
|
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
|
|
||||||
export CMAKE_BUILD_TYPE=RelWithAssert
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Do not change workspace permissions for ROCm CI jobs
|
# Do not change workspace permissions for ROCm CI jobs
|
||||||
# as it can leave workspace with bad permissions for cancelled jobs
|
# as it can leave workspace with bad permissions for cancelled jobs
|
||||||
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
|
||||||
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
|
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
|
||||||
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
|
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
|
||||||
cleanup_workspace() {
|
cleanup_workspace() {
|
||||||
@ -278,32 +281,15 @@ else
|
|||||||
# set only when building other architectures
|
# set only when building other architectures
|
||||||
# or building non-XLA tests.
|
# or building non-XLA tests.
|
||||||
if [[ "$BUILD_ENVIRONMENT" != *rocm* &&
|
if [[ "$BUILD_ENVIRONMENT" != *rocm* &&
|
||||||
"$BUILD_ENVIRONMENT" != *s390x* &&
|
|
||||||
"$BUILD_ENVIRONMENT" != *xla* ]]; then
|
"$BUILD_ENVIRONMENT" != *xla* ]]; then
|
||||||
if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
|
||||||
# Install numpy-2.0.2 for builds which are backward compatible with 1.X
|
# Install numpy-2.0 release candidate for builds
|
||||||
python -mpip install --pre numpy==2.0.2
|
# Which should be backward compatible with Numpy-1.X
|
||||||
fi
|
python -mpip install --pre numpy==2.0.0rc1
|
||||||
|
|
||||||
WERROR=1 python setup.py clean
|
|
||||||
|
|
||||||
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
|
|
||||||
BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel
|
|
||||||
BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 python setup.py bdist_wheel --cmake
|
|
||||||
else
|
|
||||||
WERROR=1 python setup.py bdist_wheel
|
|
||||||
fi
|
fi
|
||||||
|
WERROR=1 python setup.py bdist_wheel
|
||||||
else
|
else
|
||||||
python setup.py clean
|
python setup.py bdist_wheel
|
||||||
if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
|
|
||||||
source .ci/pytorch/install_cache_xla.sh
|
|
||||||
fi
|
|
||||||
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
|
|
||||||
echo "USE_SPLIT_BUILD cannot be used with xla or rocm"
|
|
||||||
exit 1
|
|
||||||
else
|
|
||||||
python setup.py bdist_wheel
|
|
||||||
fi
|
|
||||||
fi
|
fi
|
||||||
pip_install_whl "$(echo dist/*.whl)"
|
pip_install_whl "$(echo dist/*.whl)"
|
||||||
|
|
||||||
@ -341,11 +327,10 @@ else
|
|||||||
CUSTOM_OP_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-op-build"
|
CUSTOM_OP_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-op-build"
|
||||||
CUSTOM_OP_TEST="$PWD/test/custom_operator"
|
CUSTOM_OP_TEST="$PWD/test/custom_operator"
|
||||||
python --version
|
python --version
|
||||||
SITE_PACKAGES="$(python -c 'import site; print(";".join([x for x in site.getsitepackages()] + [x + "/torch" for x in site.getsitepackages()]))')"
|
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
|
||||||
|
|
||||||
mkdir -p "$CUSTOM_OP_BUILD"
|
mkdir -p "$CUSTOM_OP_BUILD"
|
||||||
pushd "$CUSTOM_OP_BUILD"
|
pushd "$CUSTOM_OP_BUILD"
|
||||||
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
|
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
|
||||||
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
|
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
|
||||||
make VERBOSE=1
|
make VERBOSE=1
|
||||||
popd
|
popd
|
||||||
@ -355,10 +340,10 @@ else
|
|||||||
JIT_HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
|
JIT_HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
|
||||||
JIT_HOOK_TEST="$PWD/test/jit_hooks"
|
JIT_HOOK_TEST="$PWD/test/jit_hooks"
|
||||||
python --version
|
python --version
|
||||||
SITE_PACKAGES="$(python -c 'import site; print(";".join([x for x in site.getsitepackages()] + [x + "/torch" for x in site.getsitepackages()]))')"
|
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
|
||||||
mkdir -p "$JIT_HOOK_BUILD"
|
mkdir -p "$JIT_HOOK_BUILD"
|
||||||
pushd "$JIT_HOOK_BUILD"
|
pushd "$JIT_HOOK_BUILD"
|
||||||
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
|
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
|
||||||
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
|
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
|
||||||
make VERBOSE=1
|
make VERBOSE=1
|
||||||
popd
|
popd
|
||||||
@ -370,7 +355,7 @@ else
|
|||||||
python --version
|
python --version
|
||||||
mkdir -p "$CUSTOM_BACKEND_BUILD"
|
mkdir -p "$CUSTOM_BACKEND_BUILD"
|
||||||
pushd "$CUSTOM_BACKEND_BUILD"
|
pushd "$CUSTOM_BACKEND_BUILD"
|
||||||
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
|
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
|
||||||
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
|
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
|
||||||
make VERBOSE=1
|
make VERBOSE=1
|
||||||
popd
|
popd
|
||||||
@ -403,6 +388,6 @@ fi
|
|||||||
|
|
||||||
# snadampal: skipping it till sccache support added for aarch64
|
# snadampal: skipping it till sccache support added for aarch64
|
||||||
# https://github.com/pytorch/pytorch/issues/121559
|
# https://github.com/pytorch/pytorch/issues/121559
|
||||||
if [[ "$BUILD_ENVIRONMENT" != *aarch64* && "$BUILD_ENVIRONMENT" != *s390x* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" != *aarch64* ]]; then
|
||||||
print_sccache_stats
|
print_sccache_stats
|
||||||
fi
|
fi
|
||||||
|
|||||||
@ -56,29 +56,9 @@ function assert_git_not_dirty() {
|
|||||||
function pip_install_whl() {
|
function pip_install_whl() {
|
||||||
# This is used to install PyTorch and other build artifacts wheel locally
|
# This is used to install PyTorch and other build artifacts wheel locally
|
||||||
# without using any network connection
|
# without using any network connection
|
||||||
|
python3 -mpip install --no-index --no-deps "$@"
|
||||||
# Convert the input arguments into an array
|
|
||||||
local args=("$@")
|
|
||||||
|
|
||||||
# Check if the first argument contains multiple paths separated by spaces
|
|
||||||
if [[ "${args[0]}" == *" "* ]]; then
|
|
||||||
# Split the string by spaces into an array
|
|
||||||
IFS=' ' read -r -a paths <<< "${args[0]}"
|
|
||||||
# Loop through each path and install individually
|
|
||||||
for path in "${paths[@]}"; do
|
|
||||||
echo "Installing $path"
|
|
||||||
python3 -mpip install --no-index --no-deps "$path"
|
|
||||||
done
|
|
||||||
else
|
|
||||||
# Loop through each argument and install individually
|
|
||||||
for path in "${args[@]}"; do
|
|
||||||
echo "Installing $path"
|
|
||||||
python3 -mpip install --no-index --no-deps "$path"
|
|
||||||
done
|
|
||||||
fi
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
function pip_install() {
|
function pip_install() {
|
||||||
# retry 3 times
|
# retry 3 times
|
||||||
# old versions of pip don't have the "--progress-bar" flag
|
# old versions of pip don't have the "--progress-bar" flag
|
||||||
@ -179,7 +159,7 @@ function install_torchvision() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
function install_tlparse() {
|
function install_tlparse() {
|
||||||
pip_install --user "tlparse==0.3.25"
|
pip_install --user "tlparse==0.3.7"
|
||||||
PATH="$(python -m site --user-base)/bin:$PATH"
|
PATH="$(python -m site --user-base)/bin:$PATH"
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -208,6 +188,28 @@ function clone_pytorch_xla() {
|
|||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function checkout_install_torchdeploy() {
|
||||||
|
local commit
|
||||||
|
commit=$(get_pinned_commit multipy)
|
||||||
|
pushd ..
|
||||||
|
git clone --recurse-submodules https://github.com/pytorch/multipy.git
|
||||||
|
pushd multipy
|
||||||
|
git checkout "${commit}"
|
||||||
|
python multipy/runtime/example/generate_examples.py
|
||||||
|
BUILD_CUDA_TESTS=1 pip install -e .
|
||||||
|
popd
|
||||||
|
popd
|
||||||
|
}
|
||||||
|
|
||||||
|
function test_torch_deploy(){
|
||||||
|
pushd ..
|
||||||
|
pushd multipy
|
||||||
|
./multipy/runtime/build/test_deploy
|
||||||
|
./multipy/runtime/build/test_deploy_gpu
|
||||||
|
popd
|
||||||
|
popd
|
||||||
|
}
|
||||||
|
|
||||||
function checkout_install_torchbench() {
|
function checkout_install_torchbench() {
|
||||||
local commit
|
local commit
|
||||||
commit=$(get_pinned_commit torchbench)
|
commit=$(get_pinned_commit torchbench)
|
||||||
@ -222,8 +224,6 @@ function checkout_install_torchbench() {
|
|||||||
# to install and test other models
|
# to install and test other models
|
||||||
python install.py --continue_on_fail
|
python install.py --continue_on_fail
|
||||||
fi
|
fi
|
||||||
echo "Print all dependencies after TorchBench is installed"
|
|
||||||
python -mpip freeze
|
|
||||||
popd
|
popd
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -1,4 +1,4 @@
|
|||||||
from datetime import datetime, timedelta, timezone
|
from datetime import datetime, timedelta
|
||||||
from tempfile import mkdtemp
|
from tempfile import mkdtemp
|
||||||
|
|
||||||
from cryptography import x509
|
from cryptography import x509
|
||||||
@ -6,7 +6,6 @@ from cryptography.hazmat.primitives import hashes, serialization
|
|||||||
from cryptography.hazmat.primitives.asymmetric import rsa
|
from cryptography.hazmat.primitives.asymmetric import rsa
|
||||||
from cryptography.x509.oid import NameOID
|
from cryptography.x509.oid import NameOID
|
||||||
|
|
||||||
|
|
||||||
temp_dir = mkdtemp()
|
temp_dir = mkdtemp()
|
||||||
print(temp_dir)
|
print(temp_dir)
|
||||||
|
|
||||||
@ -42,10 +41,10 @@ def create_cert(path, C, ST, L, O, key):
|
|||||||
.issuer_name(issuer)
|
.issuer_name(issuer)
|
||||||
.public_key(key.public_key())
|
.public_key(key.public_key())
|
||||||
.serial_number(x509.random_serial_number())
|
.serial_number(x509.random_serial_number())
|
||||||
.not_valid_before(datetime.now(timezone.utc))
|
.not_valid_before(datetime.utcnow())
|
||||||
.not_valid_after(
|
.not_valid_after(
|
||||||
# Our certificate will be valid for 10 days
|
# Our certificate will be valid for 10 days
|
||||||
datetime.now(timezone.utc)
|
datetime.utcnow()
|
||||||
+ timedelta(days=10)
|
+ timedelta(days=10)
|
||||||
)
|
)
|
||||||
.add_extension(
|
.add_extension(
|
||||||
@ -88,10 +87,10 @@ def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key):
|
|||||||
.issuer_name(ca_cert.subject)
|
.issuer_name(ca_cert.subject)
|
||||||
.public_key(csr_cert.public_key())
|
.public_key(csr_cert.public_key())
|
||||||
.serial_number(x509.random_serial_number())
|
.serial_number(x509.random_serial_number())
|
||||||
.not_valid_before(datetime.now(timezone.utc))
|
.not_valid_before(datetime.utcnow())
|
||||||
.not_valid_after(
|
.not_valid_after(
|
||||||
# Our certificate will be valid for 10 days
|
# Our certificate will be valid for 10 days
|
||||||
datetime.now(timezone.utc)
|
datetime.utcnow()
|
||||||
+ timedelta(days=10)
|
+ timedelta(days=10)
|
||||||
# Sign our certificate with our private key
|
# Sign our certificate with our private key
|
||||||
)
|
)
|
||||||
|
|||||||
@ -6,4 +6,4 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
|
|||||||
echo "Testing pytorch docs"
|
echo "Testing pytorch docs"
|
||||||
|
|
||||||
cd docs
|
cd docs
|
||||||
TERM=vt100 make doctest
|
make doctest
|
||||||
|
|||||||
@ -1,37 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
|
|
||||||
# Script for installing sccache on the xla build job, which uses xla's docker
|
|
||||||
# image and doesn't have sccache installed on it. This is mostly copied from
|
|
||||||
# .ci/docker/install_cache.sh. Changes are: removing checks that will always
|
|
||||||
# return the same thing, ex checks for for rocm, CUDA, and changing the path
|
|
||||||
# where sccache is installed, and not changing /etc/environment.
|
|
||||||
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
install_binary() {
|
|
||||||
echo "Downloading sccache binary from S3 repo"
|
|
||||||
curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /tmp/cache/bin/sccache
|
|
||||||
}
|
|
||||||
|
|
||||||
mkdir -p /tmp/cache/bin
|
|
||||||
mkdir -p /tmp/cache/lib
|
|
||||||
export PATH="/tmp/cache/bin:$PATH"
|
|
||||||
|
|
||||||
install_binary
|
|
||||||
chmod a+x /tmp/cache/bin/sccache
|
|
||||||
|
|
||||||
function write_sccache_stub() {
|
|
||||||
# Unset LD_PRELOAD for ps because of asan + ps issues
|
|
||||||
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
|
|
||||||
# shellcheck disable=SC2086
|
|
||||||
# shellcheck disable=SC2059
|
|
||||||
printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n exec sccache $(which $1) \"\$@\"\nelse\n exec $(which $1) \"\$@\"\nfi" > "/tmp/cache/bin/$1"
|
|
||||||
chmod a+x "/tmp/cache/bin/$1"
|
|
||||||
}
|
|
||||||
|
|
||||||
write_sccache_stub cc
|
|
||||||
write_sccache_stub c++
|
|
||||||
write_sccache_stub gcc
|
|
||||||
write_sccache_stub g++
|
|
||||||
write_sccache_stub clang
|
|
||||||
write_sccache_stub clang++
|
|
||||||
@ -9,13 +9,15 @@ if [[ -n "$CONDA_ENV" ]]; then
|
|||||||
export PATH="$CONDA_ENV/bin":$PATH
|
export PATH="$CONDA_ENV/bin":$PATH
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Test that OpenMP is enabled
|
# Test that OpenMP is enabled for non-arm64 build
|
||||||
pushd test
|
if [[ ${BUILD_ENVIRONMENT} != *arm64* ]]; then
|
||||||
if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available()))") == "1" ]]; then
|
pushd test
|
||||||
echo "Build should have OpenMP enabled, but torch.backends.openmp.is_available() is False"
|
if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available()))") == "1" ]]; then
|
||||||
exit 1
|
echo "Build should have OpenMP enabled, but torch.backends.openmp.is_available() is False"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
popd
|
||||||
fi
|
fi
|
||||||
popd
|
|
||||||
|
|
||||||
setup_test_python() {
|
setup_test_python() {
|
||||||
# The CircleCI worker hostname doesn't resolve to an address.
|
# The CircleCI worker hostname doesn't resolve to an address.
|
||||||
@ -25,9 +27,8 @@ setup_test_python() {
|
|||||||
echo "Ninja version: $(ninja --version)"
|
echo "Ninja version: $(ninja --version)"
|
||||||
echo "Python version: $(which python) ($(python --version))"
|
echo "Python version: $(which python) ($(python --version))"
|
||||||
|
|
||||||
# Set the limit on open file handles to 16384
|
# Increase default limit on open file handles from 256 to 1024
|
||||||
# might help with intermittent compiler test failures
|
ulimit -n 1024
|
||||||
ulimit -n 16384
|
|
||||||
}
|
}
|
||||||
|
|
||||||
test_python_all() {
|
test_python_all() {
|
||||||
|
|||||||
@ -18,9 +18,7 @@ time python test/run_test.py --verbose -i distributed/test_c10d_gloo
|
|||||||
time python test/run_test.py --verbose -i distributed/test_c10d_nccl
|
time python test/run_test.py --verbose -i distributed/test_c10d_nccl
|
||||||
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
|
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
|
||||||
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
|
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
|
||||||
time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
|
|
||||||
time python test/run_test.py --verbose -i distributed/test_store
|
time python test/run_test.py --verbose -i distributed/test_store
|
||||||
time python test/run_test.py --verbose -i distributed/test_symmetric_memory
|
|
||||||
time python test/run_test.py --verbose -i distributed/test_pg_wrapper
|
time python test/run_test.py --verbose -i distributed/test_pg_wrapper
|
||||||
time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent
|
time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent
|
||||||
# FSDP tests
|
# FSDP tests
|
||||||
@ -44,16 +42,14 @@ time python test/run_test.py --verbose -i distributed/_tensor/test_dtensor_compi
|
|||||||
time python test/run_test.py --verbose -i distributed/test_device_mesh
|
time python test/run_test.py --verbose -i distributed/test_device_mesh
|
||||||
|
|
||||||
# DTensor/TP tests
|
# DTensor/TP tests
|
||||||
|
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_ddp_2d_parallel
|
||||||
|
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_fsdp_2d_parallel
|
||||||
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_examples
|
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_examples
|
||||||
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state
|
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state
|
||||||
|
|
||||||
# FSDP2 tests
|
# FSDP2 tests
|
||||||
time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh
|
time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh
|
||||||
|
|
||||||
# ND composability tests
|
|
||||||
time python test/run_test.py --verbose -i distributed/_composable/test_composability/test_2d_composability
|
|
||||||
time python test/run_test.py --verbose -i distributed/_composable/test_composability/test_pp_composability
|
|
||||||
|
|
||||||
# Other tests
|
# Other tests
|
||||||
time python test/run_test.py --verbose -i test_cuda_primary_ctx
|
time python test/run_test.py --verbose -i test_cuda_primary_ctx
|
||||||
time python test/run_test.py --verbose -i test_optim -- -k test_forloop_goes_right_direction_multigpu
|
time python test/run_test.py --verbose -i test_optim -- -k test_forloop_goes_right_direction_multigpu
|
||||||
|
|||||||
@ -3,7 +3,6 @@ import json
|
|||||||
import math
|
import math
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--test-name", dest="test_name", action="store", required=True, help="test name"
|
"--test-name", dest="test_name", action="store", required=True, help="test name"
|
||||||
|
|||||||
@ -3,7 +3,6 @@ import sys
|
|||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
|
|
||||||
sample_data_list = sys.argv[1:]
|
sample_data_list = sys.argv[1:]
|
||||||
sample_data_list = [float(v.strip()) for v in sample_data_list]
|
sample_data_list = [float(v.strip()) for v in sample_data_list]
|
||||||
|
|
||||||
|
|||||||
@ -1,7 +1,6 @@
|
|||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
data_file_path = sys.argv[1]
|
data_file_path = sys.argv[1]
|
||||||
commit_hash = sys.argv[2]
|
commit_hash = sys.argv[2]
|
||||||
|
|
||||||
|
|||||||
@ -1,6 +1,5 @@
|
|||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
log_file_path = sys.argv[1]
|
log_file_path = sys.argv[1]
|
||||||
|
|
||||||
with open(log_file_path) as f:
|
with open(log_file_path) as f:
|
||||||
|
|||||||
@ -6,9 +6,6 @@
|
|||||||
|
|
||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
# Suppress ANSI color escape sequences
|
|
||||||
export TERM=vt100
|
|
||||||
|
|
||||||
# shellcheck source=./common.sh
|
# shellcheck source=./common.sh
|
||||||
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
|
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
|
||||||
|
|
||||||
@ -169,7 +166,7 @@ fi
|
|||||||
|
|
||||||
if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
|
||||||
# Source Intel oneAPI envrioment script to enable xpu runtime related libraries
|
# Source Intel oneAPI envrioment script to enable xpu runtime related libraries
|
||||||
# refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-5.html
|
# refer to https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/2024-0/use-the-setvars-and-oneapi-vars-scripts-with-linux.html
|
||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source /opt/intel/oneapi/compiler/latest/env/vars.sh
|
source /opt/intel/oneapi/compiler/latest/env/vars.sh
|
||||||
# Check XPU status before testing
|
# Check XPU status before testing
|
||||||
@ -252,7 +249,9 @@ fi
|
|||||||
# This tests that the debug asserts are working correctly.
|
# This tests that the debug asserts are working correctly.
|
||||||
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
|
||||||
echo "We are in debug mode: $BUILD_ENVIRONMENT. Expect the python assertion to fail"
|
echo "We are in debug mode: $BUILD_ENVIRONMENT. Expect the python assertion to fail"
|
||||||
(cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_debug_asserts_fail(424242)")
|
# TODO: Enable the check after we setup the build to run debug asserts without having
|
||||||
|
# to do a full (and slow) debug build
|
||||||
|
# (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_debug_asserts_fail(424242)")
|
||||||
elif [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
|
elif [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
|
||||||
# Noop when debug is disabled. Skip bazel jobs because torch isn't available there yet.
|
# Noop when debug is disabled. Skip bazel jobs because torch isn't available there yet.
|
||||||
echo "We are not in debug mode: $BUILD_ENVIRONMENT. Expect the assertion to pass"
|
echo "We are not in debug mode: $BUILD_ENVIRONMENT. Expect the assertion to pass"
|
||||||
@ -278,9 +277,6 @@ test_python_shard() {
|
|||||||
|
|
||||||
# Bare --include flag is not supported and quoting for lint ends up with flag not being interpreted correctly
|
# Bare --include flag is not supported and quoting for lint ends up with flag not being interpreted correctly
|
||||||
# shellcheck disable=SC2086
|
# shellcheck disable=SC2086
|
||||||
|
|
||||||
# modify LD_LIBRARY_PATH to ensure it has the conda env.
|
|
||||||
# This set of tests has been shown to be buggy without it for the split-build
|
|
||||||
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION
|
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION
|
||||||
|
|
||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
@ -319,18 +315,17 @@ test_inductor_distributed() {
|
|||||||
python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose
|
python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose
|
||||||
python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
|
python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
|
||||||
python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
|
python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
|
||||||
python test/run_test.py -i distributed/tensor/parallel/test_micro_pipeline_tp.py --verbose
|
python test/run_test.py -i distributed/tensor/parallel/test_fsdp_2d_parallel.py --verbose
|
||||||
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
|
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
|
||||||
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
|
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
|
||||||
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
|
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
|
||||||
|
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp --verbose
|
||||||
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp --verbose
|
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp --verbose
|
||||||
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume --verbose
|
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume --verbose
|
||||||
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation --verbose
|
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation --verbose
|
||||||
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_state_dict.py -k test_dp_state_dict_save_load --verbose
|
|
||||||
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
|
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
|
||||||
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
|
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
|
||||||
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
|
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
|
||||||
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
|
|
||||||
python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose
|
python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose
|
||||||
|
|
||||||
# this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
|
# this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
|
||||||
@ -339,51 +334,26 @@ test_inductor_distributed() {
|
|||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
}
|
}
|
||||||
|
|
||||||
test_inductor_shard() {
|
test_inductor() {
|
||||||
if [[ -z "$NUM_TEST_SHARDS" ]]; then
|
|
||||||
echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
python tools/dynamo/verify_dynamo.py
|
python tools/dynamo/verify_dynamo.py
|
||||||
python test/run_test.py --inductor \
|
python test/run_test.py --inductor --include test_modules test_ops test_ops_gradients test_torch --verbose
|
||||||
--include test_modules test_ops test_ops_gradients test_torch \
|
|
||||||
--shard "$1" "$NUM_TEST_SHARDS" \
|
|
||||||
--verbose
|
|
||||||
|
|
||||||
# Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state
|
# Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state
|
||||||
python test/run_test.py \
|
python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor --verbose
|
||||||
--include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor \
|
|
||||||
--shard "$1" "$NUM_TEST_SHARDS" \
|
|
||||||
--verbose
|
|
||||||
}
|
|
||||||
|
|
||||||
test_inductor_aoti() {
|
|
||||||
# docker build uses bdist_wheel which does not work with test_aot_inductor
|
# docker build uses bdist_wheel which does not work with test_aot_inductor
|
||||||
# TODO: need a faster way to build
|
# TODO: need a faster way to build
|
||||||
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
|
||||||
# We need to hipify before building again
|
BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
|
||||||
python3 tools/amd_build/build_amd.py
|
CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
|
||||||
fi
|
fi
|
||||||
BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
|
|
||||||
CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
|
|
||||||
}
|
}
|
||||||
|
|
||||||
test_inductor_cpp_wrapper_abi_compatible() {
|
test_inductor_cpp_wrapper_abi_compatible() {
|
||||||
export TORCHINDUCTOR_ABI_COMPATIBLE=1
|
export TORCHINDUCTOR_ABI_COMPATIBLE=1
|
||||||
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
|
||||||
mkdir -p "$TEST_REPORTS_DIR"
|
|
||||||
|
|
||||||
echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
|
echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
|
||||||
PYTORCH_TESTING_DEVICE_ONLY_FOR="" python test/run_test.py --include inductor/test_cpu_cpp_wrapper
|
# cpu stack allocation causes segfault and needs more investigation
|
||||||
python test/run_test.py --include inductor/test_cuda_cpp_wrapper inductor/test_cpu_repro
|
TORCHINDUCTOR_STACK_ALLOCATION=0 python test/run_test.py --include inductor/test_cpu_cpp_wrapper
|
||||||
|
python test/run_test.py --include inductor/test_cuda_cpp_wrapper
|
||||||
TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
|
|
||||||
--training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
|
|
||||||
--output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
|
|
||||||
python benchmarks/dynamo/check_accuracy.py \
|
|
||||||
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
|
|
||||||
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_timm_training.csv"
|
|
||||||
}
|
}
|
||||||
|
|
||||||
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
|
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
|
||||||
@ -394,22 +364,7 @@ test_inductor_cpp_wrapper_abi_compatible() {
|
|||||||
# .github/workflows/inductor-perf-test-nightly.yml
|
# .github/workflows/inductor-perf-test-nightly.yml
|
||||||
DYNAMO_BENCHMARK_FLAGS=()
|
DYNAMO_BENCHMARK_FLAGS=()
|
||||||
|
|
||||||
pr_time_benchmarks() {
|
if [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
|
||||||
|
|
||||||
pip_install --user "fbscribelogger"
|
|
||||||
|
|
||||||
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
|
||||||
mkdir -p "$TEST_REPORTS_DIR"
|
|
||||||
PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks source benchmarks/dynamo/pr_time_benchmarks/benchmark_runner.sh "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" "benchmarks/dynamo/pr_time_benchmarks/benchmarks"
|
|
||||||
echo "benchmark results on current PR: "
|
|
||||||
cat "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv"
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
if [[ "${TEST_CONFIG}" == *pr_time_benchmarks* ]]; then
|
|
||||||
pr_time_benchmarks
|
|
||||||
exit 0
|
|
||||||
elif [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then
|
|
||||||
DYNAMO_BENCHMARK_FLAGS+=(--backend eager)
|
DYNAMO_BENCHMARK_FLAGS+=(--backend eager)
|
||||||
elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
|
elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then
|
||||||
DYNAMO_BENCHMARK_FLAGS+=(--backend aot_eager)
|
DYNAMO_BENCHMARK_FLAGS+=(--backend aot_eager)
|
||||||
@ -423,7 +378,7 @@ if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
|
|||||||
DYNAMO_BENCHMARK_FLAGS+=(--dynamic-shapes --dynamic-batch-only)
|
DYNAMO_BENCHMARK_FLAGS+=(--dynamic-shapes --dynamic-batch-only)
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
|
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
|
||||||
DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
|
DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
|
||||||
else
|
else
|
||||||
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
|
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
|
||||||
@ -447,18 +402,6 @@ test_perf_for_dashboard() {
|
|||||||
# TODO: All the accuracy tests can be skipped once the CI accuracy checking is stable enough
|
# TODO: All the accuracy tests can be skipped once the CI accuracy checking is stable enough
|
||||||
local targets=(accuracy performance)
|
local targets=(accuracy performance)
|
||||||
|
|
||||||
local device=cuda
|
|
||||||
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
|
|
||||||
if [[ "${TEST_CONFIG}" == *cpu_x86* ]]; then
|
|
||||||
device=cpu_x86
|
|
||||||
elif [[ "${TEST_CONFIG}" == *cpu_aarch64* ]]; then
|
|
||||||
device=cpu_aarch64
|
|
||||||
fi
|
|
||||||
test_inductor_set_cpu_affinity
|
|
||||||
elif [[ "${TEST_CONFIG}" == *cuda_a10g* ]]; then
|
|
||||||
device=cuda_a10g
|
|
||||||
fi
|
|
||||||
|
|
||||||
for mode in "${modes[@]}"; do
|
for mode in "${modes[@]}"; do
|
||||||
if [[ "$mode" == "inference" ]]; then
|
if [[ "$mode" == "inference" ]]; then
|
||||||
dtype=bfloat16
|
dtype=bfloat16
|
||||||
@ -474,62 +417,56 @@ test_perf_for_dashboard() {
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "$DASHBOARD_TAG" == *default-true* ]]; then
|
if [[ "$DASHBOARD_TAG" == *default-true* ]]; then
|
||||||
$TASKSET python "benchmarks/dynamo/$suite.py" \
|
python "benchmarks/dynamo/$suite.py" \
|
||||||
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
|
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
|
||||||
--output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
|
--output "$TEST_REPORTS_DIR/${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_cuda_${target}.csv"
|
||||||
fi
|
fi
|
||||||
if [[ "$DASHBOARD_TAG" == *cudagraphs-true* ]]; then
|
if [[ "$DASHBOARD_TAG" == *cudagraphs-true* ]]; then
|
||||||
$TASKSET python "benchmarks/dynamo/$suite.py" \
|
python "benchmarks/dynamo/$suite.py" \
|
||||||
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
|
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
|
||||||
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
|
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_cuda_${target}.csv"
|
||||||
fi
|
fi
|
||||||
if [[ "$DASHBOARD_TAG" == *dynamic-true* ]]; then
|
if [[ "$DASHBOARD_TAG" == *dynamic-true* ]]; then
|
||||||
$TASKSET python "benchmarks/dynamo/$suite.py" \
|
python "benchmarks/dynamo/$suite.py" \
|
||||||
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \
|
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --dynamic-shapes \
|
||||||
--dynamic-batch-only "$@" \
|
--dynamic-batch-only "$@" \
|
||||||
--output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}.csv"
|
--output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_cuda_${target}.csv"
|
||||||
fi
|
fi
|
||||||
if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]] && [[ "$mode" == "inference" ]]; then
|
if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]] && [[ "$mode" == "inference" ]]; then
|
||||||
TORCHINDUCTOR_CPP_WRAPPER=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
|
TORCHINDUCTOR_CPP_WRAPPER=1 python "benchmarks/dynamo/$suite.py" \
|
||||||
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
|
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
|
||||||
--output "$TEST_REPORTS_DIR/${backend}_cpp_wrapper_${suite}_${dtype}_${mode}_${device}_${target}.csv"
|
--output "$TEST_REPORTS_DIR/${backend}_cpp_wrapper_${suite}_${dtype}_${mode}_cuda_${target}.csv"
|
||||||
fi
|
fi
|
||||||
if [[ "$DASHBOARD_TAG" == *freezing_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
|
if [[ "$DASHBOARD_TAG" == *freezing_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
|
||||||
$TASKSET python "benchmarks/dynamo/$suite.py" \
|
python "benchmarks/dynamo/$suite.py" \
|
||||||
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
|
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
|
||||||
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_${suite}_${dtype}_${mode}_${device}_${target}.csv"
|
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_${suite}_${dtype}_${mode}_cuda_${target}.csv"
|
||||||
fi
|
fi
|
||||||
if [[ "$DASHBOARD_TAG" == *freeze_autotune_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
|
if [[ "$DASHBOARD_TAG" == *freeze_autotune_cudagraphs-true* ]] && [[ "$mode" == "inference" ]]; then
|
||||||
TORCHINDUCTOR_MAX_AUTOTUNE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
|
TORCHINDUCTOR_MAX_AUTOTUNE=1 python "benchmarks/dynamo/$suite.py" \
|
||||||
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
|
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" --freezing \
|
||||||
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_autotune_${suite}_${dtype}_${mode}_${device}_${target}.csv"
|
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_autotune_${suite}_${dtype}_${mode}_cuda_${target}.csv"
|
||||||
fi
|
fi
|
||||||
if [[ "$DASHBOARD_TAG" == *aotinductor-true* ]] && [[ "$mode" == "inference" ]]; then
|
if [[ "$DASHBOARD_TAG" == *aotinductor-true* ]] && [[ "$mode" == "inference" ]]; then
|
||||||
if [[ "$target" == "accuracy" ]]; then
|
TORCHINDUCTOR_ABI_COMPATIBLE=1 python "benchmarks/dynamo/$suite.py" \
|
||||||
# Also collect Export pass rate and display as a separate row
|
|
||||||
$TASKSET python "benchmarks/dynamo/$suite.py" \
|
|
||||||
"${target_flag[@]}" --"$mode" --"$dtype" --export --disable-cudagraphs "$@" \
|
|
||||||
--output "$TEST_REPORTS_DIR/${backend}_export_${suite}_${dtype}_${mode}_${device}_${target}.csv"
|
|
||||||
fi
|
|
||||||
TORCHINDUCTOR_ABI_COMPATIBLE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
|
|
||||||
"${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \
|
"${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \
|
||||||
--output "$TEST_REPORTS_DIR/${backend}_aot_inductor_${suite}_${dtype}_${mode}_${device}_${target}.csv"
|
--output "$TEST_REPORTS_DIR/${backend}_aot_inductor_${suite}_${dtype}_${mode}_cuda_${target}.csv"
|
||||||
fi
|
fi
|
||||||
if [[ "$DASHBOARD_TAG" == *maxautotune-true* ]]; then
|
if [[ "$DASHBOARD_TAG" == *maxautotune-true* ]]; then
|
||||||
TORCHINDUCTOR_MAX_AUTOTUNE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
|
TORCHINDUCTOR_MAX_AUTOTUNE=1 python "benchmarks/dynamo/$suite.py" \
|
||||||
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
|
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
|
||||||
--output "$TEST_REPORTS_DIR/${backend}_max_autotune_${suite}_${dtype}_${mode}_${device}_${target}.csv"
|
--output "$TEST_REPORTS_DIR/${backend}_max_autotune_${suite}_${dtype}_${mode}_cuda_${target}.csv"
|
||||||
fi
|
fi
|
||||||
if [[ "$DASHBOARD_TAG" == *cudagraphs_low_precision-true* ]] && [[ "$mode" == "inference" ]]; then
|
if [[ "$DASHBOARD_TAG" == *cudagraphs_low_precision-true* ]] && [[ "$mode" == "inference" ]]; then
|
||||||
# TODO: This has a new dtype called quant and the benchmarks script needs to be updated to support this.
|
# TODO: This has a new dtype called quant and the benchmarks script needs to be updated to support this.
|
||||||
# The tentative command is as follows. It doesn't work now, but it's ok because we only need mock data
|
# The tentative command is as follows. It doesn't work now, but it's ok because we only need mock data
|
||||||
# to fill the dashboard.
|
# to fill the dashboard.
|
||||||
$TASKSET python "benchmarks/dynamo/$suite.py" \
|
python "benchmarks/dynamo/$suite.py" \
|
||||||
"${target_flag[@]}" --"$mode" --quant --backend "$backend" "$@" \
|
"${target_flag[@]}" --"$mode" --quant --backend "$backend" "$@" \
|
||||||
--output "$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_${device}_${target}.csv" || true
|
--output "$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_cuda_${target}.csv" || true
|
||||||
# Copy cudagraph results as mock data, easiest choice?
|
# Copy cudagraph results as mock data, easiest choice?
|
||||||
cp "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv" \
|
cp "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_cuda_${target}.csv" \
|
||||||
"$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_${device}_${target}.csv"
|
"$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_cuda_${target}.csv"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
done
|
done
|
||||||
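The report filenames above follow a fixed `backend_variant_suite_dtype_mode_device_target.csv` pattern; a quick illustration with hypothetical values:

```bash
# All values below are illustrative.
backend=inductor; suite=torchbench; dtype=bfloat16
mode=inference; device=cuda; target=performance
echo "${backend}_no_cudagraphs_${suite}_${dtype}_${mode}_${device}_${target}.csv"
# -> inductor_no_cudagraphs_torchbench_bfloat16_inference_cuda_performance.csv
```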
@ -566,19 +503,11 @@ test_single_dynamo_benchmark() {
|
|||||||
test_perf_for_dashboard "$suite" \
|
test_perf_for_dashboard "$suite" \
|
||||||
"${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}"
|
"${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}"
|
||||||
else
|
else
|
||||||
if [[ "${TEST_CONFIG}" == *aot_inductor* && "${TEST_CONFIG}" != *cpu_aot_inductor* ]]; then
|
if [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
|
||||||
# Test AOTInductor with the ABI-compatible mode on CI
|
# Test AOTInductor with the ABI-compatible mode on CI
|
||||||
# This can be removed once the ABI-compatible mode becomes default.
|
# This can be removed once the ABI-compatible mode becomes default.
|
||||||
# For the CPU device, we prefer non-ABI-compatible mode on CI when testing AOTInductor.
|
|
||||||
export TORCHINDUCTOR_ABI_COMPATIBLE=1
|
export TORCHINDUCTOR_ABI_COMPATIBLE=1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "${TEST_CONFIG}" == *_avx2* ]]; then
|
|
||||||
TEST_CONFIG=${TEST_CONFIG//_avx2/}
|
|
||||||
fi
|
|
||||||
if [[ "${TEST_CONFIG}" == *_avx512* ]]; then
|
|
||||||
TEST_CONFIG=${TEST_CONFIG//_avx512/}
|
|
||||||
fi
|
|
||||||
python "benchmarks/dynamo/$suite.py" \
|
python "benchmarks/dynamo/$suite.py" \
|
||||||
--ci --accuracy --timing --explain \
|
--ci --accuracy --timing --explain \
|
||||||
"${DYNAMO_BENCHMARK_FLAGS[@]}" \
|
"${DYNAMO_BENCHMARK_FLAGS[@]}" \
|
||||||
@ -595,17 +524,9 @@ test_single_dynamo_benchmark() {
|
|||||||
|
|
||||||
test_inductor_micro_benchmark() {
|
test_inductor_micro_benchmark() {
|
||||||
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
||||||
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
|
|
||||||
test_inductor_set_cpu_affinity
|
|
||||||
fi
|
|
||||||
python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
|
python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
|
||||||
}
|
}
|
||||||
|
|
||||||
test_inductor_halide() {
|
|
||||||
python test/run_test.py --include inductor/test_halide.py --verbose
|
|
||||||
assert_git_not_dirty
|
|
||||||
}
|
|
||||||
|
|
||||||
test_dynamo_benchmark() {
|
test_dynamo_benchmark() {
|
||||||
# Usage: test_dynamo_benchmark huggingface 0
|
# Usage: test_dynamo_benchmark huggingface 0
|
||||||
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
||||||
@ -620,16 +541,8 @@ test_dynamo_benchmark() {
|
|||||||
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
|
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
|
||||||
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
|
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
|
||||||
else
|
else
|
||||||
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
|
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
|
||||||
local dt="float32"
|
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
|
||||||
if [[ "${TEST_CONFIG}" == *amp* ]]; then
|
|
||||||
dt="amp"
|
|
||||||
fi
|
|
||||||
if [[ "${TEST_CONFIG}" == *freezing* ]]; then
|
|
||||||
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "$@"
|
|
||||||
else
|
|
||||||
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "$@"
|
|
||||||
fi
|
|
||||||
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
|
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
|
||||||
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
|
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
|
||||||
else
|
else
|
||||||
@ -643,16 +556,12 @@ test_inductor_torchbench_smoketest_perf() {
|
|||||||
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
||||||
mkdir -p "$TEST_REPORTS_DIR"
|
mkdir -p "$TEST_REPORTS_DIR"
|
||||||
|
|
||||||
# Test some models in the cpp wrapper mode
|
# smoke test the cpp_wrapper mode
|
||||||
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
|
TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy --bfloat16 \
|
||||||
--bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
|
--inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_smoketest.csv"
|
||||||
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
|
|
||||||
--bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
|
|
||||||
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
|
|
||||||
--bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
|
|
||||||
python benchmarks/dynamo/check_accuracy.py \
|
python benchmarks/dynamo/check_accuracy.py \
|
||||||
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
|
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_smoketest.csv" \
|
||||||
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
|
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
|
||||||
|
|
||||||
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
|
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
|
||||||
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
|
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
|
||||||
@ -679,88 +588,50 @@ test_inductor_torchbench_smoketest_perf() {
|
|||||||
"$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \
|
"$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \
|
||||||
--expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
|
--expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
|
||||||
done
|
done
|
||||||
|
|
||||||
# Perform some "warm-start" runs for a few huggingface models.
|
|
||||||
for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
|
|
||||||
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
|
|
||||||
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
|
|
||||||
python benchmarks/dynamo/check_accuracy.py \
|
|
||||||
--actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \
|
|
||||||
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv"
|
|
||||||
done
|
|
||||||
}
|
|
||||||
|
|
||||||
test_inductor_get_core_number() {
|
|
||||||
if [[ "${TEST_CONFIG}" == *aarch64* ]]; then
|
|
||||||
echo "$(($(lscpu | grep 'Cluster(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per cluster:' | awk '{print $4}')))"
|
|
||||||
else
|
|
||||||
echo "$(($(lscpu | grep 'Socket(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per socket:' | awk '{print $4}')))"
|
|
||||||
fi
|
|
||||||
}
|
|
||||||
|
|
||||||
test_inductor_set_cpu_affinity(){
|
|
||||||
# Set jemalloc
|
|
||||||
JEMALLOC_LIB="$(find /usr/lib -name libjemalloc.so.2)"
|
|
||||||
export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD"
|
|
||||||
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
|
|
||||||
|
|
||||||
if [[ "${TEST_CONFIG}" != *aarch64* ]]; then
|
|
||||||
# Use Intel OpenMP for x86
|
|
||||||
IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
|
|
||||||
export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD"
|
|
||||||
export KMP_AFFINITY=granularity=fine,compact,1,0
|
|
||||||
export KMP_BLOCKTIME=1
|
|
||||||
fi
|
|
||||||
cores=$(test_inductor_get_core_number)
|
|
||||||
export OMP_NUM_THREADS=$cores
|
|
||||||
end_core=$((cores-1))
|
|
||||||
export TASKSET="taskset -c 0-$end_core"
|
|
||||||
}
|
}
|
||||||
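A minimal sketch of how these two helpers compose at a call site (the model and benchmark flags are illustrative; it assumes the helpers above are already sourced):

```bash
# Pin the benchmark to all physical cores before running it.
cores=$(test_inductor_get_core_number)
export OMP_NUM_THREADS=$cores
export TASKSET="taskset -c 0-$((cores - 1))"
$TASKSET python benchmarks/dynamo/torchbench.py \
  --inference --performance -dcpu --only resnet50
```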
|
|
||||||
test_inductor_torchbench_cpu_smoketest_perf(){
|
test_inductor_torchbench_cpu_smoketest_perf(){
|
||||||
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
TEST_REPORTS_DIR=$(pwd)/test/test-reports
|
||||||
mkdir -p "$TEST_REPORTS_DIR"
|
mkdir -p "$TEST_REPORTS_DIR"
|
||||||
|
|
||||||
test_inductor_set_cpu_affinity
|
# Set jemalloc
|
||||||
|
JEMALLOC_LIB="/usr/lib/x86_64-linux-gnu/libjemalloc.so.2"
|
||||||
|
IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
|
||||||
|
export LD_PRELOAD="$JEMALLOC_LIB":"$IOMP_LIB":"$LD_PRELOAD"
|
||||||
|
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
|
||||||
|
export KMP_AFFINITY=granularity=fine,compact,1,0
|
||||||
|
export KMP_BLOCKTIME=1
|
||||||
|
CORES=$(lscpu | grep Core | awk '{print $4}')
|
||||||
|
export OMP_NUM_THREADS=$CORES
|
||||||
|
end_core=$(( CORES-1 ))
|
||||||
|
|
||||||
MODELS_SPEEDUP_TARGET=benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv
|
MODELS_SPEEDUP_TARGET=benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv
|
||||||
|
|
||||||
grep -v '^ *#' < "$MODELS_SPEEDUP_TARGET" | while IFS=',' read -r -a model_cfg
|
grep -v '^ *#' < "$MODELS_SPEEDUP_TARGET" | while IFS=',' read -r -a model_cfg
|
||||||
do
|
do
|
||||||
local model_name=${model_cfg[0]}
|
local model_name=${model_cfg[0]}
|
||||||
local data_type=${model_cfg[2]}
|
local data_type=${model_cfg[1]}
|
||||||
local speedup_target=${model_cfg[5]}
|
local speedup_target=${model_cfg[4]}
|
||||||
local backend=${model_cfg[1]}
|
if [[ ${model_cfg[3]} == "cpp" ]]; then
|
||||||
if [[ ${model_cfg[4]} == "cpp" ]]; then
|
|
||||||
export TORCHINDUCTOR_CPP_WRAPPER=1
|
export TORCHINDUCTOR_CPP_WRAPPER=1
|
||||||
else
|
else
|
||||||
unset TORCHINDUCTOR_CPP_WRAPPER
|
unset TORCHINDUCTOR_CPP_WRAPPER
|
||||||
fi
|
fi
|
||||||
local output_name="$TEST_REPORTS_DIR/inductor_inference_${model_cfg[0]}_${model_cfg[1]}_${model_cfg[2]}_${model_cfg[3]}_cpu_smoketest.csv"
|
local output_name="$TEST_REPORTS_DIR/inductor_inference_${model_cfg[0]}_${model_cfg[1]}_${model_cfg[2]}_${model_cfg[3]}_cpu_smoketest.csv"
|
||||||
|
|
||||||
if [[ ${model_cfg[3]} == "dynamic" ]]; then
|
if [[ ${model_cfg[2]} == "dynamic" ]]; then
|
||||||
$TASKSET python benchmarks/dynamo/torchbench.py \
|
taskset -c 0-"$end_core" python benchmarks/dynamo/torchbench.py \
|
||||||
--inference --performance --"$data_type" -dcpu -n50 --only "$model_name" --dynamic-shapes \
|
--inference --performance --"$data_type" -dcpu -n50 --only "$model_name" --dynamic-shapes \
|
||||||
--dynamic-batch-only --freezing --timeout 9000 --"$backend" --output "$output_name"
|
--dynamic-batch-only --freezing --timeout 9000 --backend=inductor --output "$output_name"
|
||||||
else
|
else
|
||||||
$TASKSET python benchmarks/dynamo/torchbench.py \
|
taskset -c 0-"$end_core" python benchmarks/dynamo/torchbench.py \
|
||||||
--inference --performance --"$data_type" -dcpu -n50 --only "$model_name" \
|
--inference --performance --"$data_type" -dcpu -n50 --only "$model_name" \
|
||||||
--freezing --timeout 9000 --"$backend" --output "$output_name"
|
--freezing --timeout 9000 --backend=inductor --output "$output_name"
|
||||||
fi
|
fi
|
||||||
cat "$output_name"
|
cat "$output_name"
|
||||||
# The threshold value needs to be actively maintained to make this check useful.
|
# The threshold value needs to be actively maintained to make this check useful.
|
||||||
python benchmarks/dynamo/check_perf_csv.py -f "$output_name" -t "$speedup_target"
|
python benchmarks/dynamo/check_perf_csv.py -f "$output_name" -t "$speedup_target"
|
||||||
done
|
done
|
||||||
|
|
||||||
# Add a few ABI-compatible accuracy tests for CPU. These can be removed once we turn on ABI-compatible as default.
|
|
||||||
TORCHINDUCTOR_ABI_COMPATIBLE=1 python benchmarks/dynamo/timm_models.py --device cpu --accuracy \
|
|
||||||
--bfloat16 --inference --export-aot-inductor --disable-cudagraphs --only adv_inception_v3 \
|
|
||||||
--output "$TEST_REPORTS_DIR/aot_inductor_smoke_test.csv"
|
|
||||||
TORCHINDUCTOR_ABI_COMPATIBLE=1 python benchmarks/dynamo/timm_models.py --device cpu --accuracy \
|
|
||||||
--bfloat16 --inference --export-aot-inductor --disable-cudagraphs --only beit_base_patch16_224 \
|
|
||||||
--output "$TEST_REPORTS_DIR/aot_inductor_smoke_test.csv"
|
|
||||||
python benchmarks/dynamo/check_accuracy.py \
|
|
||||||
--actual "$TEST_REPORTS_DIR/aot_inductor_smoke_test.csv" \
|
|
||||||
--expected "benchmarks/dynamo/ci_expected_accuracy/aot_inductor_timm_inference.csv"
|
|
||||||
}
|
}
|
||||||
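For reference, a hypothetical row of the speedup-target CSV parsed above; the column meanings here are inferred from the array indices in the new code (name, backend, dtype, shape mode, wrapper, target) and may not match the real file exactly:

```bash
# Hypothetical entry; the real data lives in
# benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv
echo "resnet50,inductor,float32,static,default,1.10" > /tmp/example_speedup.csv
IFS=',' read -r -a model_cfg < /tmp/example_speedup.csv
echo "model=${model_cfg[0]} backend=${model_cfg[1]} dtype=${model_cfg[2]} target=${model_cfg[5]}"
```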
|
|
||||||
test_torchbench_gcp_smoketest(){
|
test_torchbench_gcp_smoketest(){
|
||||||
@ -800,6 +671,7 @@ test_aten() {
|
|||||||
${SUDO} ln -sf "$TORCH_LIB_DIR"/libmkldnn* "$TEST_BASE_DIR"
|
${SUDO} ln -sf "$TORCH_LIB_DIR"/libmkldnn* "$TEST_BASE_DIR"
|
||||||
${SUDO} ln -sf "$TORCH_LIB_DIR"/libnccl* "$TEST_BASE_DIR"
|
${SUDO} ln -sf "$TORCH_LIB_DIR"/libnccl* "$TEST_BASE_DIR"
|
||||||
${SUDO} ln -sf "$TORCH_LIB_DIR"/libtorch* "$TEST_BASE_DIR"
|
${SUDO} ln -sf "$TORCH_LIB_DIR"/libtorch* "$TEST_BASE_DIR"
|
||||||
|
${SUDO} ln -sf "$TORCH_LIB_DIR"/libtbb* "$TEST_BASE_DIR"
|
||||||
|
|
||||||
ls "$TEST_BASE_DIR"
|
ls "$TEST_BASE_DIR"
|
||||||
aten/tools/run_tests.sh "$TEST_BASE_DIR"
|
aten/tools/run_tests.sh "$TEST_BASE_DIR"
|
||||||
@ -824,6 +696,21 @@ test_without_numpy() {
|
|||||||
popd
|
popd
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# PyTorch extensions require including torch/extension.h, which includes all.h,
|
||||||
|
# which includes utils.h, which includes Parallel.h.
|
||||||
|
# So you can call, for instance, parallel_for() from your extension,
|
||||||
|
# but the compilation will fail because Parallel.h has only declarations
|
||||||
|
# and the definitions are only conditionally included in Parallel.h (see the last lines of Parallel.h).
|
||||||
|
# I tried to solve this in #39612 and #39881 by including Config.h into Parallel.h,
|
||||||
|
# but if PyTorch is built with TBB, it provides a Config.h
|
||||||
|
# that has AT_PARALLEL_NATIVE_TBB=1 (see #3961 or #39881), which means that if you include
|
||||||
|
# torch/extension.h, which transitively includes Parallel.h,
|
||||||
|
# which transitively includes tbb.h, which is not available!
|
||||||
|
if [[ "${BUILD_ENVIRONMENT}" == *tbb* ]]; then
|
||||||
|
sudo mkdir -p /usr/include/tbb
|
||||||
|
sudo cp -r "$PWD"/third_party/tbb/include/tbb/* /usr/include/tbb
|
||||||
|
fi
|
||||||
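For context, which ATen parallel backend a build was compiled against can be inspected at runtime, using the same probe that appears near the end of this script:

```bash
python -c "import torch; print(torch.__config__.parallel_info())"
```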
|
|
||||||
test_libtorch() {
|
test_libtorch() {
|
||||||
local SHARD="$1"
|
local SHARD="$1"
|
||||||
|
|
||||||
@ -837,6 +724,7 @@ test_libtorch() {
|
|||||||
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
|
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
|
||||||
ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR"
|
ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR"
|
||||||
ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
|
ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
|
||||||
|
ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"
|
||||||
ln -sf "$TORCH_LIB_DIR"/libnvfuser* "$TORCH_BIN_DIR"
|
ln -sf "$TORCH_LIB_DIR"/libnvfuser* "$TORCH_BIN_DIR"
|
||||||
|
|
||||||
export CPP_TESTS_DIR="${TORCH_BIN_DIR}"
|
export CPP_TESTS_DIR="${TORCH_BIN_DIR}"
|
||||||
@ -973,6 +861,7 @@ test_rpc() {
|
|||||||
# test reporting process to function as expected.
|
# test reporting process to function as expected.
|
||||||
ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
|
ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
|
||||||
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
|
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
|
||||||
|
ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"
|
||||||
|
|
||||||
CPP_TESTS_DIR="${TORCH_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_cpp_rpc
|
CPP_TESTS_DIR="${TORCH_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_cpp_rpc
|
||||||
}
|
}
|
||||||
@ -1074,113 +963,11 @@ test_xla() {
|
|||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
}
|
}
|
||||||
|
|
||||||
function check_public_api_test_fails {
|
|
||||||
test_name=$1
|
|
||||||
invalid_item_name=$2
|
|
||||||
invalid_item_desc=$3
|
|
||||||
|
|
||||||
echo "Running public API test '${test_name}'..."
|
|
||||||
test_output=$(python test/test_public_bindings.py -k "${test_name}" 2>&1) && ret=$? || ret=$?
|
|
||||||
|
|
||||||
# Ensure test fails correctly.
|
|
||||||
if [ "$ret" -eq 0 ]; then
|
|
||||||
cat << EOF
|
|
||||||
Expected the public API test '${test_name}' to fail after introducing
|
|
||||||
${invalid_item_desc}, but it succeeded! Check test/test_public_bindings.py
|
|
||||||
for any changes that may have broken the test.
|
|
||||||
EOF
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Ensure invalid item is in the test output.
|
|
||||||
echo "${test_output}" | grep -q "${invalid_item_name}" && ret=$? || ret=$?
|
|
||||||
|
|
||||||
if [ $ret -ne 0 ]; then
|
|
||||||
cat << EOF
|
|
||||||
Expected the public API test '${test_name}' to identify ${invalid_item_desc}, but
|
|
||||||
it didn't! It's possible the test may not have run. Check test/test_public_bindings.py
|
|
||||||
for any changes that may have broken the test.
|
|
||||||
EOF
|
|
||||||
return 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Success! '${test_name}' identified ${invalid_item_desc} ${invalid_item_name}."
|
|
||||||
return 0
|
|
||||||
}
|
|
||||||
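The `cmd && ret=$? || ret=$?` idiom used above (and in the steps below) records a command's exit status without letting a failure abort the script under `set -e`; a tiny illustration:

```bash
set -e
# `false` stands in for the real test command; the &&/|| list keeps
# set -e from aborting, and ret ends up holding the exit status (1 here).
false && ret=$? || ret=$?
echo "exit status was $ret"
```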
|
|
||||||
# Do NOT run this test before any other tests, like test_python_shard, etc.
|
# Do NOT run this test before any other tests, like test_python_shard, etc.
|
||||||
# Because this function uninstalls the torch built from branch and installs
|
# Because this function uninstalls the torch built from branch and installs
|
||||||
# the torch built on its base commit.
|
# the torch built on its base commit.
|
||||||
test_forward_backward_compatibility() {
|
test_forward_backward_compatibility() {
|
||||||
set -x
|
set -x
|
||||||
|
|
||||||
# First, validate public API tests in the torch built from branch.
|
|
||||||
# Step 1. Make sure the public API test "test_correct_module_names" fails when a new file
|
|
||||||
# introduces an invalid public API function.
|
|
||||||
new_filename=$(mktemp XXXXXXXX.py -p "${TORCH_INSTALL_DIR}")
|
|
||||||
|
|
||||||
BAD_PUBLIC_FUNC=$(
|
|
||||||
cat << 'EOF'
|
|
||||||
def new_public_func():
|
|
||||||
pass
|
|
||||||
|
|
||||||
# valid public API functions have __module__ set correctly
|
|
||||||
new_public_func.__module__ = None
|
|
||||||
EOF
|
|
||||||
)
|
|
||||||
|
|
||||||
echo "${BAD_PUBLIC_FUNC}" >> "${new_filename}"
|
|
||||||
invalid_api="torch.$(basename -s '.py' "${new_filename}").new_public_func"
|
|
||||||
echo "Created an invalid public API function ${invalid_api}..."
|
|
||||||
|
|
||||||
check_public_api_test_fails \
|
|
||||||
"test_correct_module_names" \
|
|
||||||
"${invalid_api}" \
|
|
||||||
"an invalid public API function" && ret=$? || ret=$?
|
|
||||||
|
|
||||||
rm -v "${new_filename}"
|
|
||||||
|
|
||||||
if [ "$ret" -ne 0 ]; then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Step 2. Make sure that the public API test "test_correct_module_names" fails when an existing
|
|
||||||
# file is modified to introduce an invalid public API function.
|
|
||||||
EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/nn/parameter.py"
|
|
||||||
cp -v "${EXISTING_FILEPATH}" "${EXISTING_FILEPATH}.orig"
|
|
||||||
echo "${BAD_PUBLIC_FUNC}" >> "${EXISTING_FILEPATH}"
|
|
||||||
invalid_api="torch.nn.parameter.new_public_func"
|
|
||||||
echo "Appended an invalid public API function to existing file ${EXISTING_FILEPATH}..."
|
|
||||||
|
|
||||||
check_public_api_test_fails \
|
|
||||||
"test_correct_module_names" \
|
|
||||||
"${invalid_api}" \
|
|
||||||
"an invalid public API function" && ret=$? || ret=$?
|
|
||||||
|
|
||||||
mv -v "${EXISTING_FILEPATH}.orig" "${EXISTING_FILEPATH}"
|
|
||||||
|
|
||||||
if [ "$ret" -ne 0 ]; then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Step 3. Make sure that the public API test "test_modules_can_be_imported" fails when a module
|
|
||||||
# cannot be imported.
|
|
||||||
new_module_dir=$(mktemp XXXXXXXX -d -p "${TORCH_INSTALL_DIR}")
|
|
||||||
echo "invalid syntax garbage" > "${new_module_dir}/__init__.py"
|
|
||||||
invalid_module_name="torch.$(basename "${new_module_dir}")"
|
|
||||||
|
|
||||||
check_public_api_test_fails \
|
|
||||||
"test_modules_can_be_imported" \
|
|
||||||
"${invalid_module_name}" \
|
|
||||||
"a non-importable module" && ret=$? || ret=$?
|
|
||||||
|
|
||||||
rm -rv "${new_module_dir}"
|
|
||||||
|
|
||||||
if [ "$ret" -ne 0 ]; then
|
|
||||||
exit 1
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Next, build torch from the merge base.
|
|
||||||
REPO_DIR=$(pwd)
|
REPO_DIR=$(pwd)
|
||||||
if [[ "${BASE_SHA}" == "${SHA1}" ]]; then
|
if [[ "${BASE_SHA}" == "${SHA1}" ]]; then
|
||||||
echo "On trunk, we should compare schemas with torch built from the parent commit"
|
echo "On trunk, we should compare schemas with torch built from the parent commit"
|
||||||
@ -1354,21 +1141,15 @@ test_executorch() {
|
|||||||
|
|
||||||
pushd /executorch
|
pushd /executorch
|
||||||
|
|
||||||
export PYTHON_EXECUTABLE=python
|
# NB: We need to build ExecuTorch runner here and not inside the Docker image
|
||||||
export EXECUTORCH_BUILD_PYBIND=ON
|
# because it depends on PyTorch
|
||||||
export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
|
|
||||||
|
|
||||||
# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
|
|
||||||
# from the PR
|
|
||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source .ci/scripts/setup-linux.sh cmake
|
source .ci/scripts/utils.sh
|
||||||
|
build_executorch_runner "cmake"
|
||||||
echo "Run ExecuTorch unit tests"
|
|
||||||
pytest -v -n auto
|
|
||||||
# shellcheck disable=SC1091
|
|
||||||
LLVM_PROFDATA=llvm-profdata-12 LLVM_COV=llvm-cov-12 bash test/run_oss_cpp_tests.sh
|
|
||||||
|
|
||||||
echo "Run ExecuTorch regression tests for some models"
|
echo "Run ExecuTorch regression tests for some models"
|
||||||
|
# NB: This is a sample model, more can be added here
|
||||||
|
export PYTHON_EXECUTABLE=python
|
||||||
# TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
|
# TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
|
||||||
# shellcheck disable=SC1091
|
# shellcheck disable=SC1091
|
||||||
source .ci/scripts/test.sh mv3 cmake xnnpack-quantization-delegation ''
|
source .ci/scripts/test.sh mv3 cmake xnnpack-quantization-delegation ''
|
||||||
@ -1382,16 +1163,14 @@ test_executorch() {
|
|||||||
assert_git_not_dirty
|
assert_git_not_dirty
|
||||||
}
|
}
|
||||||
|
|
||||||
test_linux_aarch64() {
|
test_linux_aarch64(){
|
||||||
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
|
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
|
||||||
test_transformers test_multiprocessing test_numpy_interop \
|
test_transformers test_multiprocessing test_numpy_interop --verbose
|
||||||
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
|
|
||||||
|
|
||||||
# Dynamo tests
|
# Dynamo tests
|
||||||
python test/run_test.py --include dynamo/test_compile dynamo/test_backends dynamo/test_comptime dynamo/test_config \
|
python test/run_test.py --include dynamo/test_compile dynamo/test_backends dynamo/test_comptime dynamo/test_config \
|
||||||
dynamo/test_functions dynamo/test_fx_passes_pre_grad dynamo/test_interop dynamo/test_model_output dynamo/test_modules \
|
dynamo/test_functions dynamo/test_fx_passes_pre_grad dynamo/test_interop dynamo/test_model_output dynamo/test_modules \
|
||||||
dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles \
|
dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles --verbose
|
||||||
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
|
|
||||||
|
|
||||||
# Inductor tests
|
# Inductor tests
|
||||||
python test/run_test.py --include inductor/test_torchinductor inductor/test_benchmark_fusion inductor/test_codecache \
|
python test/run_test.py --include inductor/test_torchinductor inductor/test_benchmark_fusion inductor/test_codecache \
|
||||||
@ -1401,15 +1180,14 @@ test_linux_aarch64() {
|
|||||||
inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \
|
inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \
|
||||||
inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \
|
inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \
|
||||||
inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \
|
inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \
|
||||||
inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes inductor/test_memory \
|
inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes --verbose
|
||||||
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
|
|
||||||
}
|
}
|
||||||
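The `--shard N M` flags split the selected test files into M buckets and run only bucket N; a sketch of invoking one shard by hand (the test selection is illustrative, and run_test.py's actual partitioning heuristics may differ):

```bash
# Run shard 2 of 3 over a couple of test modules (illustrative selection).
python test/run_test.py --include test_torch test_modules --shard 2 3 --verbose
```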
|
|
||||||
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
|
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
|
||||||
(cd test && python -c "import torch; print(torch.__config__.show())")
|
(cd test && python -c "import torch; print(torch.__config__.show())")
|
||||||
(cd test && python -c "import torch; print(torch.__config__.parallel_info())")
|
(cd test && python -c "import torch; print(torch.__config__.parallel_info())")
|
||||||
fi
|
fi
|
||||||
if [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then
|
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
|
||||||
test_linux_aarch64
|
test_linux_aarch64
|
||||||
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
|
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
|
||||||
test_forward_backward_compatibility
|
test_forward_backward_compatibility
|
||||||
@ -1431,10 +1209,11 @@ elif [[ "$TEST_CONFIG" == distributed ]]; then
|
|||||||
if [[ "${SHARD_NUMBER}" == 1 ]]; then
|
if [[ "${SHARD_NUMBER}" == 1 ]]; then
|
||||||
test_rpc
|
test_rpc
|
||||||
fi
|
fi
|
||||||
|
elif [[ "$TEST_CONFIG" == deploy ]]; then
|
||||||
|
checkout_install_torchdeploy
|
||||||
|
test_torch_deploy
|
||||||
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
|
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
|
||||||
test_inductor_distributed
|
test_inductor_distributed
|
||||||
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
|
|
||||||
test_inductor_halide
|
|
||||||
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
|
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
|
||||||
test_inductor_micro_benchmark
|
test_inductor_micro_benchmark
|
||||||
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
|
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
|
||||||
@ -1446,14 +1225,13 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
|
|||||||
id=$((SHARD_NUMBER-1))
|
id=$((SHARD_NUMBER-1))
|
||||||
test_dynamo_benchmark timm_models "$id"
|
test_dynamo_benchmark timm_models "$id"
|
||||||
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
|
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
|
||||||
if [[ "${TEST_CONFIG}" == *cpu* ]]; then
|
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
|
||||||
install_torchaudio cpu
|
install_torchaudio cpu
|
||||||
else
|
else
|
||||||
install_torchaudio cuda
|
install_torchaudio cuda
|
||||||
fi
|
fi
|
||||||
install_torchtext
|
install_torchtext
|
||||||
install_torchvision
|
install_torchvision
|
||||||
TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install git+https://github.com/pytorch/ao.git
|
|
||||||
id=$((SHARD_NUMBER-1))
|
id=$((SHARD_NUMBER-1))
|
||||||
# https://github.com/opencv/opencv-python/issues/885
|
# https://github.com/opencv/opencv-python/issues/885
|
||||||
pip_install opencv-python==4.8.0.74
|
pip_install opencv-python==4.8.0.74
|
||||||
@ -1461,9 +1239,9 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
|
|||||||
checkout_install_torchbench hf_Bert hf_Albert nanogpt timm_vision_transformer
|
checkout_install_torchbench hf_Bert hf_Albert nanogpt timm_vision_transformer
|
||||||
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
|
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
|
||||||
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
|
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
|
||||||
checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \
|
checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_gcn \
|
||||||
llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
|
llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
|
||||||
functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0
|
shufflenet_v2_x1_0 hf_GPT2
|
||||||
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
|
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
|
||||||
elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
|
elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
|
||||||
checkout_install_torchbench
|
checkout_install_torchbench
|
||||||
@ -1472,7 +1250,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
|
|||||||
checkout_install_torchbench
|
checkout_install_torchbench
|
||||||
# Do this after checkout_install_torchbench to ensure we clobber any
|
# Do this after checkout_install_torchbench to ensure we clobber any
|
||||||
# nightlies that torchbench may pull in
|
# nightlies that torchbench may pull in
|
||||||
if [[ "${TEST_CONFIG}" != *cpu* ]]; then
|
if [[ "${TEST_CONFIG}" != *cpu_inductor* ]]; then
|
||||||
install_torchrec_and_fbgemm
|
install_torchrec_and_fbgemm
|
||||||
fi
|
fi
|
||||||
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
|
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
|
||||||
@ -1480,24 +1258,17 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
|
|||||||
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper_abi_compatible* ]]; then
|
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper_abi_compatible* ]]; then
|
||||||
install_torchvision
|
install_torchvision
|
||||||
test_inductor_cpp_wrapper_abi_compatible
|
test_inductor_cpp_wrapper_abi_compatible
|
||||||
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
|
elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
|
||||||
install_torchvision
|
install_torchvision
|
||||||
test_inductor_shard "${SHARD_NUMBER}"
|
test_inductor
|
||||||
if [[ "${SHARD_NUMBER}" == 1 ]]; then
|
test_inductor_distributed
|
||||||
if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then
|
elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
|
||||||
test_inductor_distributed
|
install_torchvision
|
||||||
fi
|
test_dynamo_shard 1
|
||||||
fi
|
test_aten
|
||||||
elif [[ "${TEST_CONFIG}" == *dynamo* ]]; then
|
elif [[ "${TEST_CONFIG}" == *dynamo* && $SHARD_NUMBER -gt 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
|
||||||
install_torchvision
|
install_torchvision
|
||||||
test_dynamo_shard "${SHARD_NUMBER}"
|
test_dynamo_shard "${SHARD_NUMBER}"
|
||||||
if [[ "${SHARD_NUMBER}" == 1 ]]; then
|
|
||||||
test_aten
|
|
||||||
fi
|
|
||||||
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
|
|
||||||
install_torchvision
|
|
||||||
test_python_shard "$SHARD_NUMBER"
|
|
||||||
test_aten
|
|
||||||
elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
|
elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
|
||||||
test_without_numpy
|
test_without_numpy
|
||||||
install_torchvision
|
install_torchvision
|
||||||
@ -1527,6 +1298,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
|
|||||||
test_libtorch
|
test_libtorch
|
||||||
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
|
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
|
||||||
test_docs_test
|
test_docs_test
|
||||||
|
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
|
||||||
|
install_torchvision
|
||||||
|
test_python
|
||||||
|
test_aten
|
||||||
elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
|
elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
|
||||||
install_torchvision
|
install_torchvision
|
||||||
test_python
|
test_python
|
||||||
|
|||||||
@ -24,12 +24,6 @@ call %INSTALLER_DIR%\install_sccache.bat
|
|||||||
if errorlevel 1 goto fail
|
if errorlevel 1 goto fail
|
||||||
if not errorlevel 0 goto fail
|
if not errorlevel 0 goto fail
|
||||||
|
|
||||||
if "%USE_XPU%"=="1" (
|
|
||||||
:: Install xpu support packages
|
|
||||||
call %INSTALLER_DIR%\install_xpu.bat
|
|
||||||
if errorlevel 1 exit /b 1
|
|
||||||
)
|
|
||||||
|
|
||||||
:: Miniconda has been installed as part of the Windows AMI with all the dependencies.
|
:: Miniconda has been installed as part of the Windows AMI with all the dependencies.
|
||||||
:: We just need to activate it here
|
:: We just need to activate it here
|
||||||
call %INSTALLER_DIR%\activate_miniconda3.bat
|
call %INSTALLER_DIR%\activate_miniconda3.bat
|
||||||
@ -49,16 +43,6 @@ if "%VC_VERSION%" == "" (
|
|||||||
)
|
)
|
||||||
if errorlevel 1 goto fail
|
if errorlevel 1 goto fail
|
||||||
if not errorlevel 0 goto fail
|
if not errorlevel 0 goto fail
|
||||||
|
|
||||||
if "%USE_XPU%"=="1" (
|
|
||||||
:: Activate xpu environment - VS env is required for xpu
|
|
||||||
call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat"
|
|
||||||
if errorlevel 1 exit /b 1
|
|
||||||
:: Reduce build time. Only have MTL self-hosted runner now
|
|
||||||
SET TORCH_XPU_ARCH_LIST=xe-lpg
|
|
||||||
SET USE_KINETO=0
|
|
||||||
)
|
|
||||||
|
|
||||||
@echo on
|
@echo on
|
||||||
popd
|
popd
|
||||||
|
|
||||||
@ -81,6 +65,13 @@ set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%
|
|||||||
set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
|
set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
|
||||||
set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
|
set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
|
||||||
set CUDNN_ROOT_DIR=%CUDA_PATH%
|
set CUDNN_ROOT_DIR=%CUDA_PATH%
|
||||||
|
set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
|
||||||
|
set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
|
||||||
|
|
||||||
|
set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
|
||||||
|
set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
|
||||||
|
set CUDNN_ROOT_DIR=%CUDA_PATH%
|
||||||
|
set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
|
||||||
set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
|
set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
|
||||||
|
|
||||||
:cuda_build_end
|
:cuda_build_end
|
||||||
|
|||||||
@ -1,91 +0,0 @@
|
|||||||
@echo on
|
|
||||||
REM Description: Install Intel Support Packages on Windows
|
|
||||||
REM BKM reference: https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-5.html
|
|
||||||
|
|
||||||
set XPU_INSTALL_MODE=%~1
|
|
||||||
if "%XPU_INSTALL_MODE%"=="" goto xpu_bundle_install_start
|
|
||||||
if "%XPU_INSTALL_MODE%"=="bundle" goto xpu_bundle_install_start
|
|
||||||
if "%XPU_INSTALL_MODE%"=="driver" goto xpu_driver_install_start
|
|
||||||
if "%XPU_INSTALL_MODE%"=="all" goto xpu_driver_install_start
|
|
||||||
|
|
||||||
:arg_error
|
|
||||||
|
|
||||||
echo Illegal XPU installation mode. The value can be "bundle"/"driver"/"all"
|
|
||||||
echo If the value is left empty, the default "bundle" mode will be used
|
|
||||||
exit /b 1
|
|
||||||
|
|
||||||
:xpu_driver_install_start
|
|
||||||
:: TODO Need more testing for driver installation
|
|
||||||
set XPU_DRIVER_LINK=https://downloadmirror.intel.com/830975/gfx_win_101.5972.exe
|
|
||||||
curl -o xpu_driver.exe --retry 3 --retry-all-errors -k %XPU_DRIVER_LINK%
|
|
||||||
echo "XPU Driver installing..."
|
|
||||||
start /wait "Intel XPU Driver Installer" "xpu_driver.exe"
|
|
||||||
if errorlevel 1 exit /b 1
|
|
||||||
del xpu_driver.exe
|
|
||||||
if "%XPU_INSTALL_MODE%"=="driver" goto xpu_install_end
|
|
||||||
|
|
||||||
:xpu_bundle_install_start
|
|
||||||
|
|
||||||
set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI
|
|
||||||
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d1a91e2-e8b8-40a5-8c7f-5db768a6a60c/w_intel-for-pytorch-gpu-dev_p_0.5.3.37_offline.exe
|
|
||||||
set XPU_PTI_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d1a91e2-e8b8-40a5-8c7f-5db768a6a60c/w_intel-pti-dev_p_0.9.0.37_offline.exe
|
|
||||||
set XPU_BUNDLE_VERSION=0.5.3+31
|
|
||||||
set XPU_PTI_VERSION=0.9.0+36
|
|
||||||
set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.intel-for-pytorch-gpu-dev.product
|
|
||||||
set XPU_PTI_PRODUCT_NAME=intel.oneapi.win.intel-pti-dev.product
|
|
||||||
set XPU_BUNDLE_INSTALLED=0
|
|
||||||
set XPU_PTI_INSTALLED=0
|
|
||||||
set XPU_BUNDLE_UNINSTALL=0
|
|
||||||
set XPU_PTI_UNINSTALL=0
|
|
||||||
|
|
||||||
:: Check if XPU bundle is target version or already installed
|
|
||||||
if exist "%XPU_BUNDLE_PARENT_DIR%\Installer\installer.exe" goto xpu_bundle_ver_check
|
|
||||||
goto xpu_bundle_install
|
|
||||||
|
|
||||||
:xpu_bundle_ver_check
|
|
||||||
|
|
||||||
"%XPU_BUNDLE_PARENT_DIR%\Installer\installer.exe" --list-products > xpu_bundle_installed_ver.log
|
|
||||||
|
|
||||||
for /f "tokens=1,2" %%a in (xpu_bundle_installed_ver.log) do (
|
|
||||||
if "%%a"=="%XPU_BUNDLE_PRODUCT_NAME%" (
|
|
||||||
echo %%a Installed Version: %%b
|
|
||||||
set XPU_BUNDLE_INSTALLED=1
|
|
||||||
if not "%XPU_BUNDLE_VERSION%"=="%%b" (
|
|
||||||
start /wait "Installer Title" "%XPU_BUNDLE_PARENT_DIR%\Installer\installer.exe" --action=remove --eula=accept --silent --product-id %XPU_BUNDLE_PRODUCT_NAME% --product-ver %%b --log-dir uninstall_bundle
|
|
||||||
set XPU_BUNDLE_UNINSTALL=1
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if "%%a"=="%XPU_PTI_PRODUCT_NAME%" (
|
|
||||||
echo %%a Installed Version: %%b
|
|
||||||
set XPU_PTI_INSTALLED=1
|
|
||||||
if not "%XPU_PTI_VERSION%"=="%%b" (
|
|
||||||
start /wait "Installer Title" "%XPU_BUNDLE_PARENT_DIR%\Installer\installer.exe" --action=remove --eula=accept --silent --product-id %XPU_PTI_PRODUCT_NAME% --product-ver %%b --log-dir uninstall_bundle
|
|
||||||
set XPU_PTI_UNINSTALL=1
|
|
||||||
)
|
|
||||||
)
|
|
||||||
)
|
|
||||||
if errorlevel 1 exit /b 1
|
|
||||||
if exist xpu_bundle_installed_ver.log del xpu_bundle_installed_ver.log
|
|
||||||
if "%XPU_BUNDLE_INSTALLED%"=="0" goto xpu_bundle_install
|
|
||||||
if "%XPU_BUNDLE_UNINSTALL%"=="1" goto xpu_bundle_install
|
|
||||||
if "%XPU_PTI_INSTALLED%"=="0" goto xpu_pti_install
|
|
||||||
if "%XPU_PTI_UNINSTALL%"=="1" goto xpu_pti_install
|
|
||||||
goto xpu_install_end
|
|
||||||
|
|
||||||
:xpu_bundle_install
|
|
||||||
|
|
||||||
curl -o xpu_bundle.exe --retry 3 --retry-all-errors -k %XPU_BUNDLE_URL%
|
|
||||||
echo "XPU Bundle installing..."
|
|
||||||
start /wait "Intel Pytorch Bundle Installer" "xpu_bundle.exe" --action=install --eula=accept --silent --log-dir install_bundle
|
|
||||||
if errorlevel 1 exit /b 1
|
|
||||||
del xpu_bundle.exe
|
|
||||||
|
|
||||||
:xpu_pti_install
|
|
||||||
|
|
||||||
curl -o xpu_pti.exe --retry 3 --retry-all-errors -k %XPU_PTI_URL%
|
|
||||||
echo "XPU PTI installing..."
|
|
||||||
start /wait "Intel PTI Installer" "xpu_pti.exe" --action=install --eula=accept --silent --log-dir install_bundle
|
|
||||||
if errorlevel 1 exit /b 1
|
|
||||||
del xpu_pti.exe
|
|
||||||
|
|
||||||
:xpu_install_end
|
|
||||||
@ -4,7 +4,6 @@ import os
|
|||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
COMMON_TESTS = [
|
COMMON_TESTS = [
|
||||||
(
|
(
|
||||||
"Checking that torch is available",
|
"Checking that torch is available",
|
||||||
|
|||||||
@ -40,6 +40,7 @@ set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%
|
|||||||
set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
|
set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64
|
||||||
set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
|
set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH%
|
||||||
set CUDNN_ROOT_DIR=%CUDA_PATH%
|
set CUDNN_ROOT_DIR=%CUDA_PATH%
|
||||||
|
set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
|
||||||
set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
|
set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH%
|
||||||
set NUMBAPRO_CUDALIB=%CUDA_PATH%\bin
|
set NUMBAPRO_CUDALIB=%CUDA_PATH%\bin
|
||||||
set NUMBAPRO_LIBDEVICE=%CUDA_PATH%\nvvm\libdevice
|
set NUMBAPRO_LIBDEVICE=%CUDA_PATH%\nvvm\libdevice
|
||||||
|
|||||||
@ -31,6 +31,6 @@ if ERRORLEVEL 1 exit /b 1
|
|||||||
|
|
||||||
:: Run tests C++-side and load the exported script module.
|
:: Run tests C++-side and load the exported script module.
|
||||||
cd build
|
cd build
|
||||||
set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH%
|
set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH%
|
||||||
test_custom_backend.exe model.pt
|
test_custom_backend.exe model.pt
|
||||||
if ERRORLEVEL 1 exit /b 1
|
if ERRORLEVEL 1 exit /b 1
|
||||||
|
|||||||
@ -31,6 +31,6 @@ if ERRORLEVEL 1 exit /b 1
|
|||||||
|
|
||||||
:: Run tests C++-side and load the exported script module.
|
:: Run tests C++-side and load the exported script module.
|
||||||
cd build
|
cd build
|
||||||
set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH%
|
set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH%
|
||||||
test_custom_ops.exe model.pt
|
test_custom_ops.exe model.pt
|
||||||
if ERRORLEVEL 1 exit /b 1
|
if ERRORLEVEL 1 exit /b 1
|
||||||
|
|||||||
@ -5,7 +5,7 @@ if errorlevel 1 exit /b 1
|
|||||||
set CWD=%cd%
|
set CWD=%cd%
|
||||||
|
|
||||||
set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\bin
|
set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\bin
|
||||||
set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH%
|
set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH%
|
||||||
|
|
||||||
set TORCH_CPP_TEST_MNIST_PATH=%CWD%\test\cpp\api\mnist
|
set TORCH_CPP_TEST_MNIST_PATH=%CWD%\test\cpp\api\mnist
|
||||||
python tools\download_mnist.py --quiet -d %TORCH_CPP_TEST_MNIST_PATH%
|
python tools\download_mnist.py --quiet -d %TORCH_CPP_TEST_MNIST_PATH%
|
||||||
|
|||||||
@ -40,12 +40,6 @@ python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==
|
|||||||
# Install Z3 optional dependency for Windows builds.
|
# Install Z3 optional dependency for Windows builds.
|
||||||
python -m pip install z3-solver==4.12.2.0
|
python -m pip install z3-solver==4.12.2.0
|
||||||
|
|
||||||
# Install tlparse for test\dynamo\test_structured_trace.py UTs.
|
|
||||||
python -m pip install tlparse==0.3.25
|
|
||||||
|
|
||||||
# Install parameterized
|
|
||||||
python -m pip install parameterized==0.8.1
|
|
||||||
|
|
||||||
run_tests() {
|
run_tests() {
|
||||||
# Run nvidia-smi if available
|
# Run nvidia-smi if available
|
||||||
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
|
for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
|
||||||
|
|||||||
@ -5,7 +5,6 @@ import sys
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
|
|
||||||
# Need to import modules that lie on an upward-relative path
|
# Need to import modules that lie on an upward-relative path
|
||||||
sys.path.append(os.path.join(sys.path[0], ".."))
|
sys.path.append(os.path.join(sys.path[0], ".."))
|
||||||
|
|
||||||
|
|||||||
@ -46,12 +46,14 @@ if [[ "\$python_nodot" = *310* ]]; then
|
|||||||
PROTOBUF_PACKAGE="protobuf>=3.19.0"
|
PROTOBUF_PACKAGE="protobuf>=3.19.0"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "\$python_nodot" = *39* ]]; then
|
if [[ "\$python_nodot" = *39* ]]; then
|
||||||
# There's an issue with conda channel priority where it'll randomly pick 1.19 over 1.20
|
# There's an issue with conda channel priority where it'll randomly pick 1.19 over 1.20
|
||||||
# we set a lower boundary here just to be safe
|
# we set a lower boundary here just to be safe
|
||||||
NUMPY_PIN=">=1.20"
|
NUMPY_PIN=">=1.20"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Move debug wheels out of the package dir so they don't get installed
|
# Move debug wheels out of the package dir so they don't get installed
|
||||||
mkdir -p /tmp/debug_final_pkgs
|
mkdir -p /tmp/debug_final_pkgs
|
||||||
mv /final_pkgs/debug-*.zip /tmp/debug_final_pkgs || echo "no debug packages to move"
|
mv /final_pkgs/debug-*.zip /tmp/debug_final_pkgs || echo "no debug packages to move"
|
||||||
@ -81,7 +83,7 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
|
|||||||
"numpy\${NUMPY_PIN}" \
|
"numpy\${NUMPY_PIN}" \
|
||||||
mkl>=2018 \
|
mkl>=2018 \
|
||||||
ninja \
|
ninja \
|
||||||
sympy>=1.12 \
|
sympy \
|
||||||
typing-extensions \
|
typing-extensions \
|
||||||
${PROTOBUF_PACKAGE}
|
${PROTOBUF_PACKAGE}
|
||||||
if [[ "$DESIRED_CUDA" == 'cpu' ]]; then
|
if [[ "$DESIRED_CUDA" == 'cpu' ]]; then
|
||||||
@ -95,16 +97,8 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
|
|||||||
)
|
)
|
||||||
elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
|
elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
|
||||||
if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
|
if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
|
||||||
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
|
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
|
||||||
pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)"
|
retry pip install -q numpy protobuf typing-extensions
|
||||||
pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)"
|
|
||||||
# todo: after folder is populated use the pypi_pkg channel instead
|
|
||||||
pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg"
|
|
||||||
retry pip install -q numpy protobuf typing-extensions
|
|
||||||
else
|
|
||||||
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
|
|
||||||
retry pip install -q numpy protobuf typing-extensions
|
|
||||||
fi
|
|
||||||
else
|
else
|
||||||
pip install "\$pkg"
|
pip install "\$pkg"
|
||||||
retry pip install -q numpy protobuf typing-extensions
|
retry pip install -q numpy protobuf typing-extensions
|
||||||
@ -119,14 +113,6 @@ fi
|
|||||||
# Test the package
|
# Test the package
|
||||||
/builder/check_binary.sh
|
/builder/check_binary.sh
|
||||||
|
|
||||||
if [[ "\$GPU_ARCH_TYPE" != *s390x* && "\$GPU_ARCH_TYPE" != *xpu* && "\$GPU_ARCH_TYPE" != *rocm* && "$PACKAGE_TYPE" != libtorch ]]; then
|
|
||||||
# Exclude s390, xpu, rocm and libtorch builds from smoke testing
|
|
||||||
python /builder/test/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Clean temp files
|
|
||||||
cd /builder && git clean -ffdx
|
|
||||||
|
|
||||||
# =================== The above code will be executed inside Docker container ===================
|
# =================== The above code will be executed inside Docker container ===================
|
||||||
EOL
|
EOL
|
||||||
echo
|
echo
|
||||||
|
|||||||
@ -33,9 +33,9 @@ if [[ -z "$DOCKER_IMAGE" ]]; then
|
|||||||
if [[ "$PACKAGE_TYPE" == conda ]]; then
|
if [[ "$PACKAGE_TYPE" == conda ]]; then
|
||||||
export DOCKER_IMAGE="pytorch/conda-cuda"
|
export DOCKER_IMAGE="pytorch/conda-cuda"
|
||||||
elif [[ "$DESIRED_CUDA" == cpu ]]; then
|
elif [[ "$DESIRED_CUDA" == cpu ]]; then
|
||||||
export DOCKER_IMAGE="pytorch/manylinux:cpu"
|
export DOCKER_IMAGE="pytorch/manylinux-cpu"
|
||||||
else
|
else
|
||||||
export DOCKER_IMAGE="pytorch/manylinux-builder:${DESIRED_CUDA:2}"
|
export DOCKER_IMAGE="pytorch/manylinux-cuda${DESIRED_CUDA:2}"
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
@ -75,9 +75,9 @@ export PYTORCH_BUILD_NUMBER=1
|
|||||||
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
|
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
|
||||||
|
|
||||||
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for all the wheel builds, hence append TRITON_CONSTRAINT
|
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for all the wheel builds, hence append TRITON_CONSTRAINT
|
||||||
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
|
|
||||||
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
|
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
|
||||||
# Triton wheels are only supported on Linux with Python < 3.13
|
# Triton wheels are only supported on Linux with Python < 3.12
|
||||||
|
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.12'"
|
||||||
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
|
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
|
||||||
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
|
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
|
||||||
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
|
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
|
||||||
@ -87,11 +87,11 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package
|
# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package
|
||||||
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then
|
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then
|
||||||
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
|
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}"
|
||||||
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
|
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
|
||||||
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
|
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt)
|
||||||
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
|
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}"
|
||||||
fi
|
fi
|
||||||
if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
|
if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
|
||||||
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
|
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
|
||||||
@ -100,18 +100,30 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
|
|||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton xpu package
|
JAVA_HOME=
|
||||||
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* && $(uname) == "Linux" ]]; then
|
BUILD_JNI=OFF
|
||||||
TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
|
if [[ "$PACKAGE_TYPE" == libtorch ]]; then
|
||||||
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
|
POSSIBLE_JAVA_HOMES=()
|
||||||
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-xpu.txt)
|
POSSIBLE_JAVA_HOMES+=(/usr/local)
|
||||||
TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
|
POSSIBLE_JAVA_HOMES+=(/usr/lib/jvm/java-8-openjdk-amd64)
|
||||||
fi
|
POSSIBLE_JAVA_HOMES+=(/Library/Java/JavaVirtualMachines/*.jdk/Contents/Home)
|
||||||
if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
|
# Add the Windows-specific JNI path
|
||||||
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
|
POSSIBLE_JAVA_HOMES+=("$PWD/pytorch/.circleci/windows-jni/")
|
||||||
else
|
for JH in "${POSSIBLE_JAVA_HOMES[@]}" ; do
|
||||||
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}"
|
if [[ -e "$JH/include/jni.h" ]] ; then
|
||||||
|
# Skip if we're not on Windows but haven't found a JAVA_HOME
|
||||||
|
if [[ "$JH" == "$PWD/pytorch/.circleci/windows-jni/" && "$OSTYPE" != "msys" ]] ; then
|
||||||
|
break
|
||||||
|
fi
|
||||||
|
echo "Found jni.h under $JH"
|
||||||
|
JAVA_HOME="$JH"
|
||||||
|
BUILD_JNI=ON
|
||||||
|
break
|
||||||
fi
|
fi
|
||||||
|
done
|
||||||
|
if [ -z "$JAVA_HOME" ]; then
|
||||||
|
echo "Did not find jni.h"
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
cat >"$envfile" <<EOL
|
cat >"$envfile" <<EOL
|
||||||
@ -124,7 +136,6 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}"
|
|||||||
export DESIRED_CUDA="$DESIRED_CUDA"
|
export DESIRED_CUDA="$DESIRED_CUDA"
|
||||||
export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
|
export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
|
||||||
export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
|
export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
|
||||||
export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}"
|
|
||||||
if [[ "${OSTYPE}" == "msys" ]]; then
|
if [[ "${OSTYPE}" == "msys" ]]; then
|
||||||
export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
|
export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
|
||||||
if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then
|
if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then
|
||||||
@ -148,6 +159,8 @@ export TORCH_CONDA_BUILD_FOLDER='pytorch-nightly'
|
|||||||
export ANACONDA_USER='pytorch'
|
export ANACONDA_USER='pytorch'
|
||||||
|
|
||||||
export USE_FBGEMM=1
|
export USE_FBGEMM=1
|
||||||
|
export JAVA_HOME=$JAVA_HOME
|
||||||
|
export BUILD_JNI=$BUILD_JNI
|
||||||
export PIP_UPLOAD_FOLDER="$PIP_UPLOAD_FOLDER"
|
export PIP_UPLOAD_FOLDER="$PIP_UPLOAD_FOLDER"
|
||||||
export DOCKER_IMAGE="$DOCKER_IMAGE"
|
export DOCKER_IMAGE="$DOCKER_IMAGE"
|
||||||
|
|
||||||
|
|||||||
@ -25,15 +25,6 @@ if [[ "${DRY_RUN}" = "disabled" ]]; then
|
|||||||
AWS_S3_CP="aws s3 cp"
|
AWS_S3_CP="aws s3 cp"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then
|
|
||||||
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# this is special build with all dependencies packaged
|
|
||||||
if [[ ${BUILD_NAME} == *-full* ]]; then
|
|
||||||
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full"
|
|
||||||
fi
|
|
||||||
|
|
||||||
# Sleep 2 minutes between retries for conda upload
|
# Sleep 2 minutes between retries for conda upload
|
||||||
retry () {
|
retry () {
|
||||||
"$@" || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@")
|
"$@" || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@")
|
||||||
|
|||||||
@ -10,11 +10,6 @@ export SCCACHE_BUCKET=ossci-compiler-cache
|
|||||||
export SCCACHE_IGNORE_SERVER_IO_ERROR=1
|
export SCCACHE_IGNORE_SERVER_IO_ERROR=1
|
||||||
export VC_YEAR=2019
|
export VC_YEAR=2019
|
||||||
|
|
||||||
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
|
|
||||||
export VC_YEAR=2022
|
|
||||||
export USE_SCCACHE=0
|
|
||||||
fi
|
|
||||||
|
|
||||||
echo "Free space on filesystem before build:"
|
echo "Free space on filesystem before build:"
|
||||||
df -h
|
df -h
|
||||||
|
|
||||||
|
|||||||
@ -6,10 +6,6 @@ source "${BINARY_ENV_FILE:-/c/w/env}"
|
|||||||
export CUDA_VERSION="${DESIRED_CUDA/cu/}"
|
export CUDA_VERSION="${DESIRED_CUDA/cu/}"
|
||||||
export VC_YEAR=2019
|
export VC_YEAR=2019
|
||||||
|
|
||||||
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
|
|
||||||
export VC_YEAR=2022
|
|
||||||
fi
|
|
||||||
|
|
||||||
pushd "$BUILDER_ROOT"
|
pushd "$BUILDER_ROOT"
|
||||||
|
|
||||||
./windows/internal/smoke_test.bat
|
./windows/internal/smoke_test.bat
|
||||||
|
|||||||
@ -8,7 +8,6 @@ import time
|
|||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
|
||||||
AZURE_PIPELINE_BASE_URL = "https://aiinfra.visualstudio.com/PyTorch/"
|
AZURE_PIPELINE_BASE_URL = "https://aiinfra.visualstudio.com/PyTorch/"
|
||||||
AZURE_DEVOPS_PAT_BASE64 = os.environ.get("AZURE_DEVOPS_PAT_BASE64_SECRET", "")
|
AZURE_DEVOPS_PAT_BASE64 = os.environ.get("AZURE_DEVOPS_PAT_BASE64_SECRET", "")
|
||||||
PIPELINE_ID = "911"
|
PIPELINE_ID = "911"
|
||||||
|
|||||||
@ -62,6 +62,4 @@ readability-string-compare,
|
|||||||
'
|
'
|
||||||
HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
|
HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
|
||||||
WarningsAsErrors: '*'
|
WarningsAsErrors: '*'
|
||||||
CheckOptions:
|
|
||||||
misc-header-include-cycle.IgnoredFilesList: 'format.h;ivalue.h;custom_class.h;Dict.h;List.h'
|
|
||||||
...
|
...
|
||||||
|
|||||||
@ -5,7 +5,7 @@ git submodule sync
|
|||||||
git submodule update --init --recursive
|
git submodule update --init --recursive
|
||||||
|
|
||||||
# This takes some time
|
# This takes some time
|
||||||
make setup-lint
|
make setup_lint
|
||||||
|
|
||||||
# Add CMAKE_PREFIX_PATH to bashrc
|
# Add CMAKE_PREFIX_PATH to bashrc
|
||||||
echo 'export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}' >> ~/.bashrc
|
echo 'export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}' >> ~/.bashrc
|
||||||
|
|||||||
7
.flake8
7
.flake8
@ -2,12 +2,12 @@
|
|||||||
# NOTE: **Mirror any changes** to this file the [tool.ruff] config in pyproject.toml
|
# NOTE: **Mirror any changes** to this file the [tool.ruff] config in pyproject.toml
|
||||||
# before we can fully move to use ruff
|
# before we can fully move to use ruff
|
||||||
enable-extensions = G
|
enable-extensions = G
|
||||||
select = B,C,E,F,G,P,SIM1,SIM911,T4,W,B9,TOR0,TOR1,TOR2,TOR9
|
select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2,TOR9
|
||||||
max-line-length = 120
|
max-line-length = 120
|
||||||
# C408 ignored because we like the dict keyword argument syntax
|
# C408 ignored because we like the dict keyword argument syntax
|
||||||
# E501 is not flexible enough, we're using B950 instead
|
# E501 is not flexible enough, we're using B950 instead
|
||||||
ignore =
|
ignore =
|
||||||
E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,
|
E203,E305,E402,E501,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,
|
||||||
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
|
# shebang has extra meaning in fbcode lints, so I think it's not worth trying
|
||||||
# to line this up with executable bit
|
# to line this up with executable bit
|
||||||
EXE001,
|
EXE001,
|
||||||
@ -55,9 +55,6 @@ per-file-ignores =
|
|||||||
torch/distributed/_functional_collectives.py: TOR901
|
torch/distributed/_functional_collectives.py: TOR901
|
||||||
torch/distributed/_spmd/data_parallel.py: TOR901
|
torch/distributed/_spmd/data_parallel.py: TOR901
|
||||||
torch/distributed/_tensor/_collective_utils.py: TOR901
|
torch/distributed/_tensor/_collective_utils.py: TOR901
|
||||||
# This is a full package that happen to live within the test
|
|
||||||
# folder, so ok to skip
|
|
||||||
test/cpp_extensions/open_registration_extension/pytorch_openreg/_aten_impl.py: TOR901
|
|
||||||
optional-ascii-coding = True
|
optional-ascii-coding = True
|
||||||
exclude =
|
exclude =
|
||||||
./.git,
|
./.git,
|
||||||
|
|||||||
@ -40,7 +40,3 @@ e6ec0efaf87703c5f889cfc20b29be455885d58d
|
|||||||
a53cda1ddc15336dc1ff0ce1eff2a49cdc5f882e
|
a53cda1ddc15336dc1ff0ce1eff2a49cdc5f882e
|
||||||
# 2024-01-02 clangformat: fused adam #116583
|
# 2024-01-02 clangformat: fused adam #116583
|
||||||
9dc68d1aa9e554d09344a10fff69f7b50b2d23a0
|
9dc68d1aa9e554d09344a10fff69f7b50b2d23a0
|
||||||
# 2024-06-28 enable UFMT in `torch/storage.py`
|
|
||||||
d80939e5e9337e8078f11489afefec59fd42f93b
|
|
||||||
# 2024-06-28 enable UFMT in `torch.utils.data`
|
|
||||||
7cf0b90e49689d45be91aa539fdf54cf2ea8a9a3
|
|
||||||
|
|||||||
39
.github/actionlint.yaml
vendored
39
.github/actionlint.yaml
vendored
@ -1,59 +1,30 @@
|
|||||||
self-hosted-runner:
|
self-hosted-runner:
|
||||||
labels:
|
labels:
|
||||||
# GitHub hosted x86 Linux runners
|
|
||||||
- linux.20_04.4x
|
- linux.20_04.4x
|
||||||
- linux.20_04.16x
|
- linux.20_04.16x
|
||||||
# Organization-wide AWS Linux Runners
|
|
||||||
- linux.large
|
- linux.large
|
||||||
|
- linux.large.arc
|
||||||
- linux.2xlarge
|
- linux.2xlarge
|
||||||
- linux.4xlarge
|
- linux.4xlarge
|
||||||
- linux.9xlarge.ephemeral
|
|
||||||
- am2.linux.9xlarge.ephemeral
|
|
||||||
- linux.12xlarge
|
- linux.12xlarge
|
||||||
- linux.12xlarge.ephemeral
|
|
||||||
- linux.24xlarge
|
- linux.24xlarge
|
||||||
- linux.24xlarge.ephemeral
|
|
||||||
- linux.arm64.2xlarge
|
- linux.arm64.2xlarge
|
||||||
- linux.arm64.2xlarge.ephemeral
|
|
||||||
- linux.arm64.m7g.4xlarge
|
|
||||||
- linux.arm64.m7g.4xlarge.ephemeral
|
|
||||||
- linux.4xlarge.nvidia.gpu
|
- linux.4xlarge.nvidia.gpu
|
||||||
- linux.8xlarge.nvidia.gpu
|
- linux.8xlarge.nvidia.gpu
|
||||||
- linux.16xlarge.nvidia.gpu
|
- linux.16xlarge.nvidia.gpu
|
||||||
- linux.g5.4xlarge.nvidia.gpu
|
- linux.g5.4xlarge.nvidia.gpu
|
||||||
# Pytorch/pytorch AWS Linux Runners on Linux Foundation account
|
|
||||||
- lf.linux.large
|
|
||||||
- lf.linux.2xlarge
|
|
||||||
- lf.linux.4xlarge
|
|
||||||
- lf.linux.12xlarge
|
|
||||||
- lf.linux.24xlarge
|
|
||||||
- lf.linux.arm64.2xlarge
|
|
||||||
- lf.linux.4xlarge.nvidia.gpu
|
|
||||||
- lf.linux.8xlarge.nvidia.gpu
|
|
||||||
- lf.linux.16xlarge.nvidia.gpu
|
|
||||||
- lf.linux.g5.4xlarge.nvidia.gpu
|
|
||||||
# Repo-specific IBM hosted S390x runner
|
|
||||||
- linux.s390x
|
- linux.s390x
|
||||||
# Organization wide AWS Windows runners
|
|
||||||
- windows.g4dn.xlarge
|
|
||||||
- windows.g4dn.xlarge.nonephemeral
|
|
||||||
- windows.4xlarge
|
|
||||||
- windows.4xlarge.nonephemeral
|
- windows.4xlarge.nonephemeral
|
||||||
- windows.8xlarge.nvidia.gpu
|
- windows.8xlarge.nvidia.gpu
|
||||||
- windows.8xlarge.nvidia.gpu.nonephemeral
|
- windows.8xlarge.nvidia.gpu.nonephemeral
|
||||||
- windows.g5.4xlarge.nvidia.gpu
|
- windows.g5.4xlarge.nvidia.gpu
|
||||||
# Organization-wide AMD hosted MI300 runners
|
- bm-runner
|
||||||
- linux.rocm.gpu
|
- linux.rocm.gpu
|
||||||
# Repo-specific Apple hosted runners
|
|
||||||
- macos-m1-ultra
|
|
||||||
- macos-m2-14
|
|
||||||
# Org wise AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
|
|
||||||
- macos-m1-stable
|
- macos-m1-stable
|
||||||
- macos-m1-13
|
- macos-m1-13
|
||||||
- macos-m1-14
|
- macos-m1-14
|
||||||
# GitHub-hosted MacOS runners
|
- macos-12-xl
|
||||||
|
- macos-12
|
||||||
|
- macos12.3-m1
|
||||||
- macos-latest-xlarge
|
- macos-latest-xlarge
|
||||||
- macos-13-xlarge
|
- macos-13-xlarge
|
||||||
- macos-14-xlarge
|
|
||||||
# Organization-wide Intel hosted XPU runners
|
|
||||||
- linux.idc.xpu
|
|
||||||
|
|||||||
6
.github/actions/diskspace-cleanup/action.yml
vendored
6
.github/actions/diskspace-cleanup/action.yml
vendored
@ -14,14 +14,12 @@ runs:
|
|||||||
- name: Cleans up diskspace
|
- name: Cleans up diskspace
|
||||||
shell: bash
|
shell: bash
|
||||||
run: |
|
run: |
|
||||||
set -ex
|
|
||||||
diskspace_cutoff=${{ inputs.diskspace-cutoff }}
|
diskspace_cutoff=${{ inputs.diskspace-cutoff }}
|
||||||
docker_root_dir=$(docker info -f '{{.DockerRootDir}}')
|
diskspace=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
|
||||||
diskspace=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
|
|
||||||
msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
|
msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
|
||||||
if [[ "$diskspace" -ge "$diskspace_cutoff" ]] ; then
|
if [[ "$diskspace" -ge "$diskspace_cutoff" ]] ; then
|
||||||
docker system prune -af
|
docker system prune -af
|
||||||
diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
|
diskspace_new=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
|
||||||
if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
|
if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
|
||||||
echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
|
echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
|
||||||
echo "$msg"
|
echo "$msg"
|
||||||
|
|||||||
@ -41,9 +41,6 @@ outputs:
|
|||||||
ci-verbose-test-logs:
|
ci-verbose-test-logs:
|
||||||
description: True if ci-verbose-test-logs label was on PR or [ci-verbose-test-logs] in PR body.
|
description: True if ci-verbose-test-logs label was on PR or [ci-verbose-test-logs] in PR body.
|
||||||
value: ${{ steps.filter.outputs.ci-verbose-test-logs }}
|
value: ${{ steps.filter.outputs.ci-verbose-test-logs }}
|
||||||
ci-test-showlocals:
|
|
||||||
description: True if ci-test-showlocals label was on PR or [ci-test-showlocals] in PR body.
|
|
||||||
value: ${{ steps.filter.outputs.ci-test-showlocals }}
|
|
||||||
ci-no-test-timeout:
|
ci-no-test-timeout:
|
||||||
description: True if ci-no-test-timeout label was on PR or [ci-no-test-timeout] in PR body.
|
description: True if ci-no-test-timeout label was on PR or [ci-no-test-timeout] in PR body.
|
||||||
value: ${{ steps.filter.outputs.ci-no-test-timeout }}
|
value: ${{ steps.filter.outputs.ci-no-test-timeout }}
|
||||||
@ -57,7 +54,7 @@ outputs:
|
|||||||
runs:
|
runs:
|
||||||
using: composite
|
using: composite
|
||||||
steps:
|
steps:
|
||||||
- uses: nick-fields/retry@v3.0.0
|
- uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
|
||||||
name: Setup dependencies
|
name: Setup dependencies
|
||||||
env:
|
env:
|
||||||
GITHUB_TOKEN: ${{ inputs.github-token }}
|
GITHUB_TOKEN: ${{ inputs.github-token }}
|
||||||
@ -69,8 +66,7 @@ runs:
|
|||||||
command: |
|
command: |
|
||||||
set -eux
|
set -eux
|
||||||
# PyYAML 6.0 doesn't work with MacOS x86 anymore
|
# PyYAML 6.0 doesn't work with MacOS x86 anymore
|
||||||
# This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2
|
python3 -m pip install requests==2.26.0 pyyaml==6.0.1
|
||||||
python3 -m pip install requests==2.27.1 pyyaml==6.0.1
|
|
||||||
|
|
||||||
- name: Parse ref
|
- name: Parse ref
|
||||||
id: parse-ref
|
id: parse-ref
|
||||||
|
|||||||
207
.github/actions/linux-build/action.yml
vendored
Normal file
207
.github/actions/linux-build/action.yml
vendored
Normal file
@ -0,0 +1,207 @@
|
|||||||
|
name: linux-build
|
||||||
|
|
||||||
|
inputs:
|
||||||
|
build-environment:
|
||||||
|
required: true
|
||||||
|
description: Top-level label for what's being built/tested.
|
||||||
|
docker-image-name:
|
||||||
|
required: true
|
||||||
|
description: Name of the base docker image to build with.
|
||||||
|
build-generates-artifacts:
|
||||||
|
required: false
|
||||||
|
default: "true"
|
||||||
|
description: If set, upload generated build artifacts.
|
||||||
|
build-with-debug:
|
||||||
|
required: false
|
||||||
|
default: "false"
|
||||||
|
description: If set, build in debug mode.
|
||||||
|
sync-tag:
|
||||||
|
required: false
|
||||||
|
default: ""
|
||||||
|
description: |
|
||||||
|
If this is set, our linter will use this to make sure that every other
|
||||||
|
job with the same `sync-tag` is identical.
|
||||||
|
cuda-arch-list:
|
||||||
|
required: false
|
||||||
|
default: "5.2"
|
||||||
|
description: Runner label to select worker type
|
||||||
|
runner:
|
||||||
|
required: false
|
||||||
|
default: "linux.2xlarge"
|
||||||
|
description: |
|
||||||
|
List of CUDA architectures CI build should target.
|
||||||
|
test-matrix:
|
||||||
|
required: false
|
||||||
|
type: string
|
||||||
|
description: |
|
||||||
|
An option JSON description of what test configs to run later on. This
|
||||||
|
is moved here from the Linux test workflow so that we can apply filter
|
||||||
|
logic using test-config labels earlier and skip unnecessary builds
|
||||||
|
s3-bucket:
|
||||||
|
description: S3 bucket to download artifact
|
||||||
|
required: false
|
||||||
|
default: "gha-artifacts"
|
||||||
|
aws-role-to-assume:
|
||||||
|
description: role to assume for downloading artifacts
|
||||||
|
required: false
|
||||||
|
default: ""
|
||||||
|
GITHUB_TOKEN:
|
||||||
|
description: GitHub token
|
||||||
|
required: true
|
||||||
|
HUGGING_FACE_HUB_TOKEN:
|
||||||
|
description: Hugging Face Hub token
|
||||||
|
required: false
|
||||||
|
default: ""
|
||||||
|
outputs:
|
||||||
|
docker-image:
|
||||||
|
value: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||||
|
description: The docker image containing the built PyTorch.
|
||||||
|
test-matrix:
|
||||||
|
value: ${{ steps.filter.outputs.test-matrix }}
|
||||||
|
description: An optional JSON description of what test configs to run later on.
|
||||||
|
|
||||||
|
runs:
|
||||||
|
using: composite
|
||||||
|
steps:
|
||||||
|
- name: Setup Linux
|
||||||
|
uses: ./.github/actions/setup-linux
|
||||||
|
|
||||||
|
- name: configure aws credentials
|
||||||
|
uses: aws-actions/configure-aws-credentials@v3
|
||||||
|
if: ${{ inputs.aws-role-to-assume != '' }}
|
||||||
|
with:
|
||||||
|
role-to-assume: ${{ inputs.aws-role-to-assume }}
|
||||||
|
role-session-name: gha-linux-build
|
||||||
|
role-duration-seconds: 10800
|
||||||
|
aws-region: us-east-1
|
||||||
|
|
||||||
|
- name: Calculate docker image
|
||||||
|
id: calculate-docker-image
|
||||||
|
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
|
||||||
|
with:
|
||||||
|
docker-image-name: ${{ inputs.docker-image-name }}
|
||||||
|
|
||||||
|
- name: Use following to pull public copy of the image
|
||||||
|
id: print-ghcr-mirror
|
||||||
|
env:
|
||||||
|
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
tag=${ECR_DOCKER_IMAGE##*/}
|
||||||
|
echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
|
||||||
|
|
||||||
|
- name: Pull docker image
|
||||||
|
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
|
||||||
|
with:
|
||||||
|
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||||
|
|
||||||
|
- name: Parse ref
|
||||||
|
id: parse-ref
|
||||||
|
shell: bash
|
||||||
|
run: .github/scripts/parse_ref.py
|
||||||
|
|
||||||
|
- name: Get workflow job id
|
||||||
|
id: get-job-id
|
||||||
|
uses: ./.github/actions/get-workflow-job-id
|
||||||
|
if: always()
|
||||||
|
with:
|
||||||
|
github-token: ${{ inputs.GITHUB_TOKEN }}
|
||||||
|
|
||||||
|
# Apply the filter logic to the build step too if the test-config label is already there
|
||||||
|
- name: Select all requested test configurations (if the test matrix is available)
|
||||||
|
id: filter
|
||||||
|
uses: ./.github/actions/filter-test-configs
|
||||||
|
with:
|
||||||
|
github-token: ${{ inputs.GITHUB_TOKEN }}
|
||||||
|
test-matrix: ${{ inputs.test-matrix }}
|
||||||
|
job-name: ${{ steps.get-job-id.outputs.job-name }}
|
||||||
|
|
||||||
|
- name: Download pytest cache
|
||||||
|
uses: ./.github/actions/pytest-cache-download
|
||||||
|
continue-on-error: true
|
||||||
|
with:
|
||||||
|
cache_dir: .pytest_cache
|
||||||
|
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
|
||||||
|
s3_bucket: ${{ inputs.s3-bucket }}
|
||||||
|
|
||||||
|
- name: Build
|
||||||
|
if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == ''
|
||||||
|
id: build
|
||||||
|
env:
|
||||||
|
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
|
||||||
|
BRANCH: ${{ steps.parse-ref.outputs.branch }}
|
||||||
|
# TODO duplicated
|
||||||
|
AWS_DEFAULT_REGION: us-east-1
|
||||||
|
PR_NUMBER: ${{ github.event.pull_request.number }}
|
||||||
|
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
|
||||||
|
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
|
||||||
|
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
|
||||||
|
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
|
||||||
|
PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
|
||||||
|
TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
|
||||||
|
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
|
||||||
|
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
|
||||||
|
DEBUG: ${{ inputs.build-with-debug == 'true' && '1' || '0' }}
|
||||||
|
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
|
||||||
|
HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
# detached container should get cleaned up by teardown_ec2_linux
|
||||||
|
container_name=$(docker run \
|
||||||
|
-e BUILD_ENVIRONMENT \
|
||||||
|
-e MAX_JOBS="$(nproc --ignore=2)" \
|
||||||
|
-e AWS_DEFAULT_REGION \
|
||||||
|
-e PR_NUMBER \
|
||||||
|
-e SHA1 \
|
||||||
|
-e BRANCH \
|
||||||
|
-e SCCACHE_BUCKET \
|
||||||
|
-e SCCACHE_S3_KEY_PREFIX \
|
||||||
|
-e XLA_CUDA \
|
||||||
|
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
|
||||||
|
-e SKIP_SCCACHE_INITIALIZATION=1 \
|
||||||
|
-e TORCH_CUDA_ARCH_LIST \
|
||||||
|
-e PR_LABELS \
|
||||||
|
-e OUR_GITHUB_JOB_ID \
|
||||||
|
-e HUGGING_FACE_HUB_TOKEN \
|
||||||
|
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
|
||||||
|
--security-opt seccomp=unconfined \
|
||||||
|
--cap-add=SYS_PTRACE \
|
||||||
|
--tty \
|
||||||
|
--detach \
|
||||||
|
--user jenkins \
|
||||||
|
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
|
||||||
|
-w /var/lib/jenkins/workspace \
|
||||||
|
"${DOCKER_IMAGE}"
|
||||||
|
)
|
||||||
|
docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh'
|
||||||
|
|
||||||
|
- name: Archive artifacts into zip
|
||||||
|
if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped'
|
||||||
|
shell: bash
|
||||||
|
run: |
|
||||||
|
zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files
|
||||||
|
|
||||||
|
- name: Store PyTorch Build Artifacts on S3
|
||||||
|
uses: seemethere/upload-artifact-s3@v5
|
||||||
|
if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped'
|
||||||
|
with:
|
||||||
|
name: ${{ inputs.build-environment }}
|
||||||
|
retention-days: 14
|
||||||
|
if-no-files-found: error
|
||||||
|
path: artifacts.zip
|
||||||
|
s3-bucket: ${{ inputs.s3-bucket }}
|
||||||
|
|
||||||
|
- name: Upload sccache stats
|
||||||
|
if: steps.build.outcome != 'skipped'
|
||||||
|
uses: seemethere/upload-artifact-s3@v5
|
||||||
|
with:
|
||||||
|
s3-prefix: |
|
||||||
|
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
|
||||||
|
retention-days: 365
|
||||||
|
if-no-files-found: warn
|
||||||
|
path: sccache-stats-*.json
|
||||||
|
s3-bucket: ${{ inputs.s3-bucket }}
|
||||||
|
|
||||||
|
- name: Teardown Linux
|
||||||
|
uses: pytorch/test-infra/.github/actions/teardown-linux@main
|
||||||
|
if: always()
|
||||||
1
.github/actions/linux-test/action.yml
vendored
1
.github/actions/linux-test/action.yml
vendored
@ -167,7 +167,6 @@ runs:
|
|||||||
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
|
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
|
||||||
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
|
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
|
||||||
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
|
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
|
||||||
TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
|
|
||||||
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
|
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
|
||||||
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
|
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
|
||||||
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
|
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
|
||||||
|
|||||||
@ -17,7 +17,7 @@ inputs:
|
|||||||
runs:
|
runs:
|
||||||
using: composite
|
using: composite
|
||||||
steps:
|
steps:
|
||||||
- uses: nick-fields/retry@v3.0.0
|
- uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482
|
||||||
name: Setup dependencies
|
name: Setup dependencies
|
||||||
with:
|
with:
|
||||||
shell: bash
|
shell: bash
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user