Changes:
1. Switch `AotCodeCompiler` to new cpp_builder.
2. Only use `deprecated_cpp_compile_command` for `fb_code`, since I no longer have Meta-internal environment access and cannot debug that path.
3. Add `TODO` comments so that a Meta employee can continue this work.
4. Because of item 3, only the `fb_code` path still relies on `deprecated_cpp_compile_command`; remove `validate_new_cpp_commands` since it is no longer needed.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130127
Approved by: https://github.com/jgong5, https://github.com/jansel
This PR refactors placeholders in cudagraphs to be serializable. We define a new PlaceholderInfo object which only has the necessary parts of placeholders for logging/debugging, and use that instead of `torch.fx.Node` directly. This allows us to then save PlaceholderInfo into the FXGraphCache/AOTAutogradCache later.
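A minimal sketch of the kind of record this describes; the field names and helper below are hypothetical, not the actual PlaceholderInfo definition:
```python
from dataclasses import dataclass
from typing import Optional, Tuple


@dataclass(frozen=True)
class PlaceholderInfoSketch:
    # Hypothetical: only the plain, picklable pieces of a torch.fx.Node that
    # logging/debugging needs, so the record can live in a serialized cache.
    name: str
    stack_trace: Optional[str] = None
    user_names: Tuple[str, ...] = ()


def from_placeholder_node(node) -> PlaceholderInfoSketch:
    """Summarize a torch.fx placeholder Node into a picklable record."""
    return PlaceholderInfoSketch(
        name=node.name,
        stack_trace=node.meta.get("stack_trace"),
        user_names=tuple(u.name for u in node.users),
    )
```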
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130252
Approved by: https://github.com/eellison, https://github.com/masnesral
ghstack dependencies: #129384
Resubmit of #128979
`WeakDep`s force readers to have completed before a mutation overwrites the
buffer, but we want to allow fusions to occur for inplace mutations where the
same index is read and written.
Currently this is achieved by:
1. Identifying the buffers used by the mutating op in its `dep_closure`
2. Not creating `WeakDep`s for buffers in the `dep_closure`
3. Fixing up any bad fusions that might occur by an extra check in `can_fuse_vertical`
So we are first over-aggressive in removing `WeakDep`s, then add an ad-hoc fixup.
This PR instead emits all `WeakDep`s and adds a `fusable_weak_dep` check to
`can_fuse_vertical` which selectively allows inplace operations to fuse.
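A self-contained toy sketch of the new policy, not the actual scheduler code: every `WeakDep` is emitted, and fusion is only allowed when each weak dep is individually judged fusable (here, an in-place op reading and writing the same buffer):
```python
from dataclasses import dataclass


@dataclass(frozen=True)
class WeakDep:
    name: str  # buffer whose readers must finish before the mutation


def fusable_weak_dep(dep: WeakDep, reads: set, writes: set) -> bool:
    # Toy rule: if the mutating op reads and writes the same buffer (an
    # in-place update of the same index), the weak dep need not block fusion.
    return dep.name in reads and dep.name in writes


def can_fuse_vertical(weak_deps, reads: set, writes: set) -> bool:
    # All weak deps are emitted up front; fusion is allowed only if every one
    # of them is individually fusable.
    return all(fusable_weak_dep(d, reads, writes) for d in weak_deps)


# Example: buf0 += 1 reads and writes buf0, so its weak dep on buf0 is fusable.
assert can_fuse_vertical([WeakDep("buf0")], reads={"buf0"}, writes={"buf0"})
```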
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130835
Approved by: https://github.com/lezcano
Moves cudagraphs stuff into a post_compile function that I can later call when loading from AOTAutogradCache. On a cache hit, we only need to save any reasons for disabling cudagraphs along with some metadata needed to run cudagraphify. The arguments to cudagraphs_post_compile should be the set of parameters I'll need to reconstruct on a warm start.
No actual behavioral change should result from this: I'm moving the behavior into separate functions, but every operation should be the same pre and post PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129384
Approved by: https://github.com/eellison
https://github.com/pytorch/pytorch/issues/127561
Mutations of inputs in backward are emitted manually, after joint_fn tracing.
With the default partitioner logic they would be moved to the "forward" graph, as they are operations on forward inputs.
To keep those mutations in backward:
- Introduce a "subgraph" node key that can be specified with a contextmanager. When we do a manual `copy_` in backward on a forward input, we know that this is for backward, so we set subgraph="backward".
In the partitioner:
Introduce an optional `subgraph` argument to filter out nodes whose tagged subgraph (node_subgraph) differs, so that they are not added to a subgraph other than their own.
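A hedged, self-contained sketch of the mechanism (the names `set_subgraph`, `tag_node`, and `belongs_to` are illustrative, not the actual API):
```python
import contextlib

_current_subgraph = None  # e.g. None, "forward", or "backward"


@contextlib.contextmanager
def set_subgraph(name):
    # Nodes created inside this context are tagged with the subgraph key,
    # e.g. the manual copy_ emitted for a forward-input mutation in backward.
    global _current_subgraph
    prev, _current_subgraph = _current_subgraph, name
    try:
        yield
    finally:
        _current_subgraph = prev


def tag_node(meta: dict) -> None:
    # Called wherever a node's meta dict is populated during tracing.
    if _current_subgraph is not None:
        meta["subgraph"] = _current_subgraph


def belongs_to(meta: dict, subgraph: str) -> bool:
    # Partitioner-side filter: untagged nodes go wherever the default logic
    # puts them; tagged nodes are only admitted to their own subgraph.
    tag = meta.get("subgraph")
    return tag is None or tag == subgraph
```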
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129130
Approved by: https://github.com/Chillee
Summary:
This PR fixes all the places in the strict export stack where the output node's meta is not preserved correctly. However, we're getting a new error for the test we intend to fix: `buck2 run caffe2/test/quantization:test_quantization -- -r "test_re_export_preserve_handle"`:
The `get_attr` nodes have wrong metadata. More things likely need to be fixed to get it working, but that is beyond the scope of this PR.
Test Plan: buck2 run caffe2/test/quantization:test_quantization -- -r "test_re_export_preserve_handle"
Differential Revision: D60198221
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131706
Approved by: https://github.com/yushangdi
In cases where the program takes in a constant, export will specialize on the constant and embed it into the graph, leaving a placeholder node with no users. However, Inductor errors out further down, since in torch.compile these constants typically don't show up as inputs. Since these constants are already embedded in the graph, we just ignore these inputs while compiling with AOTI, and filter out the non-tensor inputs at runtime.
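The runtime-side filtering amounts to something like the following sketch (the helper name is illustrative):
```python
import torch


def filter_aoti_runtime_inputs(flat_args):
    # Constants were specialized into the graph at export time, so the unused
    # non-tensor inputs are simply dropped before calling the AOTI-compiled model.
    return [a for a in flat_args if isinstance(a, torch.Tensor)]


# e.g. filter_aoti_runtime_inputs([torch.randn(2, 2), 3, "mode"]) keeps only the tensor.
```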
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131594
Approved by: https://github.com/desertfire
Fixes https://github.com/pytorch/pytorch/issues/103602.
This PR implements the idea of "if someone creates a string and then ends up not using it, we would prefer to NOT have specialized" mentioned in the above issue. Specifically, we create a lazy variable tracker instead of a ConstantVariable when we're in FORMAT_VALUE, and when the lazy variable tracker is realized (i.e. it's going to be used), we create a ConstantVariable, so the specialization/guarding happens at the time of realization.
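A toy illustration of the idea outside of Dynamo's VariableTracker machinery: the specialization/guarding cost is only paid if the formatted value is ever realized.
```python
class LazyFormattedValue:
    """Toy stand-in for a lazy variable tracker created at FORMAT_VALUE."""

    def __init__(self, make_value, specialize):
        self._make_value = make_value    # builds the string
        self._specialize = specialize    # installs guards / specializes
        self._value = None
        self._is_realized = False

    def realize(self):
        # Only when the value is actually used do we build it and specialize.
        if not self._is_realized:
            self._value = self._make_value()
            self._specialize(self._value)
            self._is_realized = True
        return self._value


# If the formatted string is never used, realize() is never called and no
# guard or specialization is ever installed.
guards = []
lazy = LazyFormattedValue(lambda: f"{3.14:.2f}", guards.append)
assert guards == []          # nothing happened yet
assert lazy.realize() == "3.14"
assert guards == ["3.14"]    # specialization happened only on use
```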
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131529
Approved by: https://github.com/ezyang
Summary: Instead of embedding the user_defined TraceEntry inside of device_traces, which causes issues when some threads may not have the proper device id set, save them into an external_annotations field by using a RingBuffer<AnnotationEntry> called annotation_buffer owned by the NativeCachingAllocator.
Test Plan: CI, resnet run, and FBR model.
Differential Revision: D59703213
Pulled By: aaronenyeshi
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130964
Approved by: https://github.com/zdevito
This PR provides the initial support for k-slicing (i.e. parallel reduction along k-dim) of CPP GEMM template. Only static shapes are supported now. When k-slicing is enabled, there would be extra temporary buffers allocated to hold the intermediate results and an extra barrier after initial GEMM compute by each thread, i.e. each thread first stores the GEMM result to temporary accumulation buffers (pointed by `local_buf_ptrs` which is an array of pointers pointing to accumulation buffers), followed by a reduction along k-slices, epilogue computes and store to the final output `Y`. In each k-slicing thread group, the reduction along k-slices and epilogue computes are conducted in parallel along M-dim. The algorithm is designed to reduce the synchronization overhead as much as possible.
The k-slicing is enabled when blocking on M and N is unable to occupy all threads. Since k-slicing doesn't always bring benefit, an extra configuration is added to enable it (disable by default). We need to identify a good heuristics in the future to enable k-slicing by default.
Performance numbers for 64x4096x64, 64x10000x64, and 64x20000x64 on a 60-core SPR machine are shown below. As you can see, the perf of k-slicing is only better than non-k-slicing when K is large enough.
Without k-slicing:
```
AUTOTUNE linear_unary(64x4096, 64x4096, 64)
  cpp_packed_gemm_0 0.0108 ms 100.0%
  _linear_pointwise 0.0431 ms 25.1%
AUTOTUNE linear_unary(64x10000, 64x10000, 64)
  cpp_packed_gemm_0 0.0272 ms 100.0%
  _linear_pointwise 0.0892 ms 30.5%
AUTOTUNE linear_unary(64x20000, 64x20000, 64)
  cpp_packed_gemm_0 0.0781 ms 100.0%
  _linear_pointwise 0.1693 ms 46.1%
```
With k-slicing:
```
AUTOTUNE linear_unary(64x4096, 64x4096, 64)
  cpp_packed_gemm_0 0.0260 ms 100.0%
  _linear_pointwise 0.0444 ms 58.5%
AUTOTUNE linear_unary(64x10000, 64x10000, 64)
  cpp_packed_gemm_0 0.0275 ms 100.0%
  _linear_pointwise 0.0893 ms 30.8%
AUTOTUNE linear_unary(64x20000, 64x20000, 64)
  cpp_packed_gemm_0 0.0284 ms 100.0%
  _linear_pointwise 0.1686 ms 16.8%
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130821
Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel
ghstack dependencies: #131024
#109581
At this point, the vanilla implementation (the default) is good.
Docs: https://docs-preview.pytorch.org/pytorch/pytorch/129905/generated/torch.optim.Adafactor.html#torch.optim.Adafactor
Specifically, the impl in this PR, which attempts to replicate the paper,
```
optim = torch.optim.Adafactor([weight])
```
is close enough to https://pytorch-optimizers.readthedocs.io/en/latest/optimizer/#pytorch_optimizer.AdaFactor
```
optim_c = AdaFactor([weight], betas=(0, 0.999), scale_parameter=False)
```
is close enough to https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/Adafactor
```
optim = keras.optimizers.Adafactor(learning_rate=0.01)
```
The three results respectively for the same randomly generated weights:
```
# ours
tensor([[ 0.3807594, -0.3912092],
[ 0.0762539, 0.5377805],
[ 0.2459473, 0.4662207]])
# pytorch-optimizer
tensor([[ 0.3807592, -0.3912172],
[ 0.0762507, 0.5377818],
[ 0.2459457, 0.4662213]])
# keras
array([[ 0.38076326, -0.39121315],
[ 0.0762547 , 0.5377859 ],
[ 0.24594972, 0.46622536]], dtype=float32)
```
This gives me confidence to move forward in speeding up the implementation now that a baseline has been established. If you're curious about differences:
* keras assigns step_size (rho_t in their code) to `min(lr, 1 / sqrt(step))` whereas the OG impl uses a hardcoded 0.01 instead of lr. We do the same thing as keras, but our lr default is 0.01.
* We differ from the pytorch-optimizers default in that our default will not track momentum (thus `beta1=0`) and we do not apply parameter scaling.
<details>
Keras collab: https://colab.research.google.com/drive/1i3xF8ChL7TWKJGV_5v_5nMhXKnYmQQ06?usp=sharing
My script repro:
```
import torch
from pytorch_optimizer import AdaFactor
torch.set_printoptions(precision=7)
weight = torch.tensor([[ 0.37697506, -0.39500135],
[ 0.07246649, 0.53399765],
[ 0.24216151, 0.46243715]], dtype=torch.float32)
# bias = torch.tensor([0, 0], dtype=torch.float32)
weight.grad = torch.tensor([[-0.5940447, -0.7743838],
[-0.5940447, -0.7743838],
[-0.5940447, -0.7743838]], dtype=torch.float32)
# bias.grad = torch.tensor([-2.5027974, 1.5422692], dtype=torch.float32)
weight_c = weight.clone()
weight_c.grad = weight.grad.clone()
optim = torch.optim.Adafactor([weight])
optim.step()
print(weight)
optim_c = AdaFactor([weight_c], betas=(0, 0.999), scale_parameter=False)
optim_c.step()
print(weight_c)
```
</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129905
Approved by: https://github.com/albanD
Extension of the condition allowing the cpu scalar to be moved to specific devices.
This fixes an HPU specific error:
```
torch._dynamo.exc.BackendCompilerFailed: backend='aot_hpu_training_backend' raised:
RuntimeError: Expected `value` to be on same device as `a` While executing %masked_fill : [num_users=1] = call_method[target=masked_fill](args = (%matmul, %expand_as, %tensor), kwargs = {})
```
On the HPU in eager mode the problem doesn't occur because the pytorch's implementation is not used then.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127871
Approved by: https://github.com/jgong5, https://github.com/ezyang
This fixes a couple errors that come up when multi-kernel is used with
split-scan.
1. The split-scan was being marked as a persistent kernel, which allowed
a multi-kernel to be created but this isn't supported. Fix is to
never mark split-scan as persistent.
2. Benchmark codegen was not handling WorkspaceArg, and would raise a
KeyError during codegen.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131044
Approved by: https://github.com/shunting314
Changes:
- Add `-C REPO` to the `git` commands so the tool can be run from anywhere, not only from the repo dir (see the sketch after this list)
- Use `pathlib.Path` as much as possible
- Replace `subprocess.run(..., check=True)` with `subprocess.check_{call,output}(...)`
- Add `encoding='utf-8'` when opening files
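A small sketch of the resulting patterns (the `REPO` path and file name are illustrative):
```python
import subprocess
from pathlib import Path

REPO = Path(__file__).resolve().parent  # illustrative repo root


def git_output(*args: str) -> str:
    # `git -C REPO` lets the tool run from any working directory.
    return subprocess.check_output(
        ["git", "-C", str(REPO), *args], encoding="utf-8"
    )


def read_text(relpath: str) -> str:
    # pathlib + explicit encoding instead of os.path + the default locale encoding.
    return (REPO / relpath).read_text(encoding="utf-8")
```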
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131134
Approved by: https://github.com/ezyang
------
As per the title, add argument `--locals` for `unittest` and `--showlocals --tb=long` for `pytest` in CI.
Some failures cannot be reproduced on the local machine but exist on cloud CI. This change allows us to investigate the test failure more easily.
Example output: https://github.com/pytorch/pytorch/actions/runs/9961546996/job/27523888353?pr=130710#step:20:3361
```text
/opt/conda/envs/py_3.8/lib/python3.8/site-packages/sympy/core/function.py:307:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
cls = FloorDiv, base = -1.00000000000000, divisor = -1.00000000000000
@classmethod
def eval(cls, base, divisor):
# python test/test_dynamic_shapes.py -k TestDimConstraints.test_dim_constraints_solve_full
# Assert triggered by inequality solver
# assert base.is_integer, base
# assert divisor.is_integer, divisor
# We don't provide the same error message as in Python because SymPy
# makes it difficult to check the types.
if divisor.is_zero:
raise ZeroDivisionError("division by zero")
if base in (int_oo, -int_oo, sympy.oo, -sympy.oo) and divisor in (
int_oo,
-int_oo,
sympy.oo,
-sympy.oo,
):
return sympy.nan
if base is sympy.nan or divisor is sympy.nan:
return sympy.nan
if base.is_zero:
return sympy.S.Zero
if base.is_integer and divisor == 1:
return base
if base.is_integer and divisor == -1:
return sympy.Mul(base, -1)
if (
isinstance(base, sympy.Number)
and isinstance(divisor, sympy.Number)
and (
base in (int_oo, -int_oo, sympy.oo, -sympy.oo)
or divisor in (int_oo, -int_oo, sympy.oo, -sympy.oo)
)
):
r = float(base) / float(divisor)
if r == math.inf:
return int_oo
elif r == -math.inf:
return -int_oo
elif math.isnan(r):
return sympy.nan
else:
return sympy.Integer(math.floor(r))
if isinstance(base, sympy.Integer) and isinstance(divisor, sympy.Integer):
return sympy.Integer(int(base) // int(divisor))
if isinstance(base, FloorDiv):
return FloorDiv(base.args[0], base.args[1] * divisor)
# Expands (x + y) // b into x // b + y // b.
# This only works if floor is an identity, i.e. x / b is an integer.
for term in sympy.Add.make_args(base):
quotient = term / divisor
if quotient.is_integer and isinstance(divisor, sympy.Integer):
# NB: this is correct even if the divisor is not an integer, but it
# creates rational expressions that cause problems with dynamic
# shapes.
return FloorDiv(base - term, divisor) + quotient
try:
gcd = sympy.gcd(base, divisor)
if gcd != 1:
> return FloorDiv(
sympy.simplify(base / gcd), sympy.simplify(divisor / gcd)
)
base = -1.00000000000000
cls = FloorDiv
divisor = -1.00000000000000
gcd = 1.00000000000000
quotient = 1.00000000000000
term = -1.00000000000000
/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/utils/_sympy/functions.py:159:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
args = (FloorDiv, -1.00000000000000, -1.00000000000000), kwargs = {}
@wraps(func)
def wrapper(*args, **kwargs):
try:
> retval = cfunc(*args, **kwargs)
E RecursionError: maximum recursion depth exceeded in comparison
E
E To execute this test, run the following from the base repo dir:
E python test/test_sympy_utils.py -k TestValueRanges.test_binary_ref_fn_floordiv_dtype_float
E
E This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
args = (FloorDiv, -1.00000000000000, -1.00000000000000)
cfunc = <functools._lru_cache_wrapper object at 0x7fc5303173a0>
func = <function Function.__new__ at 0x7fc530317280>
kwargs = {}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131151
Approved by: https://github.com/ezyang
**Summary**
I have added a new noise level between the existing levels of 1 and 2, such that the noise level controls are now:
0. prints module-level collective counts
1. prints DTensor operations not included in trivial operations (new noise level)
2. prints operations not included in trivial operations
3. prints all operations
This gives the user more flexibility in controlling what information they want to use. The noise levels are used both for creating the console/file log and the json dump. In the example file, I have changed the module_tracing examples to noise level 0 and have changed my transformer examples to show off the new noise level.
**Test Plan**
1. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e transformer_json_dump
2. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e transformer_operation_tracing
3. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e MLP_module_tracing
4. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e transformer_module_tracing
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131592
Approved by: https://github.com/XilunWu
ghstack dependencies: #131419, #130996
Summary:
When run internally in multiple parallel processes, the `test_debug_trace` hits the cache and skips writing all the expected outputs. Here we force-disable inductor cache to circumvent the problem. Ideally, we should switch to using a cleaner `fresh_inductor_cache` decorator approach, but it doesn't work at the moment.
Additionally, the debug trace dir is now generated by `tempfile.mkdtemp` to avoid a (rather unlikely) race condition.
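For reference, the race-free directory creation boils down to the following (the prefix is illustrative):
```python
import tempfile

# mkdtemp atomically creates a uniquely named directory, so parallel test
# processes can never race on (or share) the same debug trace directory.
debug_trace_dir = tempfile.mkdtemp(prefix="inductor_debug_trace_")
```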
Test Plan: Tested internally.
Differential Revision: D60207586
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131722
Approved by: https://github.com/eellison
Fix static `py::object`s with `py::gil_safe_call_once_and_store`.
The following code will leak a `py::object`, whose destructor runs at program shutdown. The destructor will call `Py_DECREF(obj.m_ptr)`, which may cause a segmentation fault.
```c++
void func() {
static py::object obj = py::module_::import("foo").attr("bar");
...
}
```
A workaround is to hold the object through a raw pointer (intentionally leaked) rather than a static instance.
```c++
void func() {
static py::object* obj_ptr = new py::object{py::module_::import("foo").attr("bar")};
py::object obj = *obj_ptr;
...
}
```
This PR uses the `py::gil_safe_call_once_and_store` function from `pybind11`, which can run arbitrary initialization code only once under the Python GIL thread safely.
```c++
void func() {
PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store<py::object> storage;
py::object obj = storage
.call_once_and_store_result(
[]() -> py::object {
return py::module_::import("foo").attr("bar");
}
)
.get_stored();
...
}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130341
Approved by: https://github.com/ezyang, https://github.com/malfet
Add a new option `--cuda` to `tools/nightly.py` to pull the nightly packages with CUDA support.
```bash
# installs pytorch-nightly with cpuonly
tools/nightly.py pull
# The following only available on Linux and Windows
# installs pytorch-nightly with latest CUDA we support
tools/nightly.py pull --cuda
# installs pytorch-nightly with CUDA 12.1
tools/nightly.py pull --cuda 12.1
```
Also add targets in `Makefile` and instructions in the contribution guidelines.
```bash
# setup conda environment with pytorch-nightly
make setup-env
# setup conda environment with pytorch-nightly with CUDA support
make setup-env-cuda
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131133
Approved by: https://github.com/ezyang
Summary: Internally, the ABI-compatible mode is [enabled by default](eb54ca7abe/torch/_inductor/config.py (L53)). As a result, when the `abi_compatible: False` flag is not specified explicitly in tests that assume the non-ABI-compatible C++ codegen, those tests fail internally. Here we fix one such test in `test_memory_planning.py`.
Test Plan: Tested internally.
Differential Revision: D60197327
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131703
Approved by: https://github.com/eellison
Bumps [setuptools](https://github.com/pypa/setuptools) from 69.5.1 to 70.0.0.
<details>
<summary>Changelog</summary>
<p><em>Sourced from <a href="https://github.com/pypa/setuptools/blob/main/NEWS.rst">setuptools's changelog</a>.</em></p>
<blockquote>
<h1>v70.0.0</h1>
<h2>Features</h2>
<ul>
<li>Emit a warning when <code>[tools.setuptools]</code> is present in <code>pyproject.toml</code> and will be ignored. -- by :user:<code>SnoopJ</code> (<a href="https://redirect.github.com/pypa/setuptools/issues/4150">#4150</a>)</li>
<li>Improved <code>AttributeError</code> error message if <code>pkg_resources.EntryPoint.require</code> is called without extras or distribution
Gracefully "do nothing" when trying to activate a <code>pkg_resources.Distribution</code> with a <code>None</code> location, rather than raising a <code>TypeError</code>
-- by :user:<code>Avasam</code> (<a href="https://redirect.github.com/pypa/setuptools/issues/4262">#4262</a>)</li>
<li>Typed the dynamically defined variables from <code>pkg_resources</code> -- by :user:<code>Avasam</code> (<a href="https://redirect.github.com/pypa/setuptools/issues/4267">#4267</a>)</li>
<li>Modernized and refactored VCS handling in package_index. (<a href="https://redirect.github.com/pypa/setuptools/issues/4332">#4332</a>)</li>
</ul>
<h2>Bugfixes</h2>
<ul>
<li>In install command, use super to call the superclass methods. Avoids race conditions when monkeypatching from _distutils_system_mod occurs late. (<a href="https://redirect.github.com/pypa/setuptools/issues/4136">#4136</a>)</li>
<li>Fix finder template for lenient editable installs of implicit nested namespaces
constructed by using <code>package_dir</code> to reorganise directory structure. (<a href="https://redirect.github.com/pypa/setuptools/issues/4278">#4278</a>)</li>
<li>Fix an error with <code>UnicodeDecodeError</code> handling in <code>pkg_resources</code> when trying to read files in UTF-8 with a fallback -- by :user:<code>Avasam</code> (<a href="https://redirect.github.com/pypa/setuptools/issues/4348">#4348</a>)</li>
</ul>
<h2>Improved Documentation</h2>
<ul>
<li>Uses RST substitution to put badges in 1 line. (<a href="https://redirect.github.com/pypa/setuptools/issues/4312">#4312</a>)</li>
</ul>
<h2>Deprecations and Removals</h2>
<ul>
<li>
<p>Further adoption of UTF-8 in <code>setuptools</code>.
This change regards mostly files produced and consumed during the build process
(e.g. metadata files, script wrappers, automatically updated config files, etc..)
Although precautions were taken to minimize disruptions, some edge cases might
be subject to backwards incompatibility.</p>
<p>Support for <code>"locale"</code> encoding is now <strong>deprecated</strong>. (<a href="https://redirect.github.com/pypa/setuptools/issues/4309">#4309</a>)</p>
</li>
<li>
<p>Remove <code>setuptools.convert_path</code> after long deprecation period.
This function was never defined by <code>setuptools</code> itself, but rather a
side-effect of an import for internal usage. (<a href="https://redirect.github.com/pypa/setuptools/issues/4322">#4322</a>)</p>
</li>
<li>
<p>Remove fallback for customisations of <code>distutils</code>' <code>build.sub_command</code> after long
deprecated period.
Users are advised to import <code>build</code> directly from <code>setuptools.command.build</code>. (<a href="https://redirect.github.com/pypa/setuptools/issues/4322">#4322</a>)</p>
</li>
<li>
<p>Removed <code>typing_extensions</code> from vendored dependencies -- by :user:<code>Avasam</code> (<a href="https://redirect.github.com/pypa/setuptools/issues/4324">#4324</a>)</p>
</li>
<li>
<p>Remove deprecated <code>setuptools.dep_util</code>.
The provided alternative is <code>setuptools.modified</code>. (<a href="https://redirect.github.com/pypa/setuptools/issues/4360">#4360</a>)</p>
</li>
</ul>
<!-- raw HTML omitted -->
</blockquote>
<p>... (truncated)</p>
</details>
<details>
<summary>Commits</summary>
<ul>
<li><a href="5cbf12a9b6"><code>5cbf12a</code></a> Workaround for release error in v70</li>
<li><a href="9c1bcc3417"><code>9c1bcc3</code></a> Bump version: 69.5.1 → 70.0.0</li>
<li><a href="4dc0c31644"><code>4dc0c31</code></a> Remove deprecated <code>setuptools.dep_util</code> (<a href="https://redirect.github.com/pypa/setuptools/issues/4360">#4360</a>)</li>
<li><a href="6c1ef5748d"><code>6c1ef57</code></a> Remove xfail now that test passes. Ref <a href="https://redirect.github.com/pypa/setuptools/issues/4371">#4371</a>.</li>
<li><a href="d14fa0162c"><code>d14fa01</code></a> Add all site-packages dirs when creating simulated environment for test_edita...</li>
<li><a href="6b7f7a18af"><code>6b7f7a1</code></a> Prevent <code>bin</code> folders to be taken as extern packages when vendoring (<a href="https://redirect.github.com/pypa/setuptools/issues/4370">#4370</a>)</li>
<li><a href="69141f69f8"><code>69141f6</code></a> Add doctest for vendorised bin folder</li>
<li><a href="2a53cc1200"><code>2a53cc1</code></a> Prevent 'bin' folders to be taken as extern packages</li>
<li><a href="720862807d"><code>7208628</code></a> Replace call to deprecated <code>validate_pyproject</code> command (<a href="https://redirect.github.com/pypa/setuptools/issues/4363">#4363</a>)</li>
<li><a href="96d681aa40"><code>96d681a</code></a> Remove call to deprecated validate_pyproject command</li>
<li>Additional commits viewable in <a href="https://github.com/pypa/setuptools/compare/v69.5.1...v70.0.0">compare view</a></li>
</ul>
</details>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130893
Approved by: https://github.com/kit1980
We are considering consolidating the data sources for logging and the flight recorder so that we don't build multiple paths for debugging information. Before we do any merging, we want to first ensure that the PG status is also included in the flight recorder. We can also leverage this information to validate our FR dumps. Because the dump is not synchronized, we might see some variance in it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131268
Approved by: https://github.com/shuqiangzhang
Summary:
When tunable ops load selected kernels from a CSV file, they validate the hipBLASLt version defined in hipblaslt-version.h.
This PR changes the validator to fetch the hipBLASLt version and revision from the hipBLASLt runtime instead of the header file, as in our environment we might roll out a new version of the runtime prior to updating the header file fleet-wide.
Test Plan:
Verified generated tunableops kernel selection has the correct hipblaslt version from runtime:
```
Validator,PT_VERSION,2.5.0
Validator,ROCBLAS_VERSION,4.0.0-72e57364-dirty
Validator,HIPBLASLT_VERSION,800-bf2c3184
Validator,ROCM_VERSION,6.0.0.0-12969-1544e39
Validator,GCN_ARCH_NAME,gfx942:sramecc+:xnack-
GemmTunableOp_BFloat16_TN,tn_8192_2_3584,Gemm_Hipblaslt_TN_572,0.0240676
GemmTunableOp_BFloat16_TN,tn_7168_2_8192,Gemm_Hipblaslt_TN_482,0.0359019
GemmTunableOp_BFloat16_TN,tn_8192_2_1024,Default,0.0173723
GemmTunableOp_BFloat16_TN,tn_1280_2_8192,Gemm_Hipblaslt_TN_491,0.0191047
```
Differential Revision: D59889043
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131078
Approved by: https://github.com/jeffdaily, https://github.com/xw285cornell
Summary:
When importing `_trace.py`, putting `torch._dynamo.exc.Unsupported` in the global variable `_ALLOW_LIST` can cause the import of `export/_trace.py` to fail with the error:
ValueError: Artifact name: 'graph_breaks' not registered, please call register_artifact('graph_breaks') in torch._logging.registrations.
The error is raised directly on the line `graph_breaks_log = torch._logging.getArtifactLogger(__name__, "graph_breaks")` in `_dynamo/exc.py`. I've checked that `register_artifact('graph_breaks')` does already exist in torch._logging.registrations.
Explicitly calling `import torch._logging` doesn't fix the issue.
(see T196719676)
We move `_ALLOW_LIST` to be a local variable.
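A sketch of the pattern (names hypothetical): build the allow list lazily inside the function that needs it, so importing `export/_trace.py` no longer imports `torch._dynamo` as a side effect.
```python
def _get_allow_list():
    # Imported lazily, only when an exception actually needs classifying, so
    # importing _trace.py itself has no torch._dynamo side effects.
    import torch._dynamo.exc as dynamo_exc
    return (dynamo_exc.Unsupported,)


def _is_allowed(exc: BaseException) -> bool:
    return isinstance(exc, _get_allow_list())
```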
Test Plan:
buck2 test 'fbcode//mode/opt' fbcode//aiplatform/modelstore/publish/utils/tests:fc_transform_utils_test -- --exact 'aiplatform/modelstore/publish/utils/tests:fc_transform_utils_test - test_serialized_model_for_disagg_acc (aiplatform.modelstore.publish.utils.tests.fc_transform_utils_test.PrepareSerializedModelTest)'
buck2 test 'fbcode//mode/opt' fbcode//aiplatform/modelstore/publish/utils/tests:fc_transform_utils_test -- --exact 'aiplatform/modelstore/publish/utils/tests:fc_transform_utils_test - test_serialized_test_dsnn_module (aiplatform.modelstore.publish.utils.tests.fc_transform_utils_test.PrepareSerializedModelTest)'
Differential Revision: D60136706
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131523
Approved by: https://github.com/zhxchen17
Regression introduced by https://github.com/pytorch/pytorch/pull/126376
Before this change, compiling torch_cpu on my MacBook prints tons of warnings every time HooksInterface is included
```
In file included from /Users/nshulga/git/pytorch/pytorch/torch/csrc/api/src/optim/adamw.cpp:1:
In file included from /Users/nshulga/git/pytorch/pytorch/torch/csrc/api/include/torch/optim/adamw.h:3:
In file included from /Users/nshulga/git/pytorch/pytorch/torch/csrc/api/include/torch/nn/module.h:3:
In file included from /Users/nshulga/git/pytorch/pytorch/torch/csrc/api/include/torch/nn/modules/container/any_module_holder.h:3:
In file included from /Users/nshulga/git/pytorch/pytorch/torch/csrc/api/include/torch/nn/modules/container/any_value.h:3:
In file included from /Users/nshulga/git/pytorch/pytorch/torch/csrc/api/include/torch/detail/static.h:4:
In file included from /Users/nshulga/git/pytorch/pytorch/torch/csrc/api/include/torch/types.h:3:
In file included from /Users/nshulga/git/pytorch/pytorch/aten/src/ATen/ATen.h:7:
In file included from /Users/nshulga/git/pytorch/pytorch/aten/src/ATen/Context.h:13:
/Users/nshulga/git/pytorch/pytorch/aten/src/ATen/detail/HIPHooksInterface.h:27:11: warning: '~HIPHooksInterface' overrides a destructor but is not marked 'override' [-Winconsistent-missing-destructor-override]
virtual ~HIPHooksInterface() = default;
^
/Users/nshulga/git/pytorch/pytorch/aten/src/ATen/detail/AcceleratorHooksInterface.h:16:11: note: overridden virtual function is here
virtual ~AcceleratorHooksInterface() = default;
^
1 warning generated.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131204
Approved by: https://github.com/albanD, https://github.com/seemethere
Summary:
We removed references to _export/exported_program.py in executorch
in D60052318. Now we can remove this file.
Update the pin to executorch.
Test Plan: contbuild & OSS CI:
Differential Revision: D60072980
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131597
Approved by: https://github.com/avikchaudhuri
- Add a `kwargs` option; add the `dynamic_shapes` option so users can supply it directly to `torch.export`.
- Make the options keyword-only arguments (bc-breaking)
- Deprecate the `training` and `operator_export_type` options and include a warning message. The exact time for removal is TBD but the message should discourage users from using the options.
- Deprecate two functions `exportable_ops` and pretty print export
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131501
Approved by: https://github.com/titaiwangms
`inductor` and `rocm` workflows are the major contributors to the CI load on ROCm CI at the moment, resulting in huge backlogs: https://github.com/pytorch/pytorch/pull/131489#issue-2425804464
* Move rocm.yml to cron frequency
* Move ROCm CI jobs from inductor.yml to inductor-rocm.yml
* Introduce `ciflow/inductor-rocm` as PR label to manually invoke inductor jobs for ROCm (no automatic invoking to limit CI load)
* After this PR, only `trunk` workflow jobs for ROCm will run on every commit and PR merge, but since they take 45min*3 time on average, I decided to leave them as-is since it will provide us some basic insulation against ROCm breakage.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131637
Approved by: https://github.com/clee2000, https://github.com/atalman, https://github.com/huydhn
Summary: Since the WaitCounter frontend itself has minimal dependencies, it's fine to move it into c10. Specific backends can be registered/linked separately.
Test Plan: unit test
Reviewed By: jamesperng, asiab4, c-p-i-o
Differential Revision: D59842868
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131021
Approved by: https://github.com/asiab4
The problem was we were shoving SymInts into the constant_args side
table. The root problem is that torch.fx.node.base_types, which we use
to determine what can be put in the graph, doesn't actually have SymInt
in it. This PR fixes base_types to include SymInt.
Test Plan:
- tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131363
Approved by: https://github.com/oulgen, https://github.com/justinchuby
This PR adds an API `FSDPModule.set_reduce_scatter_divide_factor` to allow setting a custom gradient divide factor for reduce-scatter. This can be useful when using parallelisms in combination with FSDP (e.g. expert parallelism), where gradients need to be divided by a custom factor (e.g. an extra `EP` factor).
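A hedged usage sketch, assuming `model`'s root module has already been sharded with FSDP2's `fully_shard` (so it is an `FSDPModule`); the factor value is illustrative:
```python
# Gradients reduce-scattered by FSDP get divided by a custom factor, e.g. the
# data-parallel size times an extra expert-parallel (EP) factor.
fsdp_size, ep_size = 16, 4  # illustrative parallelism degrees
model.set_reduce_scatter_divide_factor(float(fsdp_size * ep_size))
```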
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129286
Approved by: https://github.com/weifengpy
This PR implements an opt-in configuration option for synchronizing compilation across all ranks at the end of Dynamo tracing (and potentially, other places in the future). There are two pieces to this PR:
1. Implementing infrastructure for compiler collectives (DistributedState/LocalState, the actual collective)
2. Using this infrastructure to synchronize automatic dynamic choices across all ranks
The infrastructure in part one can be used for other purposes, just add more (serializable) fields to LocalState.
Here is how automatic dynamic synchronization works:
1. Preflight in "torch/_dynamo/variables/builder.py": On the first Dynamo trace run, we trace without automatic dynamic at all; we assume all Tensor inputs that are not otherwise marked are static. This run is purely to collect all Tensor input sizes in the program.
2. torch/_dynamo/output_graph.py: At the end of the first Dynamo trace run, we perform a compiler collective to distribute all Tensor input sizes to all ranks. Then, we restart Dynamo
3. Apply the updates in "torch/_dynamo/variables/builder.py": Now that we have all sizes for every rank, we now update frame state with the observed sizes for all ranks, in rank order. Under the assumption that frame state is consistent on all ranks, this series of updates will preserve consistency.
For future work, it would be safer if we force a consistent hint on all ranks; this is more involved as we have to interpose in fakification.
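A hedged sketch of opting in; the exact name of the config knob is my assumption from this description and may differ:
```python
import torch
import torch._dynamo.config as dynamo_config

# Assumed opt-in flag for synchronizing compilation decisions (e.g. automatic
# dynamic choices) across ranks; check torch._dynamo.config for the real knob.
dynamo_config.enable_compiler_collectives = True


@torch.compile
def step(x):
    return x * 2
```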
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130935
Approved by: https://github.com/jansel
High level goals:
- Cover the all-gather and reduce-scatter pattern matchers with unit tests
- Make it easier to exclude certain collectives as async-tp candidates
- Make it easier to match other all-gather and reduce-scatter variants (e.g. fp8 collectives)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131409
Approved by: https://github.com/weifengpy
Resubmit of #129325
Previously each mutation was represented by a `MutationOutput` operation which
was a new scheduler node that must be scheduled immediately afterwards.
Now we have a single scheduler node, which produces multiple `MutationOutput` buffers as its output.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130832
Approved by: https://github.com/lezcano
This should prevent regressions like the ones fixed by https://github.com/pytorch/pytorch/pull/131204
- Remove global `-Wno-error=inconsistent-missing-override`
- Wrap offending includes (protobuf and asmjit) with `C10_DIAGNOSTIC_PUSH_AND_IGNORE` and `C10_DIAGNOSTIC_POP_AND_IGNORED`
- Add `override` keyword to `at::namespace::tunable::StreamTimer` and `LLVMCodeGenImpl`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131524
Approved by: https://github.com/atalman
Add example `NestedTensor`s with inner dimension of size `1` to `_get_example_tensor_lists` with `include_inner_dim_size_1=True`. This diff creates `NestedTensor`s of sizes `(B, *, 1)` and `(B, *, 5, 1)`, ensuring that the current implementations of jagged reductions for `sum` and `mean` hold for tensors of effective shape `(B, *)` and `(B, *, 5)`.
Differential Revision: [D59846023](https://our.internmc.facebook.com/intern/diff/D59846023/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131516
Approved by: https://github.com/davidberard98
Summary: We currently don't support some of the `@triton.autotune` arguments when compiling user-written Triton kernels with PT2. In this PR, we're adding a flag to circumvent that, to unblock internal compilation in some cases. The flag's documentation explains why setting it is not a good idea.
Test Plan:
```
python test/inductor/test_triton_kernels.py -k test_triton_kernel_
autotune_with_unsupported_args
...
----------------------------------------------------------------------
Ran 3 tests in 3.636s
OK
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131431
Approved by: https://github.com/oulgen, https://github.com/zou3519
The issue addressed is that compiled autograd changes the calling convention of the FX graph to only have a single placeholder which contains a list of inputs. In this case, the meta of the tensor input nodes don't contain the `tensor_dict` meta. This adds them.
The context is that `tensor_dict` is used to convey if a tensor is an input with a static address.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131556
Approved by: https://github.com/anijain2305
In gen.py, the code for generating CompositeViewCopyKernels.cpp includes *_native.h headers for "view_groups" but not "structured_native_functions". However, this results in the TORCH_API in the headers being ineffective and prevents such functions from being used outside libtorch_cpu.so.
This patch ensures that gen.py includes the native headers for "structured_native_functions" in the same way as for "view_groups".
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131208
Approved by: https://github.com/bdhirsh
Summary: In the script for testing different families of models, when the conversion fails, we switch to using output from the explain function to provide more meaningful information.
Test Plan:
Manual testing with attached log information.
```
buck2 run mode/dev-nosan sigmoid/inference/ts_migration:main -- --mode test_all --test_suites ads_merge --model_id 440779101
```
```
Processing 440779101_5455.predictor.disagg.gpu.merge
model_name: 440779101_5455.predictor.disagg.gpu.merge
has_ts_model: True
has_sample_inputs: True
ops_maybe_missing_meta: set()
ts_can_run: True
ts_run_exception: None
can_convert: False
convert_exception: Unsupported nodes are found in the following list:
0. prim::Loop [%14259 : int = prim::Loop(%14258, %1129, %1126), scope: torch.fx.graph_module.GraphModule:: # <torch_package_1>.caffe2/torch/fb/predictor/modules/tensors_to_device_module.py:100:19]
1. prim::Loop [%14326 : int = prim::Loop(%1115, %1129, %14259), scope: torch.fx.graph_module.GraphModule:: # <torch_package_1>.caffe2/torch/fb/predictor/modules/tensors_to_device_module.py:100:19]
ep_result_correct: None
ep_run_exception: None
can_package: None
package_exception: None
sigmoid_can_run: None
sigmoid_run_exception: None
sigmoid_result_correct: None
```
Reviewed By: SherlockNoMad
Differential Revision: D59971446
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131214
Approved by: https://github.com/angelayi
This PR improves the thread blocking heuristics to favor full occupancy as much as possible. Also, the "m x n" block size is made as squared as possible for better data reuse.
Take the shape M=20000, N=64, K=128 as an example, the original heuristics couldn't use up all the threads when the number of threads is large, say 60:
```
AUTOTUNE linear_unary(200000x128, 64x128, 64)
  _linear_pointwise 0.1010 ms 100.0%
  cpp_packed_gemm_0 0.8303 ms 12.2%
V0722 02:26:39.220660 302553 torch/_inductor/codegen/cpp_gemm_template.py:503] [0/0] Register blocking: GemmBlocking(block_m=32, block_n=32, block_k=32)
V0722 02:26:39.221042 302553 torch/_inductor/codegen/cpp_gemm_template.py:507] [0/0] Cache blocking: GemmBlocking(block_m=625, block_n=1, block_k=4)
V0722 02:26:39.221118 302553 torch/_inductor/codegen/cpp_gemm_template.py:509] [0/0] Thread blocking: GemmBlocking(block_m=625, block_n=1, block_k=4)
V0722 02:26:39.221252 302553 torch/_inductor/codegen/cpp_gemm_template.py:526] [0/0] Number of threads: 60, occupancy: (10, 2, 1)
```
After this PR:
```
AUTOTUNE linear_unary(200000x128, 64x128, 64)
  _linear_pointwise 0.1143 ms 100.0%
  cpp_packed_gemm_0 0.1228 ms 93.1%
V0722 02:29:49.261794 304201 torch/_inductor/codegen/cpp_gemm_template.py:309] [0/0] Register blocking: GemmBlocking(block_m=32, block_n=32, block_k=32)
V0722 02:29:49.262860 304201 torch/_inductor/codegen/cpp_gemm_template.py:313] [0/0] Cache blocking: GemmBlocking(block_m=64, block_n=1, block_k=8)
V0722 02:29:49.262951 304201 torch/_inductor/codegen/cpp_gemm_template.py:315] [0/0] Thread blocking: GemmBlocking(block_m=69, block_n=79, block_k=8)
V0722 02:29:49.263075 304201 torch/_inductor/codegen/cpp_gemm_template.py:332] [0/0] Number of threads: 60, occupancy: (15, 4, 1)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131024
Approved by: https://github.com/leslie-fang-intel, https://github.com/chunyuan-w
**Summary**
While trying to integrate CommDebugMode with TorchTitan, I realized that the forward_hooks were being registered even though it was in the backward pass. After investigating, I realized that it was activation checkpointing that was causing this. In order to prevent users from being confused, I edited CommDebugMode so that it could differentiate between backward pass operations and activation checkpointing operations. I have also added an example case showing that CommDebugMode is able to successfully differentiate between the backward pass and activation checkpointing. The output for the example can be seen below.
**Test Case**
torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e activation_checkpointing
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130996
Approved by: https://github.com/XilunWu
ghstack dependencies: #131419
**Summary**
I switched the module tracker I had been inheriting from: instead of PyTorch's all-purpose one, I now use the one written by Sanket in the distributed tools folder. I did this because the original one messed up activation checkpointing by adding itself to the parent set in the backward_pre_hook and then again in the forward_pre_hook for the activation checkpointing.
**Test Case**
pytest test/distributed/_tensor/debug/test_comm_mode_features.py
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131419
Approved by: https://github.com/XilunWu
```python
# NOTE [low-contention collectives]
# When a collective is overlapped with abundant compute, it makes sense to
# prioritize reducing the contention between the collective and the overlapped
# compute, even at the cost of a slightly slower collective.
#
# Common collective implementations (e.g., NCCL without user buffer
# registration) optimize for throughput with no ambient compute. However, such
# implementations may not be optimal when they are overlapped with compute:
# - These implementations typically fuse the entire collective into a single
# kernel and reserve SM resources based on the most demanding portion of the
# collective, even when a large portion of the collective does not require this
# much resource.
# - These implementations often use SM-based P2P copy as opposed to copy
# engine-based P2P copy. Copy engine-based P2P copy may not have a significant
# advantage when there's no ambient compute. However, it may significantly
# improve overall resource utilization in the presence of ambient compute.
#
# When overlapped with intensive compute (e.g., persistent matmul kernels), the
# SM-usage of a collective can lead to inefficient overlapping.
#
# Low-contention collectives achieve their goals with the following strategies:
# - Use copy engine-based copy whenever possible.
# - Break down portions of a collective with different resource requirements
# into multiple kernels. This improves the overlapping efficiency at the cost
# of additional launching overhead.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130583
Approved by: https://github.com/weifengpy
The test fails internally [T195592444](https://www.internalfb.com/intern/tasks/?t=195592444) (Meta-internal link), but we don't see the failure in OSS.
It turns out that there are 2 issues:
1. `run_test('cuda')` is improperly handled since it tries to import a module named 'cuda' if cuda is available. Since the import fails, all tests in the file are skipped. This hides the failure in OSS. The failure is exposed in internal tests since the main block which runs `run_test('cuda')` is skipped sometimes.
2. fix the real issue that incompatible inputs are provided to `do_bench`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131402
Approved by: https://github.com/eellison
Summary:
Remove operator_benchmark caffe2 build due to the removal of caffe2: 2fd75667b4
Plus, we are deleting the TARGETS file from broken benchmarks that we do not intend to maintain.
Test Plan: Sandcastle CI
Differential Revision: D60086216
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131460
Approved by: https://github.com/vmpuri
Summary:
Modify the existing `mean` operator in PyTorch, invoked by `torch.mean`, to allow for reductions along the jagged dimension of a nested tensor. The function originally had a basic implementation for reducing along 1 non-ragged dimension. This diff enables PyTorch users to invoke `torch.mean` on a nested tensor when reducing along the ragged dimension, e.g. `*` in a `(B, *, M)` nested tensor.
Parametrize unit tests from `sum` to verify the accuracy of the ragged reduction implementation for `torch.mean`. Add unit tests and parametrize `sum` unit tests to verify error handling for unsupported features in `NestedTensor` `torch.mean`.
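A hedged usage sketch of the new capability (jagged layout, reduction over the ragged dimension); the exact dim/keepdim handling may differ from this sketch:
```python
import torch

# A (B, *, M) nested tensor with ragged dim *: batch of 2, lengths 3 and 5.
nt = torch.nested.nested_tensor(
    [torch.randn(3, 4), torch.randn(5, 4)],
    layout=torch.jagged,
)

# Reduce along the ragged dimension; each batch element averages over its own
# length, giving a dense (2, 4) result.
means = nt.mean(dim=1)
```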
Test Plan:
Verify that the new unit test passes via the following command:
```
buck2 run mode/{opt,inplace} //caffe2/test:nested -- --regex test_mean
```
```
buck2 run mode/{opt,inplace} //caffe2/test:nested -- --regex test_jagged_op
```
Differential Revision: D59654668
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131132
Approved by: https://github.com/davidberard98, https://github.com/jbschlosser
# Summary
While debugging CI failures for flash_attention tests I stumbled across 2 IMAs for the split-kv variant of flash attention.
1. Illegal global memory writes during the writing of softmax_lse_accum. This was pinpointed to the temporary lifetime of the out_accum and softmax_lse_accum tensors. These were likely getting their refcount dropped **before** the kernel launch that used them, allowing their memory to potentially be reused for other allocations.
2. After debugging this, there were illegal writes in the combine kernel. I was able to pinpoint this to the write of the reduced LSE. From my understanding it was assuming that kBlockM evenly divided the global number of rows and wasn't masking out these writes.
### History
My line of thinking for this:
We create the temporary split accum + LSE stats tensors to store the data for each split. We then launch a follow up kernel to do the reduction.
Under ordinary non-roofline memory usage, the CUDA caching allocator will keep these allocations alive even though the tensors were created within a temporary scope and no longer have any live references.
On CI we often run near max memory usage. We change/add tests and suddenly we get close to the OOM threshold. The memory allocator will reap these segments and we get write-after-free errors.
After that fix I did get further past the splitkv_flash kernel and then got the following error:
``` Shell
❯ TORCH_DISABLE_ADDR2LINE=1 PYTORCH_NO_CUDA_MEMORY_CACHING=1 compute-sanitizer --show-backtrace=device --tool memcheck --log-file ima.txt python ima.py
softmax_lseaccum_ptr =0x7f5ebb208a00
oaccum_ptr =0x7f5ebb208c00
softmax_lse_ptr = 0x7f5ebb208800
❯
❯ head ima.txt -n 10
========= COMPUTE-SANITIZER
========= Invalid __global__ write of size 4 bytes
========= at void pytorch_flash::flash_fwd_splitkv_combine_kernel<pytorch_flash::Flash_fwd_kernel_traits<(int)32, (int)64, (int)256, (int)4, (bool)0, (bool)0, cutlass::bfloat16_t, pytorch_flash::Flash_kernel_traits<(int)32, (int)64, (int)256, (int)4, cutlass::bfloat16_t>>, (int)16, (int)1, (bool)1>(pytorch_flash::Flash_fwd_params)+0x630
========= by thread (2,0,0) in block (0,0,0)
========= Address 0x7f5ebb208804 is out of bounds
========= and is 1 bytes after the nearest allocation at 0x7f5ebb208800 of size 4 bytes
```
Okay, I looked at the address and it looks like we are writing consecutive bytes past the softmax_lse_ptr in the combine function. After padding out the softmax_lse to q_padded, there were no more illegal memory errors on my repro:
```
========= COMPUTE-SANITIZER
========= ERROR SUMMARY: 0 errors
```
Fixes https://github.com/pytorch/pytorch/issues/131240
Fixes https://github.com/pytorch/pytorch/issues/131227
Fixes https://github.com/pytorch/pytorch/issues/131221
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131277
Approved by: https://github.com/malfet
Summary:
Previously it was unclear what `_convert_input_to_fake` actually does (used in strict), and in particular how it is different from `make_fake_inputs` (used in non-strict).
This PR splits that function to work purely on user inputs, then renames it to `extract_fake_inputs` and adds a comment clarifying what it does—namely, it extracts fake inputs from a given graph module instead of "converting inputs to fake inputs" (as suggested by the current name) or "making fake inputs" (as happens in non-strict, where no tracing has taken place yet).
The remainder of that function used to also fakify params and buffers. It turns out that this part is identical to what happens in non-strict, hence we also pull `make_fake_inputs` out from `non_strict_utils` into `_trace`, merge it with another util, and make both modes call it.
Test Plan: existing tests
Differential Revision: D60084442
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131421
Approved by: https://github.com/zhxchen17
Summary:
Newer versions of the MKL library return `SPARSE_STATUS_INVALID_VALUE` when badly formed non-triangular matrices are passed to the `mkl_sparse_?_trsv`/`mkl_sparse_?_mrsv` functions. This would start aborting (badly written) tests that worked with the old version which just filled the result tensor with `-NaN` instead of returning an error status.
This changes the code to fill the result tensor with `-NaN` on `SPARSE_STATUS_INVALID_VALUE` so we get the same behavior regardless of the MKL version in use.
Test Plan: `buck2 test 'fbcode//mode/opt' fbcode//caffe2/test:sparse -- --run-disabled`
Differential Revision: D59542023
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130382
Approved by: https://github.com/malfet
Summary:
- Log export errors to Scuba and mark them with "classified" and "unclassified"
- Classify errors by exception type (ALLOW_LIST) and a `case_name` attribute
- Add `case_name` for some exceptions.
Test Plan:
Running the code below logs a classified error to `torch_export_usage` table in Scuba.
```
import torch
from torch._export.db.case import SupportLevel
class TorchSymMin(torch.nn.Module):
"""
torch.sym_min operator is not supported in export.
"""
def forward(self, x):
return x.sum() + torch.sym_min(x.size(0), 100)
example_args = (torch.randn(3, 2),)
tags = {"torch.operator"}
support_level = SupportLevel.NOT_SUPPORTED_YET
model = TorchSymMin()
torch.export.export(model, example_args)
```
Differential Revision: D59981459
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131327
Approved by: https://github.com/zhxchen17
Fixes #130284, fixes #130653
- Add `torch.library.register_vmap` to custom ops
- Add `register_vmap` for operators in ops in custom_op_db.
- Make `torch.autograd.Function` support kwarg-only kwargs for vmap
- test operators in op_db with `tests/test_vmap`.
- change `test_vmap` to allow custom `out_dim` and allow "None" in `out_dim` when testing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130589
Approved by: https://github.com/zou3519
Fixes #126338
## Issue Summary
When torchinductor compiles the combination `functional_collective -> view.dtype -> wait`, a memory leak occurs. This happens because `view.dtype` is compiled into an out-of-place Triton kernel that copies the input data to a new tensor, even if the collective hasn't completed via the wait operation. The tensor used by the collective is only freed when the `wait` operation triggers the garbage collector, see [~WorkRegistry](https://github.com/pytorch/pytorch/blob/main/torch/csrc/distributed/c10d/Functional.cpp#L41). However, since `wait` now waits on a new tensor, the previous one is never freed. `view.dtype` should only change the metadata instead of creating a new tensor. The current lowering violates these semantics and causes memory leaks.
See more discussion in #126338.
This kind of lowering also generates unnecessary triton kernels for `view.dtype` when it can't be fused with other operations.
## Fix
The function `aten.view.dtype` is a CPU operation that changes the metadata of its input. After discussions with @eellison and @bdhirsh, we decided to change the lowering of `aten.view.dtype` to ensure it falls back properly to `aten.view.dtype` instead of generating a Triton kernel in some cases. This approach also preserves the semantics of the view operation.
When the model calls `aten.view.dtype` with a data type whose bit width matches the input's original data type, we lower it to the newly added `DtypeView` in IR, acting like a `ReinterpretView`. When the operation can be fused, its `make_loader` is called to maintain the correct type conversion for each load instruction. When the operation can't be fused, it falls back to `aten.view.dtype` to avoid Triton kernel generation.
## Example
```python
@torch.compile
def fn(x, y):
x = x.view(torch.float16)
y = y.view(torch.float16) + 1
return x @ y
x = torch.randn((2, 2), device=self.device, dtype=torch.bfloat16)
y = torch.randn((2, 2), device=self.device, dtype=torch.bfloat16)
fn(x, y)
```
The output code generated before this fix looks like the following.
```python
triton_poi_fused_add_view_0...
def triton_(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.bfloat16).to(tl.float32, bitcast=True).to(tl.float32)
tl.store(out_ptr0 + (x0), tmp1, xmask)
triton_poi_fused_add_view_1...
def triton_(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.bfloat16).to(tl.float32, bitcast=True).to(tl.float32)
tmp2 = 1.0
tmp3 = tmp1 + tmp2
tl.store(out_ptr0 + (x0), tmp3, xmask)
def call(args):
...
triton_poi_fused_view_0.run(arg0_1, buf0, 4, grid=grid(4), stream=stream0)
del arg0_1
buf1 = empty_strided_cuda((2, 2), (2, 1), torch.float16)
# Source Nodes: [view_1, y], Original ATen: [aten.add, aten.view]
triton_poi_fused_add_view_1.run(arg1_1, buf1, 4, grid=grid(4), stream=stream0)
del arg1_1
buf2 = empty_strided_cuda((2, 2), (2, 1), torch.float16)
# Source Nodes: [matmul, view_1, x, y], Original ATen: [aten.add, aten.mm, aten.view]
extern_kernels.mm(buf0, buf1, out=buf2)
```
As you can see, the two `view` operations are compiled to two kernels, `triton_poi_fused_view_0` and `triton_poi_fused_add_view_1`. Both of them have a line `tmp1 = tmp0.to(tl.bfloat16).to(tl.float32, bitcast=True).to(tl.float32)` which does the type conversion.
The main issue is that the first `view` operation doesn't do anything to the actual data, but it generates a Triton kernel with a new output tensor. Another small issue is that this Triton kernel can't be compiled because `bitcast=True` only supports type conversion with the same bit width.
The following is the output code generated after this PR.
```python
triton_poi_fused_add_0...
def triton_(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
xnumel = 4
xoffset = tl.program_id(0) * XBLOCK
xindex = xoffset + tl.arange(0, XBLOCK)[:]
xmask = xindex < xnumel
x0 = xindex
tmp0 = tl.load(in_ptr0 + (x0), xmask).to(tl.float32)
tmp1 = tmp0.to(tl.bfloat16).to(tl.float32)
tmp2 = 1.0
tmp3 = tmp1 + tmp2
tl.store(out_ptr0 + (x0), tmp3, xmask)
def call(args):
...
triton_poi_fused_add_0.run(arg1_1, buf0, 4, grid=grid(4), stream=stream0)
del arg1_1
buf1 = empty_strided_cuda((2, 2), (2, 1), torch.float16)
# Source Nodes: [matmul, y], Original ATen: [aten.add, aten.mm]
extern_kernels.mm(aten.view.dtype(arg0_1, torch.float16), buf0, out=buf1)
```
The first `view` operation has been replaced with `aten.view.dtype`, and its result is passed directly as an argument. The second one is still there because it is fused with the following add operation. The invalid bitcast operation is removed too.
The following two code snippets are for the upcasts and downcasts. For dtypes in `torch.float16, torch.bfloat16`, each load is upcast to float32 and then downcast to its original dtype to ensure values are used with the right precision.
7bda23ef84/torch/_inductor/codegen/triton.py (L1725-L1726)
7bda23ef84/torch/_inductor/codegen/triton.py (L629-L642)
Huge thanks to @eellison, @bdhirsh, @shunting314, and @desertfire .
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128883
Approved by: https://github.com/eellison
Migrate all pull jobs to the new Amazon 2023 AMI runner type.
Exceptions:
- Distributed tests are still on the old AMI since they had some weird [test failures](https://github.com/pytorch/pytorch/actions/runs/10047579686/job/27770963175). Will debug those separately.
- Ported over a couple trunk and slow jobs that had `sync-tag`s set with the pull jobs and so needed to be on the same AMI
Revert plan, in case something starts breaking when we run these new AMIs at a larger scale:
- If specific jobs start failing consistently, we bring those jobs back to the old AMI
- If the failure is more widespread, revert this PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131250
Approved by: https://github.com/malfet, https://github.com/atalman
Summary: `test/distributed/_composable/test_replicate_with_compiler.py` uses torch.compile. This change introduces a version of MultiProcessTestCase that derives from the inductor TestCase class to make sure we always get a clean cache dir.
Test Plan: `python test/distributed/_composable/test_replicate_with_compiler.py`
Differential Revision: [D59925519](https://our.internmc.facebook.com/intern/diff/D59925519)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131053
Approved by: https://github.com/eellison
Now that remote caching has evolved into various parts of PT2, we want to separate triton and pt2 caching as changes to one have caused SEVs to the other.
Differential Revision: D60047752
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131345
Approved by: https://github.com/aorenste
The problem was we were shoving SymInts into the constant_args side
table. The root problem is that torch.fx.node.base_types, which we use
to determine what can be put in the graph, doesn't actually have SymInt
in it. This PR fixes base_types to include SymInt.
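A quick check of the fix described above (on a build that includes this PR):
```python
import torch
from torch.fx.node import base_types

# After this PR, SymInt is part of base_types, so it counts as a type
# that can be placed in the graph rather than being shoved into the
# constant_args side table.
print(torch.SymInt in base_types)  # True
```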
Test Plan:
- tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131363
Approved by: https://github.com/oulgen
When pin_memory and share_memory are both set to True in _create_cpu_state_dict, the memory is pinned using cudaHostRegister but is never unpinned. So once a tensor is created and freed, and a new tensor is created, the caching allocator allocates the same memory. This fails with the error below.
```
obj = <[RuntimeError('CUDA error: part or all of the requested memory range is already mapped\nCUDA kernel errors might be a...pile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.\n') raised in repr()] Tensor object at 0x7f0028a4d6c0> pg = None, device = None, _ = None
```
This PR fixes that by attaching a hook that unregisters the memory when the tensor is freed.
This is easily reproducible with xlformers checkpointing unit tests and the fix is verified with the same.
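A rough sketch of the "unregister on free" idea described above; the hook mechanism (weakref.finalize) and the register flags here are assumptions, not the actual code in _create_cpu_state_dict:
```python
import weakref
import torch

def create_pinned_shared_tensor(shape):
    t = torch.empty(shape).share_memory_()
    cudart = torch.cuda.cudart()
    # Pin the shared-memory buffer so H2D copies can be async.
    cudart.cudaHostRegister(t.data_ptr(), t.numel() * t.element_size(), 0)
    # Undo the pinning when the tensor is garbage collected, so the caching
    # allocator can hand the same range out again without tripping CUDA.
    weakref.finalize(t, cudart.cudaHostUnregister, t.data_ptr())
    return t
```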
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131270
Approved by: https://github.com/LucasLLC
I regretted the decision in
https://github.com/pytorch/pytorch/pull/130606. Most user
torch_dispatchs don't have enough to actually handle the HOP correctly,
so for now I'd prefer that users explicitly define the interaction
between the HOP and their torch_dispatch class.
An example is FlopCounterMode: if we allow HOPs to get passed to it, it
will ignore auto_functionalized(mm) by default but it will record flops
for mm, which is weird.
Test Plan:
- tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131370
Approved by: https://github.com/ydwu4
Summary:
Inductor will aggressively try to decompose and lower ops into a smaller opset. However, sometimes it may not align with kernel coverage (or perf preference) on different backends. (e.g., Inductor will decompose Gelu into primitive ops, but certain backends already have a Gelu op.) Therefore, we need a mechanism to allow customization of the decomp for the trace function so that Inductor will simply pass this op through.
Reviewers: @eellison
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131329
Approved by: https://github.com/eellison
There were some miscellaneous issues I found:
* The WrapperCodeGen subclass constructors don't accept any arguments, which doesn't mesh with how Inductor can try to construct them.
* A DeviceInterface subclass for Triton doesn't implement `triton_supported() == True`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130933
Approved by: https://github.com/eellison, https://github.com/jansel
For some non-contiguous tensors, tensor.view would trigger the following
runtime error:
"RuntimeError: view size is not compatible with input tensor’s size and stride
(at least one dimension spans across two contiguous subspaces).
Use .reshape(…) instead"
So, let's use reshape instead.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131302
Approved by: https://github.com/muchulee8, https://github.com/desertfire
The bug causing the correctness problem will be fixed in a future OS release. The root cause of the problem is a bug in an optimization to the MPSGraph reshape operation in macOS 14.4 that results in a correctness issue with the shapes the LSTM gradient operation has when num_layers > 2.
Addresses the silent correctness issue #125803.
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130038
Approved by: https://github.com/malfet
Summary:
This diff reverts D59561509
D59561509: [FX][export] DCE pass, check schema for node impurity (#130395) by yushangdi causes the following test failure:
Tests affected:
- [cogwheel:cogwheel_mtia_cmf_m5_shrunk_test#test_flow_with_verification](https://www.internalfb.com/intern/test/844425041436985/)
Here's the Multisect link:
https://www.internalfb.com/multisect/6533402
Here are the tasks that are relevant to this breakage:
T191383430: 10+ tests unhealthy for ads_mtia_inference
The backout may land if someone accepts it.
If this diff has been generated in error, you can Commandeer and Abandon it.
Test Plan: NA
Differential Revision: D60029318
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131341
Approved by: https://github.com/angelayi
We have seen stacktrace samples showing that a lot of compilation time is spent in exceptions raised in `OpOverloadPacket.__getattr__`. It's not entirely clear why/how this happens, but I spot-checked a few places in `_inductor.graph.py` where we previously may have been calling `hasattr(OpOverloadPacket, ...)` and where that can be avoided (hasattr goes through getattr, which, for OpOverloadPacket, does a lookup in the dispatch table for all overload names of the packet).
Test Plan: CI
Differential Revision: D60048270
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131348
Approved by: https://github.com/davidberard98
SequenceParallel style assumes the input torch.Tensor is ALREADY sharded on the sequence dimension if a DTensor is not passed in. Since it causes some user confusion in the documentation, this PR:
1. For the case where the input passed in is already a DTensor, checks the input placements and redistributes if it's not sharded on the sequence dimension.
2. Updates the doc to make it more explicit about the cases where the user passes in a torch.Tensor vs. a DTensor.
This would fix https://github.com/pytorch/pytorch/issues/129355
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131346
Approved by: https://github.com/awgu
This PR re-implements pin memory aiming to get rid of the optional `device` argument and makes all related APIs to be device-agnostic. We add two new abstract APIs in [AcceleratorHooksInterface](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/detail/AcceleratorHooksInterface.h#L12) and redefine pin memory as: "Pin memory is always pinned for the current accelerator device". In detail, it uses [getAcceleratorHooksInterface](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/Context.h#L61) in pin_memory/is_pinned to get an appropriate device and invoke the corresponding overridden interfaces, instead of using BackendSelect and then dispatching to CUDA or other specific backends' implement methods.
Note: For new backends who want to implement and use pin memory, just inherit AcceleratorHooksInterface and overwrite the `isPinnedPtr` and `getPinnedMemoryAllocator` methods.
Additional context: To avoid BC-breaking, this PR just preserves the `device` arg of related APIs and would throw a deprecation warning if `device` arg is passed. Another PR will be submitted to update all PT callers (`Tensor.is_pinned()`, `Tensor.pin_memory()`...) not to pass this arg based on this PR. In future, `device` arg will be actually removed.
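A small usage sketch of the device-agnostic behavior described above (it requires an accelerator such as CUDA to be available; the deprecation comment reflects this PR's description):
```python
import torch

x = torch.randn(1024)
y = x.pin_memory()     # pins for the current accelerator device
print(y.is_pinned())   # True

# Passing a device is kept only for backward compatibility and, per this PR,
# now emits a deprecation warning:
# x.pin_memory(device="cuda")
```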
Relates #124908
Relates #14560
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126376
Approved by: https://github.com/albanD
This is an updated PR to equip cond with the autograd feature and replaces the old [PR](https://github.com/pytorch/pytorch/pull/126007)
@ydwu4 I tried to incorporate your requests already.
Currently there are two problems that I struggle with solving:
1. There seems to be an import issue when trying to import cond in `torch/__init__.py`, see [here](8a704035c9/torch/__init__.py (L1914-L1916)). Therefore, I had to comment out those lines, which resolved the import issues, but I believe cond is not properly exposed as torch.cond.
2. I am not entirely sure how to deal with the opinfo test in `hop_db.py`
Co-authored-by: Yidi Wu <yidi@meta.com>
Co-authored-by: Xuehai Pan <XuehaiPan@outlook.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126911
Approved by: https://github.com/ydwu4
test_public_bindings should be run on anything that changes the public API - we need to figure out in the future what is part of the public API; currently I'm using anything in torch/
flex_attention should be run on anything involving autograd
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130397
Approved by: https://github.com/malfet
Summary:
X-link: https://github.com/pytorch/benchmark/pull/2388
We can enable accuracy checks at Diff time since it is not a performance metric.
* Refactor the existing diff time test to use the new PT2 Benchmark Runner.
* Deprecate the speedup tests and enable the accuracy tests only. We rely on ServiceLab to perform performance testing and regression detection.
Test Plan:
Sandcastle CI
Or buck test command:
```
buck2 test 'fbcode//mode/opt' fbcode//pytorch/benchmark/fb/test_gpu:run_test_gpu -- test_training_resnet50_accuracy
```
Test UI: https://www.internalfb.com/intern/testinfra/testrun/1688850102375429
Reviewed By: oulgen
Differential Revision: D59825601
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131266
Approved by: https://github.com/oulgen
Summary:
It is a long-known pain point that, if other users are running things on the GPU, calling `torch.cuda.memory.list_gpu_processes()` will error out:
```
torch.cuda.memory.list_gpu_processes()
File "torch/cuda/memory.py", line 647, in list_gpu_processes
procs = amdsmi.amdsmi_get_gpu_process_list(handle) # type: ignore[attr-defined]
File "amdsmi/py_interface/amdsmi_interface.py", line 1946, in amdsmi_get_gpu_process_list
_check_res(
File "amdsmi/py_interface/amdsmi_interface.py", line 510, in _check_res
raise AmdSmiLibraryException(ret_code)
amdsmi.py_interface.amdsmi_exception.AmdSmiLibraryException: Error code:
10 | AMDSMI_STATUS_NO_PERM - Permission Denied
```
So just catch this error
Test Plan: torch.cuda.memory.list_gpu_processes() no longer fails
Differential Revision: D59901053
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131018
Approved by: https://github.com/eqy, https://github.com/clee2000
Also bold certain text in the error message as suggested
<img width="3000" alt="Screenshot 2024-07-19 at 5 56 48 PM" src="https://github.com/user-attachments/assets/378f20c5-c6b2-4e53-8eaf-0bd26c3a6b60">
With a GLOBAL like `os.execv` the error message is now as such
```python
File "/data/users/mg1998/pytorch/torch/serialization.py", line 1256, in load
raise pickle.UnpicklingError(_get_wo_message(str(e))) from None
_pickle.UnpicklingError: Weights only load failed. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
Trying to load unsupported GLOBAL posix.execv whose module posix is blocked.
Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131259
Approved by: https://github.com/malfet, https://github.com/albanD
Adds support for SymInts in the FakeTensor cache.
A couple notes:
1. When a SymInt is present in the input key for a FakeTensor operation we cache on the ShapeEnv instead of using the FakeTensorMode cache. This is necessary so we don't have to remember and check the guards. It reduces the cache hits but there's diminishing return on how much work we can do before the cache becomes more of a burden than a gain.
2. We need to be careful that when we cache an output SymInt that is a direct copy from the input that when we have a cache-hit we copy the SymNode from the input to the output. This is important because the fx-graph building code actually uses SymNode ids in the process of building the graph so constructing a same-content-but-different-id SymNode will fail.
3. In the cache key we store SymInts as a _PySymInputStub. These represent SymInt (and friends) but support `__hash__` and `__eq__` (which SymInt do not).
4. In the cache entry we store SymInts as a _SymIntOutputStub.
Perf example:
```
python benchmarks/dynamo/timm_models.py --ci --accuracy --timing
--explain --inductor --dynamic-shapes --dynamic-batch-only --device cuda
--training --amp --total-partitions 2 --partition-id 0 --output
/tmp/training_timm_models.csv --filter crossvit_9_240
```
fake tensor cache before:
```
INFO: FakeTensor cache stats:
INFO: cache_hits: 68137
INFO: cache_misses: 837
INFO: cache_bypasses:
INFO: symbolic shape: 48224
INFO: CompositeImplicitAutograd: 917
INFO: non-fake tensor: 70
INFO: non-FakeTensor output: 62
INFO: non-builtin: 8
INFO: dynamic output shape: 1
```
and after:
```
INFO: FakeTensor cache stats:
INFO: cache_hits: 88187
INFO: cache_misses: 14233
INFO: cache_bypasses:
INFO: CompositeImplicitAutograd: 1037
INFO: non-FakeTensor output: 602
INFO: non-fake tensor: 70
INFO: unsafe view: 36
INFO: non-builtin: 8
INFO: dynamic output shape: 1
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127596
Approved by: https://github.com/eellison
ghstack dependencies: #131014, #129780
This is part of #127596, pulled out to make reviewing a little easier.
Flatten the FakeTensor cache key - so it's a list of singular elements and pointing at one requires a single index rather than a PyTree path. This is used in the next PR to allow us to have the cache entry refer to an input SymInt that it needs to copy directly into the output.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129780
Approved by: https://github.com/oulgen, https://github.com/eellison
ghstack dependencies: #131014
Python 3.10 adds `@dataclass(slots=True)` to auto-build the `__slots__` for a dataclass. This is really useful but we can't use it until 3.10 becomes our minimum version.
Copied the code for that functionality from python into a new decorator and ported it to use 3.8 syntax (removed use of `match`).
Usage:
```
@dataclass_slots
@dataclass
class X:
pass
```
is the same as (in py3.10):
```
@dataclass(slots=True)
class X:
pass
```
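For reference, here is a rough sketch of what such a backport can look like; this is not the actual PyTorch helper, just the general idea of rebuilding the class with `__slots__`, ignoring details like inheritance and cached properties:
```python
import dataclasses

def dataclass_slots(cls):
    # Rebuild the (already decorated) dataclass with __slots__ for its fields.
    field_names = tuple(f.name for f in dataclasses.fields(cls))
    namespace = {k: v for k, v in cls.__dict__.items() if k not in field_names}
    namespace.pop("__dict__", None)     # drop descriptors tied to the old class
    namespace.pop("__weakref__", None)
    namespace["__slots__"] = field_names
    return type(cls)(cls.__name__, cls.__bases__, namespace)
```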
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131014
Approved by: https://github.com/oulgen, https://github.com/eellison
Summary: The JK disables dynamo by passing None to set_eval_frame.
Test Plan:
Ran buck test mode/opt caffe2/test/dynamo:test_dynamo
Buck UI: https://www.internalfb.com/buck2/1fec33b4-c95a-4bdf-b47b-7c0b8ab9e24a
Test UI: https://www.internalfb.com/intern/testinfra/testrun/2814750010105363
Network: Up: 0B Down: 0B
Jobs completed: 9596. Time elapsed: 28:54.5s.
Tests finished: Pass 4796. Fail 0. Fatal 0. Skip 17. Build failure 0
Also manually write a small local test with torch.compile and toggles the code to see if PT2 can be disabled. Validated with running the test and observing the log.
PT2 enabled: P1486847242. Can see dynamo log about graph breaks.
PT2 disabled: P1486847727. No dynamo log. The newly added warning printed.
Reviewed By: ezyang
Differential Revision: D59968925
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131258
Approved by: https://github.com/c00w
1) Skip undefined tensors in the CPU fallback when calling _copy_from_and_resize;
2) Modify the to_cpu function to support optional tensors;
3) Copy back to the original optional tensor when alias_info isWrite is true.
@ezyang @bdhirsh
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130237
Approved by: https://github.com/ezyang
**Summary**
Fixed issues with updating the current module when transitioning from a child module back to the parent module, and in the backward pass. The first issue is caused by the prehook not being called again when we go back to the parent module, and by the hook in use being a register_module_forward_hook, which runs before the register_module_hook used in redistribute, causing the collective call to be attributed to the incorrect module. To fix this, I updated the current module to be the parent module in a register_forward_hook in the module tracker. The second issue was caused by the parent set in the module tracker I inherit from being incorrect. I fixed this issue by saving the parents of each module and using them in the collective counter instead of the incorrect set. I have updated the example in module_operation_tracing to reflect the correct output. In addition, I changed the test cases that used the incompatible old CommDebugMode.
**Test Case**
1. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e MLP_operation_tracing
2. pytest test/distributed/_tensor/debug/test_comm_mode_features.py -s -k test_transformer_module_tracing
3. python test/distributed/_composable/fsdp/test_fully_shard_training.py -k TestFullyShardGradientAccumulation.test_gradient_accumulation
4. python test/distributed/_tensor/test_math_ops.py -k DistMathOpsTest.test_layer_norm_bwd
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130995
Approved by: https://github.com/XilunWu
ghstack dependencies: #130410
…with large index
Fixes #130806
When an output size of 2147483648 (=131072*16384) is expected in the above issue, it threw the following error:
RuntimeError: HIP error: invalid configuration argument
What happened was that the second parameter passed to hipLaunchKernel was crazy {2147483648,1,1}.
Found two issues in the Indexing.cu:
1: ptrdiff_t was used, but it is a signed int; outTotalSize >= 2147483648 can cause overflow when doing [this](39493aa934/aten/src/ATen/native/cuda/Indexing.cu (L1367)):
2: On ROCm, std::min -> ::min did not work as expected when outTotalSize>=2147483648
As a result, 2147483648 was sent to hipLaunchKernel, which the GPU does not support, since this number specifies the number of threads per block. The original code intended to set 128 threads per block (this is debatable, as the perf would not be good for the latest powerful GPUs; a TODO item to revisit for perf, maybe?), but at least it would not cause the `invalid configuration argument` error.
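For context, a hedged sketch of the kind of call from the linked issue; the exact shapes are assumptions chosen so the output numel is 131072 * 16384 = 2147483648, and it needs a large-memory GPU:
```python
import torch

weight = torch.randn(32768, 16384, device="cuda", dtype=torch.float16)
idx = torch.randint(0, 32768, (131072,), device="cuda")
out = torch.nn.functional.embedding(idx, weight)
print(out.dim(), out.numel())  # 2 2147483648
```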
[Test]
Run the same code snippet in the [issue](https://github.com/pytorch/pytorch/issues/130806), and print the output, its dim and numel(), which looks like below now:
```
output=tensor([[ 0.4044, -0.0244, -0.6865, ..., -0.7800, 0.1175, 1.6726],
[-1.0866, -0.1609, 0.3538, ..., 1.9105, 0.7882, 1.1583],
[-2.2079, 0.3736, 0.3610, ..., -0.2658, -0.0459, 1.3077],
...,
[ 0.8753, -0.7482, -0.1978, ..., 0.9016, 1.1501, -0.5178],
[-1.5845, -0.6277, 1.4520, ..., 0.5733, -2.1198, -0.0915],
[-0.6310, -1.0239, -0.1910, ..., 0.4309, 0.1630, 0.3239]],
device='cuda:0'), dim=2, numel=2147483648
```
Added a large tensor unit test too.
```
/pytorch# pytest test/nn/test_embedding.py -k test_large_tensors
================================================================================== test session starts ===================================================================================
platform linux -- Python 3.9.19, pytest-7.3.2, pluggy-1.4.0
rootdir: /dockerx/development/pytorch
configfile: pytest.ini
plugins: flakefinder-1.1.0, rerunfailures-14.0, xdist-3.3.1, xdoctest-1.1.0, cpp-2.3.0, hypothesis-5.35.1
collected 288 items / 287 deselected / 1 selected
Running 1 items in this shard
test/nn/test_embedding.py . [100%]
=========================================================================== 1 passed, 287 deselected in 3.16s ============================================================================
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130994
Approved by: https://github.com/jeffdaily, https://github.com/xw285cornell
## Description
For single thread case, this PR improves the cache blocking in CPP GEMM template with the CPU info (the L1 and L2 cache size). `Mc_blocks` and `Kc_blocks` are calculated based on the below condition:
- size_of_B < L1
- size_of_A < 0.5 * L2
For multi-thread, we need to tune the task decomposition among threads together with cache blocking. We disabled the cache blocking change for now and will submit a follow-up PR for multi-thread optimizations.
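A rough sketch of the single-thread heuristic described above; the function and the exact mapping from cache bytes to block counts are assumptions, not the actual Inductor code:
```python
def cache_blocking(M, K, Nr, Kr, dtype_size, L1, L2):
    # Keep one Kc x Nr panel of B resident in L1.
    Kc_blocks = max(1, (L1 // (Nr * dtype_size)) // Kr)
    Kc = Kc_blocks * Kr
    # Keep the Mc x Kc panel of A within half of L2.
    Mc = max(1, int(0.5 * L2) // (Kc * dtype_size))
    return min(Mc, M), min(Kc_blocks, K // Kr)

# e.g. bf16 (2 bytes), 48 KB L1, 2 MB L2, 32x32 register blocks
print(cache_blocking(M=1024, K=4096, Nr=32, Kr=32,
                     dtype_size=2, L1=48 * 1024, L2=2 * 1024 * 1024))
```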
## Performance
No regressions. Models with > 3% performance speedup are listed below:
### BF16 single thread (measured on CPU with AMX support)
- static shape
| Model Family | Model Name | Speedup |
|--------------|------------|---------|
torchbench | detectron2_fasterrcnn_r_101_dc5| 4%
- dynamic shape
| Model Family | Model Name | Speedup |
|--------------|------------|---------|
torchbench | detectron2_fasterrcnn_r_101_dc5| 4%
### FP32 single thread (measured on Ice Lake)
- static shape
| Model Family | Model Name | Speedup |
|--------------|------------|---------|
torchbench | basic_gnn_edgecnn| 10%
- dynamic shape
| Model Family | Model Name | Speedup |
|--------------|------------|---------|
torchbench | basic_gnn_edgecnn| 10%
### Next step
The E2E level improvement is limited due to the below reasons:
- For several HF models, we can observe performance improvement at the kernel level for the gemm template kernel, but the performance is either still worse than the ATen kernel (thus it won't be selected during autotune) or improved from worse than ATen to similar to ATen, so we don't see an E2E-level performance change.
- There are models where the gemm template kernel could get > 10% performance improvement with this PR, but since the kernel time is only about 3% of the E2E time, we don't observe significant E2E-level improvement.
We will continue to find possible optimizations in the gemm template kernel in follow-up PRs.
Co-authored-by: Jiong Gong <jiong.gong@intel.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129348
Approved by: https://github.com/jgong5, https://github.com/jansel
ghstack dependencies: #130675, #130690
Currently we require `n % register_block_n == 0`, which typically brings good perf when `n` is a multiple of 8, 16, 32, etc., but falls back to the reference micro-gemm otherwise (where `register_block_n == 1`). This PR optimizes this by padding `n` to a multiple of `register_block_n` (8, 16, 32, etc.) for the packed weight. Therefore, the micro-gemm can work as is on the padded `n`. When the weight is padded, we use the local accumulation buffer to get the result from the micro-gemm and then unpad (slice) it before storing back to the output buffer.
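A minimal illustration of the padding arithmetic described above (the helper name is made up):
```python
def pad_to_multiple(n, register_block_n):
    # round n up to the next multiple of the register block size
    return (n + register_block_n - 1) // register_block_n * register_block_n

print(pad_to_multiple(3073, 16))  # 3088: the micro-gemm runs on the padded N,
                                  # and the result is sliced back to 3073 on store
```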
Performance numbers measured on "Intel (R) Xeon (R) CPU Max 9480", single core, bf16.
Before
AUTOTUNE linear_unary(512x768, 3073x768, 3073)
_linear_pointwise 2.3563 ms 100.0%
cpp_packed_gemm_0 710.5902 ms 0.3%
After
AUTOTUNE linear_unary(512x768, 3073x768, 3073)
cpp_packed_gemm_0 1.8909 ms 100.0%
_linear_pointwise 2.1016 ms 90.0%
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130690
Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel
ghstack dependencies: #130675
0.12.0 Major Updates:
- Add context manager to temporarily set the dictionary sorting mode
- Add accessor APIs
- Use `stable` tag for `pybind11` for Python 3.13 support
- Fix potential segmentation fault for pickling support
0.12.1 Updates:
- Fix warning regression during import when launch with strict warning filters
Closes #130155
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130139
Approved by: https://github.com/zou3519
ghstack dependencies: #130895
Fixed TrainingIRToRunDecomp failures for test_tensor_attribute_zero_args and also a few re-traceability failures because run_decomposition does a retracing.
**edit:** also remove the eliminate_dead_code() in _unlift because of one onnx test failure:
a constant tensor attr was lifted as a constant_tensor input but it's not used in the graph after aot_autograd due to a shortcut in its decomposition. This causes the setattr to be removed by eliminate_dead_code, but the graph signature still contains the name of that buffer, which causes an inconsistency between the transformed graph and the ep's original signature after _unlift. It seems this has happened a few times, where some nodes are accidentally removed and we're left in an inconsistent state.
The alternative to removing it would be: every time we call eliminate_dead_code, we verify the consistency of the graph with 1. the graph before transformation and 2. all the metadata, but I think this deserves a complete design.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130990
Approved by: https://github.com/pianpwk
Resubmit of #129325
Previously each mutation was represented by a `MutationOutput` operation which
was a new scheduler node that must be scheduled immediately afterwards.
Now we have a single scheduler node, which produces multiple `MutationOutput`
buffers as its output.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130832
Approved by: https://github.com/lezcano
ghstack dependencies: #130831
------
The opposite of #130836. Pin `sympy >= 1.13.0` for Python >= 3.9 and `sympy == 1.12.1` for Python 3.8.
- #130836
See the PR description of #130836 for more details.
`sympy` 1.13.0 introduces some breaking changes which break our tests. More specifically:
- Ref [Backwards compatibility breaks and deprecations](https://github.com/sympy/sympy/wiki/release-notes-for-1.13.0#backwards-compatibility-breaks-and-deprecations)
> BREAKING CHANGE: Float and Integer/Rational no longer compare equal with a == b. From now on Float(2.0) != Integer(2). Previously expressions involving Float would compare unequal e.g. x*2.0 != x*2 but an individual Float would compare equal to an Integer. In SymPy 1.7 a Float will always compare unequal to an Integer even if they have the same "value". Use sympy.numbers.int_valued(number) to test if a number is a concrete number with no decimal part. ([#25614](https://github.com/sympy/sympy/pull/25614) by [@smichr](https://github.com/smichr))
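A quick illustration of the breaking change quoted above (behavior depends on the installed sympy version):
```python
import sympy

print(sympy.Float(2.0) == sympy.Integer(2))  # True on sympy 1.12.x, False on sympy >= 1.13
```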
`sympy >= 1.13.0` is required to enable Python 3.13 support. This should be part of #130689.
- #130689
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130895
Approved by: https://github.com/ezyang
There's no reason to ban them for vmap or jvp, because without the
{grad, vjp} transforms those just act above PyTorch autograd, which will
end up saving regular Tensors.
Test Plan:
- some tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131191
Approved by: https://github.com/drisspg
Summary: Since the WaitCounter frontend itself has minimal dependencies, it's fine to move it into c10. Specific backends can be registered/linked separately.
Test Plan: unit test
Reviewed By: jamesperng, asiab4, c-p-i-o
Differential Revision: D59842868
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131021
Approved by: https://github.com/asiab4
All the changes brought by the original PR have been addressed in alternative ways in this stack. Understanding why the original PR has to be reverted requires more effort, because there is some bad interaction with export.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131058
Approved by: https://github.com/williamwen42
#### Issue
Model parameters sometimes do not appear in the `named_parameters()` function. For example, when trying to jit.trace an already jit.scripted model. This PR fixes that by relying on `state_dict` to get both parameters (`requires_grad=True`) and buffers.
#### Test Plan
* `pytest test/export/test_converter.py -s -k test_convert_retrace_nested_scripted_modules`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129787
Approved by: https://github.com/angelayi
Summary:
In the export workflow, we always have a lifted graph which doesn't fetch constants through get_attr nodes. This causes a compatibility issue when we're trying to use inductor's split_const_gm function with a lifted graph.
This diff makes an additive change to split_const_gm's interface such that, when the pass sees a placeholder node present in the lifted_constants table, it will also use that as the source of constness.
This change won't break the existing code and the lifted_constants table can be used orthogonal to the existing const folding mechanisms.
Also, as required by the MTIA team, we want to introduce a small callback function used to skip certain nodes during const folding.
For the internal followup counterpart, see D59685145
Test Plan: buck run mode/opt caffe2/test:test_export -- -r split_const_gm
Differential Revision: D59692790
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130743
Approved by: https://github.com/desertfire, https://github.com/SherlockNoMad
… addition, fix device assignment for newly created variables in method
Fix an example: resolve a broadcasting error in the attn_bias and attn_mask addition, and fix device assignment for newly created variables in the method.
1. `attn_bias += attn_mask` would cause a broadcasting error. Because the shape of `attn_bias` is (L, S), the shape of the output is expected to be (L, S) too. When the shape of the input is (N, num_heads, L, S), broadcasting is triggered and the shape of the output would become (N, num_heads, L, S), which is unexpected (see the snippet after this list).
2. `attn_bias` is a newly created variable in the method and is not assigned a device.
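A small snippet reproducing point 1 (the shapes are illustrative):
```python
import torch

L, S, N, num_heads = 4, 4, 2, 8
attn_bias = torch.zeros(L, S)
attn_mask = torch.zeros(N, num_heads, L, S)

# attn_bias += attn_mask     # RuntimeError: output with shape [4, 4] doesn't
#                            # match the broadcast shape [2, 8, 4, 4]
out = attn_bias + attn_mask  # out-of-place add broadcasts to (N, num_heads, L, S)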
**This is my retry of #130200 .** I used a wrong account in that pr.
Co-authored-by: mikaylagawarecki <mikaylagawarecki@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130209
Approved by: https://github.com/mikaylagawarecki
Summary:
`fr_trace.py` is used to analyze flight recorder dump files.
This script was taken from @wconstab and @zdevito.
The only minor changes made were to make the linter happy and to add a few new fields that I added in version `2.2` of the collector portions.
Test Plan:
Tested manually on some flight recorder data and it seems to run.
TODO:
Address 15 odd `#type: ignore` that I put in there to make the linter happy for now.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130764
Approved by: https://github.com/fduwjj
Earlier, the signature of the dequantize ops for decomposed quantized tensors was changed for wider use-cases where the output dtype can be different from torch.float and needs to be passed during dequantization.
Please refer: https://github.com/pytorch/pytorch/pull/121450
However, setting the correct output dtype for dequantize ops was still missing in the convert_pt2e flow.
This change enables users to use the PT2E quantization flow with a non-torch.float unquantized dtype, such as torch.bfloat16.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128953
Approved by: https://github.com/jgong5, https://github.com/jerryzh168
The current code assumes that indirect variables will be created by the
same `IndexPropagation` instance, however that isn't true in the case of
masked sub-blocks where we take in variables from the parent block.
This fixes the issue by moving the var range information up to the
`LoopBody` object where it can be shared by all sub-blocks.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130984
Approved by: https://github.com/lezcano
Regular update.
1. 90 new ATen operators and their variants are supported for XPU.
2. Bugfixing: a. Fixing out-of-bound memory access in index_put kernel b. Fixing debug build error
3. Binary change. Split device AOT code of SYCL kernel into multiple libraries to avoid linkage failure.
4. torch-xpu-ops test case enhancement: a. Hook PyTorch testing op_db to align opInfo configuration with CUDA b. Hook _check_arg_device2 and freeze_rng_state to make XPU happy
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131015
Approved by: https://github.com/EikanWang
Speedup bias-add compute by moving it to the epilogue. Performance numbers measured on "Intel (R) Xeon (R) CPU Max 9480", single core, bf16.
Before
AUTOTUNE linear_unary(512x768, 3072x768, 3072)
cpp_packed_gemm_0 1.9200 ms 100.0%
_linear_pointwise 1.9345 ms 99.3%
After
AUTOTUNE linear_unary(512x768, 3072x768, 3072)
cpp_packed_gemm_0 1.8321 ms 100.0%
_linear_pointwise 1.9246 ms 95.2%
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130675
Approved by: https://github.com/leslie-fang-intel, https://github.com/jansel
beartype has served us well in identifying type errors and ensuring we call internal functions with the correct arguments (thanks!). However, the value of having beartype is diminished because of the following:
1. When beartype improved support for Dict[] type checking, it discovered typing mistakes in some functions that were previously uncaught. This caused the exporter to fail with newer versions of beartype where it used to succeed. Since we cannot fix PyTorch and release a new version just because of this, it creates confusion for users that have beartype in their environment when using torch.onnx.
2. beartype adds an additional call line in the traceback, which makes the already thick dynamo stack even larger, affecting readability when users diagnose errors with the traceback.
3. Since the typing annotations need to be evaluated, we cannot use new syntaxes like `|` because we need to maintain compatibility with Python 3.8. We don't want to wait for PyTorch to take py310 as the lowest supported Python before using the new typing syntaxes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130484
Approved by: https://github.com/titaiwangms
While for optimizations like pad_mm, there are always only two possible choices, for other decision procedures, like kernel choice selection, the set of "available" choices depends on the input. Instead of storing the choices as metadata, we can instead take a look at all choices for which we have collected data (i.e. `df[CHOICE_COL].unique()`).
In this PR, I also try to replace "choice" and "feedback" with global constants CHOICE_COL and FEEDBACK_COL.
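As a rough illustration, with the column names taken from the constants mentioned above and made-up DataFrame contents:
```python
import pandas as pd

CHOICE_COL, FEEDBACK_COL = "choice", "feedback"
df = pd.DataFrame({
    CHOICE_COL: ["triton_mm", "extern_mm", "triton_mm"],  # hypothetical kernel choices
    FEEDBACK_COL: [1.2, 0.9, 1.1],                        # hypothetical timings
})
choices = df[CHOICE_COL].unique()  # available choices come from the collected data
```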
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130304
Approved by: https://github.com/eellison
This is an updated PR to equip cond with the autograd feature and replaces the old [PR](https://github.com/pytorch/pytorch/pull/126007)
@ydwu4 I tried to incorporate your requests already.
Currently there are two problems that I struggle with solving:
1. There seems to be an import issue when trying to import cond in `torch/__init__.py`, see [here](8a704035c9/torch/__init__.py (L1914-L1916)). Therefore, I had to comment out those lines, which resolved the import issues, but I believe cond is not properly exposed as torch.cond.
2. I am not entirely sure how to deal with the opinfo test in `hop_db.py`
Co-authored-by: Yidi Wu <yidi@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126911
Approved by: https://github.com/ydwu4
Summary:
Add three top-level APIs for the numeric debugger in the pt2e flow that can log intermediate outputs in the model and calculate a summary for metric comparisons between nodes in two graphs:
* `prepare_for_propagation_comparison`
* `extract_results_from_loggers`
* `compare_results`
Test Plan:
python test/test_quantization.py -k test_prepare_for_propagation_comparison
python test/test_quantization.py -k test_extract_results_from_loggers
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130643
Approved by: https://github.com/dulinriley, https://github.com/tarun292
We currently can't generate split scans when there are multiple scan
values, so we normally fall back to ATen. However, for the higher order
scan op, we can't fall back, so it makes sense to just generate the slower
kernel anyway. This avoids having special shapes where we fail to
codegen.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130936
Approved by: https://github.com/lezcano
Sets `prefer_deferred_runtime_asserts_over_guards=True` for export, so any guards emitted from `SymNode.expect_true` (for example, guards that are implicitly required to be true for an op to succeed) won't lead to constraint violations. Instead these should appear in the graph as runtime asserts, or potentially as replacement expressions for placeholder shapes.
For example, this reshape op should emit s0 * s1 = s2, deferred as a runtime assert.
```
x = torch.randn(4, 8) # [s0, s1]
y = torch.randn(32) # [s2]
out = x.reshape(-1) + y
# this emits Eq(s0 * s1, s2), and we represent y's shape as [s0*s1] in the graph.
```
However, other complex guards can still cause export to fail, for instance guards emitted from `SymNode.guard_bool/guard_size_oblivious` (e.g. explicit if-else conditions in user code or lower-level op implementations hit during tracing) can still raise constraint violations. These can be deferred with `allow_complex_guards_as_runtime_asserts=True`. We don't yet make this default, because while this makes export more likely to succeed, it results in non-trivial asserts being emitted that often represent specialization to a variant of the op, or checks related to 0/1 specialization.
We also remove forced specializations for export and kill the `_disable_forced_specializations` flag - now any guard we can't express with Dims/DerivedDims either are handled with Hybrid SymInts, or should be resolved with rewriting or deferring.
Follow up:
Currently, `ShapeEnv._set_replacement()` is called for complex equality expressions (e.g. s2 -> s0*s1 in the example above), and the ExportedProgram stores `s0*s1` in the input placeholder. This isn't checked for validity when the program is run, so an option is to avoid replacement and/or runtime assert on equality.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130775
Approved by: https://github.com/avikchaudhuri
The #130912 error happens since `operator.mul` does not have `_schema`.
So why do we have `operator.mul` and why is it not dispatched to `torch.ops.aten.mul`? This op comes from %mul_3.
%mul_3 : [num_users=50] = call_function[target=operator.mul](args = (%arg689_1, 4096), kwargs = {})
`%arg689_1` is a placeholder with `meta[‘val’] = s0`. It comes from dynamic shapes and represents the batch size since it’s also used in many other nodes such as:
%view_1 : [num_users=1] = call_function[target=torch.ops.aten.view.default](args = (%mm, [%arg689_1, 4096, 320]), kwargs = {})
and
%native_group_norm_2 : [num_users=1] = call_function[target=torch.ops.aten.native_group_norm.default](args = (%div_1, %arg16_1, %arg17_1, %arg689_1, 320, 4096, 32, 1e-06), kwargs = {})
To fix the issue, we can add `operator.mul` to skip list.
Fixes #130912
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130986
Approved by: https://github.com/eellison
Fixes #128745
Solves the conflict issue when users use full_state_dict while the model is FSDP.
Currently solves the issue for `full_state_dict=True`, which hit the error
`'aten.copy_.default: got mixed torch.Tensor and DTensor, need to convert all torch.Tensor to DTensor before calling distributed operators!',).`
TODO: for `broadcast_from_rank0=True, full_state_dict=True`, the error is
`NotImplementedError: c10d::broadcast_: attempted to run this operator with Meta tensors`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129635
Approved by: https://github.com/fegin
Summary: Finishing up the mechanism to "register" certain types of operators to a registry so that the serializer can handle them correctly. This is expected to be used first by executorch.
Test Plan: buck run mode/opt caffe2/test:test_export -- -r test_export_with_extension_op_serialization
Differential Revision: D59825148
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130851
Approved by: https://github.com/angelayi
- More conservative estimation of plannable inputs
- Consider constant_pad_nd as pointwise node in concat lowering
- Use aten.cat instead of constant_pad_nd when padding just a single dimension, because it can be memory-planned away
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128909
Approved by: https://github.com/Chillee
**Summary**
Currently, the output of CommDebugMode contains a lot of noise, such as operations that usually won't tell the user much (e.g., aten.detach.default). I have created a set of these trivial operations and added a noise_level argument so users can choose how much information they want to receive.
noise_level = 1 prints module-level collective counts
noise_level = 2 prints operations not included in trivial operations and module information
noise_level = 3 prints all operations
In addition, I have removed the generate_module_tracing_table since noise_level = 1 essentially replaces it. Finally, I have updated the examples and test cases.
**Test Plan**
1. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e MLP_json_dump
2. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e transformer_json_dump
3. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e MLP_operation_tracing
4. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e transformer_operation_tracing
5. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e MLP_module_tracing
6. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e transformer_module_tracing
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130410
Approved by: https://github.com/XilunWu
Might fix #127660; need to test some more cases.
We update the reinplacing pass. If we have something like the following,
where "sin" is a custom op (this situation should also apply to triton
kernels)
```py
def graph(x):
y = sin(x)
z = sin(y)
x.copy_(z)
```
then the reinplacer used to produce the following:
```py
"""step 1: reinplaces the first sin"""
def graph(x):
x_clone = x.clone()
sin_out(x, out=x_clone)
z = sin(x_clone)
x.copy_(z)
"""step 2: reinplaces the second sin"""
def graph(x):
x_clone = x.clone()
sin_out(x, out=x_clone)
sin_out(x_clone, out=x_clone)
x.copy_(x_clone)
```
However, the first clone is unnecessary. It is safe to reinplace
the first sin into the following:
```py
def graph(x):
sin_out(x, out=x)
z = sin(x)
x.copy_(z)
```
because there are no users of `x`'s original value (the copy_ node
doesn't actually use the original value of x!)
This PR updates the reinplacing pass to ignore copy_ in its computation
of if the original value of the mutated argument is still needed.
NB: this also applies to triton kernels, but it was easier for me to
reason about custom ops (and my repros were all for custom ops).
Test Plan:
- new tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130866
Approved by: https://github.com/oulgen
# Summary
- This removes a bunch of example score mods that were primarily used for testing and places them directly in the test file. We should follow up with merging test_flex_decode and test_flash when the velocity slows down a little
- Fixes a bug with indexing on block mask
- Adds some doc strings to helper funcs and fixes some misc typing things
- Forces functions passed to `create_block_mask` to be mask_mods and updates test files (see the sketch after this list)
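A hedged sketch of the mask_mod-based API; the module path and keyword names are as I understand them and may differ slightly across versions:
```python
import torch
from torch.nn.attention.flex_attention import create_block_mask

def causal_mask(b, h, q_idx, kv_idx):
    # mask_mods return booleans (keep/drop), unlike score_mods which return scores
    return q_idx >= kv_idx

block_mask = create_block_mask(causal_mask, B=1, H=1, Q_LEN=1024, KV_LEN=1024)
```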
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130871
Approved by: https://github.com/joydddd, https://github.com/Chillee
We should be able to create multiple CUDAPluggableAllocators in the same pytorch program (see https://github.com/pytorch/pytorch/issues/124807, https://github.com/pytorch/pytorch/pull/125722 for context). When mixing CUDAPluggableAllocators in the same pytorch program, we need to make sure that the deleter passed in through the CUDAPluggableAllocator gets "attached" to the data_ptr and persist until program exit (when it's called to free the memory).
Currently, CUDAPluggableAllocator maintains a global `current_custom_allocator`. When creating the `DataPtr`, `raw_deleter` attaches `custom_raw_deleter` to the DataPtr, which calls `current_custom_allocator->raw_delete(...)`. This approach is fine when using only one allocator; however, for the multiple-allocator use case, the DataPtr would use the deleter of whatever is in `current_custom_allocator`. For example, if allocation 1 was done with `cudaMalloc` and allocation 2 was done with `ncclMemAlloc`, and `current_custom_allocator` is currently pointing to the CUDAPluggableAllocator with `ncclMemAlloc`, then when cleaning up allocation 1 we'd be using `ncclMemFree` instead of `cudaFree`.
In this PR, we solve the above problem by remembering the `free_fn_` using a deleter context. Hence, there is no need to go through an allocator object to find the deleter.
CC: @zdevito @ptrblck @eqy
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130472
Approved by: https://github.com/eqy, https://github.com/ezyang
Summary: Modify the existing `sum` operator in PyTorch, invoked by `torch.sum`, to allow for reductions along the ragged dimension of a nested tensor. This diff enables PyTorch users to invoke `torch.sum` on a nested tensor with `dim=1`, where `ragged_idx=1`.
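A minimal usage sketch of the behavior this diff enables, assuming a build that includes it (shapes are illustrative):
```python
import torch

# (B, *, M) nested tensor with the ragged dimension at index 1
nt = torch.nested.nested_tensor(
    [torch.randn(3, 8), torch.randn(5, 8)], layout=torch.jagged)

out = nt.sum(dim=1)   # reduces along the ragged dimension
print(out.shape)      # torch.Size([2, 8])
```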
Functions modified in `caffe2/torch/nested/_internal/ops.py`:
- `sum_dim_IntList()`: The function assumes that `ragged_idx=1`; in the case that `dim=1` as well, where `dim` is the dimension on which we reduce, this diff invokes the PyTorch benchmark found in D58423489. Specifically, this diff pads a nested tensor, e.g. of logical shape `(B, *, M)`, using [`torch.ops.aten._jagged_to_padded_dense_forward`](https://www.internalfb.com/code/fbsource/[92c2a067ab04e3eebc999254fed4ae2fbea6def3]/fbcode/deeplearning/fbgemm/fbgemm_gpu/fb/inductor_lowerings/elementwise_ops.py?lines=26), then reduces across the `*` dimension (`dim == 1`) to a `(B, M)` output tensor.
- `_wrap_jagged_dims()`: This diff adds special handling to allow for the case where `dim` contains `1` and not `0`, but to continue disallowing the case where `dim` contains `0` and not `1`. In this function's creation, I created a helper function, `_get_condition_for_invalid_jagged_reductions()`, which makes it clearer which conditions apply to which operators. Specifically, operators which are enabled with jagged reductions are specified at the top of the file in `SUPPORTED_JAGGED_REDUCTIONS` and have a different set of conditions that need to be tested, as reducing along `dim == 1` without `dim == 0` is now possible.
Functions modified in `caffe2/test/test_nestedtensor.py`:
- `test_sum_int_DimList()`: This diff adds special handling in the `sum` unit test to allow for the case where `dim` contains `1` and not `0`, but to continue disallowing the case where `dim` contains `0` and not `1`.
- `test_sum_int_DimList_ragged_dim_1()`: This diff adds a new unit test which verifies the accuracy and feasibility of reducing along the jagged dimension of a nested tensor.
Notes:
- This diff solely adds functionality for the case in which we reduce only along the ragged dimension. Cases in which we reduce along both the ragged and another dimension, like `dim == (1, 2)`, are not permitted, as this set of diffs focuses primarily on the former.
- The `sum` operator is the only operator which uses the function `_wrap_jagged_dims()`; all other operators use `_wrap_jagged_dim()`. I would like to later look into why this is the case and if we can consolidate this!
- I modified some of the comments in the `sum` function as well as the unit tests for more clarity.
Test Plan:
Verify that existing (`test_sum_int_DimList`) and new (`test_sum_int_DimList_ragged_dim_1`) unit tests pass via the following command:
```
buck2 run mode/{opt,inplace} //caffe2/test:nested -- --regex test_sum_int_DimList
```
Differential Revision: D59571209
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130425
Approved by: https://github.com/davidberard98
Reland of: https://github.com/pytorch/pytorch/pull/128016
Summary from previous PR:
We assume only two possible, mutually exclusive scenarios:
1. Running the compiled region for training (any of the inputs has requires_grad): produced differentiable outputs should have requires_grad.
2. Running the compiled region for inference (none of the inputs has requires_grad): no outputs have requires_grad.
Even if the user runs the region under no_grad() but has an input tensor with requires_grad, we go with training scenario (1).
With the current state, that means:
1/ needs_autograd should not check torch.is_grad_enabled(), only whether any of the inputs requires grad
2/ if needs_autograd => trace_joint (we are in training scenario 1) => always run the compiled region under `torch.enable_grad()`
Changes in the partitioner:
Inference and training graphs differed in their return container (list vs. tuple).
The partitioner changes unify this so that a tuple is always returned.
As a result, there are some changes in test_aotdispatch.py for graph contents (list -> tuple).
Why was it reverted?
There was an inference regression for the hf_Reformer model.
```
TORCHINDUCTOR_FX_GRAPH_CACHE=0 python benchmarks/dynamo/torchbench.py --performance --inference --bfloat16 --backend inductor --device cuda --only hf_Reformer --cold-start-latency --use-eval-mode
```
One of the compiled graphs contained outputs that are aliases of inputs which are nn.Parameter(requires_grad=True).
Even though the torchbench inference benchmarks run inside `torch.no_grad()`, alias ops (specifically `expand` for hf_Reformer) preserve requires_grad.
As a result, we started compiling a training graph instead of an inference graph.
Fix for view ops:
If an output is an alias of an input that requires grad, that output's requires_grad is not by itself a reason to generate a training graph.
This is handled in aot_autograd.py, where output_and_mutation_safe are calculated.
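A small repro-style sketch of the aliasing behavior described above (not the benchmark itself): a view of a parameter keeps `requires_grad=True` even when it is created under `no_grad`, which is what previously tipped the compiled region into the training path.
```python
import torch

p = torch.nn.Parameter(torch.randn(4))
with torch.no_grad():
    v = p.expand(2, 4)   # alias/view op
print(v.requires_grad)   # True: aliases preserve requires_grad of their base
```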
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128890
Approved by: https://github.com/bdhirsh
Summary:
This diff introduces a much more flexible model for WaitCounter backend:
1. A backend can be installed dynamically (even if not linked with PyTorch) instead of relying on macros and swapping the implementation at compile time
2. Multiple backends are supported at the same time.
Test Plan: unit test
Reviewed By: jamesperng
Differential Revision: D59795863
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130934
Approved by: https://github.com/asiab4
Summary: `test/distributed/_composable/test_replicate_with_compiler.py` exercises inductor. This change introduces a version of MultiProcessTestCase that derives from the inductor TestCase class to make sure we always get a clean cache dir.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129494
Approved by: https://github.com/eellison
This PR resolves several sets of `_scaled_mm` test failures:
- `scale_a` and `scale_b` are now required arguments, so the function `sample_inputs_scaled_mm` must supply them
- `_scaled_mm` does not support `"meta"` device, so it should be skipped in `test_meta.py`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130897
Approved by: https://github.com/drisspg
Taking inspiration from `GraphModule.print_readable` (aka I copied its [code](17b45e905a/torch/fx/graph_module.py (L824))), I added a `print_readable` to the unflattened module, because it's kind of nontrivial to print the contents of this module.
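A hedged usage sketch (module construction is illustrative; the `print_readable` method itself is what this PR adds):
```python
import torch

class M(torch.nn.Module):
    def forward(self, x):
        return x + 1

ep = torch.export.export(M(), (torch.randn(2, 3),))
unflat = torch.export.unflatten(ep)
unflat.print_readable()
```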
Example print from `python test/export/test_unflatten.py -k test_unflatten_nested`
```
class UnflattenedModule(torch.nn.Module):
    def forward(self, x: "f32[2, 3]"):
        # No stacktrace found for following nodes
        rootparam: "f32[2, 3]" = self.rootparam
        # File: /data/users/angelayi/pytorch2/test/export/test_unflatten.py:99 in forward, code: x = x * self.rootparam
        mul: "f32[2, 3]" = torch.ops.aten.mul.Tensor(x, rootparam); x = rootparam = None
        # No stacktrace found for following nodes
        foo: "f32[2, 3]" = self.foo(mul); mul = None
        bar: "f32[2, 3]" = self.bar(foo); foo = None
        return (bar,)

    class foo(torch.nn.Module):
        def forward(self, mul: "f32[2, 3]"):
            # No stacktrace found for following nodes
            child1param: "f32[2, 3]" = self.child1param
            nested: "f32[2, 3]" = self.nested(mul); mul = None
            # File: /data/users/angelayi/pytorch2/test/export/test_unflatten.py:79 in forward, code: return x + self.child1param
            add: "f32[2, 3]" = torch.ops.aten.add.Tensor(nested, child1param); nested = child1param = None
            return add

        class nested(torch.nn.Module):
            def forward(self, mul: "f32[2, 3]"):
                # File: /data/users/angelayi/pytorch2/test/export/test_unflatten.py:67 in forward, code: return x / x
                div: "f32[2, 3]" = torch.ops.aten.div.Tensor(mul, mul); mul = None
                return div

    class bar(torch.nn.Module):
        def forward(self, add: "f32[2, 3]"):
            # No stacktrace found for following nodes
            child2buffer: "f32[2, 3]" = self.child2buffer
            # File: /data/users/angelayi/pytorch2/test/export/test_unflatten.py:87 in forward, code: return x - self.child2buffer
            sub: "f32[2, 3]" = torch.ops.aten.sub.Tensor(add, child2buffer); add = child2buffer = None
            return sub
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128617
Approved by: https://github.com/zhxchen17, https://github.com/pianpwk
Previously, it was only possible to collect data or use a heuristic regardless of where autoheuristic is used. This PR makes it possible to collect data for some optimizations while using a learned heuristic for other optimizations.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130245
Approved by: https://github.com/shunting314
FSDP2 eager pre-allocates the output buffer for AllGather and the AllGather just writes into that buffer. However, under compile, by default we use out-of-place AllGather, which means in Traceable FSDP2 case we will be unnecessarily using more memory than eager. We want to re-inplace that AllGather instead.
This PR adds a post_grad pass to re-inplace all_gather_into_tensor (i.e. changing it from `all_gather_into_tensor.default` out-of-place op to `all_gather_into_tensor_out.default` out-variant op).
One thing to note is that since with this pass we are introducing a mutable op into the post_grad FX graph, we must do this pass after `reinplace_inplaceable_ops` (at which point we are okay again with having mutable ops in the graph). To facilitate this, this PR adds a `post_grad_custom_post_reinplace_pass` extension point to allow user-defined post-reinplace FX passes.
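A sketch of registering a pass at the new extension point, assuming it follows the same shape as the existing `post_grad_custom_post_pass` hook (a callable that receives the post-grad `fx.Graph`):
```python
import torch
import torch._inductor.config as inductor_config

def my_post_reinplace_pass(graph: torch.fx.Graph) -> None:
    # Runs after reinplace_inplaceable_ops, so mutable/out-variant ops are allowed here.
    for node in graph.nodes:
        pass  # inspect or rewrite nodes as needed

inductor_config.post_grad_custom_post_reinplace_pass = my_post_reinplace_pass
```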
---
Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_fullgraph_backend_inductor`
---
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129773
Approved by: https://github.com/eellison
This PR allows `fully_shard`'s first argument to be `List[nn.Module]` instead of strictly `nn.Module`. This allows more flexible grouping of modules/parameters for communication, which can lead to memory savings and/or more efficient communication.
**Approach**
At a high level, we can think of a model as a tree of modules. Previously, we could only select specific module nodes in this tree as representing one FSDP parameter group. With this PR, we can select a group of module nodes, effectively becoming a single super node.
To implement the runtime schedule, we define new forward hooks that run based on the following semantics:
- If a module is the first to run the pre-hook, actually run the given pre-hook. Otherwise, the pre-hook is a no-op.
- If a module is the last to run the post-hook, actually run the given post-hook. Otherwise, the post-hook is a no-op.
- First and last are determined by scoreboarding against a set of the modules.
- This set must get cleared at the end of backward in the case that >=1 module in the list is never used, in which case we still want the forward hooks to run in the next forward after this backward.
Beyond these new forward hooks, everything else is some simple generalization from `Module` to `List[Module]` or `Tuple[Module, ...]`.
**Examples**
This PR enables wrapping Llama models more efficiently by grouping the final norm and output linear together: https://github.com/pytorch/torchtitan/pull/382.
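A schematic sketch of the new call pattern (module names are illustrative, and an initialized process group / device mesh and a transformer-like `model` with `layers`, `norm`, and `output` are assumed):
```python
from torch.distributed._composable.fsdp import fully_shard

for block in model.layers:                # per-block parameter groups
    fully_shard(block)
fully_shard([model.norm, model.output])   # one group for final norm + output projection
fully_shard(model)                        # root
```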
If at least one of the modules in the list does not run forward before backward, then there will be a warning message like:
```
1 of the 2 modules passed to fully_shard did not run forward before backward, which is error-prone since FSDP post-forward/pre-backward logic will not run for these modules. We recommend passing only modules that run forward together. Modules that did not run forward: [FSDPLinear(in_features=1, out_features=1, bias=True)]
```
---
**Changes for reland:** none since breakage was from PR below
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130949
Approved by: https://github.com/weifengpy
ghstack dependencies: #130947
This PR relaxes `@contract` to allow the 1st argument to be `Sequence[nn.Module]` instead of strictly `nn.Module`. This is required for the next PR, which allows `fully_shard` to take in `List[nn.Module]`.
---
**Changes for reland:**
- The previous PR assumed that any `func` decorated with `@contract` would return the same input `module` as output (which is true for PT-D composable APIs).
- However, TorchRec `shard` returns a different module as output (though that module _does_ satisfy the `@contract` FQN check).
- This PR removes the assumption and instead only enforces the FQN check following the input module order. In other words, if calling `func([x1, ..., xN])` for `N` modules `x1, ..., xN` that returns `[y1, ..., yM]` for `M` modules, we require that `N = M` and that FQNs are preserved coordinate-wise: `xi` and `yi` have same FQNs for all `i = 1, ..., N`.
Differential Revision: [D59863438](https://our.internmc.facebook.com/intern/diff/D59863438)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130947
Approved by: https://github.com/weifengpy, https://github.com/atalman
Fixes #127666.
Other std math functions are replaced with those in the global namespace during hipify. HIP does not claim to support every function in the C++ standard library; std::clamp is not yet supported, and we have been relying on the std implementation. For Fedora 40 + gcc 14, a host-side assert is used, which is not supported. Work around this by replacing std::clamp with min and max. Using #ifndef USE_ROCM to differentiate between CUDA using std::clamp and the ROCm replacement broke Windows builds, so the replacement is used unconditionally; it generates the same PTX as std::clamp. See https://godbolt.org/z/Wde9KW3v4 for a sample.
Original patch comes from @lamikr. Modified to improve efficiency.
https://github.com/lamikr/rocm_sdk_builder/pull/37
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127812
Approved by: https://github.com/hongxiayang, https://github.com/malfet
Summary: Uses original ExportedProgram constants and graph signature to inform decompositions, so that constant tensors and non-persistent buffers are respected for training IR. Removes 7 test failures for training IR.
Test Plan: test_export
Differential Revision: D59820909
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130864
Approved by: https://github.com/angelayi
Summary: `collect_defined_kernels()` is essentially patching deep inside to see if a specific codegen is happening. We could also patch somewhere in the cache path to make sure it's called, but I'm not sure that's really testing anything interesting. I suggest it's better to just disable the remote cache here.
Test Plan: `buck2 test -j 18 'fbcode//mode/opt' fbcode//caffe2/test/inductor:metrics -- --exact 'caffe2/test/inductor:metrics - test_kernel_args_num_gb (caffe2.test.inductor.test_metrics.TestMetrics)' --run-disabled --stress-runs 10`
Differential Revision: D59825899
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130853
Approved by: https://github.com/oulgen
In this PR, I added support for packaging the AOTI generated files into a zipfile, and loading it in python.
`compile_so` takes the path to the package, a device, and a desired so_path location, and compiles package into a .so, and saves to the specified location.
`load_package` takes a path to the package and device, calls _extract_so, and then creates a callable to run the compiled model.
The zipfile generated looks like the following:
```
|- version
|- archive_format
|- data
   |- aotinductor
      |- cbtnafqaqrhvwztv7xudlal4xs6sofxa5oxccyuaqtrt6aozaklx.cubin # AOTI cuda generated cubin files
      |- cskkqtna23bty2v3aq7g2q37cxrgufehlkuaaolhlgug5zg6fuwe.cpp # AOTI generated cpp file
      |- cskkqtna23bty2v3aq7g2q37cxrgufehlkuaaolhlgug5zg6fuwe_compile_flags # Flags for compiling the .o
      |- c6qqtnpgwfi3dv5nb76ai773kt45ezoxfwdmd7q37lvq6fs2tnoi.o # AOTI saved const.o
      |- cskkqtna23bty2v3aq7g2q37cxrgufehlkuaaolhlgug5zg6fuwe_linker_flags # Flags for linking the files to form the .so
   |- constants
      |- constants.pt # Constants saved using torch.save, can be loaded using mmap
```
The workflow is something like:
```
with torch.no_grad():
    ep = torch.export.export(
        model,
        example_inputs,
        dynamic_shapes=dynamic_shapes,
        strict=False,
    )
    gm = ep.module()
    package_path = torch._inductor.aot_compile(
        gm,
        example_inputs,
        options={
            "aot_inductor.output_path": "my_path.pt2",  # or a directory
            "aot_inductor.package": True,
        },
    )
compiled_model = torch._inductor.package.load_package(package_path, device)
return compiled_model
```
I tried turning on loading the weights using mmap by default, but ran into some trouble with it, so that is left as a TODO.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129895
Approved by: https://github.com/malfet
Summary:
This diff does a minor cleanup of WaitCounters:
1. Fixes some singleton use to ensure one instance of WaitCounterImpl per counter per process
2. Updates API to enable measuring duration of individual wait operations
Test Plan: unit test
Differential Revision: D59709324
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130664
Approved by: https://github.com/c-p-i-o, https://github.com/asiab4
Enables a few extra ruff rules, most of which do not have any violations since I already cleaned them up in earlier PRs; this change just turns them on to enforce them. Adds one noqa, as we want the suboptimal lambda generation + call kept as a test. Also enables the test in flake8.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130700
Approved by: https://github.com/justinchuby, https://github.com/ezyang
Adds better error messages when a socket fails to bind in libuv.
New format:
```
The server socket has failed to bind. port: 1, useIpv6: 0, code: -13, name: EACCES, message: permission denied
```
Old format:
```
The server socket has failed to listen on any local network address. useIpv6: 0, code: -98, name: EADDRINUSE, message: address already in use
```
Test plan:
Added test in `test_store.py`
```
python test/distributed/test_store.py
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130797
Approved by: https://github.com/kurman
This is useful for splitting grad to run in two parts while preserving intermediates:
<details>
<summary>
Click to see code
</summary>
```python
import collections
import weakref
from typing import Deque

import torch
from torch.autograd.graph import GradientEdge

def _get_grad_fn_or_grad_acc(t):
    if t.requires_grad and t.grad_fn is None:
        return t.view_as(t).grad_fn.next_functions[0][0]
    else:
        return t.grad_fn

def reverse_closure(roots, target_nodes):
    # Recurse until we reach a target node
    closure = set()
    actual_target_nodes = set()
    q: Deque = collections.deque()
    for node in roots:
        if node is not None and node not in closure:
            closure.add(node)
            q.append(node)
    while q:
        node = q.popleft()
        reverse_edges = node.metadata.get("reverse_edges", [])
        for holder_ref, idx in reverse_edges:
            ref = holder_ref()
            if ref is None:
                raise RuntimeError("Reverse graph is no longer alive")
            fn = ref.node
            if fn in closure or fn is None:
                continue
            if fn in target_nodes:
                actual_target_nodes.add(fn)
                continue
            closure.add(fn)
            q.append(fn)
    return closure, actual_target_nodes

# Enable weak pointer
class Holder():
    def __init__(self, node):
        self.node = node

# TODO: use weak references to avoid reference cycle
def construct_reverse_graph(roots):
    q: Deque = collections.deque()
    root_seen = set()
    reverse_graph_refs = []
    for node in roots:
        if node is not None and node not in root_seen:
            q.append(node)
            root_seen.add(node)
    while q:
        node = q.popleft()
        for fn, idx in node.next_functions:
            if fn is not None:
                # Don't necessarily need to store on the graph
                reverse_edges = fn.metadata.get("reverse_edges", [])
                if len(reverse_edges) == 0:
                    q.append(fn)
                holder = Holder(node)
                holder_ref = weakref.ref(holder)
                reverse_graph_refs.append(holder)
                reverse_edges.append((holder_ref, idx))
                fn.metadata["reverse_edges"] = reverse_edges
    return reverse_graph_refs

def get_param_groups(inputs, params):
    inputs_closure, _ = reverse_closure(inputs, set())
    param_groups = dict()  # keyed on intermediates
    for i, param in enumerate(params):
        closure, intersected = reverse_closure([param], inputs_closure)
        param_group = {
            "params": set([param]),
            "intermediates": set(intersected),
        }
        for input_node in intersected:
            existing = param_groups.get(input_node, None)
            if existing is not None:
                existing["params"] = existing["params"].union(param_group["params"])
                existing["intermediates"] = existing["intermediates"].union(param_group["intermediates"])
                param_group = existing
            else:
                param_groups[input_node] = param_group
    # Sanity check: union of all param_groups params should be equal to all params
    union_params = set()
    seen_ids = set()
    unique_param_groups = []
    for param_group in param_groups.values():
        if id(param_group) not in seen_ids:
            seen_ids.add(id(param_group))
            unique_param_groups.append(param_group)
            union_params = union_params.union(param_group["params"])
    assert union_params == set(params)
    return unique_param_groups

def compute_grads_only_inputs2(roots, inps, weights):
    root_grad_fns = list(map(_get_grad_fn_or_grad_acc, roots))
    inp_grad_fns = list(map(_get_grad_fn_or_grad_acc, inps))
    weight_grad_fns = list(map(_get_grad_fn_or_grad_acc, weights))
    reverse_graph_refs = construct_reverse_graph(root_grad_fns)
    param_groups = get_param_groups(inp_grad_fns, weight_grad_fns)
    del reverse_graph_refs
    for param_group in param_groups:
        for i, intermediate in enumerate(param_group["intermediates"]):
            def get_hook(param_group, i):
                def hook(grad_inputs):
                    if param_group.get("grads", None) is None:
                        param_group["grads"] = [None] * len(param_group["intermediates"])
                    param_group["grads"][i] = grad_inputs
                return hook
            # These are always "split" nodes that we need to recompute, so
            # save their inputs.
            intermediate.register_prehook(get_hook(param_group, i))
    # Note: `out` is the global output tensor defined in the driver snippet below.
    dinputs = torch.autograd.grad((out,), inputs=tuple(inps), grad_outputs=(torch.ones_like(out),), retain_graph=True)
    return dinputs, param_groups

def compute_grads_only_weights2(user_weights, param_groups):
    all_dweights = dict()
    for param_group in param_groups:
        # TODO: Handle case where intermediate can have multiple outputs
        intermediate_edges = tuple(GradientEdge(i, 0) for i in param_group["intermediates"])
        weights_edges = tuple(GradientEdge(w, 0) for w in param_group["params"])
        assert all(len(g) == 1 for g in param_group["grads"])
        # [NEW!] Able to pass a GradientEdge to autograd.grad as output
        # We do not need to retain_graph because... guarantee no overlap?
        print("trying to execute: ", intermediate_edges, weights_edges)
        dweights = torch.autograd.grad(intermediate_edges, weights_edges, grad_outputs=sum(param_group["grads"], tuple()))
        for w, dw in zip(param_group["params"], dweights):
            all_dweights[w] = dw
    # return grads in the original order weights were provided in
    out = []
    for w in user_weights:
        grad_acc = _get_grad_fn_or_grad_acc(w)
        out.append(all_dweights[grad_acc])
    return tuple(out)
```
</details>
```python
import torch
import torch.nn as nn

# Setup
mod1 = nn.Linear(10, 10)
mod2 = nn.Linear(10, 10)
a = torch.rand(10, requires_grad=True)
weights = tuple(mod1.parameters()) + tuple(mod2.parameters())
inps = (a,)
out = mod2(mod1(a))

class LoggingTensorMode(torch.utils._python_dispatch.TorchDispatchMode):
    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        if kwargs is None:
            kwargs = {}
        rs = func(*args, **kwargs)
        print(f"{func.__module__}.{func.__name__}")
        return rs

print(" -- SPLIT -- ")
# Compute gradients in two parts
with LoggingTensorMode():
    print("PART 1")
    dinputs, state = compute_grads_only_inputs2((out,), inps, weights)
    print("PART 2")
    dweights = compute_grads_only_weights2(weights, state)

out = mod2(mod1(a))
print(" -- REF -- ")
# Compare with reference
with LoggingTensorMode():
    ref_all_gradients = torch.autograd.grad(out, inputs=tuple(inps) + weights, grad_outputs=(torch.ones_like(out),))
for actual, ref in zip(dinputs + dweights, ref_all_gradients):
    print(torch.allclose(actual, ref))
```
<img width="598" alt="image" src="https://github.com/pytorch/pytorch/assets/13428986/3681b8a7-3ab4-4d1d-a836-abef6913e671">
```
PART 1
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.ones_like.default
V0603 10:17:21.590878 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x12a1ee160> with grad_outputs: [f32[10]]
torch._ops.aten.view.default
V0603 10:17:21.591204 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1ee0d0> with grad_outputs: [f32[1, 10]]
torch._ops.aten.t.default
torch._ops.aten.mm.default
V0603 10:17:21.591578 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x100d7ae50> with grad_outputs: [f32[1, 10]]
torch._ops.aten.view.default
V0603 10:17:21.591747 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x12a1e4a60> with grad_outputs: [f32[10]]
torch._ops.aten.view.default
V0603 10:17:21.591834 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1e4bb0> with grad_outputs: [f32[1, 10]]
torch._ops.aten.t.default
torch._ops.aten.mm.default
V0603 10:17:21.591922 8300067520 torch/autograd/graph.py:751] Executing: <ViewBackward0 object at 0x12a1e4a90> with grad_outputs: [f32[1, 10]]
torch._ops.aten.view.default
PART 2
trying to execute: (GradientEdge(node=<AddmmBackward0 object at 0x12a1e4bb0>, output_nr=0),) (GradientEdge(node=<AccumulateGrad object at 0x12a21b130>, output_nr=0), GradientEdge(node=<AccumulateGrad object at 0x12a21b7c0>, output_nr=0))
V0603 10:17:21.592223 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1e4bb0> with grad_outputs: [f32[1, 10]]
torch._ops.aten.t.default
torch._ops.aten.mm.default
torch._ops.aten.t.default
torch._ops.aten.sum.dim_IntList
torch._ops.aten.view.default
V0603 10:17:21.592421 8300067520 torch/autograd/graph.py:751] Executing: <TBackward0 object at 0x12a1cad60> with grad_outputs: [f32[10, 10]]
torch._ops.aten.t.default
trying to execute: (GradientEdge(node=<AddmmBackward0 object at 0x12a1ee0d0>, output_nr=0),) (GradientEdge(node=<AccumulateGrad object at 0x12a1e41c0>, output_nr=0), GradientEdge(node=<AccumulateGrad object at 0x12a21b670>, output_nr=0))
V0603 10:17:21.593481 8300067520 torch/autograd/graph.py:751] Executing: <AddmmBackward0 object at 0x12a1ee0d0> with grad_outputs: [f32[1, 10]]
torch._ops.aten.t.default
torch._ops.aten.mm.default
torch._ops.aten.t.default
torch._ops.aten.sum.dim_IntList
torch._ops.aten.view.default
V0603 10:17:21.593750 8300067520 torch/autograd/graph.py:751] Executing: <TBackward0 object at 0x12a21b2b0> with grad_outputs: [f32[10, 10]]
torch._ops.aten.t.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
torch._ops.aten.view.default
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127766
Approved by: https://github.com/albanD
Summary:
We should log compile ID as well for easier comparison.
Currently going through some of this data, I think we should make a few more changes as well.
Reland for D59725870
Test Plan: Sandcastle and Pytorch
Differential Revision: D59789110
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130801
Approved by: https://github.com/oulgen
Summary: Similar to the handling of metrics, save inductor counter deltas in the FX graph cache entry and increment the counters appropriately on a cache hit
Test Plan: new unit test
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130635
Approved by: https://github.com/eellison
beartype has served us well in identifying type errors and ensuring we call internal functions with the correct arguments (thanks!). However, the value of having beartype is diminished because of the following:
1. When beartype improved support for better Dict[] type checking, it discovered typing mistakes in some functions that were previously uncaught. This caused the exporter to fail with newer versions of beartype where it used to succeed. Since we cannot fix PyTorch and release a new version just because of this, it creates confusion for users that have beartype in their environment when using torch.onnx.
2. beartype adds an additional call line in the traceback, which makes the already thick dynamo stack even larger, affecting readability when users diagnose errors with the traceback.
3. Since the typing annotations need to be evaluated, we cannot use new syntaxes like `|` because we need to maintain compatibility with Python 3.8. We don't want to wait for PyTorch to adopt py310 as the lowest supported Python version before using the new typing syntaxes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130484
Approved by: https://github.com/titaiwangms
Summary: Adds non-strict implementation of training IR export. Any expected non-strict training IR failures are also either existing strict training IR or non-strict failures (no new failures added). 4 strict training IR failures also resolved.
Refraining from unifying export/export_for_training, per @ydwu4's feedback :)
Test Plan: added test_export_training_ir_to_run_decomp_non_strict.py for non-strict training IR
Differential Revision: D59349454
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130062
Approved by: https://github.com/ydwu4, https://github.com/zhxchen17
Summary:
Move the alloc_trace logic into a separate class, to reduce risk of deadlocks when mixing with CCA's lock. Switch to an std::mutex instead of std::recursive_mutex.
Lets us re-use the logic in the TraceEntryRingBuffer class for later diffs.
Test Plan: CI, resnet run, and FBR model.
Differential Revision: D59690408
Pulled By: aaronenyeshi
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130741
Approved by: https://github.com/davidberard98
Uses `dict.fromkeys` whenever possible, as covered by flake8-comprehensions rule C420. While the ruff rule RUF025 is still in preview, flake8-comprehensions has added a new rule which covers this. `dict.fromkeys` is faster when the value being added to the dictionary is the same at every iteration and is immutable, and it also removes an unnecessary dict comprehension.
This rule will be enabled with our current ruleset in RUF in 0.6 as C420.
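For illustration, the pattern the rule rewrites:
```python
keys = ["a", "b", "c"]

d1 = {k: None for k in keys}   # flagged by C420 / RUF025
d2 = dict.fromkeys(keys)       # equivalent; value defaults to None
d3 = dict.fromkeys(keys, 0)    # shared immutable value

assert d1 == d2
```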
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130699
Approved by: https://github.com/lezcano, https://github.com/ezyang
Summary:
By default, performance tests (speedup experiments) will run the baseline and test backend alternately.
However, this does not work for the torchao backend, which changes the model in place; as a result, the baseline run would also use the torchao backend, since the model has already been quantized.
Add a new experiment, "latency_experiment", to run performance tests non-alternately (first run the baseline for a few iterations, then run the test backend).
Test Plan:
```
buck2 run mode/opt //pytorch/benchmark:pt2 -- --only AlbertForMaskedLM --quantization noquant --performance --inference --bfloat16
```
```
buck2 run mode/opt //pytorch/benchmark:pt2 -- --only AlbertForMaskedLM --quantization autoquant --performance --inference --bfloat16 --inductor-compile-mode max-autotune
```
Differential Revision: D59332736
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130136
Approved by: https://github.com/jerryzh168
The conversion cache used for fixing https://github.com/pytorch/pytorch/issues/115260 depended on "store", which might be removed and ignored. This could lead to inconsistent code generated between the vectorized and scalar kernels: we generate the scalar kernel first, followed by the vector kernel, and the store buffer might be removed by the scalar kernel, which then impacts the vector kernel codegen. This PR moves the caching from "store" to the "to_dtype" calls, which are not impacted by removed buffers.
`pytest -k test_consistent_remove_buffers test/inductor/test_cpu_repro.py`
before
```c++
extern "C" void kernel(const bfloat16* in_ptr0,
bfloat16* out_ptr1)
{
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(64L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x0), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = tmp1 + tmp1;
auto tmp3 = at::vec::convert<bfloat16>(tmp2);
auto tmp4 = at::vec::convert<float>(tmp3);
auto tmp5 = tmp1 + tmp4;
auto tmp6 = at::vec::convert<bfloat16>(tmp5);
tmp6.store(out_ptr1 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(64L); x0<static_cast<long>(65L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr0[static_cast<long>(x0)];
auto tmp1 = c10::convert<float>(tmp0);
auto tmp2 = decltype(tmp1)(tmp1 + tmp1);
auto tmp3 = c10::convert<bfloat16>(tmp2);
auto tmp4 = decltype(tmp1)(tmp1 + tmp2);
auto tmp5 = c10::convert<bfloat16>(tmp4);
out_ptr1[static_cast<long>(x0)] = tmp5;
}
}
}
```
after
```c++
extern "C" void kernel(const bfloat16* in_ptr0,
bfloat16* out_ptr1)
{
{
for(long x0=static_cast<long>(0L); x0<static_cast<long>(64L); x0+=static_cast<long>(16L))
{
auto tmp0 = at::vec::Vectorized<bfloat16>::loadu(in_ptr0 + static_cast<long>(x0), 16);
auto tmp1 = at::vec::convert<float>(tmp0);
auto tmp2 = tmp1 + tmp1;
auto tmp3 = at::vec::convert<bfloat16>(tmp2);
auto tmp4 = tmp1 + tmp2;
auto tmp5 = at::vec::convert<bfloat16>(tmp4);
tmp5.store(out_ptr1 + static_cast<long>(x0), 16);
}
#pragma omp simd simdlen(8)
for(long x0=static_cast<long>(64L); x0<static_cast<long>(65L); x0+=static_cast<long>(1L))
{
auto tmp0 = in_ptr0[static_cast<long>(x0)];
auto tmp1 = c10::convert<float>(tmp0);
auto tmp2 = decltype(tmp1)(tmp1 + tmp1);
auto tmp3 = c10::convert<bfloat16>(tmp2);
auto tmp4 = decltype(tmp1)(tmp1 + tmp2);
auto tmp5 = c10::convert<bfloat16>(tmp4);
out_ptr1[static_cast<long>(x0)] = tmp5;
}
}
}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130677
Approved by: https://github.com/leslie-fang-intel
# Motivation
I found a difference between sympy 1.12 and 1.13.
```python
# for 1.12
>>> import sympy
>>> a = sympy.Number(0.0)
>>> a == 0
True
```
```python
# for 1.13
>>> import sympy
>>> a = sympy.Number(0.0)
>>> a == 0
False
```
The different behavior impacts the result of [safe_mul](6beec34b1c/torch/utils/_sympy/value_ranges.py (L521-L528)): when `a = sympy.Number(0.0)` and `b = inf`, the result is `nan` with sympy 1.13 (the expected result is **0**).
```python
def safe_mul(a, b):
    # Make unknown() * wrap(0.0) == wrap(0.0)
    if a == 0.0:
        return a
    elif b == 0.0:
        return b
    else:
        return a * b
```
Across different sympy versions, `sympy.Number(0)` behaves consistently and compares equal to 0.0.
```python
>>> import sympy
>>> a = sympy.Number(0)
>>> a == 0.0
True # for different sympy versions
```
So, use 0.0 when checking for zero in safe_mul to stay compatible with different sympy versions.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130729
Approved by: https://github.com/lezcano, https://github.com/EikanWang
As titled, this fixes a case where, when `ord` is passed as 2 (the default value), op dispatching does not receive the default value.
We simply check whether the args schema receives an `ord` field or not.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130753
Approved by: https://github.com/awgu
My attempt at a fix for https://github.com/pytorch/pytorch/issues/130335, see issue for more details / internal xref. Any feedback from inductor folks is appreciated. I attempted to make the move-constructors-to-cuda pass a bit less aggressive by detecting when the movement would incur a H2D sync for `aten.index_put_`. I'm not sure if there are any other ops that inductor falls back to eager on, that may-or-may-not incur a H2D sync if we change any of their inputs from cpu to cuda.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130338
Approved by: https://github.com/eellison
This PR marks all buffers and parameters of an NNModule as static using the `mark_static_address` API. As a result, when tensors are passed to AOT, the `tensor_dict` metadata of placeholder nodes will contain the `static_address_type` key, indicating which graph argument positions are static for cudagraphs.
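For reference, a sketch of what the underlying API does for a single tensor; this PR applies it to every parameter and buffer of an nn.Module automatically:
```python
import torch

buf = torch.randn(16)
torch._dynamo.mark_static_address(buf)
# Dynamo records `static_address_type` in the placeholder's `tensor_dict` metadata,
# letting cudagraphs treat this graph input's address as stable.
```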
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130391
Approved by: https://github.com/anijain2305
Extend constant folding to dynamic-shape nodes, supporting only pointwise ops and some other restricted ops.
We support dynamic shapes by limiting constant folding to ops that are guaranteed to have uniform values (full, pointwise ops, and views) and running these operators with tensors of shape 1. This also eliminates the potential memory overhead of constant folding.
Taken over from https://github.com/pytorch/pytorch/pull/128937
joint work with @imzhuhl
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129686
Approved by: https://github.com/Chillee
ghstack dependencies: #130367
This PR allows `fully_shard`'s first argument to be `List[nn.Module]` instead of strictly `nn.Module`. This allows more flexible grouping of modules/parameters for communication, which can lead to memory savings and/or more efficient communication.
**Approach**
At a high level, we can think of a model as a tree of modules. Previously, we could only select specific module nodes in this tree as representing one FSDP parameter group. With this PR, we can select a group of module nodes, effectively becoming a single super node.
To implement the runtime schedule, we define new forward hooks that run based on the following semantics:
- If a module is the first to run the pre-hook, actually run the given pre-hook. Otherwise, the pre-hook is a no-op.
- If a module is the last to run the post-hook, actually run the given post-hook. Otherwise, the post-hook is a no-op.
- First and last are determined by scoreboarding against a set of the modules.
- This set must get cleared at the end of backward in the case that >=1 module in the list is never used, in which case we still want the forward hooks to run in the next forward after this backward.
Beyond these new forward hooks, everything else is some simple generalization from `Module` to `List[Module]` or `Tuple[Module, ...]`.
**Examples**
This PR enables wrapping Llama models more efficiently by grouping the final norm and output linear together: https://github.com/pytorch/torchtitan/pull/382.
If at least one of the modules in the list does not run forward before backward, then there will be a warning message like:
```
1 of the 2 modules passed to fully_shard did not run forward before backward, which is error-prone since FSDP post-forward/pre-backward logic will not run for these modules. We recommend passing only modules that run forward together. Modules that did not run forward: [FSDPLinear(in_features=1, out_features=1, bias=True)]
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127786
Approved by: https://github.com/yf225, https://github.com/weifengpy
ghstack dependencies: #127773
This PR relaxes `@contract` to allow the 1st argument to be `Sequence[nn.Module]` instead of strictly `nn.Module`. This is required for the next PR, which allows `fully_shard` to take in `List[nn.Module]`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127773
Approved by: https://github.com/weifengpy
This PR adds support to use expandable segments with private memory pools which should unblock using it with cuda graphs and cuda graph trees. Currently, the allocator silently avoids using expandable segments when allocating in a private pool due to checkpoint saving/restoring not meshing well with how we keep track of unmapped blocks.
The PR itself is pretty short, most of the logic for checkpointing and reapplying state for non-expandable segments transfers over without much work.
Expandable segments reserve a virtual address space of size equal to the amount of physical memory on the GPU. Every time we want to `malloc()` or `free()` memory in a memory pool with expandable segments turned on, we map/unmap pages of physical GPU memory under the hood to create a new block that we return to the caller. This is beneficial due to the fact that each memory pool functions as a single segment of memory with a contiguous block of memory addresses that can grow and shrink as needed, avoiding fragmentation from allocating multiple non-contiguous segments that may not be merged together.
The caching allocator handles this by creating an unmapped block for the entire reserved virtual address space at init, which is treated similarly to an unallocated block in a free pool. When callers call `malloc()`, it's split and mapped to create allocated blocks, and calling `free()` similarly caches and merges free blocks in a free pool to be used later. Expandable blocks are unmapped and returned to Cuda when they are cleaned up, or when we hit an OOM and the allocator attempts to remap cached free blocks. The code paths to map, free, and unmap blocks in expandable segments are similar to those for normal blocks and do all the same work of updating stats on memory usage, moving blocks between active and free pools, and returning memory to Cuda.
With Cuda Graph Trees and private memory pools, we need the ability to take checkpoints of the current state of the memory allocator after each graph capture as well as reapplying the state before capturing a new graph after replaying a captured graph so that the new cuda graph capture has access to the state of the allocator at the point after replaying a previously captured graph so it can reuse empty blocks and allocate new ones.
As mentioned in a below comment, memory in a private pool is cached until the private pool is destroyed and allocations can only grow from extra graph captures, any freeing of memory would result in invalid memory addresses and would break cuda graphs.
One implementation detail to note for unmapped blocks with expandable segments is that unmapped blocks are tracked in a member variable `unmapped` of a `BlockPool`. `unmapped` is *not* part of the checkpointed state of the caching allocator and isn't restored when reapplying checkpoints, since we never free/unmap memory back to cuda; it persists across graph captures / replays.
Checkpointing the current state of the memory allocator works as expected with expandable segments. Checkpointing grabs the first block of every segment in the active and free pools of the private pool and traverses the linked list of blocks in the segment to capture the state of every segment, which is then saved and kept for when it is needed to be reapplied. For expandable blocks, the last block in every segment will be an unallocated unmapped block containing the remaining amount of unmapped memory at graph capture time, and this too is saved in the checkpoint.
Reapplying the checkpoints works by freeing all allocated blocks and merging them into a single block per segment, then for each segment, we manually split and allocate all blocks from the checkpoint and then free the blocks marked as unallocated in the checkpoint state. For expandable segments, we need to make some modifications to not split unmapped blocks and avoid manually mapping then freeing unmapped blocks.
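A sketch of the end-to-end setting this unblocks, assuming a CUDA build (the allocator config must be set before CUDA is initialized):
```python
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # before CUDA init

import torch

@torch.compile(mode="reduce-overhead")  # cudagraph trees allocate from a private pool
def f(x):
    return torch.relu(x) + 1

x = torch.randn(1024, device="cuda")
for _ in range(3):
    y = f(x)
```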
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128068
Approved by: https://github.com/eqy, https://github.com/eellison
Summary: The scalar tensor is on CPU by default, which fails cuda graph capture. To fix the issue, we put the scalar tensor on the GPU.
Test Plan: buck2 test 'fbcode//mode/opt' fbcode//gen_ai/llm_inference/fb/tests:test_llama2_multimodal_generator -- --exact 'gen_ai/llm_inference/fb/tests:test_llama2_multimodal_generator - gen_ai.llm_inference.fb.tests.test_llama2_multimodal_generator.TestGenerator: test_multimodal_decode_gen2'
Differential Revision: D59740639
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130712
Approved by: https://github.com/Skylion007, https://github.com/chenyang78
This PR introduces AutoHeuristic, a framework to collect results from autotuning, learn a heuristic as a machine learning model (a regression tree), and then ship the learned heuristic by generating the regression tree to code.
The heuristics have been learned on artificial/random data that has been collected with the `gen_data_pad_mm.py` script. The `gen_pad_mm_a100.sh` scripts can then be used to learn a heuristic and generate it to code.
The best model is decided by doing a grid search over various values for `max_depth` and `min_samples_leaf` and choosing the model with the highest number of correct predictions on the validation set.
The heuristic can return "unsure" which means that it is not sure which choice is the best choice and as a result autotuning will happen.
On A100 only tensors where each dimension is >= 512 are considered. For smaller tensors the heuristics that I learned returned "unsure" too often.
The results for randomly generated data and huggingface look as follows:
`max_wrong_speedup` is max(`wrong_speedups`) where `wrong_speedups` contains all the speedups one could have achieved for those examples where the heuristic made a wrong choice, i.e. a `max_wrong_speedup` of 1.37 means that the heuristic selected a choice, but the other choice would have been 1.37x faster. `gman_wrong_speedup` is the geomean of `wrong_speedups`.
The heuristic is learned as a regression tree, that returns higher values for better choices. The threshold decides how much better the better choice has to be for it to be returned, i.e. on A100 if the better choice is less than 1.702530x better than the other choice, "unsure" will be returned. This threshold is determined using the validation set.
A100
```
max_depth min_samples_leaf dataset correct wrong unsure total max_wrong_speedup gman_wrong_speedup threshold
15 5.0 10 train 2730 4 3023 5757 1.372220 1.193873 1.702530
16 5.0 10 val 878 0 1042 1920 NaN NaN 1.702530
17 5.0 10 test 925 2 993 1920 1.741708 1.354954 1.702530
18 5.0 10 hf-train 14 0 22 36 NaN NaN 1.702530
19 5.0 10 hf-inf 7 0 1 8 NaN NaN 1.702530
```
The numbers for huggingface only include tensors where each dim is >=512. If all tensors would have been included there would have been the following number of matmuls, where at least one dimension is unaligned:
A100 hf-train: 60
A100 hf-inf: 10
## Results on running huggingface locally
This only includes models where the learned heuristic made at least one decision. For the examples here, it takes around 0.25-0.3 seconds to perform autotuning for the padded and unpadded version, so each decision that the heuristic makes saves around 0.25-0.3 seconds.
#pad_mm_autotuning is the number of times autotuning happened in pad_mm and #heuristic_made_decision is the number of times the heuristic made a decision (i.e. it didn't return "unsure").
I ran huggingface locally, each model 5 times and took the median speedup and compilation_latency.
Results on huggingface training
```
name speedup_heuristic speedup_baseline speedup_diff compilation_latency_heuristic compilation_latency_baseline compilation_latency_diff comp_latency_reduction% #pad_mm_autotuning #heuristic_made_decision
BartForCausalLM 1.19 (+/- 0.00) 1.19 (+/- 0.00) -0.00 40.33 (+/- 1.13) 40.95 (+/- 0.78) -0.62 1.52 3 2
BartForConditionalGeneration 1.53 (+/- 0.06) 1.47 (+/- 0.05) 0.06 81.93 (+/- 5.20) 82.23 (+/- 1.92) -0.30 0.36 3 1
BlenderbotSmallForCausalLM 1.86 (+/- 0.04) 1.86 (+/- 0.00) 0.00 36.76 (+/- 0.49) 37.62 (+/- 1.33) -0.87 2.31 3 2
CamemBert 2.36 (+/- 0.01) 2.35 (+/- 0.01) 0.01 97.60 (+/- 1.91) 98.69 (+/- 1.35) -1.09 1.11 2 1
DistillGPT2 2.57 (+/- 0.01) 2.57 (+/- 0.01) 0.00 57.33 (+/- 0.77) 58.26 (+/- 1.41) -0.93 1.59 3 2
PLBartForCausalLM 2.07 (+/- 0.01) 2.06 (+/- 0.01) 0.01 32.54 (+/- 0.83) 34.65 (+/- 0.71) -2.11 6.10 3 2
PLBartForConditionalGeneration 1.87 (+/- 0.00) 1.88 (+/- 0.00) -0.01 58.45 (+/- 1.24) 58.95 (+/- 1.92) -0.50 0.85 3 1
RobertaForCausalLM 2.39 (+/- 0.01) 2.40 (+/- 0.01) -0.01 97.38 (+/- 1.52) 97.69 (+/- 1.18) -0.31 0.32 2 1
TrOCRForCausalLM 1.70 (+/- 0.00) 1.70 (+/- 0.00) -0.00 44.79 (+/- 1.33) 45.25 (+/- 1.08) -0.46 1.01 3 2
Mean difference in speedup: 0.01
Mean compilation latency saved: -0.80s
Mean compilation latency reduction: 1.68%
```
Results on huggingface inference
```
name speedup_heuristic speedup_baseline speedup_diff compilation_latency_heuristic compilation_latency_baseline compilation_latency_diff comp_latency_reduction% #pad_mm_autotuning #heuristic_made_decision
BartForCausalLM 1.11 (+/- 0.00) 1.11 (+/- 0.00) 0.00 19.02 (+/- 0.28) 19.40 (+/- 0.35) -0.38 1.95 3 2
BartForConditionalGeneration 1.26 (+/- 0.01) 1.23 (+/- 0.03) 0.03 36.84 (+/- 0.40) 36.55 (+/- 0.75) 0.30 -0.81 3 1
BlenderbotSmallForCausalLM 1.87 (+/- 0.02) 1.87 (+/- 0.01) 0.00 17.53 (+/- 0.31) 18.03 (+/- 0.43) -0.49 2.74 3 2
DistillGPT2 2.50 (+/- 0.02) 2.50 (+/- 0.01) 0.00 16.16 (+/- 0.29) 16.40 (+/- 0.18) -0.24 1.46 3 2
PLBartForCausalLM 1.93 (+/- 0.01) 1.94 (+/- 0.01) -0.00 15.30 (+/- 0.22) 16.01 (+/- 0.71) -0.71 4.43 3 2
PLBartForConditionalGeneration 1.98 (+/- 0.01) 1.98 (+/- 0.01) 0.00 25.90 (+/- 0.32) 26.58 (+/- 0.62) -0.67 2.53 3 1
TrOCRForCausalLM 1.61 (+/- 0.00) 1.62 (+/- 0.00) -0.01 21.38 (+/- 0.37) 21.85 (+/- 0.16) -0.47 2.16 3 2
Mean difference in speedup: 0.00
Mean compilation latency saved: -0.38s
Mean compilation latency reduction: 2.07%
```
For now, the heuristic can only be applied to decide whether to pad for mm. One could also learn heuristics for bmm and addmm.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128643
Approved by: https://github.com/Chillee, https://github.com/eellison
Summary: Looks like "spawn" is broken. Since we have "subprocess", I don't think we need it any more, so just remove it as an option.
Test Plan: Verified that we get: `AssertionError: Invalid start method: spawn`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130746
Approved by: https://github.com/Skylion007
#### Issue
Fix two issues related to inputs lifting when there are sub-blocks.
* Some inputs may appear in the nested sub-blocks, which requires a recursive search to identify which arguments need to be lifted / passed into the top-level block.
* Some inputs to the sub-block are intermediate results, meaning their names are just numbers. This causes issues during code generation (i.e., invalid argument names). We rename those to valid names.
#### Test Plan
* `pytest test/export/test_converter.py -s -k test_convert_nn_module_with_nested_if_and_param`
* `test/export/test_converter.py -s -k test_hidden_input_name`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128386
Approved by: https://github.com/angelayi
This is the implementation following the RFC: https://github.com/pytorch/pytorch/issues/130407
ncclCommSplit
Summary:
In current Pytorch/c10d, the new_group API is used to create a new
process group from the default pg. When device_id is specified in
init_process_group and nccl is used as the backend, the new_group call
will use ncclCommSplit to create the nccl communicators to save
communicator resources. It has a few drawbacks:
1. Redundant calls: suppose the default group has 256 ranks and we need 32 child PGs, each with 8 ranks. In this case, each rank needs to call new_group and ncclCommSplit 32 times because of how the new_group API is implemented and because of the collective requirement of ncclCommSplit. For a specific global rank, 31 of the ncclCommSplit calls would be no-color splits and only 1 would be a colored split. With the proposed split_group API, we expect only 1 call of split_group/ncclCommSplit per rank to be needed in the example above (see the sketch below).
2. new_group can only split from the default PG: ideally, a new PG should be able to be split from any PG.
With the new split_group API, users can create new PGs using ncclCommSplit with fewer calls and initialize the PG eagerly. This is also useful when creating many P2P communicators.
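To make drawback 1 concrete, a sketch of today's behavior with the existing `new_group` API (every rank participates in creating every subgroup, even the 31 it does not belong to):
```python
import torch.distributed as dist

# Assumes init_process_group has already been called on 256 ranks.
world_size = 256
group_size = 8

subgroups = []
for start in range(0, world_size, group_size):
    ranks = list(range(start, start + group_size))
    # Called collectively by all 256 ranks for each of the 32 subgroups.
    subgroups.append(dist.new_group(ranks=ranks))

# The proposed split_group API aims to reduce this to a single split per rank.
```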
Test Plan:
New UTs:
e.g., python test/distributed/test_c10d_nccl.py -k
test_comm_split_group_larger_scale
Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130507
Approved by: https://github.com/wconstab
Summary:
As a follow-up to https://github.com/pytorch/pytorch/pull/130454, users are hitting the cross-mesh operation error because the DeviceMesh thread ID differs between the saved and the loaded DTensor.
This is a hot fix to only consider the real thread_id in the DeviceMesh hash under the threaded backend, and to set it to None in all other cases.
As a follow up, we need to look at the following test failures to better root cause specific DeviceMesh related failures related to MTPG, if thread_id is not included as part of the hash.
```
test/distributed/_composable/fsdp/test_fully_shard_training.py::TestFullyShardRegisteredParams::test_param_registration_after_forward
test/distributed/_tensor/test_dtensor_ops.py::TestDTensorOpsCPU::test_dtensor_op_db_column_stack_cpu_float32
```
Adding an additional is_initialized() check, since APF has a test mocking the backend without a PG initialized; we need the is_initialized() check to avoid a test failure. In a real use case, a PG should be initialized before the get_backend() check. Not sure if we want to add this specifically for the test, but temporarily adding it to unblock APF conveyor runs.
Test Plan:
```
[irisz@devgpu051.cln3 /data/users/irisz/fbsource/fbcode (38e4a0a3b)]$ buck2 test 'fbcode//mode/opt' fbcode//apf/distributed/tests:pipeline_parallel_test_cpu -- --exact 'apf/distributed/tests:pipeline_parallel_test_cpu - apf.distributed.tests.pipeline_parallel_test_cpu.PipelineParallelContextTestCPU: test_stage_pg_creation_with_different_backends'
```
Reviewed By: gag1jain
Differential Revision: D59725924
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130685
Approved by: https://github.com/gag1jain
0.12.0 Major Updates:
- Add context manager to temporarily set the dictionary sorting mode
- Add accessor APIs
- Use `stable` tag for `pybind11` for Python 3.13 support
- Fix potential segmentation fault for pickling support
0.12.1 Updates:
- Fix warning regression during import when launch with strict warning filters
Closes #130155
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130139
Approved by: https://github.com/zou3519
Made the following changes:
- mutates_args is now keyword-only and mandatory. This is to align with
torch.library.custom_op (which makes it mandatory because it's easy to
miss)
- op_name is now keyword-only. This helps the readability of the API
- updated all usages of infer_schema
This change is not BC-breaking because we introduced
torch.library.infer_schema a couple of days ago.
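A minimal sketch of the updated call, assuming the public `torch.library.infer_schema` entry point mentioned above:
```python
import torch
from torch.library import infer_schema

def my_sin(x: torch.Tensor) -> torch.Tensor:
    return torch.sin(x)

# mutates_args is keyword-only and mandatory; op_name is keyword-only.
print(infer_schema(my_sin, mutates_args=()))  # e.g. "(Tensor x) -> Tensor"
print(infer_schema(my_sin, op_name="my_sin", mutates_args=()))
```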
Test Plan:
- tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130705
Approved by: https://github.com/yushangdi
# Motivation
Before this PR, device construction yielded the `cuda` type when only a device index was given. It also returns the `PrivateUse1` type if a `PrivateUse1` backend is registered.
```bash
>>> import torch
>>> device = torch.device(0)
>>> device.type
'cuda'
>>> a = torch.tensor([1, 2])
>>> b = a.to(0)
>>> b
tensor([1, 2], device='cuda:0')
```
It works well on CUDA GPUs, but it reports misleading information and raises an error when running on XPU.
```bash
>>> import torch
>>> device = torch.device(0)
>>> device.type
'cuda'
>>> a = torch.tensor([1, 2])
>>> b = a.to(0)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/home/xxx/pytorch/torch/cuda/__init__.py", line 302, in _lazy_init
raise AssertionError("Torch not compiled with CUDA enabled")
AssertionError: Torch not compiled with CUDA enabled
```
With this PR, the logic is refined to use the currently available device type instead.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129119
Approved by: https://github.com/albanD, https://github.com/gujinghui, https://github.com/EikanWang
ghstack dependencies: #129463, #129205, #129363
This is the initial version of an API to create custom operators whose
implementations are backed by triton kernels. While user-defined triton
kernels work out-of-the-box with triton kernels, you may wish to
construct a custom operator if you need to compose with other PyTorch
subsystems, like Tensor subclasses or vmap.
I'm hoping to get design feedback on this and ship it so that we can
begin experimenting with customers.
Test Plan:
- new tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130637
Approved by: https://github.com/albanD
Reduces the guard overhead from 2.1k units to 1k units. Compared to no-inlining (0.4k units), this reduces the slowdown from 5x to 2.5x.
This introduces unsoundness, but only for hooks for inbuilt nn modules (user defined nn module hooks are fine).
Each builtin nn module adds 4 empty ordered dict checks in the check_fn. This blows up for models with large numbers of builtin nn modules. With this PR, we skip those guards. There is no other easy way I can think of right now to control the guard overhead.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130420
Approved by: https://github.com/jansel
ghstack dependencies: #130654
**Summary**
Support more than one local buffer in an outer-loop fused node, and also the case where multiple global buffers share the same local buffer.
**TestPlan**
```
python -u -m pytest -s -v inductor/test_cpu_repro.py -k test_two_local_buffers_in_outer_loop_fusion
python -u -m pytest -s -v inductor/test_cpu_repro.py -k test_share_local_buffers_in_outer_loop_fusion
```
**Next Step**
- [✓] Support more than one Local Buffer/Global Buffer
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129121
Approved by: https://github.com/jgong5, https://github.com/peterbell10
ghstack dependencies: #126967
Summary:
CUDAGraph Trees previously relied on the assumption that static inputs (parameters and buffers) do not change tensor addresses across multiple function invocations. This assumption can be used to reduce the number of tensor copies to improve performance. We also use `check_static_inputs_are_stable()` to check whether this assumption holds at runtime.
While this assumption is True in most cases, we recently observe a few cases that this assumption is not valid:
- [Inline inbuilt nn modules](https://github.com/pytorch/pytorch/pull/126822): the same function (a nn module) is used in multiple places and different parameters and buffers are passed to this function with different tensor addresses
- Some user code changes tensor addresses of parameters/buffers. See [internal example]( https://www.internalfb.com/mlhub/pipelines/runs/mast/sw-935450288-OfflineTraining_08ba1cf0?job_attempt=1&version=0&env=PRODUCTION)
- Compiled Autograd may also pass parameters/buffers with different tensor addresses across runs.
Previous PR [#126822](https://github.com/pytorch/pytorch/pull/126822) (by @mlazos) allows detecting static tensor address changes at runtime and re-recording a cudagraph if that happens. However, if the same function is re-recorded too many times, it may introduce large overhead and hurt performance. This PR adds `torch._inductor.config.triton.cudagraph_max_recording` (=5) to fall back to eager if a function has been recorded more than `cudagraph_max_recording` times for a specific node in the CUDAGraph Trees.
A summary of how static tensor address changes are handled now:
- For each child node, check the assumption via `check_invariants`. If this holds, execute node with the assumption.
- If the assumption does not hold for all child nodes, re-record if the function_id has not been recorded too many times for the current_node.
- If the function_id has been re-recorded too many times, fall back to the eager function and emit a warning.
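For reference, a minimal sketch of adjusting this limit (the config name and default come from the description above):
```python
import torch

# Allow at most 3 re-recordings of a function per node before falling back to eager.
torch._inductor.config.triton.cudagraph_max_recording = 3
```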
Test Plan: CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129349
Approved by: https://github.com/eellison
Summary: On the autograd side of things, we are currently saving the kwinputs but we aren't doing anything with them on the profiler side. This diff enables the use of the kwinputs for both FunctionEvents and Chrome Traces.
Test Plan: Added unit testing for both chrome traces and FunctionEvents. Used RecordFunctionFast to test kwinputs since that test already passed kwargs but did not check them.
Differential Revision: D59472345
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130373
Approved by: https://github.com/davidberard98
Fixes #125224
For large ranges, calls to CUDA `randint` use a different `unroll_factor` to generate random ints. This `unroll_factor` was not considered correctly in the calculation of the Philox offsets. Thus, some of the random states were reused, resulting in lower entropy (see #125224).
This also affects multiple other random functions, such as `torch.rand` and `torch.randn`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/126066
Approved by: https://github.com/eqy, https://github.com/lezcano
Preventing match across mutations should always be the safe thing to do. This will be especially important for Traceable FSDP2 because in that case we do have mutation ops (`.set_` and `.resize_(0)`) in the middle of the graph for both joint-graph and post-grad graph, so making sure the pattern matcher passes work well with middle-of-graph mutation ops is important.
Q: Why can't we move these mutation ops to the end of graph, to make pass writing easier?
A: We attempted to do that in https://github.com/pytorch/pytorch/pull/129852, but the custom FX passes (in `torch/_functorch/_aot_autograd/fx_passes.py`) for the re-functionalization is complicated to maintain, and the changes to partitioner (in `torch/_functorch/partitioners.py`) also feels hacky. Hence we want to preserve these mutation ops in the middle of graph to avoid the complexity.
Test commands:
- `pytest -rA test/inductor/test_pattern_matcher.py::TestPatternMatcher::test_uint4x2_mixed_mm`
- `pytest -rA test/inductor/test_pattern_matcher.py::TestPatternMatcher::test_serialized_patterns_up_to_date`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130584
Approved by: https://github.com/jansel
# Flex Decoding
tl;dr This PR adds the `flex_decoding` kernel to the higher-order op `flex_attention` as a backend for multi-head attention decoding.
Higher-order op `flex_attention` was introduced in https://github.com/pytorch/pytorch/pull/121845 to accept a user-defined score modification callable (`score_mod`) and, through `torch.compile`, create an efficient fused flash attention kernel instantiation. The `flex_attention` kernel is efficient for long-query (>512 tokens) attention. This PR introduces the `flex_decoding` kernel as an alternative backend for the `flex_attention` HOP to handle LLM inference, where short queries (<32 tokens) attend to long key/value sequences.
### Details
LLM decoding iteratively attends each newly generated token (query length = 1) to a long key/value context (up to 132k). The `flex_attention` kernel only parallelizes attention along the query length (M), batch size (B), and number of heads (H) dimensions. LLM decoding lacks enough parallelism in the M dimension to fill up all SMs on modern GPUs.
`flex_decoding` adds parallelization along the key/value sequence length (N). The key/value cache of a single head is split into multiple blocks and the query tokens attend to them in parallel. The results for the same head are then reduced across KV blocks to generate a global output.
## Examples
Consider a Grouped Query Attention (GQA) decoding case, where a query token with 16 query heads (Hq) attends to 2 kv heads (Hkv). Assume a batch size of 2 (B=2) and a kv cache length of 4096 (N=4096). The attention kernel iteratively attends to the newly generated query token (Mq = 1).
We transform this problem into a Multiheaded Attention (MHA) problem by assuming a query length equal to the number of query heads per kv head, i.e. M=Hq//Hkv.
The inputs to the `flex_attention` HOP are thus a query of shape (B=2, H=Hkv=2, M=Hq//Hkv=8, D=64) and key/value of shape (B=2, H=Hkv=2, N=4096, D=64), which leads to an intermediate attention score matrix of shape (2, 2, 8, 4096) and an output of shape (2, 2, 8, 64).
```Python
import torch
from torch.nn.attention._flex_attention import _flex_attention as flex_attention
torch.manual_seed(0)
# Lets create some input tensors
# query of shape (B, Hkv, Hq//Hkv, D)
# key/value of shape (B, Hkv, N, D)
query = torch.randn(2, 2, 8, 64, device="cuda", dtype=torch.float32)
key = torch.randn(2, 2, 4096, 64, device="cuda", dtype=torch.float32)
value = torch.randn(2, 2, 4096, 64, device="cuda", dtype=torch.float32)
# Lets create a new score_modification checkerboard.
def checkerboard(score, batch, head, token_q, token_kv):
    score = torch.where(torch.abs(token_kv - token_q) == 1, score * 0.5, score)
    score = torch.where(torch.abs(token_kv - token_q) == 2, score * 2.0, score)
    return score
# Lets call flex_attention with this new score modification for decoding.
# The flex_attention HOP will choose flex_decoding as its backend since our query length (M) is only 8.
output = flex_attention(query, key, value, score_mod=checkerboard)
compiled_flex_attention = torch.compile(flex_attention)
out_compiled = compiled_flex_attention(query, key, value, score_mod=checkerboard)
torch.testing.assert_close(output, out_compiled, atol=2e-2, rtol=2e-2)
```
## Future Plans
- This PR does not implement a load mask for the `score_mod` function. This means that if the `score_mod` function takes a captured buffer along the M dimension, the buffer must be padded to a query length of 16, or to the next power of two of the query length if q_len > 16, i.e.:
```python
q_scale = torch.randn(Hq // Hkv, device="cuda")
q_scale = torch.nn.functional.pad(q_scale, (0, 16 - Hq // Hkv))  # pad captured buffer
def bias_mod(score, batch, head, q, kv):
    score = score + q_scale[q]
    return score
```
- The backward path for short queries (<128 tokens) currently does not work because the `flex_attention_backward` kernel lacks mask support and only accepts query lengths that are a multiple of 128.
- Dynamic shapes and max-autotuning are currently not working.
- Add block sparse mask support (#129216 is a draft for flex_attention kernel)
- Add explicit GQA support. (#130076 is a draft for GQA support on flex_attention kernel)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129415
Approved by: https://github.com/Chillee
beartype has served us well in identifying type errors and ensuring we call internal functions with the correct arguments (thanks!). However, the value of having beartype is diminished because of the following:
1. When beartype improved its support for Dict[] type checking, it discovered previously uncaught typing mistakes in some functions. This caused the exporter to fail with newer versions of beartype where it used to succeed. Since we cannot fix PyTorch and release a new version just because of this, it creates confusion for users of torch.onnx who happen to have beartype in their environment.
2. beartype adds an additional call line in the traceback, which makes the already thick dynamo stack even larger, affecting readability when users diagnose errors with the traceback.
3. Since the typing annotations need to be evaluated, we cannot use new syntaxes like `|` because we need to maintain compatibility with Python 3.8. We don't want to wait for PyTorch to adopt py310 as the lowest supported Python before using the new typing syntaxes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130484
Approved by: https://github.com/titaiwangms
Since the raise_comms and sink_waits passes are also scheduling-based, we can now implement reorder_compute_for_overlap as an optional step in the same pass. Merging them into the same pass greatly simplifies the logic and makes it easier to reason about the synergy between different passes.
- The unit tests are now fixed and re-enabled.
- Verified that the pass produces good scheduling w/ Llama3 70B in torchtitan (the scheduling was sub-optimal before this PR).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130573
Approved by: https://github.com/Chillee
ghstack dependencies: #129980
This involved beefing up the Python dispatcher to handle torch_dispatch.
Given a HOP and a torch_dispatch Tensor subclass:
- the HOP will show up in the subclass's `__torch_dispatch__`
- you can also use HOP.py_impl to register a rule for the HOP x
subclass interaction
- (coming soon) we'll offer a way to open register HOP x subclass
interaction without needing to touch the subclass's
`__torch_dispatch__` or the HOP's .py_impl.
Test Plan:
- new tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130606
Approved by: https://github.com/ydwu4
Fixes#129403
Create a separate printing function to debug SymNode, since we can't easily change the `__repr__` that is used by GraphModule.recompile() to create a pythonic version of a graph.
This is my first contribution, please let me know if there is anything that I should look into in further details
Thank you for you guidance! 🙏 I hope to contribute more in the future!
@aorenste
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129925
Approved by: https://github.com/aorenste
Reland https://github.com/pytorch/pytorch/pull/128709.
When the input predicate is a python constant, we specialize into one of the branches and warn users that torch.cond is not preserving the dynamism. The previous behavior was that we baked True/False into the cond operator. This can be confusing. In this PR, we change it to specialize into one of the branches when the inputs are constants.
We additionally change the naming of cond operator to default one without overriding its name. This allows better testing on de-serialized graph.
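A minimal sketch of the new behavior (module and shapes are illustrative):
```python
import torch

class M(torch.nn.Module):
    def forward(self, x):
        pred = x.shape[0] == 4  # static shape -> a plain Python bool, not a SymBool
        return torch.cond(pred, lambda t: t.sin(), lambda t: t.cos(), (x,))

# With a constant predicate, export now warns and specializes to the sin() branch
# instead of emitting a cond op with True/False baked in.
ep = torch.export.export(M(), (torch.randn(4, 3),))
```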
Test Plan:
The predicate in some existing tests is the result of a shape comparison. When no dynamic shape is involved, the predicate is a python bool. To fix them, we either change the predicate to be some data-dependent tensor or change the test to check that cond is specialized as one of the branches.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130493
Approved by: https://github.com/BoyuanFeng
Sometimes it can be difficult to write a fake class, e.g. when the original implementation uses a third-party library, or users are certain that the class is safe to trace with the real object.
This PR allows user to specify their intention by implementing a "safe_to_trace_with_real_obj" method on their script class.
Test Plan:
`pytest test/export/test_torchbind.py -k safe`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129586
Approved by: https://github.com/zou3519
Construct frame localsplus in 3.12+ using our own simplified way rather than copypasting from CPython.
This is necessary for 3.13 since we can no longer generate frame `f_locals` before executing the interpreter frame.
We also enable this for 3.12 since the `f_locals` construction between 3.12 and 3.13 is the same, so we can test for correctness with 3.12.
This is also one of the first steps to completing https://github.com/pytorch/pytorch/issues/93753 - we will implement simplified f_locals generation of previous Python versions in the future.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129185
Approved by: https://github.com/jansel
Summary:
This PR changes two implementations to make the CP (CP8) loss curve be on par with TP (TP8).
1. Making key and value contiguous before doing ring attention. It is unclear why this is a requirement as SDPA does not have this requirement.
2. Use the out, grad_out, softmax_lse passed by autograd to do the backward. This implementation is similar to the implementation in Transformer Engine. The original implementation reruns the SDPA to get the output and logsumexp and uses the recalculated results to infer the corrected softmax_lse. But that implementation does not give better accuracy or a better loss curve; instead, it converges more slowly.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129515
Approved by: https://github.com/d4l3k, https://github.com/wanchaol
ghstack dependencies: #129512, #129514
Fixes the failure in `test/export/test_export_training_ir_to_run_decomp.py` caused by dead code elimination removing a node with side effects.
For background, in export, we may want to export higher-level IRs that are not functional, so we need to check for side effects more carefully.
A call_function node is impure if it has at least one mutable argument.
Fixed the tests below:
test_to_module_with_mutated_buffer_multiple_update_sub_later
test_export_input_mutation_static_shape
test_buffer_util
Another attempt modifying the original DCE pass is made in PR #130395, but it breaks some other tests, so here we add a flag and use it for export only.
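A rough sketch of the purity check this implies (the helper name and the exact schema walk are illustrative, not the code added by this PR):
```python
import torch

def _node_mutates_args(node: torch.fx.Node) -> bool:
    """Treat a call_function node as having side effects if its target's
    schema marks any argument as written to (i.e. the op mutates an input)."""
    if node.op != "call_function" or not hasattr(node.target, "_schema"):
        return False
    return any(
        arg.alias_info is not None and arg.alias_info.is_write
        for arg in node.target._schema.arguments
    )
```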
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130552
Approved by: https://github.com/pianpwk
Work around a bug in `reductionAndWithTensor:` that kills the app with the following assert if a 5+D tensor is passed as input
```
Assertion failed: (0 <= mpsAxis && mpsAxis < 4 && "Runtime canonicalization must simplify reduction axes to minor 4 dimensions."), function encodeNDArrayOp, file GPUReductionOps.mm, line 76.
```
by reshaping the tensor to a 2D/3D one before running the reduction.
Refactored common code into `all_any_common_impl_mps` as both `reductionOrWithTensor:` and `reductionAndWithTensor:` suffer from the same issue.
Enabled `test_reduction_ops_5D` and added a regression test to it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130542
Approved by: https://github.com/Skylion007, https://github.com/albanD
ghstack dependencies: #130541
Summary: If we attempt to precompile sets of different choices (e.g. Triton vs Cutlass) that have the same key, the cached pool of futures doesn't work, since it only includes the first set of configs. Add the config's hashes to the key to avoid this problem.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130350
Approved by: https://github.com/eellison
We add torch.library.Library._register_torch_dispatch_rule. Here, a user
can provide us a specific rule to run for a specific
(torch_dispatch_class, operator) pair. The motivation is that a user
might want to extend a subclass/mode but may not have access to the
source code of the subclass/mode.
I'll make this public in a follow-up PR if we think the approach and API
is good.
Keep in mind that many subclasses will likely deliver their own open
registration solution (DTensor has register_sharding_prop_rule and NJT
has register_jagged_op); _register_torch_dispatch_rule is meant as a
catch-all open registration mechanism for when the subclass hasn't
provided anything more specific.
Test Plan:
- new tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130064
Approved by: https://github.com/albanD
Summary:
Integrate scope tracking with `checkpoint/fb/logging_handlers.py`.
Add a map of uuid -> tracker context manager. When the logging handler sees the following events:
* `start`: create scope_tracker object, call `__enter__`, add to map with uuid
* `end`: retrieve scope_tracker object by uuid, call `__exit__`.
* `exception`: retrieve scope_tracker object by uuid, call `__exit__` with current exception info.
Test Plan:
Test with bento notebook (attached).
with a runtime_error in finish_checkpoint method.
scuba records:
https://fburl.com/scuba/workflow_signpost/ddttgmv2
Differential Revision: D56654417
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130452
Approved by: https://github.com/LucasLLC
Summary: `quantization_tag` is first-class metadata in quantization flows and is preserved by them. As we want to store quantized exported graphs, we also need to preserve this metadata since it's used in later flows. Only JSON-supported metadata is allowed to be serialized.
Test Plan: Added test case
Differential Revision: D57939282
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127473
Approved by: https://github.com/angelayi
Extend constant folding to nodes with dynamic shapes, supporting only pointwise ops and some other restricted ops.
We support dynamic shapes by limiting constant folding of ops that are guaranteed to have uniform values (full, pointwise ops, and views) and running these operators with tensors of shape 1. This also eliminates the possibility of memory overhead of constant folding.
Taken over from https://github.com/pytorch/pytorch/pull/128937
joint work with @imzhuhl
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129686
Approved by: https://github.com/Chillee
ghstack dependencies: #130367
Summary:
full context in D59385876
Based on the offline discussion with PT2 folks, we switched to changing the SDPA impl to mitigate the AOTI lowering issue.
Test Plan: PYTORCH_TEST_FBCODE=1 buck2 run mode/opt -c=python.package_style=inplace -c fbcode.enable_gpu_sections=true -c fbcode.platform=platform010 -c fbcode.split-dwarf=true caffe2/test/inductor:test_inductor -- -r test_sdpa_inference_mode_aot_compile
Differential Revision: D59495634
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130281
Approved by: https://github.com/drisspg, https://github.com/zou3519, https://github.com/Skylion007, https://github.com/justinchuby
When running some test cases on the privateuse1 device, the device suffix in a test name is sometimes 'privateuse1'.
For example, a test name should be 'test_Dropout1d_npu', but it sometimes shows up as 'test_Dropout1d_privateuse1'.
This happens because when setUpClass() doesn't set the suffix, it falls back to "privateuse1".
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130091
Approved by: https://github.com/zou3519
The tests for `raise_comms` and `sink_waits` passes were not enabled in CI. The passes are now broken due to functional collective v2 and possibly other changes.
Correctness issues:
- The original passes did not take mutation into consideration and may yield semantically different scheduling order. This may be due to the recent changes to how mutations are expressed in Inductor IR (e.g., MutationOutput).
Effectiveness issues:
- The original passes only moved the comm/wait nodes themselves. However, comm nodes can come with prologues (e.g., clone for all_reduce_, split-cat for non-zero dim all-gather). Whenever there are any prologues, the comms won't be raised at all.
- The prologues are often horizontally fused with other pointwise nodes. This can severely delay the scheduling of the comm node.
This PR:
- Make the passes handle mutation correctly.
- Instead of moving individual comm/wait nodes, schedule all node using a scored method. This way the comm nodes can be optimally raised even in the presence of prologues.
- The horizontal fusion of prologues often severely delays the scheduling of the comm node. Horizontally fusing this clone can almost never outperform scheduling the comm node earlier. Also, in most cases this clone is eliminated via in-place reuse. Therefore, we tell the scheduler not to fuse it.
- Enable the tests in CI.
Co-authored-by: Will Feng <yf225@cornell.edu>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129980
Approved by: https://github.com/yf225
As discussed with @mlazos and @Chillee in the Inductor group chat, we need the concept of `GroupedSchedulerNode` to be able to express nodes that must be scheduled together one-after-another (i.e. no other node is allowed to fuse into them or schedule in-between them).
This is particularly important for comm reordering and fine-grained control of peak memory. For Traceable FSDP2, there are two very important requirements:
- At any time, there must be only one AllGather in flight. However, our existing comm reordering pass will naturally raise **all** of AllGather ops to the beginning of the graph, which will clearly blow up memory usage. Instead, we leverage GroupedScheduleNode which provides simple connection points to build the "chaining" on. i.e. we use it to express the schedule `(copyin + AllGather1) -> (AllGather1Wait+copyout) -> (copyin + AllGather2) -> (AllGather2Wait+copyout) ...` by setting up fake dep between the GroupedScheduleNode, which is a very clean and easy-to-understand way to express this schedule.
- The "comms" in FSDP2 are not just comms, but a combination of compute and comm. We must prevent other nodes from being scheduled in-between that set of nodes, otherwise we are artificially delaying the release of comm buffer memory which makes the peak memory usage quite bad. This is particularly pronounced for `AllGatherWait+copyout`.
From these two requirements, we derive the behavior of `GroupedSchedulerNode`: it contains nodes that must be scheduled together one-after-another (i.e. no other node is allowed to fuse into them or schedule in-between them).
----
Q: Can we leverage `ir.Subgraph`?
A: I looked into the possibility of using `ir.Subgraph` to implement this, but realized that:
1. `ir.Subgraph` requires defining the subgraph in FX IR.
2. There is no guarantee that the Inductor IR nodes that we want to group together will all have a corresponding FX IR node, because some of those Inductor IR nodes can potentially be dynamically generated by a custom pass in the scheduler (e.g. for merging multiple all-gathers into one big all-gather, and later we want to group that big all-gather with some other op). Dynamically generated Inductor IR node doesn't have a corresponding upstream FX IR node.
3. For the above reasons, we can't use the `ir.Subgraph`, and need to define a new (and more lightweight) concept of `GroupedSchedulerNode` to achieve the behavior we need (this PR).
----
Test commands:
- `pytest -rA test/distributed/test_compute_comm_reordering.py::TestComputeCommReorderingMultiProc::test_grouped_scheduler_node`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128568
Approved by: https://github.com/eellison, https://github.com/mlazos
Summary: Users have been confused about why user annotations on GPU tracks do not show up when doing GPU-only tracing. This comment should help users understand that to use this function they need to have CPU activities enabled.
Test Plan: N/A it is just updating a comment
Differential Revision: D59649390
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130561
Approved by: https://github.com/aaronenyeshi
Before this PR, custom ops that don't return outputs would get eliminated after calling `.module()` because the effect token that keeps the operator alive is removed in the remove_effect_token pass. We want to remove the effect token because we don't want it to be part of the inputs. However, this causes the DCE calls in remove_effect_token itself and in unlift to remove the custom op from the graph, leading to an error in the exported graph.
This PR calls has_side_effect in with_effect to make sure graph.eliminate_dead_code doesn't remove the calls by accident.
Test Plan:
Add a new test pytest test/export/test_torchbind.py -k test_export_inplace_custom_op
Differential Revision: [D59498728](https://our.internmc.facebook.com/intern/diff/D59498728)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129680
Approved by: https://github.com/angelayi
Take the intersection of the tags of the corresponding aten op overloads. Previously, some rng ops not having tags caused issues with constant folding (they should get decomposed, but that's a separate issue).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130367
Approved by: https://github.com/ezyang
Summary: This diff updates the ExportedProgram class in PyTorch to allow for multiple verifiers to be attached to it. This is done by adding a new field to the ExportedProgram schema called "verifiers" which is a list of strings representing the names of the verifiers to be attached to the program. The verifiers are loaded using the "load_verifier" function which is defined in the "torch._export.serde.serialize" module. The "exported_program.dialect" field is also deprecated in favor of the "verifiers" field.
Test Plan: CI
Differential Revision: D59408546
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130364
Approved by: https://github.com/angelayi, https://github.com/ydwu4
Fixes#129865
Currently, new_group will call ncclCommSplit in some cases. In theory, ncclCommSplit brings performance and memory benefits. However, the config parameter of the ncclCommSplit function in pytorch does not set "splitShare=1", which means the ncclCommSplit optimization is effectively turned off and the benefits are lost.
This PR turns on splitShare=1 to make the comm_split optimization effective.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129929
Approved by: https://github.com/shuqiangzhang
Summary: Previously, the remove_effect_tokens pass didn't pass kwargs to the internal nodes. This PR fixes that and adds a test for it.
Test Plan: buck2 run caffe2/test:test_export -- -r test_remove_effect_token_kwargs
Reviewed By: angelayi
Differential Revision: D59603147
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130491
Approved by: https://github.com/angelayi
As a followup to https://github.com/pytorch/pytorch/pull/130454, users are hitting the cross-mesh operation error because the DeviceMesh thread ID differs between the saved and the loaded DTensor.
This is a hot fix to only consider the real thread_id in the DeviceMesh hash under the threaded backend, and set it to None in all other cases.
As a follow up, we need to look at the following test failures to better root cause specific DeviceMesh related failures related to MTPG, if thread_id is not included as part of the hash.
```
test/distributed/_composable/fsdp/test_fully_shard_training.py::TestFullyShardRegisteredParams::test_param_registration_after_forward
test/distributed/_tensor/test_dtensor_ops.py::TestDTensorOpsCPU::test_dtensor_op_db_column_stack_cpu_float32
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130495
Approved by: https://github.com/awgu, https://github.com/wanchaol
This PR fixes profiler/test_profiler.py::TestProfiler::test_oom_tracing.
The test expects an OOM by allocating a huge tensor, but MI300X has enough memory to allocate such a tensor.
This PR increases the tensor size by a large margin to force an OutOfMemory exception on MI300X and future GPU generations.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130334
Approved by: https://github.com/jeffdaily, https://github.com/janeyx99
And move `using namespace mps` outside of every function, as there is no need to repeat it.
Use `getTensorsStringKey` instead of explicit
`getMPSShapeString(getMPSShape(t)) + getMPSDataTypeString(t)`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130541
Approved by: https://github.com/Skylion007
Fixes https://github.com/pytorch/pytorch/issues/130456
When we mark_unbacked a size, we actually DO have a hint for it
(because we have a real input tensor), and previously we were
accidentally putting it into the hint field of the SymNode. If the marked
unbacked size is zero or one, this can lead to an inconsistency between
hint compute and static evaluation compute under size-oblivious guards,
since that's the whole point of size obliviousness. The answer is to scrub out
hints on mark-unbacked ints.
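For illustration only (this assumes the `torch._dynamo.mark_unbacked` helper; the snippet is a sketch, not code from this PR):
```python
import torch

@torch.compile(dynamic=True)
def f(x):
    return x * 2

x = torch.randn(1, 8)              # the real input provides a hint (here: 1) for dim 0
torch._dynamo.mark_unbacked(x, 0)  # dim 0 is treated as unbacked; its hint must not leak into the SymNode
f(x)
```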
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130483
Approved by: https://github.com/lezcano
This PR makes it so that all tensors are reduced to their metadata in AOTAutogradCache. Because dynamo always embeds constant tensors into the FXgraph directly, there's no risk of a constant tensor whose values are semantically important being lost here. AOTAutograd itself may take a constant tensor and set it as an attribute on an FXGraph for inductor, but Dynamo never does this.
One other thing that this diff does is add `[pickler.fast](https://docs.python.org/3/library/pickle.html#pickle.Pickler.fast)` to our pickling algorithm for cache key generation. Pickle will often memoize/intern strings when pickling, leading to false cache misses due to inconsistent memoization. Turning on pickler.fast removes this behavior.
Technically `fast` is a "deprecated" feature according to python docs. But it's still supported in py3.8-3.12, and if it ever is removed, the only downside will just be a few more cache misses, so I think it's worth just adding here (and removing later as needed)
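A minimal sketch of the idea (not the actual cache-key code): with `pickler.fast` the memo is disabled, so equal inputs serialize to the same bytes and hash to the same key.
```python
import hashlib
import io
import pickle

def stable_key(obj) -> str:
    buf = io.BytesIO()
    p = pickle.Pickler(buf, protocol=4)
    p.fast = True  # disable memoization so the byte stream is deterministic
    p.dump(obj)
    return hashlib.sha256(buf.getvalue()).hexdigest()
```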
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128583
Approved by: https://github.com/oulgen
ghstack dependencies: #128335
This PR is to update the input `weight` of `_convert_weight_to_int4pack` from `[n][k] int32` to `[n][k / 2] uint8`, both for CPU, CUDA and MPS, which can help decouple int4 model checkpoint with different ISAs and different platforms in `gpt-fast`. The advantage is int4 model checkpoint can be shared in different test machines, without re-generating in one certain platform. Meanwhile, the size of input `weight` can be reduced to `1 / 8`.
Before this PR, packed weight stored in CUDA specific layout: `[n/8][k/(InnerKTiles*16)][32][InnerKTiles/2]`, dtype int32, where InnerKTiles = 2, 4, 8. CPU packed weight viewed as the SAME shape but stored in different layout: `[n/64][k][32]`, dtype uint8. Weight is strongly coupled with platforms (CPU/CUDA) and ISAs (AVX512/AVX2/scalar). And users cannot use a generated weight in another different ISA or platform, because when loading weight into devices, the compute format is different.

Now, we use a common serialized layout (`[n][k/2] uint8`) for different devices and ISAs as the input `weight` of `_convert_weight_to_int4pack`, and each backend chooses how to interpret it as its compute layout.

### Performance
Intel (R) Xeon (R) CPU Max 9480, single socket (56 cores)
There is no obvious regression of this PR.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/129940
Approved by: https://github.com/jgong5, https://github.com/lezcano, https://github.com/mingfeima
In particular, when creating the PyTorch wheel, we use setuptools find_packages 551b3c6dca/setup.py (L1055) which explicitly skips packages without `__init__.py` files (namespace packages) https://setuptools.pypa.io/en/latest/userguide/package_discovery.html#finding-simple-packages.
So this PR is reverting the change to stop skipping these namespace packages as, even though they are in the codebase, they are not in the published binaries and so we're ok relaxing the public API and importability rules for them.
A manual diff of the two traversal methods:
```
torch._inductor.kernel.bmm
torch._inductor.kernel.conv
torch._inductor.kernel.flex_attention
torch._inductor.kernel.mm
torch._inductor.kernel.mm_common
torch._inductor.kernel.mm_plus_mm
torch._inductor.kernel.unpack_mixed_mm
torch._strobelight.examples.cli_function_profiler_example
torch._strobelight.examples.compile_time_profile_example
torch.ao.pruning._experimental.data_sparsifier.benchmarks.dlrm_utils
torch.ao.pruning._experimental.data_sparsifier.benchmarks.evaluate_disk_savings
torch.ao.pruning._experimental.data_sparsifier.benchmarks.evaluate_forward_time
torch.ao.pruning._experimental.data_sparsifier.benchmarks.evaluate_model_metrics
torch.ao.pruning._experimental.data_sparsifier.lightning.tests.test_callbacks
torch.ao.quantization.experimental.APoT_tensor
torch.ao.quantization.experimental.adaround_fake_quantize
torch.ao.quantization.experimental.adaround_loss
torch.ao.quantization.experimental.adaround_optimization
torch.ao.quantization.experimental.apot_utils
torch.ao.quantization.experimental.fake_quantize
torch.ao.quantization.experimental.fake_quantize_function
torch.ao.quantization.experimental.linear
torch.ao.quantization.experimental.observer
torch.ao.quantization.experimental.qconfig
torch.ao.quantization.experimental.quantizer
torch.csrc.jit.tensorexpr.codegen_external
torch.csrc.jit.tensorexpr.scripts.bisect
torch.csrc.lazy.test_mnist
torch.distributed._tensor.examples.checkpoint_example
torch.distributed._tensor.examples.comm_mode_features_example
torch.distributed._tensor.examples.comm_mode_features_example_argparser
torch.distributed._tensor.examples.convnext_example
torch.distributed._tensor.examples.torchrec_sharding_example
torch.distributed._tensor.examples.visualize_sharding_example
torch.distributed.benchmarks.benchmark_ddp_rpc
torch.distributed.checkpoint.examples.async_checkpointing_example
torch.distributed.checkpoint.examples.fsdp_checkpoint_example
torch.distributed.checkpoint.examples.stateful_example
torch.distributed.examples.memory_tracker_example
torch.fx.experimental.shape_inference.infer_shape
torch.fx.experimental.shape_inference.infer_symbol_values
torch.include.fp16.avx
torch.include.fp16.avx2
torch.onnx._internal.fx.analysis.unsupported_nodes
torch.onnx._internal.fx.passes._utils
torch.onnx._internal.fx.passes.decomp
torch.onnx._internal.fx.passes.functionalization
torch.onnx._internal.fx.passes.modularization
torch.onnx._internal.fx.passes.readability
torch.onnx._internal.fx.passes.type_promotion
torch.onnx._internal.fx.passes.virtualization
torch.utils._strobelight.examples.cli_function_profiler_example
torch.utils.benchmark.examples.sparse.compare
torch.utils.benchmark.examples.sparse.fuzzer
torch.utils.benchmark.examples.sparse.op_benchmark
torch.utils.tensorboard._convert_np
torch.utils.tensorboard._embedding
torch.utils.tensorboard._onnx_graph
torch.utils.tensorboard._proto_graph
torch.utils.tensorboard._pytorch_graph
torch.utils.tensorboard._utils
torch.utils.tensorboard.summary
torch.utils.tensorboard.writer
```
These are all either namespace packages (which we want to remove) or packages that are not importable (and tagged as such in the test).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130497
Approved by: https://github.com/aorenste
In this PR, we abstracted the different types of aten operation parameters as `ParameterMetadata`. This structure is intended to represent and store the metadata of each aten operation parameter. Currently, it only supports `Tensor`, `TensorList`, and `Scalar`.
```C++
using ParameterMetadataValue = std::variant<TensorMetadata, std::vector<TensorMetadata>, c10::Scalar>;
```
With this PR, we can extend support for other parameter types in a more modular way, like `string`, `int`, and `double`.
Differential Revision: [D59399546](https://our.internmc.facebook.com/intern/diff/D59399546)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125308
Approved by: https://github.com/jgong5, https://github.com/jansel, https://github.com/atalman
This PR adds support to use expandable segments with private memory pools which should unblock using it with cuda graphs and cuda graph trees. Currently, the allocator silently avoids using expandable segments when allocating in a private pool due to checkpoint saving/restoring not meshing well with how we keep track of unmapped blocks.
The PR itself is pretty short, most of the logic for checkpointing and reapplying state for non-expandable segments transfers over without much work.
Expandable segments reserve a virtual address space of size equal to the amount of physical memory on the GPU. Every time we want to `malloc()` or `free()` memory in a memory pool with expandable segments turned on, we map/unmap pages of physical GPU memory under the hood to create a new block that we return to the caller. This is beneficial due to the fact that each memory pool functions as a single segment of memory with a contiguous block of memory addresses that can grow and shrink as needed, avoiding fragmentation from allocating multiple non-contiguous segments that may not be merged together.
The caching allocator handles this by creating an unmapped block for the entire reserved virtual address space at init, which is treated similarly to an unallocated block in a free pool. When callers call `malloc()`, it's split and mapped to create allocated blocks, and calling `free()` similarly caches and merges free blocks in a free pool to be used later. Expandable blocks are unmapped and returned back to Cuda when they are cleaned up, or when we hit an OOM and the allocator attempts to remap cached free blocks. The code paths to map, free, and unmap blocks in expandable segments is similar to that for normal blocks and does all the same work of updating stats on memory usage, moving blocks between active and free pools, and returning memory to Cuda.
With Cuda Graph Trees and private memory pools, we need the ability to take checkpoints of the current state of the memory allocator after each graph capture as well as reapplying the state before capturing a new graph after replaying a captured graph so that the new cuda graph capture has access to the state of the allocator at the point after replaying a previously captured graph so it can reuse empty blocks and allocate new ones.
As mentioned in a below comment, memory in a private pool is cached until the private pool is destroyed and allocations can only grow from extra graph captures, any freeing of memory would result in invalid memory addresses and would break cuda graphs.
One implementation detail to note for unmapped blocks with expandable segments is that unmapped blocks are kept track of in a member variable `unmapped` of a `BlockPool`. `unmapped` is *not* part of the checkpointed state of the caching allocator and isn't restored when reapplying checkpoints; since we never free/unmap memory back to Cuda, it is persisted across graph captures/replays.
Checkpointing the current state of the memory allocator works as expected with expandable segments. Checkpointing grabs the first block of every segment in the active and free pools of the private pool and traverses the linked list of blocks in the segment to capture the state of every segment, which is then saved and kept for when it is needed to be reapplied. For expandable blocks, the last block in every segment will be an unallocated unmapped block containing the remaining amount of unmapped memory at graph capture time, and this too is saved in the checkpoint.
Reapplying the checkpoints works by freeing all allocated blocks and merging them into a single block per segment, then for each segment, we manually split and allocate all blocks from the checkpoint and then free the blocks marked as unallocated in the checkpoint state. For expandable segments, we need to make some modifications to not split unmapped blocks and avoid manually mapping then freeing unmapped blocks.
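For reference, expandable segments are enabled through the existing allocator setting; a minimal sketch (it must be set before the first CUDA allocation in the process):
```python
import os

# Enable expandable segments for the caching allocator.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch  # noqa: E402
```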
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128068
Approved by: https://github.com/zdevito, https://github.com/eqy
While playing with DCP for distributed inference, I found that we are still using deprecated DCP APIs even in unit tests. This PR switches to the new API with the unified lowercase "dcp" naming.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130475
Approved by: https://github.com/wz337
This is kind of a short-sighted workaround and we should actually come
up with a way to fix this in general, but I got annoyed that I can't use
-k to filter tests in test_schedule, and realized it's because we jam
tests using the new MultiProcContinuousTest fixture together with
old-style tests.
For now I separate the two types of tests so -k works again.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130294
Approved by: https://github.com/H-Huang
Summary:
* Added support for preserving it during deepcopy; we need to remap the args since `_numeric_debug_handle` refers to the nodes in the graph.
TODO: need to fully support re-export, currently the metadata for output node is not preserved
Test Plan:
python test/test_quantization.py -k test_deepcopy_preserve_handle
python test/test_quantization.py -k test_copy_preserve_handle
all related tests:
python test/test_quantization.py -k TestGenerateNumericDebugHandle
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129287
Approved by: https://github.com/zhxchen17
**Summary**
When checking the vectorization status across 3 test suites, we found some operators disabled vectorization with the message `Disabled vectorization: op: remainder`. In this PR, we add vectorization support for this op.
**Test Plan**
```
python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_vec_remainder
python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_int_div_vec
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129849
Approved by: https://github.com/jgong5, https://github.com/lezcano
ghstack dependencies: #130405
`max_norm=True` is currently written in the note, but `max_norm` can be a `float`, NOT a `bool` (as the [docstring](ec284d3a74/torch/nn/modules/sparse.py (L30)) says).
That note was created in #45595
The current pull request cleans it up.
The value `True` in the note can lead users to think `max_norm` is a boolean.
In fact, counter-intuitive behavior occurs if users try to set it to `False`:
it is interpreted as 0, so the embedding values become 0, which is not what users expect from setting it to `False`.
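A minimal illustration that `max_norm` is a float threshold:
```python
import torch

emb = torch.nn.Embedding(10, 4, max_norm=1.0)  # looked-up rows are renormalized to norm <= 1.0
```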
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129687
Approved by: https://github.com/mikaylagawarecki, https://github.com/malfet
Summary:
If triton is available, but we can't import triton.compiler.compiler.triton_key, then we see some annoying behavior:
1) If we don't actually need to compile triton, the subprocess pool will still spew error messages about the import failure; it's unclear to users if this is an actual problem.
2) If we do need to compile triton, we a) see the error messages from above and b) get a vanilla import exception without the helpful "RuntimeError: Cannot find a working triton installation ..."
Test Plan: Ran with and without torch.compile for a) recent version of triton, b) triton 2.2, and c) no triton. In all cases, verified expected output (success or meaningful error message)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130403
Approved by: https://github.com/eellison
original PR: https://github.com/pytorch/pytorch/pull/128599 (re-created after revert + poisoned diff train)
Summary:
This PR adds deduplication and CSE for runtime asserts. Existing size computation in the graph is CSE'd along with added runtime asserts, and redundant asserts are removed. Shape calls on intermediate tensors are also turned into compute on input sizes if possible, allowing intermediate tensors to be freed earlier. For example:
```
z = torch.cat([x, x], dim=0) # 2*s0
w = z.repeat(y.shape[0]) # 2*s0*s1
_w = w.shape[0]
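# becomes (the shape call on the intermediate w is rewritten in terms of input sizes):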
s0 = x.shape[0]
s1 = y.shape[0]
_w0 = 2 * s0
_w = _w0 * s1
```
Additionally, constrain_range calls are deduplicated. Single-symbol bound checks for unbacked symbols (e.g. u0 >= 0, u0 <= 5) and sym_constrain_range.default calls are also removed, since they accumulate range info in the ShapeEnv, and are replaced with two _assert_scalar.default calls that check the min/max bounds. For example:
```
torch.sym_constrain_range_for_size(n, min=2, max=16)
torch.sym_constrain_range(n, min=4, max=20)
torch._check(n >= 0)
torch._check(n >= 3)
torch._check(n <= 14)
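# becomes: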
torch.sym_constrain_range_for_size(n)
torch._check(n >= 4)
torch._check(n <= 14)
```
Test Plan:
contbuild & OSS CI, see 940e4477ab
Original Phabricator Test Plan:
Imported from GitHub, without a `Test Plan:` line.
Differential Revision: D59543603
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130380
Approved by: https://github.com/izaitsevfb
- Fix C++20 forward compatibility warnings, namely
```
warning: use of function template name with no prior declaration in function call with explicit template arguments is a C++20 extension [-Wc++20-extensions]
multi_tensor_apply_for_fused_optimizer<2, 512>(kernel_name,
```
- Use nested namespaces
- Do not explicitly specify `at::` namespace for functions already implemented inside of that namespace
- Use more convenience methods (rather than call by hand)
- Use C++14 `return f();` for void functions
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130453
Approved by: https://github.com/Skylion007
Reland of: https://github.com/pytorch/pytorch/pull/128016
Summary from previous PR:
We assume only two possible mutually exclusive scenarios:
1. Running the compiled region for training (any of the inputs has requires_grad). Produced differentiable outputs should have requires_grad.
2. Running the compiled region for inference (none of the inputs has requires_grad). All outputs do not have requires_grad.
Even if the user runs the region under no_grad() but has an input Tensor with requires_grad, we go with training scenario (1).
With the current state that means:
1/ needs_autograd should not check torch.is_grad_enabled(), only whether any of the inputs requires_grad
2/ if needs_autograd => trace_joint (we are in training scenario 1) => always run the compiled region under enable_grad()
Changes in the partitioner?
Inference and training graphs differed in their return container (list vs. tuple).
The changes in the partitioner unify this and always return a tuple.
As a result, there are some changes in test_aotdispatch.py for graph contents: list -> tuple.
Why was this reverted?
There was a regression of hf_Reformer model on inference.
```
TORCHINDUCTOR_FX_GRAPH_CACHE=0 python benchmarks/dynamo/torchbench.py --performance --inference --bfloat16 --backend inductor --device cuda --only hf_Reformer --cold-start-latency --use-eval-mode
```
Because one of the compiled graphs contained outputs, which are aliases to the inputs that are nn.Parameter(requires_grad=True).
Even though the torchbench inference benchmarks run inside `torch.no_grad()`, alias ops (specifically for hf_Reformer, expand) preserve requires_grad.
As a result, we started compiling a training graph instead of an inference graph.
Fix for view ops:
If we have outputs that are aliases of inputs that require grad, those outputs requiring grad is not a reason to generate a training graph.
This is handled in aot_autograd.py, where output_and_mutation_safe is calculated.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128890
Approved by: https://github.com/bdhirsh
Summary:
Currently we have an issue where CPU User annotations can overlap with python events in the event that a python event calls step() within the function itself. To combat this, we can move the left side of the user annotation to the beginning of the parent python function. We do this because when instantiating the profiler we already start on step 0.
To implement this, we start by collecting all instances of ProfilerStep during post processing. Since TorchOps and Python events are sorted already, we can easily check if the current python event partially overlaps with the current ProfilerStep and, if so, alter the start time of the current ProfilerStep. We then move to the next ProfilerStep and continue iterating through all the python events. This keeps the time complexity of adding events to 'out' at O(s + n) -> O(n) post sorting, where "s" is the number of ProfilerSteps and "n" is the length of all events.
Test Plan:
Added unit test in which step() is called midway through a function. Afterwards, we print out a trace and then load the json to check that there are no overlaps. Also make sure that there is no regression in performance.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129599
Approved by: https://github.com/aaronenyeshi
When the input predicate is a python constant, we specialize into one of the branches and warn users that torch.cond is not preserving the dynamism. The previous behavior was that we baked True/False into the cond operator. This can be confusing. In this PR, we change it to specialize into one of the branches when the inputs are constants.
We additionally change the naming of cond operator to default one without overriding its name. This allows better testing on de-serialized graph.
Test Plan:
The predicate in some existing tests is the result of a shape comparison. When no dynamic shape is involved, the predicate is a python bool. To fix them, we either change the predicate to be some data-dependent tensor or change the test to check that cond is specialized as one of the branches.
Differential Revision: [D59589709](https://our.internmc.facebook.com/intern/diff/D59589709)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128709
Approved by: https://github.com/zou3519
This fixes an MSVC build regression introduced by https://github.com/pytorch/pytorch/pull/129710, as VC++ fails to handle the nested defines in that specific order and fails with
```
C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\aten\src\ATen\native\cuda\int4mm.cu(984): error: "#" not expected here
do { const cudaError_t __err = cudaFuncGetAttributes( &funcAttr, #if defined(USE_ROCM) (void *)func #else func #endif ); c10::cuda::c10_cuda_check_implementation( static_cast<int32_t>(__err), "C:\\actions-runner\\_work\\pytorch\\pytorch\\builder\\windows\\pytorch\\aten\\src\\ATen\\native\\cuda\\int4mm.cu", __func__, static_cast<uint32_t>(991), true); } while (0);
```
Fixes https://github.com/pytorch/pytorch/issues/130437
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130441
Approved by: https://github.com/Skylion007, https://github.com/malfet
Fixes the example in #118304 for `torch._functorch.aot_autograd.aot_export_module` and `torch.export.export`.
On a high level, the issue is caused by not detecting fake_mode when there's no input.
Change plan:
1) we add a `dynamic_shapes: Union[bool, None] = None` arg to `aot_export_module` and `_aot_export_function`.
2) if the input is not a graph module, then we can only rely on this `dynamic_shapes` input arg.
3) If the input is a graph module, then we can traverse the graph and check.
4) So we check if the input mod is a graph module or just a module, and do 2) or 3) depending on the type.
Fixes #129927
Bug source: dynamo's fake_mode is not detected correctly in `_convert_input_to_fake` in `_traced.py` when there's no input to the graph. So in `_strict_export_lower_to_aten_ir`, we create another fake_mode. `dynamo_fake_mode` is not the same as the fake_mode used by dynamo.
Change plan:
check `gm_torch_level` graph's node meta "example_value" for fake mode in addition.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129928
Approved by: https://github.com/angelayi
The needle has moved quite a bit on the ROCm backend front. This PR examines the tests referenced in the following issue: https://github.com/pytorch/pytorch/issues/96560
This is a follow-up PR to https://github.com/pytorch/pytorch/pull/125069,
unskipping the next batch of tests referenced by the aforementioned issue. No explicit changes were needed in the source, as the tests worked immediately after unskipping.
The tests previously marked with xfail have now been modified to not expect a failure iff running on ROCm as they now pass. Behavior is unchanged for them on other architectures.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/127966
Approved by: https://github.com/malfet
`aten._to_copy` can receive a python number as input. This occurs in
torch.compile support for vmap (see #130188). Previously, this would
raise an assertion error. This PR changes it so that if we see a python
number, we call torch.scalar_tensor on it first (h/t @bdhirsh).
Fixes #130362. Fixes #130188.
Test Plan:
- new tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130381
Approved by: https://github.com/Chillee
Summary:
1. Fixed #130201 by adding type promotion.
2. Added proper tests.
3. Found torch's type promotion is different from numpy as follows:
```python
import torch
import numpy as np
np.clip(np.array([1], dtype=np.float32), np.array([1], dtype=np.int32), None).dtype # dtype('float64')
torch.clamp(torch.tensor([1], dtype=torch.float32), torch.tensor([1], dtype=torch.int32)).dtype # torch.float32
```
~Not sure the proper way to handle it, it causes numpy ref tests to fail.~
Reason here, so I think I'm gonna xfail it:
3c1cf03fde/test/test_ops.py (L260-L264)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130226
Approved by: https://github.com/malfet
Fix static `py::object`s with `py::gil_safe_call_once_and_store`.
The following code will leak a `py::object` whose destructor is called when the program shuts down. The destructor will call `Py_DECREF(obj.m_ptr)`, which may raise a segmentation fault.
```c++
void func() {
static py::object obj = py::module_::import("foo").attr("bar");
...
}
```
The correct code is to use raw pointers rather than the instance.
```c++
void func() {
static py::object* obj_ptr = new py::object{py::module_::import("foo").attr("bar")};
py::object obj = *obj_ptr;
...
}
```
This PR uses the `py::gil_safe_call_once_and_store` function from `pybind11`, which can run arbitrary initialization code only once under the Python GIL thread safely.
```c++
void func() {
PYBIND11_CONSTINIT static py::gil_safe_call_once_and_store<py::object> storage;
py::object obj = storage
.call_once_and_store_result(
[]() -> py::object {
return py::module_::import("foo").attr("bar");
}
)
.get_stored();
...
}
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130341
Approved by: https://github.com/ezyang
When applied to a triton kernel, capture_triton allows the triton kernel
to be captured when tracing with make_fx. It does this by transforming the
call to the triton kernel into a call to the
triton_kernel_wrapper_mutation HOP, which can actually be traced into a
graph via make_fx.
We have two main uses cases for this:
- non-strict export doesn't use Dynamo, but people want to use
non-strict export to export programs with triton kernels.
non-strict export uses make_fx tracing, so this is a necessary step in
that direction.
- People want to write inductor passes that replace a sequence of
operators with a call to a function that may contain a triton kernel.
The way these passes work today is that we have a FX graph and want to
replace a subgraph of it with a new subgraph. We obtain said subgraph
from calling make_fx on the function; this won't work on raw triton
kernels but will work if one uses capture_triton.
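A rough usage sketch of what this enables (the `capture_triton` import path is an assumption on my part, and this needs triton plus a CUDA device to actually run):
```python
import torch
import triton
import triton.language as tl
from torch.fx.experimental.proxy_tensor import make_fx
from torch._library import capture_triton  # assumed import path

@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    tl.store(out_ptr + offsets,
             tl.load(x_ptr + offsets, mask=mask) + tl.load(y_ptr + offsets, mask=mask),
             mask=mask)

def fn(x, y):
    out = torch.empty_like(x)
    n = out.numel()
    # Launching through capture_triton (instead of add_kernel[grid](...)) turns the
    # launch into a traceable triton_kernel_wrapper_mutation HOP under make_fx.
    capture_triton(add_kernel)[(triton.cdiv(n, 16),)](x, y, out, n, BLOCK_SIZE=16)
    return out

x = torch.randn(128, device="cuda")
y = torch.randn(128, device="cuda")
print(make_fx(fn)(x, y).graph)
```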
Test Plan:
- I wrote some manual tests to run make_fx over two of the triton
kernels in test_triton_kernels. It would be nice to be able to run
make_fx through all of the tests in the file but I'm not sure how to
do that refactor right now.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130178
Approved by: https://github.com/oulgen
ghstack dependencies: #130177
TritonKernelVariable's logic tells us how to go from a user-defined
triton kernel and a grid to a call to the triton_kernel_wrapper_mutation
HOP. We want to re-use this in a setting without Dynamo; in the next PR
up, we create a new decorator (capture_triton) that, when applied to a
triton kernel, transforms a call to the triton kernel into a call
to the triton_kernel_wrapper_mutation HOP.
Test Plan:
- existing tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130177
Approved by: https://github.com/oulgen, https://github.com/ydwu4
Summary:
When writing out Graphviz files for graphs, the arguments sometimes appear all in a row and it's unclear which is which. For `aten.conv2d`, for example, someone might not
remember the stride, padding, dilation order.
Add an option `normalize_args` (defaults to False) to normalize all args into kwargs.
This should help the readability of a graph.
Differential Revision: D59529417
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130348
Approved by: https://github.com/mcremon-meta
Summary: This diff fixes a bug where record_annotations would save a TraceEntry to each of the device_traces. Instead, we should only save annotations to the device_trace for the current device of the thread calling the native allocator's recordAnnotation.
Test Plan: CI and ran workloads on MVAI WPR FBR.
Reviewed By: zdevito
Differential Revision: D59477339
Pulled By: aaronenyeshi
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130315
Approved by: https://github.com/zdevito
We add torch.library.Library._register_torch_dispatch_rule. Here, a user
can provide us a specific rule to run for a specific
(torch_dispatch_class, operator) pair. The motivation is that a user
might want to extend a subclass/mode but may not have access to the
source code of the subclass/mode.
I'll make this public in a follow-up PR if we think the approach and API
is good.
Keep in mind that many subclasses will likely deliver their own open
registration solution (DTensor has register_sharding_prop_rule and NJT
has register_jagged_op); _register_torch_dispatch_rule is meant as a
catch-all open registration mechanism for when the subclass hasn't
provided anything more specific.
Test Plan:
- new tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130064
Approved by: https://github.com/albanD
- Add AMD support for int4 kernel
- Only supports CDNA2 and CDNA3 gpus for now
- Uses `mfma_f32_16x16x16bf16` instruction for matrix multiply
- Uses `v_and_or_b32` instruction and `__hfma2` intrinsic for unpacking bf16 values
- Enable hipify for `__nv_bfloat16` and `__nv_bfloat162` data types
- Enable int4 unit tests for CDNA2 and CDNA3 AMD gpus
- Fix torchscript issues due to hipify for `__nv_bfloat16` type
- TorchScript has its own implementation for bfloat16 type
- Implemented in the `__nv_bfloat16` structure at [resource_strings.h](https://github.com/pytorch/pytorch/blob/main/torch/csrc/jit/codegen/fuser/cuda/resource_strings.h)
- So, we shouldn't hipify any reference of `__nv_bfloat16` in the torchscript implementation
- Hence moved the `__nv_bfloat16` direct references in `codegen.cpp` and `cuda_codegen.cpp` to `resource_strings.h` which is already exempted from hipify
Fixes #124699
Fixes pytorch-labs/gpt-fast/issues/154
Co-authored-by: Nikita Shulga <2453524+malfet@users.noreply.github.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129710
Approved by: https://github.com/malfet
Summary: Previously, when we inlined subgraphs that don't have a different require_grad environment, we didn't clean up the nodes' users in the subgraph and directly used them to replace the outputs of the call_modules. This records dead dependencies in node.users. This PR fixes this.
Test Plan:
Added a new test.
Also see the torchrec tests:
Step 1:
buck run mode/dev-nosan //aimp/experimental/pt2:pt2_export -- --model-entity-id 934687114 --output /tmp/934687114.zip --use-torchrec-eager-mp --use-manifold
Step 2:
buck run mode/opt -c python.package_style=inplace -c fbcode.enable_gpu_sections=true aimp/cli:cli -- --platform=aps --template=disagg_gpu_aps_pt2 --pt2 --model-entity-id=934687114 non-request-only-tagging torchrec-shard-and-quantize gpu-disagg-split assign-device materialize-weights script-and-save
Differential Revision: D59132214
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129716
Approved by: https://github.com/angelayi
Threads inside the thread pools are not named, so they inherit the main process name or the name of the first thread. In our case, since we set `pt_main_thread` as the thread name when a thread does `import torch`, this name is inherited by all the threads in the created pools.
This PR names the threads in the pools I was able to find. There are other pools created, like the OpenMP ones, and we need to follow up on those.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130270
Approved by: https://github.com/d4l3k, https://github.com/albanD
Summary: I actually don't grok why this pattern works; I guess pytest expects a different import syntax for these relative imports?? But this pattern is used in many other tests here (notably `test_aot_inductor.py`), so it must be right ;)
Test Plan:
Ran both ways:
* `python test/inductor/test_memory_planning.py`
* `pytest test/inductor/test_memory_planning.py`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130275
Approved by: https://github.com/zou3519
----
- We now record on CacheEntry what the compile id that populated it was, so now we can say why a specific frame was rejected
- Add structured log for recompiles under name artifact "recompile_reasons". As it stands, it's not terribly structured, but this was the easiest thing I could do to start
- Slightly reformat multi-reason printing; since we only report one guard failure seems better to have it as a single line
Example output:
```
V0703 10:34:13.273000 140345997743104 torch/_dynamo/guards.py:2590] [0/1] [__recompiles] Recompiling function f in /data/users/ezyang/a/pytorch/b.py:3
V0703 10:34:13.273000 140345997743104 torch/_dynamo/guards.py:2590] [0/1] [__recompiles] triggered by the following guard failure(s):
V0703 10:34:13.273000 140345997743104 torch/_dynamo/guards.py:2590] [0/1] [__recompiles] - 0/0: tensor 'L['x']' size mismatch at index 0. expected 4, actual 5
```
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130043
Approved by: https://github.com/anijain2305
Previously, subgraph input names were whatever the input proxies were,
which were confusing. This PR changes those names to be
whatever the names of the arguments the functions being
speculate_subgraph'ed are. This is best-effort: if we can't figure it
out then we go back to the previous strategy.
Test Plan:
- existing expecttests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130255
Approved by: https://github.com/ydwu4
Auto slow test detection keeps marking and then unmarking these as slow, so permanently mark them as slow on Windows.
These tests take >500s on Windows.
This is part of the reason why test_decomp keeps failing on Windows (ex: da66e50e6e).
The other part is something to do with reruns + thresholds that I am still investigating
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130260
Approved by: https://github.com/huydhn, https://github.com/malfet
Previously, jobs would log lines like this due to interpreting an int8 value as a signed char when streaming it out.
"ProcessGroupNCCL created ncclComm_ 0x94960120 on CUDA device: ^@"
We need a better solution for avoiding this systematically, but at least
for now fix the spot we know about.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130184
Approved by: https://github.com/eeggl, https://github.com/Skylion007
Summary:
Previously we stored an edge id in numeric_debug_handle to support operator fusion and operator decomposition throughout the stack,
but according to feedback from customers, people prefer the simpler per-node id; they are fine with not having the additional
support for numerical debugging of inputs and are willing to hack around to achieve it.
This PR changes the structure of numeric_debug_handle to store a unique_id for each node instead.
e.g.
graph:
```
node = op(input_node, weight_node)
```
Before:
```
node.meta[NUMERIC_DEBUG_HANDLE_KEY] = {input_node: id1, weight_node: id2, "output": id3}
```
After:
```
node.meta[NUMERIC_DEBUG_HANDLE_KEY] = id1
```
Test Plan:
python test/test_quantization.py -k TestGenerateNumericDebugHandle
Reviewers:
Subscribers:
Tasks:
Tags:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129811
Approved by: https://github.com/tarun292
This PR updates the public API for NJT construction `torch.nested.nested_tensor_from_jagged()` to accept values for min / max sequence length. It's useful to provide these ahead of time to avoid GPU -> CPU syncs from on-demand computation later on.
NB: The test changes are extensive because I reworked the existing `_validate_nt()` helper function used throughout our NJT construction tests to verify more (specifically: expected cached min / max seq len and contiguity).
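A minimal usage sketch of the updated construction path (the tensor shapes here are made up, and passing the lengths up front is optional):
```python
import torch

values = torch.randn(10, 8)            # packed [total_seq_len, feature_dim]
offsets = torch.tensor([0, 3, 7, 10])  # three sequences of lengths 3, 4, 3
nt = torch.nested.nested_tensor_from_jagged(
    values,
    offsets,
    min_seqlen=3,  # provided up front so SDPA doesn't trigger a GPU -> CPU sync later
    max_seqlen=4,
)
print(nt.is_nested)  # True
```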
API design question: should we additionally provide an option to compute these from `offsets` at construction time? I can think of three possible cases during construction:
1. Min / max seq len has already been obtained from *somewhere* (manual calculation, static values, etc.) and they should be used in the cache
2. Min / max seq len should be computed immediately at construction time for use in the cache (ideally, the caller wouldn't have to do this computation manually)
3. Min / max seq len are not needed at all (i.e. SDPA isn't ever called) and computation should be skipped
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130175
Approved by: https://github.com/davidberard98, https://github.com/soulitzer
**Summary**
In order to give users more information, I have added the deviceMesh for operations with DTensor inputs, as well as module parameter sharding and FQNs. These changes have only been placed in the operation tracing log. In the future, I plan to have just one logging function with an argument controlling how detailed the user wants the log to be, and will get rid of the module tracing log function. This information has also been added to the JSON dump and can be seen in the browser visual. I have also edited the test case file, as the module_depth dictionary has been replaced with module_helper_dict, and have updated the example output for the MLP operation tracing, which can be seen below:
**Test Plan**
1. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e MLP_json_dump
2. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e transformer_json_dump
3. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e MLP_operation_tracing
4. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e transformer_operation_tracing
5. pytest test/distributed/_tensor/debug/test_comm_mode_features.py
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130072
Approved by: https://github.com/XilunWu
ghstack dependencies: #129994
Summary: We call `.get` in the elastic store barrier operation but we don't need the result. This switches it to use `.wait` instead, which eliminates one network round trip since `get` internally does a wait first.
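For reference, a small sketch of the difference on a TCPStore (host, port and key names here are arbitrary):
```python
import torch.distributed as dist

store = dist.TCPStore("127.0.0.1", 29500, world_size=1, is_master=True)
store.set("barrier/0", "done")

store.wait(["barrier/0"])   # blocks until the key exists; no value transferred
_ = store.get("barrier/0")  # waits *and* fetches the value -- an extra round trip
```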
Test Plan:
CI + existing tests -- no behavior change
Differential Revision: D59396199
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130148
Approved by: https://github.com/kurman, https://github.com/wconstab
For the checkpoint optimizer, tensors are created on CUDA when other backends are used. This is because, by default, a torch.device() constructed from a single device ordinal is treated as a CUDA device.
In _alloc_tensor, empty tensors are created using device = cast(torch.device, _get_device_module(device_type).current_device()). The above returns only the index, which creates the empty tensor on CUDA by default. So, change it to use torch.device(device_type, device_module(device_type).current_device()) to get the device together with its index.
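A quick illustration of the root cause (constructing a device object alone doesn't need the backend to be present):
```python
import torch

print(torch.device(0))         # device(type='cuda', index=0) -- bare ordinal implies CUDA
print(torch.device("xpu", 0))  # device(type='xpu', index=0)  -- type + index keeps the backend
```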
Fixes #ISSUE_NUMBER
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129110
Approved by: https://github.com/fegin
This PR makes it so that we don't try to serialize FunctionalTensorWrappers. FunctionalTensorWrappers don't pickle well because they have no underlying storage. This should be fixable at a later point, but I might not be the right author for implementing the serialization for it. If there's a way to avoid actually saving the FunctionalTensorWrappers themselves and just saving the ViewMetadata so we can replay it, that would also work.
To do this, we disable view_replay_input_mutations when using AOTAutogradCache, and then only keep the functional tensor in the ViewAndMutationMeta if we need it for view_replay_input_mutations (i.e. the cache is off).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128335
Approved by: https://github.com/bdhirsh
In https://www.internalfb.com/intern/sevmanager/view/s/429861/, a downstream consuming buffer `buf486_buf526` had two read dependencies; `buf373` and `buf394`, both of which were at separate indices of the upstream foreach op. `buf486_buf526` was fused into `buf373` because in the usual fused case, this is completely fine if all dependencies are met in the upstream fused buffer. However in the foreach case and this case specifically it is possible for foreach ops to be partitioned if there are many arguments in order to stay under CUDA driver arg limits. As a result, this large foreach op was split into two, and the latter had `buf394` in its node schedule for allocation, while the earlier split did not, even though `buf486_buf526` uses the `buf394`, as a result we would hit the unbound local error.
@eellison provided this repro to help debug the issue (https://www.internalfb.com/phabricator/paste/view/P1453035092)
To fix this, we no longer return a valid producer subnode if there are multiple producer subnodes for a downstream consuming op. In short we should not fuse if there are dependencies on multiple foreach subkernels because 1) their execution order is non-deterministic and 2) (this issue) we may not properly handle dependencies in the presence of foreach partitioning.
Co-authored-by: David Berard <dberard@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130046
Approved by: https://github.com/eellison
This PR is needed to resolve usability issues with PyTorch ROCm nightly wheels on non-gfx90a/gf94x architectures as a result of https://github.com/pytorch/pytorch/pull/127944.
Addresses https://github.com/pytorch/pytorch/issues/119081#issuecomment-2166504992
### With this PR's changes, I get the following on a gfx908 (unsupported by hipblasLT) architecture:
_Using setter function:_
```
>>> torch.backends.cuda.preferred_blas_library(backend="cublaslt")
[W617 19:58:58.286088851 Context.cpp:280] Warning: torch.backends.cuda.preferred_blas_library is an experimental feature. If you see any error or unexpected behavior when this flag is set please file an issue on GitHub. (function operator())
[W617 19:59:02.125161985 Context.cpp:291] Warning: Attempting to use hipBLASLt on an unsupported architecture! Overriding blas backend to hipblas (function operator())
<_BlasBackend.Cublas: 0>
```
_Using `TORCH_BLAS_PREFER_HIPBLASLT` env var:_
```
root@9d47bf40d4d4:/tmp/pytorch# TORCH_BLAS_PREFER_CUBLASLT=1 python
>>> import torch
>>> torch.backends.cuda.preferred_blas_library()
[W619 06:14:11.627715807 Context.cpp:274] Warning: Attempting to use hipBLASLt on an unsupported architecture! Overriding blas backend to hipblas (function operator())
<_BlasBackend.Cublas: 0>
```
### and the following on a gfx90a (supported by hipblasLT) architecture:
_Using setter function:_
```
>>> import torch
>>> torch.backends.cuda.preferred_blas_library()
<_BlasBackend.Cublaslt: 1>
>>> torch.backends.cuda.preferred_blas_library(backend="cublas")
<_BlasBackend.Cublas: 0>
>>> torch.backends.cuda.preferred_blas_library(backend="cublaslt")
[W620 18:38:29.404265518 Context.cpp:293] Warning: torch.backends.cuda.preferred_blas_library is an experimental feature. If you see any error or unexpected behavior when this flag is set please file an issue on GitHub. (function operator())
<_BlasBackend.Cublaslt: 1>
```
_Using `TORCH_BLAS_PREFER_HIPBLASLT` env var:_
```
root@9d47bf40d4d4:/tmp/pytorch# TORCH_BLAS_PREFER_HIPBLASLT=1 python
>>> import torch
>>> torch.backends.cuda.preferred_blas_library()
<_BlasBackend.Cublaslt: 1>
```
(Same result for _Using `TORCH_BLAS_PREFER_CUBLASLT` env var:_)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128753
Approved by: https://github.com/malfet
Construct frame localsplus in 3.12+ using our own simplified way rather than copypasting from CPython.
This is necessary for 3.13 since we can no longer generate frame `f_locals` before executing the interpreter frame.
We also enable this for 3.12 since the `f_locals` construction between 3.12 and 3.13 is the same, so we can test for correctness with 3.12.
This is also one of the first steps to completing https://github.com/pytorch/pytorch/issues/93753 - we will implement simplified f_locals generation of previous Python versions in the future.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129185
Approved by: https://github.com/jansel
**Summary**
Currently, users have 2 options to view the tracing data. The first is through the console, where colored text is used to help users read the information. The second is logging the information to a text file, which is useful when the log is too long to fit in the console. However, depending on the model complexity, these logs can run to thousands of lines, making it difficult for the user to find specific information. To fix this, I have added the functionality to convert the log into a JSON file, which is used to create a tree view in a browser, allowing the user to collapse parts of the log that are not useful to them. I have given the user the option to pass their own file path, but there is a default one in the event that none is provided. The expected output of the beginning of the JSON file and the browser view for the MLP model are shown below:
<img width="542" alt="Screenshot 2024-07-02 at 3 40 41 PM" src="https://github.com/pytorch/pytorch/assets/50644008/b9570540-e1d2-4777-b643-db4801b60ed8">
<img width="777" alt="Screenshot 2024-07-02 at 3 41 43 PM" src="https://github.com/pytorch/pytorch/assets/50644008/9296e255-c3ae-48a4-8be7-4273f69ee178">
**Test Plan**
1. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e MLP_json_dump
2. torchrun --standalone --nnodes=1 --nproc-per-node=4 torch/distributed/_tensor/examples/comm_mode_features_example.py -e transformer_json_dump
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129994
Approved by: https://github.com/XilunWu
Summary:
- Use `&=` instead of `|=`, since `|=` ignores an incorrect scale/zp.
- Change the scale check to use float comparison instead of int comparison.
- Issue a warning instead of an error for backward compatibility: ex: P1204628034
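A generic illustration of the two fixes (not the actual quantization code):
```python
import math

scale_checks = [True, False, True]  # e.g. per-channel scale comparisons

wrong, right = False, True
for ok in scale_checks:
    wrong |= ok   # |= stays True even though one check failed
    right &= ok   # &= correctly ends up False
print(wrong, right)  # True False

# Scales are floats, so compare with a tolerance rather than as ints:
print(math.isclose(0.1 + 1e-7, 0.1, rel_tol=1e-5))  # True
```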
Test Plan: see warning in: P1204628034
Reviewed By: jerryzh168
Differential Revision: D55699212
Pull Request resolved: https://github.com/pytorch/pytorch/pull/123769
Approved by: https://github.com/jerryzh168
This PR:
* Sets a random seed before generating each sample for an OpInfo test. It does this by intercepting the sample input iterator via `TrackedInputIter`, optionally setting the seed to a test name specific seed before each iterator call (default is to set the seed).
* Some quick and dirty benchmarking shows (hopefully) negligible overhead from setting the random seed before each sample input generation; before/after timings for a trivial (single assert) test that uses `@ops` are shown below.
* Uncovered a bunch of test issues:
* Test breakdown (>100 total)
* A lot of tolerance issues (tweaked tolerance values to fix)
* 1 broken OpInfo (`sample_inputs_masked_fill` was generating a sample of the wrong dtype)
* 3 actually broken semantics (for masked tensor; added xfails)
* 4 Jacobian mismatches (added xfails)
* 2 nan results (skip for now, need fixing)
* 3 results too far from reference result (add xfails)
* Skips MPS tests for now (there are so many failures!). Those will default to the old behavior.
**before (no seed setting):**
```
real 0m21.306s
user 0m19.053s
sys 0m5.192s
```
**after (with seed setting):**
```
real 0m21.905s
user 0m19.578s
sys 0m5.390s
```
* Utilizing the above for reproducible sample input generation, adds support for restricting the iterator to a single sample input. This is done via an env var `PYTORCH_OPINFO_SAMPLE_INPUT_INDEX` and its usage is included in the repro command.
```
======================================================================
ERROR: test_bar_add_cuda_uint8 (__main__.TestFooCUDA.test_bar_add_cuda_uint8)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/home/jbschlosser/branches/testing_updates/torch/testing/_internal/common_device_type.py", line 971, in test_wrapper
return test(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/jbschlosser/branches/testing_updates/test/test_ops.py", line 2671, in test_bar
self.assertFalse(True)
AssertionError: True is not false
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/jbschlosser/branches/testing_updates/torch/testing/_internal/common_utils.py", line 2816, in wrapper
method(*args, **kwargs)
File "/home/jbschlosser/branches/testing_updates/torch/testing/_internal/common_utils.py", line 2816, in wrapper
method(*args, **kwargs)
File "/home/jbschlosser/branches/testing_updates/torch/testing/_internal/common_device_type.py", line 419, in instantiated_test
result = test(self, **param_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/jbschlosser/branches/testing_updates/torch/testing/_internal/common_utils.py", line 1426, in wrapper
fn(*args, **kwargs)
File "/home/jbschlosser/branches/testing_updates/torch/testing/_internal/common_device_type.py", line 982, in test_wrapper
raise new_e from e
Exception: Caused by sample input at index 3: SampleInput(input=Tensor[size=(10, 5), device="cuda:0", dtype=torch.uint8], args=TensorList[Tensor[size=(), device="cuda:0", dtype=torch.uint8]], kwargs={}, broadcasts_input=False, name='')
To execute this test, run the following from the base repo dir:
PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=3 python test/test_ops.py -k TestFooCUDA.test_bar_add_cuda_uint8
This message can be suppressed by setting PYTORCH_PRINT_REPRO_ON_FAILURE=0
----------------------------------------------------------------------
Ran 1 test in 0.037s
FAILED (errors=1)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128238
Approved by: https://github.com/janeyx99, https://github.com/justinchuby
If we have dynamic shapes, the heuristic in mixed_mm will cause a crash, because it cannot compare m, k and n to integer values. This PR makes it so that the heuristic only runs if we have static shapes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130081
Approved by: https://github.com/Chillee
Summary:
We have a cache to guarantee that each `sym` is codegen'd only once; see the following code:
```
def ensure_size_computed(self, sym: sympy.Symbol):
    if isinstance(sym, sympy.Symbol) and symbol_is_type(sym, SymT.PRECOMPUTED_SIZE):
        if sym in self.computed_sizes:
            return
        self.computed_sizes.add(sym)
        expr = V.graph.sizevars.inv_precomputed_replacements[sym]
        self.writeline(
            f"{self.declare}{sym} = {self.expr_printer(expr)}{self.ending}"
        )
```
However, we didn't consider the case where the same `sym`s need to be codegen'd in both branches of a condition (true branch and false branch), which caused the `undefined symbols` issue: P1441378833
To fix the issue, we use a stack to capture the state before doing the condition codegen and restore the state after the codegen is done.
Test Plan:
TORCH_LOGS="+inductor" buck2 run mode/dev-nosan -c fbcode.nvcc_arch=h100 -c fbcode.enable_gpu_sections=true --config 'cxx.extra_cxxflags=-g1' -c fbcode.platform010_cuda_version=12 //scripts/hhh:repro_cond_torch_compile
PYTORCH_TEST_FBCODE=1 TORCH_COMPILE_DEBUG=1 buck2 run mode/opt -c=python.package_style=inplace -c fbcode.enable_gpu_sections=true -c fbcode.platform=platform010 -c fbcode.split-dwarf=true //caffe2/test/inductor:control_flow -- -r test_cond_control_flow_with_precomputed_size
Differential Revision: D58973730
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129492
Approved by: https://github.com/aakhundov
This PR adds deduplication and CSE for runtime asserts. Existing size computation in the graph is CSE'd along with added runtime asserts, and redundant asserts are removed. Shape calls on intermediate tensors are also turned into compute on input sizes if possible, allowing intermediate tensors to be freed earlier. For example:
```
z = torch.cat([x, x], dim=0) # 2*s0
w = z.repeat(y.shape[0]) # 2*s0*s1
_w = w.shape[0]
# something with _w ...
# turns into ->
s0 = x.shape[0]
s1 = y.shape[0]
_w0 = 2 * s0
_w = _w0 * s1
```
Additionally, constrain_range calls are deduplicated. Single-symbol bound checks for unbacked symbols (e.g. u0 >= 0, u0 <= 5) and sym_constrain_range.default calls are also removed, since they accumulate range info in the ShapeEnv, and are replaced with two _assert_scalar.default calls that check the min/max bounds. For example:
```
torch.sym_constrain_range_for_size(n, min=2, max=16)
torch.sym_constrain_range(n, min=4, max=20)
torch._check(n >= 0)
torch._check(n >= 3)
torch._check(n <= 14)
# turns into
torch.sym_constrain_range_for_size(n)
torch._check(n >= 4)
torch._check(n <= 14)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128599
Approved by: https://github.com/ezyang
**Summary**
Support more than one Local Buffer in an outer loop fused node, and also the case where multiple global buffers share usage of the same local buffer.
**TestPlan**
```
python -u -m pytest -s -v inductor/test_cpu_repro.py -k test_two_local_buffers_in_outer_loop_fusion
python -u -m pytest -s -v inductor/test_cpu_repro.py -k test_share_local_buffers_in_outer_loop_fusion
```
**Next Step**
- [✓] Support more than one Local Buffer/Global Buffer
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129121
Approved by: https://github.com/jgong5, https://github.com/peterbell10
ghstack dependencies: #126967
Compiling the `create_block_mask` function allows us to "materialize" extremely large masks. This would have been a 1 *trillion* element tensor if fully materialized.
```
print(do_bench(lambda: create_block_mask(causal_mask, 1, 1, 2**20, 2**20, _compiled=True)))
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130106
Approved by: https://github.com/yanboliang
ghstack dependencies: #130160
Summary:
Multiple threads can be accessing the alloc_trace std::vector concurrently, which will result in SIGSEGVs when objects are double freed, accessed after free, or inserted at the same time from two threads.
We need to lock when inserting, accessing or removing TraceEntry in alloc_trace.
Test Plan:
This is a rare crash, which was exposed when we introduced recordAnnotations, which saves record_function annotations into the snapshot files. Saving a lot of annotations can trigger this bug. Here are a few jobs that crashed before, and this diff fixes.
Differential Revision: D59380507
Pulled By: aaronenyeshi
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130180
Approved by: https://github.com/eqy, https://github.com/kit1980
There is one huge problem this fixes: today, sympify(symint)
produces a float(!!) because Sympy attempts to see if you can
coerce the symint to float in sympify and of course this works on
SymInt.
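A toy illustration of the fallback in question, if I recall sympy's coercion order correctly (the `FakeSymInt` class is made up; the point is just that sympify happily goes through `__float__`):
```python
import sympy

class FakeSymInt:
    """Stand-in for SymInt: any object sympy can coerce via float()."""
    def __float__(self):
        return 3.0

print(sympy.sympify(FakeSymInt()))        # 3.00000000000000
print(type(sympy.sympify(FakeSymInt())))  # <class 'sympy.core.numbers.Float'>
```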
However, this also has another nontrivial effect: anywhere in Inductor
where sympy expressions are passed around, it is also valid to pass
around a SymInt now. I'm ambivalent about this: it's currently a
mistake to be passing around a SymInt when a sympy expression is
expected. But maybe this is fine?
Signed-off-by: Edward Z. Yang <ezyang@meta.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130166
Approved by: https://github.com/yf225
Summary:
## Context
TL;DR: aot_export failed for SDPA memory efficient backend when using `inference_mode`
The CMF AOTI lowering started to fail on the trunk. We have the script (https://fburl.com/code/kfk64i5s) to reproduce the issue quickly (log: P1469307638). By bisecting the stack, we found the issue starting from the D58701607
## Root Cause
In the `inference_mode()`,
the `aten::scaled_dot_product_attention` op was not decomposed before `functionalization`, and the op itself is an out-of-place op, so `functionalization` made no change; it was then decomposed into `masked_fill_`, which was further decomposed into `copy_`
So it's `aten::sdpa` --- (functionalization) ---> `aten::sdpa` --- (decompose) ---> `masked_fill_` --- (decompose) ---> `copy_` ---> failure
In the `torch.no_grad()`,
`aten::sdpa` was decomposed before `functionalization`, so the story is
`aten::sdpa` --- (decompose) ---> `masked_fill_` --- (functionalization) ---> `masked_fill` --- (decompose) ---> `out-place ops` ---> good
## How to fix
Long-term:
The issue was tracked in the ticket (https://github.com/pytorch/pytorch/issues/129418). The long-term fix could be we do one more round of `functionalization` after the `decompose`, like
`aten::sdpa` --- (functionalization) ---> `aten::sdpa` --- (decompose) ---> `masked_fill_` --- (functionalization) ---> `masked_fill` ---> good
Short-term:
That would be a big change, I guess. To unblock the production use case, I marked `aten::sdpa` to be decomposed in this diff.
Test Plan:
local repro works now
buck run mode/opt scripts/sijiac/prototypes:sdpa_aoti
Differential Revision: D59385876
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130164
Approved by: https://github.com/zou3519
The real root cause of the issue is that the current stream on a given CUDA device may be the legacy default stream, which doesn't seem to have a device associated with it. If the current CUDA device as reported by `cudaGetDevice` doesn't match the intended legacy default stream's device (this happens if a user is running distributed code without e.g., `torch.cuda.set_device(mylocalrank)`) then the stream synchronize will not have the intended effect. Previous stream sync code here correctly inserted a `DeviceGuard` to ensure that this legacy-default-stream-sync with a mismatched current device didn't happen, but the check is elided here. The simplest fix is to just use the `CUDAStream` wrapper's `synchronize()` call, which already correctly uses a `DeviceGuard` internally:
a21d4363d2/c10/cuda/CUDAStream.h (L132)
OUTDATED below:
The current behavior of `barrier`'s `synchronizeInternal` seems to be a bit counterintuitive, as it is synchronizing on a device's current `CUDAStream` rather than the one used for the actual `allreduce` (the `ncclStream`). In practice this results in a script like the following:
```
import logging
import os
import time

import torch
import torch.distributed as dist

def main():
    logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")
    backend = 'nccl'
    group = torch.distributed.init_process_group(backend=backend)
    rank = torch.distributed.get_rank(group=group)
    for i in range(4):
        time.sleep(rank)
        logging.info(f"Rank {rank}: enter barrier {i}")
        dist.barrier()
        logging.info(f"Rank {rank}: exit barrier {i}")
    dist.destroy_process_group()

if __name__ == "__main__":
    main()
```
appearing to show that ranks can exit barrier(s) before other ranks have entered. Note that the device-side ordering should still be correct in this case, but the host is free to run ahead.
The issue can be worked-around by adding a `torch.cuda.synchronize(rank)` after the `barrier`, but this seems to be against the spirit of the stream synchronization which deliberately tried to avoid a device synchronization.
This PR does a sync on the `allreduce`'s stream so that a device synchronization is not needed to align the host's output with the device.
CC @wujingyue @Aidyn-A @ptrblck
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129908
Approved by: https://github.com/kwen2501
Removes extraneous .a, .so, and .py files from the split build. From here we can also clean up the builder script which produces the binary to do this. That PR is https://github.com/pytorch/builder/pull/1912
Verification:
The built wheel with BUILD_LIBTORCH_WHL=1 has the following files only (with .a, .so, and .py extensions)
```
sahanp@devgpu086 ~/p/dist (viable/strict)> pwd (pytorch-3.10)
/home/sahanp/pytorch/dist
sahanp@devgpu086 ~/p/dist (viable/strict)> find . -type f \( -name "*.py" -o -name "*.a" -o -name "*.so" \) (pytorch-3.10)
./torch/__init__.py
./torch/lib/libbackend_with_compiler.so
./torch/lib/libc10.so
./torch/lib/libjitbackend_test.so
./torch/lib/libtorch.so
./torch/lib/libtorch_cpu.so
./torch/lib/libtorch_global_deps.so
./torch/lib/libtorchbind_test.so
sahanp@devgpu086 ~/p/dist (viable/strict)>
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130053
Approved by: https://github.com/atalman
Summary: The explain function does a conversion dry run to give users feedback on which operators are not supported or fail the conversion.
Test Plan: * `pytest test/export/test_converter.py`
Differential Revision: D59251934
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129968
Approved by: https://github.com/angelayi
To avoid outage on HUD, I plan to migrate perf stats to dynamoDB as follows:
1. Upload perf stats to both Rockset and dynamoDB
2. Copy all the existing content from Rockset to dynamoDB
3. Create new Rockset tables to map to dynamoDB
4. Switch HUD to use the new Rockset tables (temporarily)
5. Delete the existing tables
This depends on https://github.com/pytorch-labs/pytorch-gha-infra/pull/422
### Testing
```
python3 -m tools.stats.upload_dynamo_perf_stats --workflow-run-id 9770217910 --workflow-run-attempt 1 --repo "pytorch/pytorch" --head-branch "gh/shunting314/162/head" --rockset-collection torch_dynamo_perf_stats --rockset-workspace inductor --dynamodb-table torchci-dynamo-perf-stats --match-filename "^inductor_"
...
Writing 1607 documents to DynamoDB torchci-dynamo-perf-stats
```
And confirm the same number of documents is on the table

Pull Request resolved: https://github.com/pytorch/pytorch/pull/129544
Approved by: https://github.com/clee2000
Summary:
Title. This way, both FXGraphCache and AOTAutogradCache use the same torch_key, and we don't need to only hash specific files.
There's an argument to be made to only hash *.py and *.cpp files. Maybe we can fix the glob to do that.
We use a buck_filegroup because otherwise $SRCs gets too large. By using `$(location :torch_sources)`, we make the genrule implicitly depend on all files globbed by torch_sources.
Test Plan:
Unit tests still pass on OSS
For torch_key:
```
buck2 build caffe2:src_hash.txt -v 2 --show-output
```
See the output, then make any change to any torch file. See that the hash changes.
Reviewed By: oulgen
Differential Revision: D58875785
Pull Request resolved: https://github.com/pytorch/pytorch/pull/129250
Approved by: https://github.com/oulgen