mirror of
				https://github.com/pytorch/pytorch.git
				synced 2025-10-31 12:15:03 +08:00 
			
		
		
		
	Compare commits
	
		
			71 Commits
		
	
	
		
			mlazos/tup
			...
			v2.7.0-rc9
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 073912749d | |||
| 0c236f3c72 | |||
| c7ff78dfc0 | |||
| 894909a613 | |||
| ef2b1390ed | |||
| 3f236f1903 | |||
| 35f1e76212 | |||
| a6321d6227 | |||
| 1cc51c640a | |||
| 28ca4dd77d | |||
| 06c6a81a98 | |||
| 3b61d5d4e3 | |||
| 8b6bc59e95 | |||
| c2ccaa3c21 | |||
| 6569576c4e | |||
| 5416dff2b2 | |||
| 791265114e | |||
| 7ad8bc7e8b | |||
| f2ee3f4847 | |||
| dfd39fe14f | |||
| b766c0200a | |||
| a3cd7b0cc4 | |||
| 8522972133 | |||
| c4b98c8364 | |||
| d10ffd76db | |||
| 53a13e553d | |||
| 5745d6a770 | |||
| 60ddcd803e | |||
| f2b3b5c453 | |||
| 71fa7def26 | |||
| 1a6c192dc4 | |||
| e691e92297 | |||
| 2b73f403c7 | |||
| 697cd9bbb1 | |||
| 64ca70f83c | |||
| 1b84fd1503 | |||
| 6b27e11a5b | |||
| 18a926f547 | |||
| ecd434bea9 | |||
| 5bed3fafc7 | |||
| 9b4f085526 | |||
| d29e4c81d9 | |||
| 8d2186cd79 | |||
| b04d8358d9 | |||
| d80afc07f0 | |||
| 84210a82ef | |||
| 4268b2f40a | |||
| 12a6d2a0b8 | |||
| 464432ec47 | |||
| 1f612dafb5 | |||
| f63def6ac7 | |||
| 3a8e623a9b | |||
| bf727425a0 | |||
| 8c7dbc939f | |||
| 644fdbad95 | |||
| fb027c5692 | |||
| 3b87bd8b82 | |||
| 89b098a677 | |||
| 4cc4302b32 | |||
| c632e4fdb8 | |||
| b23bfae9f7 | |||
| 1b8f496f87 | |||
| c236b602ff | |||
| 6926f30654 | |||
| 483980d7f3 | |||
| 7173a73cf4 | |||
| 7bab7354df | |||
| b1940b5867 | |||
| abebbd5113 | |||
| cdd7a2c72b | |||
| d94ea2647c | 
| @ -27,7 +27,6 @@ if [ "$DESIRED_CUDA" = "cpu" ]; then | ||||
|     USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn | ||||
| else | ||||
|     echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA" | ||||
|     export USE_SYSTEM_NCCL=1 | ||||
|     #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files | ||||
|     USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda | ||||
| fi | ||||
|  | ||||
| @ -31,47 +31,33 @@ def build_ArmComputeLibrary() -> None: | ||||
|         "build=native", | ||||
|     ] | ||||
|     acl_install_dir = "/acl" | ||||
|     acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary") | ||||
|     if os.path.isdir(acl_install_dir): | ||||
|         shutil.rmtree(acl_install_dir) | ||||
|     if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)): | ||||
|         check_call( | ||||
|             [ | ||||
|                 "git", | ||||
|                 "clone", | ||||
|                 "https://github.com/ARM-software/ComputeLibrary.git", | ||||
|                 "-b", | ||||
|                 "v25.02", | ||||
|                 "--depth", | ||||
|                 "1", | ||||
|                 "--shallow-submodules", | ||||
|             ] | ||||
|         ) | ||||
|     acl_checkout_dir = "ComputeLibrary" | ||||
|     os.makedirs(acl_install_dir) | ||||
|     check_call( | ||||
|         [ | ||||
|             "git", | ||||
|             "clone", | ||||
|             "https://github.com/ARM-software/ComputeLibrary.git", | ||||
|             "-b", | ||||
|             "v25.02", | ||||
|             "--depth", | ||||
|             "1", | ||||
|             "--shallow-submodules", | ||||
|         ] | ||||
|     ) | ||||
|  | ||||
|     check_call( | ||||
|         ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags, | ||||
|         ["scons", "Werror=1", "-j8", f"build_dir=/{acl_install_dir}/build"] | ||||
|         + acl_build_flags, | ||||
|         cwd=acl_checkout_dir, | ||||
|     ) | ||||
|     for d in ["arm_compute", "include", "utils", "support", "src", "build"]: | ||||
|     for d in ["arm_compute", "include", "utils", "support", "src"]: | ||||
|         shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}") | ||||
|  | ||||
|  | ||||
| def replace_tag(filename) -> None: | ||||
|     with open(filename) as f: | ||||
|         lines = f.readlines() | ||||
|     for i, line in enumerate(lines): | ||||
|         if line.startswith("Tag:"): | ||||
|             lines[i] = line.replace("-linux_", "-manylinux_2_28_") | ||||
|             print(f"Updated tag from {line} to {lines[i]}") | ||||
|             break | ||||
|  | ||||
|     with open(filename, "w") as f: | ||||
|         f.writelines(lines) | ||||
|  | ||||
|  | ||||
| def package_cuda_wheel(wheel_path, desired_cuda) -> None: | ||||
| def update_wheel(wheel_path, desired_cuda) -> None: | ||||
|     """ | ||||
|     Package the cuda wheel libraries | ||||
|     Update the cuda wheel libraries | ||||
|     """ | ||||
|     folder = os.path.dirname(wheel_path) | ||||
|     wheelname = os.path.basename(wheel_path) | ||||
| @ -102,19 +88,30 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: | ||||
|         "/usr/lib64/libgfortran.so.5", | ||||
|         "/acl/build/libarm_compute.so", | ||||
|         "/acl/build/libarm_compute_graph.so", | ||||
|         "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", | ||||
|         "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", | ||||
|         "/usr/local/lib/libnvpl_lapack_core.so.0", | ||||
|         "/usr/local/lib/libnvpl_blas_core.so.0", | ||||
|     ] | ||||
|  | ||||
|     if "128" in desired_cuda: | ||||
|     if enable_cuda: | ||||
|         libs_to_copy += [ | ||||
|             "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8", | ||||
|             "/usr/local/cuda/lib64/libcufile.so.0", | ||||
|             "/usr/local/cuda/lib64/libcufile_rdma.so.1", | ||||
|             "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0", | ||||
|             "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0", | ||||
|             "/usr/local/lib/libnvpl_lapack_core.so.0", | ||||
|             "/usr/local/lib/libnvpl_blas_core.so.0", | ||||
|         ] | ||||
|         if "126" in desired_cuda: | ||||
|             libs_to_copy += [ | ||||
|                 "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.6", | ||||
|                 "/usr/local/cuda/lib64/libcufile.so.0", | ||||
|                 "/usr/local/cuda/lib64/libcufile_rdma.so.1", | ||||
|             ] | ||||
|         elif "128" in desired_cuda: | ||||
|             libs_to_copy += [ | ||||
|                 "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8", | ||||
|                 "/usr/local/cuda/lib64/libcufile.so.0", | ||||
|                 "/usr/local/cuda/lib64/libcufile_rdma.so.1", | ||||
|             ] | ||||
|     else: | ||||
|         libs_to_copy += [ | ||||
|             "/opt/OpenBLAS/lib/libopenblas.so.0", | ||||
|         ] | ||||
|  | ||||
|     # Copy libraries to unzipped_folder/a/lib | ||||
|     for lib_path in libs_to_copy: | ||||
|         lib_name = os.path.basename(lib_path) | ||||
| @ -123,13 +120,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None: | ||||
|             f"cd {folder}/tmp/torch/lib/; " | ||||
|             f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}" | ||||
|         ) | ||||
|  | ||||
|     # Make sure the wheel is tagged with manylinux_2_28 | ||||
|     for f in os.scandir(f"{folder}/tmp/"): | ||||
|         if f.is_dir() and f.name.endswith(".dist-info"): | ||||
|             replace_tag(f"{f.path}/WHEEL") | ||||
|             break | ||||
|  | ||||
|     os.mkdir(f"{folder}/cuda_wheel") | ||||
|     os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *") | ||||
|     shutil.move( | ||||
| @ -204,10 +194,8 @@ if __name__ == "__main__": | ||||
|     ).decode() | ||||
|  | ||||
|     print("Building PyTorch wheel") | ||||
|     build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " | ||||
|     # MAX_JOB=5 is not required for CPU backend (see commit 465d98b) | ||||
|     if enable_cuda: | ||||
|         build_vars = "MAX_JOBS=5 " + build_vars | ||||
|     build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 " | ||||
|     os.system("cd /pytorch; python setup.py clean") | ||||
|  | ||||
|     override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION") | ||||
|     desired_cuda = os.getenv("DESIRED_CUDA") | ||||
| @ -254,6 +242,6 @@ if __name__ == "__main__": | ||||
|         print("Updating Cuda Dependency") | ||||
|         filename = os.listdir("/pytorch/dist/") | ||||
|         wheel_path = f"/pytorch/dist/{filename[0]}" | ||||
|         package_cuda_wheel(wheel_path, desired_cuda) | ||||
|         update_wheel(wheel_path, desired_cuda) | ||||
|     pytorch_wheel_name = complete_wheel("/pytorch/") | ||||
|     print(f"Build Complete. Created {pytorch_wheel_name}..") | ||||
|  | ||||
| @ -19,11 +19,13 @@ import boto3 | ||||
|  | ||||
| # AMI images for us-east-1, change the following based on your ~/.aws/config | ||||
| os_amis = { | ||||
|     "ubuntu18_04": "ami-078eece1d8119409f",  # login_name: ubuntu | ||||
|     "ubuntu20_04": "ami-052eac90edaa9d08f",  # login_name: ubuntu | ||||
|     "ubuntu22_04": "ami-0c6c29c5125214c77",  # login_name: ubuntu | ||||
|     "redhat8": "ami-0698b90665a2ddcf1",  # login_name: ec2-user | ||||
| } | ||||
|  | ||||
| ubuntu18_04_ami = os_amis["ubuntu18_04"] | ||||
| ubuntu20_04_ami = os_amis["ubuntu20_04"] | ||||
|  | ||||
|  | ||||
| @ -657,6 +659,18 @@ def configure_system( | ||||
|             "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip" | ||||
|         ) | ||||
|     host.run_cmd("pip3 install dataclasses typing-extensions") | ||||
|     # Install and switch to gcc-8 on Ubuntu-18.04 | ||||
|     if not host.using_docker() and host.ami == ubuntu18_04_ami and compiler == "gcc-8": | ||||
|         host.run_cmd("sudo apt-get install -y g++-8 gfortran-8") | ||||
|         host.run_cmd( | ||||
|             "sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 100" | ||||
|         ) | ||||
|         host.run_cmd( | ||||
|             "sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 100" | ||||
|         ) | ||||
|         host.run_cmd( | ||||
|             "sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-8 100" | ||||
|         ) | ||||
|     if not use_conda: | ||||
|         print("Installing Cython + numpy from PyPy") | ||||
|         host.run_cmd("sudo pip3 install Cython") | ||||
| @ -1012,7 +1026,7 @@ if __name__ == "__main__": | ||||
|         install_condaforge_python(host, args.python_version) | ||||
|         sys.exit(0) | ||||
|  | ||||
|     python_version = args.python_version if args.python_version is not None else "3.9" | ||||
|     python_version = args.python_version if args.python_version is not None else "3.8" | ||||
|  | ||||
|     if args.use_torch_from_pypi: | ||||
|         configure_system(host, compiler=args.compiler, python_version=python_version) | ||||
|  | ||||
| @ -10,3 +10,5 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are | ||||
| built on Jenkins and are used in triggered builds already have this | ||||
| environment variable set in their manifest. Also see | ||||
| `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. | ||||
|  | ||||
| Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. | ||||
|  | ||||
| @ -13,6 +13,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then | ||||
|   echo 'Skipping tests' | ||||
|   exit 0 | ||||
| fi | ||||
| if [[ "${BUILD_ENVIRONMENT}" == *-rocm* ]]; then | ||||
|   # temporary to locate some kernel issues on the CI nodes | ||||
|   export HSAKMT_DEBUG_LEVEL=4 | ||||
| fi | ||||
| # These additional packages are needed for circleci ROCm builds. | ||||
| if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then | ||||
|     # Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by | ||||
|  | ||||
| @ -34,5 +34,5 @@ See `build.sh` for valid build environments (it's the giant switch). | ||||
| ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest | ||||
|  | ||||
| # Set flags (see build.sh) and build image | ||||
| sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest | ||||
| sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest | ||||
| ``` | ||||
|  | ||||
| @ -1,7 +1,6 @@ | ||||
| ARG CUDA_VERSION=12.6 | ||||
| ARG CUDA_VERSION=12.4 | ||||
| ARG BASE_TARGET=cuda${CUDA_VERSION} | ||||
| ARG ROCM_IMAGE=rocm/dev-almalinux-8:6.3-complete | ||||
| FROM amd64/almalinux:8.10-20250519 as base | ||||
| FROM amd64/almalinux:8 as base | ||||
|  | ||||
| ENV LC_ALL en_US.UTF-8 | ||||
| ENV LANG en_US.UTF-8 | ||||
| @ -9,10 +8,12 @@ ENV LANGUAGE en_US.UTF-8 | ||||
|  | ||||
| ARG DEVTOOLSET_VERSION=11 | ||||
|  | ||||
| ENV LC_ALL en_US.UTF-8 | ||||
| ENV LANG en_US.UTF-8 | ||||
| ENV LANGUAGE en_US.UTF-8 | ||||
|  | ||||
| RUN yum -y update | ||||
| RUN yum -y install epel-release | ||||
| # install glibc-langpack-en make sure en_US.UTF-8 locale is available | ||||
| RUN yum -y install glibc-langpack-en | ||||
| RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain | ||||
| # Just add everything as a safe.directory for git since these will be used in multiple places with git | ||||
| RUN git config --global --add safe.directory '*' | ||||
| @ -40,36 +41,31 @@ RUN bash ./install_conda.sh && rm install_conda.sh | ||||
|  | ||||
| # Install CUDA | ||||
| FROM base as cuda | ||||
| ARG CUDA_VERSION=12.6 | ||||
| ARG CUDA_VERSION=12.4 | ||||
| RUN rm -rf /usr/local/cuda-* | ||||
| ADD ./common/install_cuda.sh install_cuda.sh | ||||
| COPY ./common/install_nccl.sh install_nccl.sh | ||||
| COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ | ||||
| COPY ./common/install_cusparselt.sh install_cusparselt.sh | ||||
| ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION} | ||||
| # Preserve CUDA_VERSION for the builds | ||||
| ENV CUDA_VERSION=${CUDA_VERSION} | ||||
| # Make things in our path by default | ||||
| ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH | ||||
|  | ||||
| FROM cuda as cuda11.8 | ||||
| RUN bash ./install_cuda.sh 11.8 | ||||
| ENV DESIRED_CUDA=11.8 | ||||
|  | ||||
| FROM cuda as cuda12.1 | ||||
| RUN bash ./install_cuda.sh 12.1 | ||||
| ENV DESIRED_CUDA=12.1 | ||||
|  | ||||
| FROM cuda as cuda12.4 | ||||
| RUN bash ./install_cuda.sh 12.4 | ||||
| ENV DESIRED_CUDA=12.4 | ||||
|  | ||||
| FROM cuda as cuda12.6 | ||||
| RUN bash ./install_cuda.sh 12.6 | ||||
| ENV DESIRED_CUDA=12.6 | ||||
|  | ||||
| FROM cuda as cuda12.8 | ||||
| RUN bash ./install_cuda.sh 12.8 | ||||
| ENV DESIRED_CUDA=12.8 | ||||
|  | ||||
| FROM cuda as cuda12.9 | ||||
| RUN bash ./install_cuda.sh 12.9 | ||||
| ENV DESIRED_CUDA=12.9 | ||||
|  | ||||
| FROM ${ROCM_IMAGE} as rocm | ||||
| ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" | ||||
| ADD ./common/install_mkl.sh install_mkl.sh | ||||
| RUN bash ./install_mkl.sh && rm install_mkl.sh | ||||
| ENV MKLROOT /opt/intel | ||||
|  | ||||
| # Install MNIST test data | ||||
| FROM base as mnist | ||||
| ADD ./common/install_mnist.sh install_mnist.sh | ||||
| @ -77,9 +73,9 @@ RUN bash ./install_mnist.sh | ||||
|  | ||||
| FROM base as all_cuda | ||||
| COPY --from=cuda11.8  /usr/local/cuda-11.8 /usr/local/cuda-11.8 | ||||
| COPY --from=cuda12.1  /usr/local/cuda-12.1 /usr/local/cuda-12.1 | ||||
| COPY --from=cuda12.4  /usr/local/cuda-12.4 /usr/local/cuda-12.4 | ||||
| COPY --from=cuda12.6  /usr/local/cuda-12.6 /usr/local/cuda-12.6 | ||||
| COPY --from=cuda12.8  /usr/local/cuda-12.8 /usr/local/cuda-12.8 | ||||
| COPY --from=cuda12.9  /usr/local/cuda-12.9 /usr/local/cuda-12.9 | ||||
|  | ||||
| # Final step | ||||
| FROM ${BASE_TARGET} as final | ||||
|  | ||||
| @ -1,70 +1,82 @@ | ||||
| #!/usr/bin/env bash | ||||
| # Script used only in CD pipeline | ||||
|  | ||||
| set -exou pipefail | ||||
| set -eou pipefail | ||||
|  | ||||
| image="$1" | ||||
| shift | ||||
|  | ||||
| if [ -z "${image}" ]; then | ||||
|   echo "Usage: $0 IMAGENAME:ARCHTAG" | ||||
|   echo "Usage: $0 IMAGE" | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| # Go from imagename:tag to tag | ||||
| DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}') | ||||
| DOCKER_IMAGE_NAME="pytorch/${image}" | ||||
|  | ||||
| CUDA_VERSION="" | ||||
| ROCM_VERSION="" | ||||
| EXTRA_BUILD_ARGS="" | ||||
| if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then | ||||
|     # extract cuda version from image name and tag.  e.g. manylinux2_28-builder:cuda12.8 returns 12.8 | ||||
|     CUDA_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}') | ||||
|     EXTRA_BUILD_ARGS="--build-arg CUDA_VERSION=${CUDA_VERSION}" | ||||
| elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then | ||||
|     # extract rocm version from image name and tag.  e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4 | ||||
|     ROCM_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}') | ||||
|     EXTRA_BUILD_ARGS="--build-arg ROCM_IMAGE=rocm/dev-almalinux-8:${ROCM_VERSION}-complete" | ||||
| fi | ||||
|  | ||||
| case ${DOCKER_TAG_PREFIX} in | ||||
|   cpu) | ||||
|     BASE_TARGET=base | ||||
|     ;; | ||||
|   cuda*) | ||||
|     BASE_TARGET=cuda${CUDA_VERSION} | ||||
|     ;; | ||||
|   rocm*) | ||||
|     BASE_TARGET=rocm | ||||
|     ;; | ||||
|   *) | ||||
|     echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}" | ||||
|     exit 1 | ||||
|     ;; | ||||
| esac | ||||
|  | ||||
| # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712 | ||||
| # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023. | ||||
| sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service | ||||
| sudo systemctl daemon-reload | ||||
| sudo systemctl restart docker | ||||
|  | ||||
| export DOCKER_BUILDKIT=1 | ||||
| TOPDIR=$(git rev-parse --show-toplevel) | ||||
| tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') | ||||
|  | ||||
| docker build \ | ||||
|   --target final \ | ||||
|   --progress plain \ | ||||
|   --build-arg "BASE_TARGET=${BASE_TARGET}" \ | ||||
|   --build-arg "DEVTOOLSET_VERSION=11" \ | ||||
|   ${EXTRA_BUILD_ARGS} \ | ||||
|   -t ${tmp_tag} \ | ||||
|   $@ \ | ||||
|   -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \ | ||||
|   ${TOPDIR}/.ci/docker/ | ||||
| CUDA_VERSION=${CUDA_VERSION:-12.1} | ||||
|  | ||||
| if [ -n "${CUDA_VERSION}" ]; then | ||||
| case ${CUDA_VERSION} in | ||||
|   cpu) | ||||
|     BASE_TARGET=base | ||||
|     DOCKER_TAG=cpu | ||||
|     ;; | ||||
|   all) | ||||
|     BASE_TARGET=all_cuda | ||||
|     DOCKER_TAG=latest | ||||
|     ;; | ||||
|   *) | ||||
|     BASE_TARGET=cuda${CUDA_VERSION} | ||||
|     DOCKER_TAG=cuda${CUDA_VERSION} | ||||
|     ;; | ||||
| esac | ||||
|  | ||||
|  | ||||
| ( | ||||
|   set -x | ||||
|   # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712 | ||||
|   # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023. | ||||
|   sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service | ||||
|   sudo systemctl daemon-reload | ||||
|   sudo systemctl restart docker | ||||
|  | ||||
|   docker build \ | ||||
|     --target final \ | ||||
|     --progress plain \ | ||||
|     --build-arg "BASE_TARGET=${BASE_TARGET}" \ | ||||
|     --build-arg "CUDA_VERSION=${CUDA_VERSION}" \ | ||||
|     --build-arg "DEVTOOLSET_VERSION=11" \ | ||||
|     -t ${DOCKER_IMAGE_NAME} \ | ||||
|     $@ \ | ||||
|     -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \ | ||||
|     ${TOPDIR}/.ci/docker/ | ||||
| ) | ||||
|  | ||||
| if [[ "${DOCKER_TAG}" =~ ^cuda* ]]; then | ||||
|   # Test that we're using the right CUDA compiler | ||||
|   docker run --rm "${tmp_tag}" nvcc --version | grep "cuda_${CUDA_VERSION}" | ||||
|   ( | ||||
|     set -x | ||||
|     docker run --rm "${DOCKER_IMAGE_NAME}" nvcc --version | grep "cuda_${CUDA_VERSION}" | ||||
|   ) | ||||
| fi | ||||
|  | ||||
| GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)} | ||||
| GIT_BRANCH_NAME=${GITHUB_REF##*/} | ||||
| GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)} | ||||
| DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE_NAME}-${GIT_BRANCH_NAME} | ||||
| DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE_NAME}-${GIT_COMMIT_SHA} | ||||
| if [[ "${WITH_PUSH:-}" == true ]]; then | ||||
|   ( | ||||
|     set -x | ||||
|     docker push "${DOCKER_IMAGE_NAME}" | ||||
|     if [[ -n ${GITHUB_REF} ]]; then | ||||
|         docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_BRANCH_TAG} | ||||
|         docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_SHA_TAG} | ||||
|         docker push "${DOCKER_IMAGE_BRANCH_TAG}" | ||||
|         docker push "${DOCKER_IMAGE_SHA_TAG}" | ||||
|     fi | ||||
|   ) | ||||
| fi | ||||
|  | ||||
| @ -50,21 +50,30 @@ if [[ "$image" == *xla* ]]; then | ||||
|   exit 0 | ||||
| fi | ||||
|  | ||||
| if [[ "$image" == *-jammy* ]]; then | ||||
| if [[ "$image" == *-focal* ]]; then | ||||
|   UBUNTU_VERSION=20.04 | ||||
| elif [[ "$image" == *-jammy* ]]; then | ||||
|   UBUNTU_VERSION=22.04 | ||||
| elif [[ "$image" == *ubuntu* ]]; then | ||||
|   extract_version_from_image_name ubuntu UBUNTU_VERSION | ||||
| elif [[ "$image" == *centos* ]]; then | ||||
|   extract_version_from_image_name centos CENTOS_VERSION | ||||
| fi | ||||
|  | ||||
| if [ -n "${UBUNTU_VERSION}" ]; then | ||||
|   OS="ubuntu" | ||||
| elif [ -n "${CENTOS_VERSION}" ]; then | ||||
|   OS="centos" | ||||
| else | ||||
|   echo "Unable to derive operating system base..." | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| DOCKERFILE="${OS}/Dockerfile" | ||||
| if [[ "$image" == *rocm* ]]; then | ||||
| # When using ubuntu - 22.04, start from Ubuntu docker image, instead of nvidia/cuda docker image. | ||||
| if [[ "$image" == *cuda* && "$UBUNTU_VERSION" != "22.04" ]]; then | ||||
|   DOCKERFILE="${OS}-cuda/Dockerfile" | ||||
| elif [[ "$image" == *rocm* ]]; then | ||||
|   DOCKERFILE="${OS}-rocm/Dockerfile" | ||||
| elif [[ "$image" == *xpu* ]]; then | ||||
|   DOCKERFILE="${OS}-xpu/Dockerfile" | ||||
| @ -76,6 +85,9 @@ elif [[ "$image" == *linter* ]]; then | ||||
|   DOCKERFILE="linter/Dockerfile" | ||||
| fi | ||||
|  | ||||
| # CMake 3.18 is needed to support CUDA17 language variant | ||||
| CMAKE_VERSION=3.18.5 | ||||
|  | ||||
| _UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb | ||||
| _UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b | ||||
| if [[ "$image" == *rocm* ]]; then | ||||
| @ -83,219 +95,285 @@ if [[ "$image" == *rocm* ]]; then | ||||
|   _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d | ||||
| fi | ||||
|  | ||||
| tag=$(echo $image | awk -F':' '{print $2}') | ||||
|  | ||||
| # It's annoying to rename jobs every time you want to rewrite a | ||||
| # configuration, so we hardcode everything here rather than do it | ||||
| # from scratch | ||||
| case "$tag" in | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11) | ||||
|     CUDA_VERSION=12.8.1 | ||||
| case "$image" in | ||||
|   pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11) | ||||
|     CUDA_VERSION=12.6.3 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=11 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     UCX_COMMIT=${_UCX_COMMIT} | ||||
|     UCC_COMMIT=${_UCC_COMMIT} | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.8.1 | ||||
|   pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.4.1 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=9 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     UCX_COMMIT=${_UCX_COMMIT} | ||||
|     UCC_COMMIT=${_UCC_COMMIT} | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     INDUCTOR_BENCHMARKS=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.8.1 | ||||
|   pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.4.1 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.12 | ||||
|     GCC_VERSION=9 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     UCX_COMMIT=${_UCX_COMMIT} | ||||
|     UCC_COMMIT=${_UCC_COMMIT} | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     INDUCTOR_BENCHMARKS=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.8.1 | ||||
|   pytorch-linux-focal-cuda12.4-cudnn9-py3.13-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.4.1 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.13 | ||||
|     GCC_VERSION=9 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     UCX_COMMIT=${_UCX_COMMIT} | ||||
|     UCC_COMMIT=${_UCC_COMMIT} | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     INDUCTOR_BENCHMARKS=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9) | ||||
|   pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9) | ||||
|     CUDA_VERSION=12.6.3 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=9 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     UCX_COMMIT=${_UCX_COMMIT} | ||||
|     UCC_COMMIT=${_UCC_COMMIT} | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.6 | ||||
|   pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.6.3 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=9 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     UCX_COMMIT=${_UCX_COMMIT} | ||||
|     UCC_COMMIT=${_UCC_COMMIT} | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     INDUCTOR_BENCHMARKS=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.6 | ||||
|   pytorch-linux-focal-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.6.3 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.12 | ||||
|     GCC_VERSION=9 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     UCX_COMMIT=${_UCX_COMMIT} | ||||
|     UCC_COMMIT=${_UCC_COMMIT} | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     INDUCTOR_BENCHMARKS=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.6 | ||||
|   pytorch-linux-focal-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.6.3 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.13 | ||||
|     GCC_VERSION=9 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     UCX_COMMIT=${_UCX_COMMIT} | ||||
|     UCC_COMMIT=${_UCC_COMMIT} | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     INDUCTOR_BENCHMARKS=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9) | ||||
|     CUDA_VERSION=12.8.1 | ||||
|   pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9) | ||||
|     CUDA_VERSION=11.8.0 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=9 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     UCX_COMMIT=${_UCX_COMMIT} | ||||
|     UCC_COMMIT=${_UCC_COMMIT} | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-py3-clang12-onnx) | ||||
|   pytorch-linux-focal-py3-clang10-onnx) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     CLANG_VERSION=12 | ||||
|     CLANG_VERSION=10 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     CONDA_CMAKE=yes | ||||
|     ONNX=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-py3.9-clang12) | ||||
|   pytorch-linux-focal-py3.9-clang10) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     CLANG_VERSION=12 | ||||
|     CLANG_VERSION=10 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     VULKAN_SDK_VERSION=1.2.162.1 | ||||
|     SWIFTSHADER=yes | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-py3.11-clang12) | ||||
|   pytorch-linux-focal-py3.11-clang10) | ||||
|     ANACONDA_PYTHON_VERSION=3.11 | ||||
|     CLANG_VERSION=12 | ||||
|     CLANG_VERSION=10 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     VULKAN_SDK_VERSION=1.2.162.1 | ||||
|     SWIFTSHADER=yes | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-py3.9-gcc9) | ||||
|   pytorch-linux-focal-py3.9-gcc9) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     GCC_VERSION=9 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-rocm-n-1-py3) | ||||
|   pytorch-linux-focal-rocm-n-1-py3) | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=11 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     ROCM_VERSION=6.2.4 | ||||
|     NINJA_VERSION=1.9.0 | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     KATEX=yes | ||||
|     UCX_COMMIT=${_UCX_COMMIT} | ||||
|     UCC_COMMIT=${_UCC_COMMIT} | ||||
|     INDUCTOR_BENCHMARKS=yes | ||||
|     ;; | ||||
|   pytorch-linux-focal-rocm-n-py3) | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=11 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     ROCM_VERSION=6.3 | ||||
|     NINJA_VERSION=1.9.0 | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     KATEX=yes | ||||
|     UCX_COMMIT=${_UCX_COMMIT} | ||||
|     UCC_COMMIT=${_UCC_COMMIT} | ||||
|     INDUCTOR_BENCHMARKS=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-rocm-n-py3) | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|   pytorch-linux-jammy-xpu-2024.0-py3) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     GCC_VERSION=11 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     ROCM_VERSION=6.4 | ||||
|     XPU_VERSION=0.5 | ||||
|     NINJA_VERSION=1.9.0 | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     KATEX=yes | ||||
|     UCX_COMMIT=${_UCX_COMMIT} | ||||
|     UCC_COMMIT=${_UCC_COMMIT} | ||||
|     INDUCTOR_BENCHMARKS=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-xpu-2025.0-py3) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     GCC_VERSION=11 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     XPU_VERSION=2025.0 | ||||
|     NINJA_VERSION=1.9.0 | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-xpu-2025.1-py3) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     GCC_VERSION=11 | ||||
|     VISION=yes | ||||
|     XPU_VERSION=2025.1 | ||||
|     NINJA_VERSION=1.9.0 | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|     pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     GCC_VERSION=11 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     DOCS=yes | ||||
|     INDUCTOR_BENCHMARKS=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12) | ||||
|   pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-clang12) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     CUDA_VERSION=12.8.1 | ||||
|     CUDA_VERSION=11.8 | ||||
|     CUDNN_VERSION=9 | ||||
|     CLANG_VERSION=12 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-py3-clang12-asan) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     CLANG_VERSION=12 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-py3-clang15-asan) | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     CLANG_VERSION=15 | ||||
|     CONDA_CMAKE=yes | ||||
|     VISION=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-py3-clang18-asan) | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     CLANG_VERSION=18 | ||||
|     CONDA_CMAKE=yes | ||||
|     VISION=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-py3.9-gcc11) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     GCC_VERSION=11 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     KATEX=yes | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     DOCS=yes | ||||
|     UNINSTALL_DILL=yes | ||||
| @ -303,12 +381,14 @@ case "$tag" in | ||||
|   pytorch-linux-jammy-py3-clang12-executorch) | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     CLANG_VERSION=12 | ||||
|     CONDA_CMAKE=yes | ||||
|     EXECUTORCH=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-py3.12-halide) | ||||
|     CUDA_VERSION=12.6 | ||||
|     ANACONDA_PYTHON_VERSION=3.12 | ||||
|     GCC_VERSION=11 | ||||
|     CONDA_CMAKE=yes | ||||
|     HALIDE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
| @ -316,23 +396,29 @@ case "$tag" in | ||||
|     CUDA_VERSION=12.6 | ||||
|     ANACONDA_PYTHON_VERSION=3.12 | ||||
|     GCC_VERSION=11 | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON_CPU=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-linter) | ||||
|   pytorch-linux-focal-linter) | ||||
|     # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. | ||||
|     # We will need to update mypy version eventually, but that's for another day. The task | ||||
|     # would be to upgrade mypy to 1.0.0 with Python 3.11 | ||||
|     PYTHON_VERSION=3.9 | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     CONDA_CMAKE=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter) | ||||
|     PYTHON_VERSION=3.9 | ||||
|     CUDA_VERSION=12.8.1 | ||||
|   pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     CUDA_VERSION=11.8 | ||||
|     CONDA_CMAKE=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-aarch64-py3.10-gcc11) | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=11 | ||||
|     ACL=yes | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     CONDA_CMAKE=yes | ||||
|     # snadampal: skipping llvm src build install because the current version | ||||
|     # from pytorch/llvm:9.0.1 is x86 specific | ||||
|     SKIP_LLVM_SRC_BUILD_INSTALL=yes | ||||
| @ -341,7 +427,10 @@ case "$tag" in | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=11 | ||||
|     ACL=yes | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     CONDA_CMAKE=yes | ||||
|     # snadampal: skipping llvm src build install because the current version | ||||
|     # from pytorch/llvm:9.0.1 is x86 specific | ||||
|     SKIP_LLVM_SRC_BUILD_INSTALL=yes | ||||
| @ -349,6 +438,8 @@ case "$tag" in | ||||
|     ;; | ||||
|   *) | ||||
|     # Catch-all for builds that are not hardcoded. | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     echo "image '$image' did not match an existing build configuration" | ||||
|     if [[ "$image" == *py* ]]; then | ||||
| @ -364,7 +455,8 @@ case "$tag" in | ||||
|       TRITON=yes | ||||
|       # To ensure that any ROCm config will build using conda cmake | ||||
|       # and thus have LAPACK/MKL enabled | ||||
|       fi | ||||
|       CONDA_CMAKE=yes | ||||
|     fi | ||||
|     if [[ "$image" == *centos7* ]]; then | ||||
|       NINJA_VERSION=1.10.2 | ||||
|     fi | ||||
| @ -380,37 +472,45 @@ case "$tag" in | ||||
|     if [[ "$image" == *glibc* ]]; then | ||||
|       extract_version_from_image_name glibc GLIBC_VERSION | ||||
|     fi | ||||
|     if [[ "$image" == *cmake* ]]; then | ||||
|       extract_version_from_image_name cmake CMAKE_VERSION | ||||
|     fi | ||||
|   ;; | ||||
| esac | ||||
|  | ||||
| tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') | ||||
|  | ||||
| no_cache_flag="" | ||||
| progress_flag="" | ||||
| # Do not use cache and progress=plain when in CI | ||||
| if [[ -n "${CI:-}" ]]; then | ||||
|   no_cache_flag="--no-cache" | ||||
|   progress_flag="--progress=plain" | ||||
| #when using cudnn version 8 install it separately from cuda | ||||
| if [[ "$image" == *cuda*  && ${OS} == "ubuntu" ]]; then | ||||
|   IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}" | ||||
|   if [[ ${CUDNN_VERSION} == 9 ]]; then | ||||
|     IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}" | ||||
|   fi | ||||
| fi | ||||
|  | ||||
| # Build image | ||||
| docker build \ | ||||
|        ${no_cache_flag} \ | ||||
|        ${progress_flag} \ | ||||
|        --no-cache \ | ||||
|        --progress=plain \ | ||||
|        --build-arg "BUILD_ENVIRONMENT=${image}" \ | ||||
|        --build-arg "PROTOBUF=${PROTOBUF:-}" \ | ||||
|        --build-arg "LLVMDEV=${LLVMDEV:-}" \ | ||||
|        --build-arg "DB=${DB:-}" \ | ||||
|        --build-arg "VISION=${VISION:-}" \ | ||||
|        --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \ | ||||
|        --build-arg "CENTOS_VERSION=${CENTOS_VERSION}" \ | ||||
|        --build-arg "DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" \ | ||||
|        --build-arg "GLIBC_VERSION=${GLIBC_VERSION}" \ | ||||
|        --build-arg "CLANG_VERSION=${CLANG_VERSION}" \ | ||||
|        --build-arg "ANACONDA_PYTHON_VERSION=${ANACONDA_PYTHON_VERSION}" \ | ||||
|        --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \ | ||||
|        --build-arg "GCC_VERSION=${GCC_VERSION}" \ | ||||
|        --build-arg "CUDA_VERSION=${CUDA_VERSION}" \ | ||||
|        --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \ | ||||
|        --build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \ | ||||
|        --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \ | ||||
|        --build-arg "VULKAN_SDK_VERSION=${VULKAN_SDK_VERSION}" \ | ||||
|        --build-arg "SWIFTSHADER=${SWIFTSHADER}" \ | ||||
|        --build-arg "CMAKE_VERSION=${CMAKE_VERSION:-}" \ | ||||
|        --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \ | ||||
|        --build-arg "KATEX=${KATEX:-}" \ | ||||
|        --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \ | ||||
| @ -418,6 +518,7 @@ docker build \ | ||||
|        --build-arg "IMAGE_NAME=${IMAGE_NAME}" \ | ||||
|        --build-arg "UCX_COMMIT=${UCX_COMMIT}" \ | ||||
|        --build-arg "UCC_COMMIT=${UCC_COMMIT}" \ | ||||
|        --build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \ | ||||
|        --build-arg "TRITON=${TRITON}" \ | ||||
|        --build-arg "TRITON_CPU=${TRITON_CPU}" \ | ||||
|        --build-arg "ONNX=${ONNX}" \ | ||||
| @ -426,7 +527,6 @@ docker build \ | ||||
|        --build-arg "EXECUTORCH=${EXECUTORCH}" \ | ||||
|        --build-arg "HALIDE=${HALIDE}" \ | ||||
|        --build-arg "XPU_VERSION=${XPU_VERSION}" \ | ||||
|        --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \ | ||||
|        --build-arg "ACL=${ACL:-}" \ | ||||
|        --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \ | ||||
|        --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \ | ||||
| @ -444,7 +544,7 @@ docker build \ | ||||
| UBUNTU_VERSION=$(echo ${UBUNTU_VERSION} | sed 's/-rc$//') | ||||
|  | ||||
| function drun() { | ||||
|   docker run --rm "$tmp_tag" "$@" | ||||
|   docker run --rm "$tmp_tag" $* | ||||
| } | ||||
|  | ||||
| if [[ "$OS" == "ubuntu" ]]; then | ||||
| @ -492,23 +592,3 @@ if [ -n "$KATEX" ]; then | ||||
|     exit 1 | ||||
|   fi | ||||
| fi | ||||
|  | ||||
| HAS_TRITON=$(drun python -c "import triton" > /dev/null 2>&1 && echo "yes" || echo "no") | ||||
| if [[ -n "$TRITON" || -n "$TRITON_CPU" ]]; then | ||||
|   if [ "$HAS_TRITON" = "no" ]; then | ||||
|     echo "expecting triton to be installed, but it is not" | ||||
|     exit 1 | ||||
|   fi | ||||
| elif [ "$HAS_TRITON" = "yes" ]; then | ||||
|   echo "expecting triton to not be installed, but it is" | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| # Sanity check cmake version.  Executorch reinstalls cmake and I'm not sure if | ||||
| # they support 4.0.0 yet, so exclude them from this check. | ||||
| CMAKE_VERSION=$(drun cmake --version) | ||||
| if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then | ||||
|   echo "CMake version is not 4.0.0:" | ||||
|   drun cmake --version | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| @ -17,8 +17,9 @@ RUN bash ./install_base.sh && rm install_base.sh | ||||
| # Update CentOS git version | ||||
| RUN yum -y remove git | ||||
| RUN yum -y remove git-* | ||||
| RUN yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm && \ | ||||
|     sed -i 's/packages.endpoint/packages.endpointdev/' /etc/yum.repos.d/endpoint.repo | ||||
| RUN yum -y install https://packages.endpoint.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm || \ | ||||
|     (yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm && \ | ||||
|     sed -i "s/packages.endpoint/packages.endpointdev/" /etc/yum.repos.d/endpoint.repo) | ||||
| RUN yum install -y git | ||||
|  | ||||
| # Install devtoolset | ||||
| @ -39,6 +40,7 @@ RUN bash ./install_user.sh && rm install_user.sh | ||||
|  | ||||
| # Install conda and other packages (e.g., numpy, pytest) | ||||
| ARG ANACONDA_PYTHON_VERSION | ||||
| ARG CONDA_CMAKE | ||||
| ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION | ||||
| ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH | ||||
| COPY requirements-ci.txt /opt/conda/requirements-ci.txt | ||||
| @ -46,6 +48,20 @@ COPY ./common/install_conda.sh install_conda.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt | ||||
|  | ||||
| # (optional) Install protobuf for ONNX | ||||
| ARG PROTOBUF | ||||
| COPY ./common/install_protobuf.sh install_protobuf.sh | ||||
| RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi | ||||
| RUN rm install_protobuf.sh | ||||
| ENV INSTALLED_PROTOBUF ${PROTOBUF} | ||||
|  | ||||
| # (optional) Install database packages like LMDB and LevelDB | ||||
| ARG DB | ||||
| COPY ./common/install_db.sh install_db.sh | ||||
| RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi | ||||
| RUN rm install_db.sh | ||||
| ENV INSTALLED_DB ${DB} | ||||
|  | ||||
| # (optional) Install vision packages like OpenCV | ||||
| ARG VISION | ||||
| COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ | ||||
| @ -59,7 +75,7 @@ COPY ./common/install_rocm.sh install_rocm.sh | ||||
| RUN bash ./install_rocm.sh | ||||
| RUN rm install_rocm.sh | ||||
| COPY ./common/install_rocm_magma.sh install_rocm_magma.sh | ||||
| RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} | ||||
| RUN bash ./install_rocm_magma.sh | ||||
| RUN rm install_rocm_magma.sh | ||||
| COPY ./common/install_amdsmi.sh install_amdsmi.sh | ||||
| RUN bash ./install_amdsmi.sh | ||||
| @ -73,6 +89,12 @@ ENV MAGMA_HOME /opt/rocm/magma | ||||
| ENV LANG en_US.utf8 | ||||
| ENV LC_ALL en_US.utf8 | ||||
|  | ||||
| # (optional) Install non-default CMake version | ||||
| ARG CMAKE_VERSION | ||||
| COPY ./common/install_cmake.sh install_cmake.sh | ||||
| RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi | ||||
| RUN rm install_cmake.sh | ||||
|  | ||||
| # (optional) Install non-default Ninja version | ||||
| ARG NINJA_VERSION | ||||
| COPY ./common/install_ninja.sh install_ninja.sh | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| f50bfa92602b45dca884a9e511e5d9ddbe8ba314 | ||||
| 01a22b6f16d117454b7d21ebdc691b0785b84a7f | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| v2.26.5-1 | ||||
| v2.26.2-1 | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| b0e26b7359c147b8aa0af686c20510fb9b15990a | ||||
| 0bcc8265e677e5321606a3311bf71470f14456a8 | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| c8757738a7418249896224430ce84888e8ecdd79 | ||||
| 96316ce50fade7e209553aba4898cd9b82aab83b | ||||
|  | ||||
| @ -30,6 +30,18 @@ install_ubuntu() { | ||||
|     maybe_libomp_dev="" | ||||
|   fi | ||||
|  | ||||
|   # HACK: UCC testing relies on libnccl library from NVIDIA repo, and version 2.16 crashes | ||||
|   # See https://github.com/pytorch/pytorch/pull/105260#issuecomment-1673399729 | ||||
|   # TODO: Eliminate this hack, we should not relay on apt-get installation | ||||
|   # See https://github.com/pytorch/pytorch/issues/144768 | ||||
|   if [[ "$UBUNTU_VERSION" == "20.04"* && "$CUDA_VERSION" == "11.8"* ]]; then | ||||
|     maybe_libnccl_dev="libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8 --allow-downgrades --allow-change-held-packages" | ||||
|   elif [[ "$UBUNTU_VERSION" == "20.04"* && "$CUDA_VERSION" == "12.4"* ]]; then | ||||
|     maybe_libnccl_dev="libnccl2=2.26.2-1+cuda12.4 libnccl-dev=2.26.2-1+cuda12.4 --allow-downgrades --allow-change-held-packages" | ||||
|   else | ||||
|     maybe_libnccl_dev="" | ||||
|   fi | ||||
|  | ||||
|   # Install common dependencies | ||||
|   apt-get update | ||||
|   # TODO: Some of these may not be necessary | ||||
| @ -58,6 +70,7 @@ install_ubuntu() { | ||||
|     libasound2-dev \ | ||||
|     libsndfile-dev \ | ||||
|     ${maybe_libomp_dev} \ | ||||
|     ${maybe_libnccl_dev} \ | ||||
|     software-properties-common \ | ||||
|     wget \ | ||||
|     sudo \ | ||||
| @ -86,6 +99,9 @@ install_centos() { | ||||
|  | ||||
|   ccache_deps="asciidoc docbook-dtds docbook-style-xsl libxslt" | ||||
|   numpy_deps="gcc-gfortran" | ||||
|   # Note: protobuf-c-{compiler,devel} on CentOS are too old to be used | ||||
|   # for Caffe2. That said, we still install them to make sure the build | ||||
|   # system opts to build/use protoc and libprotobuf from third-party. | ||||
|   yum install -y \ | ||||
|     $ccache_deps \ | ||||
|     $numpy_deps \ | ||||
|  | ||||
| @ -9,7 +9,7 @@ install_ubuntu() { | ||||
|   # Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh`` | ||||
|   apt-get install -y cargo | ||||
|   echo "Checking out sccache repo" | ||||
|   git clone https://github.com/mozilla/sccache -b v0.10.0 | ||||
|   git clone https://github.com/mozilla/sccache -b v0.9.1 | ||||
|   cd sccache | ||||
|   echo "Building sccache" | ||||
|   cargo build --release | ||||
|  | ||||
| @ -4,10 +4,16 @@ set -ex | ||||
|  | ||||
| if [ -n "$CLANG_VERSION" ]; then | ||||
|  | ||||
|   if [[ $UBUNTU_VERSION == 22.04 ]]; then | ||||
|   if [[ $CLANG_VERSION == 9 && $UBUNTU_VERSION == 18.04 ]]; then | ||||
|     sudo apt-get update | ||||
|     # gpg-agent is not available by default on 18.04 | ||||
|     sudo apt-get install  -y --no-install-recommends gpg-agent | ||||
|     wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add  - | ||||
|     apt-add-repository "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-${CLANG_VERSION} main" | ||||
|   elif [[ $UBUNTU_VERSION == 22.04 ]]; then | ||||
|     # work around ubuntu apt-get conflicts | ||||
|     sudo apt-get -y -f install | ||||
|     wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add - | ||||
|     wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add  - | ||||
|     if [[ $CLANG_VERSION == 18 ]]; then | ||||
|       apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main" | ||||
|     fi | ||||
| @ -35,7 +41,7 @@ if [ -n "$CLANG_VERSION" ]; then | ||||
|   # clang's packaging is a little messed up (the runtime libs aren't | ||||
|   # added into the linker path), so give it a little help | ||||
|   clang_lib=("/usr/lib/llvm-$CLANG_VERSION/lib/clang/"*"/lib/linux") | ||||
|   echo "$clang_lib" >/etc/ld.so.conf.d/clang.conf | ||||
|   echo "$clang_lib" > /etc/ld.so.conf.d/clang.conf | ||||
|   ldconfig | ||||
|  | ||||
|   # Cleanup package manager | ||||
|  | ||||
							
								
								
									
										31
									
								
								.ci/docker/common/install_cmake.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										31
									
								
								.ci/docker/common/install_cmake.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,31 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| [ -n "$CMAKE_VERSION" ] | ||||
|  | ||||
| # Remove system cmake install so it won't get used instead | ||||
| ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') | ||||
| case "$ID" in | ||||
|   ubuntu) | ||||
|     apt-get remove cmake -y | ||||
|     ;; | ||||
|   centos) | ||||
|     yum remove cmake -y | ||||
|     ;; | ||||
|   *) | ||||
|     echo "Unable to determine OS..." | ||||
|     exit 1 | ||||
|     ;; | ||||
| esac | ||||
|  | ||||
| # Turn 3.6.3 into v3.6 | ||||
| path=$(echo "${CMAKE_VERSION}" | sed -e 's/\([0-9].[0-9]\+\).*/v\1/') | ||||
| file="cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz" | ||||
|  | ||||
| # Download and install specific CMake version in /usr/local | ||||
| pushd /tmp | ||||
| curl -Os --retry 3 "https://cmake.org/files/${path}/${file}" | ||||
| tar -C /usr/local --strip-components 1 --no-same-owner -zxf cmake-*.tar.gz | ||||
| rm -f cmake-*.tar.gz | ||||
| popd | ||||
| @ -7,7 +7,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then | ||||
|   BASE_URL="https://repo.anaconda.com/miniconda" | ||||
|   CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh" | ||||
|   if [[ $(uname -m) == "aarch64" ]] || [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then | ||||
|     BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download"  # @lint-ignore | ||||
|     BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download" | ||||
|     CONDA_FILE="Miniforge3-Linux-$(uname -m).sh" | ||||
|   fi | ||||
|  | ||||
| @ -62,7 +62,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then | ||||
|  | ||||
|   # libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30 | ||||
|   # which is provided in libstdcxx 12 and up. | ||||
|   conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge | ||||
|   conda_install libstdcxx-ng=12.3.0 -c conda-forge | ||||
|  | ||||
|   # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README | ||||
|   if [[ $(uname -m) == "aarch64" ]]; then | ||||
| @ -75,11 +75,19 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then | ||||
|   # and libpython-static for torch deploy | ||||
|   conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}" | ||||
|  | ||||
|   # Use conda cmake in some cases. Conda cmake will be newer than our supported | ||||
|   # min version (3.5 for xenial and 3.10 for bionic), so we only do it in those | ||||
|   # following builds that we know should use conda. Specifically, Ubuntu bionic | ||||
|   # and focal cannot find conda mkl with stock cmake, so we need a cmake from conda | ||||
|   if [ -n "${CONDA_CMAKE}" ]; then | ||||
|     conda_install cmake | ||||
|   fi | ||||
|  | ||||
|   # Magma package names are concatenation of CUDA major and minor ignoring revision | ||||
|   # I.e. magma-cuda102 package corresponds to CUDA_VERSION=10.2 and CUDA_VERSION=10.2.89 | ||||
|   # Magma is installed from a tarball in the ossci-linux bucket into the conda env | ||||
|   if [ -n "$CUDA_VERSION" ]; then | ||||
|     conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION}) | ||||
|     ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION}) ${ANACONDA_PYTHON_VERSION} | ||||
|   fi | ||||
|  | ||||
|   # Install some other packages, including those needed for Python test reporting | ||||
|  | ||||
| @ -3,11 +3,11 @@ | ||||
| set -uex -o pipefail | ||||
|  | ||||
| PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python | ||||
| PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads  # @lint-ignore | ||||
| PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads | ||||
| GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py | ||||
|  | ||||
| # Python versions to be installed in /opt/$VERSION_NO | ||||
| CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"} | ||||
| CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"} | ||||
|  | ||||
| function check_var { | ||||
|     if [ -z "$1" ]; then | ||||
|  | ||||
| @ -2,72 +2,207 @@ | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| arch_path='' | ||||
| targetarch=${TARGETARCH:-$(uname -m)} | ||||
| if [ ${targetarch} = 'amd64' ] || [ "${targetarch}" = 'x86_64' ]; then | ||||
|   arch_path='x86_64' | ||||
| else | ||||
|   arch_path='sbsa' | ||||
| fi | ||||
| NCCL_VERSION=v2.26.2-1 | ||||
| CUDNN_VERSION=9.5.1.17 | ||||
|  | ||||
| function install_cuda { | ||||
|   version=$1 | ||||
|   runfile=$2 | ||||
|   major_minor=${version%.*} | ||||
|   rm -rf /usr/local/cuda-${major_minor} /usr/local/cuda | ||||
|   if [[ ${arch_path} == 'sbsa' ]]; then | ||||
|       runfile="${runfile}_sbsa" | ||||
|   fi | ||||
|   runfile="${runfile}.run" | ||||
|   wget -q https://developer.download.nvidia.com/compute/cuda/${version}/local_installers/${runfile} -O ${runfile} | ||||
|   chmod +x ${runfile} | ||||
|   ./${runfile} --toolkit --silent | ||||
|   rm -f ${runfile} | ||||
|   rm -f /usr/local/cuda && ln -s /usr/local/cuda-${major_minor} /usr/local/cuda | ||||
| function install_cusparselt_040 { | ||||
|     # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html | ||||
|     mkdir tmp_cusparselt && pushd tmp_cusparselt | ||||
|     wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz | ||||
|     tar xf libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz | ||||
|     cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/include/* /usr/local/cuda/include/ | ||||
|     cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/lib/* /usr/local/cuda/lib64/ | ||||
|     popd | ||||
|     rm -rf tmp_cusparselt | ||||
| } | ||||
|  | ||||
| function install_cudnn { | ||||
|   cuda_major_version=$1 | ||||
|   cudnn_version=$2 | ||||
|   mkdir tmp_cudnn && cd tmp_cudnn | ||||
| function install_cusparselt_062 { | ||||
|     # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html | ||||
|     mkdir tmp_cusparselt && pushd tmp_cusparselt | ||||
|     wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz | ||||
|     tar xf libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz | ||||
|     cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/include/* /usr/local/cuda/include/ | ||||
|     cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/ | ||||
|     popd | ||||
|     rm -rf tmp_cusparselt | ||||
| } | ||||
|  | ||||
| function install_cusparselt_063 { | ||||
|     # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html | ||||
|     mkdir tmp_cusparselt && pushd tmp_cusparselt | ||||
|     wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz | ||||
|     tar xf libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz | ||||
|     cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/include/* /usr/local/cuda/include/ | ||||
|     cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/ | ||||
|     popd | ||||
|     rm -rf tmp_cusparselt | ||||
| } | ||||
|  | ||||
| function install_118 { | ||||
|     CUDNN_VERSION=9.1.0.70 | ||||
|     NCCL_VERSION=v2.21.5-1 | ||||
|     echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0" | ||||
|     rm -rf /usr/local/cuda-11.8 /usr/local/cuda | ||||
|     # install CUDA 11.8.0 in the same container | ||||
|     wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run | ||||
|     chmod +x cuda_11.8.0_520.61.05_linux.run | ||||
|     ./cuda_11.8.0_520.61.05_linux.run --toolkit --silent | ||||
|     rm -f cuda_11.8.0_520.61.05_linux.run | ||||
|     rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.8 /usr/local/cuda | ||||
|  | ||||
|     # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement | ||||
|     mkdir tmp_cudnn && cd tmp_cudnn | ||||
|     wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz | ||||
|     tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz | ||||
|     cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/include/* /usr/local/cuda/include/ | ||||
|     cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/lib/* /usr/local/cuda/lib64/ | ||||
|     cd .. | ||||
|     rm -rf tmp_cudnn | ||||
|  | ||||
|     # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses | ||||
|     # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build | ||||
|     git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git | ||||
|     cd nccl && make -j src.build | ||||
|     cp -a build/include/* /usr/local/cuda/include/ | ||||
|     cp -a build/lib/* /usr/local/cuda/lib64/ | ||||
|     cd .. | ||||
|     rm -rf nccl | ||||
|  | ||||
|     install_cusparselt_040 | ||||
|  | ||||
|     ldconfig | ||||
| } | ||||
|  | ||||
| function install_124 { | ||||
|   CUDNN_VERSION=9.1.0.70 | ||||
|   echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2" | ||||
|   rm -rf /usr/local/cuda-12.4 /usr/local/cuda | ||||
|   # install CUDA 12.4.1 in the same container | ||||
|   wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run | ||||
|   chmod +x cuda_12.4.1_550.54.15_linux.run | ||||
|   ./cuda_12.4.1_550.54.15_linux.run --toolkit --silent | ||||
|   rm -f cuda_12.4.1_550.54.15_linux.run | ||||
|   rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda | ||||
|  | ||||
|   # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement | ||||
|   filepath="cudnn-linux-${arch_path}-${cudnn_version}_cuda${cuda_major_version}-archive" | ||||
|   wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-${arch_path}/${filepath}.tar.xz | ||||
|   tar xf ${filepath}.tar.xz | ||||
|   cp -a ${filepath}/include/* /usr/local/cuda/include/ | ||||
|   cp -a ${filepath}/lib/* /usr/local/cuda/lib64/ | ||||
|   mkdir tmp_cudnn && cd tmp_cudnn | ||||
|   wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz | ||||
|   tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz | ||||
|   cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/ | ||||
|   cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/ | ||||
|   cd .. | ||||
|   rm -rf tmp_cudnn | ||||
|  | ||||
|   # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses | ||||
|   # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build | ||||
|   git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git | ||||
|   cd nccl && make -j src.build | ||||
|   cp -a build/include/* /usr/local/cuda/include/ | ||||
|   cp -a build/lib/* /usr/local/cuda/lib64/ | ||||
|   cd .. | ||||
|   rm -rf nccl | ||||
|  | ||||
|   install_cusparselt_062 | ||||
|  | ||||
|   ldconfig | ||||
| } | ||||
|  | ||||
| function install_126 { | ||||
|   CUDNN_VERSION=9.10.2.21 | ||||
|   echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" | ||||
|   install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux | ||||
|   echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3" | ||||
|   rm -rf /usr/local/cuda-12.6 /usr/local/cuda | ||||
|   # install CUDA 12.6.3 in the same container | ||||
|   wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run | ||||
|   chmod +x cuda_12.6.3_560.35.05_linux.run | ||||
|   ./cuda_12.6.3_560.35.05_linux.run --toolkit --silent | ||||
|   rm -f cuda_12.6.3_560.35.05_linux.run | ||||
|   rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda | ||||
|  | ||||
|   install_cudnn 12 $CUDNN_VERSION | ||||
|   # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement | ||||
|   mkdir tmp_cudnn && cd tmp_cudnn | ||||
|   wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz | ||||
|   tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz | ||||
|   cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/ | ||||
|   cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/ | ||||
|   cd .. | ||||
|   rm -rf tmp_cudnn | ||||
|  | ||||
|   CUDA_VERSION=12.6 bash install_nccl.sh | ||||
|   # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses | ||||
|   # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build | ||||
|   git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git | ||||
|   cd nccl && make -j src.build | ||||
|   cp -a build/include/* /usr/local/cuda/include/ | ||||
|   cp -a build/lib/* /usr/local/cuda/lib64/ | ||||
|   cd .. | ||||
|   rm -rf nccl | ||||
|  | ||||
|   CUDA_VERSION=12.6 bash install_cusparselt.sh | ||||
|   install_cusparselt_063 | ||||
|  | ||||
|   ldconfig | ||||
| } | ||||
|  | ||||
| function install_129 { | ||||
|   CUDNN_VERSION=9.10.2.21 | ||||
|   echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" | ||||
|   # install CUDA 12.9.1 in the same container | ||||
|   install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux | ||||
| function prune_118 { | ||||
|     echo "Pruning CUDA 11.8 and cuDNN" | ||||
|     ##################################################################################### | ||||
|     # CUDA 11.8 prune static libs | ||||
|     ##################################################################################### | ||||
|     export NVPRUNE="/usr/local/cuda-11.8/bin/nvprune" | ||||
|     export CUDA_LIB_DIR="/usr/local/cuda-11.8/lib64" | ||||
|  | ||||
|   # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement | ||||
|   install_cudnn 12 $CUDNN_VERSION | ||||
|     export GENCODE="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" | ||||
|     export GENCODE_CUDNN="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" | ||||
|  | ||||
|   CUDA_VERSION=12.9 bash install_nccl.sh | ||||
|     if [[ -n "$OVERRIDE_GENCODE" ]]; then | ||||
|         export GENCODE=$OVERRIDE_GENCODE | ||||
|     fi | ||||
|  | ||||
|   CUDA_VERSION=12.9 bash install_cusparselt.sh | ||||
|     # all CUDA libs except CuDNN and CuBLAS (cudnn and cublas need arch 3.7 included) | ||||
|     ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \ | ||||
|       | xargs -I {} bash -c \ | ||||
|                 "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}" | ||||
|  | ||||
|   ldconfig | ||||
|     # prune CuDNN and CuBLAS | ||||
|     $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a | ||||
|     $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a | ||||
|  | ||||
|     ##################################################################################### | ||||
|     # CUDA 11.8 prune visual tools | ||||
|     ##################################################################################### | ||||
|     export CUDA_BASE="/usr/local/cuda-11.8/" | ||||
|     rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2022.3.0 $CUDA_BASE/nsight-systems-2022.4.2/ | ||||
| } | ||||
|  | ||||
| function prune_124 { | ||||
|   echo "Pruning CUDA 12.4" | ||||
|   ##################################################################################### | ||||
|   # CUDA 12.4 prune static libs | ||||
|   ##################################################################################### | ||||
|   export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune" | ||||
|   export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64" | ||||
|  | ||||
|   export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" | ||||
|   export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90" | ||||
|  | ||||
|   if [[ -n "$OVERRIDE_GENCODE" ]]; then | ||||
|       export GENCODE=$OVERRIDE_GENCODE | ||||
|   fi | ||||
|   if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then | ||||
|       export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN | ||||
|   fi | ||||
|  | ||||
|   # all CUDA libs except CuDNN and CuBLAS | ||||
|   ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \ | ||||
|       | xargs -I {} bash -c \ | ||||
|                 "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}" | ||||
|  | ||||
|   # prune CuDNN and CuBLAS | ||||
|   $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a | ||||
|   $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a | ||||
|  | ||||
|   ##################################################################################### | ||||
|   # CUDA 12.4 prune visual tools | ||||
|   ##################################################################################### | ||||
|   export CUDA_BASE="/usr/local/cuda-12.4/" | ||||
|   rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/ | ||||
| } | ||||
|  | ||||
| function prune_126 { | ||||
| @ -105,17 +240,35 @@ function prune_126 { | ||||
| } | ||||
|  | ||||
| function install_128 { | ||||
|   CUDNN_VERSION=9.8.0.87 | ||||
|   echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1" | ||||
|   # install CUDA 12.8.1 in the same container | ||||
|   install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux | ||||
|   CUDNN_VERSION=9.7.1.26 | ||||
|   echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3" | ||||
|   rm -rf /usr/local/cuda-12.8 /usr/local/cuda | ||||
|   # install CUDA 12.8.0 in the same container | ||||
|   wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux.run | ||||
|   chmod +x cuda_12.8.0_570.86.10_linux.run | ||||
|   ./cuda_12.8.0_570.86.10_linux.run --toolkit --silent | ||||
|   rm -f cuda_12.8.0_570.86.10_linux.run | ||||
|   rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.8 /usr/local/cuda | ||||
|  | ||||
|   # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement | ||||
|   install_cudnn 12 $CUDNN_VERSION | ||||
|   mkdir tmp_cudnn && cd tmp_cudnn | ||||
|   wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz | ||||
|   tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz | ||||
|   cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/ | ||||
|   cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/ | ||||
|   cd .. | ||||
|   rm -rf tmp_cudnn | ||||
|  | ||||
|   CUDA_VERSION=12.8 bash install_nccl.sh | ||||
|   # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses | ||||
|   # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build | ||||
|   git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git | ||||
|   cd nccl && make -j src.build | ||||
|   cp -a build/include/* /usr/local/cuda/include/ | ||||
|   cp -a build/lib/* /usr/local/cuda/lib64/ | ||||
|   cd .. | ||||
|   rm -rf nccl | ||||
|  | ||||
|   CUDA_VERSION=12.8 bash install_cusparselt.sh | ||||
|   install_cusparselt_063 | ||||
|  | ||||
|   ldconfig | ||||
| } | ||||
| @ -124,11 +277,13 @@ function install_128 { | ||||
| while test $# -gt 0 | ||||
| do | ||||
|     case "$1" in | ||||
|     12.6|12.6.*) install_126; prune_126 | ||||
|     11.8) install_118; prune_118 | ||||
|         ;; | ||||
|     12.8|12.8.*) install_128; | ||||
|     12.4) install_124; prune_124 | ||||
|         ;; | ||||
|     12.9|12.9.*) install_129; | ||||
|     12.6) install_126; prune_126 | ||||
|         ;; | ||||
|     12.8) install_128; | ||||
|         ;; | ||||
|     *) echo "bad argument $1"; exit 1 | ||||
|         ;; | ||||
|  | ||||
							
								
								
									
										63
									
								
								.ci/docker/common/install_cuda_aarch64.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										63
									
								
								.ci/docker/common/install_cuda_aarch64.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,63 @@ | ||||
| #!/bin/bash | ||||
| # Script used only in CD pipeline | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| NCCL_VERSION=v2.26.2-1 | ||||
| CUDNN_VERSION=9.8.0.87 | ||||
|  | ||||
| function install_cusparselt_063 { | ||||
|     # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html | ||||
|     mkdir tmp_cusparselt && pushd tmp_cusparselt | ||||
|     wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.6.3.2-archive.tar.xz | ||||
|     tar xf libcusparse_lt-linux-sbsa-0.6.3.2-archive.tar.xz | ||||
|     cp -a libcusparse_lt-linux-sbsa-0.6.3.2-archive/include/* /usr/local/cuda/include/ | ||||
|     cp -a libcusparse_lt-linux-sbsa-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/ | ||||
|     popd | ||||
|     rm -rf tmp_cusparselt | ||||
| } | ||||
|  | ||||
| function install_128 { | ||||
|   echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3" | ||||
|   rm -rf /usr/local/cuda-12.8 /usr/local/cuda | ||||
|   # install CUDA 12.8.0 in the same container | ||||
|   wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux_sbsa.run | ||||
|   chmod +x cuda_12.8.0_570.86.10_linux_sbsa.run | ||||
|   ./cuda_12.8.0_570.86.10_linux_sbsa.run --toolkit --silent | ||||
|   rm -f cuda_12.8.0_570.86.10_linux_sbsa.run | ||||
|   rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.8 /usr/local/cuda | ||||
|  | ||||
|   # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement | ||||
|   mkdir tmp_cudnn && cd tmp_cudnn | ||||
|   wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz | ||||
|   tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz | ||||
|   cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/ | ||||
|   cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/ | ||||
|   cd .. | ||||
|   rm -rf tmp_cudnn | ||||
|  | ||||
|   # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses | ||||
|   # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build | ||||
|   git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git | ||||
|   cd nccl && make -j src.build | ||||
|   cp -a build/include/* /usr/local/cuda/include/ | ||||
|   cp -a build/lib/* /usr/local/cuda/lib64/ | ||||
|   cd .. | ||||
|   rm -rf nccl | ||||
|  | ||||
|   install_cusparselt_063 | ||||
|  | ||||
|   ldconfig | ||||
| } | ||||
|  | ||||
| # idiomatic parameter and option handling in sh | ||||
| while test $# -gt 0 | ||||
| do | ||||
|     case "$1" in | ||||
|     12.8) install_128; | ||||
|         ;; | ||||
|     *) echo "bad argument $1"; exit 1 | ||||
|         ;; | ||||
|     esac | ||||
|     shift | ||||
| done | ||||
| @ -4,10 +4,12 @@ if [[ -n "${CUDNN_VERSION}" ]]; then | ||||
|     # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement | ||||
|     mkdir tmp_cudnn | ||||
|     pushd tmp_cudnn | ||||
|     if [[ ${CUDA_VERSION:0:4} == "12.9" || ${CUDA_VERSION:0:4} == "12.8" ]]; then | ||||
|         CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive" | ||||
|     if [[ ${CUDA_VERSION:0:4} == "12.8" ]]; then | ||||
|         CUDNN_NAME="cudnn-linux-x86_64-9.7.1.26_cuda12-archive" | ||||
|     elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then | ||||
|         CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive" | ||||
|         CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive" | ||||
|     elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then | ||||
|         CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive" | ||||
|     elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then | ||||
|         CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive" | ||||
|     else | ||||
|  | ||||
| @ -5,14 +5,25 @@ set -ex | ||||
| # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html | ||||
| mkdir tmp_cusparselt && cd tmp_cusparselt | ||||
|  | ||||
| if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then | ||||
| if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-8]$ ]]; then | ||||
|     arch_path='sbsa' | ||||
|     export TARGETARCH=${TARGETARCH:-$(uname -m)} | ||||
|     if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then | ||||
|         arch_path='x86_64' | ||||
|     fi | ||||
|     CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.7.1.0-archive" | ||||
|     CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.6.3.2-archive" | ||||
|     curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz | ||||
| elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then | ||||
|     arch_path='sbsa' | ||||
|     export TARGETARCH=${TARGETARCH:-$(uname -m)} | ||||
|     if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then | ||||
|         arch_path='x86_64' | ||||
|     fi | ||||
|     CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.6.2.3-archive" | ||||
|     curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz | ||||
| elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then | ||||
|     CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive" | ||||
|     curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz | ||||
| else | ||||
|     echo "Not sure which libcusparselt version to install for this ${CUDA_VERSION}" | ||||
| fi | ||||
|  | ||||
							
								
								
									
										38
									
								
								.ci/docker/common/install_db.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										38
									
								
								.ci/docker/common/install_db.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,38 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| install_ubuntu() { | ||||
|   apt-get update | ||||
|  | ||||
|   # Cleanup | ||||
|   apt-get autoclean && apt-get clean | ||||
|   rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* | ||||
| } | ||||
|  | ||||
| install_centos() { | ||||
|   # Need EPEL for many packages we depend on. | ||||
|   # See http://fedoraproject.org/wiki/EPEL | ||||
|   yum --enablerepo=extras install -y epel-release | ||||
|  | ||||
|   # Cleanup | ||||
|   yum clean all | ||||
|   rm -rf /var/cache/yum | ||||
|   rm -rf /var/lib/yum/yumdb | ||||
|   rm -rf /var/lib/yum/history | ||||
| } | ||||
|  | ||||
| # Install base packages depending on the base OS | ||||
| ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') | ||||
| case "$ID" in | ||||
|   ubuntu) | ||||
|     install_ubuntu | ||||
|     ;; | ||||
|   centos) | ||||
|     install_centos | ||||
|     ;; | ||||
|   *) | ||||
|     echo "Unable to determine OS..." | ||||
|     exit 1 | ||||
|     ;; | ||||
| esac | ||||
| @ -13,7 +13,7 @@ clone_executorch() { | ||||
|   # and fetch the target commit | ||||
|   pushd executorch | ||||
|   git checkout "${EXECUTORCH_PINNED_COMMIT}" | ||||
|   git submodule update --init --recursive | ||||
|   git submodule update --init | ||||
|   popd | ||||
|  | ||||
|   chown -R jenkins executorch | ||||
| @ -50,7 +50,8 @@ setup_executorch() { | ||||
|   pushd executorch | ||||
|  | ||||
|   export PYTHON_EXECUTABLE=python | ||||
|   export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" | ||||
|   export EXECUTORCH_BUILD_PYBIND=ON | ||||
|   export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" | ||||
|  | ||||
|   as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true | ||||
|   popd | ||||
|  | ||||
| @ -17,7 +17,7 @@ if [ -n "${UBUNTU_VERSION}" ];then | ||||
|                   libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev | ||||
| fi | ||||
|  | ||||
| pip_install numpy scipy imageio cmake ninja | ||||
| conda_install numpy scipy imageio cmake ninja | ||||
|  | ||||
| git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git | ||||
| cmake -DCMAKE_BUILD_TYPE=Release \ | ||||
| @ -35,9 +35,7 @@ git clone https://github.com/halide/Halide.git | ||||
| pushd Halide | ||||
| git checkout ${COMMIT} && git submodule update --init --recursive | ||||
| pip_install -r requirements.txt | ||||
| # NOTE: pybind has a requirement for cmake > 3.5 so set the minimum cmake version here with a flag | ||||
| #       Context: https://github.com/pytorch/pytorch/issues/150420 | ||||
| cmake -G Ninja -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release -S . -B build | ||||
| cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build | ||||
| cmake --build build | ||||
| test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3 | ||||
| cmake --install build --prefix ${CONDA_PREFIX} | ||||
|  | ||||
| @ -14,9 +14,16 @@ function install_timm() { | ||||
|   local commit | ||||
|   commit=$(get_pinned_commit timm) | ||||
|  | ||||
|   # TODO (huydhn): There is no torchvision release on 3.13 when I write this, so | ||||
|   # I'm using nightly here instead. We just need to package to be able to install | ||||
|   # TIMM. Removing this once vision has a release on 3.13 | ||||
|   if [[ "${ANACONDA_PYTHON_VERSION}" == "3.13" ]]; then | ||||
|     pip_install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124 | ||||
|   fi | ||||
|  | ||||
|   pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}" | ||||
|   # Clean up | ||||
|   conda_run pip uninstall -y torch torchvision triton | ||||
|   conda_run pip uninstall -y cmake torch torchvision triton | ||||
| } | ||||
|  | ||||
| # Pango is needed for weasyprint which is needed for doctr | ||||
|  | ||||
| @ -2,6 +2,8 @@ | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" | ||||
|  | ||||
| if [ -n "${UBUNTU_VERSION}" ]; then | ||||
|   apt update | ||||
|   apt-get install -y clang doxygen git graphviz nodejs npm libtinfo5 | ||||
| @ -13,8 +15,8 @@ chown -R jenkins pytorch | ||||
|  | ||||
| pushd pytorch | ||||
| # Install all linter dependencies | ||||
| pip install -r requirements.txt | ||||
| lintrunner init | ||||
| pip_install -r requirements.txt | ||||
| conda_run lintrunner init | ||||
|  | ||||
| # Cache .lintbin directory as part of the Docker image | ||||
| cp -r .lintbin /tmp | ||||
|  | ||||
| @ -1,23 +1,26 @@ | ||||
| #!/usr/bin/env bash | ||||
| # Script that installs magma from tarball inside conda environment. | ||||
| # It replaces anaconda magma-cuda package which is no longer published. | ||||
| # Execute it inside active conda environment. | ||||
| # See issue: https://github.com/pytorch/pytorch/issues/138506 | ||||
| # Script that replaces the magma install from a conda package | ||||
|  | ||||
| set -eou pipefail | ||||
|  | ||||
| cuda_version_nodot=${1/./} | ||||
| anaconda_dir=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} | ||||
| function do_install() { | ||||
|     cuda_version_nodot=${1/./} | ||||
|     anaconda_python_version=$2 | ||||
|  | ||||
| MAGMA_VERSION="2.6.1" | ||||
| magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" | ||||
| ( | ||||
|     set -x | ||||
|     tmp_dir=$(mktemp -d) | ||||
|     pushd ${tmp_dir} | ||||
|     curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive} | ||||
|     tar -xvf "${magma_archive}" | ||||
|     mv include/* "${anaconda_dir}/include/" | ||||
|     mv lib/* "${anaconda_dir}/lib" | ||||
|     popd | ||||
| ) | ||||
|     MAGMA_VERSION="2.6.1" | ||||
|     magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" | ||||
|  | ||||
|     anaconda_dir="/opt/conda/envs/py_${anaconda_python_version}" | ||||
|     ( | ||||
|         set -x | ||||
|         tmp_dir=$(mktemp -d) | ||||
|         pushd ${tmp_dir} | ||||
|         curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive} | ||||
|         tar -xvf "${magma_archive}" | ||||
|         mv include/* "${anaconda_dir}/include/" | ||||
|         mv lib/* "${anaconda_dir}/lib" | ||||
|         popd | ||||
|     ) | ||||
| } | ||||
|  | ||||
| do_install $1 $2 | ||||
|  | ||||
| @ -1,26 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| NCCL_VERSION="" | ||||
| if [[ ${CUDA_VERSION:0:2} == "11" ]]; then | ||||
|   NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt) | ||||
| elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then | ||||
|   NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt) | ||||
| else | ||||
|   echo "Unexpected CUDA_VERSION ${CUDA_VERSION}" | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| if [[ -n "${NCCL_VERSION}" ]]; then | ||||
|   # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses | ||||
|   # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build | ||||
|   git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git | ||||
|   pushd nccl | ||||
|   make -j src.build | ||||
|   cp -a build/include/* /usr/local/cuda/include/ | ||||
|   cp -a build/lib/* /usr/local/cuda/lib64/ | ||||
|   popd | ||||
|   rm -rf nccl | ||||
|   ldconfig | ||||
| fi | ||||
| @ -8,6 +8,16 @@ retry () { | ||||
|     "$@" || (sleep 10 && "$@") || (sleep 20 && "$@") || (sleep 40 && "$@") | ||||
| } | ||||
|  | ||||
| # A bunch of custom pip dependencies for ONNX | ||||
| pip_install \ | ||||
|   beartype==0.15.0 \ | ||||
|   filelock==3.9.0 \ | ||||
|   flatbuffers==2.0 \ | ||||
|   mock==5.0.1 \ | ||||
|   ninja==1.10.2 \ | ||||
|   networkx==2.5 \ | ||||
|   numpy==1.24.2 | ||||
|  | ||||
| # ONNXRuntime should be installed before installing | ||||
| # onnx-weekly. Otherwise, onnx-weekly could be | ||||
| # overwritten by onnx. | ||||
| @ -19,8 +29,12 @@ pip_install \ | ||||
|   transformers==4.36.2 | ||||
|  | ||||
| pip_install coloredlogs packaging | ||||
|  | ||||
| pip_install onnxruntime==1.18.1 | ||||
| pip_install onnxscript==0.3.0 | ||||
| pip_install onnx==1.17.0 | ||||
| pip_install onnxscript==0.2.2 --no-deps | ||||
| # required by onnxscript | ||||
| pip_install ml_dtypes | ||||
|  | ||||
| # Cache the transformers model to be used later by ONNX tests. We need to run the transformers | ||||
| # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/ | ||||
|  | ||||
| @ -4,7 +4,8 @@ | ||||
| set -ex | ||||
|  | ||||
| cd / | ||||
| git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.29}" --depth 1 --shallow-submodules | ||||
| git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.29 --depth 1 --shallow-submodules | ||||
|  | ||||
|  | ||||
| OPENBLAS_BUILD_FLAGS=" | ||||
| NUM_THREADS=128 | ||||
|  | ||||
							
								
								
									
										19
									
								
								.ci/docker/common/install_protobuf.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										19
									
								
								.ci/docker/common/install_protobuf.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,19 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| pb_dir="/usr/temp_pb_install_dir" | ||||
| mkdir -p $pb_dir | ||||
|  | ||||
| # On the nvidia/cuda:9-cudnn7-devel-centos7 image we need this symlink or | ||||
| # else it will fail with | ||||
| #   g++: error: ./../lib64/crti.o: No such file or directory | ||||
| ln -s /usr/lib64 "$pb_dir/lib64" | ||||
|  | ||||
| curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3 | ||||
|  | ||||
| tar -xvz --no-same-owner -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz | ||||
| NPROC=$[$(nproc) - 2] | ||||
| pushd "$pb_dir" && ./configure && make -j${NPROC} && make -j${NPROC} check && sudo make -j${NRPOC} install && sudo ldconfig | ||||
| popd | ||||
| rm -rf $pb_dir | ||||
| @ -1,15 +0,0 @@ | ||||
| #!/bin/bash | ||||
| set -ex | ||||
|  | ||||
| apt-get update | ||||
| # Use deadsnakes in case we need an older python version | ||||
| sudo add-apt-repository ppa:deadsnakes/ppa | ||||
| apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python3-pip python${PYTHON_VERSION}-venv | ||||
|  | ||||
| # Use a venv because uv and some other package managers don't support --user install | ||||
| ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python | ||||
| python -m venv /var/lib/jenkins/ci_env | ||||
| source /var/lib/jenkins/ci_env/bin/activate | ||||
|  | ||||
| python -mpip install --upgrade pip | ||||
| python -mpip install -r /opt/requirements-ci.txt | ||||
| @ -8,6 +8,10 @@ ver() { | ||||
|  | ||||
| install_ubuntu() { | ||||
|     apt-get update | ||||
|     if [[ $UBUNTU_VERSION == 18.04 ]]; then | ||||
|       # gpg-agent is not available by default on 18.04 | ||||
|       apt-get install -y --no-install-recommends gpg-agent | ||||
|     fi | ||||
|     if [[ $UBUNTU_VERSION == 20.04 ]]; then | ||||
|       # gpg-agent is not available by default on 20.04 | ||||
|       apt-get install -y --no-install-recommends gpg-agent | ||||
| @ -19,13 +23,6 @@ install_ubuntu() { | ||||
|     apt-get install -y libc++1 | ||||
|     apt-get install -y libc++abi1 | ||||
|  | ||||
|     # Make sure rocm packages from repo.radeon.com have highest priority | ||||
|     cat << EOF > /etc/apt/preferences.d/rocm-pin-600 | ||||
| Package: * | ||||
| Pin: release o=repo.radeon.com | ||||
| Pin-Priority: 600 | ||||
| EOF | ||||
|  | ||||
|     # Add amdgpu repository | ||||
|     UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'` | ||||
|     echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list | ||||
| @ -66,25 +63,17 @@ EOF | ||||
|     done | ||||
|  | ||||
|     # ROCm 6.3 had a regression where initializing static code objects had significant overhead | ||||
|     # ROCm 6.4 did not yet fix the regression, also HIP branch names are different | ||||
|     if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]] || [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then | ||||
|         if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then | ||||
|             HIP_BRANCH=rocm-6.3.x | ||||
|             VER_STR=6.3 | ||||
|         elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then | ||||
|             HIP_BRANCH=release/rocm-rel-6.4 | ||||
|             VER_STR=6.4 | ||||
|         fi | ||||
|     if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then | ||||
|         # clr build needs CppHeaderParser but can only find it using conda's python | ||||
|         /opt/conda/bin/python -m pip install CppHeaderParser | ||||
|         git clone https://github.com/ROCm/HIP -b $HIP_BRANCH | ||||
|         git clone https://github.com/ROCm/HIP -b rocm-6.3.x | ||||
|         HIP_COMMON_DIR=$(readlink -f HIP) | ||||
|         git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}-statco-hotfix | ||||
|         git clone https://github.com/jeffdaily/clr -b release/rocm-rel-6.3-statco-hotfix | ||||
|         mkdir -p clr/build | ||||
|         pushd clr/build | ||||
|         cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR | ||||
|         make -j | ||||
|         cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.* | ||||
|         cp hipamd/lib/libamdhip64.so.6.3.* /opt/rocm/lib/libamdhip64.so.6.3.* | ||||
|         popd | ||||
|         rm -rf HIP clr | ||||
|     fi | ||||
|  | ||||
| @ -1,32 +1,50 @@ | ||||
| #!/usr/bin/env bash | ||||
| # Script used only in CD pipeline | ||||
| #!/bin/bash | ||||
| # Script used in CI and CD pipeline | ||||
|  | ||||
| set -eou pipefail | ||||
| set -ex | ||||
|  | ||||
| function do_install() { | ||||
|     rocm_version=$1 | ||||
|     rocm_version_nodot=${1//./} | ||||
| # Magma build scripts need `python` | ||||
| ln -sf /usr/bin/python3 /usr/bin/python | ||||
|  | ||||
|     # Version 2.7.2 + ROCm related updates | ||||
|     MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 | ||||
|     magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2" | ||||
| ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') | ||||
| case "$ID" in | ||||
|   almalinux) | ||||
|     yum install -y gcc-gfortran | ||||
|     ;; | ||||
|   *) | ||||
|     echo "No preinstalls to build magma..." | ||||
|     ;; | ||||
| esac | ||||
|  | ||||
|     rocm_dir="/opt/rocm" | ||||
|     ( | ||||
|         set -x | ||||
|         tmp_dir=$(mktemp -d) | ||||
|         pushd ${tmp_dir} | ||||
|         curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive} | ||||
|         if tar -xvf "${magma_archive}" | ||||
|         then | ||||
|             mkdir -p "${rocm_dir}/magma" | ||||
|             mv include "${rocm_dir}/magma/include" | ||||
|             mv lib "${rocm_dir}/magma/lib" | ||||
|         else | ||||
|             echo "${magma_archive} not found, skipping magma install" | ||||
|         fi | ||||
|         popd | ||||
|     ) | ||||
| } | ||||
| MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION} | ||||
|  | ||||
| do_install $1 | ||||
| # "install" hipMAGMA into /opt/rocm/magma by copying after build | ||||
| git clone https://bitbucket.org/icl/magma.git | ||||
| pushd magma | ||||
|  | ||||
| # Version 2.7.2 + ROCm related updates | ||||
| git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6 | ||||
|  | ||||
| cp make.inc-examples/make.inc.hip-gcc-mkl make.inc | ||||
| echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc | ||||
| if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then | ||||
|     echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc | ||||
| fi | ||||
| echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc | ||||
| echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc | ||||
| export PATH="${PATH}:/opt/rocm/bin" | ||||
| if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then | ||||
|   amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'` | ||||
| else | ||||
|   amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs` | ||||
| fi | ||||
| for arch in $amdgpu_targets; do | ||||
|   echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc | ||||
| done | ||||
| # hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition | ||||
| sed -i 's/^FOPENMP/#FOPENMP/g' make.inc | ||||
| make -f make.gen.hipMAGMA -j $(nproc) | ||||
| LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}" | ||||
| make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}" | ||||
| popd | ||||
| mv magma /opt/rocm | ||||
|  | ||||
							
								
								
									
										24
									
								
								.ci/docker/common/install_swiftshader.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										24
									
								
								.ci/docker/common/install_swiftshader.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,24 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| [ -n "${SWIFTSHADER}" ] | ||||
|  | ||||
| retry () { | ||||
|     $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) | ||||
| } | ||||
|  | ||||
| _https_amazon_aws=https://ossci-android.s3.amazonaws.com | ||||
|  | ||||
| # SwiftShader | ||||
| _swiftshader_dir=/var/lib/jenkins/swiftshader | ||||
| _swiftshader_file_targz=swiftshader-abe07b943-prebuilt.tar.gz | ||||
| mkdir -p $_swiftshader_dir | ||||
| _tmp_swiftshader_targz="/tmp/${_swiftshader_file_targz}" | ||||
|  | ||||
| curl --silent --show-error --location --fail --retry 3 \ | ||||
|   --output "${_tmp_swiftshader_targz}" "$_https_amazon_aws/${_swiftshader_file_targz}" | ||||
|  | ||||
| tar -C "${_swiftshader_dir}" -xzf "${_tmp_swiftshader_targz}" | ||||
|  | ||||
| export VK_ICD_FILENAMES="${_swiftshader_dir}/build/Linux/vk_swiftshader_icd.json" | ||||
| @ -2,16 +2,14 @@ | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| mkdir -p /opt/triton | ||||
| if [ -z "${TRITON}" ] && [ -z "${TRITON_CPU}" ]; then | ||||
|   echo "TRITON and TRITON_CPU are not set. Exiting..." | ||||
|   exit 0 | ||||
| fi | ||||
|  | ||||
| source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" | ||||
|  | ||||
| get_pip_version() { | ||||
|   conda_run pip list | grep -w $* | head -n 1 | awk '{print $2}' | ||||
| get_conda_version() { | ||||
|   as_jenkins conda list -n py_$ANACONDA_PYTHON_VERSION | grep -w $* | head -n 1 | awk '{print $2}' | ||||
| } | ||||
|  | ||||
| conda_reinstall() { | ||||
|   as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $* | ||||
| } | ||||
|  | ||||
| if [ -n "${XPU_VERSION}" ]; then | ||||
| @ -33,9 +31,11 @@ if [ -n "${UBUNTU_VERSION}" ];then | ||||
|     apt-get install -y gpg-agent | ||||
| fi | ||||
|  | ||||
| # Keep the current cmake and numpy version here, so we can reinstall them later | ||||
| CMAKE_VERSION=$(get_pip_version cmake) | ||||
| NUMPY_VERSION=$(get_pip_version numpy) | ||||
| if [ -n "${CONDA_CMAKE}" ]; then | ||||
|   # Keep the current cmake and numpy version here, so we can reinstall them later | ||||
|   CMAKE_VERSION=$(get_conda_version cmake) | ||||
|   NUMPY_VERSION=$(get_conda_version numpy) | ||||
| fi | ||||
|  | ||||
| if [ -z "${MAX_JOBS}" ]; then | ||||
|     export MAX_JOBS=$(nproc) | ||||
| @ -51,13 +51,7 @@ as_jenkins git clone --recursive ${TRITON_REPO} triton | ||||
| cd triton | ||||
| as_jenkins git checkout ${TRITON_PINNED_COMMIT} | ||||
| as_jenkins git submodule update --init --recursive | ||||
|  | ||||
| # Old versions of python have setup.py in ./python; newer versions have it in ./ | ||||
| if [ ! -f setup.py ]; then | ||||
|   cd python | ||||
| fi | ||||
|  | ||||
| pip_install pybind11==2.13.6 | ||||
| cd python | ||||
|  | ||||
| # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527 | ||||
| as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py | ||||
| @ -66,38 +60,28 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" | ||||
|   # Triton needs at least gcc-9 to build | ||||
|   apt-get install -y g++-9 | ||||
|  | ||||
|   CXX=g++-9 conda_run python setup.py bdist_wheel | ||||
|   CXX=g++-9 pip_install . | ||||
| elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then | ||||
|   # Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain | ||||
|   add-apt-repository -y ppa:ubuntu-toolchain-r/test | ||||
|   apt-get install -y g++-9 | ||||
|  | ||||
|   CXX=g++-9 conda_run python setup.py bdist_wheel | ||||
|   CXX=g++-9 pip_install . | ||||
| else | ||||
|   conda_run python setup.py bdist_wheel | ||||
|   pip_install . | ||||
| fi | ||||
|  | ||||
| # Copy the wheel to /opt for multi stage docker builds | ||||
| cp dist/*.whl /opt/triton | ||||
| # Install the wheel for docker builds that don't use multi stage | ||||
| pip_install dist/*.whl | ||||
|  | ||||
| # TODO: This is to make sure that the same cmake and numpy version from install conda | ||||
| # script is used. Without this step, the newer cmake version (3.25.2) downloaded by | ||||
| # triton build step via pip will fail to detect conda MKL. Once that issue is fixed, | ||||
| # this can be removed. | ||||
| # | ||||
| # The correct numpy version also needs to be set here because conda claims that it | ||||
| # causes inconsistent environment.  Without this, conda will attempt to install the | ||||
| # latest numpy version, which fails ASAN tests with the following import error: Numba | ||||
| # needs NumPy 1.20 or less. | ||||
| # Note that we install numpy with pip as conda might not have the version we want | ||||
| if [ -n "${CMAKE_VERSION}" ]; then | ||||
|   pip_install "cmake==${CMAKE_VERSION}" | ||||
| fi | ||||
| if [ -n "${NUMPY_VERSION}" ]; then | ||||
|   pip_install "numpy==${NUMPY_VERSION}" | ||||
| fi | ||||
| if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then | ||||
|   pip_install helion | ||||
| if [ -n "${CONDA_CMAKE}" ]; then | ||||
|   # TODO: This is to make sure that the same cmake and numpy version from install conda | ||||
|   # script is used. Without this step, the newer cmake version (3.25.2) downloaded by | ||||
|   # triton build step via pip will fail to detect conda MKL. Once that issue is fixed, | ||||
|   # this can be removed. | ||||
|   # | ||||
|   # The correct numpy version also needs to be set here because conda claims that it | ||||
|   # causes inconsistent environment.  Without this, conda will attempt to install the | ||||
|   # latest numpy version, which fails ASAN tests with the following import error: Numba | ||||
|   # needs NumPy 1.20 or less. | ||||
|   conda_reinstall cmake="${CMAKE_VERSION}" | ||||
|   # Note that we install numpy with pip as conda might not have the version we want | ||||
|   pip_install --force-reinstall numpy=="${NUMPY_VERSION}" | ||||
| fi | ||||
|  | ||||
							
								
								
									
										24
									
								
								.ci/docker/common/install_vulkan_sdk.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										24
									
								
								.ci/docker/common/install_vulkan_sdk.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,24 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| [ -n "${VULKAN_SDK_VERSION}" ] | ||||
|  | ||||
| retry () { | ||||
|     $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) | ||||
| } | ||||
|  | ||||
| _vulkansdk_dir=/var/lib/jenkins/vulkansdk | ||||
| _tmp_vulkansdk_targz=/tmp/vulkansdk.tar.gz | ||||
|  | ||||
| curl \ | ||||
|   --silent \ | ||||
|   --show-error \ | ||||
|   --location \ | ||||
|   --fail \ | ||||
|   --retry 3 \ | ||||
|   --output "${_tmp_vulkansdk_targz}" "https://ossci-android.s3.amazonaws.com/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.gz" | ||||
|  | ||||
| mkdir -p "${_vulkansdk_dir}" | ||||
| tar -C "${_vulkansdk_dir}" -xzf "${_tmp_vulkansdk_targz}" --strip-components 1 | ||||
| rm -rf "${_tmp_vulkansdk_targz}" | ||||
| @ -26,7 +26,7 @@ function install_ubuntu() { | ||||
|     wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ | ||||
|         | gpg --dearmor > /usr/share/keyrings/oneapi-archive-keyring.gpg.gpg | ||||
|     echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg.gpg] \ | ||||
|         https://apt.repos.intel.com/oneapi all main" \ | ||||
|         https://apt.repos.intel.com/${XPU_REPO_NAME} all main" \ | ||||
|         | tee /etc/apt/sources.list.d/oneAPI.list | ||||
|  | ||||
|     # Update the packages list and repository index | ||||
| @ -47,6 +47,9 @@ function install_ubuntu() { | ||||
|     # Development Packages | ||||
|     apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev | ||||
|     # Install Intel Support Packages | ||||
|     if [[ "$XPU_VERSION" == "2025.0" ]]; then | ||||
|         XPU_PACKAGES="${XPU_PACKAGES} intel-oneapi-dnnl=2025.0.1-6" | ||||
|     fi | ||||
|     apt-get install -y ${XPU_PACKAGES} | ||||
|  | ||||
|     # Cleanup | ||||
| @ -74,7 +77,7 @@ function install_rhel() { | ||||
|     tee > /etc/yum.repos.d/oneAPI.repo << EOF | ||||
| [oneAPI] | ||||
| name=Intel for Pytorch GPU dev repository | ||||
| baseurl=https://yum.repos.intel.com/oneapi | ||||
| baseurl=https://yum.repos.intel.com/${XPU_REPO_NAME} | ||||
| enabled=1 | ||||
| gpgcheck=1 | ||||
| repo_gpgcheck=1 | ||||
| @ -82,6 +85,9 @@ gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS. | ||||
| EOF | ||||
|  | ||||
|     # Install Intel Support Packages | ||||
|     if [[ "$XPU_VERSION" == "2025.0" ]]; then | ||||
|         XPU_PACKAGES="${XPU_PACKAGES} intel-oneapi-dnnl-2025.0.1-6" | ||||
|     fi | ||||
|     yum install -y ${XPU_PACKAGES} | ||||
|     # The xpu-smi packages | ||||
|     dnf install -y xpu-smi | ||||
| @ -118,7 +124,7 @@ function install_sles() { | ||||
|         https://repositories.intel.com/gpu/sles/${VERSION_SP}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_SP}.repo | ||||
|     rpm --import https://repositories.intel.com/gpu/intel-graphics.key | ||||
|     # To add the online network network package repository for the Intel Support Packages | ||||
|     zypper addrepo https://yum.repos.intel.com/oneapi oneAPI | ||||
|     zypper addrepo https://yum.repos.intel.com/${XPU_REPO_NAME} oneAPI | ||||
|     rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | ||||
|  | ||||
|     # The xpu-smi packages | ||||
| @ -141,10 +147,10 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then | ||||
|     XPU_DRIVER_VERSION="" | ||||
| fi | ||||
|  | ||||
| # Default use Intel® oneAPI Deep Learning Essentials 2025.0 | ||||
| if [[ "$XPU_VERSION" == "2025.1" ]]; then | ||||
|     XPU_PACKAGES="intel-deep-learning-essentials-2025.1" | ||||
| else | ||||
| XPU_REPO_NAME="intel-for-pytorch-gpu-dev" | ||||
| XPU_PACKAGES="intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9" | ||||
| if [[ "$XPU_VERSION" == "2025.0" ]]; then | ||||
|     XPU_REPO_NAME="oneapi" | ||||
|     XPU_PACKAGES="intel-deep-learning-essentials-2025.0" | ||||
| fi | ||||
|  | ||||
|  | ||||
| @ -49,11 +49,18 @@ RUN bash ./install_mkl.sh && rm install_mkl.sh | ||||
| FROM cpu as cuda | ||||
| ADD ./common/install_cuda.sh install_cuda.sh | ||||
| ADD ./common/install_magma.sh install_magma.sh | ||||
| COPY ./common/install_nccl.sh install_nccl.sh | ||||
| COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ | ||||
| COPY ./common/install_cusparselt.sh install_cusparselt.sh | ||||
| ENV CUDA_HOME /usr/local/cuda | ||||
|  | ||||
| FROM cuda as cuda11.8 | ||||
| RUN bash ./install_cuda.sh 11.8 | ||||
| RUN bash ./install_magma.sh 11.8 | ||||
| RUN ln -sf /usr/local/cuda-11.8 /usr/local/cuda | ||||
|  | ||||
| FROM cuda as cuda12.4 | ||||
| RUN bash ./install_cuda.sh 12.4 | ||||
| RUN bash ./install_magma.sh 12.4 | ||||
| RUN ln -sf /usr/local/cuda-12.4 /usr/local/cuda | ||||
|  | ||||
| FROM cuda as cuda12.6 | ||||
| RUN bash ./install_cuda.sh 12.6 | ||||
| RUN bash ./install_magma.sh 12.6 | ||||
| @ -64,13 +71,7 @@ RUN bash ./install_cuda.sh 12.8 | ||||
| RUN bash ./install_magma.sh 12.8 | ||||
| RUN ln -sf /usr/local/cuda-12.8 /usr/local/cuda | ||||
|  | ||||
| FROM cuda as cuda12.9 | ||||
| RUN bash ./install_cuda.sh 12.9 | ||||
| RUN bash ./install_magma.sh 12.9 | ||||
| RUN ln -sf /usr/local/cuda-12.9 /usr/local/cuda | ||||
|  | ||||
| FROM cpu as rocm | ||||
| ARG ROCM_VERSION | ||||
| ARG PYTORCH_ROCM_ARCH | ||||
| ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} | ||||
| ENV MKLROOT /opt/intel | ||||
| @ -85,11 +86,11 @@ ADD ./common/install_rocm_magma.sh install_rocm_magma.sh | ||||
| # gfortran and python needed for building magma from source for ROCm | ||||
| RUN apt-get update -y && \ | ||||
|     apt-get install gfortran -y && \ | ||||
|     apt-get install python3 python-is-python3 -y && \ | ||||
|     apt-get install python -y && \ | ||||
|     apt-get clean | ||||
|  | ||||
| RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh | ||||
| RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh | ||||
| RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh | ||||
|  | ||||
| FROM ${BASE_TARGET} as final | ||||
| COPY --from=openssl            /opt/openssl           /opt/openssl | ||||
|  | ||||
| @ -1,63 +1,83 @@ | ||||
| #!/usr/bin/env bash | ||||
| # Script used only in CD pipeline | ||||
|  | ||||
| set -eoux pipefail | ||||
| set -eou pipefail | ||||
|  | ||||
| image="$1" | ||||
| shift | ||||
|  | ||||
| if [ -z "${image}" ]; then | ||||
|   echo "Usage: $0 IMAGENAME:ARCHTAG" | ||||
|   echo "Usage: $0 IMAGE" | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| DOCKER_IMAGE="pytorch/${image}" | ||||
|  | ||||
| TOPDIR=$(git rev-parse --show-toplevel) | ||||
|  | ||||
| GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu} | ||||
| GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-} | ||||
|  | ||||
| WITH_PUSH=${WITH_PUSH:-} | ||||
|  | ||||
| DOCKER=${DOCKER:-docker} | ||||
|  | ||||
| # Go from imagename:tag to tag | ||||
| DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}') | ||||
|  | ||||
| GPU_ARCH_VERSION="" | ||||
| if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then | ||||
|     # extract cuda version from image name.  e.g. manylinux2_28-builder:cuda12.8 returns 12.8 | ||||
|     GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}') | ||||
| elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then | ||||
|     # extract rocm version from image name.  e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4 | ||||
|     GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}') | ||||
| fi | ||||
|  | ||||
| case ${DOCKER_TAG_PREFIX} in | ||||
| case ${GPU_ARCH_TYPE} in | ||||
|     cpu) | ||||
|         BASE_TARGET=cpu | ||||
|         DOCKER_TAG=cpu | ||||
|         GPU_IMAGE=ubuntu:20.04 | ||||
|         DOCKER_GPU_BUILD_ARG="" | ||||
|         ;; | ||||
|     cuda*) | ||||
|     cuda) | ||||
|         BASE_TARGET=cuda${GPU_ARCH_VERSION} | ||||
|         DOCKER_TAG=cuda${GPU_ARCH_VERSION} | ||||
|         GPU_IMAGE=ubuntu:20.04 | ||||
|         DOCKER_GPU_BUILD_ARG="" | ||||
|         ;; | ||||
|     rocm*) | ||||
|     rocm) | ||||
|         BASE_TARGET=rocm | ||||
|         GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete | ||||
|         DOCKER_TAG=rocm${GPU_ARCH_VERSION} | ||||
|         GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete | ||||
|         PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" | ||||
|         DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}" | ||||
|         DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}" | ||||
|         ;; | ||||
|     *) | ||||
|         echo "ERROR: Unrecognized DOCKER_TAG_PREFIX: ${DOCKER_TAG_PREFIX}" | ||||
|         echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}" | ||||
|         exit 1 | ||||
|         ;; | ||||
| esac | ||||
|  | ||||
| tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') | ||||
|  | ||||
| DOCKER_BUILDKIT=1 ${DOCKER} build \ | ||||
|     --target final \ | ||||
|     ${DOCKER_GPU_BUILD_ARG} \ | ||||
|     --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ | ||||
|     --build-arg "BASE_TARGET=${BASE_TARGET}" \ | ||||
|     -t "${tmp_tag}" \ | ||||
|     $@ \ | ||||
|     -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \ | ||||
|     "${TOPDIR}/.ci/docker/" | ||||
| ( | ||||
|     set -x | ||||
|     DOCKER_BUILDKIT=1 ${DOCKER} build \ | ||||
|          --target final \ | ||||
|         ${DOCKER_GPU_BUILD_ARG} \ | ||||
|         --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ | ||||
|         --build-arg "BASE_TARGET=${BASE_TARGET}" \ | ||||
|         -t "${DOCKER_IMAGE}" \ | ||||
|         $@ \ | ||||
|         -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \ | ||||
|         "${TOPDIR}/.ci/docker/" | ||||
|  | ||||
| ) | ||||
|  | ||||
| GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)} | ||||
| GIT_BRANCH_NAME=${GITHUB_REF##*/} | ||||
| GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)} | ||||
| DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME} | ||||
| DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA} | ||||
|  | ||||
| if [[ "${WITH_PUSH}" == true ]]; then | ||||
|   ( | ||||
|     set -x | ||||
|     ${DOCKER} push "${DOCKER_IMAGE}" | ||||
|     if [[ -n ${GITHUB_REF} ]]; then | ||||
|         ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG} | ||||
|         ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG} | ||||
|         ${DOCKER} push "${DOCKER_IMAGE_BRANCH_TAG}" | ||||
|         ${DOCKER} push "${DOCKER_IMAGE_SHA_TAG}" | ||||
|     fi | ||||
|   ) | ||||
| fi | ||||
|  | ||||
| @ -18,31 +18,28 @@ COPY ./common/install_user.sh install_user.sh | ||||
| RUN bash ./install_user.sh && rm install_user.sh | ||||
|  | ||||
| # Install conda and other packages (e.g., numpy, pytest) | ||||
| ARG PYTHON_VERSION | ||||
| ARG PIP_CMAKE | ||||
| # Put venv into the env vars so users don't need to activate it | ||||
| ENV PATH /var/lib/jenkins/ci_env/bin:$PATH | ||||
| ENV VIRTUAL_ENV /var/lib/jenkins/ci_env | ||||
| COPY requirements-ci.txt /opt/requirements-ci.txt | ||||
| COPY ./common/install_python.sh install_python.sh | ||||
| RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt | ||||
| ARG ANACONDA_PYTHON_VERSION | ||||
| ARG CONDA_CMAKE | ||||
| ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION | ||||
| ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH | ||||
| COPY requirements-ci.txt /opt/conda/requirements-ci.txt | ||||
| COPY ./common/install_conda.sh install_conda.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ./common/install_magma_conda.sh install_magma_conda.sh | ||||
| RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt | ||||
|  | ||||
| # Install cuda and cudnn | ||||
| ARG CUDA_VERSION | ||||
| COPY ./common/install_cuda.sh install_cuda.sh | ||||
| COPY ./common/install_nccl.sh install_nccl.sh | ||||
| COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ | ||||
| COPY ./common/install_cusparselt.sh install_cusparselt.sh | ||||
| RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu* install_cusparselt.sh | ||||
| RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh | ||||
| ENV DESIRED_CUDA ${CUDA_VERSION} | ||||
| ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH | ||||
|  | ||||
| # Note that Docker build forbids copying file outside the build context | ||||
| COPY ./common/install_linter.sh install_linter.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| RUN bash ./install_linter.sh | ||||
| RUN rm install_linter.sh | ||||
|  | ||||
| RUN chown -R jenkins:jenkins /var/lib/jenkins/ci_env | ||||
| RUN rm install_linter.sh common_utils.sh | ||||
|  | ||||
| USER jenkins | ||||
| CMD ["bash"] | ||||
|  | ||||
| @ -15,17 +15,20 @@ COPY ./common/install_user.sh install_user.sh | ||||
| RUN bash ./install_user.sh && rm install_user.sh | ||||
|  | ||||
| # Install conda and other packages (e.g., numpy, pytest) | ||||
| ARG PYTHON_VERSION | ||||
| ENV PATH /var/lib/jenkins/ci_env/bin:$PATH | ||||
| ENV VIRTUAL_ENV /var/lib/jenkins/ci_env | ||||
| COPY requirements-ci.txt /opt/requirements-ci.txt | ||||
| COPY ./common/install_python.sh install_python.sh | ||||
| RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt | ||||
| ARG ANACONDA_PYTHON_VERSION | ||||
| ARG CONDA_CMAKE | ||||
| ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION | ||||
| ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH | ||||
| COPY requirements-ci.txt /opt/conda/requirements-ci.txt | ||||
| COPY ./common/install_conda.sh install_conda.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt | ||||
|  | ||||
| # Note that Docker build forbids copying file outside the build context | ||||
| COPY ./common/install_linter.sh install_linter.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| RUN bash ./install_linter.sh | ||||
| RUN rm install_linter.sh | ||||
| RUN rm install_linter.sh common_utils.sh | ||||
|  | ||||
| USER jenkins | ||||
| CMD ["bash"] | ||||
|  | ||||
							
								
								
									
										200
									
								
								.ci/docker/manywheel/Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										200
									
								
								.ci/docker/manywheel/Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,200 @@ | ||||
| # syntax = docker/dockerfile:experimental | ||||
| ARG ROCM_VERSION=3.7 | ||||
| ARG BASE_CUDA_VERSION=11.8 | ||||
|  | ||||
| ARG GPU_IMAGE=centos:7 | ||||
| FROM centos:7 as base | ||||
|  | ||||
| ENV LC_ALL en_US.UTF-8 | ||||
| ENV LANG en_US.UTF-8 | ||||
| ENV LANGUAGE en_US.UTF-8 | ||||
|  | ||||
| ARG DEVTOOLSET_VERSION=9 | ||||
|  | ||||
| # Note: This is required patch since CentOS have reached EOL | ||||
| # otherwise any yum install setp will fail | ||||
| RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo | ||||
| RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo | ||||
| RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo | ||||
| RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel | ||||
| # Just add everything as a safe.directory for git since these will be used in multiple places with git | ||||
| RUN git config --global --add safe.directory '*' | ||||
| RUN yum install -y yum-utils centos-release-scl | ||||
| RUN yum-config-manager --enable rhel-server-rhscl-7-rpms | ||||
| # Note: After running yum-config-manager --enable rhel-server-rhscl-7-rpms | ||||
| # patch is required once again. Somehow this steps adds mirror.centos.org | ||||
| RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo | ||||
| RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo | ||||
| RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo | ||||
| RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils | ||||
| ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH | ||||
| ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH | ||||
|  | ||||
| RUN yum --enablerepo=extras install -y epel-release | ||||
|  | ||||
| # cmake-3.18.4 from pip | ||||
| RUN yum install -y python3-pip && \ | ||||
|     python3 -mpip install cmake==3.18.4 && \ | ||||
|     ln -s /usr/local/bin/cmake /usr/bin/cmake | ||||
|  | ||||
| RUN yum install -y autoconf aclocal automake make sudo | ||||
|  | ||||
| FROM base as openssl | ||||
| # Install openssl (this must precede `build python` step) | ||||
| # (In order to have a proper SSL module, Python is compiled | ||||
| # against a recent openssl [see env vars above], which is linked | ||||
| # statically. We delete openssl afterwards.) | ||||
| ADD ./common/install_openssl.sh install_openssl.sh | ||||
| RUN bash ./install_openssl.sh && rm install_openssl.sh | ||||
|  | ||||
| # EPEL for cmake | ||||
| FROM base as patchelf | ||||
| # Install patchelf | ||||
| ADD ./common/install_patchelf.sh install_patchelf.sh | ||||
| RUN bash ./install_patchelf.sh && rm install_patchelf.sh | ||||
| RUN cp $(which patchelf) /patchelf | ||||
|  | ||||
| FROM patchelf as python | ||||
| # build python | ||||
| COPY manywheel/build_scripts /build_scripts | ||||
| ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh | ||||
| RUN bash build_scripts/build.sh && rm -r build_scripts | ||||
|  | ||||
| FROM base as cuda | ||||
| ARG BASE_CUDA_VERSION=10.2 | ||||
| # Install CUDA | ||||
| ADD ./common/install_cuda.sh install_cuda.sh | ||||
| RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh | ||||
|  | ||||
| FROM base as intel | ||||
| # MKL | ||||
| ADD ./common/install_mkl.sh install_mkl.sh | ||||
| RUN bash ./install_mkl.sh && rm install_mkl.sh | ||||
|  | ||||
| FROM base as magma | ||||
| ARG BASE_CUDA_VERSION=10.2 | ||||
| # Install magma | ||||
| ADD ./common/install_magma.sh install_magma.sh | ||||
| RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh | ||||
|  | ||||
| FROM base as jni | ||||
| # Install java jni header | ||||
| ADD ./common/install_jni.sh install_jni.sh | ||||
| ADD ./java/jni.h jni.h | ||||
| RUN bash ./install_jni.sh && rm install_jni.sh | ||||
|  | ||||
| FROM base as libpng | ||||
| # Install libpng | ||||
| ADD ./common/install_libpng.sh install_libpng.sh | ||||
| RUN bash ./install_libpng.sh && rm install_libpng.sh | ||||
|  | ||||
| FROM ${GPU_IMAGE} as common | ||||
| RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo | ||||
| RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo | ||||
| RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo | ||||
| ENV LC_ALL en_US.UTF-8 | ||||
| ENV LANG en_US.UTF-8 | ||||
| ENV LANGUAGE en_US.UTF-8 | ||||
| RUN yum install -y \ | ||||
|         aclocal \ | ||||
|         autoconf \ | ||||
|         automake \ | ||||
|         bison \ | ||||
|         bzip2 \ | ||||
|         curl \ | ||||
|         diffutils \ | ||||
|         file \ | ||||
|         git \ | ||||
|         make \ | ||||
|         patch \ | ||||
|         perl \ | ||||
|         unzip \ | ||||
|         util-linux \ | ||||
|         wget \ | ||||
|         which \ | ||||
|         xz \ | ||||
|         yasm | ||||
| RUN yum install -y \ | ||||
|     https://repo.ius.io/ius-release-el7.rpm \ | ||||
|     https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm | ||||
|  | ||||
| RUN yum swap -y git git236-core | ||||
| # git236+ would refuse to run git commands in repos owned by other users | ||||
| # Which causes version check to fail, as pytorch repo is bind-mounted into the image | ||||
| # Override this behaviour by treating every folder as safe | ||||
| # For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327 | ||||
| RUN git config --global --add safe.directory "*" | ||||
|  | ||||
| ENV SSL_CERT_FILE=/opt/_internal/certs.pem | ||||
| # Install LLVM version | ||||
| COPY --from=openssl            /opt/openssl                          /opt/openssl | ||||
| COPY --from=python             /opt/python                           /opt/python | ||||
| COPY --from=python             /opt/_internal                        /opt/_internal | ||||
| COPY --from=python             /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel | ||||
| COPY --from=intel              /opt/intel                            /opt/intel | ||||
| COPY --from=patchelf           /usr/local/bin/patchelf               /usr/local/bin/patchelf | ||||
| COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h | ||||
| COPY --from=libpng             /usr/local/bin/png*                   /usr/local/bin/ | ||||
| COPY --from=libpng             /usr/local/bin/libpng*                /usr/local/bin/ | ||||
| COPY --from=libpng             /usr/local/include/png*               /usr/local/include/ | ||||
| COPY --from=libpng             /usr/local/include/libpng*            /usr/local/include/ | ||||
| COPY --from=libpng             /usr/local/lib/libpng*                /usr/local/lib/ | ||||
| COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/lib/pkgconfig | ||||
|  | ||||
| FROM common as cpu_final | ||||
| ARG BASE_CUDA_VERSION=10.1 | ||||
| ARG DEVTOOLSET_VERSION=9 | ||||
| # Install Anaconda | ||||
| ADD ./common/install_conda_docker.sh install_conda.sh | ||||
| RUN bash ./install_conda.sh && rm install_conda.sh | ||||
| ENV PATH /opt/conda/bin:$PATH | ||||
| RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo | ||||
| RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo | ||||
| RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo | ||||
|  | ||||
| RUN yum install -y yum-utils centos-release-scl | ||||
| RUN yum-config-manager --enable rhel-server-rhscl-7-rpms | ||||
| RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo | ||||
| RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo | ||||
| RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo | ||||
| RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils | ||||
| ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH | ||||
| ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH | ||||
|  | ||||
| # cmake is already installed inside the rocm base image, so remove if present | ||||
| RUN rpm -e cmake || true | ||||
| # cmake-3.18.4 from pip | ||||
| RUN yum install -y python3-pip && \ | ||||
|     python3 -mpip install cmake==3.18.4 && \ | ||||
|     ln -s /usr/local/bin/cmake /usr/bin/cmake | ||||
|  | ||||
| # ninja | ||||
| RUN yum install -y ninja-build | ||||
|  | ||||
| FROM cpu_final as cuda_final | ||||
| RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} | ||||
| COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION} | ||||
| COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION} | ||||
| RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda | ||||
| ENV PATH=/usr/local/cuda/bin:$PATH | ||||
|  | ||||
| FROM cpu_final as rocm_final | ||||
| ARG ROCM_VERSION=3.7 | ||||
| ARG PYTORCH_ROCM_ARCH | ||||
| ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH} | ||||
| # Adding ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0) | ||||
| # find HIP works for ROCm5.7. Not needed for ROCm6.0 and above. | ||||
| # Remove below when ROCm5.7 is not in support matrix anymore. | ||||
| ENV ROCM_PATH /opt/rocm | ||||
| ENV MKLROOT /opt/intel | ||||
| # No need to install ROCm as base docker image should have full ROCm install | ||||
| #ADD ./common/install_rocm.sh install_rocm.sh | ||||
| #RUN ROCM_VERSION=${ROCM_VERSION} bash ./install_rocm.sh && rm install_rocm.sh | ||||
| ADD ./common/install_rocm_drm.sh install_rocm_drm.sh | ||||
| RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh | ||||
| # cmake3 is needed for the MIOpen build | ||||
| RUN ln -sf /usr/local/bin/cmake /usr/bin/cmake3 | ||||
| ADD ./common/install_rocm_magma.sh install_rocm_magma.sh | ||||
| RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh | ||||
| ADD ./common/install_miopen.sh install_miopen.sh | ||||
| RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh | ||||
| @ -7,8 +7,8 @@ ENV LC_ALL en_US.UTF-8 | ||||
| ENV LANG en_US.UTF-8 | ||||
| ENV LANGUAGE en_US.UTF-8 | ||||
|  | ||||
| ARG DEVTOOLSET_VERSION=13 | ||||
| RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb | ||||
| ARG DEVTOOLSET_VERSION=11 | ||||
| RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-toolchain | ||||
| ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH | ||||
| ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH | ||||
|  | ||||
| @ -33,13 +33,10 @@ RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 | ||||
| RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 | ||||
|  | ||||
| FROM base as cuda | ||||
| ARG BASE_CUDA_VERSION=12.6 | ||||
| ARG BASE_CUDA_VERSION=11.8 | ||||
| # Install CUDA | ||||
| ADD ./common/install_cuda.sh install_cuda.sh | ||||
| COPY ./common/install_nccl.sh install_nccl.sh | ||||
| COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ | ||||
| COPY ./common/install_cusparselt.sh install_cusparselt.sh | ||||
| RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu* install_cusparselt.sh | ||||
| RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh | ||||
|  | ||||
| FROM base as intel | ||||
| # MKL | ||||
| @ -47,7 +44,7 @@ ADD ./common/install_mkl.sh install_mkl.sh | ||||
| RUN bash ./install_mkl.sh && rm install_mkl.sh | ||||
|  | ||||
| FROM base as magma | ||||
| ARG BASE_CUDA_VERSION=12.6 | ||||
| ARG BASE_CUDA_VERSION=10.2 | ||||
| # Install magma | ||||
| ADD ./common/install_magma.sh install_magma.sh | ||||
| RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh | ||||
| @ -64,7 +61,7 @@ ADD ./common/install_libpng.sh install_libpng.sh | ||||
| RUN bash ./install_libpng.sh && rm install_libpng.sh | ||||
|  | ||||
| FROM ${GPU_IMAGE} as common | ||||
| ARG DEVTOOLSET_VERSION=13 | ||||
| ARG DEVTOOLSET_VERSION=11 | ||||
| ENV LC_ALL en_US.UTF-8 | ||||
| ENV LANG en_US.UTF-8 | ||||
| ENV LANGUAGE en_US.UTF-8 | ||||
| @ -87,12 +84,13 @@ RUN yum install -y \ | ||||
|         wget \ | ||||
|         which \ | ||||
|         xz \ | ||||
|         glibc-langpack-en \ | ||||
|         gcc-toolset-${DEVTOOLSET_VERSION}-gcc \ | ||||
|         gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \ | ||||
|         gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \ | ||||
|         gcc-toolset-${DEVTOOLSET_VERSION}-gdb | ||||
|         gcc-toolset-${DEVTOOLSET_VERSION}-toolchain \ | ||||
|         glibc-langpack-en | ||||
| RUN yum install -y \ | ||||
|     https://repo.ius.io/ius-release-el7.rpm \ | ||||
|     https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm | ||||
|  | ||||
| RUN yum swap -y git git236-core | ||||
| # git236+ would refuse to run git commands in repos owned by other users | ||||
| # Which causes version check to fail, as pytorch repo is bind-mounted into the image | ||||
| # Override this behaviour by treating every folder as safe | ||||
| @ -103,7 +101,6 @@ ENV SSL_CERT_FILE=/opt/_internal/certs.pem | ||||
| # Install LLVM version | ||||
| COPY --from=openssl            /opt/openssl                          /opt/openssl | ||||
| COPY --from=base               /opt/python                           /opt/python | ||||
| COPY --from=base               /usr/local/lib/                       /usr/local/lib/ | ||||
| COPY --from=base               /opt/_internal                        /opt/_internal | ||||
| COPY --from=base               /usr/local/bin/auditwheel             /usr/local/bin/auditwheel | ||||
| COPY --from=intel              /opt/intel                            /opt/intel | ||||
| @ -117,8 +114,8 @@ COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/ | ||||
| COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h | ||||
|  | ||||
| FROM common as cpu_final | ||||
| ARG BASE_CUDA_VERSION=12.6 | ||||
| ARG DEVTOOLSET_VERSION=13 | ||||
| ARG BASE_CUDA_VERSION=11.8 | ||||
| ARG DEVTOOLSET_VERSION=11 | ||||
| # Install Anaconda | ||||
| ADD ./common/install_conda_docker.sh install_conda.sh | ||||
| RUN bash ./install_conda.sh && rm install_conda.sh | ||||
| @ -157,14 +154,11 @@ ENV ROCM_PATH /opt/rocm | ||||
| # and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker | ||||
| RUN python3 -m pip install --upgrade pip && \ | ||||
|     python3 -mpip install cmake==3.28.4 | ||||
| # replace the libdrm in /opt/amdgpu with custom amdgpu.ids lookup path | ||||
| ADD ./common/install_rocm_drm.sh install_rocm_drm.sh | ||||
| RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh | ||||
| # ROCm 6.4 rocm-smi depends on system drm.h header | ||||
| RUN yum install -y libdrm-devel | ||||
| ENV MKLROOT /opt/intel | ||||
| ADD ./common/install_rocm_magma.sh install_rocm_magma.sh | ||||
| RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh | ||||
| RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh | ||||
| ADD ./common/install_miopen.sh install_miopen.sh | ||||
| RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh | ||||
|  | ||||
| @ -175,6 +169,6 @@ ENV XPU_DRIVER_TYPE ROLLING | ||||
| RUN python3 -m pip install --upgrade pip && \ | ||||
|     python3 -mpip install cmake==3.28.4 | ||||
| ADD ./common/install_xpu.sh install_xpu.sh | ||||
| ENV XPU_VERSION 2025.1 | ||||
| ENV XPU_VERSION 2025.0 | ||||
| RUN bash ./install_xpu.sh && rm install_xpu.sh | ||||
| RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd | ||||
|  | ||||
| @ -1,6 +1,7 @@ | ||||
| FROM quay.io/pypa/manylinux_2_28_aarch64 as base | ||||
|  | ||||
| ARG GCCTOOLSET_VERSION=13 | ||||
| # Graviton needs GCC 10 or above for the build. GCC12 is the default version in almalinux-8. | ||||
| ARG GCCTOOLSET_VERSION=11 | ||||
|  | ||||
| # Language variabes | ||||
| ENV LC_ALL=en_US.UTF-8 | ||||
| @ -35,10 +36,7 @@ RUN yum install -y \ | ||||
|   yasm \ | ||||
|   zstd \ | ||||
|   sudo \ | ||||
|   gcc-toolset-${GCCTOOLSET_VERSION}-gcc \ | ||||
|   gcc-toolset-${GCCTOOLSET_VERSION}-gcc-c++ \ | ||||
|   gcc-toolset-${GCCTOOLSET_VERSION}-gcc-gfortran \ | ||||
|   gcc-toolset-${GCCTOOLSET_VERSION}-gdb | ||||
|   gcc-toolset-${GCCTOOLSET_VERSION}-toolchain | ||||
|  | ||||
| # (optional) Install non-default Ninja version | ||||
| ARG NINJA_VERSION | ||||
| @ -58,7 +56,6 @@ RUN git config --global --add safe.directory "*" | ||||
|  | ||||
| FROM base as openblas | ||||
| # Install openblas | ||||
| ARG OPENBLAS_VERSION | ||||
| ADD ./common/install_openblas.sh install_openblas.sh | ||||
| RUN bash ./install_openblas.sh && rm install_openblas.sh | ||||
|  | ||||
|  | ||||
							
								
								
									
										94
									
								
								.ci/docker/manywheel/Dockerfile_aarch64
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										94
									
								
								.ci/docker/manywheel/Dockerfile_aarch64
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,94 @@ | ||||
| FROM quay.io/pypa/manylinux2014_aarch64 as base | ||||
|  | ||||
|  | ||||
| # Graviton needs GCC 10 for the build | ||||
| ARG DEVTOOLSET_VERSION=10 | ||||
|  | ||||
| # Language variabes | ||||
| ENV LC_ALL=en_US.UTF-8 | ||||
| ENV LANG=en_US.UTF-8 | ||||
| ENV LANGUAGE=en_US.UTF-8 | ||||
|  | ||||
| # Installed needed OS packages. This is to support all | ||||
| # the binary builds (torch, vision, audio, text, data) | ||||
| RUN yum -y install epel-release | ||||
| RUN yum -y update | ||||
| RUN yum install -y \ | ||||
|   autoconf \ | ||||
|   automake \ | ||||
|   bison \ | ||||
|   bzip2 \ | ||||
|   curl \ | ||||
|   diffutils \ | ||||
|   file \ | ||||
|   git \ | ||||
|   make \ | ||||
|   patch \ | ||||
|   perl \ | ||||
|   unzip \ | ||||
|   util-linux \ | ||||
|   wget \ | ||||
|   which \ | ||||
|   xz \ | ||||
|   yasm \ | ||||
|   less \ | ||||
|   zstd \ | ||||
|   libgomp \ | ||||
|   sudo \ | ||||
|   devtoolset-${DEVTOOLSET_VERSION}-gcc \ | ||||
|   devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ \ | ||||
|   devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran \ | ||||
|   devtoolset-${DEVTOOLSET_VERSION}-binutils | ||||
|  | ||||
| # Ensure the expected devtoolset is used | ||||
| ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH | ||||
| ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH | ||||
|  | ||||
|  | ||||
| # git236+ would refuse to run git commands in repos owned by other users | ||||
| # Which causes version check to fail, as pytorch repo is bind-mounted into the image | ||||
| # Override this behaviour by treating every folder as safe | ||||
| # For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327 | ||||
| RUN git config --global --add safe.directory "*" | ||||
|  | ||||
|  | ||||
| ############################################################################### | ||||
| # libglfortran.a hack | ||||
| # | ||||
| # libgfortran.a from quay.io/pypa/manylinux2014_aarch64 is not compiled with -fPIC. | ||||
| # This causes __stack_chk_guard@@GLIBC_2.17 on pytorch build. To solve, get | ||||
| # ubuntu's libgfortran.a which is compiled with -fPIC | ||||
| # NOTE: Need a better way to get this library as Ubuntu's package can be removed by the vender, or changed | ||||
| ############################################################################### | ||||
| RUN cd ~/ \ | ||||
|   && curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-4ubuntu2_arm64.deb \ | ||||
|   && ar x ~/libgfortran-10-dev.deb \ | ||||
|   && tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ \ | ||||
|   && cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/ | ||||
|  | ||||
| # install cmake | ||||
| RUN yum install -y cmake3 && \ | ||||
|     ln -s /usr/bin/cmake3 /usr/bin/cmake | ||||
|  | ||||
| FROM base as openssl | ||||
| # Install openssl (this must precede `build python` step) | ||||
| # (In order to have a proper SSL module, Python is compiled | ||||
| # against a recent openssl [see env vars above], which is linked | ||||
| # statically. We delete openssl afterwards.) | ||||
| ADD ./common/install_openssl.sh install_openssl.sh | ||||
| RUN bash ./install_openssl.sh && rm install_openssl.sh | ||||
| ENV SSL_CERT_FILE=/opt/_internal/certs.pem | ||||
|  | ||||
| FROM base as openblas | ||||
| # Install openblas | ||||
| ADD ./common/install_openblas.sh install_openblas.sh | ||||
| RUN bash ./install_openblas.sh && rm install_openblas.sh | ||||
|  | ||||
| FROM openssl as final | ||||
| # remove unncessary python versions | ||||
| RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2 | ||||
| RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4 | ||||
| RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6 | ||||
| RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 | ||||
| COPY --from=openblas     /opt/OpenBLAS/  /opt/OpenBLAS/ | ||||
| ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH | ||||
| @ -1,7 +1,7 @@ | ||||
| FROM quay.io/pypa/manylinux_2_28_aarch64 as base | ||||
|  | ||||
| # Cuda ARM build needs gcc 11 | ||||
| ARG DEVTOOLSET_VERSION=13 | ||||
| ARG DEVTOOLSET_VERSION=11 | ||||
|  | ||||
| # Language variables | ||||
| ENV LC_ALL=en_US.UTF-8 | ||||
| @ -34,10 +34,7 @@ RUN yum install -y \ | ||||
|   zstd \ | ||||
|   libgomp \ | ||||
|   sudo \ | ||||
|   gcc-toolset-${DEVTOOLSET_VERSION}-gcc \ | ||||
|   gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \ | ||||
|   gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \ | ||||
|   gcc-toolset-${DEVTOOLSET_VERSION}-gdb | ||||
|   gcc-toolset-${DEVTOOLSET_VERSION}-toolchain | ||||
|  | ||||
| # Ensure the expected devtoolset is used | ||||
| ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH | ||||
| @ -69,11 +66,8 @@ RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6 | ||||
| FROM base as cuda | ||||
| ARG BASE_CUDA_VERSION | ||||
| # Install CUDA | ||||
| ADD ./common/install_cuda.sh install_cuda.sh | ||||
| COPY ./common/install_nccl.sh install_nccl.sh | ||||
| COPY ./common/install_cusparselt.sh install_cusparselt.sh | ||||
| COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ | ||||
| RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu* install_cusparselt.sh | ||||
| ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh | ||||
| RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh | ||||
|  | ||||
| FROM base as magma | ||||
| ARG BASE_CUDA_VERSION | ||||
|  | ||||
| @ -5,9 +5,7 @@ ENV LC_ALL=C.UTF-8 | ||||
| ENV LANG=C.UTF-8 | ||||
| ENV LANGUAGE=C.UTF-8 | ||||
|  | ||||
| # there is a bugfix in gcc >= 14 for precompiled headers and s390x vectorization interaction. | ||||
| # with earlier gcc versions test/inductor/test_cpu_cpp_wrapper.py will fail. | ||||
| ARG DEVTOOLSET_VERSION=14 | ||||
| ARG DEVTOOLSET_VERSION=13 | ||||
| # Installed needed OS packages. This is to support all | ||||
| # the binary builds (torch, vision, audio, text, data) | ||||
| RUN yum -y install epel-release | ||||
| @ -44,7 +42,6 @@ RUN yum install -y \ | ||||
|   llvm-devel \ | ||||
|   libzstd-devel \ | ||||
|   python3.12-devel \ | ||||
|   python3.12-test \ | ||||
|   python3.12-setuptools \ | ||||
|   python3.12-pip \ | ||||
|   python3-virtualenv \ | ||||
| @ -60,8 +57,7 @@ RUN yum install -y \ | ||||
|   libxslt-devel \ | ||||
|   libxml2-devel \ | ||||
|   openssl-devel \ | ||||
|   valgrind \ | ||||
|   ninja-build | ||||
|   valgrind | ||||
|  | ||||
| ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH | ||||
| ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH | ||||
| @ -105,33 +101,24 @@ CMD ["/bin/bash"] | ||||
|  | ||||
| # install test dependencies: | ||||
| # - grpcio requires system openssl, bundled crypto fails to build | ||||
| # - ml_dtypes 0.4.0 requires some fixes provided in later commits to build | ||||
| RUN dnf install -y \ | ||||
|   hdf5-devel \ | ||||
|   python3-h5py \ | ||||
|   git | ||||
|   protobuf-devel \ | ||||
|   protobuf-c-devel \ | ||||
|   protobuf-lite-devel \ | ||||
|   wget \ | ||||
|   patch | ||||
|  | ||||
| RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio | ||||
|  | ||||
| # cmake-3.28.0 from pip for onnxruntime | ||||
| RUN python3 -mpip install cmake==3.28.0 | ||||
|  | ||||
| # build onnxruntime 1.21.0 from sources. | ||||
| # it is not possible to build it from sources using pip, | ||||
| # so just build it from upstream repository. | ||||
| # h5py is dependency of onnxruntime_training. | ||||
| # h5py==3.11.0 builds with hdf5-devel 1.10.5 from repository. | ||||
| # install newest flatbuffers version first: | ||||
| # for some reason old version is getting pulled in otherwise. | ||||
| # packaging package is required for onnxruntime wheel build. | ||||
| RUN pip3 install flatbuffers && \ | ||||
|   pip3 install h5py==3.11.0 && \ | ||||
|   pip3 install packaging && \ | ||||
|   git clone https://github.com/microsoft/onnxruntime && \ | ||||
|   cd onnxruntime && git checkout v1.21.0 && \ | ||||
| RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio==1.65.4 | ||||
| RUN cd ~ && \ | ||||
|   git clone https://github.com/jax-ml/ml_dtypes && \ | ||||
|   cd ml_dtypes && \ | ||||
|   git checkout v0.4.0 && \ | ||||
|   git submodule update --init --recursive && \ | ||||
|   ./build.sh --config Release --parallel 0 --enable_pybind \ | ||||
|   --build_wheel --enable_training --enable_training_apis \ | ||||
|   --enable_training_ops --skip_tests --allow_running_as_root \ | ||||
|   --compile_no_warning_as_error && \ | ||||
|   pip3 install ./build/Linux/Release/dist/onnxruntime_training-*.whl && \ | ||||
|   cd .. && /bin/rm -rf ./onnxruntime | ||||
|   wget https://github.com/jax-ml/ml_dtypes/commit/b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \ | ||||
|   wget https://github.com/jax-ml/ml_dtypes/commit/d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \ | ||||
|   patch -p1 < b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \ | ||||
|   patch -p1 < d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \ | ||||
|   python3 setup.py bdist_wheel && \ | ||||
|   pip3 install dist/*.whl && \ | ||||
|   rm -rf ml_dtypes | ||||
|  | ||||
| @ -1,7 +1,7 @@ | ||||
| #!/usr/bin/env bash | ||||
| # Script used only in CD pipeline | ||||
|  | ||||
| set -exou pipefail | ||||
| set -eou pipefail | ||||
|  | ||||
| TOPDIR=$(git rev-parse --show-toplevel) | ||||
|  | ||||
| @ -9,111 +9,152 @@ image="$1" | ||||
| shift | ||||
|  | ||||
| if [ -z "${image}" ]; then | ||||
|   echo "Usage: $0 IMAGE:ARCHTAG" | ||||
|   echo "Usage: $0 IMAGE" | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| # Go from imagename:tag to tag | ||||
| DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}') | ||||
| DOCKER_IMAGE="pytorch/${image}" | ||||
|  | ||||
| GPU_ARCH_VERSION="" | ||||
| if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then | ||||
|     # extract cuda version from image name.  e.g. manylinux2_28-builder:cuda12.8 returns 12.8 | ||||
|     GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}') | ||||
| elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then | ||||
|     # extract rocm version from image name.  e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4 | ||||
|     GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}') | ||||
| fi | ||||
| DOCKER_REGISTRY="${DOCKER_REGISTRY:-docker.io}" | ||||
|  | ||||
| GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu} | ||||
| GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-} | ||||
| MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-} | ||||
| DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-} | ||||
| OPENBLAS_VERSION=${OPENBLAS_VERSION:-} | ||||
| WITH_PUSH=${WITH_PUSH:-} | ||||
|  | ||||
| case ${image} in | ||||
|     manylinux2_28-builder:cpu) | ||||
| case ${GPU_ARCH_TYPE} in | ||||
|     cpu) | ||||
|         TARGET=cpu_final | ||||
|         DOCKER_TAG=cpu | ||||
|         GPU_IMAGE=centos:7 | ||||
|         DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" | ||||
|         ;; | ||||
|     cpu-manylinux_2_28) | ||||
|         TARGET=cpu_final | ||||
|         DOCKER_TAG=cpu | ||||
|         GPU_IMAGE=amd64/almalinux:8 | ||||
|         DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13" | ||||
|         DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11" | ||||
|         MANY_LINUX_VERSION="2_28" | ||||
|         ;; | ||||
|     manylinux2_28_aarch64-builder:cpu-aarch64) | ||||
|     cpu-aarch64) | ||||
|         TARGET=final | ||||
|         GPU_IMAGE=arm64v8/almalinux:8 | ||||
|         DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1" | ||||
|         MANY_LINUX_VERSION="2_28_aarch64" | ||||
|         OPENBLAS_VERSION="v0.3.29" | ||||
|         DOCKER_TAG=cpu-aarch64 | ||||
|         GPU_IMAGE=arm64v8/centos:7 | ||||
|         DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10" | ||||
|         MANY_LINUX_VERSION="aarch64" | ||||
|         ;; | ||||
|     manylinuxcxx11-abi-builder:cpu-cxx11-abi) | ||||
|     cpu-aarch64-2_28) | ||||
|         TARGET=final | ||||
|         DOCKER_TAG=cpu-aarch64 | ||||
|         GPU_IMAGE=arm64v8/almalinux:8 | ||||
|         DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11 --build-arg NINJA_VERSION=1.12.1" | ||||
|         MANY_LINUX_VERSION="2_28_aarch64" | ||||
|         ;; | ||||
|     cpu-cxx11-abi) | ||||
|         TARGET=final | ||||
|         DOCKER_TAG=cpu-cxx11-abi | ||||
|         GPU_IMAGE="" | ||||
|         DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9" | ||||
|         MANY_LINUX_VERSION="cxx11-abi" | ||||
|         ;; | ||||
|     manylinuxs390x-builder:cpu-s390x) | ||||
|     cpu-s390x) | ||||
|         TARGET=final | ||||
|         DOCKER_TAG=cpu-s390x | ||||
|         GPU_IMAGE=s390x/almalinux:8 | ||||
|         DOCKER_GPU_BUILD_ARG="" | ||||
|         MANY_LINUX_VERSION="s390x" | ||||
|         ;; | ||||
|     manylinux2_28-builder:cuda11*) | ||||
|     cuda) | ||||
|         TARGET=cuda_final | ||||
|         DOCKER_TAG=cuda${GPU_ARCH_VERSION} | ||||
|         # Keep this up to date with the minimum version of CUDA we currently support | ||||
|         GPU_IMAGE=centos:7 | ||||
|         DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=9" | ||||
|         ;; | ||||
|     cuda-manylinux_2_28) | ||||
|         TARGET=cuda_final | ||||
|         DOCKER_TAG=cuda${GPU_ARCH_VERSION} | ||||
|         GPU_IMAGE=amd64/almalinux:8 | ||||
|         DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11" | ||||
|         MANY_LINUX_VERSION="2_28" | ||||
|         ;; | ||||
|     manylinux2_28-builder:cuda12*) | ||||
|     cuda-aarch64) | ||||
|         TARGET=cuda_final | ||||
|         GPU_IMAGE=amd64/almalinux:8 | ||||
|         DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" | ||||
|         MANY_LINUX_VERSION="2_28" | ||||
|         ;; | ||||
|     manylinuxaarch64-builder:cuda*) | ||||
|         TARGET=cuda_final | ||||
|         GPU_IMAGE=amd64/almalinux:8 | ||||
|         DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13" | ||||
|         DOCKER_TAG=cuda${GPU_ARCH_VERSION} | ||||
|         GPU_IMAGE=arm64v8/centos:7 | ||||
|         DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11" | ||||
|         MANY_LINUX_VERSION="aarch64" | ||||
|         DOCKERFILE_SUFFIX="_cuda_aarch64" | ||||
|         ;; | ||||
|     manylinux2_28-builder:rocm*) | ||||
|     rocm|rocm-manylinux_2_28) | ||||
|         TARGET=rocm_final | ||||
|         MANY_LINUX_VERSION="2_28" | ||||
|         DEVTOOLSET_VERSION="11" | ||||
|         GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete | ||||
|         DOCKER_TAG=rocm${GPU_ARCH_VERSION} | ||||
|         GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete | ||||
|         DEVTOOLSET_VERSION="9" | ||||
|         if [ ${GPU_ARCH_TYPE} == "rocm-manylinux_2_28" ]; then | ||||
|             MANY_LINUX_VERSION="2_28" | ||||
|             DEVTOOLSET_VERSION="11" | ||||
|             GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete | ||||
|         fi | ||||
|         PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" | ||||
|         DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" | ||||
|         ;; | ||||
|     manylinux2_28-builder:xpu) | ||||
|     xpu) | ||||
|         TARGET=xpu_final | ||||
|         DOCKER_TAG=xpu | ||||
|         GPU_IMAGE=amd64/almalinux:8 | ||||
|         DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11" | ||||
|         MANY_LINUX_VERSION="2_28" | ||||
|         ;; | ||||
|     *) | ||||
|         echo "ERROR: Unrecognized image name: ${image}" | ||||
|         echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}" | ||||
|         exit 1 | ||||
|         ;; | ||||
| esac | ||||
|  | ||||
| IMAGES='' | ||||
|  | ||||
| if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then | ||||
|     DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION} | ||||
| fi | ||||
| # Only activate this if in CI | ||||
| if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then | ||||
|     # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712 | ||||
|     # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023. | ||||
|     sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service | ||||
|     sudo systemctl daemon-reload | ||||
|     sudo systemctl restart docker | ||||
| ( | ||||
|     set -x | ||||
|  | ||||
|     # Only activate this if in CI | ||||
|     if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then | ||||
|         # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712 | ||||
|         # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023. | ||||
|         sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service | ||||
|         sudo systemctl daemon-reload | ||||
|         sudo systemctl restart docker | ||||
|     fi | ||||
|  | ||||
|     DOCKER_BUILDKIT=1 docker build  \ | ||||
|         ${DOCKER_GPU_BUILD_ARG} \ | ||||
|         --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ | ||||
|         --target "${TARGET}" \ | ||||
|         -t "${DOCKER_IMAGE}" \ | ||||
|         $@ \ | ||||
|         -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \ | ||||
|         "${TOPDIR}/.ci/docker/" | ||||
| ) | ||||
|  | ||||
| GITHUB_REF=${GITHUB_REF:-"dev")} | ||||
| GIT_BRANCH_NAME=${GITHUB_REF##*/} | ||||
| GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)} | ||||
| DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME} | ||||
| DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA} | ||||
|  | ||||
| if [[ "${WITH_PUSH}" == true ]]; then | ||||
|     ( | ||||
|         set -x | ||||
|         docker push "${DOCKER_IMAGE}" | ||||
|         if [[ -n ${GITHUB_REF} ]]; then | ||||
|             docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG} | ||||
|             docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG} | ||||
|             docker push "${DOCKER_IMAGE_BRANCH_TAG}" | ||||
|             docker push "${DOCKER_IMAGE_SHA_TAG}" | ||||
|         fi | ||||
|     ) | ||||
| fi | ||||
|  | ||||
| tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]') | ||||
|  | ||||
| DOCKER_BUILDKIT=1 docker build  \ | ||||
|     ${DOCKER_GPU_BUILD_ARG} \ | ||||
|     --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ | ||||
|     --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \ | ||||
|     --target "${TARGET}" \ | ||||
|     -t "${tmp_tag}" \ | ||||
|     $@ \ | ||||
|     -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \ | ||||
|     "${TOPDIR}/.ci/docker/" | ||||
|  | ||||
| @ -97,7 +97,7 @@ find /opt/_internal -type f -print0 \ | ||||
|     | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true | ||||
| # We do not need the Python test suites, or indeed the precompiled .pyc and | ||||
| # .pyo files. Partially cribbed from: | ||||
| #    https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile  # @lint-ignore | ||||
| #    https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile | ||||
| find /opt/_internal \ | ||||
|      \( -type d -a -name test -o -name tests \) \ | ||||
|   -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \ | ||||
|  | ||||
| @ -2,7 +2,7 @@ | ||||
| # Helper utilities for build | ||||
| # Script used only in CD pipeline | ||||
|  | ||||
| OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/  # @lint-ignore | ||||
| OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/ | ||||
| CURL_DOWNLOAD_URL=https://curl.se/download | ||||
|  | ||||
| AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf | ||||
|  | ||||
| @ -41,9 +41,9 @@ fbscribelogger==0.1.7 | ||||
| #Pinned versions: 0.1.6 | ||||
| #test that import: | ||||
|  | ||||
| flatbuffers==24.12.23 | ||||
| flatbuffers==2.0 | ||||
| #Description: cross platform serialization library | ||||
| #Pinned versions: 24.12.23 | ||||
| #Pinned versions: 2.0 | ||||
| #test that import: | ||||
|  | ||||
| hypothesis==5.35.1 | ||||
| @ -90,7 +90,7 @@ librosa>=0.6.2 ; python_version < "3.11" | ||||
| #Pinned versions: | ||||
| #test that import: | ||||
|  | ||||
| mypy==1.15.0 | ||||
| mypy==1.14.0 | ||||
| # Pin MyPy version because new errors are likely to appear with each release | ||||
| #Description: linter | ||||
| #Pinned versions: 1.14.0 | ||||
| @ -102,10 +102,10 @@ networkx==2.8.8 | ||||
| #Pinned versions: 2.8.8 | ||||
| #test that import: functorch | ||||
|  | ||||
| ninja==1.11.1.3 | ||||
| #Description: build system. Used in some tests. Used in build to generate build | ||||
| #time tracing information | ||||
| #Pinned versions: 1.11.1.3 | ||||
| #ninja | ||||
| #Description: build system.  Note that it install from | ||||
| #here breaks things so it is commented out | ||||
| #Pinned versions: 1.10.0.post1 | ||||
| #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py | ||||
|  | ||||
| numba==0.49.0 ; python_version < "3.9" | ||||
| @ -163,10 +163,10 @@ pillow==11.0.0 | ||||
| #Pinned versions: 10.3.0 | ||||
| #test that import: | ||||
|  | ||||
| protobuf==5.29.4 | ||||
| #Description:  Google's data interchange format | ||||
| #Pinned versions: 5.29.4 | ||||
| #test that import: test_tensorboard.py, test/onnx/* | ||||
| protobuf==3.20.2 | ||||
| #Description:  Google’s data interchange format | ||||
| #Pinned versions: 3.20.1 | ||||
| #test that import: test_tensorboard.py | ||||
|  | ||||
| psutil | ||||
| #Description: information on running processes and system utilization | ||||
| @ -334,12 +334,12 @@ sympy==1.13.3 | ||||
| #Pinned versions: | ||||
| #test that import: | ||||
|  | ||||
| onnx==1.18.0 | ||||
| #Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal | ||||
| onnx==1.17.0 | ||||
| #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal | ||||
| #Pinned versions: | ||||
| #test that import: | ||||
|  | ||||
| onnxscript==0.2.6 | ||||
| onnxscript==0.2.2 | ||||
| #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal | ||||
| #Pinned versions: | ||||
| #test that import: | ||||
| @ -353,7 +353,7 @@ parameterized==0.8.1 | ||||
| #Pinned versions: 1.24.0 | ||||
| #test that import: test_sac_estimator.py | ||||
|  | ||||
| pwlf==2.2.1 | ||||
| pwlf==2.2.1 ; python_version >= "3.8" | ||||
| #Description: required for testing torch/distributed/_tools/sac_estimator.py | ||||
| #Pinned versions: 2.2.1 | ||||
| #test that import: test_sac_estimator.py | ||||
| @ -365,9 +365,10 @@ PyYAML | ||||
| pyzstd | ||||
| setuptools | ||||
|  | ||||
| ninja==1.11.1 ; platform_machine == "aarch64" | ||||
| scons==4.5.2 ; platform_machine == "aarch64" | ||||
|  | ||||
| pulp==2.9.0 | ||||
| pulp==2.9.0 ; python_version >= "3.8" | ||||
| #Description: required for testing ilp formulaiton under torch/distributed/_tools | ||||
| #Pinned versions: 2.9.0 | ||||
| #test that import: test_sac_ilp.py | ||||
| @ -376,9 +377,3 @@ dataclasses_json==0.6.7 | ||||
| #Description: required for data pipeline and scripts under tools/stats | ||||
| #Pinned versions: 0.6.7 | ||||
| #test that import: | ||||
|  | ||||
| cmake==4.0.0 | ||||
| #Description: required for building | ||||
|  | ||||
| tlparse==0.3.30 | ||||
| #Description: required for log parsing | ||||
|  | ||||
| @ -1,24 +1,15 @@ | ||||
| sphinx==5.3.0 | ||||
| #Description: This is used to generate PyTorch docs | ||||
| #Pinned versions: 5.3.0 | ||||
| -e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2 | ||||
| -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme | ||||
|  | ||||
| # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering | ||||
| # but it doesn't seem to work and hangs around idly. The initial thought is probably | ||||
| # something related to Docker setup. We can investigate this later | ||||
|  | ||||
| sphinxcontrib.katex==0.8.6 | ||||
| #Description: This is used to generate PyTorch docs | ||||
| #Pinned versions: 0.8.6 | ||||
|  | ||||
| sphinxext-opengraph==0.9.1 | ||||
| #Description: This is used to generate PyTorch docs | ||||
| #Pinned versions: 0.9.1 | ||||
|  | ||||
| sphinx_sitemap==2.6.0 | ||||
| #Description: This is used to generate sitemap for PyTorch docs | ||||
| #Pinned versions: 2.6.0 | ||||
|  | ||||
| matplotlib==3.5.3 | ||||
| #Description: This is used to generate PyTorch docs | ||||
| #Pinned versions: 3.5.3 | ||||
| @ -55,6 +46,5 @@ myst-nb==0.17.2 | ||||
| # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs | ||||
| python-etcd==0.4.5 | ||||
| sphinx-copybutton==0.5.0 | ||||
| sphinx-design==0.4.0 | ||||
| sphinxcontrib-mermaid==1.0.0 | ||||
| sphinx-panels==0.4.1 | ||||
| myst-parser==0.18.1 | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| 3.3.1 | ||||
| 3.3.0 | ||||
|  | ||||
| @ -1 +0,0 @@ | ||||
| 3.3.1 | ||||
							
								
								
									
										175
									
								
								.ci/docker/ubuntu-cuda/Dockerfile
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										175
									
								
								.ci/docker/ubuntu-cuda/Dockerfile
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,175 @@ | ||||
| ARG UBUNTU_VERSION | ||||
| ARG CUDA_VERSION | ||||
| ARG IMAGE_NAME | ||||
|  | ||||
| FROM ${IMAGE_NAME} | ||||
|  | ||||
| ARG UBUNTU_VERSION | ||||
| ARG CUDA_VERSION | ||||
|  | ||||
| ENV DEBIAN_FRONTEND noninteractive | ||||
|  | ||||
| # Install common dependencies (so that this step can be cached separately) | ||||
| COPY ./common/install_base.sh install_base.sh | ||||
| RUN bash ./install_base.sh && rm install_base.sh | ||||
|  | ||||
| # Install user | ||||
| COPY ./common/install_user.sh install_user.sh | ||||
| RUN bash ./install_user.sh && rm install_user.sh | ||||
|  | ||||
| # Install katex | ||||
| ARG KATEX | ||||
| COPY ./common/install_docs_reqs.sh install_docs_reqs.sh | ||||
| RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh | ||||
|  | ||||
| # Install conda and other packages (e.g., numpy, pytest) | ||||
| ARG ANACONDA_PYTHON_VERSION | ||||
| ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION | ||||
| ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH | ||||
| ARG CONDA_CMAKE | ||||
| COPY requirements-ci.txt /opt/conda/requirements-ci.txt | ||||
| COPY ./common/install_conda.sh install_conda.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ./common/install_magma_conda.sh install_magma_conda.sh | ||||
| RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt | ||||
|  | ||||
| # Install gcc | ||||
| ARG GCC_VERSION | ||||
| COPY ./common/install_gcc.sh install_gcc.sh | ||||
| RUN bash ./install_gcc.sh && rm install_gcc.sh | ||||
|  | ||||
| # Install clang | ||||
| ARG CLANG_VERSION | ||||
| COPY ./common/install_clang.sh install_clang.sh | ||||
| RUN bash ./install_clang.sh && rm install_clang.sh | ||||
|  | ||||
| # (optional) Install protobuf for ONNX | ||||
| ARG PROTOBUF | ||||
| COPY ./common/install_protobuf.sh install_protobuf.sh | ||||
| RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi | ||||
| RUN rm install_protobuf.sh | ||||
| ENV INSTALLED_PROTOBUF ${PROTOBUF} | ||||
|  | ||||
| # (optional) Install database packages like LMDB and LevelDB | ||||
| ARG DB | ||||
| COPY ./common/install_db.sh install_db.sh | ||||
| RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi | ||||
| RUN rm install_db.sh | ||||
| ENV INSTALLED_DB ${DB} | ||||
|  | ||||
| # (optional) Install vision packages like OpenCV | ||||
| ARG VISION | ||||
| COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ | ||||
| RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi | ||||
| RUN rm install_vision.sh cache_vision_models.sh common_utils.sh | ||||
| ENV INSTALLED_VISION ${VISION} | ||||
|  | ||||
| # (optional) Install UCC | ||||
| ARG UCX_COMMIT | ||||
| ARG UCC_COMMIT | ||||
| ENV UCX_COMMIT $UCX_COMMIT | ||||
| ENV UCC_COMMIT $UCC_COMMIT | ||||
| ENV UCX_HOME /usr | ||||
| ENV UCC_HOME /usr | ||||
| ADD ./common/install_ucc.sh install_ucc.sh | ||||
| RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi | ||||
| RUN rm install_ucc.sh | ||||
|  | ||||
| COPY ./common/install_openssl.sh install_openssl.sh | ||||
| ENV OPENSSL_ROOT_DIR /opt/openssl | ||||
| RUN bash ./install_openssl.sh | ||||
| ENV OPENSSL_DIR /opt/openssl | ||||
|  | ||||
| ARG INDUCTOR_BENCHMARKS | ||||
| ARG ANACONDA_PYTHON_VERSION | ||||
| ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION | ||||
| COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ci_commit_pins/huggingface.txt huggingface.txt | ||||
| COPY ci_commit_pins/timm.txt timm.txt | ||||
| RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi | ||||
| RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt | ||||
|  | ||||
| # (optional) Install non-default CMake version | ||||
| ARG CMAKE_VERSION | ||||
| COPY ./common/install_cmake.sh install_cmake.sh | ||||
| RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi | ||||
| RUN rm install_cmake.sh | ||||
|  | ||||
| ARG TRITON | ||||
| # Install triton, this needs to be done before sccache because the latter will | ||||
| # try to reach out to S3, which docker build runners don't have access | ||||
| COPY ./common/install_triton.sh install_triton.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ci_commit_pins/triton.txt triton.txt | ||||
| COPY triton_version.txt triton_version.txt | ||||
| RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi | ||||
| RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt | ||||
|  | ||||
| ARG HALIDE | ||||
| # Build and install halide | ||||
| COPY ./common/install_halide.sh install_halide.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ci_commit_pins/halide.txt halide.txt | ||||
| RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi | ||||
| RUN rm install_halide.sh common_utils.sh halide.txt | ||||
|  | ||||
| # Install ccache/sccache (do this last, so we get priority in PATH) | ||||
| COPY ./common/install_cache.sh install_cache.sh | ||||
| ENV PATH /opt/cache/bin:$PATH | ||||
| # See https://github.com/pytorch/pytorch/issues/82174 | ||||
| # TODO(sdym@fb.com): | ||||
| # check if this is needed after full off Xenial migration | ||||
| ENV CARGO_NET_GIT_FETCH_WITH_CLI true | ||||
| RUN bash ./install_cache.sh && rm install_cache.sh | ||||
| ENV CMAKE_CUDA_COMPILER_LAUNCHER=/opt/cache/bin/sccache | ||||
|  | ||||
| # Add jni.h for java host build | ||||
| COPY ./common/install_jni.sh install_jni.sh | ||||
| COPY ./java/jni.h jni.h | ||||
| RUN bash ./install_jni.sh && rm install_jni.sh | ||||
|  | ||||
| # Install Open MPI for CUDA | ||||
| COPY ./common/install_openmpi.sh install_openmpi.sh | ||||
| RUN if [ -n "${CUDA_VERSION}" ]; then bash install_openmpi.sh; fi | ||||
| RUN rm install_openmpi.sh | ||||
|  | ||||
| # Include BUILD_ENVIRONMENT environment variable in image | ||||
| ARG BUILD_ENVIRONMENT | ||||
| ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT} | ||||
|  | ||||
| # AWS specific CUDA build guidance | ||||
| ENV TORCH_CUDA_ARCH_LIST Maxwell | ||||
| ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all" | ||||
| ENV CUDA_PATH /usr/local/cuda | ||||
|  | ||||
| # Install LLVM dev version (Defined in the pytorch/builder github repository) | ||||
| COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm | ||||
|  | ||||
| # Install CUDNN | ||||
| ARG CUDNN_VERSION | ||||
| ARG CUDA_VERSION | ||||
| COPY ./common/install_cudnn.sh install_cudnn.sh | ||||
| RUN if [ -n "${CUDNN_VERSION}" ]; then bash install_cudnn.sh; fi | ||||
| RUN rm install_cudnn.sh | ||||
|  | ||||
| # Install CUSPARSELT | ||||
| ARG CUDA_VERSION | ||||
| COPY ./common/install_cusparselt.sh install_cusparselt.sh | ||||
| RUN bash install_cusparselt.sh | ||||
| RUN rm install_cusparselt.sh | ||||
|  | ||||
| # Install CUDSS | ||||
| ARG CUDA_VERSION | ||||
| COPY ./common/install_cudss.sh install_cudss.sh | ||||
| RUN bash install_cudss.sh | ||||
| RUN rm install_cudss.sh | ||||
|  | ||||
| # Delete /usr/local/cuda-11.X/cuda-11.X symlinks | ||||
| RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi | ||||
| RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi | ||||
| RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi | ||||
| RUN if [ -h /usr/local/cuda-12.4/cuda-12.4 ]; then rm /usr/local/cuda-12.4/cuda-12.4; fi | ||||
|  | ||||
| USER jenkins | ||||
| CMD ["bash"] | ||||
| @ -27,6 +27,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh | ||||
| ARG ANACONDA_PYTHON_VERSION | ||||
| ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION | ||||
| ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH | ||||
| ARG CONDA_CMAKE | ||||
| COPY requirements-ci.txt /opt/conda/requirements-ci.txt | ||||
| COPY ./common/install_conda.sh install_conda.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| @ -42,6 +43,20 @@ ARG CLANG_VERSION | ||||
| COPY ./common/install_clang.sh install_clang.sh | ||||
| RUN bash ./install_clang.sh && rm install_clang.sh | ||||
|  | ||||
| # (optional) Install protobuf for ONNX | ||||
| ARG PROTOBUF | ||||
| COPY ./common/install_protobuf.sh install_protobuf.sh | ||||
| RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi | ||||
| RUN rm install_protobuf.sh | ||||
| ENV INSTALLED_PROTOBUF ${PROTOBUF} | ||||
|  | ||||
| # (optional) Install database packages like LMDB and LevelDB | ||||
| ARG DB | ||||
| COPY ./common/install_db.sh install_db.sh | ||||
| RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi | ||||
| RUN rm install_db.sh | ||||
| ENV INSTALLED_DB ${DB} | ||||
|  | ||||
| # (optional) Install vision packages like OpenCV | ||||
| ARG VISION | ||||
| COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ | ||||
| @ -55,7 +70,7 @@ COPY ./common/install_rocm.sh install_rocm.sh | ||||
| RUN bash ./install_rocm.sh | ||||
| RUN rm install_rocm.sh | ||||
| COPY ./common/install_rocm_magma.sh install_rocm_magma.sh | ||||
| RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} | ||||
| RUN bash ./install_rocm_magma.sh | ||||
| RUN rm install_rocm_magma.sh | ||||
| ADD ./common/install_miopen.sh install_miopen.sh | ||||
| RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh | ||||
| @ -100,6 +115,12 @@ COPY ci_commit_pins/timm.txt timm.txt | ||||
| RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi | ||||
| RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt | ||||
|  | ||||
| # (optional) Install non-default CMake version | ||||
| ARG CMAKE_VERSION | ||||
| COPY ./common/install_cmake.sh install_cmake.sh | ||||
| RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi | ||||
| RUN rm install_cmake.sh | ||||
|  | ||||
| # (optional) Install non-default Ninja version | ||||
| ARG NINJA_VERSION | ||||
| COPY ./common/install_ninja.sh install_ninja.sh | ||||
|  | ||||
| @ -28,6 +28,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh | ||||
|  | ||||
| # Install conda and other packages (e.g., numpy, pytest) | ||||
| ARG ANACONDA_PYTHON_VERSION | ||||
| ARG CONDA_CMAKE | ||||
| ARG DOCS | ||||
| ARG BUILD_ENVIRONMENT | ||||
| ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION | ||||
| @ -72,10 +73,17 @@ ARG TRITON | ||||
| COPY ./common/install_triton.sh install_triton.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ci_commit_pins/triton-xpu.txt triton-xpu.txt | ||||
| COPY triton_xpu_version.txt triton_version.txt | ||||
| COPY triton_version.txt triton_version.txt | ||||
| RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi | ||||
| RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt | ||||
|  | ||||
| # (optional) Install database packages like LMDB and LevelDB | ||||
| ARG DB | ||||
| COPY ./common/install_db.sh install_db.sh | ||||
| RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi | ||||
| RUN rm install_db.sh | ||||
| ENV INSTALLED_DB ${DB} | ||||
|  | ||||
| # (optional) Install vision packages like OpenCV | ||||
| ARG VISION | ||||
| COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ | ||||
| @ -83,6 +91,12 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi | ||||
| RUN rm install_vision.sh cache_vision_models.sh common_utils.sh | ||||
| ENV INSTALLED_VISION ${VISION} | ||||
|  | ||||
| # (optional) Install non-default CMake version | ||||
| ARG CMAKE_VERSION | ||||
| COPY ./common/install_cmake.sh install_cmake.sh | ||||
| RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi | ||||
| RUN rm install_cmake.sh | ||||
|  | ||||
| # (optional) Install non-default Ninja version | ||||
| ARG NINJA_VERSION | ||||
| COPY ./common/install_ninja.sh install_ninja.sh | ||||
|  | ||||
| @ -1,6 +1,6 @@ | ||||
| ARG UBUNTU_VERSION | ||||
|  | ||||
| FROM ubuntu:${UBUNTU_VERSION} as base | ||||
| FROM ubuntu:${UBUNTU_VERSION} | ||||
|  | ||||
| ARG UBUNTU_VERSION | ||||
|  | ||||
| @ -28,6 +28,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh | ||||
|  | ||||
| # Install conda and other packages (e.g., numpy, pytest) | ||||
| ARG ANACONDA_PYTHON_VERSION | ||||
| ARG CONDA_CMAKE | ||||
| ARG DOCS | ||||
| ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION | ||||
| ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH | ||||
| @ -51,17 +52,9 @@ RUN  bash ./install_lcov.sh && rm install_lcov.sh | ||||
| # Install cuda and cudnn | ||||
| ARG CUDA_VERSION | ||||
| COPY ./common/install_cuda.sh install_cuda.sh | ||||
| COPY ./common/install_nccl.sh install_nccl.sh | ||||
| COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/ | ||||
| COPY ./common/install_cusparselt.sh install_cusparselt.sh | ||||
| RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu* install_cusparselt.sh | ||||
| RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh | ||||
| ENV DESIRED_CUDA ${CUDA_VERSION} | ||||
| ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH | ||||
| # No effect if cuda not installed | ||||
| ENV USE_SYSTEM_NCCL=1 | ||||
| ENV NCCL_INCLUDE_DIR="/usr/local/cuda/include/" | ||||
| ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/" | ||||
|  | ||||
|  | ||||
| # (optional) Install UCC | ||||
| ARG UCX_COMMIT | ||||
| @ -74,6 +67,20 @@ ADD ./common/install_ucc.sh install_ucc.sh | ||||
| RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi | ||||
| RUN rm install_ucc.sh | ||||
|  | ||||
| # (optional) Install protobuf for ONNX | ||||
| ARG PROTOBUF | ||||
| COPY ./common/install_protobuf.sh install_protobuf.sh | ||||
| RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi | ||||
| RUN rm install_protobuf.sh | ||||
| ENV INSTALLED_PROTOBUF ${PROTOBUF} | ||||
|  | ||||
| # (optional) Install database packages like LMDB and LevelDB | ||||
| ARG DB | ||||
| COPY ./common/install_db.sh install_db.sh | ||||
| RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi | ||||
| RUN rm install_db.sh | ||||
| ENV INSTALLED_DB ${DB} | ||||
|  | ||||
| # (optional) Install vision packages like OpenCV | ||||
| ARG VISION | ||||
| COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./ | ||||
| @ -81,6 +88,24 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi | ||||
| RUN rm install_vision.sh cache_vision_models.sh common_utils.sh | ||||
| ENV INSTALLED_VISION ${VISION} | ||||
|  | ||||
| # (optional) Install Vulkan SDK | ||||
| ARG VULKAN_SDK_VERSION | ||||
| COPY ./common/install_vulkan_sdk.sh install_vulkan_sdk.sh | ||||
| RUN if [ -n "${VULKAN_SDK_VERSION}" ]; then bash ./install_vulkan_sdk.sh; fi | ||||
| RUN rm install_vulkan_sdk.sh | ||||
|  | ||||
| # (optional) Install swiftshader | ||||
| ARG SWIFTSHADER | ||||
| COPY ./common/install_swiftshader.sh install_swiftshader.sh | ||||
| RUN if [ -n "${SWIFTSHADER}" ]; then bash ./install_swiftshader.sh; fi | ||||
| RUN rm install_swiftshader.sh | ||||
|  | ||||
| # (optional) Install non-default CMake version | ||||
| ARG CMAKE_VERSION | ||||
| COPY ./common/install_cmake.sh install_cmake.sh | ||||
| RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi | ||||
| RUN rm install_cmake.sh | ||||
|  | ||||
| # (optional) Install non-default Ninja version | ||||
| ARG NINJA_VERSION | ||||
| COPY ./common/install_ninja.sh install_ninja.sh | ||||
| @ -102,21 +127,20 @@ RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_d | ||||
| RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt | ||||
|  | ||||
| ARG TRITON | ||||
| ARG TRITON_CPU | ||||
|  | ||||
| # Create a separate stage for building Triton and Triton-CPU.  install_triton | ||||
| # will check for the presence of env vars | ||||
| FROM base as triton-builder | ||||
| # Install triton, this needs to be done before sccache because the latter will | ||||
| # try to reach out to S3, which docker build runners don't have access | ||||
| COPY ./common/install_triton.sh install_triton.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ci_commit_pins/triton.txt triton.txt | ||||
| COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt | ||||
| RUN bash ./install_triton.sh | ||||
| RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi | ||||
| RUN rm install_triton.sh common_utils.sh triton.txt | ||||
|  | ||||
| FROM base as final | ||||
| COPY --from=triton-builder /opt/triton /opt/triton | ||||
| RUN if [ -n "${TRITON}" ] || [ -n "${TRITON_CPU}" ]; then pip install /opt/triton/*.whl; chown -R jenkins:jenkins /opt/conda; fi | ||||
| RUN rm -rf /opt/triton | ||||
| ARG TRITON_CPU | ||||
| COPY ./common/install_triton.sh install_triton.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt | ||||
| RUN if [ -n "${TRITON_CPU}" ]; then bash ./install_triton.sh; fi | ||||
| RUN rm install_triton.sh common_utils.sh triton-cpu.txt | ||||
|  | ||||
| ARG EXECUTORCH | ||||
| # Build and install executorch | ||||
|  | ||||
							
								
								
									
										2
									
								
								.ci/magma-rocm/.gitignore
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.ci/magma-rocm/.gitignore
									
									
									
									
										vendored
									
									
								
							| @ -1,2 +0,0 @@ | ||||
| output/ | ||||
| magma-rocm*/ | ||||
| @ -1,35 +0,0 @@ | ||||
| SHELL=/usr/bin/env bash | ||||
|  | ||||
| DOCKER_CMD ?= docker | ||||
| DESIRED_ROCM ?= 6.4 | ||||
| DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM)) | ||||
| PACKAGE_NAME = magma-rocm | ||||
| # inherit this from underlying docker image, do not pass this env var to docker | ||||
| #PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201 | ||||
|  | ||||
| DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ | ||||
| 	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \ | ||||
| 	-w /builder \ | ||||
| 	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_ROCM_SHORT} \ | ||||
| 	-e DESIRED_ROCM=${DESIRED_ROCM} \ | ||||
| 	"pytorch/almalinux-builder:rocm${DESIRED_ROCM}" \ | ||||
| 	magma-rocm/build_magma.sh | ||||
|  | ||||
| .PHONY: all | ||||
| all: magma-rocm64 | ||||
| all: magma-rocm63 | ||||
|  | ||||
| .PHONY: | ||||
| clean: | ||||
| 	$(RM) -r magma-* | ||||
| 	$(RM) -r output | ||||
|  | ||||
| .PHONY: magma-rocm64 | ||||
| magma-rocm64: DESIRED_ROCM := 6.4 | ||||
| magma-rocm64: | ||||
| 	$(DOCKER_RUN) | ||||
|  | ||||
| .PHONY: magma-rocm63 | ||||
| magma-rocm63: DESIRED_ROCM := 6.3 | ||||
| magma-rocm63: | ||||
| 	$(DOCKER_RUN) | ||||
| @ -1,48 +0,0 @@ | ||||
| # Magma ROCm | ||||
|  | ||||
| This folder contains the scripts and configurations to build libmagma.so, linked for various versions of ROCm. | ||||
|  | ||||
| ## Building | ||||
|  | ||||
| Look in the `Makefile` for available targets to build. To build any target, for example `magma-rocm63`, run | ||||
|  | ||||
| ``` | ||||
| # Using `docker` | ||||
| make magma-rocm63 | ||||
|  | ||||
| # Using `podman` | ||||
| DOCKER_CMD=podman make magma-rocm63 | ||||
| ``` | ||||
|  | ||||
| This spawns a `pytorch/manylinux-rocm<version>` docker image, which has the required `devtoolset` and ROCm versions installed. | ||||
| Within the docker image, it runs `build_magma.sh` with the correct environment variables set, which package the necessary files | ||||
| into a tarball, with the following structure: | ||||
|  | ||||
| ``` | ||||
| . | ||||
| ├── include       # header files | ||||
| ├── lib           # libmagma.so | ||||
| ├── info | ||||
| │   ├── licenses  # license file | ||||
| │   └── recipe    # build script | ||||
| ``` | ||||
|  | ||||
| More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the ROCm version. | ||||
| Outputted binaries should be in the `output` folder. | ||||
|  | ||||
|  | ||||
| ## Pushing | ||||
|  | ||||
| Packages can be uploaded to an S3 bucket using: | ||||
|  | ||||
| ``` | ||||
| aws s3 cp output/*/magma-cuda*.bz2 <bucket-with-path> | ||||
| ``` | ||||
|  | ||||
| If you do not have upload permissions, please ping @seemethere or @soumith to gain access | ||||
|  | ||||
| ## New versions | ||||
|  | ||||
| New ROCm versions can be added by creating a new make target with the next desired version. For ROCm version N.n, the target should be named `magma-rocmNn`. | ||||
|  | ||||
| Make sure to edit the appropriate environment variables (e.g., DESIRED_ROCM) in the `Makefile` accordingly. Remember also to check `build_magma.sh` to ensure the logic for copying over the files remains correct. | ||||
| @ -1,42 +0,0 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| set -eou pipefail | ||||
|  | ||||
| # Environment variables | ||||
| # The script expects DESIRED_CUDA and PACKAGE_NAME to be set | ||||
| ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" | ||||
|  | ||||
| # Version 2.7.2 + ROCm related updates | ||||
| MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6 | ||||
|  | ||||
| # Folders for the build | ||||
| PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata | ||||
| PACKAGE_DIR=${ROOT_DIR}/magma-rocm/${PACKAGE_NAME} # build workspace | ||||
| PACKAGE_OUTPUT=${ROOT_DIR}/magma-rocm/output # where tarballs are stored | ||||
| PACKAGE_BUILD=${PACKAGE_DIR} # where the content of the tarball is prepared | ||||
| PACKAGE_RECIPE=${PACKAGE_BUILD}/info/recipe | ||||
| PACKAGE_LICENSE=${PACKAGE_BUILD}/info/licenses | ||||
| mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RECIPE} ${PACKAGE_LICENSE} | ||||
|  | ||||
| # Fetch magma sources and verify checksum | ||||
| pushd ${PACKAGE_DIR} | ||||
| git clone https://bitbucket.org/icl/magma.git | ||||
| pushd magma | ||||
| git checkout ${MAGMA_VERSION} | ||||
| popd | ||||
| popd | ||||
|  | ||||
| # build | ||||
| pushd ${PACKAGE_DIR}/magma | ||||
| # The build.sh script expects to be executed from the sources root folder | ||||
| INSTALL_DIR=${PACKAGE_BUILD} ${PACKAGE_FILES}/build.sh | ||||
| popd | ||||
|  | ||||
| # Package recipe, license and tarball | ||||
| # Folder and package name are backward compatible for the build workflow | ||||
| cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh | ||||
| cp ${PACKAGE_DIR}/magma/COPYRIGHT ${PACKAGE_LICENSE}/COPYRIGHT | ||||
| pushd ${PACKAGE_BUILD} | ||||
| tar cjf ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2 include lib info | ||||
| echo Built in ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2 | ||||
| popd | ||||
| @ -1,38 +0,0 @@ | ||||
| # Magma build scripts need `python` | ||||
| ln -sf /usr/bin/python3 /usr/bin/python | ||||
|  | ||||
| ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') | ||||
| case "$ID" in | ||||
|   almalinux) | ||||
|     yum install -y gcc-gfortran | ||||
|     ;; | ||||
|   *) | ||||
|     echo "No preinstalls to build magma..." | ||||
|     ;; | ||||
| esac | ||||
|  | ||||
| MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION} | ||||
|  | ||||
| cp make.inc-examples/make.inc.hip-gcc-mkl make.inc | ||||
| echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc | ||||
| if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then | ||||
|     echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc | ||||
| fi | ||||
| echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc | ||||
| echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc | ||||
| export PATH="${PATH}:/opt/rocm/bin" | ||||
| if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then | ||||
|   amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'` | ||||
| else | ||||
|   amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs` | ||||
| fi | ||||
| for arch in $amdgpu_targets; do | ||||
|   echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc | ||||
| done | ||||
| # hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition | ||||
| sed -i 's/^FOPENMP/#FOPENMP/g' make.inc | ||||
| make -f make.gen.hipMAGMA -j $(nproc) | ||||
| LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}" | ||||
| make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}" | ||||
| cp -R lib ${INSTALL_DIR} | ||||
| cp -R include ${INSTALL_DIR} | ||||
| @ -1,7 +1,7 @@ | ||||
| SHELL=/usr/bin/env bash | ||||
|  | ||||
| DOCKER_CMD ?= docker | ||||
| DESIRED_CUDA ?= 12.8 | ||||
| DESIRED_CUDA ?= 11.8 | ||||
| DESIRED_CUDA_SHORT = $(subst .,,$(DESIRED_CUDA)) | ||||
| PACKAGE_NAME = magma-cuda | ||||
| CUDA_ARCH_LIST ?= -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90 | ||||
| @ -12,25 +12,20 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \ | ||||
| 	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \ | ||||
| 	-e DESIRED_CUDA=${DESIRED_CUDA} \ | ||||
| 	-e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \ | ||||
| 	"pytorch/almalinux-builder:cuda${DESIRED_CUDA}-main" \ | ||||
| 	"pytorch/manylinux2_28-builder:cuda${DESIRED_CUDA}-main" \ | ||||
| 	magma/build_magma.sh | ||||
|  | ||||
| .PHONY: all | ||||
| all: magma-cuda129 | ||||
| all: magma-cuda128 | ||||
| all: magma-cuda126 | ||||
| all: magma-cuda124 | ||||
| all: magma-cuda118 | ||||
|  | ||||
| .PHONY: | ||||
| clean: | ||||
| 	$(RM) -r magma-* | ||||
| 	$(RM) -r output | ||||
|  | ||||
| .PHONY: magma-cuda129 | ||||
| magma-cuda129: DESIRED_CUDA := 12.9 | ||||
| magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 | ||||
| magma-cuda129: | ||||
| 	$(DOCKER_RUN) | ||||
|  | ||||
| .PHONY: magma-cuda128 | ||||
| magma-cuda128: DESIRED_CUDA := 12.8 | ||||
| magma-cuda128: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120 | ||||
| @ -41,3 +36,14 @@ magma-cuda128: | ||||
| magma-cuda126: DESIRED_CUDA := 12.6 | ||||
| magma-cuda126: | ||||
| 	$(DOCKER_RUN) | ||||
|  | ||||
| .PHONY: magma-cuda124 | ||||
| magma-cuda124: DESIRED_CUDA := 12.4 | ||||
| magma-cuda124: | ||||
| 	$(DOCKER_RUN) | ||||
|  | ||||
| .PHONY: magma-cuda118 | ||||
| magma-cuda118: DESIRED_CUDA := 11.8 | ||||
| magma-cuda118: CUDA_ARCH_LIST += -gencode arch=compute_37,code=sm_37 | ||||
| magma-cuda118: | ||||
| 	$(DOCKER_RUN) | ||||
|  | ||||
| @ -18,10 +18,12 @@ retry () { | ||||
|     $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) | ||||
| } | ||||
|  | ||||
| PLATFORM="" | ||||
| PLATFORM="manylinux2014_x86_64" | ||||
| # TODO move this into the Docker images | ||||
| OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release) | ||||
| if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then | ||||
| if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then | ||||
|     retry yum install -q -y zip openssl | ||||
| elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then | ||||
|     retry yum install -q -y zip openssl | ||||
|     PLATFORM="manylinux_2_28_x86_64" | ||||
| elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then | ||||
| @ -34,9 +36,6 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then | ||||
|  | ||||
|     retry apt-get update | ||||
|     retry apt-get -y install zip openssl | ||||
| else | ||||
|     echo "Unknown OS: '$OS_NAME'" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| # We use the package name to test the package by passing this to 'pip install' | ||||
| @ -80,6 +79,8 @@ if [[ -e /opt/openssl ]]; then | ||||
|     export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH | ||||
| fi | ||||
|  | ||||
|  | ||||
|  | ||||
| mkdir -p /tmp/$WHEELHOUSE_DIR | ||||
|  | ||||
| export PATCHELF_BIN=/usr/local/bin/patchelf | ||||
| @ -110,6 +111,12 @@ case ${DESIRED_PYTHON} in | ||||
|     ;; | ||||
| esac | ||||
|  | ||||
| if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then | ||||
|     export _GLIBCXX_USE_CXX11_ABI=1 | ||||
| else | ||||
|     export _GLIBCXX_USE_CXX11_ABI=0 | ||||
| fi | ||||
|  | ||||
| if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then | ||||
|     echo "Calling build_amd.py at $(date)" | ||||
|     python tools/amd_build/build_amd.py | ||||
| @ -202,6 +209,12 @@ if [[ -n "$BUILD_PYTHONLESS" ]]; then | ||||
|  | ||||
|     mkdir -p /tmp/$LIBTORCH_HOUSE_DIR | ||||
|  | ||||
|     if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then | ||||
|         LIBTORCH_ABI="cxx11-abi-" | ||||
|     else | ||||
|         LIBTORCH_ABI= | ||||
|     fi | ||||
|  | ||||
|     zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch | ||||
|     cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \ | ||||
|        /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip | ||||
| @ -320,8 +333,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w | ||||
|             # ROCm workaround for roctracer dlopens | ||||
|             if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then | ||||
|                 patchedpath=$(fname_without_so_number $destpath) | ||||
|             # Keep the so number for XPU dependencies and libgomp.so.1 to avoid twice load | ||||
|             elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then | ||||
|             # Keep the so number for XPU dependencies | ||||
|             elif [[ "$DESIRED_CUDA" == *"xpu"* ]]; then | ||||
|                 patchedpath=$destpath | ||||
|             else | ||||
|                 patchedpath=$(fname_with_sha256 $destpath) | ||||
|  | ||||
| @ -15,9 +15,6 @@ export INSTALL_TEST=0 # dont install test binaries into site-packages | ||||
| export USE_CUPTI_SO=0 | ||||
| export USE_CUSPARSELT=${USE_CUSPARSELT:-1} # Enable if not disabled by libtorch build | ||||
| export USE_CUFILE=${USE_CUFILE:-1} | ||||
| export USE_SYSTEM_NCCL=1 | ||||
| export NCCL_INCLUDE_DIR="/usr/local/cuda/include/" | ||||
| export NCCL_LIB_DIR="/usr/local/cuda/lib64/" | ||||
|  | ||||
| # Keep an array of cmake variables to add to | ||||
| if [[ -z "$CMAKE_ARGS" ]]; then | ||||
| @ -39,8 +36,10 @@ if [[ -n "$DESIRED_CUDA" ]]; then | ||||
|     if [[ ${DESIRED_CUDA} =~ ^[0-9]+\.[0-9]+$ ]]; then | ||||
|         CUDA_VERSION=${DESIRED_CUDA} | ||||
|     else | ||||
|         # cu126, cu128 etc... | ||||
|         if [[ ${#DESIRED_CUDA} -eq 5 ]]; then | ||||
|         # cu90, cu92, cu100, cu101 | ||||
|         if [[ ${#DESIRED_CUDA} -eq 4 ]]; then | ||||
|             CUDA_VERSION="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3:1}" | ||||
|         elif [[ ${#DESIRED_CUDA} -eq 5 ]]; then | ||||
|             CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}" | ||||
|         fi | ||||
|     fi | ||||
| @ -54,14 +53,22 @@ cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.') | ||||
|  | ||||
| TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6" | ||||
| case ${CUDA_VERSION} in | ||||
|     12.8|12.9) | ||||
|         TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" #removing sm_50-sm_70 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases | ||||
|     12.8) | ||||
|         TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" #removing sm_50-sm_70 as these architectures are deprecated in CUDA 12.8 and will be removed in future releases | ||||
|         EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON") | ||||
|         ;; | ||||
|     12.6) | ||||
|         TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0" | ||||
|         EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON") | ||||
|         ;; | ||||
|     12.4) | ||||
|         TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0" | ||||
|         EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON") | ||||
|         ;; | ||||
|     11.8) | ||||
|         TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};3.7;9.0" | ||||
|         EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON") | ||||
|         ;; | ||||
|     *) | ||||
|         echo "unknown cuda version $CUDA_VERSION" | ||||
|         exit 1 | ||||
| @ -84,15 +91,14 @@ fi | ||||
| mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true | ||||
|  | ||||
| OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release) | ||||
| if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then | ||||
| if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then | ||||
|     LIBGOMP_PATH="/usr/lib64/libgomp.so.1" | ||||
| elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then | ||||
|     LIBGOMP_PATH="/usr/lib64/libgomp.so.1" | ||||
| elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then | ||||
|     LIBGOMP_PATH="/usr/lib64/libgomp.so.1" | ||||
| elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then | ||||
|     LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1" | ||||
| else | ||||
|     echo "Unknown OS: '$OS_NAME'" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| DEPS_LIST=( | ||||
| @ -102,8 +108,26 @@ DEPS_SONAME=( | ||||
|     "libgomp.so.1" | ||||
| ) | ||||
|  | ||||
| # CUDA 11.8 have to ship the libcusparseLt.so.0 with the binary | ||||
| # since nvidia-cusparselt-cu11 is not available in PYPI | ||||
| if [[ $USE_CUSPARSELT == "1" && $CUDA_VERSION == "11.8" ]]; then | ||||
|         DEPS_SONAME+=( | ||||
|             "libcusparseLt.so.0" | ||||
|         ) | ||||
|         DEPS_LIST+=( | ||||
|             "/usr/local/cuda/lib64/libcusparseLt.so.0" | ||||
|         ) | ||||
| fi | ||||
|  | ||||
| # CUDA_VERSION 12.6, 12.8 | ||||
|  | ||||
| # Turn USE_CUFILE off for CUDA 11.8, 12.4 since nvidia-cufile-cu11 and 1.9.0.20 are | ||||
| # not available in PYPI | ||||
| if [[ $CUDA_VERSION == "11.8" || $CUDA_VERSION == "12.4" ]]; then | ||||
|     export USE_CUFILE=0 | ||||
| fi | ||||
|  | ||||
|  | ||||
| # CUDA_VERSION 12.4, 12.6, 12.8 | ||||
| if [[ $CUDA_VERSION == 12* ]]; then | ||||
|     export USE_STATIC_CUDNN=0 | ||||
|     # Try parallelizing nvcc as well | ||||
| @ -127,8 +151,6 @@ if [[ $CUDA_VERSION == 12* ]]; then | ||||
|             "/usr/local/cuda/lib64/libnvToolsExt.so.1" | ||||
|             "/usr/local/cuda/lib64/libnvrtc.so.12" | ||||
|             "/usr/local/cuda/lib64/libnvrtc-builtins.so" | ||||
|             "/usr/local/cuda/lib64/libcufile.so.0" | ||||
|             "/usr/local/cuda/lib64/libcufile_rdma.so.1" | ||||
|         ) | ||||
|         DEPS_SONAME+=( | ||||
|             "libcudnn_adv.so.9" | ||||
| @ -146,8 +168,90 @@ if [[ $CUDA_VERSION == 12* ]]; then | ||||
|             "libnvToolsExt.so.1" | ||||
|             "libnvrtc.so.12" | ||||
|             "libnvrtc-builtins.so" | ||||
|             "libcufile.so.0" | ||||
|             "libcufile_rdma.so.1" | ||||
|         ) | ||||
|         if [[ $USE_CUFILE == 1 ]]; then | ||||
|             DEPS_LIST+=( | ||||
|                 "/usr/local/cuda/lib64/libcufile.so.0" | ||||
|                 "/usr/local/cuda/lib64/libcufile_rdma.so.1" | ||||
|             ) | ||||
|             DEPS_SONAME+=( | ||||
|                 "libcufile.so.0" | ||||
|                 "libcufile_rdma.so.1" | ||||
|             ) | ||||
|         fi | ||||
|     else | ||||
|         echo "Using nvidia libs from pypi." | ||||
|         CUDA_RPATHS=( | ||||
|             '$ORIGIN/../../nvidia/cublas/lib' | ||||
|             '$ORIGIN/../../nvidia/cuda_cupti/lib' | ||||
|             '$ORIGIN/../../nvidia/cuda_nvrtc/lib' | ||||
|             '$ORIGIN/../../nvidia/cuda_runtime/lib' | ||||
|             '$ORIGIN/../../nvidia/cudnn/lib' | ||||
|             '$ORIGIN/../../nvidia/cufft/lib' | ||||
|             '$ORIGIN/../../nvidia/curand/lib' | ||||
|             '$ORIGIN/../../nvidia/cusolver/lib' | ||||
|             '$ORIGIN/../../nvidia/cusparse/lib' | ||||
|             '$ORIGIN/../../cusparselt/lib' | ||||
|             '$ORIGIN/../../nvidia/nccl/lib' | ||||
|             '$ORIGIN/../../nvidia/nvtx/lib' | ||||
|         ) | ||||
|         if [[ $USE_CUFILE == 1 ]]; then | ||||
|             CUDA_RPATHS+=( | ||||
|                 '$ORIGIN/../../nvidia/cufile/lib' | ||||
|             ) | ||||
|         fi | ||||
|         CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}") | ||||
|         export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib' | ||||
|         export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN' | ||||
|         export FORCE_RPATH="--force-rpath" | ||||
|         export USE_STATIC_NCCL=0 | ||||
|         export USE_SYSTEM_NCCL=1 | ||||
|         export ATEN_STATIC_CUDA=0 | ||||
|         export USE_CUDA_STATIC_LINK=0 | ||||
|         export USE_CUPTI_SO=1 | ||||
|         export NCCL_INCLUDE_DIR="/usr/local/cuda/include/" | ||||
|         export NCCL_LIB_DIR="/usr/local/cuda/lib64/" | ||||
|     fi | ||||
| elif [[ $CUDA_VERSION == "11.8" ]]; then | ||||
|     export USE_STATIC_CUDNN=0 | ||||
|     # Try parallelizing nvcc as well | ||||
|     export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2" | ||||
|     # Bundle ptxas into the wheel, see https://github.com/pytorch/pytorch/pull/119750 | ||||
|     export BUILD_BUNDLE_PTXAS=1 | ||||
|  | ||||
|     if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then | ||||
|         echo "Bundling with cudnn and cublas." | ||||
|         DEPS_LIST+=( | ||||
|             "/usr/local/cuda/lib64/libcudnn_adv.so.9" | ||||
|             "/usr/local/cuda/lib64/libcudnn_cnn.so.9" | ||||
|             "/usr/local/cuda/lib64/libcudnn_graph.so.9" | ||||
|             "/usr/local/cuda/lib64/libcudnn_ops.so.9" | ||||
|             "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9" | ||||
|             "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9" | ||||
|             "/usr/local/cuda/lib64/libcudnn_heuristic.so.9" | ||||
|             "/usr/local/cuda/lib64/libcudnn.so.9" | ||||
|             "/usr/local/cuda/lib64/libcublas.so.11" | ||||
|             "/usr/local/cuda/lib64/libcublasLt.so.11" | ||||
|             "/usr/local/cuda/lib64/libcudart.so.11.0" | ||||
|             "/usr/local/cuda/lib64/libnvToolsExt.so.1" | ||||
|             "/usr/local/cuda/lib64/libnvrtc.so.11.2"    # this is not a mistake, it links to more specific cuda version | ||||
|             "/usr/local/cuda/lib64/libnvrtc-builtins.so.11.8" | ||||
|         ) | ||||
|         DEPS_SONAME+=( | ||||
|             "libcudnn_adv.so.9" | ||||
|             "libcudnn_cnn.so.9" | ||||
|             "libcudnn_graph.so.9" | ||||
|             "libcudnn_ops.so.9" | ||||
|             "libcudnn_engines_runtime_compiled.so.9" | ||||
|             "libcudnn_engines_precompiled.so.9" | ||||
|             "libcudnn_heuristic.so.9" | ||||
|             "libcudnn.so.9" | ||||
|             "libcublas.so.11" | ||||
|             "libcublasLt.so.11" | ||||
|             "libcudart.so.11.0" | ||||
|             "libnvToolsExt.so.1" | ||||
|             "libnvrtc.so.11.2" | ||||
|             "libnvrtc-builtins.so.11.8" | ||||
|         ) | ||||
|     else | ||||
|         echo "Using nvidia libs from pypi." | ||||
| @ -161,21 +265,20 @@ if [[ $CUDA_VERSION == 12* ]]; then | ||||
|             '$ORIGIN/../../nvidia/curand/lib' | ||||
|             '$ORIGIN/../../nvidia/cusolver/lib' | ||||
|             '$ORIGIN/../../nvidia/cusparse/lib' | ||||
|             '$ORIGIN/../../nvidia/cusparselt/lib' | ||||
|             '$ORIGIN/../../cusparselt/lib' | ||||
|             '$ORIGIN/../../nvidia/nccl/lib' | ||||
|             '$ORIGIN/../../nvidia/nvshmem/lib' | ||||
|             '$ORIGIN/../../nvidia/nvtx/lib' | ||||
|             '$ORIGIN/../../nvidia/cufile/lib' | ||||
|         ) | ||||
|         CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}") | ||||
|         export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib' | ||||
|         export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN' | ||||
|         export FORCE_RPATH="--force-rpath" | ||||
|         export USE_STATIC_NCCL=0 | ||||
|         export USE_SYSTEM_NCCL=1 | ||||
|         export ATEN_STATIC_CUDA=0 | ||||
|         export USE_CUDA_STATIC_LINK=0 | ||||
|         export USE_CUPTI_SO=1 | ||||
|         export NCCL_INCLUDE_DIR="/usr/local/cuda/include/" | ||||
|         export NCCL_LIB_DIR="/usr/local/cuda/lib64/" | ||||
|     fi | ||||
| else | ||||
|     echo "Unknown cuda version $CUDA_VERSION" | ||||
|  | ||||
| @ -22,7 +22,9 @@ retry () { | ||||
|  | ||||
| # TODO move this into the Docker images | ||||
| OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release` | ||||
| if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then | ||||
| if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then | ||||
|     retry yum install -q -y zip openssl | ||||
| elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then | ||||
|     retry yum install -q -y zip openssl | ||||
| elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then | ||||
|     retry dnf install -q -y zip openssl | ||||
| @ -33,9 +35,6 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then | ||||
|     sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list") | ||||
|     retry apt-get update | ||||
|     retry apt-get -y install zip openssl | ||||
| else | ||||
|     echo "Unknown OS: '$OS_NAME'" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| # Version: setup.py uses $PYTORCH_BUILD_VERSION.post$PYTORCH_BUILD_NUMBER if | ||||
| @ -96,6 +95,12 @@ python setup.py clean | ||||
| retry pip install -qr requirements.txt | ||||
| retry pip install -q numpy==2.0.1 | ||||
|  | ||||
| if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then | ||||
|     export _GLIBCXX_USE_CXX11_ABI=1 | ||||
| else | ||||
|     export _GLIBCXX_USE_CXX11_ABI=0 | ||||
| fi | ||||
|  | ||||
| if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then | ||||
|     echo "Calling build_amd.py at $(date)" | ||||
|     python tools/amd_build/build_amd.py | ||||
| @ -164,6 +169,12 @@ fi | ||||
|  | ||||
| ) | ||||
|  | ||||
| if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then | ||||
|     LIBTORCH_ABI="cxx11-abi-" | ||||
| else | ||||
|     LIBTORCH_ABI= | ||||
| fi | ||||
|  | ||||
| ( | ||||
|     set -x | ||||
|  | ||||
|  | ||||
| @ -95,7 +95,6 @@ ROCM_SO_FILES=( | ||||
|     "libroctracer64.so" | ||||
|     "libroctx64.so" | ||||
|     "libhipblaslt.so" | ||||
|     "libhipsparselt.so" | ||||
|     "libhiprtc.so" | ||||
| ) | ||||
|  | ||||
| @ -187,28 +186,20 @@ do | ||||
|     OS_SO_FILES[${#OS_SO_FILES[@]}]=$file_name # Append lib to array | ||||
| done | ||||
|  | ||||
| ARCH=$(echo $PYTORCH_ROCM_ARCH | sed 's/;/|/g') # Replace ; seperated arch list to bar for grep | ||||
|  | ||||
| # rocBLAS library files | ||||
| ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library | ||||
| ROCBLAS_LIB_DST=lib/rocblas/library | ||||
| ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH) | ||||
| ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx) | ||||
| ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES) | ||||
| ARCH=$(echo $PYTORCH_ROCM_ARCH | sed 's/;/|/g') # Replace ; seperated arch list to bar for grep | ||||
| ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH) | ||||
| OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx) | ||||
| ROCBLAS_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES) | ||||
|  | ||||
| # hipblaslt library files | ||||
| HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library | ||||
| HIPBLASLT_LIB_DST=lib/hipblaslt/library | ||||
| HIPBLASLT_ARCH_SPECIFIC_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -E $ARCH) | ||||
| HIPBLASLT_OTHER_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -v gfx) | ||||
| HIPBLASLT_LIB_FILES=($HIPBLASLT_ARCH_SPECIFIC_FILES $HIPBLASLT_OTHER_FILES) | ||||
|  | ||||
| # hipsparselt library files | ||||
| HIPSPARSELT_LIB_SRC=$ROCM_HOME/lib/hipsparselt/library | ||||
| HIPSPARSELT_LIB_DST=lib/hipsparselt/library | ||||
| HIPSPARSELT_ARCH_SPECIFIC_FILES=$(ls $HIPSPARSELT_LIB_SRC | grep -E $ARCH) | ||||
| #HIPSPARSELT_OTHER_FILES=$(ls $HIPSPARSELT_LIB_SRC | grep -v gfx) | ||||
| HIPSPARSELT_LIB_FILES=($HIPSPARSELT_ARCH_SPECIFIC_FILES $HIPSPARSELT_OTHER_FILES) | ||||
| ARCH_SPECIFIC_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -E $ARCH) | ||||
| OTHER_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -v gfx) | ||||
| HIPBLASLT_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES) | ||||
|  | ||||
| # ROCm library files | ||||
| ROCM_SO_PATHS=() | ||||
| @ -243,14 +234,12 @@ DEPS_SONAME=( | ||||
| DEPS_AUX_SRCLIST=( | ||||
|     "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_SRC/}" | ||||
|     "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_SRC/}" | ||||
|     "${HIPSPARSELT_LIB_FILES[@]/#/$HIPSPARSELT_LIB_SRC/}" | ||||
|     "/opt/amdgpu/share/libdrm/amdgpu.ids" | ||||
| ) | ||||
|  | ||||
| DEPS_AUX_DSTLIST=( | ||||
|     "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_DST/}" | ||||
|     "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_DST/}" | ||||
|     "${HIPSPARSELT_LIB_FILES[@]/#/$HIPSPARSELT_LIB_DST/}" | ||||
|     "share/libdrm/amdgpu.ids" | ||||
| ) | ||||
|  | ||||
|  | ||||
| @ -20,11 +20,7 @@ fi | ||||
| source /opt/intel/oneapi/compiler/latest/env/vars.sh | ||||
| source /opt/intel/oneapi/pti/latest/env/vars.sh | ||||
| source /opt/intel/oneapi/umf/latest/env/vars.sh | ||||
| source /opt/intel/oneapi/ccl/latest/env/vars.sh | ||||
| source /opt/intel/oneapi/mpi/latest/env/vars.sh | ||||
| export USE_STATIC_MKL=1 | ||||
| export USE_ONEMKL=1 | ||||
| export USE_XCCL=1 | ||||
|  | ||||
| WHEELHOUSE_DIR="wheelhousexpu" | ||||
| LIBTORCH_HOUSE_DIR="libtorch_housexpu" | ||||
|  | ||||
| @ -10,3 +10,5 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are | ||||
| built on Jenkins and are used in triggered builds already have this | ||||
| environment variable set in their manifest. Also see | ||||
| `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`. | ||||
|  | ||||
| Our Jenkins installation is located at https://ci.pytorch.org/jenkins/. | ||||
|  | ||||
| @ -27,12 +27,6 @@ cmake --version | ||||
| echo "Environment variables:" | ||||
| env | ||||
|  | ||||
| # The sccache wrapped version of nvcc gets put in /opt/cache/lib in docker since | ||||
| # there are some issues if it is always wrapped, so we need to add it to PATH | ||||
| # during CI builds. | ||||
| # https://github.com/pytorch/pytorch/blob/0b6c0898e6c352c8ea93daec854e704b41485375/.ci/docker/common/install_cache.sh#L97 | ||||
| export PATH="/opt/cache/lib:$PATH" | ||||
|  | ||||
| if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then | ||||
|   # Use jemalloc during compilation to mitigate https://github.com/pytorch/pytorch/issues/116289 | ||||
|   export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 | ||||
| @ -41,7 +35,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then | ||||
| fi | ||||
|  | ||||
| if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then | ||||
|   if [[ "$BUILD_ENVIRONMENT" != *clang* ]]; then | ||||
|   if [[ "$BUILD_ENVIRONMENT" != *cuda11.3* && "$BUILD_ENVIRONMENT" != *clang* ]]; then | ||||
|     # TODO: there is a linking issue when building with UCC using clang, | ||||
|     # disable it for now and to be fix later. | ||||
|     # TODO: disable UCC temporarily to enable CUDA 12.1 in CI | ||||
| @ -58,6 +52,12 @@ fi | ||||
| export USE_LLVM=/opt/llvm | ||||
| export LLVM_DIR=/opt/llvm/lib/cmake/llvm | ||||
|  | ||||
| if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then | ||||
|   # To build test_edge_op_registration | ||||
|   export BUILD_EXECUTORCH=ON | ||||
|   export USE_CUDA=0 | ||||
| fi | ||||
|  | ||||
| if ! which conda; then | ||||
|   # In ROCm CIs, we are doing cross compilation on build machines with | ||||
|   # intel cpu and later run tests on machines with amd cpu. | ||||
| @ -171,12 +171,6 @@ fi | ||||
| if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then | ||||
|   # shellcheck disable=SC1091 | ||||
|   source /opt/intel/oneapi/compiler/latest/env/vars.sh | ||||
|   # shellcheck disable=SC1091 | ||||
|   source /opt/intel/oneapi/ccl/latest/env/vars.sh | ||||
|   # shellcheck disable=SC1091 | ||||
|   source /opt/intel/oneapi/mpi/latest/env/vars.sh | ||||
|   # Enable XCCL build | ||||
|   export USE_XCCL=1 | ||||
|   # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA | ||||
|   export USE_KINETO=0 | ||||
|   export TORCH_XPU_ARCH_LIST=pvc | ||||
| @ -283,8 +277,10 @@ else | ||||
|     # or building non-XLA tests. | ||||
|     if [[ "$BUILD_ENVIRONMENT" != *rocm*  && | ||||
|           "$BUILD_ENVIRONMENT" != *xla* ]]; then | ||||
|       # Install numpy-2.0.2 for builds which are backward compatible with 1.X | ||||
|       python -mpip install numpy==2.0.2 | ||||
|       if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then | ||||
|         # Install numpy-2.0.2 for builds which are backward compatible with 1.X | ||||
|         python -mpip install numpy==2.0.2 | ||||
|       fi | ||||
|  | ||||
|       WERROR=1 python setup.py clean | ||||
|  | ||||
| @ -307,18 +303,6 @@ else | ||||
|     fi | ||||
|     pip_install_whl "$(echo dist/*.whl)" | ||||
|  | ||||
|     if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then | ||||
|       echo "Checking that xpu is compiled" | ||||
|       pushd dist/ | ||||
|       if python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)'; then | ||||
|         echo "XPU support is compiled in." | ||||
|       else | ||||
|         echo "XPU support is NOT compiled in." | ||||
|         exit 1 | ||||
|       fi | ||||
|       popd | ||||
|     fi | ||||
|  | ||||
|     # TODO: I'm not sure why, but somehow we lose verbose commands | ||||
|     set -x | ||||
|  | ||||
|  | ||||
| @ -59,16 +59,78 @@ else | ||||
|   export install_root="$(dirname $(which python))/../lib/python${py_dot}/site-packages/torch/" | ||||
| fi | ||||
|  | ||||
| ############################################################################### | ||||
| # Setup XPU ENV | ||||
| ############################################################################### | ||||
| if [[ "$DESIRED_CUDA" == 'xpu' ]]; then | ||||
|   set +u | ||||
|   # Refer https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html | ||||
|   source /opt/intel/oneapi/compiler/latest/env/vars.sh | ||||
|   source /opt/intel/oneapi/pti/latest/env/vars.sh | ||||
| fi | ||||
|  | ||||
| ############################################################################### | ||||
| # Check GCC ABI | ||||
| ############################################################################### | ||||
|  | ||||
| # NOTE: As of https://github.com/pytorch/pytorch/issues/126551 we only produce | ||||
| #       wheels with cxx11-abi | ||||
| # NOTE [ Building libtorch with old vs. new gcc ABI ] | ||||
| # | ||||
| # Packages built with one version of ABI could not be linked against by client | ||||
| # C++ libraries that were compiled using the other version of ABI. Since both | ||||
| # gcc ABIs are still common in the wild, we need to support both ABIs. Currently: | ||||
| # | ||||
| # - All the nightlies built on CentOS 7 + devtoolset7 use the old gcc ABI. | ||||
| # - All the nightlies built on Ubuntu 16.04 + gcc 5.4 use the new gcc ABI. | ||||
|  | ||||
| echo "Checking that the gcc ABI is what we expect" | ||||
| if [[ "$(uname)" != 'Darwin' ]]; then | ||||
|   # We also check that there are cxx11 symbols in libtorch | ||||
|   function is_expected() { | ||||
|     if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* || "$DESIRED_CUDA" == *"rocm"* ]]; then | ||||
|       if [[ "$1" -gt 0 || "$1" == "ON " ]]; then | ||||
|         echo 1 | ||||
|       fi | ||||
|     else | ||||
|       if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then | ||||
|         echo 1 | ||||
|       fi | ||||
|     fi | ||||
|   } | ||||
|  | ||||
|   # First we check that the env var in TorchConfig.cmake is correct | ||||
|  | ||||
|   # We search for D_GLIBCXX_USE_CXX11_ABI=1 in torch/TorchConfig.cmake | ||||
|   torch_config="${install_root}/share/cmake/Torch/TorchConfig.cmake" | ||||
|   if [[ ! -f "$torch_config" ]]; then | ||||
|     echo "No TorchConfig.cmake found!" | ||||
|     ls -lah "$install_root/share/cmake/Torch" | ||||
|     exit 1 | ||||
|   fi | ||||
|   echo "Checking the TorchConfig.cmake" | ||||
|   cat "$torch_config" | ||||
|  | ||||
|   # The sed call below is | ||||
|   #   don't print lines by default (only print the line we want) | ||||
|   # -n | ||||
|   #   execute the following expression | ||||
|   # e | ||||
|   #   replace lines that match with the first capture group and print | ||||
|   # s/.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/\1/p | ||||
|   #   any characters, D_GLIBCXX_USE_CXX11_ABI=, exactly one any character, a | ||||
|   #   quote, any characters | ||||
|   #   Note the exactly one single character after the '='. In the case that the | ||||
|   #     variable is not set the '=' will be followed by a '"' immediately and the | ||||
|   #     line will fail the match and nothing will be printed; this is what we | ||||
|   #     want.  Otherwise it will capture the 0 or 1 after the '='. | ||||
|   # /.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/ | ||||
|   #   replace the matched line with the capture group and print | ||||
|   # /\1/p | ||||
|   actual_gcc_abi="$(sed -ne 's/.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/\1/p' < "$torch_config")" | ||||
|   if [[ "$(is_expected "$actual_gcc_abi")" != 1 ]]; then | ||||
|     echo "gcc ABI $actual_gcc_abi not as expected." | ||||
|     exit 1 | ||||
|   fi | ||||
|  | ||||
|   # We also check that there are [not] cxx11 symbols in libtorch | ||||
|   # | ||||
|   echo "Checking that symbols in libtorch.so have the right gcc abi" | ||||
|   python3 "$(dirname ${BASH_SOURCE[0]})/smoke_test/check_binary_symbols.py" | ||||
| @ -146,11 +208,35 @@ setup_link_flags () { | ||||
|  | ||||
| TEST_CODE_DIR="$(dirname $(realpath ${BASH_SOURCE[0]}))/test_example_code" | ||||
| build_and_run_example_cpp () { | ||||
|   if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then | ||||
|     GLIBCXX_USE_CXX11_ABI=1 | ||||
|   else | ||||
|     GLIBCXX_USE_CXX11_ABI=0 | ||||
|   fi | ||||
|   setup_link_flags | ||||
|   g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1 | ||||
|   g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1 | ||||
|   ./$1 | ||||
| } | ||||
|  | ||||
| build_example_cpp_with_incorrect_abi () { | ||||
|   if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then | ||||
|     GLIBCXX_USE_CXX11_ABI=0 | ||||
|   else | ||||
|     GLIBCXX_USE_CXX11_ABI=1 | ||||
|   fi | ||||
|   set +e | ||||
|   setup_link_flags | ||||
|   g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1 | ||||
|   ERRCODE=$? | ||||
|   set -e | ||||
|   if [ "$ERRCODE" -eq "0" ]; then | ||||
|     echo "Building example with incorrect ABI didn't throw error. Aborting." | ||||
|     exit 1 | ||||
|   else | ||||
|     echo "Building example with incorrect ABI throws expected error. Proceeding." | ||||
|   fi | ||||
| } | ||||
|  | ||||
| ############################################################################### | ||||
| # Check simple Python/C++ calls | ||||
| ############################################################################### | ||||
| @ -160,6 +246,11 @@ if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then | ||||
|     export LD_LIBRARY_PATH=/usr/local/cuda/lib64 | ||||
|   fi | ||||
|   build_and_run_example_cpp simple-torch-test | ||||
|   # `_GLIBCXX_USE_CXX11_ABI` is always ignored by gcc in devtoolset7, so we test | ||||
|   # the expected failure case for Ubuntu 16.04 + gcc 5.4 only. | ||||
|   if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then | ||||
|     build_example_cpp_with_incorrect_abi simple-torch-test | ||||
|   fi | ||||
| else | ||||
|   pushd /tmp | ||||
|   python -c 'import torch' | ||||
| @ -216,14 +307,6 @@ else | ||||
|   fi | ||||
| fi | ||||
|  | ||||
| ############################################################################### | ||||
| # Check XPU configured correctly | ||||
| ############################################################################### | ||||
| if [[ "$DESIRED_CUDA" == 'xpu' && "$PACKAGE_TYPE" != 'libtorch' ]]; then | ||||
|   echo "Checking that xpu is compiled" | ||||
|   python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)' | ||||
| fi | ||||
|  | ||||
| ############################################################################### | ||||
| # Check CUDA configured correctly | ||||
| ############################################################################### | ||||
| @ -302,22 +385,10 @@ except RuntimeError as e: | ||||
| fi | ||||
|  | ||||
| ############################################################################### | ||||
| # Check for C++ ABI compatibility to GCC-11 - GCC 13 | ||||
| # Check for C++ ABI compatibility between gcc7 and gcc9 compiled binaries | ||||
| ############################################################################### | ||||
| if [[ "$(uname)" == 'Linux' &&  "$PACKAGE_TYPE" == 'manywheel' ]]; then | ||||
|   pushd /tmp | ||||
|   # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html | ||||
|   # gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19 | ||||
|   # gcc 11 - CUDA 11.8, xpu, rocm | ||||
|   # gcc 13 - CUDA 12.6, 12.8 and cpu | ||||
|   # Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426 | ||||
|   if [[ "$(uname -m)" == "s390x" ]]; then | ||||
|     cxx_abi="19" | ||||
|   elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then | ||||
|     cxx_abi="18" | ||||
|   else | ||||
|     cxx_abi="16" | ||||
|   fi | ||||
|   python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)" | ||||
|   python -c "import torch; exit(0 if torch.compiled_with_cxx11_abi() else (0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi1011' else 1))" | ||||
|   popd | ||||
| fi | ||||
|  | ||||
| @ -13,6 +13,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then | ||||
|   # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors | ||||
|   unset HIP_PLATFORM | ||||
|   export PYTORCH_TEST_WITH_ROCM=1 | ||||
|   # temporary to locate some kernel issues on the CI nodes | ||||
|   export HSAKMT_DEBUG_LEVEL=4 | ||||
|   # improve rccl performance for distributed tests | ||||
|   export HSA_FORCE_FINE_GRAIN_PCIE=1 | ||||
| fi | ||||
|  | ||||
| # TODO: Renable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598 | ||||
|  | ||||
| @ -159,6 +159,11 @@ function install_torchvision() { | ||||
|   fi | ||||
| } | ||||
|  | ||||
| function install_tlparse() { | ||||
|   pip_install --user "tlparse==0.3.30" | ||||
|   PATH="$(python -m site --user-base)/bin:$PATH" | ||||
| } | ||||
|  | ||||
| function install_torchrec_and_fbgemm() { | ||||
|   local torchrec_commit | ||||
|   torchrec_commit=$(get_pinned_commit torchrec) | ||||
| @ -197,7 +202,7 @@ function install_torchrec_and_fbgemm() { | ||||
|  | ||||
| function clone_pytorch_xla() { | ||||
|   if [[ ! -d ./xla ]]; then | ||||
|     git clone --recursive --quiet https://github.com/pytorch/xla.git | ||||
|     git clone --recursive -b r2.7 https://github.com/pytorch/xla.git | ||||
|     pushd xla | ||||
|     # pin the xla hash so that we don't get broken by changes to xla | ||||
|     git checkout "$(cat ../.github/ci_commit_pins/xla.txt)" | ||||
|  | ||||
| @ -1,50 +1,31 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # Script for installing sccache on the xla build job, which uses xla's docker | ||||
| # image, which has sccache installed but doesn't write the stubs.  This is | ||||
| # mostly copied from .ci/docker/install_cache.sh.  Changes are: removing checks | ||||
| # that will always return the same thing, ex checks for for rocm, CUDA, changing | ||||
| # the path where sccache is installed, not changing /etc/environment, and not | ||||
| # installing/downloading sccache as it is already in the docker image. | ||||
| # image and doesn't have sccache installed on it.  This is mostly copied from | ||||
| # .ci/docker/install_cache.sh.  Changes are: removing checks that will always | ||||
| # return the same thing, ex checks for for rocm, CUDA, and changing the path | ||||
| # where sccache is installed, and not changing /etc/environment. | ||||
|  | ||||
| set -ex -o pipefail | ||||
|  | ||||
| install_binary() { | ||||
|   echo "Downloading sccache binary from S3 repo" | ||||
|   curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /tmp/cache/bin/sccache | ||||
| } | ||||
|  | ||||
| mkdir -p /tmp/cache/bin | ||||
| mkdir -p /tmp/cache/lib | ||||
| export PATH="/tmp/cache/bin:$PATH" | ||||
|  | ||||
| install_binary | ||||
| chmod a+x /tmp/cache/bin/sccache | ||||
|  | ||||
function write_sccache_stub() {
  # Write a wrapper for compiler "$1" into /tmp/cache/bin so every invocation
  # is routed through sccache — unless the parent process is already sccache,
  # which would recurse forever.
  # Unset LD_PRELOAD for ps because of asan + ps issues
  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
  if [ "$1" == "gcc" ]; then
    # Do not call sccache recursively when dumping preprocessor argument
    # For some reason it's very important for the first cached nvcc invocation
    cat >"/tmp/cache/bin/$1" <<EOF
#!/bin/sh

# sccache does not support -E flag, so we need to call the original compiler directly in order to avoid calling this wrapper recursively
for arg in "\$@"; do
  if [ "\$arg" = "-E" ]; then
    exec $(which "$1") "\$@"
  fi
done

if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
  exec sccache $(which "$1") "\$@"
else
  exec $(which "$1") "\$@"
fi
EOF
  else
    cat >"/tmp/cache/bin/$1" <<EOF
#!/bin/sh

if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
  exec sccache $(which "$1") "\$@"
else
  exec $(which "$1") "\$@"
fi
EOF
  fi
  # NOTE(review): a trailing printf here used to rewrite the stub file
  # unconditionally, discarding the gcc-specific "-E" handling written above;
  # it has been removed so the heredoc versions are what actually ships.
  chmod a+x "/tmp/cache/bin/$1"
}
|  | ||||
|  | ||||
| @ -33,15 +33,56 @@ if which sccache > /dev/null; then | ||||
|   export PATH="${tmp_dir}:$PATH" | ||||
| fi | ||||
|  | ||||
| print_cmake_info | ||||
| if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then | ||||
|   # Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls | ||||
|   USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel | ||||
| else | ||||
cross_compile_arm64() {
  # Cross compilation for arm64 (building on an x86_64 host).
  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
  # NOTE(review): a second, stale bdist_wheel line without CMAKE_OSX_ARCHITECTURES
  # was merged in here and would have built for the host arch first; removed.
  USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
}
|  | ||||
compile_arm64() {
  # Native compilation for arm64 (running on arm64 hardware).
  # TODO: Compile with OpenMP support (but this causes CI regressions as cross-compilation were done with OpenMP disabled)
  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
}
|  | ||||
compile_x86_64() {
  # Build the x86_64 macOS wheel, tagged for the 10.9 deployment target.
  USE_DISTRIBUTED=0 WERROR=1 python setup.py bdist_wheel --plat-name=macosx_10_9_x86_64
}
|  | ||||
build_lite_interpreter() {
    # Build libtorch with the lite interpreter in a sibling cpp_build dir and
    # run its runtime smoke test binary.
    echo "Testing libtorch (lite interpreter)."

    CPP_BUILD="$(pwd)/../cpp_build"
    # Ensure the removal of the tmp directory
    trap 'rm -rfv ${CPP_BUILD}' EXIT
    rm -rf "${CPP_BUILD}"
    mkdir -p "${CPP_BUILD}/caffe2"

    # It looks libtorch need to be built in "${CPP_BUILD}/caffe2 folder.
    # Capture the script path before changing directories.
    BUILD_LIBTORCH_PY=$PWD/tools/build_libtorch.py
    pushd "${CPP_BUILD}/caffe2" || exit
    VERBOSE=1 DEBUG=1 python "${BUILD_LIBTORCH_PY}"
    popd || exit

    "${CPP_BUILD}/caffe2/build/bin/test_lite_interpreter_runtime"
}
|  | ||||
print_cmake_info

# Select the build flavor from BUILD_ENVIRONMENT:
#  - *arm64*: native build when already on arm64 hardware, cross-compile otherwise
#  - *lite-interpreter*: libtorch lite-interpreter build
#  - default: x86_64 wheel
if [[ ${BUILD_ENVIRONMENT} = *arm64* ]]; then
  if [[ $(uname -m) == "arm64" ]]; then
    compile_arm64
  else
    cross_compile_arm64
  fi
elif [[ ${BUILD_ENVIRONMENT} = *lite-interpreter* ]]; then
  export BUILD_LITE_INTERPRETER=1
  build_lite_interpreter
else
  compile_x86_64
fi

# Report cache effectiveness when sccache was used for the build.
if which sccache > /dev/null; then
  print_sccache_stats
fi
|  | ||||
| @ -20,4 +20,14 @@ print_cmake_info() { | ||||
|   CONDA_INSTALLATION_DIR=$(dirname "$CMAKE_EXEC") | ||||
|   # Print all libraries under cmake rpath for debugging | ||||
|   ls -la "$CONDA_INSTALLATION_DIR/../lib" | ||||
|  | ||||
|   export CMAKE_EXEC | ||||
|   # Explicitly add conda env lib folder to cmake rpath to address the flaky issue | ||||
|   # where cmake dependencies couldn't be found. This seems to point to how conda | ||||
|   # links $CMAKE_EXEC to its package cache when cloning a new environment | ||||
|   install_name_tool -add_rpath @executable_path/../lib "${CMAKE_EXEC}" || true | ||||
|   # Adding the rpath will invalidate cmake signature, so signing it again here | ||||
|   # to trust the executable. EXC_BAD_ACCESS (SIGKILL (Code Signature Invalid)) | ||||
|   # with an exit code 137 otherwise | ||||
|   codesign -f -s - "${CMAKE_EXEC}" || true | ||||
| } | ||||
|  | ||||
| @ -42,16 +42,6 @@ test_python_all() { | ||||
|   assert_git_not_dirty | ||||
| } | ||||
|  | ||||
test_python_mps() {
  # Run the Python test suite against the MPS backend, plus the metal-capture
  # smoke test (which needs MTL_CAPTURE_ENABLED=1 to record a GPU trace).
  # setup_test_python / assert_git_not_dirty are defined elsewhere in this file.
  setup_test_python

  time python test/run_test.py --verbose --mps
  MTL_CAPTURE_ENABLED=1 ${CONDA_RUN} python3 test/test_mps.py --verbose -k test_metal_capture

  assert_git_not_dirty
}
|  | ||||
|  | ||||
| test_python_shard() { | ||||
|   if [[ -z "$NUM_TEST_SHARDS" ]]; then | ||||
|     echo "NUM_TEST_SHARDS must be defined to run a Python test shard" | ||||
| @ -165,7 +155,6 @@ test_jit_hooks() { | ||||
| torchbench_setup_macos() { | ||||
|   git clone --recursive https://github.com/pytorch/vision torchvision | ||||
|   git clone --recursive https://github.com/pytorch/audio torchaudio | ||||
|   brew install jpeg-turbo libpng | ||||
|  | ||||
|   pushd torchvision | ||||
|   git fetch | ||||
| @ -180,8 +169,7 @@ torchbench_setup_macos() { | ||||
|   git checkout "$(cat ../.github/ci_commit_pins/audio.txt)" | ||||
|   git submodule update --init --recursive | ||||
|   python setup.py clean | ||||
|   #TODO: Remove me, when figure out how to make TorchAudio find brew installed openmp | ||||
|   USE_OPENMP=0 python setup.py develop | ||||
|   python setup.py develop | ||||
|   popd | ||||
|  | ||||
|   # Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120 | ||||
| @ -189,8 +177,9 @@ torchbench_setup_macos() { | ||||
|   checkout_install_torchbench | ||||
| } | ||||
|  | ||||
| pip_benchmark_deps() { | ||||
|   python -mpip install --no-input astunparse requests cython scikit-learn | ||||
| conda_benchmark_deps() { | ||||
|   conda install -y astunparse numpy scipy ninja pyyaml setuptools cmake typing-extensions requests protobuf numba cython scikit-learn | ||||
|   conda install -y -c conda-forge librosa | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -198,7 +187,7 @@ test_torchbench_perf() { | ||||
|   print_cmake_info | ||||
|  | ||||
|   echo "Launching torchbench setup" | ||||
|   pip_benchmark_deps | ||||
|   conda_benchmark_deps | ||||
|   torchbench_setup_macos | ||||
|  | ||||
|   TEST_REPORTS_DIR=$(pwd)/test/test-reports | ||||
| @ -225,61 +214,32 @@ test_torchbench_smoketest() { | ||||
|   print_cmake_info | ||||
|  | ||||
|   echo "Launching torchbench setup" | ||||
|   pip_benchmark_deps | ||||
|   conda_benchmark_deps | ||||
|   # shellcheck disable=SC2119,SC2120 | ||||
|   torchbench_setup_macos | ||||
|  | ||||
|   TEST_REPORTS_DIR=$(pwd)/test/test-reports | ||||
|   mkdir -p "$TEST_REPORTS_DIR" | ||||
|  | ||||
|   local backend=eager | ||||
|   local dtype=notset | ||||
|   local device=mps | ||||
|   local dtypes=(undefined float16 bfloat16 notset) | ||||
|   local dtype=${dtypes[$1]} | ||||
|   local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16) | ||||
|  | ||||
|   for backend in eager inductor; do | ||||
|   touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv" | ||||
|   touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" | ||||
|  | ||||
|     echo "Launching torchbench inference performance run for backend ${backend} and dtype ${dtype}" | ||||
|     local dtype_arg="--${dtype}" | ||||
|     if [ "$dtype" == notset ]; then | ||||
|         dtype_arg="--float32" | ||||
|     fi | ||||
|     touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" | ||||
|     for model in "${models[@]}"; do | ||||
|       PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ | ||||
|         --performance --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \ | ||||
|         --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" || true | ||||
|       if [ "$backend" == "inductor" ]; then | ||||
|         PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ | ||||
|           --accuracy --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \ | ||||
|           --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_accuracy.csv" || true | ||||
|       fi | ||||
|     done | ||||
|     if [ "$backend" == "inductor" ]; then | ||||
|       PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \ | ||||
|         --performance --backend "$backend" --inference --devices "$device" "$dtype_arg" \ | ||||
|         --output "$TEST_REPORTS_DIR/inductor_${backend}_huggingface_${dtype}_inference_${device}_performance.csv" || true | ||||
|       PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \ | ||||
|         --accuracy --backend "$backend" --inference --devices "$device" "$dtype_arg" \ | ||||
|         --output "$TEST_REPORTS_DIR/inductor_${backend}_huggingface_${dtype}_inference_${device}_accuracy.csv" || true | ||||
|     fi | ||||
|  | ||||
|     if [ "$dtype" == notset ]; then | ||||
|       for dtype_ in notset amp; do | ||||
|         echo "Launching torchbench training performance run for backend ${backend} and dtype ${dtype_}" | ||||
|         touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype_}_training_${device}_performance.csv" | ||||
|         local dtype_arg="--${dtype_}" | ||||
|         if [ "$dtype_" == notset ]; then | ||||
|           dtype_arg="--float32" | ||||
|         fi | ||||
|         for model in "${models[@]}"; do | ||||
|           PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ | ||||
|             --performance --only "$model" --backend "$backend" --training --devices "$device" "$dtype_arg" \ | ||||
|             --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype_}_training_${device}_performance.csv" || true | ||||
|         done | ||||
|       done | ||||
|     fi | ||||
|   echo "Setup complete, launching torchbench training performance run" | ||||
|   for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do | ||||
|     PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ | ||||
|       --performance --only "$model" --backend "$backend" --training --devices "$device" \ | ||||
|       --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv" | ||||
|   done | ||||
|  | ||||
|   echo "Launching torchbench inference performance run" | ||||
|   for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do | ||||
|     PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \ | ||||
|       --performance --only "$model" --backend "$backend" --inference --devices "$device" \ | ||||
|       --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" | ||||
|   done | ||||
|  | ||||
|   echo "Pytorch benchmark on mps device completed" | ||||
| @ -289,7 +249,7 @@ test_hf_perf() { | ||||
|   print_cmake_info | ||||
|   TEST_REPORTS_DIR=$(pwd)/test/test-reports | ||||
|   mkdir -p "$TEST_REPORTS_DIR" | ||||
|   pip_benchmark_deps | ||||
|   conda_benchmark_deps | ||||
|   torchbench_setup_macos | ||||
|  | ||||
|   echo "Launching HuggingFace training perf run" | ||||
| @ -305,7 +265,7 @@ test_timm_perf() { | ||||
|   print_cmake_info | ||||
|   TEST_REPORTS_DIR=$(pwd)/test/test-reports | ||||
|   mkdir -p "$TEST_REPORTS_DIR" | ||||
|   pip_benchmark_deps | ||||
|   conda_benchmark_deps | ||||
|   torchbench_setup_macos | ||||
|  | ||||
|   echo "Launching timm training perf run" | ||||
| @ -317,6 +277,8 @@ test_timm_perf() { | ||||
|   echo "timm benchmark on mps device completed" | ||||
| } | ||||
|  | ||||
| install_tlparse | ||||
|  | ||||
| if [[ $TEST_CONFIG == *"perf_all"* ]]; then | ||||
|   test_torchbench_perf | ||||
|   test_hf_perf | ||||
| @ -328,9 +290,7 @@ elif [[ $TEST_CONFIG == *"perf_hf"* ]]; then | ||||
| elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then | ||||
|   test_timm_perf | ||||
| elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then | ||||
|   test_torchbench_smoketest "${SHARD_NUMBER}" | ||||
| elif [[ $TEST_CONFIG == *"mps"* ]]; then | ||||
|   test_python_mps | ||||
|   test_torchbench_smoketest | ||||
| elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then | ||||
|   test_python_shard "${SHARD_NUMBER}" | ||||
|   if [[ "${SHARD_NUMBER}" == 1 ]]; then | ||||
|  | ||||
							
								
								
									
										22
									
								
								.ci/pytorch/perf_test/common.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										22
									
								
								.ci/pytorch/perf_test/common.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,22 @@ | ||||
| #!/bin/bash | ||||
| set -e | ||||
|  | ||||
run_test () {
  # Run "$@" inside a fresh test_tmp/ working directory, then clean it up.
  # Benchmark scripts rely on this: their ../get_stats.py paths resolve
  # relative to this temporary directory.
  rm -rf test_tmp/ && mkdir test_tmp/ && cd test_tmp/
  "$@"
  cd .. && rm -rf test_tmp/
}
|  | ||||
get_runtime_of_command () {
  # Echo the wall-clock runtime (in seconds) of running "$@".
  # TIMEFORMAT=%R makes bash's `time` keyword print only the real time.
  TIMEFORMAT=%R

  # runtime=$( { time ($@ &> /dev/null); } 2>&1 1>/dev/null)
  # Capture `time`'s stderr output while discarding the command's stdout.
  runtime=$( { time "$@"; } 2>&1 1>/dev/null)
  # If the captured text contains an error message, abort the whole script.
  if [[ $runtime == *"Error"* ]]; then
    exit 1
  fi
  # Strip a leading xtrace echo of the command (emitted under `set -x`), if any.
  runtime=${runtime#+++ $@}
  # Round-trip the value through Python to normalize it to a bare number.
  runtime=$(python -c "print($runtime)")

  echo "$runtime"
}
							
								
								
									
										91
									
								
								.ci/pytorch/perf_test/compare_with_baseline.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										91
									
								
								.ci/pytorch/perf_test/compare_with_baseline.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,91 @@ | ||||
"""Compare sampled perf-test stats against a stored baseline.

Fails (raises) when the sample mean is more than 3 sigma above the
baseline mean; optionally writes the sample back as the new baseline.
"""

import argparse
import json
import math
import sys


def parse_args(argv=None):
    """Parse CLI args: --test-name, --sample-stats (JSON string), --update."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--test-name", dest="test_name", action="store", required=True, help="test name"
    )
    parser.add_argument(
        "--sample-stats",
        dest="sample_stats",
        action="store",
        required=True,
        help="stats from sample",
    )
    parser.add_argument(
        "--update",
        action="store_true",
        help="whether to update baseline using stats from sample",
    )
    return parser.parse_args(argv)


def infer_backend(test_name):
    """Map a test name containing 'cpu' or 'gpu' to its backend string.

    Raises ValueError for names matching neither — previously `backend`
    was simply left unbound, producing a confusing NameError later.
    """
    if "cpu" in test_name:
        return "cpu"
    if "gpu" in test_name:
        return "gpu"
    raise ValueError(f"test name must contain 'cpu' or 'gpu': {test_name!r}")


def load_baseline(data, test_name):
    """Return (mean, sigma) for test_name from the baseline dict.

    A missing entry yields (sys.maxsize, 0.001) so the comparison passes
    trivially. NaN handling is done by the caller *after* printing, to
    preserve the original output order.
    """
    if test_name in data:
        return float(data[test_name]["mean"]), float(data[test_name]["sigma"])
    # Let the test pass if baseline number doesn't exist
    return sys.maxsize, 0.001


def main():
    args = parse_args()
    test_name = args.test_name

    backend = infer_backend(test_name)
    # Baseline files live one directory above (relative to the run_test tmp dir).
    data_file_path = f"../{backend}_runtime.json"

    with open(data_file_path) as data_file:
        data = json.load(data_file)

    mean, sigma = load_baseline(data, test_name)

    print("population mean: ", mean)
    print("population sigma: ", sigma)

    # Let the test pass if baseline number is NaN (which happened in
    # the past when we didn't have logic for catching NaN numbers)
    if math.isnan(mean) or math.isnan(sigma):
        mean = sys.maxsize
        sigma = 0.001

    sample_stats_data = json.loads(args.sample_stats)

    sample_mean = float(sample_stats_data["mean"])
    sample_sigma = float(sample_stats_data["sigma"])

    print("sample mean: ", sample_mean)
    print("sample sigma: ", sample_sigma)

    if math.isnan(sample_mean):
        raise Exception("""Error: sample mean is NaN""")  # noqa: TRY002
    elif math.isnan(sample_sigma):
        raise Exception("""Error: sample sigma is NaN""")  # noqa: TRY002

    # z-score of the sample mean against the stored population stats.
    z_value = (sample_mean - mean) / sigma

    print("z-value: ", z_value)

    if z_value >= 3:
        raise Exception(  # noqa: TRY002
            f"""\n
z-value >= 3, there is high chance of perf regression.\n
To reproduce this regression, run
`cd .ci/pytorch/perf_test/ && bash {test_name}.sh` on your local machine
and compare the runtime before/after your code change.
"""
        )
    else:
        print("z-value < 3, no perf regression detected.")
        if args.update:
            print("We will use these numbers as new baseline.")
            new_data_file_path = f"../new_{backend}_runtime.json"
            with open(new_data_file_path) as new_data_file:
                new_data = json.load(new_data_file)
            new_data[test_name] = {
                "mean": sample_mean,
                # Floor sigma at 10% of the mean so future z-tests aren't
                # oversensitive to a lucky low-variance sample.
                "sigma": max(sample_sigma, sample_mean * 0.1),
            }
            with open(new_data_file_path, "w") as new_data_file:
                json.dump(new_data, new_data_file, indent=4)


if __name__ == "__main__":
    main()
							
								
								
									
										18
									
								
								.ci/pytorch/perf_test/get_stats.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										18
									
								
								.ci/pytorch/perf_test/get_stats.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,18 @@ | ||||
"""Print JSON mean/sigma statistics for runtime samples passed on argv."""

import json
import sys

import numpy


def compute_stats(samples):
    """Return {"mean": ..., "sigma": ...} for an iterable of runtime samples.

    Each sample may be a float or a numeric string (surrounding whitespace
    is tolerated by float()). "sigma" is the population standard deviation
    (numpy.std with its default ddof=0), matching the baseline comparator.
    """
    values = [float(v) for v in samples]
    return {
        "mean": numpy.mean(values),
        "sigma": numpy.std(values),
    }


def main():
    # Samples arrive as individual command-line arguments.
    print(json.dumps(compute_stats(sys.argv[1:])))


# Guard the entry point so importing this module (tooling, test collection)
# no longer executes it against the wrong argv.
if __name__ == "__main__":
    main()
| @ -0,0 +1,43 @@ | ||||
#!/bin/bash
set -e

. ./common.sh

# Benchmark the mini sequence labeler model on CPU.
# $1 = number of timed runs; $2 = optional "compare_with_baseline" / "compare_and_update".
test_cpu_speed_mini_sequence_labeler () {
  echo "Testing: mini sequence labeler, CPU"

  export OMP_NUM_THREADS=4
  export MKL_NUM_THREADS=4

  git clone https://github.com/pytorch/benchmark.git

  cd benchmark/

  # Pin the benchmark repo to a known commit for reproducible numbers.
  git checkout 726567a455edbfda6199445922a8cfee82535664

  cd scripts/mini_sequence_labeler

  SAMPLE_ARRAY=()
  NUM_RUNS=$1

  for (( i=1; i<=NUM_RUNS; i++ )) do
    runtime=$(get_runtime_of_command python main.py)
    SAMPLE_ARRAY+=("${runtime}")
  done

  cd ../../..

  # ../get_stats.py resolves to the perf_test dir, one level above the
  # throwaway dir that run_test (common.sh) executes this function in.
  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
  echo "Runtime stats in seconds:"
  echo "$stats"

  if [ "$2" == "compare_with_baseline" ]; then
    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
  elif [ "$2" == "compare_and_update" ]; then
    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
  fi
}

# Execute only when run directly (not when sourced).
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  run_test test_cpu_speed_mini_sequence_labeler "$@"
fi
							
								
								
									
										45
									
								
								.ci/pytorch/perf_test/test_cpu_speed_mnist.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										45
									
								
								.ci/pytorch/perf_test/test_cpu_speed_mnist.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,45 @@ | ||||
#!/bin/bash
set -e

. ./common.sh

# Benchmark MNIST training on CPU.
# $1 = number of timed runs; $2 = optional "compare_with_baseline" / "compare_and_update".
test_cpu_speed_mnist () {
  echo "Testing: MNIST, CPU"

  export OMP_NUM_THREADS=4
  export MKL_NUM_THREADS=4

  git clone https://github.com/pytorch/examples.git -b perftests

  cd examples/mnist

  conda install -c pytorch torchvision-cpu

  # Download data
  python main.py --epochs 0

  SAMPLE_ARRAY=()
  NUM_RUNS=$1

  for (( i=1; i<=NUM_RUNS; i++ )) do
    runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log)
    echo "$runtime"
    SAMPLE_ARRAY+=("${runtime}")
  done

  cd ../..

  # Scripts run inside the run_test tmp dir; ../ is the perf_test dir.
  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
  echo "Runtime stats in seconds:"
  echo "$stats"

  if [ "$2" == "compare_with_baseline" ]; then
    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
  elif [ "$2" == "compare_and_update" ]; then
    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
  fi
}

# Execute only when run directly (not when sourced).
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  run_test test_cpu_speed_mnist "$@"
fi
							
								
								
									
										29
									
								
								.ci/pytorch/perf_test/test_cpu_speed_torch.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								.ci/pytorch/perf_test/test_cpu_speed_torch.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,29 @@ | ||||
#!/bin/bash

. ./common.sh

# Benchmark torch.* operator overhead on CPU via yf225/perf-tests.
# $1 = optional mode: "compare_with_baseline" / "compare_and_update" / "update_only".
test_cpu_speed_torch () {
  echo "Testing: torch.*, CPU"

  export OMP_NUM_THREADS=4
  export MKL_NUM_THREADS=4

  git clone https://github.com/yf225/perf-tests.git

  # ARGS is forwarded to the perf-tests driver; paths are relative to the
  # run_test tmp dir (../ is the perf_test dir).
  if [ "$1" == "compare_with_baseline" ]; then
    export ARGS=(--compare ../cpu_runtime.json)
  elif [ "$1" == "compare_and_update" ]; then
    export ARGS=(--compare ../cpu_runtime.json --update ../new_cpu_runtime.json)
  elif [ "$1" == "update_only" ]; then
    export ARGS=(--update ../new_cpu_runtime.json)
  fi

  # The driver exits non-zero on a detected regression.
  if ! python perf-tests/modules/test_cpu_torch.py "${ARGS[@]}"; then
    echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
    exit 1
  fi
}

# Execute only when run directly (not when sourced).
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  run_test test_cpu_speed_torch "$@"
fi
							
								
								
									
										29
									
								
								.ci/pytorch/perf_test/test_cpu_speed_torch_tensor.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								.ci/pytorch/perf_test/test_cpu_speed_torch_tensor.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,29 @@ | ||||
#!/bin/bash

. ./common.sh

# Benchmark torch.Tensor.* method overhead on CPU via yf225/perf-tests.
# $1 = optional mode: "compare_with_baseline" / "compare_and_update" / "update_only".
test_cpu_speed_torch_tensor () {
  echo "Testing: torch.Tensor.*, CPU"

  export OMP_NUM_THREADS=4
  export MKL_NUM_THREADS=4

  git clone https://github.com/yf225/perf-tests.git

  # ARGS is forwarded to the perf-tests driver; paths are relative to the
  # run_test tmp dir (../ is the perf_test dir).
  if [ "$1" == "compare_with_baseline" ]; then
    export ARGS=(--compare ../cpu_runtime.json)
  elif [ "$1" == "compare_and_update" ]; then
    export ARGS=(--compare ../cpu_runtime.json --update ../new_cpu_runtime.json)
  elif [ "$1" == "update_only" ]; then
    export ARGS=(--update ../new_cpu_runtime.json)
  fi

  # The driver exits non-zero on a detected regression.
  if ! python perf-tests/modules/test_cpu_torch_tensor.py "${ARGS[@]}"; then
    echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
    exit 1
  fi
}

# Execute only when run directly (not when sourced).
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  run_test test_cpu_speed_torch_tensor "$@"
fi
							
								
								
									
										44
									
								
								.ci/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								.ci/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,44 @@ | ||||
#!/bin/bash
set -e

. ./common.sh

# Benchmark the CuDNN LSTM model on GPU.
# $1 = number of timed runs; $2 = optional "compare_with_baseline" / "compare_and_update".
test_gpu_speed_cudnn_lstm () {
  echo "Testing: CuDNN LSTM, GPU"

  export OMP_NUM_THREADS=4
  export MKL_NUM_THREADS=4

  git clone https://github.com/pytorch/benchmark.git

  cd benchmark/

  # Pin the benchmark repo to a known commit for reproducible numbers.
  git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0

  cd scripts/

  SAMPLE_ARRAY=()
  NUM_RUNS=$1

  for (( i=1; i<=NUM_RUNS; i++ )) do
    runtime=$(get_runtime_of_command python cudnn_lstm.py --skip-cpu-governor-check)
    echo "$runtime"
    SAMPLE_ARRAY+=("${runtime}")
  done

  cd ../..

  # Scripts run inside the run_test tmp dir; ../ is the perf_test dir.
  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
  echo "Runtime stats in seconds:"
  echo "$stats"

  if [ "$2" == "compare_with_baseline" ]; then
    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
  elif [ "$2" == "compare_and_update" ]; then
    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
  fi
}

# Execute only when run directly (not when sourced).
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  run_test test_gpu_speed_cudnn_lstm "$@"
fi
							
								
								
									
										44
									
								
								.ci/pytorch/perf_test/test_gpu_speed_lstm.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								.ci/pytorch/perf_test/test_gpu_speed_lstm.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,44 @@ | ||||
#!/bin/bash
set -e

. ./common.sh

# Benchmark the (non-CuDNN) LSTM model on GPU.
# $1 = number of timed runs; $2 = optional "compare_with_baseline" / "compare_and_update".
test_gpu_speed_lstm () {
  echo "Testing: LSTM, GPU"

  export OMP_NUM_THREADS=4
  export MKL_NUM_THREADS=4

  git clone https://github.com/pytorch/benchmark.git

  cd benchmark/

  # Pin the benchmark repo to a known commit for reproducible numbers.
  git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0

  cd scripts/

  SAMPLE_ARRAY=()
  NUM_RUNS=$1

  for (( i=1; i<=NUM_RUNS; i++ )) do
    runtime=$(get_runtime_of_command python lstm.py --skip-cpu-governor-check)
    echo "$runtime"
    SAMPLE_ARRAY+=("${runtime}")
  done

  cd ../..

  # Scripts run inside the run_test tmp dir; ../ is the perf_test dir.
  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
  echo "Runtime stats in seconds:"
  echo "$stats"

  if [ "$2" == "compare_with_baseline" ]; then
    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
  elif [ "$2" == "compare_and_update" ]; then
    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
  fi
}

# Execute only when run directly (not when sourced).
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  run_test test_gpu_speed_lstm "$@"
fi
							
								
								
									
										44
									
								
								.ci/pytorch/perf_test/test_gpu_speed_mlstm.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								.ci/pytorch/perf_test/test_gpu_speed_mlstm.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,44 @@ | ||||
#!/bin/bash
set -e

. ./common.sh

# Benchmark the MLSTM model on GPU.
# $1 = number of timed runs; $2 = optional "compare_with_baseline" / "compare_and_update".
test_gpu_speed_mlstm () {
  echo "Testing: MLSTM, GPU"

  export OMP_NUM_THREADS=4
  export MKL_NUM_THREADS=4

  git clone https://github.com/pytorch/benchmark.git

  cd benchmark/

  # Pin the benchmark repo to a known commit for reproducible numbers.
  git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0

  cd scripts/

  SAMPLE_ARRAY=()
  NUM_RUNS=$1

  for (( i=1; i<=NUM_RUNS; i++ )) do
    runtime=$(get_runtime_of_command python mlstm.py --skip-cpu-governor-check)
    echo "$runtime"
    SAMPLE_ARRAY+=("${runtime}")
  done

  cd ../..

  # Scripts run inside the run_test tmp dir; ../ is the perf_test dir.
  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
  echo "Runtime stats in seconds:"
  echo "$stats"

  if [ "$2" == "compare_with_baseline" ]; then
    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
  elif [ "$2" == "compare_and_update" ]; then
    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
  fi
}

# Execute only when run directly (not when sourced).
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  run_test test_gpu_speed_mlstm "$@"
fi
							
								
								
									
										48
									
								
								.ci/pytorch/perf_test/test_gpu_speed_mnist.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										48
									
								
								.ci/pytorch/perf_test/test_gpu_speed_mnist.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,48 @@ | ||||
#!/bin/bash
set -e

. ./common.sh

# Benchmark MNIST training on GPU.
# $1 = number of timed runs; $2 = optional "compare_with_baseline" / "compare_and_update".
test_gpu_speed_mnist () {
  echo "Testing: MNIST, GPU"

  export OMP_NUM_THREADS=4
  export MKL_NUM_THREADS=4

  git clone https://github.com/pytorch/examples.git -b perftests

  cd examples/mnist

  conda install -c pytorch torchvision

  # Download data
  python main.py --epochs 0

  SAMPLE_ARRAY=()
  NUM_RUNS=$1

  # Needs warm up to get accurate number
  python main.py --epochs 1 --no-log

  for (( i=1; i<=NUM_RUNS; i++ )) do
    runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log)
    echo "$runtime"
    SAMPLE_ARRAY+=("${runtime}")
  done

  cd ../..

  # Scripts run inside the run_test tmp dir; ../ is the perf_test dir.
  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
  echo "Runtime stats in seconds:"
  echo "$stats"

  if [ "$2" == "compare_with_baseline" ]; then
    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
  elif [ "$2" == "compare_and_update" ]; then
    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
  fi
}

# Execute only when run directly (not when sourced).
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  run_test test_gpu_speed_mnist "$@"
fi
							
								
								
									
										53
									
								
								.ci/pytorch/perf_test/test_gpu_speed_word_language_model.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										53
									
								
								.ci/pytorch/perf_test/test_gpu_speed_word_language_model.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,53 @@ | ||||
#!/bin/bash
set -e

. ./common.sh

# Time one GPU training epoch of the word-language-model example on a
# truncated Wikitext-2 dataset, $1 times, then compare against the baseline
# (mode selected by $2).
test_gpu_speed_word_language_model () {
  echo "Testing: word language model on Wikitext-2, GPU"

  export OMP_NUM_THREADS=4
  export MKL_NUM_THREADS=4

  git clone https://github.com/pytorch/examples.git -b perftests

  cd examples/word_language_model

  cd data/wikitext-2

  # Reduce dataset size, so that we can have more runs per test:
  # keep only the first N lines of each split (test/valid: 200, train: 1000).
  for split_spec in test:200 train:1000 valid:200; do
    split_name=${split_spec%%:*}
    keep_lines=${split_spec##*:}
    sed -n "1,${keep_lines}p" "${split_name}.txt" > "${split_name}_tmp.txt"
    mv "${split_name}_tmp.txt" "${split_name}.txt"
  done

  cd ../..

  SAMPLE_ARRAY=()
  NUM_RUNS=$1

  for (( run_idx=1; run_idx<=NUM_RUNS; run_idx++ )); do
    measured=$(get_runtime_of_command python main.py --cuda --epochs 1)
    echo "$measured"
    SAMPLE_ARRAY+=("${measured}")
  done

  cd ../..

  # Summarize the collected samples for the comparison step.
  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
  echo "Runtime stats in seconds:"
  echo "$stats"

  case "$2" in
    compare_with_baseline)
      python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
      ;;
    compare_and_update)
      python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
      ;;
  esac
}

if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  run_test test_gpu_speed_word_language_model "$@"
fi
							
								
								
									
										14
									
								
								.ci/pytorch/perf_test/update_commit_hash.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										14
									
								
								.ci/pytorch/perf_test/update_commit_hash.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,14 @@ | ||||
import json
import sys

# Rewrite the "commit" field of a perf-test baseline JSON file in place.
# Usage: update_commit_hash.py <data_file_path> <commit_hash>

target_path = sys.argv[1]
new_commit_hash = sys.argv[2]

with open(target_path) as fh:
    baseline = json.load(fh)

baseline["commit"] = new_commit_hash

with open(target_path, "w") as fh:
    json.dump(baseline, fh)
| @ -119,6 +119,12 @@ popd | ||||
| git rm -rf "$install_path" || true | ||||
| mv "$pt_checkout/docs/build/html" "$install_path" | ||||
|  | ||||
| # Prevent Google from indexing $install_path/_modules. This folder contains | ||||
| # generated source files. | ||||
| # NB: the following only works on gnu sed. The sed shipped with mac os is different. | ||||
| # One can `brew install gnu-sed` on a mac and then use "gsed" instead of "sed". | ||||
| find "$install_path/_modules" -name "*.html" -print0 | xargs -0 sed -i '/<head>/a \ \ <meta name="robots" content="noindex">' | ||||
|  | ||||
| git add "$install_path" || true | ||||
| git status | ||||
| git config user.email "soumith+bot@pytorch.org" | ||||
|  | ||||
| @ -76,7 +76,7 @@ fi | ||||
| # Environment initialization | ||||
| if [[ "$(uname)" == Darwin ]]; then | ||||
|     # Install the testing dependencies | ||||
|     retry pip install -q future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml | ||||
|     retry conda install -yq future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml | ||||
| else | ||||
|     retry pip install -qr requirements.txt || true | ||||
|     retry pip install -q hypothesis protobuf pytest setuptools || true | ||||
| @ -91,6 +91,7 @@ fi | ||||
|  | ||||
| echo "Testing with:" | ||||
| pip freeze | ||||
| conda list || true | ||||
|  | ||||
| ############################################################################## | ||||
| # Smoke tests | ||||
|  | ||||
							
								
								
									
										71
									
								
								.ci/pytorch/short-perf-test-cpu.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										71
									
								
								.ci/pytorch/short-perf-test-cpu.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,71 @@ | ||||
#!/bin/bash
# Short CPU perf-regression suite: fetches the most recent tested baseline
# from S3, runs operator and small-model timing tests against it, and — when
# running on the upstream default branch — uploads a refreshed baseline.
# Expects $COMMIT_SOURCE to name the branch being tested.

SCRIPT_PARENT_DIR=$(dirname "${BASH_SOURCE[0]}")

# shellcheck source=.ci/pytorch/common.sh
source "$SCRIPT_PARENT_DIR/common.sh"

cd .ci/pytorch/perf_test

echo "Running CPU perf test for PyTorch..."

pip install -q awscli

# Set multipart_threshold to be sufficiently high, so that `aws s3 cp` is not a multipart read
# More info at https://github.com/aws/aws-cli/issues/2321
aws configure set default.s3.multipart_threshold 5GB
# Resolve the upstream default branch name (e.g. "main") from the remote HEAD.
UPSTREAM_DEFAULT_BRANCH="$(git remote show https://github.com/pytorch/pytorch.git | awk '/HEAD branch/ {print $NF}')"

if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
    # Get current default branch commit hash
    DEFAULT_BRANCH_COMMIT_ID=$(git log --format="%H" -n 1)
    export DEFAULT_BRANCH_COMMIT_ID
fi

# Find the default branch commit to test against
# Walk upstream history newest-first and stop at the first commit that already
# has a baseline JSON uploaded to S3; that file becomes the comparison baseline.
git remote add upstream https://github.com/pytorch/pytorch.git
git fetch upstream
IFS=$'\n'
while IFS='' read -r commit_id; do
    if aws s3 ls s3://ossci-perf-test/pytorch/cpu_runtime/"${commit_id}".json; then
        LATEST_TESTED_COMMIT=${commit_id}
        break
    fi
done < <(git rev-list upstream/"$UPSTREAM_DEFAULT_BRANCH")
aws s3 cp s3://ossci-perf-test/pytorch/cpu_runtime/"${LATEST_TESTED_COMMIT}".json cpu_runtime.json

if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
    # Prepare new baseline file
    cp cpu_runtime.json new_cpu_runtime.json
    python update_commit_hash.py new_cpu_runtime.json "${DEFAULT_BRANCH_COMMIT_ID}"
fi

# Include tests
# shellcheck source=./perf_test/test_cpu_speed_mini_sequence_labeler.sh
. ./test_cpu_speed_mini_sequence_labeler.sh
# shellcheck source=./perf_test/test_cpu_speed_mnist.sh
. ./test_cpu_speed_mnist.sh
# shellcheck source=./perf_test/test_cpu_speed_torch.sh
. ./test_cpu_speed_torch.sh
# shellcheck source=./perf_test/test_cpu_speed_torch_tensor.sh
. ./test_cpu_speed_torch_tensor.sh

# Run tests
# On the default branch the run also rewrites the stored baseline numbers.
export TEST_MODE="compare_with_baseline"
if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
    export TEST_MODE="compare_and_update"
fi

# Operator tests
run_test test_cpu_speed_torch ${TEST_MODE}
run_test test_cpu_speed_torch_tensor ${TEST_MODE}

# Sample model tests
run_test test_cpu_speed_mini_sequence_labeler 20 ${TEST_MODE}
run_test test_cpu_speed_mnist 20 ${TEST_MODE}

if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
    # This could cause race condition if we are testing the same default branch commit twice,
    # but the chance of them executing this line at the same time is low.
    aws s3 cp new_cpu_runtime.json s3://ossci-perf-test/pytorch/cpu_runtime/"${DEFAULT_BRANCH_COMMIT_ID}".json --acl public-read
fi
							
								
								
									
										76
									
								
								.ci/pytorch/short-perf-test-gpu.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										76
									
								
								.ci/pytorch/short-perf-test-gpu.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,76 @@ | ||||
#!/bin/bash
# Short GPU perf-regression suite: fetches the most recent tested baseline
# from S3, runs a set of GPU model timing tests against it, and — when running
# on the upstream default branch — uploads a refreshed baseline.
# Expects $COMMIT_SOURCE to name the branch being tested.

# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"

pushd .ci/pytorch/perf_test

echo "Running GPU perf test for PyTorch..."

# Trying to uninstall PyYAML can cause problem. Workaround according to:
# https://github.com/pypa/pip/issues/5247#issuecomment-415571153
pip install -q awscli --ignore-installed PyYAML

# Set multipart_threshold to be sufficiently high, so that `aws s3 cp` is not a multipart read
# More info at https://github.com/aws/aws-cli/issues/2321
aws configure set default.s3.multipart_threshold 5GB
# Resolve the upstream default branch name (e.g. "main") from the remote HEAD.
UPSTREAM_DEFAULT_BRANCH="$(git remote show https://github.com/pytorch/pytorch.git | awk '/HEAD branch/ {print $NF}')"

if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
    # Get current default branch commit hash
    DEFAULT_BRANCH_COMMIT_ID=$(git log --format="%H" -n 1)
    export DEFAULT_BRANCH_COMMIT_ID
fi

# Find the default branch commit to test against
# Walk upstream history newest-first and stop at the first commit that already
# has a baseline JSON uploaded to S3; that file becomes the comparison baseline.
git remote add upstream https://github.com/pytorch/pytorch.git
git fetch upstream
IFS=$'\n'
while IFS='' read -r commit_id; do
    if aws s3 ls s3://ossci-perf-test/pytorch/gpu_runtime/"${commit_id}".json; then
        LATEST_TESTED_COMMIT=${commit_id}
        break
    fi
done < <(git rev-list upstream/"$UPSTREAM_DEFAULT_BRANCH")
aws s3 cp s3://ossci-perf-test/pytorch/gpu_runtime/"${LATEST_TESTED_COMMIT}".json gpu_runtime.json

if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
    # Prepare new baseline file
    cp gpu_runtime.json new_gpu_runtime.json
    python update_commit_hash.py new_gpu_runtime.json "${DEFAULT_BRANCH_COMMIT_ID}"
fi

# Include tests
# shellcheck source=./perf_test/test_gpu_speed_mnist.sh
. ./test_gpu_speed_mnist.sh
# shellcheck source=./perf_test/test_gpu_speed_word_language_model.sh
. ./test_gpu_speed_word_language_model.sh
# shellcheck source=./perf_test/test_gpu_speed_cudnn_lstm.sh
. ./test_gpu_speed_cudnn_lstm.sh
# shellcheck source=./perf_test/test_gpu_speed_lstm.sh
. ./test_gpu_speed_lstm.sh
# shellcheck source=./perf_test/test_gpu_speed_mlstm.sh
. ./test_gpu_speed_mlstm.sh

# Run tests
# On the default branch the run also rewrites the stored baseline numbers;
# otherwise it only compares against the downloaded baseline.
if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
    run_test test_gpu_speed_mnist 20 compare_and_update
    run_test test_gpu_speed_word_language_model 20 compare_and_update
    run_test test_gpu_speed_cudnn_lstm 20 compare_and_update
    run_test test_gpu_speed_lstm 20 compare_and_update
    run_test test_gpu_speed_mlstm 20 compare_and_update
else
    run_test test_gpu_speed_mnist 20 compare_with_baseline
    run_test test_gpu_speed_word_language_model 20 compare_with_baseline
    run_test test_gpu_speed_cudnn_lstm 20 compare_with_baseline
    run_test test_gpu_speed_lstm 20 compare_with_baseline
    run_test test_gpu_speed_mlstm 20 compare_with_baseline
fi

if [[ "$COMMIT_SOURCE" == "$UPSTREAM_DEFAULT_BRANCH" ]]; then
    # This could cause race condition if we are testing the same default branch commit twice,
    # but the chance of them executing this line at the same time is low.
    aws s3 cp new_gpu_runtime.json s3://ossci-perf-test/pytorch/gpu_runtime/"${DEFAULT_BRANCH_COMMIT_ID}".json --acl public-read
fi

popd
| @ -80,7 +80,7 @@ def grep_symbols(lib: str, patterns: list[Any]) -> list[str]: | ||||
|         return functools.reduce(list.__add__, (x.result() for x in tasks), []) | ||||
|  | ||||
|  | ||||
| def check_lib_symbols_for_abi_correctness(lib: str) -> None: | ||||
| def check_lib_symbols_for_abi_correctness(lib: str, pre_cxx11_abi: bool = True) -> None: | ||||
|     print(f"lib: {lib}") | ||||
|     cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS) | ||||
|     pre_cxx11_symbols = grep_symbols(lib, LIBTORCH_PRE_CXX11_PATTERNS) | ||||
| @ -88,12 +88,28 @@ def check_lib_symbols_for_abi_correctness(lib: str) -> None: | ||||
|     num_pre_cxx11_symbols = len(pre_cxx11_symbols) | ||||
|     print(f"num_cxx11_symbols: {num_cxx11_symbols}") | ||||
|     print(f"num_pre_cxx11_symbols: {num_pre_cxx11_symbols}") | ||||
|     if num_pre_cxx11_symbols > 0: | ||||
|         raise RuntimeError( | ||||
|             f"Found pre-cxx11 symbols, but there shouldn't be any, see: {pre_cxx11_symbols[:100]}" | ||||
|     if pre_cxx11_abi: | ||||
|         if num_cxx11_symbols > 0: | ||||
|             raise RuntimeError( | ||||
|                 f"Found cxx11 symbols, but there shouldn't be any, see: {cxx11_symbols[:100]}" | ||||
|             ) | ||||
|         if num_pre_cxx11_symbols < 1000: | ||||
|             raise RuntimeError("Didn't find enough pre-cxx11 symbols.") | ||||
|         # Check for no recursive iterators, regression test for https://github.com/pytorch/pytorch/issues/133437 | ||||
|         rec_iter_symbols = grep_symbols( | ||||
|             lib, [re.compile("std::filesystem::recursive_directory_iterator.*")] | ||||
|         ) | ||||
|     if num_cxx11_symbols < 100: | ||||
|         raise RuntimeError("Didn't find enought cxx11 symbols") | ||||
|         if len(rec_iter_symbols) > 0: | ||||
|             raise RuntimeError( | ||||
|                 f"recursive_directory_iterator in used pre-CXX11 binaries, see; {rec_iter_symbols}" | ||||
|             ) | ||||
|     else: | ||||
|         if num_pre_cxx11_symbols > 0: | ||||
|             raise RuntimeError( | ||||
|                 f"Found pre-cxx11 symbols, but there shouldn't be any, see: {pre_cxx11_symbols[:100]}" | ||||
|             ) | ||||
|         if num_cxx11_symbols < 100: | ||||
|             raise RuntimeError("Didn't find enought cxx11 symbols") | ||||
|  | ||||
|  | ||||
| def main() -> None: | ||||
| @ -105,8 +121,9 @@ def main() -> None: | ||||
|         else: | ||||
|             install_root = Path(distutils.sysconfig.get_python_lib()) / "torch" | ||||
|  | ||||
|     libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so") | ||||
|     check_lib_symbols_for_abi_correctness(libtorch_cpu_path) | ||||
|     libtorch_cpu_path = install_root / "lib" / "libtorch_cpu.so" | ||||
|     pre_cxx11_abi = "cxx11-abi" not in os.getenv("DESIRED_DEVTOOLSET", "") | ||||
|     check_lib_symbols_for_abi_correctness(libtorch_cpu_path, pre_cxx11_abi) | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user
	