apply

2025-10-23 14:59:34 +08:00 · 2025-02-18 11:24:34 -08:00
6862 changed files with 155720 additions and 337388 deletions
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -3,7 +3,9 @@ set -eux -o pipefail

 GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

-if [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
+if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
+    export TORCH_CUDA_ARCH_LIST="9.0"
+elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
    export TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0"
 fi

@ -18,14 +20,13 @@ cd /
 # on the mounted pytorch repo
 git config --global --add safe.directory /pytorch
 pip install -r /pytorch/requirements.txt
-pip install auditwheel==6.2.0
+pip install auditwheel
 if [ "$DESIRED_CUDA" = "cpu" ]; then
    echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
 else
    echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
-    export USE_SYSTEM_NCCL=1
    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
 fi
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -31,47 +31,33 @@ def build_ArmComputeLibrary() -> None:
        "build=native",
    ]
    acl_install_dir = "/acl"
-    acl_checkout_dir = os.getenv("ACL_SOURCE_DIR", "ComputeLibrary")
-    if os.path.isdir(acl_install_dir):
-        shutil.rmtree(acl_install_dir)
-    if not os.path.isdir(acl_checkout_dir) or not len(os.listdir(acl_checkout_dir)):
-        check_call(
-            [
-                "git",
-                "clone",
-                "https://github.com/ARM-software/ComputeLibrary.git",
-                "-b",
-                "v25.02",
-                "--depth",
-                "1",
-                "--shallow-submodules",
-            ]
-        )
+    acl_checkout_dir = "ComputeLibrary"
+    os.makedirs(acl_install_dir)
+    check_call(
+        [
+            "git",
+            "clone",
+            "https://github.com/ARM-software/ComputeLibrary.git",
+            "-b",
+            "v24.09",
+            "--depth",
+            "1",
+            "--shallow-submodules",
+        ]
+    )

    check_call(
-        ["scons", "Werror=1", f"-j{os.cpu_count()}"] + acl_build_flags,
+        ["scons", "Werror=1", "-j8", f"build_dir=/{acl_install_dir}/build"]
+        + acl_build_flags,
        cwd=acl_checkout_dir,
    )
-    for d in ["arm_compute", "include", "utils", "support", "src", "build"]:
+    for d in ["arm_compute", "include", "utils", "support", "src"]:
        shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")


-def replace_tag(filename) -> None:
-    with open(filename) as f:
-        lines = f.readlines()
-    for i, line in enumerate(lines):
-        if line.startswith("Tag:"):
-            lines[i] = line.replace("-linux_", "-manylinux_2_28_")
-            print(f"Updated tag from {line} to {lines[i]}")
-            break
-
-    with open(filename, "w") as f:
-        f.writelines(lines)
-
-
-def package_cuda_wheel(wheel_path, desired_cuda) -> None:
+def update_wheel(wheel_path, desired_cuda) -> None:
    """
-    Package the cuda wheel libraries
+    Update the cuda wheel libraries
    """
    folder = os.path.dirname(wheel_path)
    wheelname = os.path.basename(wheel_path)
@ -88,6 +74,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
        "/usr/local/cuda/lib64/libcusparseLt.so.0",
        "/usr/local/cuda/lib64/libcusolver.so.11",
        "/usr/local/cuda/lib64/libcurand.so.10",
+        "/usr/local/cuda/lib64/libnvToolsExt.so.1",
        "/usr/local/cuda/lib64/libnvJitLink.so.12",
        "/usr/local/cuda/lib64/libnvrtc.so.12",
        "/usr/local/cuda/lib64/libcudnn_adv.so.9",
@ -101,19 +88,26 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
        "/usr/lib64/libgfortran.so.5",
        "/acl/build/libarm_compute.so",
        "/acl/build/libarm_compute_graph.so",
-        "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-        "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-        "/usr/local/lib/libnvpl_lapack_core.so.0",
-        "/usr/local/lib/libnvpl_blas_core.so.0",
    ]
-
-    if "129" in desired_cuda:
+    if enable_cuda:
        libs_to_copy += [
-            "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.9",
-            "/usr/local/cuda/lib64/libcufile.so.0",
-            "/usr/local/cuda/lib64/libcufile_rdma.so.1",
+            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
+            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
+            "/usr/local/lib/libnvpl_lapack_core.so.0",
+            "/usr/local/lib/libnvpl_blas_core.so.0",
+        ]
+        if "126" in desired_cuda:
+            libs_to_copy += [
+                "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.6",
+            ]
+        elif "128" in desired_cuda:
+            libs_to_copy += [
+                "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
+            ]
+    else:
+        libs_to_copy += [
+            "/opt/OpenBLAS/lib/libopenblas.so.0",
        ]
-
    # Copy libraries to unzipped_folder/a/lib
    for lib_path in libs_to_copy:
        lib_name = os.path.basename(lib_path)
@ -122,13 +116,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
            f"cd {folder}/tmp/torch/lib/; "
            f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}"
        )
-
-    # Make sure the wheel is tagged with manylinux_2_28
-    for f in os.scandir(f"{folder}/tmp/"):
-        if f.is_dir() and f.name.endswith(".dist-info"):
-            replace_tag(f"{f.path}/WHEEL")
-            break
-
    os.mkdir(f"{folder}/cuda_wheel")
    os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *")
    shutil.move(
@ -145,9 +132,6 @@ def complete_wheel(folder: str) -> str:
    """
    wheel_name = list_dir(f"/{folder}/dist")[0]

-    # Please note for cuda we don't run auditwheel since we use custom script to package
-    # the cuda dependencies to the wheel file using update_wheel() method.
-    # However we need to make sure filename reflects the correct Manylinux platform.
    if "pytorch" in folder and not enable_cuda:
        print("Repairing Wheel with AuditWheel")
        check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
@ -159,14 +143,7 @@ def complete_wheel(folder: str) -> str:
            f"/{folder}/dist/{repaired_wheel_name}",
        )
    else:
-        repaired_wheel_name = wheel_name.replace(
-            "linux_aarch64", "manylinux_2_28_aarch64"
-        )
-        print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}")
-        os.rename(
-            f"/{folder}/dist/{wheel_name}",
-            f"/{folder}/dist/{repaired_wheel_name}",
-        )
+        repaired_wheel_name = wheel_name

    print(f"Copying {repaired_wheel_name} to artifacts")
    shutil.copy2(
@ -203,10 +180,8 @@ if __name__ == "__main__":
    ).decode()

    print("Building PyTorch wheel")
-    build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
-    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
-    if enable_cuda:
-        build_vars = "MAX_JOBS=5 " + build_vars
+    build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
+    os.system("cd /pytorch; python setup.py clean")

    override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
    desired_cuda = os.getenv("DESIRED_CUDA")
@ -229,7 +204,7 @@ if __name__ == "__main__":
        else:
            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
    elif branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
+        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "

    if enable_mkldnn:
        build_ArmComputeLibrary()
@ -253,6 +228,6 @@ if __name__ == "__main__":
        print("Updating Cuda Dependency")
        filename = os.listdir("/pytorch/dist/")
        wheel_path = f"/pytorch/dist/{filename[0]}"
-        package_cuda_wheel(wheel_path, desired_cuda)
+        update_wheel(wheel_path, desired_cuda)
    pytorch_wheel_name = complete_wheel("/pytorch/")
    print(f"Build Complete. Created {pytorch_wheel_name}..")
--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
@ -19,11 +19,13 @@ import boto3

 # AMI images for us-east-1, change the following based on your ~/.aws/config
 os_amis = {
+    "ubuntu18_04": "ami-078eece1d8119409f",  # login_name: ubuntu
    "ubuntu20_04": "ami-052eac90edaa9d08f",  # login_name: ubuntu
    "ubuntu22_04": "ami-0c6c29c5125214c77",  # login_name: ubuntu
    "redhat8": "ami-0698b90665a2ddcf1",  # login_name: ec2-user
 }

+ubuntu18_04_ami = os_amis["ubuntu18_04"]
 ubuntu20_04_ami = os_amis["ubuntu20_04"]


@ -327,7 +329,7 @@ def build_ArmComputeLibrary(host: RemoteHost, git_clone_flags: str = "") -> None
        ]
    )
    host.run_cmd(
-        f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v25.02 {git_clone_flags}"
+        f"git clone https://github.com/ARM-software/ComputeLibrary.git -b v24.09 {git_clone_flags}"
    )

    host.run_cmd(f"cd ComputeLibrary && scons Werror=1 -j8 {acl_build_flags}")
@ -657,6 +659,18 @@ def configure_system(
            "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
        )
    host.run_cmd("pip3 install dataclasses typing-extensions")
+    # Install and switch to gcc-8 on Ubuntu-18.04
+    if not host.using_docker() and host.ami == ubuntu18_04_ami and compiler == "gcc-8":
+        host.run_cmd("sudo apt-get install -y g++-8 gfortran-8")
+        host.run_cmd(
+            "sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 100"
+        )
+        host.run_cmd(
+            "sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 100"
+        )
+        host.run_cmd(
+            "sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-8 100"
+        )
    if not use_conda:
        print("Installing Cython + numpy from PyPy")
        host.run_cmd("sudo pip3 install Cython")
@ -747,7 +761,7 @@ def start_build(
        version = host.check_output("cat pytorch/version.txt").strip()[:-2]
        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1"
    if branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1 : branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
+        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1"
    if host.using_docker():
        build_vars += " CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000"
    if enable_mkldnn:
@ -1012,7 +1026,7 @@ if __name__ == "__main__":
        install_condaforge_python(host, args.python_version)
        sys.exit(0)

-    python_version = args.python_version if args.python_version is not None else "3.9"
+    python_version = args.python_version if args.python_version is not None else "3.8"

    if args.use_torch_from_pypi:
        configure_system(host, compiler=args.compiler, python_version=python_version)
--- a/.ci/caffe2/README.md
+++ b/.ci/caffe2/README.md
@ -10,3 +10,5 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
 built on Jenkins and are used in triggered builds already have this
 environment variable set in their manifest. Also see
 `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.
+
+Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.
--- a/.ci/caffe2/test.sh
+++ b/.ci/caffe2/test.sh
@ -13,6 +13,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *-android* ]]; then
  echo 'Skipping tests'
  exit 0
 fi
+if [[ "${BUILD_ENVIRONMENT}" == *-rocm* ]]; then
+  # temporary to locate some kernel issues on the CI nodes
+  export HSAKMT_DEBUG_LEVEL=4
+fi
 # These additional packages are needed for circleci ROCm builds.
 if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then
    # Need networkx 2.0 because bellmand_ford was moved in 2.1 . Scikit-image by
--- a/.ci/docker/README.md
+++ b/.ci/docker/README.md
@ -34,5 +34,5 @@ See `build.sh` for valid build environments (it's the giant switch).
 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest

 # Set flags (see build.sh) and build image
-sudo bash -c 'TRITON=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
+sudo bash -c 'PROTOBUF=1 ./build.sh pytorch-linux-bionic-py3.8-gcc9 -t myimage:latest
 ```
--- a/.ci/docker/almalinux/Dockerfile
+++ b/.ci/docker/almalinux/Dockerfile
@ -1,7 +1,6 @@
-ARG CUDA_VERSION=12.6
+ARG CUDA_VERSION=12.4
 ARG BASE_TARGET=cuda${CUDA_VERSION}
-ARG ROCM_IMAGE=rocm/dev-almalinux-8:6.3-complete
-FROM amd64/almalinux:8.10-20250519 as base
+FROM amd64/almalinux:8 as base

 ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
@ -9,10 +8,12 @@ ENV LANGUAGE en_US.UTF-8

 ARG DEVTOOLSET_VERSION=11

+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+
 RUN yum -y update
 RUN yum -y install epel-release
-# install glibc-langpack-en make sure en_US.UTF-8 locale is available
-RUN yum -y install glibc-langpack-en
 RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
 # Just add everything as a safe.directory for git since these will be used in multiple places with git
 RUN git config --global --add safe.directory '*'
@ -40,36 +41,31 @@ RUN bash ./install_conda.sh && rm install_conda.sh

 # Install CUDA
 FROM base as cuda
-ARG CUDA_VERSION=12.6
+ARG CUDA_VERSION=12.4
 RUN rm -rf /usr/local/cuda-*
 ADD ./common/install_cuda.sh install_cuda.sh
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
 ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
 # Preserve CUDA_VERSION for the builds
 ENV CUDA_VERSION=${CUDA_VERSION}
 # Make things in our path by default
 ENV PATH=/usr/local/cuda-${CUDA_VERSION}/bin:$PATH

+FROM cuda as cuda11.8
+RUN bash ./install_cuda.sh 11.8
+ENV DESIRED_CUDA=11.8
+
+FROM cuda as cuda12.1
+RUN bash ./install_cuda.sh 12.1
+ENV DESIRED_CUDA=12.1
+
+FROM cuda as cuda12.4
+RUN bash ./install_cuda.sh 12.4
+ENV DESIRED_CUDA=12.4
+
 FROM cuda as cuda12.6
 RUN bash ./install_cuda.sh 12.6
 ENV DESIRED_CUDA=12.6

-FROM cuda as cuda12.8
-RUN bash ./install_cuda.sh 12.8
-ENV DESIRED_CUDA=12.8
-
-FROM cuda as cuda12.9
-RUN bash ./install_cuda.sh 12.9
-ENV DESIRED_CUDA=12.9
-
-FROM ${ROCM_IMAGE} as rocm
-ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-ADD ./common/install_mkl.sh install_mkl.sh
-RUN bash ./install_mkl.sh && rm install_mkl.sh
-ENV MKLROOT /opt/intel
-
 # Install MNIST test data
 FROM base as mnist
 ADD ./common/install_mnist.sh install_mnist.sh
@ -77,9 +73,9 @@ RUN bash ./install_mnist.sh

 FROM base as all_cuda
 COPY --from=cuda11.8  /usr/local/cuda-11.8 /usr/local/cuda-11.8
+COPY --from=cuda12.1  /usr/local/cuda-12.1 /usr/local/cuda-12.1
+COPY --from=cuda12.4  /usr/local/cuda-12.4 /usr/local/cuda-12.4
 COPY --from=cuda12.6  /usr/local/cuda-12.6 /usr/local/cuda-12.6
-COPY --from=cuda12.8  /usr/local/cuda-12.8 /usr/local/cuda-12.8
-COPY --from=cuda12.9  /usr/local/cuda-12.9 /usr/local/cuda-12.9

 # Final step
 FROM ${BASE_TARGET} as final
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -1,70 +1,82 @@
 #!/usr/bin/env bash
 # Script used only in CD pipeline

-set -exou pipefail
+set -eou pipefail

 image="$1"
 shift

 if [ -z "${image}" ]; then
-  echo "Usage: $0 IMAGENAME:ARCHTAG"
+  echo "Usage: $0 IMAGE"
  exit 1
 fi

-# Go from imagename:tag to tag
-DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
+DOCKER_IMAGE_NAME="pytorch/${image}"

-CUDA_VERSION=""
-ROCM_VERSION=""
-EXTRA_BUILD_ARGS=""
-if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
-    # extract cuda version from image name and tag.  e.g. manylinux2_28-builder:cuda12.8 returns 12.8
-    CUDA_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
-    EXTRA_BUILD_ARGS="--build-arg CUDA_VERSION=${CUDA_VERSION}"
-elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
-    # extract rocm version from image name and tag.  e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
-    ROCM_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
-    EXTRA_BUILD_ARGS="--build-arg ROCM_IMAGE=rocm/dev-almalinux-8:${ROCM_VERSION}-complete"
-fi
-
-case ${DOCKER_TAG_PREFIX} in
-  cpu)
-    BASE_TARGET=base
-    ;;
-  cuda*)
-    BASE_TARGET=cuda${CUDA_VERSION}
-    ;;
-  rocm*)
-    BASE_TARGET=rocm
-    ;;
-  *)
-    echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"
-    exit 1
-    ;;
-esac
-
-# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
-# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
-sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
-sudo systemctl daemon-reload
-sudo systemctl restart docker

 export DOCKER_BUILDKIT=1
 TOPDIR=$(git rev-parse --show-toplevel)
-tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')

-docker build \
-  --target final \
-  --progress plain \
-  --build-arg "BASE_TARGET=${BASE_TARGET}" \
-  --build-arg "DEVTOOLSET_VERSION=11" \
-  ${EXTRA_BUILD_ARGS} \
-  -t ${tmp_tag} \
-  $@ \
-  -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
-  ${TOPDIR}/.ci/docker/
+CUDA_VERSION=${CUDA_VERSION:-12.1}

-if [ -n "${CUDA_VERSION}" ]; then
+case ${CUDA_VERSION} in
+  cpu)
+    BASE_TARGET=base
+    DOCKER_TAG=cpu
+    ;;
+  all)
+    BASE_TARGET=all_cuda
+    DOCKER_TAG=latest
+    ;;
+  *)
+    BASE_TARGET=cuda${CUDA_VERSION}
+    DOCKER_TAG=cuda${CUDA_VERSION}
+    ;;
+esac
+
+
+(
+  set -x
+  # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
+  # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
+  sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
+  sudo systemctl daemon-reload
+  sudo systemctl restart docker
+
+  docker build \
+    --target final \
+    --progress plain \
+    --build-arg "BASE_TARGET=${BASE_TARGET}" \
+    --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
+    --build-arg "DEVTOOLSET_VERSION=11" \
+    -t ${DOCKER_IMAGE_NAME} \
+    $@ \
+    -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
+    ${TOPDIR}/.ci/docker/
+)
+
+if [[ "${DOCKER_TAG}" =~ ^cuda* ]]; then
  # Test that we're using the right CUDA compiler
-  docker run --rm "${tmp_tag}" nvcc --version | grep "cuda_${CUDA_VERSION}"
+  (
+    set -x
+    docker run --rm "${DOCKER_IMAGE_NAME}" nvcc --version | grep "cuda_${CUDA_VERSION}"
+  )
+fi
+
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE_NAME}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE_NAME}-${GIT_COMMIT_SHA}
+if [[ "${WITH_PUSH:-}" == true ]]; then
+  (
+    set -x
+    docker push "${DOCKER_IMAGE_NAME}"
+    if [[ -n ${GITHUB_REF} ]]; then
+        docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_BRANCH_TAG}
+        docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_SHA_TAG}
+        docker push "${DOCKER_IMAGE_BRANCH_TAG}"
+        docker push "${DOCKER_IMAGE_SHA_TAG}"
+    fi
+  )
 fi
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -1,8 +1,4 @@
 #!/bin/bash
-# The purpose of this script is to:
-# 1. Extract the set of parameters to be used for a docker build based on the provided image name.
-# 2. Run docker build with the parameters found in step 1.
-# 3. Run the built image and print out the expected and actual versions of packages installed.

 set -ex

@ -50,21 +46,30 @@ if [[ "$image" == *xla* ]]; then
  exit 0
 fi

-if [[ "$image" == *-jammy* ]]; then
+if [[ "$image" == *-focal* ]]; then
+  UBUNTU_VERSION=20.04
+elif [[ "$image" == *-jammy* ]]; then
  UBUNTU_VERSION=22.04
 elif [[ "$image" == *ubuntu* ]]; then
  extract_version_from_image_name ubuntu UBUNTU_VERSION
+elif [[ "$image" == *centos* ]]; then
+  extract_version_from_image_name centos CENTOS_VERSION
 fi

 if [ -n "${UBUNTU_VERSION}" ]; then
  OS="ubuntu"
+elif [ -n "${CENTOS_VERSION}" ]; then
+  OS="centos"
 else
  echo "Unable to derive operating system base..."
  exit 1
 fi

 DOCKERFILE="${OS}/Dockerfile"
-if [[ "$image" == *rocm* ]]; then
+# When using ubuntu - 22.04, start from Ubuntu docker image, instead of nvidia/cuda docker image.
+if [[ "$image" == *cuda* && "$UBUNTU_VERSION" != "22.04" ]]; then
+  DOCKERFILE="${OS}-cuda/Dockerfile"
+elif [[ "$image" == *rocm* ]]; then
  DOCKERFILE="${OS}-rocm/Dockerfile"
 elif [[ "$image" == *xpu* ]]; then
  DOCKERFILE="${OS}-xpu/Dockerfile"
@ -76,6 +81,9 @@ elif [[ "$image" == *linter* ]]; then
  DOCKERFILE="linter/Dockerfile"
 fi

+# CMake 3.18 is needed to support CUDA17 language variant
+CMAKE_VERSION=3.18.5
+
 _UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
 _UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
 if [[ "$image" == *rocm* ]]; then
@ -83,219 +91,226 @@ if [[ "$image" == *rocm* ]]; then
  _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
 fi

-tag=$(echo $image | awk -F':' '{print $2}')
-
 # It's annoying to rename jobs every time you want to rewrite a
 # configuration, so we hardcode everything here rather than do it
 # from scratch
-case "$tag" in
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11)
-    CUDA_VERSION=12.8.1
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=11
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    TRITON=yes
-    ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.8.1
+case "$image" in
+  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
+    CUDA_VERSION=12.4.1
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    ;;
+  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.4.1
+    CUDNN_VERSION=9
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.8.1
+  pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.4.1
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3.13-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.8.1
+  pytorch-linux-focal-cuda12.4-cudnn9-py3.13-gcc9-inductor-benchmarks)
+    CUDA_VERSION=12.4.1
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.13
    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.6.3
+  pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
+    CUDA_VERSION=11.8.0
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.6
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    TRITON=yes
-    INDUCTOR_BENCHMARKS=yes
-    ;;
-  pytorch-linux-jammy-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.6
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.12
-    GCC_VERSION=9
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    TRITON=yes
-    INDUCTOR_BENCHMARKS=yes
-    ;;
-  pytorch-linux-jammy-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.6
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.13
-    GCC_VERSION=9
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    TRITON=yes
-    INDUCTOR_BENCHMARKS=yes
-    ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.8.1
-    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=9
-    VISION=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    TRITON=yes
-    ;;
-  pytorch-linux-jammy-py3-clang12-onnx)
+  pytorch-linux-focal-py3-clang10-onnx)
    ANACONDA_PYTHON_VERSION=3.9
-    CLANG_VERSION=12
+    CLANG_VERSION=10
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
+    CONDA_CMAKE=yes
    ONNX=yes
    ;;
-  pytorch-linux-jammy-py3.9-clang12)
+  pytorch-linux-focal-py3.9-clang10)
    ANACONDA_PYTHON_VERSION=3.9
-    CLANG_VERSION=12
+    CLANG_VERSION=10
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
+    VULKAN_SDK_VERSION=1.2.162.1
+    SWIFTSHADER=yes
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-py3.11-clang12)
+  pytorch-linux-focal-py3.11-clang10)
    ANACONDA_PYTHON_VERSION=3.11
-    CLANG_VERSION=12
+    CLANG_VERSION=10
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
+    VULKAN_SDK_VERSION=1.2.162.1
+    SWIFTSHADER=yes
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-py3.9-gcc9)
+  pytorch-linux-focal-py3.9-gcc9)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-rocm-n-1-py3)
+  pytorch-linux-focal-rocm-n-1-py3)
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    ROCM_VERSION=6.2.4
+    NINJA_VERSION=1.9.0
+    CONDA_CMAKE=yes
+    TRITON=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    INDUCTOR_BENCHMARKS=yes
+    ;;
+  pytorch-linux-focal-rocm-n-py3)
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=11
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
    ROCM_VERSION=6.3
    NINJA_VERSION=1.9.0
+    CONDA_CMAKE=yes
    TRITON=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-jammy-rocm-n-py3)
-    ANACONDA_PYTHON_VERSION=3.10
+  pytorch-linux-jammy-xpu-2024.0-py3)
+    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
-    ROCM_VERSION=6.4
+    XPU_VERSION=0.5
    NINJA_VERSION=1.9.0
+    CONDA_CMAKE=yes
    TRITON=yes
-    KATEX=yes
-    UCX_COMMIT=${_UCX_COMMIT}
-    UCC_COMMIT=${_UCC_COMMIT}
-    INDUCTOR_BENCHMARKS=yes
    ;;
  pytorch-linux-jammy-xpu-2025.0-py3)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
    XPU_VERSION=2025.0
    NINJA_VERSION=1.9.0
-    TRITON=yes
-    ;;
-  pytorch-linux-jammy-xpu-2025.1-py3)
-    ANACONDA_PYTHON_VERSION=3.9
-    GCC_VERSION=11
-    VISION=yes
-    XPU_VERSION=2025.1
-    NINJA_VERSION=1.9.0
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
    pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
+    CONDA_CMAKE=yes
    TRITON=yes
    DOCS=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-clang12)
+  pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-clang12)
    ANACONDA_PYTHON_VERSION=3.9
-    CUDA_VERSION=12.8.1
+    CUDA_VERSION=11.8
    CUDNN_VERSION=9
    CLANG_VERSION=12
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
    TRITON=yes
    ;;
  pytorch-linux-jammy-py3-clang12-asan)
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=12
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
+    CONDA_CMAKE=yes
    TRITON=yes
    ;;
  pytorch-linux-jammy-py3-clang15-asan)
    ANACONDA_PYTHON_VERSION=3.10
    CLANG_VERSION=15
+    CONDA_CMAKE=yes
    VISION=yes
    ;;
  pytorch-linux-jammy-py3-clang18-asan)
    ANACONDA_PYTHON_VERSION=3.10
    CLANG_VERSION=18
+    CONDA_CMAKE=yes
    VISION=yes
    ;;
  pytorch-linux-jammy-py3.9-gcc11)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
+    CONDA_CMAKE=yes
    TRITON=yes
    DOCS=yes
    UNINSTALL_DILL=yes
@ -303,36 +318,44 @@ case "$tag" in
  pytorch-linux-jammy-py3-clang12-executorch)
    ANACONDA_PYTHON_VERSION=3.10
    CLANG_VERSION=12
+    CONDA_CMAKE=yes
    EXECUTORCH=yes
    ;;
  pytorch-linux-jammy-py3.12-halide)
-    CUDA_VERSION=12.6
+    CUDA_VERSION=12.4
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=11
+    CONDA_CMAKE=yes
    HALIDE=yes
    TRITON=yes
    ;;
  pytorch-linux-jammy-py3.12-triton-cpu)
-    CUDA_VERSION=12.6
+    CUDA_VERSION=12.4
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=11
+    CONDA_CMAKE=yes
    TRITON_CPU=yes
    ;;
-  pytorch-linux-jammy-linter)
+  pytorch-linux-focal-linter)
    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
    # We will need to update mypy version eventually, but that's for another day. The task
    # would be to upgrade mypy to 1.0.0 with Python 3.11
-    PYTHON_VERSION=3.9
+    ANACONDA_PYTHON_VERSION=3.9
+    CONDA_CMAKE=yes
    ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
-    PYTHON_VERSION=3.9
-    CUDA_VERSION=12.8.1
+  pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
+    ANACONDA_PYTHON_VERSION=3.9
+    CUDA_VERSION=11.8
+    CONDA_CMAKE=yes
    ;;
  pytorch-linux-jammy-aarch64-py3.10-gcc11)
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    ACL=yes
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
+    CONDA_CMAKE=yes
    # snadampal: skipping llvm src build install because the current version
    # from pytorch/llvm:9.0.1 is x86 specific
    SKIP_LLVM_SRC_BUILD_INSTALL=yes
@ -341,7 +364,10 @@ case "$tag" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    ACL=yes
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
+    CONDA_CMAKE=yes
    # snadampal: skipping llvm src build install because the current version
    # from pytorch/llvm:9.0.1 is x86 specific
    SKIP_LLVM_SRC_BUILD_INSTALL=yes
@ -349,6 +375,8 @@ case "$tag" in
    ;;
  *)
    # Catch-all for builds that are not hardcoded.
+    PROTOBUF=yes
+    DB=yes
    VISION=yes
    echo "image '$image' did not match an existing build configuration"
    if [[ "$image" == *py* ]]; then
@ -364,7 +392,8 @@ case "$tag" in
      TRITON=yes
      # To ensure that any ROCm config will build using conda cmake
      # and thus have LAPACK/MKL enabled
-      fi
+      CONDA_CMAKE=yes
+    fi
    if [[ "$image" == *centos7* ]]; then
      NINJA_VERSION=1.10.2
    fi
@ -380,37 +409,45 @@ case "$tag" in
    if [[ "$image" == *glibc* ]]; then
      extract_version_from_image_name glibc GLIBC_VERSION
    fi
+    if [[ "$image" == *cmake* ]]; then
+      extract_version_from_image_name cmake CMAKE_VERSION
+    fi
  ;;
 esac

 tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')

-no_cache_flag=""
-progress_flag=""
-# Do not use cache and progress=plain when in CI
-if [[ -n "${CI:-}" ]]; then
-  no_cache_flag="--no-cache"
-  progress_flag="--progress=plain"
+# Choose the base image: for cuDNN 9 use the plain CUDA devel image (cuDNN presumably installed separately); otherwise use the cuDNN-bundled image
+if [[ "$image" == *cuda*  && ${OS} == "ubuntu" ]]; then
+  IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
+  if [[ ${CUDNN_VERSION} == 9 ]]; then
+    IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
+  fi
 fi

 # Build image
 docker build \
-       ${no_cache_flag} \
-       ${progress_flag} \
+       --no-cache \
+       --progress=plain \
       --build-arg "BUILD_ENVIRONMENT=${image}" \
+       --build-arg "PROTOBUF=${PROTOBUF:-}" \
       --build-arg "LLVMDEV=${LLVMDEV:-}" \
+       --build-arg "DB=${DB:-}" \
       --build-arg "VISION=${VISION:-}" \
       --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \
+       --build-arg "CENTOS_VERSION=${CENTOS_VERSION}" \
       --build-arg "DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}" \
       --build-arg "GLIBC_VERSION=${GLIBC_VERSION}" \
       --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
       --build-arg "ANACONDA_PYTHON_VERSION=${ANACONDA_PYTHON_VERSION}" \
-       --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
       --build-arg "GCC_VERSION=${GCC_VERSION}" \
       --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
       --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
       --build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \
       --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
+       --build-arg "VULKAN_SDK_VERSION=${VULKAN_SDK_VERSION}" \
+       --build-arg "SWIFTSHADER=${SWIFTSHADER}" \
+       --build-arg "CMAKE_VERSION=${CMAKE_VERSION:-}" \
       --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
       --build-arg "KATEX=${KATEX:-}" \
       --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
@ -418,6 +455,7 @@ docker build \
       --build-arg "IMAGE_NAME=${IMAGE_NAME}" \
       --build-arg "UCX_COMMIT=${UCX_COMMIT}" \
       --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
+       --build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \
       --build-arg "TRITON=${TRITON}" \
       --build-arg "TRITON_CPU=${TRITON_CPU}" \
       --build-arg "ONNX=${ONNX}" \
@ -426,7 +464,6 @@ docker build \
       --build-arg "EXECUTORCH=${EXECUTORCH}" \
       --build-arg "HALIDE=${HALIDE}" \
       --build-arg "XPU_VERSION=${XPU_VERSION}" \
-       --build-arg "UNINSTALL_DILL=${UNINSTALL_DILL}" \
       --build-arg "ACL=${ACL:-}" \
       --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
       --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
@ -444,7 +481,7 @@ docker build \
 UBUNTU_VERSION=$(echo ${UBUNTU_VERSION} | sed 's/-rc$//')

 function drun() {
-  docker run --rm "$tmp_tag" "$@"
+  docker run --rm "$tmp_tag" $*
 }

 if [[ "$OS" == "ubuntu" ]]; then
@ -492,23 +529,3 @@ if [ -n "$KATEX" ]; then
    exit 1
  fi
 fi
-
-HAS_TRITON=$(drun python -c "import triton" > /dev/null 2>&1 && echo "yes" || echo "no")
-if [[ -n "$TRITON" || -n "$TRITON_CPU" ]]; then
-  if [ "$HAS_TRITON" = "no" ]; then
-    echo "expecting triton to be installed, but it is not"
-    exit 1
-  fi
-elif [ "$HAS_TRITON" = "yes" ]; then
-  echo "expecting triton to not be installed, but it is"
-  exit 1
-fi
-
-# Sanity check cmake version.  Executorch reinstalls cmake and I'm not sure if
-# they support 4.0.0 yet, so exclude them from this check.
-CMAKE_VERSION=$(drun cmake --version)
-if [[ "$EXECUTORCH" != *yes* && "$CMAKE_VERSION" != *4.* ]]; then
-  echo "CMake version is not 4.0.0:"
-  drun cmake --version
-  exit 1
-fi
--- a/.ci/docker/centos-rocm/Dockerfile
+++ b/.ci/docker/centos-rocm/Dockerfile
@ -17,8 +17,9 @@ RUN bash ./install_base.sh && rm install_base.sh
 # Update CentOS git version
 RUN yum -y remove git
 RUN yum -y remove git-*
-RUN yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm && \
-    sed -i 's/packages.endpoint/packages.endpointdev/' /etc/yum.repos.d/endpoint.repo
+RUN yum -y install https://packages.endpoint.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm || \
+    (yum -y install https://packages.endpointdev.com/rhel/7/os/x86_64/endpoint-repo-1.9-1.x86_64.rpm && \
+    sed -i "s/packages.endpoint/packages.endpointdev/" /etc/yum.repos.d/endpoint.repo)
 RUN yum install -y git

 # Install devtoolset
@ -39,7 +40,7 @@ RUN bash ./install_user.sh && rm install_user.sh

 # Install conda and other packages (e.g., numpy, pytest)
 ARG ANACONDA_PYTHON_VERSION
-ARG BUILD_ENVIRONMENT
+ARG CONDA_CMAKE
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
@ -47,6 +48,20 @@ COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
 RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt

+# (optional) Install protobuf for ONNX
+ARG PROTOBUF
+COPY ./common/install_protobuf.sh install_protobuf.sh
+RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
+RUN rm install_protobuf.sh
+ENV INSTALLED_PROTOBUF ${PROTOBUF}
+
+# (optional) Install database packages like LMDB and LevelDB
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -60,7 +75,7 @@ COPY ./common/install_rocm.sh install_rocm.sh
 RUN bash ./install_rocm.sh
 RUN rm install_rocm.sh
 COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
-RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
+RUN bash ./install_rocm_magma.sh
 RUN rm install_rocm_magma.sh
 COPY ./common/install_amdsmi.sh install_amdsmi.sh
 RUN bash ./install_amdsmi.sh
@ -74,6 +89,12 @@ ENV MAGMA_HOME /opt/rocm/magma
 ENV LANG en_US.utf8
 ENV LC_ALL en_US.utf8

+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
 COPY ./common/install_ninja.sh install_ninja.sh
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-56392aa978594cc155fa8af48cd949f5b5f1823a
+5e4d6b6380d575e48e37e9d987fded4ec588e7bc
--- a/.ci/docker/ci_commit_pins/nccl-cu12.txt
+++ b/.ci/docker/ci_commit_pins/nccl-cu12.txt
@ -1 +1 @@
-v2.27.3-1
+v2.25.1-1
--- a/.ci/docker/ci_commit_pins/triton-xpu.txt
+++ b/.ci/docker/ci_commit_pins/triton-xpu.txt
@ -1 +1 @@
-ae324eeac8e102a2b40370e341460f3791353398
+e98b6fcb8df5b44eb0d0addb6767c573d37ba024
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-c8757738a7418249896224430ce84888e8ecdd79
+4b3bb1f8da0ded6ccd572dd1358ef45af5a1befe
--- a/.ci/docker/common/install_acl.sh
+++ b/.ci/docker/common/install_acl.sh
@ -1,6 +1,6 @@
 set -euo pipefail

-readonly version=v25.02
+readonly version=v24.04
 readonly src_host=https://github.com/ARM-software
 readonly src_repo=ComputeLibrary

--- a/.ci/docker/common/install_base.sh
+++ b/.ci/docker/common/install_base.sh
@ -30,6 +30,18 @@ install_ubuntu() {
    maybe_libomp_dev=""
  fi

+  # HACK: UCC testing relies on libnccl library from NVIDIA repo, and version 2.16 crashes
+  # See https://github.com/pytorch/pytorch/pull/105260#issuecomment-1673399729
+  # TODO: Eliminate this hack, we should not rely on apt-get installation
+  # See https://github.com/pytorch/pytorch/issues/144768
+  if [[ "$UBUNTU_VERSION" == "20.04"* && "$CUDA_VERSION" == "11.8"* ]]; then
+    maybe_libnccl_dev="libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8 --allow-downgrades --allow-change-held-packages"
+  elif [[ "$UBUNTU_VERSION" == "20.04"* && "$CUDA_VERSION" == "12.4"* ]]; then
+    maybe_libnccl_dev="libnccl2=2.25.1-1+cuda12.4 libnccl-dev=2.25.1-1+cuda12.4 --allow-downgrades --allow-change-held-packages"
+  else
+    maybe_libnccl_dev=""
+  fi
+
  # Install common dependencies
  apt-get update
  # TODO: Some of these may not be necessary
@ -58,6 +70,7 @@ install_ubuntu() {
    libasound2-dev \
    libsndfile-dev \
    ${maybe_libomp_dev} \
+    ${maybe_libnccl_dev} \
    software-properties-common \
    wget \
    sudo \
@ -86,6 +99,9 @@ install_centos() {

  ccache_deps="asciidoc docbook-dtds docbook-style-xsl libxslt"
  numpy_deps="gcc-gfortran"
+  # Note: protobuf-c-{compiler,devel} on CentOS are too old to be used
+  # for Caffe2. That said, we still install them to make sure the build
+  # system opts to build/use protoc and libprotobuf from third-party.
  yum install -y \
    $ccache_deps \
    $numpy_deps \
--- a/.ci/docker/common/install_cache.sh
+++ b/.ci/docker/common/install_cache.sh
@ -9,7 +9,7 @@ install_ubuntu() {
  # Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh`
  apt-get install -y cargo
  echo "Checking out sccache repo"
-  git clone https://github.com/mozilla/sccache -b v0.10.0
+  git clone https://github.com/mozilla/sccache -b v0.9.1
  cd sccache
  echo "Building sccache"
  cargo build --release
--- a/.ci/docker/common/install_clang.sh
+++ b/.ci/docker/common/install_clang.sh
@ -4,10 +4,16 @@ set -ex

 if [ -n "$CLANG_VERSION" ]; then

-  if [[ $UBUNTU_VERSION == 22.04 ]]; then
+  if [[ $CLANG_VERSION == 9 && $UBUNTU_VERSION == 18.04 ]]; then
+    sudo apt-get update
+    # gpg-agent is not available by default on 18.04
+    sudo apt-get install  -y --no-install-recommends gpg-agent
+    wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add  -
+    apt-add-repository "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-${CLANG_VERSION} main"
+  elif [[ $UBUNTU_VERSION == 22.04 ]]; then
    # work around ubuntu apt-get conflicts
    sudo apt-get -y -f install
-    wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
+    wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add  -
    if [[ $CLANG_VERSION == 18 ]]; then
      apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main"
    fi
@ -35,7 +41,7 @@ if [ -n "$CLANG_VERSION" ]; then
  # clang's packaging is a little messed up (the runtime libs aren't
  # added into the linker path), so give it a little help
  clang_lib=("/usr/lib/llvm-$CLANG_VERSION/lib/clang/"*"/lib/linux")
-  echo "$clang_lib" >/etc/ld.so.conf.d/clang.conf
+  echo "$clang_lib" > /etc/ld.so.conf.d/clang.conf
  ldconfig

  # Cleanup package manager
--- a/.ci/docker/common/install_cmake.sh
+++ b/.ci/docker/common/install_cmake.sh
@ -0,0 +1,31 @@
+#!/bin/bash
+
+set -ex
+
+[ -n "$CMAKE_VERSION" ]
+
+# Remove system cmake install so it won't get used instead
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  ubuntu)
+    apt-get remove cmake -y
+    ;;
+  centos)
+    yum remove cmake -y
+    ;;
+  *)
+    echo "Unable to determine OS..."
+    exit 1
+    ;;
+esac
+
+# Turn 3.6.3 into v3.6
+path=$(echo "${CMAKE_VERSION}" | sed -e 's/\([0-9].[0-9]\+\).*/v\1/')
+file="cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz"
+
+# Download and install specific CMake version in /usr/local
+pushd /tmp
+curl -Os --retry 3 "https://cmake.org/files/${path}/${file}"
+tar -C /usr/local --strip-components 1 --no-same-owner -zxf cmake-*.tar.gz
+rm -f cmake-*.tar.gz
+popd
--- a/.ci/docker/common/install_conda.sh
+++ b/.ci/docker/common/install_conda.sh
@ -6,8 +6,8 @@ set -ex
 if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
  BASE_URL="https://repo.anaconda.com/miniconda"
  CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh"
-  if [[ $(uname -m) == "aarch64" ]] || [[ "$BUILD_ENVIRONMENT" == *xpu* ]] || [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
-    BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download"  # @lint-ignore
+  if [[ $(uname -m) == "aarch64" ]] || [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
+    BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download"
    CONDA_FILE="Miniforge3-Linux-$(uname -m).sh"
  fi

@ -62,16 +62,11 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then

  # libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30
  # which is provided in libstdcxx 12 and up.
-  conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge
-
-  # Miniforge installer doesn't install sqlite by default
-  if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
-    conda_install sqlite
-  fi
+  conda_install libstdcxx-ng=12.3.0 -c conda-forge

  # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
  if [[ $(uname -m) == "aarch64" ]]; then
-    conda_install "openblas==0.3.29=*openmp*"
+    conda_install "openblas==0.3.28=*openmp*"
  else
    conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
  fi
@ -80,11 +75,19 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
  # and libpython-static for torch deploy
  conda_install llvmdev=8.0.0 "libpython-static=${ANACONDA_PYTHON_VERSION}"

+  # Use conda cmake in some cases. Conda cmake will be newer than our supported
+  # min version (3.5 for xenial and 3.10 for bionic), so we only do it in the
+  # builds that we know should use conda. Specifically, Ubuntu bionic
+  # and focal cannot find conda mkl with stock cmake, so we need a cmake from conda
+  if [ -n "${CONDA_CMAKE}" ]; then
+    conda_install cmake
+  fi
+
  # Magma package names are concatenation of CUDA major and minor ignoring revision
  # I.e. magma-cuda102 package corresponds to CUDA_VERSION=10.2 and CUDA_VERSION=10.2.89
  # Magma is installed from a tarball in the ossci-linux bucket into the conda env
  if [ -n "$CUDA_VERSION" ]; then
-    conda_run ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION})
+    ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION}) ${ANACONDA_PYTHON_VERSION}
  fi

  # Install some other packages, including those needed for Python test reporting
--- a/.ci/docker/common/install_cpython.sh
+++ b/.ci/docker/common/install_cpython.sh
@ -3,11 +3,11 @@
 set -uex -o pipefail

 PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python
-PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads  # @lint-ignore
+PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads
 GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py

 # Python versions to be installed in /opt/$VERSION_NO
-CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"}
+CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"}

 function check_var {
    if [ -z "$1" ]; then
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -2,72 +2,207 @@

 set -ex

-arch_path=''
-targetarch=${TARGETARCH:-$(uname -m)}
-if [ ${targetarch} = 'amd64' ] || [ "${targetarch}" = 'x86_64' ]; then
-  arch_path='x86_64'
-else
-  arch_path='sbsa'
-fi
+NCCL_VERSION=v2.25.1-1
+CUDNN_VERSION=9.5.1.17

-function install_cuda {
-  version=$1
-  runfile=$2
-  major_minor=${version%.*}
-  rm -rf /usr/local/cuda-${major_minor} /usr/local/cuda
-  if [[ ${arch_path} == 'sbsa' ]]; then
-      runfile="${runfile}_sbsa"
-  fi
-  runfile="${runfile}.run"
-  wget -q https://developer.download.nvidia.com/compute/cuda/${version}/local_installers/${runfile} -O ${runfile}
-  chmod +x ${runfile}
-  ./${runfile} --toolkit --silent
-  rm -f ${runfile}
-  rm -f /usr/local/cuda && ln -s /usr/local/cuda-${major_minor} /usr/local/cuda
+function install_cusparselt_040 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
+    tar xf libcusparse_lt-linux-x86_64-0.4.0.7-archive.tar.xz
+    cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-x86_64-0.4.0.7-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
 }

-function install_cudnn {
-  cuda_major_version=$1
-  cudnn_version=$2
-  mkdir tmp_cudnn && cd tmp_cudnn
+function install_cusparselt_062 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz
+    tar xf libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz
+    cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+function install_cusparselt_063 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz
+    tar xf libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz
+    cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+function install_118 {
+    CUDNN_VERSION=9.1.0.70
+    NCCL_VERSION=v2.21.5-1
+    echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
+    rm -rf /usr/local/cuda-11.8 /usr/local/cuda
+    # install CUDA 11.8.0 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
+    chmod +x cuda_11.8.0_520.61.05_linux.run
+    ./cuda_11.8.0_520.61.05_linux.run --toolkit --silent
+    rm -f cuda_11.8.0_520.61.05_linux.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-11.8 /usr/local/cuda
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
+    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive.tar.xz
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda11-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf tmp_cudnn
+
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf nccl
+
+    install_cusparselt_040
+
+    ldconfig
+}
+
+function install_124 {
+  CUDNN_VERSION=9.1.0.70
+  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
+  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
+  # install CUDA 12.4.1 in the same container
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
+  chmod +x cuda_12.4.1_550.54.15_linux.run
+  ./cuda_12.4.1_550.54.15_linux.run --toolkit --silent
+  rm -f cuda_12.4.1_550.54.15_linux.run
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
+
  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-  filepath="cudnn-linux-${arch_path}-${cudnn_version}_cuda${cuda_major_version}-archive"
-  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-${arch_path}/${filepath}.tar.xz
-  tar xf ${filepath}.tar.xz
-  cp -a ${filepath}/include/* /usr/local/cuda/include/
-  cp -a ${filepath}/lib/* /usr/local/cuda/lib64/
+  mkdir tmp_cudnn && cd tmp_cudnn
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
  cd ..
  rm -rf tmp_cudnn
+
+  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl && make -j src.build
+  cp -a build/include/* /usr/local/cuda/include/
+  cp -a build/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf nccl
+
+  install_cusparselt_062
+
+  ldconfig
 }

 function install_126 {
-  CUDNN_VERSION=9.10.2.21
-  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
-  install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux
+  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
+  rm -rf /usr/local/cuda-12.6 /usr/local/cuda
+  # install CUDA 12.6.3 in the same container
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run
+  chmod +x cuda_12.6.3_560.35.05_linux.run
+  ./cuda_12.6.3_560.35.05_linux.run --toolkit --silent
+  rm -f cuda_12.6.3_560.35.05_linux.run
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda

-  install_cudnn 12 $CUDNN_VERSION
+  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+  mkdir tmp_cudnn && cd tmp_cudnn
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf tmp_cudnn

-  CUDA_VERSION=12.6 bash install_nccl.sh
+  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl && make -j src.build
+  cp -a build/include/* /usr/local/cuda/include/
+  cp -a build/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf nccl

-  CUDA_VERSION=12.6 bash install_cusparselt.sh
+  install_cusparselt_063

  ldconfig
 }

-function install_129 {
-  CUDNN_VERSION=9.10.2.21
-  echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
-  # install CUDA 12.9.1 in the same container
-  install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux
+function prune_118 {
+    echo "Pruning CUDA 11.8 and cuDNN"
+    #####################################################################################
+    # CUDA 11.8 prune static libs
+    #####################################################################################
+    export NVPRUNE="/usr/local/cuda-11.8/bin/nvprune"
+    export CUDA_LIB_DIR="/usr/local/cuda-11.8/lib64"

-  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-  install_cudnn 12 $CUDNN_VERSION
+    export GENCODE="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+    export GENCODE_CUDNN="-gencode arch=compute_35,code=sm_35 -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"

-  CUDA_VERSION=12.9 bash install_nccl.sh
+    if [[ -n "$OVERRIDE_GENCODE" ]]; then
+        export GENCODE=$OVERRIDE_GENCODE
+    fi

-  CUDA_VERSION=12.9 bash install_cusparselt.sh
+    # all CUDA libs except CuDNN and CuBLAS (cudnn and cublas need arch 3.7 included)
+    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
+      | xargs -I {} bash -c \
+                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"

-  ldconfig
+    # prune CuDNN and CuBLAS
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+    #####################################################################################
+    # CUDA 11.8 prune visual tools
+    #####################################################################################
+    export CUDA_BASE="/usr/local/cuda-11.8/"
+    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2022.3.0 $CUDA_BASE/nsight-systems-2022.4.2/
+}
+
+function prune_124 {
+  echo "Pruning CUDA 12.4"
+  #####################################################################################
+  # CUDA 12.4 prune static libs
+  #####################################################################################
+  export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
+  export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
+
+  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+  if [[ -n "$OVERRIDE_GENCODE" ]]; then
+      export GENCODE=$OVERRIDE_GENCODE
+  fi
+  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
+      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
+  fi
+
+  # all CUDA libs except CuDNN and CuBLAS
+  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
+      | xargs -I {} bash -c \
+                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+  # prune CuDNN and CuBLAS
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+  #####################################################################################
+  # CUDA 12.4 prune visual tools
+  #####################################################################################
+  export CUDA_BASE="/usr/local/cuda-12.4/"
+  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
 }

 function prune_126 {
@ -105,17 +240,35 @@ function prune_126 {
 }

 function install_128 {
-  CUDNN_VERSION=9.8.0.87
-  echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
-  # install CUDA 12.8.1 in the same container
-  install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux
+  CUDNN_VERSION=9.7.1.26
+  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
+  rm -rf /usr/local/cuda-12.8 /usr/local/cuda
+  # install CUDA 12.8.0 in the same container
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux.run
+  chmod +x cuda_12.8.0_570.86.10_linux.run
+  ./cuda_12.8.0_570.86.10_linux.run --toolkit --silent
+  rm -f cuda_12.8.0_570.86.10_linux.run
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.8 /usr/local/cuda

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-  install_cudnn 12 $CUDNN_VERSION
+  mkdir tmp_cudnn && cd tmp_cudnn
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf tmp_cudnn

-  CUDA_VERSION=12.8 bash install_nccl.sh
+  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl && make -j src.build
+  cp -a build/include/* /usr/local/cuda/include/
+  cp -a build/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf nccl

-  CUDA_VERSION=12.8 bash install_cusparselt.sh
+  install_cusparselt_063

  ldconfig
 }
@ -124,11 +277,13 @@ function install_128 {
 while test $# -gt 0
 do
    case "$1" in
-    12.6|12.6.*) install_126; prune_126
+    11.8) install_118; prune_118
        ;;
-    12.8|12.8.*) install_128;
+    12.4) install_124; prune_124
        ;;
-    12.9|12.9.*) install_129;
+    12.6) install_126; prune_126
+        ;;
+    12.8) install_128;
        ;;
    *) echo "bad argument $1"; exit 1
        ;;
--- a/.ci/docker/common/install_cuda_aarch64.sh
+++ b/.ci/docker/common/install_cuda_aarch64.sh
@ -0,0 +1,211 @@
+#!/bin/bash
+# Script used only in CD pipeline
+
+set -ex
+
+NCCL_VERSION=v2.21.5-1
+CUDNN_VERSION=9.5.1.17
+
+function install_cusparselt_062 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.6.2.3-archive.tar.xz
+    tar xf libcusparse_lt-linux-sbsa-0.6.2.3-archive.tar.xz
+    cp -a libcusparse_lt-linux-sbsa-0.6.2.3-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-sbsa-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+function install_cusparselt_063 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.6.3.2-archive.tar.xz
+    tar xf libcusparse_lt-linux-sbsa-0.6.3.2-archive.tar.xz
+    cp -a libcusparse_lt-linux-sbsa-0.6.3.2-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-sbsa-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+function install_124 {
+  CUDNN_VERSION=9.1.0.70
+  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
+  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
+  # install CUDA 12.4.1 in the same container
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux_sbsa.run
+  chmod +x cuda_12.4.1_550.54.15_linux_sbsa.run
+  ./cuda_12.4.1_550.54.15_linux_sbsa.run --toolkit --silent
+  rm -f cuda_12.4.1_550.54.15_linux_sbsa.run
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
+
+  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+  mkdir tmp_cudnn && cd tmp_cudnn
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf tmp_cudnn
+
+  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+  git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl && make -j src.build
+  cp -a build/include/* /usr/local/cuda/include/
+  cp -a build/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf nccl
+
+  install_cusparselt_062  # CUDA 12.4 pairs with cuSparseLt 0.6.2, as announced in the banner above
+
+  ldconfig
+}
+
+function prune_124 {
+  echo "Pruning CUDA 12.4"
+  #####################################################################################
+  # CUDA 12.4 prune static libs
+  #####################################################################################
+  export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
+  export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
+
+  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+  if [[ -n "$OVERRIDE_GENCODE" ]]; then
+      export GENCODE=$OVERRIDE_GENCODE
+  fi
+
+  # all CUDA libs except CuDNN and CuBLAS
+  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
+      | xargs -I {} bash -c \
+                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+  # prune CuDNN and CuBLAS
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+  #####################################################################################
+  # CUDA 12.4 prune visual tools
+  #####################################################################################
+  export CUDA_BASE="/usr/local/cuda-12.4/"
+  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
+}
+
+function install_126 {
+  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
+  rm -rf /usr/local/cuda-12.6 /usr/local/cuda
+  # install CUDA 12.6.3 in the same container
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux_sbsa.run
+  chmod +x cuda_12.6.3_560.35.05_linux_sbsa.run
+  ./cuda_12.6.3_560.35.05_linux_sbsa.run --toolkit --silent
+  rm -f cuda_12.6.3_560.35.05_linux_sbsa.run
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
+
+  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+  mkdir tmp_cudnn && cd tmp_cudnn
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf tmp_cudnn
+
+  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+  git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl && make -j src.build
+  cp -a build/include/* /usr/local/cuda/include/
+  cp -a build/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf nccl
+
+  install_cusparselt_063
+
+  ldconfig
+}
+
+function prune_126 {
+  echo "Pruning CUDA 12.6"
+  #####################################################################################
+  # CUDA 12.6 prune static libs
+  #####################################################################################
+  export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
+  export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
+
+  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+  if [[ -n "$OVERRIDE_GENCODE" ]]; then
+      export GENCODE=$OVERRIDE_GENCODE
+  fi
+  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
+      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
+  fi
+
+  # all CUDA libs except CuDNN and CuBLAS
+  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
+      | xargs -I {} bash -c \
+                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+  # prune CuDNN and CuBLAS
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+  #####################################################################################
+  # CUDA 12.6 prune visual tools
+  #####################################################################################
+  export CUDA_BASE="/usr/local/cuda-12.6/"
+  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
+}
+
+function install_128 {
+  CUDNN_VERSION=9.7.1.26
+  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
+  rm -rf /usr/local/cuda-12.8 /usr/local/cuda
+  # install CUDA 12.8.0 in the same container
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux_sbsa.run
+  chmod +x cuda_12.8.0_570.86.10_linux_sbsa.run
+  ./cuda_12.8.0_570.86.10_linux_sbsa.run --toolkit --silent
+  rm -f cuda_12.8.0_570.86.10_linux_sbsa.run
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.8 /usr/local/cuda
+
+  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+  mkdir tmp_cudnn && cd tmp_cudnn
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf tmp_cudnn
+
+  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+  git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl && make -j src.build
+  cp -a build/include/* /usr/local/cuda/include/
+  cp -a build/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf nccl
+
+  install_cusparselt_063
+
+  ldconfig
+}
+
+# idiomatic parameter and option handling in sh
+while test $# -gt 0
+do
+    case "$1" in
+    12.4) install_124; prune_124
+        ;;
+    12.6) install_126; prune_126
+        ;;
+    12.8) install_128;
+        ;;
+    *) echo "bad argument $1"; exit 1
+        ;;
+    esac
+    shift
+done
--- a/.ci/docker/common/install_cudnn.sh
+++ b/.ci/docker/common/install_cudnn.sh
@ -4,10 +4,12 @@ if [[ -n "${CUDNN_VERSION}" ]]; then
    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
    mkdir tmp_cudnn
    pushd tmp_cudnn
-    if [[ ${CUDA_VERSION:0:4} == "12.9" || ${CUDA_VERSION:0:4} == "12.8" ]]; then
-        CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
+    if [[ ${CUDA_VERSION:0:4} == "12.8" ]]; then
+        CUDNN_NAME="cudnn-linux-x86_64-9.7.1.26_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
-        CUDNN_NAME="cudnn-linux-x86_64-9.10.2.21_cuda12-archive"
+        CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive"
+    elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
+        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
    else
--- a/.ci/docker/common/install_cusparselt.sh
+++ b/.ci/docker/common/install_cusparselt.sh
@ -5,14 +5,25 @@ set -ex
 # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
 mkdir tmp_cusparselt && cd tmp_cusparselt

-if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
+if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-8]$ ]]; then
    arch_path='sbsa'
    export TARGETARCH=${TARGETARCH:-$(uname -m)}
    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
        arch_path='x86_64'
    fi
-    CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.7.1.0-archive"
+    CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.6.3.2-archive"
    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
+elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then
+    arch_path='sbsa'
+    export TARGETARCH=${TARGETARCH:-$(uname -m)}
+    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
+        arch_path='x86_64'
+    fi
+    CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.6.2.3-archive"
+    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
+elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
+    CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
+    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
 else
    echo "Not sure which libcusparselt version to install for this ${CUDA_VERSION}"
 fi
--- a/.ci/docker/common/install_db.sh
+++ b/.ci/docker/common/install_db.sh
@ -0,0 +1,38 @@
+#!/bin/bash
+
+set -ex
+
+install_ubuntu() {
+  apt-get update
+
+  # Cleanup
+  apt-get autoclean && apt-get clean
+  rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+}
+
+install_centos() {
+  # Need EPEL for many packages we depend on.
+  # See http://fedoraproject.org/wiki/EPEL
+  yum --enablerepo=extras install -y epel-release
+
+  # Cleanup
+  yum clean all
+  rm -rf /var/cache/yum
+  rm -rf /var/lib/yum/yumdb
+  rm -rf /var/lib/yum/history
+}
+
+# Install base packages depending on the base OS
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  ubuntu)
+    install_ubuntu
+    ;;
+  centos)
+    install_centos
+    ;;
+  *)
+    echo "Unable to determine OS..."
+    exit 1
+    ;;
+esac
--- a/.ci/docker/common/install_executorch.sh
+++ b/.ci/docker/common/install_executorch.sh
@ -13,7 +13,7 @@ clone_executorch() {
  # and fetch the target commit
  pushd executorch
  git checkout "${EXECUTORCH_PINNED_COMMIT}"
-  git submodule update --init --recursive
+  git submodule update --init
  popd

  chown -R jenkins executorch
@ -50,9 +50,10 @@ setup_executorch() {
  pushd executorch

  export PYTHON_EXECUTABLE=python
-  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
+  export EXECUTORCH_BUILD_PYBIND=ON
+  export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

-  as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
+  as_jenkins .ci/scripts/setup-linux.sh cmake || true
  popd
 }

--- a/.ci/docker/common/install_halide.sh
+++ b/.ci/docker/common/install_halide.sh
@ -17,7 +17,7 @@ if [ -n "${UBUNTU_VERSION}" ];then
                  libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev
 fi

-pip_install numpy scipy imageio cmake ninja
+conda_install numpy scipy imageio cmake ninja

 git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git
 cmake -DCMAKE_BUILD_TYPE=Release \
@ -35,9 +35,7 @@ git clone https://github.com/halide/Halide.git
 pushd Halide
 git checkout ${COMMIT} && git submodule update --init --recursive
 pip_install -r requirements.txt
-# NOTE: pybind has a requirement for cmake > 3.5 so set the minimum cmake version here with a flag
-#       Context: https://github.com/pytorch/pytorch/issues/150420
-cmake -G Ninja -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release -S . -B build
+cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
 cmake --build build
 test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
 cmake --install build --prefix ${CONDA_PREFIX}
--- a/.ci/docker/common/install_inductor_benchmark_deps.sh
+++ b/.ci/docker/common/install_inductor_benchmark_deps.sh
@ -14,9 +14,16 @@ function install_timm() {
  local commit
  commit=$(get_pinned_commit timm)

+  # TODO (huydhn): There is no torchvision release for 3.13 at the time of writing, so
+  # I'm using nightly here instead. We just need the package to be able to install
+  # TIMM. Remove this once vision has a release on 3.13.
+  if [[ "${ANACONDA_PYTHON_VERSION}" == "3.13" ]]; then
+    pip_install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
+  fi
+
  pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
  # Clean up
-  conda_run pip uninstall -y torch torchvision triton
+  conda_run pip uninstall -y cmake torch torchvision triton
 }

 # Pango is needed for weasyprint which is needed for doctr
--- a/.ci/docker/common/install_linter.sh
+++ b/.ci/docker/common/install_linter.sh
@ -2,6 +2,8 @@

 set -ex

+source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
+
 if [ -n "${UBUNTU_VERSION}" ]; then
  apt update
  apt-get install -y clang doxygen git graphviz nodejs npm libtinfo5
@ -13,8 +15,8 @@ chown -R jenkins pytorch

 pushd pytorch
 # Install all linter dependencies
-pip install -r requirements.txt
-lintrunner init
+pip_install -r requirements.txt
+conda_run lintrunner init

 # Cache .lintbin directory as part of the Docker image
 cp -r .lintbin /tmp
--- a/.ci/docker/common/install_magma_conda.sh
+++ b/.ci/docker/common/install_magma_conda.sh
@ -1,23 +1,26 @@
 #!/usr/bin/env bash
-# Script that installs magma from tarball inside conda environment.
-# It replaces anaconda magma-cuda package which is no longer published.
-# Execute it inside active conda environment.
-# See issue: https://github.com/pytorch/pytorch/issues/138506
+# Script that replaces the magma install from a conda package

 set -eou pipefail

-cuda_version_nodot=${1/./}
-anaconda_dir=${CONDA_PREFIX:-"$(dirname $(which conda))/../"}
+function do_install() {
+    cuda_version_nodot=${1/./}
+    anaconda_python_version=$2

-MAGMA_VERSION="2.6.1"
-magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
-(
-    set -x
-    tmp_dir=$(mktemp -d)
-    pushd ${tmp_dir}
-    curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
-    tar -xvf "${magma_archive}"
-    mv include/* "${anaconda_dir}/include/"
-    mv lib/* "${anaconda_dir}/lib"
-    popd
-)
+    MAGMA_VERSION="2.6.1"
+    magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
+
+    anaconda_dir="/opt/conda/envs/py_${anaconda_python_version}"
+    (
+        set -x
+        tmp_dir=$(mktemp -d)
+        pushd ${tmp_dir}
+        curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
+        tar -xvf "${magma_archive}"
+        mv include/* "${anaconda_dir}/include/"
+        mv lib/* "${anaconda_dir}/lib"
+        popd
+    )
+}
+
+do_install $1 $2
--- a/.ci/docker/common/install_nccl.sh
+++ b/.ci/docker/common/install_nccl.sh
@ -1,26 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-NCCL_VERSION=""
-if [[ ${CUDA_VERSION:0:2} == "11" ]]; then
-  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt)
-elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
-  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt)
-else
-  echo "Unexpected CUDA_VERSION ${CUDA_VERSION}"
-  exit 1
-fi
-
-if [[ -n "${NCCL_VERSION}" ]]; then
-  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
-  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
-  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
-  pushd nccl
-  make -j src.build
-  cp -a build/include/* /usr/local/cuda/include/
-  cp -a build/lib/* /usr/local/cuda/lib64/
-  popd
-  rm -rf nccl
-  ldconfig
-fi
--- a/.ci/docker/common/install_ninja.sh
+++ b/.ci/docker/common/install_ninja.sh
@ -4,15 +4,10 @@ set -ex

 [ -n "$NINJA_VERSION" ]

-arch=$(uname -m)
-if [ "$arch" == "aarch64" ]; then
-    url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux-aarch64.zip"
-else
-    url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip"
-fi
+url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip"

 pushd /tmp
 wget --no-verbose --output-document=ninja-linux.zip "$url"
 unzip ninja-linux.zip -d /usr/local/bin
 rm -f ninja-linux.zip
-popd
+popd
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -8,6 +8,16 @@ retry () {
    "$@" || (sleep 10 && "$@") || (sleep 20 && "$@") || (sleep 40 && "$@")
 }

+# A bunch of custom pip dependencies for ONNX
+pip_install \
+  beartype==0.15.0 \
+  filelock==3.9.0 \
+  flatbuffers==2.0 \
+  mock==5.0.1 \
+  ninja==1.10.2 \
+  networkx==2.5 \
+  numpy==1.24.2
+
 # ONNXRuntime should be installed before installing
 # onnx-weekly. Otherwise, onnx-weekly could be
 # overwritten by onnx.
@ -19,8 +29,12 @@ pip_install \
  transformers==4.36.2

 pip_install coloredlogs packaging
+
 pip_install onnxruntime==1.18.1
-pip_install onnxscript==0.3.0
+pip_install onnx==1.17.0
+pip_install onnxscript==0.1.0 --no-deps
+# required by onnxscript
+pip_install ml_dtypes

 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
 # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
--- a/.ci/docker/common/install_openblas.sh
+++ b/.ci/docker/common/install_openblas.sh
@ -4,7 +4,8 @@
 set -ex

 cd /
-git clone https://github.com/OpenMathLib/OpenBLAS.git -b "${OPENBLAS_VERSION:-v0.3.29}" --depth 1 --shallow-submodules
+git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.28 --depth 1 --shallow-submodules
+

 OPENBLAS_BUILD_FLAGS="
 NUM_THREADS=128
--- a/.ci/docker/common/install_protobuf.sh
+++ b/.ci/docker/common/install_protobuf.sh
@ -0,0 +1,19 @@
+#!/bin/bash
+
+set -ex
+
+pb_dir="/usr/temp_pb_install_dir"
+mkdir -p $pb_dir
+
+# On the nvidia/cuda:9-cudnn7-devel-centos7 image we need this symlink or
+# else it will fail with
+#   g++: error: ./../lib64/crti.o: No such file or directory
+ln -s /usr/lib64 "$pb_dir/lib64"
+
+curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3
+
+tar -xvz --no-same-owner -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz
+NPROC=$(( $(nproc) > 2 ? $(nproc) - 2 : 1 ))  # leave two cores free, but never drop below one job
+pushd "$pb_dir" && ./configure && make -j${NPROC} && make -j${NPROC} check && sudo make -j${NPROC} install && sudo ldconfig
+popd
+rm -rf $pb_dir
--- a/.ci/docker/common/install_python.sh
+++ b/.ci/docker/common/install_python.sh
@ -1,15 +0,0 @@
-#!/bin/bash
-set -ex
-
-apt-get update
-# Use deadsnakes in case we need an older python version
-sudo add-apt-repository ppa:deadsnakes/ppa
-apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python3-pip python${PYTHON_VERSION}-venv
-
-# Use a venv because uv and some other package managers don't support --user install
-ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
-python -m venv /var/lib/jenkins/ci_env
-source /var/lib/jenkins/ci_env/bin/activate
-
-python -mpip install --upgrade pip
-python -mpip install -r /opt/requirements-ci.txt
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -8,6 +8,10 @@ ver() {

 install_ubuntu() {
    apt-get update
+    if [[ $UBUNTU_VERSION == 18.04 ]]; then
+      # gpg-agent is not available by default on 18.04
+      apt-get install -y --no-install-recommends gpg-agent
+    fi
    if [[ $UBUNTU_VERSION == 20.04 ]]; then
      # gpg-agent is not available by default on 20.04
      apt-get install -y --no-install-recommends gpg-agent
@ -19,18 +23,6 @@ install_ubuntu() {
    apt-get install -y libc++1
    apt-get install -y libc++abi1

-    # Make sure rocm packages from repo.radeon.com have highest priority
-    cat << EOF > /etc/apt/preferences.d/rocm-pin-600
-Package: *
-Pin: release o=repo.radeon.com
-Pin-Priority: 600
-EOF
-
-    # we want the patch version of 6.4 instead
-    if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
-        ROCM_VERSION="${ROCM_VERSION}.1"
-    fi
-
    # Add amdgpu repository
    UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
    echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
@ -71,29 +63,17 @@ EOF
    done

    # ROCm 6.3 had a regression where initializing static code objects had significant overhead
-    # ROCm 6.4 did not yet fix the regression, also HIP branch names are different
-    if [[ $(ver $ROCM_VERSION) -ge $(ver 6.3) ]] && [[ $(ver $ROCM_VERSION) -lt $(ver 7.0) ]]; then
-        if [[ $(ver $ROCM_VERSION) -eq $(ver 6.4.1) ]]; then
-            HIP_BRANCH=release/rocm-rel-6.4
-            VER_STR=6.4
-            VER_PATCH=.1
-        elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.4) ]]; then
-            HIP_BRANCH=release/rocm-rel-6.4
-            VER_STR=6.4
-        elif [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then
-            HIP_BRANCH=rocm-6.3.x
-            VER_STR=6.3
-        fi
+    if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then
        # clr build needs CppHeaderParser but can only find it using conda's python
        /opt/conda/bin/python -m pip install CppHeaderParser
-        git clone https://github.com/ROCm/HIP -b $HIP_BRANCH
+        git clone https://github.com/ROCm/HIP -b rocm-6.3.x
        HIP_COMMON_DIR=$(readlink -f HIP)
-        git clone https://github.com/jeffdaily/clr -b release/rocm-rel-${VER_STR}${VER_PATCH}-statco-hotfix
+        git clone https://github.com/jeffdaily/clr -b release/rocm-rel-6.3-statco-hotfix
        mkdir -p clr/build
        pushd clr/build
        cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
        make -j
-        cp hipamd/lib/libamdhip64.so.${VER_STR}.* /opt/rocm/lib/libamdhip64.so.${VER_STR}.*
+        cp hipamd/lib/libamdhip64.so.6.3.* /opt/rocm/lib/libamdhip64.so.6.3.*
        popd
        rm -rf HIP clr
    fi
--- a/.ci/docker/common/install_rocm_drm.sh
+++ b/.ci/docker/common/install_rocm_drm.sh
@ -115,7 +115,7 @@ index a5007ffc..13fa07fc 100644
 	if (!fp) {
 -		fprintf(stderr, "%s: %s\n", AMDGPU_ASIC_ID_TABLE,
 -			strerror(errno));
-+		//fprintf(stderr, "amdgpu.ids: No such file or directory\n");
+		fprintf(stderr, "amdgpu.ids: No such file or directory\n");
 		return;
 	}

--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -1,32 +1,50 @@
-#!/usr/bin/env bash
-# Script used only in CD pipeline
+#!/bin/bash
+# Script used in CI and CD pipeline

-set -eou pipefail
+set -ex

-function do_install() {
-    rocm_version=$1
-    rocm_version_nodot=${1//./}
+# Magma build scripts need `python`
+ln -sf /usr/bin/python3 /usr/bin/python

-    # Version 2.7.2 + ROCm related updates
-    MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
-    magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  almalinux)
+    yum install -y gcc-gfortran
+    ;;
+  *)
+    echo "No preinstalls to build magma..."
+    ;;
+esac

-    rocm_dir="/opt/rocm"
-    (
-        set -x
-        tmp_dir=$(mktemp -d)
-        pushd ${tmp_dir}
-        curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
-        if tar -xvf "${magma_archive}"
-        then
-            mkdir -p "${rocm_dir}/magma"
-            mv include "${rocm_dir}/magma/include"
-            mv lib "${rocm_dir}/magma/lib"
-        else
-            echo "${magma_archive} not found, skipping magma install"
-        fi
-        popd
-    )
-}
+MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}

-do_install $1
+# "install" hipMAGMA into /opt/rocm/magma by copying after build
+git clone https://bitbucket.org/icl/magma.git
+pushd magma
+
+# Version 2.7.2 + ROCm related updates
+git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6
+
+cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
+echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
+if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then
+    echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc
+fi
+echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc
+echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
+export PATH="${PATH}:/opt/rocm/bin"
+if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
+  amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'`
+else
+  amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs`
+fi
+for arch in $amdgpu_targets; do
+  echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc
+done
+# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
+sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
+make -f make.gen.hipMAGMA -j $(nproc)
+LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}"
+make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}"
+popd
+mv magma /opt/rocm
--- a/.ci/docker/common/install_swiftshader.sh
+++ b/.ci/docker/common/install_swiftshader.sh
@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -ex
+
+[ -n "${SWIFTSHADER}" ]
+
+retry () {
+    "$@"  || (sleep 1 && "$@") || (sleep 2 && "$@") || (sleep 4 && "$@") || (sleep 8 && "$@")
+}
+
+_https_amazon_aws=https://ossci-android.s3.amazonaws.com
+
+# SwiftShader
+_swiftshader_dir=/var/lib/jenkins/swiftshader
+_swiftshader_file_targz=swiftshader-abe07b943-prebuilt.tar.gz
+mkdir -p $_swiftshader_dir
+_tmp_swiftshader_targz="/tmp/${_swiftshader_file_targz}"
+
+curl --silent --show-error --location --fail --retry 3 \
+  --output "${_tmp_swiftshader_targz}" "$_https_amazon_aws/${_swiftshader_file_targz}"
+
+tar -C "${_swiftshader_dir}" -xzf "${_tmp_swiftshader_targz}"
+
+export VK_ICD_FILENAMES="${_swiftshader_dir}/build/Linux/vk_swiftshader_icd.json"
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@ -2,16 +2,14 @@

 set -ex

-mkdir -p /opt/triton
-if [ -z "${TRITON}" ] && [ -z "${TRITON_CPU}" ]; then
-  echo "TRITON and TRITON_CPU are not set. Exiting..."
-  exit 0
-fi
-
 source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

-get_pip_version() {
-  conda_run pip list | grep -w $* | head -n 1 | awk '{print $2}'
+get_conda_version() {
+  as_jenkins conda list -n py_$ANACONDA_PYTHON_VERSION | grep -w $* | head -n 1 | awk '{print $2}'
+}
+
+conda_reinstall() {
+  as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $*
 }

 if [ -n "${XPU_VERSION}" ]; then
@ -33,9 +31,11 @@ if [ -n "${UBUNTU_VERSION}" ];then
    apt-get install -y gpg-agent
 fi

-# Keep the current cmake and numpy version here, so we can reinstall them later
-CMAKE_VERSION=$(get_pip_version cmake)
-NUMPY_VERSION=$(get_pip_version numpy)
+if [ -n "${CONDA_CMAKE}" ]; then
+  # Keep the current cmake and numpy version here, so we can reinstall them later
+  CMAKE_VERSION=$(get_conda_version cmake)
+  NUMPY_VERSION=$(get_conda_version numpy)
+fi

 if [ -z "${MAX_JOBS}" ]; then
    export MAX_JOBS=$(nproc)
@ -51,13 +51,7 @@ as_jenkins git clone --recursive ${TRITON_REPO} triton
 cd triton
 as_jenkins git checkout ${TRITON_PINNED_COMMIT}
 as_jenkins git submodule update --init --recursive
-
-# Old versions of python have setup.py in ./python; newer versions have it in ./
-if [ ! -f setup.py ]; then
-  cd python
-fi
-
-pip_install pybind11==2.13.6
+cd python

 # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
 as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py
@ -66,38 +60,28 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}"
  # Triton needs at least gcc-9 to build
  apt-get install -y g++-9

-  CXX=g++-9 conda_run python setup.py bdist_wheel
+  CXX=g++-9 pip_install -e .
 elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
  # Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
  add-apt-repository -y ppa:ubuntu-toolchain-r/test
  apt-get install -y g++-9

-  CXX=g++-9 conda_run python setup.py bdist_wheel
+  CXX=g++-9 pip_install -e .
 else
-  conda_run python setup.py bdist_wheel
+  pip_install -e .
 fi

-# Copy the wheel to /opt for multi stage docker builds
-cp dist/*.whl /opt/triton
-# Install the wheel for docker builds that don't use multi stage
-pip_install dist/*.whl
-
-# TODO: This is to make sure that the same cmake and numpy version from install conda
-# script is used. Without this step, the newer cmake version (3.25.2) downloaded by
-# triton build step via pip will fail to detect conda MKL. Once that issue is fixed,
-# this can be removed.
-#
-# The correct numpy version also needs to be set here because conda claims that it
-# causes inconsistent environment.  Without this, conda will attempt to install the
-# latest numpy version, which fails ASAN tests with the following import error: Numba
-# needs NumPy 1.20 or less.
-# Note that we install numpy with pip as conda might not have the version we want
-if [ -n "${CMAKE_VERSION}" ]; then
-  pip_install "cmake==${CMAKE_VERSION}"
-fi
-if [ -n "${NUMPY_VERSION}" ]; then
-  pip_install "numpy==${NUMPY_VERSION}"
-fi
-if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then
-  pip_install helion
+if [ -n "${CONDA_CMAKE}" ]; then
+  # TODO: This is to make sure that the same cmake and numpy version from install conda
+  # script is used. Without this step, the newer cmake version (3.25.2) downloaded by
+  # triton build step via pip will fail to detect conda MKL. Once that issue is fixed,
+  # this can be removed.
+  #
+  # The correct numpy version also needs to be set here because conda claims that it
+  # causes inconsistent environment.  Without this, conda will attempt to install the
+  # latest numpy version, which fails ASAN tests with the following import error: Numba
+  # needs NumPy 1.20 or less.
+  conda_reinstall cmake="${CMAKE_VERSION}"
+  # Note that we install numpy with pip as conda might not have the version we want
+  pip_install --force-reinstall numpy=="${NUMPY_VERSION}"
 fi
--- a/.ci/docker/common/install_vulkan_sdk.sh
+++ b/.ci/docker/common/install_vulkan_sdk.sh
@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -ex
+
+[ -n "${VULKAN_SDK_VERSION}" ]
+
+retry () {
+    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
+}
+
+_vulkansdk_dir=/var/lib/jenkins/vulkansdk
+_tmp_vulkansdk_targz=/tmp/vulkansdk.tar.gz
+
+curl \
+  --silent \
+  --show-error \
+  --location \
+  --fail \
+  --retry 3 \
+  --output "${_tmp_vulkansdk_targz}" "https://ossci-android.s3.amazonaws.com/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.gz"
+
+mkdir -p "${_vulkansdk_dir}"
+tar -C "${_vulkansdk_dir}" -xzf "${_tmp_vulkansdk_targz}" --strip-components 1
+rm -rf "${_tmp_vulkansdk_targz}"
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -26,7 +26,7 @@ function install_ubuntu() {
    wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
        | gpg --dearmor > /usr/share/keyrings/oneapi-archive-keyring.gpg.gpg
    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg.gpg] \
-        https://apt.repos.intel.com/oneapi all main" \
+        https://apt.repos.intel.com/${XPU_REPO_NAME} all main" \
        | tee /etc/apt/sources.list.d/oneAPI.list

    # Update the packages list and repository index
@ -74,7 +74,7 @@ function install_rhel() {
    tee > /etc/yum.repos.d/oneAPI.repo << EOF
 [oneAPI]
 name=Intel for Pytorch GPU dev repository
-baseurl=https://yum.repos.intel.com/oneapi
+baseurl=https://yum.repos.intel.com/${XPU_REPO_NAME}
 enabled=1
 gpgcheck=1
 repo_gpgcheck=1
@ -118,7 +118,7 @@ function install_sles() {
        https://repositories.intel.com/gpu/sles/${VERSION_SP}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_SP}.repo
    rpm --import https://repositories.intel.com/gpu/intel-graphics.key
    # To add the online network network package repository for the Intel Support Packages
-    zypper addrepo https://yum.repos.intel.com/oneapi oneAPI
+    zypper addrepo https://yum.repos.intel.com/${XPU_REPO_NAME} oneAPI
    rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB

    # The xpu-smi packages
@ -141,10 +141,10 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
    XPU_DRIVER_VERSION=""
 fi

-# Default use Intel® oneAPI Deep Learning Essentials 2025.0
-if [[ "$XPU_VERSION" == "2025.1" ]]; then
-    XPU_PACKAGES="intel-deep-learning-essentials-2025.1"
-else
+XPU_REPO_NAME="intel-for-pytorch-gpu-dev"
+XPU_PACKAGES="intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9"
+if [[ "$XPU_VERSION" == "2025.0" ]]; then
+    XPU_REPO_NAME="oneapi"
    XPU_PACKAGES="intel-deep-learning-essentials-2025.0"
 fi

--- a/.ci/docker/libtorch/Dockerfile
+++ b/.ci/docker/libtorch/Dockerfile
@ -49,11 +49,18 @@ RUN bash ./install_mkl.sh && rm install_mkl.sh
 FROM cpu as cuda
 ADD ./common/install_cuda.sh install_cuda.sh
 ADD ./common/install_magma.sh install_magma.sh
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
 ENV CUDA_HOME /usr/local/cuda

+FROM cuda as cuda11.8
+RUN bash ./install_cuda.sh 11.8
+RUN bash ./install_magma.sh 11.8
+RUN ln -sf /usr/local/cuda-11.8 /usr/local/cuda
+
+FROM cuda as cuda12.4
+RUN bash ./install_cuda.sh 12.4
+RUN bash ./install_magma.sh 12.4
+RUN ln -sf /usr/local/cuda-12.4 /usr/local/cuda
+
 FROM cuda as cuda12.6
 RUN bash ./install_cuda.sh 12.6
 RUN bash ./install_magma.sh 12.6
@ -64,13 +71,7 @@ RUN bash ./install_cuda.sh 12.8
 RUN bash ./install_magma.sh 12.8
 RUN ln -sf /usr/local/cuda-12.8 /usr/local/cuda

-FROM cuda as cuda12.9
-RUN bash ./install_cuda.sh 12.9
-RUN bash ./install_magma.sh 12.9
-RUN ln -sf /usr/local/cuda-12.9 /usr/local/cuda
-
 FROM cpu as rocm
-ARG ROCM_VERSION
 ARG PYTORCH_ROCM_ARCH
 ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
 ENV MKLROOT /opt/intel
@ -85,11 +86,11 @@ ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
 # gfortran and python needed for building magma from source for ROCm
 RUN apt-get update -y && \
    apt-get install gfortran -y && \
-    apt-get install python3 python-is-python3 -y && \
+    apt-get install python -y && \
    apt-get clean

 RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
-RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh
+RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh

 FROM ${BASE_TARGET} as final
 COPY --from=openssl            /opt/openssl           /opt/openssl
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -1,63 +1,83 @@
 #!/usr/bin/env bash
 # Script used only in CD pipeline

-set -eoux pipefail
+set -eou pipefail

 image="$1"
 shift

 if [ -z "${image}" ]; then
-  echo "Usage: $0 IMAGENAME:ARCHTAG"
+  echo "Usage: $0 IMAGE"
  exit 1
 fi

+DOCKER_IMAGE="pytorch/${image}"
+
 TOPDIR=$(git rev-parse --show-toplevel)

+GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
+GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
+
+WITH_PUSH=${WITH_PUSH:-}
+
 DOCKER=${DOCKER:-docker}

-# Go from imagename:tag to tag
-DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
-
-GPU_ARCH_VERSION=""
-if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
-    # extract cuda version from image name.  e.g. manylinux2_28-builder:cuda12.8 returns 12.8
-    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
-elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
-    # extract rocm version from image name.  e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
-    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
-fi
-
-case ${DOCKER_TAG_PREFIX} in
+case ${GPU_ARCH_TYPE} in
    cpu)
        BASE_TARGET=cpu
+        DOCKER_TAG=cpu
        GPU_IMAGE=ubuntu:20.04
        DOCKER_GPU_BUILD_ARG=""
        ;;
-    cuda*)
+    cuda)
        BASE_TARGET=cuda${GPU_ARCH_VERSION}
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
        GPU_IMAGE=ubuntu:20.04
        DOCKER_GPU_BUILD_ARG=""
        ;;
-    rocm*)
+    rocm)
        BASE_TARGET=rocm
-        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
+        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
+        GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx942"
+        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
        ;;
    *)
-        echo "ERROR: Unrecognized DOCKER_TAG_PREFIX: ${DOCKER_TAG_PREFIX}"
+        echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
        exit 1
        ;;
 esac

-tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')

-DOCKER_BUILDKIT=1 ${DOCKER} build \
-    --target final \
-    ${DOCKER_GPU_BUILD_ARG} \
-    --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
-    --build-arg "BASE_TARGET=${BASE_TARGET}" \
-    -t "${tmp_tag}" \
-    $@ \
-    -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
-    "${TOPDIR}/.ci/docker/"
+(
+    set -x
+    DOCKER_BUILDKIT=1 ${DOCKER} build \
+         --target final \
+        ${DOCKER_GPU_BUILD_ARG} \
+        --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
+        --build-arg "BASE_TARGET=${BASE_TARGET}" \
+        -t "${DOCKER_IMAGE}" \
+        $@ \
+        -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
+        "${TOPDIR}/.ci/docker/"
+
+)
+
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
+
+if [[ "${WITH_PUSH}" == true ]]; then
+  (
+    set -x
+    ${DOCKER} push "${DOCKER_IMAGE}"
+    if [[ -n ${GITHUB_REF} ]]; then
+        ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
+        ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
+        ${DOCKER} push "${DOCKER_IMAGE_BRANCH_TAG}"
+        ${DOCKER} push "${DOCKER_IMAGE_SHA_TAG}"
+    fi
+  )
+fi
--- a/.ci/docker/linter-cuda/Dockerfile
+++ b/.ci/docker/linter-cuda/Dockerfile
@ -18,31 +18,28 @@ COPY ./common/install_user.sh install_user.sh
 RUN bash ./install_user.sh && rm install_user.sh

 # Install conda and other packages (e.g., numpy, pytest)
-ARG PYTHON_VERSION
-ARG PIP_CMAKE
-# Put venv into the env vars so users don't need to activate it
-ENV PATH /var/lib/jenkins/ci_env/bin:$PATH
-ENV VIRTUAL_ENV /var/lib/jenkins/ci_env
-COPY requirements-ci.txt /opt/requirements-ci.txt
-COPY ./common/install_python.sh install_python.sh
-RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt
+ARG ANACONDA_PYTHON_VERSION
+ARG CONDA_CMAKE
+ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
+ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+COPY requirements-ci.txt /opt/conda/requirements-ci.txt
+COPY ./common/install_conda.sh install_conda.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./common/install_magma_conda.sh install_magma_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt

 # Install cuda and cudnn
 ARG CUDA_VERSION
 COPY ./common/install_cuda.sh install_cuda.sh
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
-RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu* install_cusparselt.sh
+RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
 ENV DESIRED_CUDA ${CUDA_VERSION}
 ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

 # Note that Docker build forbids copying file outside the build context
 COPY ./common/install_linter.sh install_linter.sh
+COPY ./common/common_utils.sh common_utils.sh
 RUN bash ./install_linter.sh
-RUN rm install_linter.sh
-
-RUN chown -R jenkins:jenkins /var/lib/jenkins/ci_env
+RUN rm install_linter.sh common_utils.sh

 USER jenkins
 CMD ["bash"]
--- a/.ci/docker/linter/Dockerfile
+++ b/.ci/docker/linter/Dockerfile
@ -15,17 +15,20 @@ COPY ./common/install_user.sh install_user.sh
 RUN bash ./install_user.sh && rm install_user.sh

 # Install conda and other packages (e.g., numpy, pytest)
-ARG PYTHON_VERSION
-ENV PATH /var/lib/jenkins/ci_env/bin:$PATH
-ENV VIRTUAL_ENV /var/lib/jenkins/ci_env
-COPY requirements-ci.txt /opt/requirements-ci.txt
-COPY ./common/install_python.sh install_python.sh
-RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt
+ARG ANACONDA_PYTHON_VERSION
+ARG CONDA_CMAKE
+ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
+ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+COPY requirements-ci.txt /opt/conda/requirements-ci.txt
+COPY ./common/install_conda.sh install_conda.sh
+COPY ./common/common_utils.sh common_utils.sh
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt

 # Note that Docker build forbids copying file outside the build context
 COPY ./common/install_linter.sh install_linter.sh
+COPY ./common/common_utils.sh common_utils.sh
 RUN bash ./install_linter.sh
-RUN rm install_linter.sh
+RUN rm install_linter.sh common_utils.sh

 USER jenkins
 CMD ["bash"]
--- a/.ci/docker/manywheel/Dockerfile
+++ b/.ci/docker/manywheel/Dockerfile
@ -0,0 +1,200 @@
+# syntax = docker/dockerfile:experimental
+ARG ROCM_VERSION=3.7
+ARG BASE_CUDA_VERSION=11.8
+
+ARG GPU_IMAGE=centos:7
+FROM centos:7 as base
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+
+ARG DEVTOOLSET_VERSION=9
+
+# Note: This patch is required since CentOS has reached EOL;
+# otherwise any yum install step will fail
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
+# Just add everything as a safe.directory for git since these will be used in multiple places with git
+RUN git config --global --add safe.directory '*'
+RUN yum install -y yum-utils centos-release-scl
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+# Note: After running yum-config-manager --enable rhel-server-rhscl-7-rpms
+# the patch is required once again. Somehow this step adds mirror.centos.org
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+RUN yum --enablerepo=extras install -y epel-release
+
+# cmake-3.18.4 from pip
+RUN yum install -y python3-pip && \
+    python3 -mpip install cmake==3.18.4 && \
+    ln -s /usr/local/bin/cmake /usr/bin/cmake
+
+RUN yum install -y autoconf aclocal automake make sudo
+
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+# EPEL for cmake
+FROM base as patchelf
+# Install patchelf
+ADD ./common/install_patchelf.sh install_patchelf.sh
+RUN bash ./install_patchelf.sh && rm install_patchelf.sh
+RUN cp $(which patchelf) /patchelf
+
+FROM patchelf as python
+# build python
+COPY manywheel/build_scripts /build_scripts
+ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh
+RUN bash build_scripts/build.sh && rm -r build_scripts
+
+FROM base as cuda
+ARG BASE_CUDA_VERSION=10.2
+# Install CUDA
+ADD ./common/install_cuda.sh install_cuda.sh
+RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
+
+FROM base as intel
+# MKL
+ADD ./common/install_mkl.sh install_mkl.sh
+RUN bash ./install_mkl.sh && rm install_mkl.sh
+
+FROM base as magma
+ARG BASE_CUDA_VERSION=10.2
+# Install magma
+ADD ./common/install_magma.sh install_magma.sh
+RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
+
+FROM base as jni
+# Install java jni header
+ADD ./common/install_jni.sh install_jni.sh
+ADD ./java/jni.h jni.h
+RUN bash ./install_jni.sh && rm install_jni.sh
+
+FROM base as libpng
+# Install libpng
+ADD ./common/install_libpng.sh install_libpng.sh
+RUN bash ./install_libpng.sh && rm install_libpng.sh
+
+FROM ${GPU_IMAGE} as common
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+RUN yum install -y \
+        aclocal \
+        autoconf \
+        automake \
+        bison \
+        bzip2 \
+        curl \
+        diffutils \
+        file \
+        git \
+        make \
+        patch \
+        perl \
+        unzip \
+        util-linux \
+        wget \
+        which \
+        xz \
+        yasm
+RUN yum install -y \
+    https://repo.ius.io/ius-release-el7.rpm \
+    https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm
+
+RUN yum swap -y git git236-core
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+# Install LLVM version
+COPY --from=openssl            /opt/openssl                          /opt/openssl
+COPY --from=python             /opt/python                           /opt/python
+COPY --from=python             /opt/_internal                        /opt/_internal
+COPY --from=python             /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel
+COPY --from=intel              /opt/intel                            /opt/intel
+COPY --from=patchelf           /usr/local/bin/patchelf               /usr/local/bin/patchelf
+COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h
+COPY --from=libpng             /usr/local/bin/png*                   /usr/local/bin/
+COPY --from=libpng             /usr/local/bin/libpng*                /usr/local/bin/
+COPY --from=libpng             /usr/local/include/png*               /usr/local/include/
+COPY --from=libpng             /usr/local/include/libpng*            /usr/local/include/
+COPY --from=libpng             /usr/local/lib/libpng*                /usr/local/lib/
+COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/lib/pkgconfig
+
+FROM common as cpu_final
+ARG BASE_CUDA_VERSION=10.1
+ARG DEVTOOLSET_VERSION=9
+# Install Anaconda
+ADD ./common/install_conda_docker.sh install_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh
+ENV PATH /opt/conda/bin:$PATH
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+
+RUN yum install -y yum-utils centos-release-scl
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+# cmake is already installed inside the rocm base image, so remove if present
+RUN rpm -e cmake || true
+# cmake-3.18.4 from pip
+RUN yum install -y python3-pip && \
+    python3 -mpip install cmake==3.18.4 && \
+    ln -s /usr/local/bin/cmake /usr/bin/cmake
+
+# ninja
+RUN yum install -y ninja-build
+
+FROM cpu_final as cuda_final
+RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
+RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
+ENV PATH=/usr/local/cuda/bin:$PATH
+
+FROM cpu_final as rocm_final
+ARG ROCM_VERSION=3.7
+ARG PYTORCH_ROCM_ARCH
+ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
+# Adding ROCM_PATH env var so that LoadHip.cmake (even with logic updated for ROCm6.0)
+# find HIP works for ROCm5.7. Not needed for ROCm6.0 and above.
+# Remove below when ROCm5.7 is not in support matrix anymore.
+ENV ROCM_PATH /opt/rocm
+ENV MKLROOT /opt/intel
+# No need to install ROCm as base docker image should have full ROCm install
+#ADD ./common/install_rocm.sh install_rocm.sh
+#RUN ROCM_VERSION=${ROCM_VERSION} bash ./install_rocm.sh && rm install_rocm.sh
+ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
+RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
+# cmake3 is needed for the MIOpen build
+RUN ln -sf /usr/local/bin/cmake /usr/bin/cmake3
+ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
+RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
+ADD ./common/install_miopen.sh install_miopen.sh
+RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
--- a/.ci/docker/manywheel/Dockerfile_2014
+++ b/.ci/docker/manywheel/Dockerfile_2014
@ -0,0 +1,153 @@
+# syntax = docker/dockerfile:experimental
+ARG ROCM_VERSION=3.7
+ARG BASE_CUDA_VERSION=10.2
+ARG GPU_IMAGE=nvidia/cuda:${BASE_CUDA_VERSION}-devel-centos7
+FROM quay.io/pypa/manylinux2014_x86_64 as base
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
+RUN yum install -y yum-utils centos-release-scl sudo
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils
+ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH
+
+# cmake
+RUN yum install -y cmake3 && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
+
+
+# remove unnecessary python versions
+RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
+RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
+RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
+RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
+
+FROM base as cuda
+ARG BASE_CUDA_VERSION=10.2
+# Install CUDA
+ADD ./common/install_cuda.sh install_cuda.sh
+RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
+
+FROM base as intel
+# MKL
+ADD ./common/install_mkl.sh install_mkl.sh
+RUN bash ./install_mkl.sh && rm install_mkl.sh
+
+FROM base as magma
+ARG BASE_CUDA_VERSION=10.2
+# Install magma
+ADD ./common/install_magma.sh install_magma.sh
+RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
+
+FROM base as jni
+# Install java jni header
+ADD ./common/install_jni.sh install_jni.sh
+ADD ./java/jni.h jni.h
+RUN bash ./install_jni.sh && rm install_jni.sh
+
+FROM base as libpng
+# Install libpng
+ADD ./common/install_libpng.sh install_libpng.sh
+RUN bash ./install_libpng.sh && rm install_libpng.sh
+
+FROM ${GPU_IMAGE} as common
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+RUN yum install -y \
+        aclocal \
+        autoconf \
+        automake \
+        bison \
+        bzip2 \
+        curl \
+        diffutils \
+        file \
+        git \
+        make \
+        patch \
+        perl \
+        unzip \
+        util-linux \
+        wget \
+        which \
+        xz \
+        yasm
+RUN yum install -y \
+    https://repo.ius.io/ius-release-el7.rpm \
+    https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm
+
+RUN yum swap -y git git236-core
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+# Install LLVM version
+COPY --from=openssl            /opt/openssl                          /opt/openssl
+COPY --from=base               /opt/python                           /opt/python
+COPY --from=base               /opt/_internal                        /opt/_internal
+COPY --from=base               /usr/local/bin/auditwheel             /usr/local/bin/auditwheel
+COPY --from=intel              /opt/intel                            /opt/intel
+COPY --from=base               /usr/local/bin/patchelf               /usr/local/bin/patchelf
+COPY --from=libpng             /usr/local/bin/png*                   /usr/local/bin/
+COPY --from=libpng             /usr/local/bin/libpng*                /usr/local/bin/
+COPY --from=libpng             /usr/local/include/png*               /usr/local/include/
+COPY --from=libpng             /usr/local/include/libpng*            /usr/local/include/
+COPY --from=libpng             /usr/local/lib/libpng*                /usr/local/lib/
+COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/lib/pkgconfig
+COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h
+
+FROM common as cpu_final
+ARG BASE_CUDA_VERSION=10.2
+RUN yum install -y yum-utils centos-release-scl
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils
+ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH
+
+# cmake
+RUN yum install -y cmake3 && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+
+# ninja
+RUN yum install -y http://repo.okay.com.mx/centos/7/x86_64/release/okay-release-1-1.noarch.rpm
+RUN yum install -y ninja-build
+
+FROM cpu_final as cuda_final
+RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
+COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
+
+FROM common as rocm_final
+ARG ROCM_VERSION=3.7
+# Install ROCm
+ADD ./common/install_rocm.sh install_rocm.sh
+RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh
+# cmake is already installed inside the rocm base image, but both 2 and 3 exist
+# cmake3 is needed for the later MIOpen custom build, so that step is last.
+RUN yum install -y cmake3 && \
+    rm -f /usr/bin/cmake && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+ADD ./common/install_miopen.sh install_miopen.sh
+RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
--- a/.ci/docker/manywheel/Dockerfile_2_28
+++ b/.ci/docker/manywheel/Dockerfile_2_28
@ -7,8 +7,8 @@ ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8

-ARG DEVTOOLSET_VERSION=13
-RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-gcc gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran gcc-toolset-${DEVTOOLSET_VERSION}-gdb
+ARG DEVTOOLSET_VERSION=11
+RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel yum-utils gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
 ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH

@ -26,20 +26,17 @@ ADD ./common/install_openssl.sh install_openssl.sh
 RUN bash ./install_openssl.sh && rm install_openssl.sh


-# remove unnecessary python versions
+# remove unncessary python versions
 RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6

 FROM base as cuda
-ARG BASE_CUDA_VERSION=12.6
+ARG BASE_CUDA_VERSION=11.8
 # Install CUDA
 ADD ./common/install_cuda.sh install_cuda.sh
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
-RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu* install_cusparselt.sh
+RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh

 FROM base as intel
 # MKL
@ -47,7 +44,7 @@ ADD ./common/install_mkl.sh install_mkl.sh
 RUN bash ./install_mkl.sh && rm install_mkl.sh

 FROM base as magma
-ARG BASE_CUDA_VERSION=12.6
+ARG BASE_CUDA_VERSION=10.2
 # Install magma
 ADD ./common/install_magma.sh install_magma.sh
 RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
@ -64,7 +61,7 @@ ADD ./common/install_libpng.sh install_libpng.sh
 RUN bash ./install_libpng.sh && rm install_libpng.sh

 FROM ${GPU_IMAGE} as common
-ARG DEVTOOLSET_VERSION=13
+ARG DEVTOOLSET_VERSION=11
 ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8
@ -87,12 +84,13 @@ RUN yum install -y \
        wget \
        which \
        xz \
-        glibc-langpack-en \
-        gcc-toolset-${DEVTOOLSET_VERSION}-gcc \
-        gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \
-        gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
-        gcc-toolset-${DEVTOOLSET_VERSION}-gdb
+        gcc-toolset-${DEVTOOLSET_VERSION}-toolchain \
+        glibc-langpack-en
+RUN yum install -y \
+    https://repo.ius.io/ius-release-el7.rpm \
+    https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm

+RUN yum swap -y git git236-core
 # git236+ would refuse to run git commands in repos owned by other users
 # Which causes version check to fail, as pytorch repo is bind-mounted into the image
 # Override this behaviour by treating every folder as safe
@ -103,7 +101,6 @@ ENV SSL_CERT_FILE=/opt/_internal/certs.pem
 # Install LLVM version
 COPY --from=openssl            /opt/openssl                          /opt/openssl
 COPY --from=base               /opt/python                           /opt/python
-COPY --from=base               /usr/local/lib/                       /usr/local/lib/
 COPY --from=base               /opt/_internal                        /opt/_internal
 COPY --from=base               /usr/local/bin/auditwheel             /usr/local/bin/auditwheel
 COPY --from=intel              /opt/intel                            /opt/intel
@ -117,8 +114,8 @@ COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/
 COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h

 FROM common as cpu_final
-ARG BASE_CUDA_VERSION=12.6
-ARG DEVTOOLSET_VERSION=13
+ARG BASE_CUDA_VERSION=11.8
+ARG DEVTOOLSET_VERSION=11
 # Install Anaconda
 ADD ./common/install_conda_docker.sh install_conda.sh
 RUN bash ./install_conda.sh && rm install_conda.sh
@ -157,14 +154,11 @@ ENV ROCM_PATH /opt/rocm
 # and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker
 RUN python3 -m pip install --upgrade pip && \
    python3 -mpip install cmake==3.28.4
-# replace the libdrm in /opt/amdgpu with custom amdgpu.ids lookup path
 ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
 RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
-# ROCm 6.4 rocm-smi depends on system drm.h header
-RUN yum install -y libdrm-devel
 ENV MKLROOT /opt/intel
 ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
-RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh
+RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
 ADD ./common/install_miopen.sh install_miopen.sh
 RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh

@ -175,6 +169,6 @@ ENV XPU_DRIVER_TYPE ROLLING
 RUN python3 -m pip install --upgrade pip && \
    python3 -mpip install cmake==3.28.4
 ADD ./common/install_xpu.sh install_xpu.sh
-ENV XPU_VERSION 2025.1
+ENV XPU_VERSION 2025.0
 RUN bash ./install_xpu.sh && rm install_xpu.sh
 RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd
--- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
@ -1,8 +1,9 @@
 FROM quay.io/pypa/manylinux_2_28_aarch64 as base

-ARG GCCTOOLSET_VERSION=13
+# Graviton needs GCC 10 or above for the build. GCC12 is the default version in almalinux-8.
+ARG GCCTOOLSET_VERSION=11

-# Language variables
+# Language variables
 ENV LC_ALL=en_US.UTF-8
 ENV LANG=en_US.UTF-8
 ENV LANGUAGE=en_US.UTF-8
@ -35,16 +36,7 @@ RUN yum install -y \
  yasm \
  zstd \
  sudo \
-  gcc-toolset-${GCCTOOLSET_VERSION}-gcc \
-  gcc-toolset-${GCCTOOLSET_VERSION}-gcc-c++ \
-  gcc-toolset-${GCCTOOLSET_VERSION}-gcc-gfortran \
-  gcc-toolset-${GCCTOOLSET_VERSION}-gdb
-
-# (optional) Install non-default Ninja version
-ARG NINJA_VERSION
-COPY ./common/install_ninja.sh install_ninja.sh
-RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
-RUN rm install_ninja.sh
+  gcc-toolset-${GCCTOOLSET_VERSION}-toolchain

 # Ensure the expected devtoolset is used
 ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
@ -58,13 +50,12 @@ RUN git config --global --add safe.directory "*"

 FROM base as openblas
 # Install openblas
-ARG OPENBLAS_VERSION
 ADD ./common/install_openblas.sh install_openblas.sh
 RUN bash ./install_openblas.sh && rm install_openblas.sh

 FROM base as final

-# remove unnecessary python versions
+# remove unnecessary python versions
 RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
--- a/.ci/docker/manywheel/Dockerfile_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_aarch64
@ -0,0 +1,94 @@
+FROM quay.io/pypa/manylinux2014_aarch64 as base
+
+
+# Graviton needs GCC 10 for the build
+ARG DEVTOOLSET_VERSION=10
+
+# Language variables
+ENV LC_ALL=en_US.UTF-8
+ENV LANG=en_US.UTF-8
+ENV LANGUAGE=en_US.UTF-8
+
+# Installed needed OS packages. This is to support all
+# the binary builds (torch, vision, audio, text, data)
+RUN yum -y install epel-release
+RUN yum -y update
+RUN yum install -y \
+  autoconf \
+  automake \
+  bison \
+  bzip2 \
+  curl \
+  diffutils \
+  file \
+  git \
+  make \
+  patch \
+  perl \
+  unzip \
+  util-linux \
+  wget \
+  which \
+  xz \
+  yasm \
+  less \
+  zstd \
+  libgomp \
+  sudo \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ \
+  devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
+  devtoolset-${DEVTOOLSET_VERSION}-binutils
+
+# Ensure the expected devtoolset is used
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+
+# git236+ would refuse to run git commands in repos owned by other users
+# Which causes version check to fail, as pytorch repo is bind-mounted into the image
+# Override this behaviour by treating every folder as safe
+# For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
+RUN git config --global --add safe.directory "*"
+
+
+###############################################################################
+# libgfortran.a hack
+#
+# libgfortran.a from quay.io/pypa/manylinux2014_aarch64 is not compiled with -fPIC.
+# This causes __stack_chk_guard@@GLIBC_2.17 on pytorch build. To solve, get
+# ubuntu's libgfortran.a which is compiled with -fPIC
+# NOTE: Need a better way to get this library as Ubuntu's package can be removed by the vendor, or changed
+###############################################################################
+RUN cd ~/ \
+  && curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-4ubuntu2_arm64.deb \
+  && ar x ~/libgfortran-10-dev.deb \
+  && tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ \
+  && cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/
+
+# install cmake
+RUN yum install -y cmake3 && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+
+FROM base as openblas
+# Install openblas
+ADD ./common/install_openblas.sh install_openblas.sh
+RUN bash ./install_openblas.sh && rm install_openblas.sh
+
+FROM openssl as final
+# remove unnecessary python versions
+RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
+RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
+RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
+RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
+COPY --from=openblas     /opt/OpenBLAS/  /opt/OpenBLAS/
+ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
--- a/.ci/docker/manywheel/Dockerfile_cuda_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64
@ -1,7 +1,7 @@
 FROM quay.io/pypa/manylinux_2_28_aarch64 as base

 # Cuda ARM build needs gcc 11
-ARG DEVTOOLSET_VERSION=13
+ARG DEVTOOLSET_VERSION=11

 # Language variables
 ENV LC_ALL=en_US.UTF-8
@ -34,10 +34,7 @@ RUN yum install -y \
  zstd \
  libgomp \
  sudo \
-  gcc-toolset-${DEVTOOLSET_VERSION}-gcc \
-  gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \
-  gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
-  gcc-toolset-${DEVTOOLSET_VERSION}-gdb
+  gcc-toolset-${DEVTOOLSET_VERSION}-toolchain

 # Ensure the expected devtoolset is used
 ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
@ -60,7 +57,7 @@ RUN bash ./install_openssl.sh && rm install_openssl.sh
 ENV SSL_CERT_FILE=/opt/_internal/certs.pem

 FROM openssl as final
-# remove unnecessary python versions
+# remove unnecessary python versions
 RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
@ -69,11 +66,8 @@ RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
 FROM base as cuda
 ARG BASE_CUDA_VERSION
 # Install CUDA
-ADD ./common/install_cuda.sh install_cuda.sh
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu* install_cusparselt.sh
+ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh
+RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh

 FROM base as magma
 ARG BASE_CUDA_VERSION
--- a/.ci/docker/manywheel/Dockerfile_s390x
+++ b/.ci/docker/manywheel/Dockerfile_s390x
@ -5,9 +5,7 @@ ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
 ENV LANGUAGE=C.UTF-8

-# there is a bugfix in gcc >= 14 for precompiled headers and s390x vectorization interaction.
-# with earlier gcc versions test/inductor/test_cpu_cpp_wrapper.py will fail.
-ARG DEVTOOLSET_VERSION=14
+ARG DEVTOOLSET_VERSION=13
 # Installed needed OS packages. This is to support all
 # the binary builds (torch, vision, audio, text, data)
 RUN yum -y install epel-release
@ -44,7 +42,6 @@ RUN yum install -y \
  llvm-devel \
  libzstd-devel \
  python3.12-devel \
-  python3.12-test \
  python3.12-setuptools \
  python3.12-pip \
  python3-virtualenv \
@ -60,8 +57,7 @@ RUN yum install -y \
  libxslt-devel \
  libxml2-devel \
  openssl-devel \
-  valgrind \
-  ninja-build
+  valgrind

 ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
@ -105,35 +101,24 @@ CMD ["/bin/bash"]

 # install test dependencies:
 # - grpcio requires system openssl, bundled crypto fails to build
+# - ml_dtypes 0.4.0 requires some fixes provided in later commits to build
 RUN dnf install -y \
-  hdf5-devel \
-  python3-h5py \
-  git
+  protobuf-devel \
+  protobuf-c-devel \
+  protobuf-lite-devel \
+  wget \
+  patch

-RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
-
-# cmake-3.28.0 from pip for onnxruntime
-RUN python3 -mpip install cmake==3.28.0
-
-# build onnxruntime 1.21.0 from sources.
-# it is not possible to build it from sources using pip,
-# so just build it from upstream repository.
-# h5py is dependency of onnxruntime_training.
-# h5py==3.11.0 builds with hdf5-devel 1.10.5 from repository.
-# h5py 3.11.0 doesn't build with numpy >= 2.3.0.
-# install newest flatbuffers version first:
-# for some reason old version is getting pulled in otherwise.
-# packaging package is required for onnxruntime wheel build.
-RUN pip3 install flatbuffers && \
-  pip3 install cython 'pkgconfig>=1.5.5' 'setuptools>=77' 'numpy<2.3.0' && \
-  pip3 install --no-build-isolation h5py==3.11.0 && \
-  pip3 install packaging && \
-  git clone https://github.com/microsoft/onnxruntime && \
-  cd onnxruntime && git checkout v1.21.0 && \
+RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio==1.65.4
+RUN cd ~ && \
+  git clone https://github.com/jax-ml/ml_dtypes && \
+  cd ml_dtypes && \
+  git checkout v0.4.0 && \
  git submodule update --init --recursive && \
-  ./build.sh --config Release --parallel 0 --enable_pybind \
-  --build_wheel --enable_training --enable_training_apis \
-  --enable_training_ops --skip_tests --allow_running_as_root \
-  --compile_no_warning_as_error && \
-  pip3 install ./build/Linux/Release/dist/onnxruntime_training-*.whl && \
-  cd .. && /bin/rm -rf ./onnxruntime
+  wget https://github.com/jax-ml/ml_dtypes/commit/b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
+  wget https://github.com/jax-ml/ml_dtypes/commit/d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
+  patch -p1 < b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
+  patch -p1 < d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
+  python3 setup.py bdist_wheel && \
+  pip3 install dist/*.whl && \
+  rm -rf ml_dtypes
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 # Script used only in CD pipeline

-set -exou pipefail
+set -eou pipefail

 TOPDIR=$(git rev-parse --show-toplevel)

@ -9,111 +9,151 @@ image="$1"
 shift

 if [ -z "${image}" ]; then
-  echo "Usage: $0 IMAGE:ARCHTAG"
+  echo "Usage: $0 IMAGE"
  exit 1
 fi

-# Go from imagename:tag to tag
-DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
+DOCKER_IMAGE="pytorch/${image}"

-GPU_ARCH_VERSION=""
-if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
-    # extract cuda version from image name.  e.g. manylinux2_28-builder:cuda12.8 returns 12.8
-    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
-elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
-    # extract rocm version from image name.  e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
-    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
-fi
+DOCKER_REGISTRY="${DOCKER_REGISTRY:-docker.io}"

+GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
+GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
 MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}
 DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}
-OPENBLAS_VERSION=${OPENBLAS_VERSION:-}
+WITH_PUSH=${WITH_PUSH:-}

-case ${image} in
-    manylinux2_28-builder:cpu)
+case ${GPU_ARCH_TYPE} in
+    cpu)
        TARGET=cpu_final
+        DOCKER_TAG=cpu
+        GPU_IMAGE=centos:7
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
+        ;;
+    cpu-manylinux_2_28)
+        TARGET=cpu_final
+        DOCKER_TAG=cpu
        GPU_IMAGE=amd64/almalinux:8
-        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13"
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="2_28"
        ;;
-    manylinux2_28_aarch64-builder:cpu-aarch64)
+    cpu-aarch64)
        TARGET=final
-        GPU_IMAGE=arm64v8/almalinux:8
-        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13 --build-arg NINJA_VERSION=1.12.1"
-        MANY_LINUX_VERSION="2_28_aarch64"
-        OPENBLAS_VERSION="v0.3.29"
+        DOCKER_TAG=cpu-aarch64
+        GPU_IMAGE=arm64v8/centos:7
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10"
+        MANY_LINUX_VERSION="aarch64"
        ;;
-    manylinuxcxx11-abi-builder:cpu-cxx11-abi)
+    cpu-aarch64-2_28)
        TARGET=final
+        DOCKER_TAG=cpu-aarch64
+        GPU_IMAGE=arm64v8/almalinux:8
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
+        MANY_LINUX_VERSION="2_28_aarch64"
+        ;;
+    cpu-cxx11-abi)
+        TARGET=final
+        DOCKER_TAG=cpu-cxx11-abi
        GPU_IMAGE=""
        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
        MANY_LINUX_VERSION="cxx11-abi"
        ;;
-    manylinuxs390x-builder:cpu-s390x)
+    cpu-s390x)
        TARGET=final
+        DOCKER_TAG=cpu-s390x
        GPU_IMAGE=s390x/almalinux:8
        DOCKER_GPU_BUILD_ARG=""
        MANY_LINUX_VERSION="s390x"
        ;;
-    manylinux2_28-builder:cuda11*)
+    cuda)
        TARGET=cuda_final
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
+        # Keep this up to date with the minimum version of CUDA we currently support
+        GPU_IMAGE=centos:7
+        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=9"
+        ;;
+    cuda-manylinux_2_28)
+        TARGET=cuda_final
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
        GPU_IMAGE=amd64/almalinux:8
        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="2_28"
        ;;
-    manylinux2_28-builder:cuda12*)
+    cuda-aarch64)
        TARGET=cuda_final
-        GPU_IMAGE=amd64/almalinux:8
-        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"
-        MANY_LINUX_VERSION="2_28"
-        ;;
-    manylinuxaarch64-builder:cuda*)
-        TARGET=cuda_final
-        GPU_IMAGE=amd64/almalinux:8
-        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=13"
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
+        GPU_IMAGE=arm64v8/centos:7
+        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="aarch64"
        DOCKERFILE_SUFFIX="_cuda_aarch64"
        ;;
-    manylinux2_28-builder:rocm*)
+    rocm|rocm-manylinux_2_28)
        TARGET=rocm_final
-        MANY_LINUX_VERSION="2_28"
-        DEVTOOLSET_VERSION="11"
-        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
+        GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete
+        DEVTOOLSET_VERSION="9"
+        if [ ${GPU_ARCH_TYPE} == "rocm-manylinux_2_28" ]; then
+            MANY_LINUX_VERSION="2_28"
+            DEVTOOLSET_VERSION="11"
+            GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
+        fi
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101"
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
        ;;
-    manylinux2_28-builder:xpu)
+    xpu)
        TARGET=xpu_final
+        DOCKER_TAG=xpu
        GPU_IMAGE=amd64/almalinux:8
        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="2_28"
        ;;
    *)
-        echo "ERROR: Unrecognized image name: ${image}"
+        echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
        exit 1
        ;;
 esac

+IMAGES=''
+
 if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then
    DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION}
 fi
-# Only activate this if in CI
-if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
-    # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
-    # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
-    sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
-    sudo systemctl daemon-reload
-    sudo systemctl restart docker
+(
+    set -x
+
+    if [ "$(uname -m)" != "s390x" ]; then
+        # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
+        # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
+        sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
+        sudo systemctl daemon-reload
+        sudo systemctl restart docker
+    fi
+
+    DOCKER_BUILDKIT=1 docker build  \
+        ${DOCKER_GPU_BUILD_ARG} \
+        --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
+        --target "${TARGET}" \
+        -t "${DOCKER_IMAGE}" \
+        $@ \
+        -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
+        "${TOPDIR}/.ci/docker/"
+)
+
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
+
+if [[ "${WITH_PUSH}" == true ]]; then
+    (
+        set -x
+        docker push "${DOCKER_IMAGE}"
+        if [[ -n ${GITHUB_REF} ]]; then
+            docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
+            docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
+            docker push "${DOCKER_IMAGE_BRANCH_TAG}"
+            docker push "${DOCKER_IMAGE_SHA_TAG}"
+        fi
+    )
 fi
-
-tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
-
-DOCKER_BUILDKIT=1 docker build  \
-    ${DOCKER_GPU_BUILD_ARG} \
-    --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
-    --build-arg "OPENBLAS_VERSION=${OPENBLAS_VERSION}" \
-    --target "${TARGET}" \
-    -t "${tmp_tag}" \
-    $@ \
-    -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
-    "${TOPDIR}/.ci/docker/"
--- a/.ci/docker/manywheel/build_scripts/build.sh
+++ b/.ci/docker/manywheel/build_scripts/build.sh
@ -97,7 +97,7 @@ find /opt/_internal -type f -print0 \
    | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true
 # We do not need the Python test suites, or indeed the precompiled .pyc and
 # .pyo files. Partially cribbed from:
-#    https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile  # @lint-ignore
+#    https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile
 find /opt/_internal \
     \( -type d -a -name test -o -name tests \) \
  -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \
--- a/.ci/docker/manywheel/build_scripts/build_utils.sh
+++ b/.ci/docker/manywheel/build_scripts/build_utils.sh
@ -2,8 +2,8 @@
 # Helper utilities for build
 # Script used only in CD pipeline

-OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/  # @lint-ignore
-CURL_DOWNLOAD_URL=https://curl.se/download
+OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/
+CURL_DOWNLOAD_URL=https://curl.askapache.com/download

 AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf

--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -41,9 +41,9 @@ fbscribelogger==0.1.7
 #Pinned versions: 0.1.6
 #test that import:

-flatbuffers==24.12.23
+flatbuffers==2.0
 #Description: cross platform serialization library
-#Pinned versions: 24.12.23
+#Pinned versions: 2.0
 #test that import:

 hypothesis==5.35.1
@ -90,10 +90,10 @@ librosa>=0.6.2 ; python_version < "3.11"
 #Pinned versions:
 #test that import:

-mypy==1.16.0
+mypy==1.13.0
 # Pin MyPy version because new errors are likely to appear with each release
 #Description: linter
-#Pinned versions: 1.16.0
+#Pinned versions: 1.10.0
 #test that import: test_typing.py, test_type_hints.py

 networkx==2.8.8
@ -102,10 +102,10 @@ networkx==2.8.8
 #Pinned versions: 2.8.8
 #test that import: functorch

-ninja==1.11.1.3
-#Description: build system. Used in some tests. Used in build to generate build
-#time tracing information
-#Pinned versions: 1.11.1.3
+#ninja
+#Description: build system.  Note that installing it from
+#here breaks things, so it is commented out
+#Pinned versions: 1.10.0.post1
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py

 numba==0.49.0 ; python_version < "3.9"
@ -163,10 +163,10 @@ pillow==11.0.0
 #Pinned versions: 10.3.0
 #test that import:

-protobuf==5.29.4
-#Description:  Google's data interchange format
-#Pinned versions: 5.29.4
-#test that import: test_tensorboard.py, test/onnx/*
+protobuf==3.20.2
+#Description:  Google’s data interchange format
+#Pinned versions: 3.20.2
+#test that import: test_tensorboard.py

 psutil
 #Description: information on running processes and system utilization
@ -294,7 +294,7 @@ ghstack==0.8.0
 #Pinned versions: 0.8.0
 #test that import:

-jinja2==3.1.6
+jinja2==3.1.5
 #Description: jinja2 template engine
 #Pinned versions: 3.1.4
 #test that import:
@ -334,12 +334,12 @@ sympy==1.13.3
 #Pinned versions:
 #test that import:

-onnx==1.18.0
-#Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
+onnx==1.17.0
+#Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:

-onnxscript==0.2.6
+onnxscript==0.1.0
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
@ -353,7 +353,7 @@ parameterized==0.8.1
 #Pinned versions: 1.24.0
 #test that import: test_sac_estimator.py

-pwlf==2.2.1
+pwlf==2.2.1 ; python_version >= "3.8"
 #Description: required for testing torch/distributed/_tools/sac_estimator.py
 #Pinned versions: 2.2.1
 #test that import: test_sac_estimator.py
@ -365,9 +365,10 @@ PyYAML
 pyzstd
 setuptools

+ninja==1.11.1 ; platform_machine == "aarch64"
 scons==4.5.2 ; platform_machine == "aarch64"

-pulp==2.9.0
+pulp==2.9.0 ; python_version >= "3.8"
 #Description: required for testing ilp formulaiton under torch/distributed/_tools
 #Pinned versions: 2.9.0
 #test that import: test_sac_ilp.py
@ -376,13 +377,3 @@ dataclasses_json==0.6.7
 #Description: required for data pipeline and scripts under tools/stats
 #Pinned versions: 0.6.7
 #test that import:
-
-cmake==4.0.0
-#Description: required for building
-
-tlparse==0.3.30
-#Description: required for log parsing
-
-cuda-bindings>=12.0,<13.0
-#Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits.
-#test that import: test_cuda.py
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -1,24 +1,15 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@pytorch_sphinx_theme2#egg=pytorch_sphinx_theme2
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme

 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought is probably
 # something related to Docker setup. We can investigate this later
-
 sphinxcontrib.katex==0.8.6
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 0.8.6

-sphinxext-opengraph==0.9.1
-#Description: This is used to generate PyTorch docs
-#Pinned versions: 0.9.1
-
-sphinx_sitemap==2.6.0
-#Description: This is used to generate sitemap for PyTorch docs
-#Pinned versions: 2.6.0
-
 matplotlib==3.5.3
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 3.5.3
@ -55,6 +46,5 @@ myst-nb==0.17.2
 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
 python-etcd==0.4.5
 sphinx-copybutton==0.5.0
-sphinx-design==0.4.0
-sphinxcontrib-mermaid==1.0.0
+sphinx-panels==0.4.1
 myst-parser==0.18.1
--- a/.ci/docker/triton_version.txt
+++ b/.ci/docker/triton_version.txt
@ -1 +1 @@
-3.3.1
+3.2.0
--- a/.ci/docker/triton_xpu_version.txt
+++ b/.ci/docker/triton_xpu_version.txt
@ -1 +0,0 @@
-3.4.0
--- a/.ci/docker/ubuntu-cuda/Dockerfile
+++ b/.ci/docker/ubuntu-cuda/Dockerfile
@ -0,0 +1,175 @@
+ARG UBUNTU_VERSION
+ARG CUDA_VERSION
+ARG IMAGE_NAME
+
+FROM ${IMAGE_NAME}
+
+ARG UBUNTU_VERSION
+ARG CUDA_VERSION
+
+ENV DEBIAN_FRONTEND noninteractive
+
+# Install common dependencies (so that this step can be cached separately)
+COPY ./common/install_base.sh install_base.sh
+RUN bash ./install_base.sh && rm install_base.sh
+
+# Install user
+COPY ./common/install_user.sh install_user.sh
+RUN bash ./install_user.sh && rm install_user.sh
+
+# Install katex
+ARG KATEX
+COPY ./common/install_docs_reqs.sh install_docs_reqs.sh
+RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
+
+# Install conda and other packages (e.g., numpy, pytest)
+ARG ANACONDA_PYTHON_VERSION
+ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
+ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+ARG CONDA_CMAKE
+COPY requirements-ci.txt /opt/conda/requirements-ci.txt
+COPY ./common/install_conda.sh install_conda.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./common/install_magma_conda.sh install_magma_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
+
+# Install gcc
+ARG GCC_VERSION
+COPY ./common/install_gcc.sh install_gcc.sh
+RUN bash ./install_gcc.sh && rm install_gcc.sh
+
+# Install clang
+ARG CLANG_VERSION
+COPY ./common/install_clang.sh install_clang.sh
+RUN bash ./install_clang.sh && rm install_clang.sh
+
+# (optional) Install protobuf for ONNX
+ARG PROTOBUF
+COPY ./common/install_protobuf.sh install_protobuf.sh
+RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
+RUN rm install_protobuf.sh
+ENV INSTALLED_PROTOBUF ${PROTOBUF}
+
+# (optional) Install database packages like LMDB and LevelDB
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
+# (optional) Install vision packages like OpenCV
+ARG VISION
+COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
+RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
+RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
+ENV INSTALLED_VISION ${VISION}
+
+# (optional) Install UCC
+ARG UCX_COMMIT
+ARG UCC_COMMIT
+ENV UCX_COMMIT $UCX_COMMIT
+ENV UCC_COMMIT $UCC_COMMIT
+ENV UCX_HOME /usr
+ENV UCC_HOME /usr
+ADD ./common/install_ucc.sh install_ucc.sh
+RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi
+RUN rm install_ucc.sh
+
+COPY ./common/install_openssl.sh install_openssl.sh
+ENV OPENSSL_ROOT_DIR /opt/openssl
+RUN bash ./install_openssl.sh
+ENV OPENSSL_DIR /opt/openssl
+
+ARG INDUCTOR_BENCHMARKS
+ARG ANACONDA_PYTHON_VERSION
+ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
+COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/huggingface.txt huggingface.txt
+COPY ci_commit_pins/timm.txt timm.txt
+RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
+
+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
+ARG TRITON
+# Install triton, this needs to be done before sccache because the latter will
+# try to reach out to S3, which docker build runners don't have access
+COPY ./common/install_triton.sh install_triton.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/triton.txt triton.txt
+COPY triton_version.txt triton_version.txt
+RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
+RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
+
+ARG HALIDE
+# Build and install halide
+COPY ./common/install_halide.sh install_halide.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/halide.txt halide.txt
+RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
+RUN rm install_halide.sh common_utils.sh halide.txt
+
+# Install ccache/sccache (do this last, so we get priority in PATH)
+COPY ./common/install_cache.sh install_cache.sh
+ENV PATH /opt/cache/bin:$PATH
+# See https://github.com/pytorch/pytorch/issues/82174
+# TODO(sdym@fb.com):
+# check if this is needed after full off Xenial migration
+ENV CARGO_NET_GIT_FETCH_WITH_CLI true
+RUN bash ./install_cache.sh && rm install_cache.sh
+ENV CMAKE_CUDA_COMPILER_LAUNCHER=/opt/cache/bin/sccache
+
+# Add jni.h for java host build
+COPY ./common/install_jni.sh install_jni.sh
+COPY ./java/jni.h jni.h
+RUN bash ./install_jni.sh && rm install_jni.sh
+
+# Install Open MPI for CUDA
+COPY ./common/install_openmpi.sh install_openmpi.sh
+RUN if [ -n "${CUDA_VERSION}" ]; then bash install_openmpi.sh; fi
+RUN rm install_openmpi.sh
+
+# Include BUILD_ENVIRONMENT environment variable in image
+ARG BUILD_ENVIRONMENT
+ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
+
+# AWS specific CUDA build guidance
+ENV TORCH_CUDA_ARCH_LIST Maxwell
+ENV TORCH_NVCC_FLAGS "-Xfatbin -compress-all"
+ENV CUDA_PATH /usr/local/cuda
+
+# Install LLVM dev version (Defined in the pytorch/builder github repository)
+COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
+
+# Install CUDNN
+ARG CUDNN_VERSION
+ARG CUDA_VERSION
+COPY ./common/install_cudnn.sh install_cudnn.sh
+RUN if [ -n "${CUDNN_VERSION}" ]; then bash install_cudnn.sh; fi
+RUN rm install_cudnn.sh
+
+# Install CUSPARSELT
+ARG CUDA_VERSION
+COPY ./common/install_cusparselt.sh install_cusparselt.sh
+RUN bash install_cusparselt.sh
+RUN rm install_cusparselt.sh
+
+# Install CUDSS
+ARG CUDA_VERSION
+COPY ./common/install_cudss.sh install_cudss.sh
+RUN bash install_cudss.sh
+RUN rm install_cudss.sh
+
+# Delete /usr/local/cuda-11.X/cuda-11.X symlinks
+RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi
+RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi
+RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi
+RUN if [ -h /usr/local/cuda-12.4/cuda-12.4 ]; then rm /usr/local/cuda-12.4/cuda-12.4; fi
+
+USER jenkins
+CMD ["bash"]
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@ -25,9 +25,9 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh

 # Install conda and other packages (e.g., numpy, pytest)
 ARG ANACONDA_PYTHON_VERSION
-ARG BUILD_ENVIRONMENT
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+ARG CONDA_CMAKE
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
@ -43,6 +43,20 @@ ARG CLANG_VERSION
 COPY ./common/install_clang.sh install_clang.sh
 RUN bash ./install_clang.sh && rm install_clang.sh

+# (optional) Install protobuf for ONNX
+ARG PROTOBUF
+COPY ./common/install_protobuf.sh install_protobuf.sh
+RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
+RUN rm install_protobuf.sh
+ENV INSTALLED_PROTOBUF ${PROTOBUF}
+
+# (optional) Install database packages like LMDB and LevelDB
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -56,7 +70,7 @@ COPY ./common/install_rocm.sh install_rocm.sh
 RUN bash ./install_rocm.sh
 RUN rm install_rocm.sh
 COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
-RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
+RUN bash ./install_rocm_magma.sh
 RUN rm install_rocm_magma.sh
 ADD ./common/install_miopen.sh install_miopen.sh
 RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
@ -101,6 +115,12 @@ COPY ci_commit_pins/timm.txt timm.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
 COPY ./common/install_ninja.sh install_ninja.sh
--- a/.ci/docker/ubuntu-xpu/Dockerfile
+++ b/.ci/docker/ubuntu-xpu/Dockerfile
@ -28,6 +28,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh

 # Install conda and other packages (e.g., numpy, pytest)
 ARG ANACONDA_PYTHON_VERSION
+ARG CONDA_CMAKE
 ARG DOCS
 ARG BUILD_ENVIRONMENT
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
@ -72,10 +73,17 @@ ARG TRITON
 COPY ./common/install_triton.sh install_triton.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/triton-xpu.txt triton-xpu.txt
-COPY triton_xpu_version.txt triton_version.txt
+COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt

+# (optional) Install database packages like LMDB and LevelDB
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -83,6 +91,12 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
 RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
 ENV INSTALLED_VISION ${VISION}

+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
 COPY ./common/install_ninja.sh install_ninja.sh
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION

-FROM ubuntu:${UBUNTU_VERSION} as base
+FROM ubuntu:${UBUNTU_VERSION}

 ARG UBUNTU_VERSION

@ -28,6 +28,7 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh

 # Install conda and other packages (e.g., numpy, pytest)
 ARG ANACONDA_PYTHON_VERSION
+ARG CONDA_CMAKE
 ARG DOCS
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
@ -51,17 +52,9 @@ RUN  bash ./install_lcov.sh && rm install_lcov.sh
 # Install cuda and cudnn
 ARG CUDA_VERSION
 COPY ./common/install_cuda.sh install_cuda.sh
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-COPY ./common/install_cusparselt.sh install_cusparselt.sh
-RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu* install_cusparselt.sh
+RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
 ENV DESIRED_CUDA ${CUDA_VERSION}
 ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
-# No effect if cuda not installed
-ENV USE_SYSTEM_NCCL=1
-ENV NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
-ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
-

 # (optional) Install UCC
 ARG UCX_COMMIT
@ -74,6 +67,20 @@ ADD ./common/install_ucc.sh install_ucc.sh
 RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi
 RUN rm install_ucc.sh

+# (optional) Install protobuf for ONNX
+ARG PROTOBUF
+COPY ./common/install_protobuf.sh install_protobuf.sh
+RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
+RUN rm install_protobuf.sh
+ENV INSTALLED_PROTOBUF ${PROTOBUF}
+
+# (optional) Install database packages like LMDB and LevelDB
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -81,6 +88,24 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
 RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
 ENV INSTALLED_VISION ${VISION}

+# (optional) Install Vulkan SDK
+ARG VULKAN_SDK_VERSION
+COPY ./common/install_vulkan_sdk.sh install_vulkan_sdk.sh
+RUN if [ -n "${VULKAN_SDK_VERSION}" ]; then bash ./install_vulkan_sdk.sh; fi
+RUN rm install_vulkan_sdk.sh
+
+# (optional) Install swiftshader
+ARG SWIFTSHADER
+COPY ./common/install_swiftshader.sh install_swiftshader.sh
+RUN if [ -n "${SWIFTSHADER}" ]; then bash ./install_swiftshader.sh; fi
+RUN rm install_swiftshader.sh
+
+# (optional) Install non-default CMake version
+ARG CMAKE_VERSION
+COPY ./common/install_cmake.sh install_cmake.sh
+RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
+RUN rm install_cmake.sh
+
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
 COPY ./common/install_ninja.sh install_ninja.sh
@ -102,21 +127,20 @@ RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_d
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

 ARG TRITON
-ARG TRITON_CPU
-
-# Create a separate stage for building Triton and Triton-CPU.  install_triton
-# will check for the presence of env vars
-FROM base as triton-builder
+# Install triton. This needs to be done before sccache because the latter will
+# try to reach out to S3, which docker build runners don't have access to
 COPY ./common/install_triton.sh install_triton.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/triton.txt triton.txt
-COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt
-RUN bash ./install_triton.sh
+RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
+RUN rm install_triton.sh common_utils.sh triton.txt

-FROM base as final
-COPY --from=triton-builder /opt/triton /opt/triton
-RUN if [ -n "${TRITON}" ] || [ -n "${TRITON_CPU}" ]; then pip install /opt/triton/*.whl; chown -R jenkins:jenkins /opt/conda; fi
-RUN rm -rf /opt/triton
+ARG TRITON_CPU
+COPY ./common/install_triton.sh install_triton.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt
+RUN if [ -n "${TRITON_CPU}" ]; then bash ./install_triton.sh; fi
+RUN rm install_triton.sh common_utils.sh triton-cpu.txt

 ARG EXECUTORCH
 # Build and install executorch
--- a/.ci/magma-rocm/.gitignore
+++ b/.ci/magma-rocm/.gitignore
@ -1,2 +0,0 @@
-output/
-magma-rocm*/
--- a/.ci/magma-rocm/Makefile
+++ b/.ci/magma-rocm/Makefile
@ -1,35 +0,0 @@
-SHELL=/usr/bin/env bash
-
-DOCKER_CMD ?= docker
-DESIRED_ROCM ?= 6.4
-DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
-PACKAGE_NAME = magma-rocm
-# inherit this from underlying docker image, do not pass this env var to docker
-#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
-
-DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
-	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
-	-w /builder \
-	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_ROCM_SHORT} \
-	-e DESIRED_ROCM=${DESIRED_ROCM} \
-	"pytorch/almalinux-builder:rocm${DESIRED_ROCM}" \
-	magma-rocm/build_magma.sh
-
-.PHONY: all
-all: magma-rocm64
-all: magma-rocm63
-
-.PHONY:
-clean:
-	$(RM) -r magma-*
-	$(RM) -r output
-
-.PHONY: magma-rocm64
-magma-rocm64: DESIRED_ROCM := 6.4
-magma-rocm64:
-	$(DOCKER_RUN)
-
-.PHONY: magma-rocm63
-magma-rocm63: DESIRED_ROCM := 6.3
-magma-rocm63:
-	$(DOCKER_RUN)
--- a/.ci/magma-rocm/README.md
+++ b/.ci/magma-rocm/README.md
@ -1,48 +0,0 @@
-# Magma ROCm
-
-This folder contains the scripts and configurations to build libmagma.so, linked for various versions of ROCm.
-
-## Building
-
-Look in the `Makefile` for available targets to build. To build any target, for example `magma-rocm63`, run
-
-```
-# Using `docker`
-make magma-rocm63
-
-# Using `podman`
-DOCKER_CMD=podman make magma-rocm63
-```
-
-This spawns a `pytorch/manylinux-rocm<version>` docker image, which has the required `devtoolset` and ROCm versions installed.
-Within the docker image, it runs `build_magma.sh` with the correct environment variables set, which package the necessary files
-into a tarball, with the following structure:
-
-```
-.
-├── include       # header files
-├── lib           # libmagma.so
-├── info
-│   ├── licenses  # license file
-│   └── recipe    # build script
-```
-
-More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the ROCm version.
-Outputted binaries should be in the `output` folder.
-
-
-## Pushing
-
-Packages can be uploaded to an S3 bucket using:
-
-```
-aws s3 cp output/*/magma-cuda*.bz2 <bucket-with-path>
-```
-
-If you do not have upload permissions, please ping @seemethere or @soumith to gain access
-
-## New versions
-
-New ROCm versions can be added by creating a new make target with the next desired version. For ROCm version N.n, the target should be named `magma-rocmNn`.
-
-Make sure to edit the appropriate environment variables (e.g., DESIRED_ROCM) in the `Makefile` accordingly. Remember also to check `build_magma.sh` to ensure the logic for copying over the files remains correct.
--- a/.ci/magma-rocm/build_magma.sh
+++ b/.ci/magma-rocm/build_magma.sh
@ -1,42 +0,0 @@
-#!/usr/bin/env bash
-
-set -eou pipefail
-
-# Environment variables
-# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
-ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-
-# Version 2.7.2 + ROCm related updates
-MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
-
-# Folders for the build
-PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
-PACKAGE_DIR=${ROOT_DIR}/magma-rocm/${PACKAGE_NAME} # build workspace
-PACKAGE_OUTPUT=${ROOT_DIR}/magma-rocm/output # where tarballs are stored
-PACKAGE_BUILD=${PACKAGE_DIR} # where the content of the tarball is prepared
-PACKAGE_RECIPE=${PACKAGE_BUILD}/info/recipe
-PACKAGE_LICENSE=${PACKAGE_BUILD}/info/licenses
-mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RECIPE} ${PACKAGE_LICENSE}
-
-# Fetch magma sources and verify checksum
-pushd ${PACKAGE_DIR}
-git clone https://bitbucket.org/icl/magma.git
-pushd magma
-git checkout ${MAGMA_VERSION}
-popd
-popd
-
-# build
-pushd ${PACKAGE_DIR}/magma
-# The build.sh script expects to be executed from the sources root folder
-INSTALL_DIR=${PACKAGE_BUILD} ${PACKAGE_FILES}/build.sh
-popd
-
-# Package recipe, license and tarball
-# Folder and package name are backward compatible for the build workflow
-cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh
-cp ${PACKAGE_DIR}/magma/COPYRIGHT ${PACKAGE_LICENSE}/COPYRIGHT
-pushd ${PACKAGE_BUILD}
-tar cjf ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2 include lib info
-echo Built in ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2
-popd
--- a/.ci/magma-rocm/package_files/build.sh
+++ b/.ci/magma-rocm/package_files/build.sh
@ -1,38 +0,0 @@
-# Magma build scripts need `python`
-ln -sf /usr/bin/python3 /usr/bin/python
-
-ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
-case "$ID" in
-  almalinux)
-    yum install -y gcc-gfortran
-    ;;
-  *)
-    echo "No preinstalls to build magma..."
-    ;;
-esac
-
-MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}
-
-cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
-echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
-if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then
-    echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc
-fi
-echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc
-echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
-export PATH="${PATH}:/opt/rocm/bin"
-if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
-  amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'`
-else
-  amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs`
-fi
-for arch in $amdgpu_targets; do
-  echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc
-done
-# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
-sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
-make -f make.gen.hipMAGMA -j $(nproc)
-LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}"
-make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}"
-cp -R lib ${INSTALL_DIR}
-cp -R include ${INSTALL_DIR}
--- a/.ci/magma/Makefile
+++ b/.ci/magma/Makefile
@ -1,7 +1,7 @@
 SHELL=/usr/bin/env bash

 DOCKER_CMD ?= docker
-DESIRED_CUDA ?= 12.8
+DESIRED_CUDA ?= 11.8
 DESIRED_CUDA_SHORT = $(subst .,,$(DESIRED_CUDA))
 PACKAGE_NAME = magma-cuda
 CUDA_ARCH_LIST ?= -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90
@ -12,25 +12,20 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \
 	-e DESIRED_CUDA=${DESIRED_CUDA} \
 	-e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \
-	"pytorch/almalinux-builder:cuda${DESIRED_CUDA}-main" \
+	"pytorch/manylinux-builder:cuda${DESIRED_CUDA}-main" \
 	magma/build_magma.sh

 .PHONY: all
-all: magma-cuda129
 all: magma-cuda128
 all: magma-cuda126
+all: magma-cuda124
+all: magma-cuda118

 .PHONY:
 clean:
 	$(RM) -r magma-*
 	$(RM) -r output

-.PHONY: magma-cuda129
-magma-cuda129: DESIRED_CUDA := 12.9
-magma-cuda129: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
-magma-cuda129:
-	$(DOCKER_RUN)
-
 .PHONY: magma-cuda128
 magma-cuda128: DESIRED_CUDA := 12.8
 magma-cuda128: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
@ -41,3 +36,14 @@ magma-cuda128:
 magma-cuda126: DESIRED_CUDA := 12.6
 magma-cuda126:
 	$(DOCKER_RUN)
+
+.PHONY: magma-cuda124
+magma-cuda124: DESIRED_CUDA := 12.4
+magma-cuda124:
+	$(DOCKER_RUN)
+
+.PHONY: magma-cuda118
+magma-cuda118: DESIRED_CUDA := 11.8
+magma-cuda118: CUDA_ARCH_LIST += -gencode arch=compute_37,code=sm_37
+magma-cuda118:
+	$(DOCKER_RUN)
--- a/.ci/manywheel/build_common.sh
+++ b/.ci/manywheel/build_common.sh
@ -18,10 +18,12 @@ retry () {
    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
 }

-PLATFORM=""
+PLATFORM="manylinux2014_x86_64"
 # TODO move this into the Docker images
 OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
-if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
+if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
+    retry yum install -q -y zip openssl
+elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    retry yum install -q -y zip openssl
    PLATFORM="manylinux_2_28_x86_64"
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
@ -31,11 +33,9 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
    # Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968
    # shellcheck disable=SC2046
    sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list")
+
    retry apt-get update
    retry apt-get -y install zip openssl
-else
-    echo "Unknown OS: '$OS_NAME'"
-    exit 1
 fi

 # We use the package name to test the package by passing this to 'pip install'
@ -79,6 +79,8 @@ if [[ -e /opt/openssl ]]; then
    export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH
 fi

+
+
 mkdir -p /tmp/$WHEELHOUSE_DIR

 export PATCHELF_BIN=/usr/local/bin/patchelf
@ -97,7 +99,6 @@ if [[ -z "$PYTORCH_ROOT" ]]; then
    exit 1
 fi
 pushd "$PYTORCH_ROOT"
-retry pip install -q cmake
 python setup.py clean
 retry pip install -qr requirements.txt
 case ${DESIRED_PYTHON} in
@ -110,6 +111,12 @@ case ${DESIRED_PYTHON} in
    ;;
 esac

+if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    export _GLIBCXX_USE_CXX11_ABI=1
+else
+    export _GLIBCXX_USE_CXX11_ABI=0
+fi
+
 if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
    echo "Calling build_amd.py at $(date)"
    python tools/amd_build/build_amd.py
@ -151,7 +158,7 @@ if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
    BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \
    BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
    USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
-    CMAKE_FRESH=1 python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
+    python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR --cmake
    echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
 else
    time CMAKE_ARGS=${CMAKE_ARGS[@]} \
@ -202,6 +209,12 @@ if [[ -n "$BUILD_PYTHONLESS" ]]; then

    mkdir -p /tmp/$LIBTORCH_HOUSE_DIR

+    if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+        LIBTORCH_ABI="cxx11-abi-"
+    else
+        LIBTORCH_ABI=
+    fi
+
    zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch
    cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \
       /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip
@ -320,8 +333,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
            # ROCm workaround for roctracer dlopens
            if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
                patchedpath=$(fname_without_so_number $destpath)
-            # Keep the so number for XPU dependencies and libgomp.so.1 to avoid twice load
-            elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
+            # Keep the so number for XPU dependencies
+            elif [[ "$DESIRED_CUDA" == *"xpu"* ]]; then
                patchedpath=$destpath
            else
                patchedpath=$(fname_with_sha256 $destpath)
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -15,9 +15,6 @@ export INSTALL_TEST=0 # dont install test binaries into site-packages
 export USE_CUPTI_SO=0
 export USE_CUSPARSELT=${USE_CUSPARSELT:-1} # Enable if not disabled by libtorch build
 export USE_CUFILE=${USE_CUFILE:-1}
-export USE_SYSTEM_NCCL=1
-export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
-export NCCL_LIB_DIR="/usr/local/cuda/lib64/"

 # Keep an array of cmake variables to add to
 if [[ -z "$CMAKE_ARGS" ]]; then
@ -39,8 +36,10 @@ if [[ -n "$DESIRED_CUDA" ]]; then
    if [[ ${DESIRED_CUDA} =~ ^[0-9]+\.[0-9]+$ ]]; then
        CUDA_VERSION=${DESIRED_CUDA}
    else
-        # cu126, cu128 etc...
-        if [[ ${#DESIRED_CUDA} -eq 5 ]]; then
+        # cu90, cu92, cu100, cu101
+        if [[ ${#DESIRED_CUDA} -eq 4 ]]; then
+            CUDA_VERSION="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3:1}"
+        elif [[ ${#DESIRED_CUDA} -eq 5 ]]; then
            CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}"
        fi
    fi
@ -54,18 +53,22 @@ cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')

 TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
 case ${CUDA_VERSION} in
-    12.8|12.9)
-        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" #removing sm_50-sm_70 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
+    12.8)
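+        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0;10.0;12.0+PTX" # NOTE(review): this appends to the default list, so 5.0 and 6.0 are still included here (an earlier note said they were ripped out due to an ld error)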
        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
-        # WAR to resolve the ld error in libtorch build with CUDA 12.9
-        if [[ "$DESIRED_CUDA" == "cu129" && "$PACKAGE_TYPE" == "libtorch" ]]; then
-            TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
-        fi
        ;;
    12.6)
+        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0+PTX"
+        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
+        ;;
+    12.4)
        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
        ;;
+    11.8)
+        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};3.7;9.0"
+        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
+        ;;
    *)
        echo "unknown cuda version $CUDA_VERSION"
        exit 1
@ -88,15 +91,14 @@ fi
 mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true

 OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
-if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
+if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
+    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
+elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
    LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
-else
-    echo "Unknown OS: '$OS_NAME'"
-    exit 1
 fi

 DEPS_LIST=(
@ -106,12 +108,31 @@ DEPS_SONAME=(
    "libgomp.so.1"
 )

+# CUDA 11.8 builds have to ship libcusparseLt.so.0 with the binary
+# since nvidia-cusparselt-cu11 is not available on PyPI
+if [[ $USE_CUSPARSELT == "1" && $CUDA_VERSION == "11.8" ]]; then
+        DEPS_SONAME+=(
+            "libcusparseLt.so.0"
+        )
+        DEPS_LIST+=(
+            "/usr/local/cuda/lib64/libcusparseLt.so.0"
+        )
+fi

-# CUDA_VERSION 12.6, 12.8, 12.9
+
+# Turn USE_CUFILE off for CUDA 11.8 and 12.4 since nvidia-cufile-cu11 and 1.9.0.20 are
+# not available on PyPI
+if [[ $CUDA_VERSION == "11.8" || $CUDA_VERSION == "12.4" ]]; then
+    export USE_CUFILE=0
+fi
+
+
+# CUDA_VERSION 12.4, 12.6, 12.8
 if [[ $CUDA_VERSION == 12* ]]; then
    export USE_STATIC_CUDNN=0
    # Try parallelizing nvcc as well
    export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
+
    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
        echo "Bundling with cudnn and cublas."
        DEPS_LIST+=(
@ -127,10 +148,9 @@ if [[ $CUDA_VERSION == 12* ]]; then
            "/usr/local/cuda/lib64/libcublasLt.so.12"
            "/usr/local/cuda/lib64/libcusparseLt.so.0"
            "/usr/local/cuda/lib64/libcudart.so.12"
+            "/usr/local/cuda/lib64/libnvToolsExt.so.1"
            "/usr/local/cuda/lib64/libnvrtc.so.12"
            "/usr/local/cuda/lib64/libnvrtc-builtins.so"
-            "/usr/local/cuda/lib64/libcufile.so.0"
-            "/usr/local/cuda/lib64/libcufile_rdma.so.1"
        )
        DEPS_SONAME+=(
            "libcudnn_adv.so.9"
@ -145,10 +165,93 @@ if [[ $CUDA_VERSION == 12* ]]; then
            "libcublasLt.so.12"
            "libcusparseLt.so.0"
            "libcudart.so.12"
+            "libnvToolsExt.so.1"
            "libnvrtc.so.12"
            "libnvrtc-builtins.so"
-            "libcufile.so.0"
-            "libcufile_rdma.so.1"
+        )
+        if [[ $USE_CUFILE == 1 ]]; then
+            DEPS_LIST+=(
+                "/usr/local/cuda/lib64/libcufile.so.0"
+                "/usr/local/cuda/lib64/libcufile_rdma.so.1"
+            )
+            DEPS_SONAME+=(
+                "libcufile.so.0"
+                "libcufile_rdma.so.1"
+            )
+        fi
+    else
+        echo "Using nvidia libs from pypi."
+        CUDA_RPATHS=(
+            '$ORIGIN/../../nvidia/cublas/lib'
+            '$ORIGIN/../../nvidia/cuda_cupti/lib'
+            '$ORIGIN/../../nvidia/cuda_nvrtc/lib'
+            '$ORIGIN/../../nvidia/cuda_runtime/lib'
+            '$ORIGIN/../../nvidia/cudnn/lib'
+            '$ORIGIN/../../nvidia/cufft/lib'
+            '$ORIGIN/../../nvidia/curand/lib'
+            '$ORIGIN/../../nvidia/cusolver/lib'
+            '$ORIGIN/../../nvidia/cusparse/lib'
+            '$ORIGIN/../../cusparselt/lib'
+            '$ORIGIN/../../nvidia/nccl/lib'
+            '$ORIGIN/../../nvidia/nvtx/lib'
+        )
+        if [[ $USE_CUFILE == 1 ]]; then
+            CUDA_RPATHS+=(
+                '$ORIGIN/../../nvidia/cufile/lib'
+            )
+        fi
+        CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
+        export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
+        export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
+        export FORCE_RPATH="--force-rpath"
+        export USE_STATIC_NCCL=0
+        export USE_SYSTEM_NCCL=1
+        export ATEN_STATIC_CUDA=0
+        export USE_CUDA_STATIC_LINK=0
+        export USE_CUPTI_SO=1
+        export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
+        export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
+    fi
+elif [[ $CUDA_VERSION == "11.8" ]]; then
+    export USE_STATIC_CUDNN=0
+    # Try parallelizing nvcc as well
+    export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
+    # Bundle ptxas into the wheel, see https://github.com/pytorch/pytorch/pull/119750
+    export BUILD_BUNDLE_PTXAS=1
+
+    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
+        echo "Bundling with cudnn and cublas."
+        DEPS_LIST+=(
+            "/usr/local/cuda/lib64/libcudnn_adv.so.9"
+            "/usr/local/cuda/lib64/libcudnn_cnn.so.9"
+            "/usr/local/cuda/lib64/libcudnn_graph.so.9"
+            "/usr/local/cuda/lib64/libcudnn_ops.so.9"
+            "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9"
+            "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9"
+            "/usr/local/cuda/lib64/libcudnn_heuristic.so.9"
+            "/usr/local/cuda/lib64/libcudnn.so.9"
+            "/usr/local/cuda/lib64/libcublas.so.11"
+            "/usr/local/cuda/lib64/libcublasLt.so.11"
+            "/usr/local/cuda/lib64/libcudart.so.11.0"
+            "/usr/local/cuda/lib64/libnvToolsExt.so.1"
+            "/usr/local/cuda/lib64/libnvrtc.so.11.2"    # this is not a mistake, it links to more specific cuda version
+            "/usr/local/cuda/lib64/libnvrtc-builtins.so.11.8"
+        )
+        DEPS_SONAME+=(
+            "libcudnn_adv.so.9"
+            "libcudnn_cnn.so.9"
+            "libcudnn_graph.so.9"
+            "libcudnn_ops.so.9"
+            "libcudnn_engines_runtime_compiled.so.9"
+            "libcudnn_engines_precompiled.so.9"
+            "libcudnn_heuristic.so.9"
+            "libcudnn.so.9"
+            "libcublas.so.11"
+            "libcublasLt.so.11"
+            "libcudart.so.11.0"
+            "libnvToolsExt.so.1"
+            "libnvrtc.so.11.2"
+            "libnvrtc-builtins.so.11.8"
        )
    else
        echo "Using nvidia libs from pypi."
@ -162,21 +265,20 @@ if [[ $CUDA_VERSION == 12* ]]; then
            '$ORIGIN/../../nvidia/curand/lib'
            '$ORIGIN/../../nvidia/cusolver/lib'
            '$ORIGIN/../../nvidia/cusparse/lib'
-            '$ORIGIN/../../nvidia/cusparselt/lib'
-            '$ORIGIN/../../cusparselt/lib'
            '$ORIGIN/../../nvidia/nccl/lib'
-            '$ORIGIN/../../nvidia/nvshmem/lib'
            '$ORIGIN/../../nvidia/nvtx/lib'
-            '$ORIGIN/../../nvidia/cufile/lib'
        )
        CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
        export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
        export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
        export FORCE_RPATH="--force-rpath"
        export USE_STATIC_NCCL=0
+        export USE_SYSTEM_NCCL=1
        export ATEN_STATIC_CUDA=0
        export USE_CUDA_STATIC_LINK=0
        export USE_CUPTI_SO=1
+        export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
+        export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
    fi
 else
    echo "Unknown cuda version $CUDA_VERSION"
--- a/.ci/manywheel/build_libtorch.sh
+++ b/.ci/manywheel/build_libtorch.sh
@ -22,7 +22,9 @@ retry () {

 # TODO move this into the Docker images
 OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
-if [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
+if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
+    retry yum install -q -y zip openssl
+elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    retry yum install -q -y zip openssl
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
    retry dnf install -q -y zip openssl
@ -33,9 +35,6 @@ elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
    sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list")
    retry apt-get update
    retry apt-get -y install zip openssl
-else
-    echo "Unknown OS: '$OS_NAME'"
-    exit 1
 fi

 # Version: setup.py uses $PYTORCH_BUILD_VERSION.post$PYTORCH_BUILD_NUMBER if
@ -92,11 +91,16 @@ if [[ -z "$PYTORCH_ROOT" ]]; then
    exit 1
 fi
 pushd "$PYTORCH_ROOT"
-retry pip install -q cmake
 python setup.py clean
 retry pip install -qr requirements.txt
 retry pip install -q numpy==2.0.1

+if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    export _GLIBCXX_USE_CXX11_ABI=1
+else
+    export _GLIBCXX_USE_CXX11_ABI=0
+fi
+
 if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
    echo "Calling build_amd.py at $(date)"
    python tools/amd_build/build_amd.py
@ -165,6 +169,12 @@ fi

 )

+if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    LIBTORCH_ABI="cxx11-abi-"
+else
+    LIBTORCH_ABI=
+fi
+
 (
    set -x

--- a/.ci/manywheel/build_rocm.sh
+++ b/.ci/manywheel/build_rocm.sh
@ -95,7 +95,6 @@ ROCM_SO_FILES=(
    "libroctracer64.so"
    "libroctx64.so"
    "libhipblaslt.so"
-    "libhipsparselt.so"
    "libhiprtc.so"
 )

@ -187,28 +186,20 @@ do
    OS_SO_FILES[${#OS_SO_FILES[@]}]=$file_name # Append lib to array
 done

-ARCH=$(echo $PYTORCH_ROCM_ARCH | sed 's/;/|/g') # Replace ; separated arch list to bar for grep
-
 # rocBLAS library files
 ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library
 ROCBLAS_LIB_DST=lib/rocblas/library
-ROCBLAS_ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
-ROCBLAS_OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
-ROCBLAS_LIB_FILES=($ROCBLAS_ARCH_SPECIFIC_FILES $OTHER_FILES)
+ARCH=$(echo $PYTORCH_ROCM_ARCH | sed 's/;/|/g') # Replace ; separated arch list to bar for grep
+ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
+OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
+ROCBLAS_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES)

 # hipblaslt library files
 HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library
 HIPBLASLT_LIB_DST=lib/hipblaslt/library
-HIPBLASLT_ARCH_SPECIFIC_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -E $ARCH)
-HIPBLASLT_OTHER_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -v gfx)
-HIPBLASLT_LIB_FILES=($HIPBLASLT_ARCH_SPECIFIC_FILES $HIPBLASLT_OTHER_FILES)
-
-# hipsparselt library files
-HIPSPARSELT_LIB_SRC=$ROCM_HOME/lib/hipsparselt/library
-HIPSPARSELT_LIB_DST=lib/hipsparselt/library
-HIPSPARSELT_ARCH_SPECIFIC_FILES=$(ls $HIPSPARSELT_LIB_SRC | grep -E $ARCH)
-#HIPSPARSELT_OTHER_FILES=$(ls $HIPSPARSELT_LIB_SRC | grep -v gfx)
-HIPSPARSELT_LIB_FILES=($HIPSPARSELT_ARCH_SPECIFIC_FILES $HIPSPARSELT_OTHER_FILES)
+ARCH_SPECIFIC_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -E $ARCH)
+OTHER_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -v gfx)
+HIPBLASLT_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES)

 # ROCm library files
 ROCM_SO_PATHS=()
@ -243,14 +234,12 @@ DEPS_SONAME=(
 DEPS_AUX_SRCLIST=(
    "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_SRC/}"
    "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_SRC/}"
-    "${HIPSPARSELT_LIB_FILES[@]/#/$HIPSPARSELT_LIB_SRC/}"
    "/opt/amdgpu/share/libdrm/amdgpu.ids"
 )

 DEPS_AUX_DSTLIST=(
    "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_DST/}"
    "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_DST/}"
-    "${HIPSPARSELT_LIB_FILES[@]/#/$HIPSPARSELT_LIB_DST/}"
    "share/libdrm/amdgpu.ids"
 )

--- a/.ci/manywheel/build_xpu.sh
+++ b/.ci/manywheel/build_xpu.sh
@ -20,11 +20,7 @@ fi
 source /opt/intel/oneapi/compiler/latest/env/vars.sh
 source /opt/intel/oneapi/pti/latest/env/vars.sh
 source /opt/intel/oneapi/umf/latest/env/vars.sh
-source /opt/intel/oneapi/ccl/latest/env/vars.sh
-source /opt/intel/oneapi/mpi/latest/env/vars.sh
 export USE_STATIC_MKL=1
-export USE_ONEMKL=1
-export USE_XCCL=1

 WHEELHOUSE_DIR="wheelhousexpu"
 LIBTORCH_HOUSE_DIR="libtorch_housexpu"
--- a/.ci/onnx/README.md
+++ b/.ci/onnx/README.md
@ -10,3 +10,5 @@ example: `py2-cuda9.0-cudnn7-ubuntu16.04`. The Docker images that are
 built on Jenkins and are used in triggered builds already have this
 environment variable set in their manifest. Also see
 `./docker/jenkins/*/Dockerfile` and search for `BUILD_ENVIRONMENT`.
+
+Our Jenkins installation is located at https://ci.pytorch.org/jenkins/.
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -27,12 +27,6 @@ cmake --version
 echo "Environment variables:"
 env

-# The sccache wrapped version of nvcc gets put in /opt/cache/lib in docker since
-# there are some issues if it is always wrapped, so we need to add it to PATH
-# during CI builds.
-# https://github.com/pytorch/pytorch/blob/0b6c0898e6c352c8ea93daec854e704b41485375/.ci/docker/common/install_cache.sh#L97
-export PATH="/opt/cache/lib:$PATH"
-
 if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
  # Use jemalloc during compilation to mitigate https://github.com/pytorch/pytorch/issues/116289
  export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2
@ -41,7 +35,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi

 if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
-  if [[ "$BUILD_ENVIRONMENT" != *clang* ]]; then
+  if [[ "$BUILD_ENVIRONMENT" != *cuda11.3* && "$BUILD_ENVIRONMENT" != *clang* ]]; then
    # TODO: there is a linking issue when building with UCC using clang,
    # disable it for now and to be fix later.
    # TODO: disable UCC temporarily to enable CUDA 12.1 in CI
@ -58,6 +52,12 @@ fi
 export USE_LLVM=/opt/llvm
 export LLVM_DIR=/opt/llvm/lib/cmake/llvm

+if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then
+  # To build test_edge_op_registration
+  export BUILD_EXECUTORCH=ON
+  export USE_CUDA=0
+fi
+
 if ! which conda; then
  # In ROCm CIs, we are doing cross compilation on build machines with
  # intel cpu and later run tests on machines with amd cpu.
@ -171,15 +171,8 @@ fi
 if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/compiler/latest/env/vars.sh
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/ccl/latest/env/vars.sh
-  # shellcheck disable=SC1091
-  source /opt/intel/oneapi/mpi/latest/env/vars.sh
-  # Enable XCCL build
-  export USE_XCCL=1
  # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
  export USE_KINETO=0
-  export TORCH_XPU_ARCH_LIST=pvc
 fi

 # sccache will fail for CUDA builds if all cores are used for compiling
@ -198,7 +191,7 @@ fi

 # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
 # memory to build and will OOM
-if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]] && [ -z "$MAX_JOBS_OVERRIDE" ]; then
+if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
  echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
  echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
  export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
@ -257,7 +250,6 @@ if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
  set -e -o pipefail

  get_bazel
-  python3 tools/optional_submodules.py checkout_eigen

  # Leave 1 CPU free and use only up to 80% of memory to reduce the change of crashing
  # the runner
@ -284,8 +276,10 @@ else
    # or building non-XLA tests.
    if [[ "$BUILD_ENVIRONMENT" != *rocm*  &&
          "$BUILD_ENVIRONMENT" != *xla* ]]; then
-      # Install numpy-2.0.2 for builds which are backward compatible with 1.X
-      python -mpip install numpy==2.0.2
+      if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
+        # Install numpy-2.0.2 for builds which are backward compatible with 1.X
+        python -mpip install numpy==2.0.2
+      fi

      WERROR=1 python setup.py clean

@ -308,18 +302,6 @@ else
    fi
    pip_install_whl "$(echo dist/*.whl)"

-    if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
-      echo "Checking that xpu is compiled"
-      pushd dist/
-      if python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)'; then
-        echo "XPU support is compiled in."
-      else
-        echo "XPU support is NOT compiled in."
-        exit 1
-      fi
-      popd
-    fi
-
    # TODO: I'm not sure why, but somehow we lose verbose commands
    set -x

@ -395,10 +377,8 @@ else
    # This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization
    # is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has
    # 16 CPUs
-    if [ -z "$MAX_JOBS_OVERRIDE" ]; then
-      MAX_JOBS=$(nproc --ignore=4)
-      export MAX_JOBS
-    fi
+    MAX_JOBS=$(nproc --ignore=4)
+    export MAX_JOBS

    # NB: Install outside of source directory (at the same level as the root
    # pytorch folder) so that it doesn't get cleaned away prior to docker push.
--- a/.ci/pytorch/check_binary.sh
+++ b/.ci/pytorch/check_binary.sh
@ -59,16 +59,78 @@ else
  export install_root="$(dirname $(which python))/../lib/python${py_dot}/site-packages/torch/"
 fi

+###############################################################################
+# Setup XPU ENV
+###############################################################################
+if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
+  set +u
+  # Refer https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
+  source /opt/intel/oneapi/compiler/latest/env/vars.sh
+  source /opt/intel/oneapi/pti/latest/env/vars.sh
+fi
+
 ###############################################################################
 # Check GCC ABI
 ###############################################################################

-# NOTE: As of https://github.com/pytorch/pytorch/issues/126551 we only produce
-#       wheels with cxx11-abi
+# NOTE [ Building libtorch with old vs. new gcc ABI ]
+#
+# Packages built with one version of ABI could not be linked against by client
+# C++ libraries that were compiled using the other version of ABI. Since both
+# gcc ABIs are still common in the wild, we need to support both ABIs. Currently:
+#
+# - All the nightlies built on CentOS 7 + devtoolset7 use the old gcc ABI.
+# - All the nightlies built on Ubuntu 16.04 + gcc 5.4 use the new gcc ABI.

 echo "Checking that the gcc ABI is what we expect"
 if [[ "$(uname)" != 'Darwin' ]]; then
-  # We also check that there are cxx11 symbols in libtorch
+  function is_expected() {
+    if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* || "$DESIRED_CUDA" == *"rocm"* ]]; then
+      if [[ "$1" -gt 0 || "$1" == "ON " ]]; then
+        echo 1
+      fi
+    else
+      if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then
+        echo 1
+      fi
+    fi
+  }
+
+  # First we check that the env var in TorchConfig.cmake is correct
+
+  # We search for D_GLIBCXX_USE_CXX11_ABI=1 in torch/TorchConfig.cmake
+  torch_config="${install_root}/share/cmake/Torch/TorchConfig.cmake"
+  if [[ ! -f "$torch_config" ]]; then
+    echo "No TorchConfig.cmake found!"
+    ls -lah "$install_root/share/cmake/Torch"
+    exit 1
+  fi
+  echo "Checking the TorchConfig.cmake"
+  cat "$torch_config"
+
+  # The sed call below is
+  #   don't print lines by default (only print the line we want)
+  # -n
+  #   execute the following expression
+  # e
+  #   replace lines that match with the first capture group and print
+  # s/.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/\1/p
+  #   any characters, D_GLIBCXX_USE_CXX11_ABI=, exactly one any character, a
+  #   quote, any characters
+  #   Note the exactly one single character after the '='. In the case that the
+  #     variable is not set the '=' will be followed by a '"' immediately and the
+  #     line will fail the match and nothing will be printed; this is what we
+  #     want.  Otherwise it will capture the 0 or 1 after the '='.
+  # /.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/
+  #   replace the matched line with the capture group and print
+  # /\1/p
+  actual_gcc_abi="$(sed -ne 's/.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/\1/p' < "$torch_config")"
+  if [[ "$(is_expected "$actual_gcc_abi")" != 1 ]]; then
+    echo "gcc ABI $actual_gcc_abi not as expected."
+    exit 1
+  fi
+
+  # We also check that there are [not] cxx11 symbols in libtorch
  #
  echo "Checking that symbols in libtorch.so have the right gcc abi"
  python3 "$(dirname ${BASH_SOURCE[0]})/smoke_test/check_binary_symbols.py"
@ -146,11 +208,35 @@ setup_link_flags () {

 TEST_CODE_DIR="$(dirname $(realpath ${BASH_SOURCE[0]}))/test_example_code"
 build_and_run_example_cpp () {
+  if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    GLIBCXX_USE_CXX11_ABI=1
+  else
+    GLIBCXX_USE_CXX11_ABI=0
+  fi
  setup_link_flags
-  g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1
+  g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1
  ./$1
 }

+build_example_cpp_with_incorrect_abi () {
+  if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    GLIBCXX_USE_CXX11_ABI=0
+  else
+    GLIBCXX_USE_CXX11_ABI=1
+  fi
+  set +e
+  setup_link_flags
+  g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1
+  ERRCODE=$?
+  set -e
+  if [ "$ERRCODE" -eq "0" ]; then
+    echo "Building example with incorrect ABI didn't throw error. Aborting."
+    exit 1
+  else
+    echo "Building example with incorrect ABI throws expected error. Proceeding."
+  fi
+}
+
 ###############################################################################
 # Check simple Python/C++ calls
 ###############################################################################
@ -160,6 +246,11 @@ if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
    export LD_LIBRARY_PATH=/usr/local/cuda/lib64
  fi
  build_and_run_example_cpp simple-torch-test
+  # `_GLIBCXX_USE_CXX11_ABI` is always ignored by gcc in devtoolset7, so we test
+  # the expected failure case for Ubuntu 16.04 + gcc 5.4 only.
+  if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    build_example_cpp_with_incorrect_abi simple-torch-test
+  fi
 else
  pushd /tmp
  python -c 'import torch'
@ -216,14 +307,6 @@ else
  fi
 fi

-###############################################################################
-# Check XPU configured correctly
-###############################################################################
-if [[ "$DESIRED_CUDA" == 'xpu' && "$PACKAGE_TYPE" != 'libtorch' ]]; then
-  echo "Checking that xpu is compiled"
-  python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)'
-fi
-
 ###############################################################################
 # Check CUDA configured correctly
 ###############################################################################
@ -302,22 +385,10 @@ except RuntimeError as e:
 fi

 ###############################################################################
-# Check for C++ ABI compatibility to GCC-11 - GCC 13
+# Check for C++ ABI compatibility between gcc7 and gcc9 compiled binaries
 ###############################################################################
 if [[ "$(uname)" == 'Linux' &&  "$PACKAGE_TYPE" == 'manywheel' ]]; then
  pushd /tmp
-  # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html
-  # gcc-11 is ABI16, gcc-13 is ABI18, gcc-14 is ABI19
-  # gcc 11 - CUDA 11.8, xpu, rocm
-  # gcc 13 - CUDA 12.6, 12.8 and cpu
-  # Please see issue for reference: https://github.com/pytorch/pytorch/issues/152426
-  if [[ "$(uname -m)" == "s390x" ]]; then
-    cxx_abi="19"
-  elif [[ "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'rocm'* ]]; then
-    cxx_abi="18"
-  else
-    cxx_abi="16"
-  fi
-  python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi10${cxx_abi}' else 1)"
+  python -c "import torch; exit(0 if torch.compiled_with_cxx11_abi() else (0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi1011' else 1))"
  popd
 fi
--- a/.ci/pytorch/common.sh
+++ b/.ci/pytorch/common.sh
@ -13,8 +13,12 @@ if [[ "${BUILD_ENVIRONMENT}" == *rocm* ]]; then
  # HIP_PLATFORM is auto-detected by hipcc; unset to avoid build errors
  unset HIP_PLATFORM
  export PYTORCH_TEST_WITH_ROCM=1
+  # temporary to locate some kernel issues on the CI nodes
+  export HSAKMT_DEBUG_LEVEL=4
+  # improve rccl performance for distributed tests
+  export HSA_FORCE_FINE_GRAIN_PCIE=1
 fi

-# TODO: Reenable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598
+# TODO: Re-enable libtorch testing for MacOS, see https://github.com/pytorch/pytorch/issues/62598
 # shellcheck disable=SC2034
 BUILD_TEST_LIBTORCH=0
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -159,6 +159,11 @@ function install_torchvision() {
  fi
 }

+function install_tlparse() {
+  pip_install --user "tlparse==0.3.30"
+  PATH="$(python -m site --user-base)/bin:$PATH"
+}
+
 function install_torchrec_and_fbgemm() {
  local torchrec_commit
  torchrec_commit=$(get_pinned_commit torchrec)
--- a/.ci/pytorch/install_cache_xla.sh
+++ b/.ci/pytorch/install_cache_xla.sh
@ -1,50 +1,31 @@
 #!/bin/bash

 # Script for installing sccache on the xla build job, which uses xla's docker
-# image, which has sccache installed but doesn't write the stubs.  This is
-# mostly copied from .ci/docker/install_cache.sh.  Changes are: removing checks
-# that will always return the same thing, ex checks for for rocm, CUDA, changing
-# the path where sccache is installed, not changing /etc/environment, and not
-# installing/downloading sccache as it is already in the docker image.
+# image and doesn't have sccache installed on it.  This is mostly copied from
+# .ci/docker/install_cache.sh.  Changes are: removing checks that will always
+# return the same thing, e.g. checks for rocm, CUDA, and changing the path
+# where sccache is installed, and not changing /etc/environment.

 set -ex -o pipefail

+install_binary() {
+  echo "Downloading sccache binary from S3 repo"
+  curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /tmp/cache/bin/sccache
+}
+
 mkdir -p /tmp/cache/bin
+mkdir -p /tmp/cache/lib
 export PATH="/tmp/cache/bin:$PATH"

+install_binary
+chmod a+x /tmp/cache/bin/sccache
+
 function write_sccache_stub() {
  # Unset LD_PRELOAD for ps because of asan + ps issues
  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
-  if [ "$1" == "gcc" ]; then
-    # Do not call sccache recursively when dumping preprocessor argument
-    # For some reason it's very important for the first cached nvcc invocation
-    cat >"/tmp/cache/bin/$1" <<EOF
-#!/bin/sh
-
-# sccache does not support -E flag, so we need to call the original compiler directly in order to avoid calling this wrapper recursively
-for arg in "\$@"; do
-  if [ "\$arg" = "-E" ]; then
-    exec $(which "$1") "\$@"
-  fi
-done
-
-if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
-  exec sccache $(which "$1") "\$@"
-else
-  exec $(which "$1") "\$@"
-fi
-EOF
-  else
-    cat >"/tmp/cache/bin/$1" <<EOF
-#!/bin/sh
-
-if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
-  exec sccache $(which "$1") "\$@"
-else
-  exec $(which "$1") "\$@"
-fi
-EOF
-  fi
+  # shellcheck disable=SC2086
+  # shellcheck disable=SC2059
+  printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n  exec sccache $(which $1) \"\$@\"\nelse\n  exec $(which $1) \"\$@\"\nfi" > "/tmp/cache/bin/$1"
  chmod a+x "/tmp/cache/bin/$1"
 }

--- a/.ci/pytorch/macos-build.sh
+++ b/.ci/pytorch/macos-build.sh
@ -33,15 +33,56 @@ if which sccache > /dev/null; then
  export PATH="${tmp_dir}:$PATH"
 fi

-print_cmake_info
-if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
-  # Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
-  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
-else
+cross_compile_arm64() {
+  # Cross compilation for arm64
  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
-  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel --plat-name macosx_11_0_arm64
+  USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
+}
+
+compile_arm64() {
+  # Compilation for arm64
+  # TODO: Compile with OpenMP support (but this causes CI regressions as cross-compilation were done with OpenMP disabled)
+  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
+}
+
+compile_x86_64() {
+  USE_DISTRIBUTED=0 WERROR=1 python setup.py bdist_wheel --plat-name=macosx_10_9_x86_64
+}
+
+build_lite_interpreter() {
+    echo "Testing libtorch (lite interpreter)."
+
+    CPP_BUILD="$(pwd)/../cpp_build"
+    # Ensure the removal of the tmp directory
+    trap 'rm -rfv ${CPP_BUILD}' EXIT
+    rm -rf "${CPP_BUILD}"
+    mkdir -p "${CPP_BUILD}/caffe2"
+
+    # It looks like libtorch needs to be built in the "${CPP_BUILD}/caffe2" folder.
+    BUILD_LIBTORCH_PY=$PWD/tools/build_libtorch.py
+    pushd "${CPP_BUILD}/caffe2" || exit
+    VERBOSE=1 DEBUG=1 python "${BUILD_LIBTORCH_PY}"
+    popd || exit
+
+    "${CPP_BUILD}/caffe2/build/bin/test_lite_interpreter_runtime"
+}
+
+print_cmake_info
+
+if [[ ${BUILD_ENVIRONMENT} = *arm64* ]]; then
+  if [[ $(uname -m) == "arm64" ]]; then
+    compile_arm64
+  else
+    cross_compile_arm64
+  fi
+elif [[ ${BUILD_ENVIRONMENT} = *lite-interpreter* ]]; then
+  export BUILD_LITE_INTERPRETER=1
+  build_lite_interpreter
+else
+  compile_x86_64
 fi
+
 if which sccache > /dev/null; then
  print_sccache_stats
 fi
--- a/.ci/pytorch/macos-common.sh
+++ b/.ci/pytorch/macos-common.sh
@ -20,4 +20,14 @@ print_cmake_info() {
  CONDA_INSTALLATION_DIR=$(dirname "$CMAKE_EXEC")
  # Print all libraries under cmake rpath for debugging
  ls -la "$CONDA_INSTALLATION_DIR/../lib"
+
+  export CMAKE_EXEC
+  # Explicitly add conda env lib folder to cmake rpath to address the flaky issue
+  # where cmake dependencies couldn't be found. This seems to point to how conda
+  # links $CMAKE_EXEC to its package cache when cloning a new environment
+  install_name_tool -add_rpath @executable_path/../lib "${CMAKE_EXEC}" || true
+  # Adding the rpath will invalidate cmake signature, so signing it again here
+  # to trust the executable. EXC_BAD_ACCESS (SIGKILL (Code Signature Invalid))
+  # with an exit code 137 otherwise
+  codesign -f -s - "${CMAKE_EXEC}" || true
 }
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -42,16 +42,6 @@ test_python_all() {
  assert_git_not_dirty
 }

-test_python_mps() {
-  setup_test_python
-
-  time python test/run_test.py --verbose --mps
-  MTL_CAPTURE_ENABLED=1 ${CONDA_RUN} python3 test/test_mps.py --verbose -k test_metal_capture
-
-  assert_git_not_dirty
-}
-
-
 test_python_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
@ -165,7 +155,6 @@ test_jit_hooks() {
 torchbench_setup_macos() {
  git clone --recursive https://github.com/pytorch/vision torchvision
  git clone --recursive https://github.com/pytorch/audio torchaudio
-  brew install jpeg-turbo libpng

  pushd torchvision
  git fetch
@ -180,8 +169,7 @@ torchbench_setup_macos() {
  git checkout "$(cat ../.github/ci_commit_pins/audio.txt)"
  git submodule update --init --recursive
  python setup.py clean
-  #TODO: Remove me, when figure out how to make TorchAudio find brew installed openmp
-  USE_OPENMP=0 python setup.py develop
+  python setup.py develop
  popd

  # Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
@ -189,8 +177,9 @@ torchbench_setup_macos() {
  checkout_install_torchbench
 }

-pip_benchmark_deps() {
-  python -mpip install --no-input astunparse requests cython scikit-learn
+conda_benchmark_deps() {
+  conda install -y astunparse numpy scipy ninja pyyaml setuptools cmake typing-extensions requests protobuf numba cython scikit-learn
+  conda install -y -c conda-forge librosa
 }


@ -198,7 +187,7 @@ test_torchbench_perf() {
  print_cmake_info

  echo "Launching torchbench setup"
-  pip_benchmark_deps
+  conda_benchmark_deps
  torchbench_setup_macos

  TEST_REPORTS_DIR=$(pwd)/test/test-reports
@ -225,61 +214,32 @@ test_torchbench_smoketest() {
  print_cmake_info

  echo "Launching torchbench setup"
-  pip_benchmark_deps
+  conda_benchmark_deps
  # shellcheck disable=SC2119,SC2120
  torchbench_setup_macos

  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

+  local backend=eager
+  local dtype=notset
  local device=mps
-  local dtypes=(undefined float16 bfloat16 notset)
-  local dtype=${dtypes[$1]}
-  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam sam_fast pytorch_unet stable_diffusion_text_encoder speech_transformer Super_SloMo doctr_det_predictor doctr_reco_predictor timm_resnet timm_vovnet vgg16)

-  for backend in eager inductor; do
+  touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
+  touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"

-    echo "Launching torchbench inference performance run for backend ${backend} and dtype ${dtype}"
-    local dtype_arg="--${dtype}"
-    if [ "$dtype" == notset ]; then
-        dtype_arg="--float32"
-    fi
-    touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
-    for model in "${models[@]}"; do
-      PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-        --performance --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
-        --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" || true
-      if [ "$backend" == "inductor" ]; then
-        PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-          --accuracy --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
-          --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_accuracy.csv" || true
-      fi
-    done
-    if [ "$backend" == "inductor" ]; then
-      PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
-        --performance --backend "$backend" --inference --devices "$device" "$dtype_arg" \
-        --output "$TEST_REPORTS_DIR/inductor_${backend}_huggingface_${dtype}_inference_${device}_performance.csv" || true
-      PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/huggingface.py \
-        --accuracy --backend "$backend" --inference --devices "$device" "$dtype_arg" \
-        --output "$TEST_REPORTS_DIR/inductor_${backend}_huggingface_${dtype}_inference_${device}_accuracy.csv" || true
-    fi
-
-    if [ "$dtype" == notset ]; then
-      for dtype_ in notset amp; do
-        echo "Launching torchbench training performance run for backend ${backend} and dtype ${dtype_}"
-        touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype_}_training_${device}_performance.csv"
-        local dtype_arg="--${dtype_}"
-        if [ "$dtype_" == notset ]; then
-          dtype_arg="--float32"
-        fi
-        for model in "${models[@]}"; do
-          PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-            --performance --only "$model" --backend "$backend" --training --devices "$device" "$dtype_arg" \
-            --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype_}_training_${device}_performance.csv" || true
-        done
-      done
-    fi
+  echo "Setup complete, launching torchbench training performance run"
+  for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
+    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
+      --performance --only "$model" --backend "$backend" --training --devices "$device" \
+      --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
+  done

+  echo "Launching torchbench inference performance run"
+  for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
+    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
+      --performance --only "$model" --backend "$backend" --inference --devices "$device" \
+      --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
  done

  echo "Pytorch benchmark on mps device completed"
@ -289,7 +249,7 @@ test_hf_perf() {
  print_cmake_info
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"
-  pip_benchmark_deps
+  conda_benchmark_deps
  torchbench_setup_macos

  echo "Launching HuggingFace training perf run"
@ -305,7 +265,7 @@ test_timm_perf() {
  print_cmake_info
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"
-  pip_benchmark_deps
+  conda_benchmark_deps
  torchbench_setup_macos

  echo "Launching timm training perf run"
@ -317,6 +277,8 @@ test_timm_perf() {
  echo "timm benchmark on mps device completed"
 }

+install_tlparse
+
 if [[ $TEST_CONFIG == *"perf_all"* ]]; then
  test_torchbench_perf
  test_hf_perf
@ -328,9 +290,7 @@ elif [[ $TEST_CONFIG == *"perf_hf"* ]]; then
 elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then
  test_timm_perf
 elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then
-  test_torchbench_smoketest "${SHARD_NUMBER}"
-elif [[ $TEST_CONFIG == *"mps"* ]]; then
-  test_python_mps
+  test_torchbench_smoketest
 elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then
  test_python_shard "${SHARD_NUMBER}"
  if [[ "${SHARD_NUMBER}" == 1 ]]; then
--- a/.ci/pytorch/perf_test/common.sh
+++ b/.ci/pytorch/perf_test/common.sh
@ -0,0 +1,22 @@
+#!/bin/bash
+set -e
+
+run_test () {
+  rm -rf test_tmp/ && mkdir test_tmp/ && cd test_tmp/
+  "$@"
+  cd .. && rm -rf test_tmp/
+}
+
+get_runtime_of_command () {
+  TIMEFORMAT=%R
+
+  # runtime=$( { time ($@ &> /dev/null); } 2>&1 1>/dev/null)
+  runtime=$( { time "$@"; } 2>&1 1>/dev/null)
+  if [[ $runtime == *"Error"* ]]; then
+    exit 1
+  fi
+  runtime=${runtime#+++ $@}
+  runtime=$(python -c "print($runtime)")
+
+  echo "$runtime"
+}
--- a/.ci/pytorch/perf_test/compare_with_baseline.py
+++ b/.ci/pytorch/perf_test/compare_with_baseline.py
@ -0,0 +1,105 @
+"""Compare a perf-test sample against its stored baseline.
+
+The baseline mean/sigma for ``--test-name`` is read from
+``../{backend}_runtime.json``; the script raises when the sample mean is
+3 or more baseline sigmas above the baseline mean. With ``--update``, a
+passing sample is recorded in ``../new_{backend}_runtime.json``.
+"""
+
+import argparse
+import json
+import math
+import sys
+
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    "--test-name", dest="test_name", action="store", required=True, help="test name"
+)
+parser.add_argument(
+    "--sample-stats",
+    dest="sample_stats",
+    action="store",
+    required=True,
+    help="stats from sample",
+)
+parser.add_argument(
+    "--update",
+    action="store_true",
+    help="whether to update baseline using stats from sample",
+)
+args = parser.parse_args()
+
+test_name = args.test_name
+
+# The baseline file is selected by the backend named in the test name.
+if "cpu" in test_name:
+    backend = "cpu"
+elif "gpu" in test_name:
+    backend = "gpu"
+else:
+    # Fail with a usage error instead of a NameError on `backend` below.
+    parser.error(f"--test-name must contain 'cpu' or 'gpu', got {test_name!r}")
+
+data_file_path = f"../{backend}_runtime.json"
+
+with open(data_file_path) as data_file:
+    data = json.load(data_file)
+
+if test_name in data:
+    mean = float(data[test_name]["mean"])
+    sigma = float(data[test_name]["sigma"])
+else:
+    # Let the test pass if baseline number doesn't exist
+    mean = sys.maxsize
+    sigma = 0.001
+
+print("population mean: ", mean)
+print("population sigma: ", sigma)
+
+# Let the test pass if baseline number is NaN (which happened in
+# the past when we didn't have logic for catching NaN numbers)
+if math.isnan(mean) or math.isnan(sigma):
+    mean = sys.maxsize
+    sigma = 0.001
+
+sample_stats_data = json.loads(args.sample_stats)
+
+sample_mean = float(sample_stats_data["mean"])
+sample_sigma = float(sample_stats_data["sigma"])
+
+print("sample mean: ", sample_mean)
+print("sample sigma: ", sample_sigma)
+
+if math.isnan(sample_mean):
+    raise Exception("""Error: sample mean is NaN""")  # noqa: TRY002
+elif math.isnan(sample_sigma):
+    raise Exception("""Error: sample sigma is NaN""")  # noqa: TRY002
+
+# Number of baseline sigmas by which the sample mean exceeds the baseline mean.
+z_value = (sample_mean - mean) / sigma
+
+print("z-value: ", z_value)
+
+if z_value >= 3:
+    raise Exception(  # noqa: TRY002
+        f"""\n
+z-value >= 3, there is high chance of perf regression.\n
+To reproduce this regression, run
+`cd .ci/pytorch/perf_test/ && bash {test_name}.sh` on your local machine
+and compare the runtime before/after your code change.
+"""
+    )
+else:
+    print("z-value < 3, no perf regression detected.")
+    if args.update:
+        print("We will use these numbers as new baseline.")
+        new_data_file_path = f"../new_{backend}_runtime.json"
+        with open(new_data_file_path) as new_data_file:
+            new_data = json.load(new_data_file)
+        new_data[test_name] = {}
+        new_data[test_name]["mean"] = sample_mean
+        # Floor the stored sigma at 10% of the sample mean.
+        new_data[test_name]["sigma"] = max(sample_sigma, sample_mean * 0.1)
+        with open(new_data_file_path, "w") as new_data_file:
+            json.dump(new_data, new_data_file, indent=4)
--- a/.ci/pytorch/perf_test/get_stats.py
+++ b/.ci/pytorch/perf_test/get_stats.py
@ -0,0 +1,18 @@
+import json
+import sys
+
+import numpy
+
+
+sample_data_list = sys.argv[1:]
+sample_data_list = [float(v.strip()) for v in sample_data_list]
+
+sample_mean = numpy.mean(sample_data_list)
+sample_sigma = numpy.std(sample_data_list)
+
+data = {
+    "mean": sample_mean,
+    "sigma": sample_sigma,
+}
+
+print(json.dumps(data))
--- a/.ci/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh
+++ b/.ci/pytorch/perf_test/test_cpu_speed_mini_sequence_labeler.sh
@ -0,0 +1,43 @@
+#!/bin/bash
+set -e
+
+. ./common.sh
+
+test_cpu_speed_mini_sequence_labeler () {
+  echo "Testing: mini sequence labeler, CPU"
+
+  export OMP_NUM_THREADS=4
+  export MKL_NUM_THREADS=4
+
+  git clone https://github.com/pytorch/benchmark.git
+
+  cd benchmark/
+
+  git checkout 726567a455edbfda6199445922a8cfee82535664
+
+  cd scripts/mini_sequence_labeler
+
+  SAMPLE_ARRAY=()
+  NUM_RUNS=$1
+
+  for (( i=1; i<=NUM_RUNS; i++ )) do
+    runtime=$(get_runtime_of_command python main.py)
+    SAMPLE_ARRAY+=("${runtime}")
+  done
+
+  cd ../../..
+
+  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
+  echo "Runtime stats in seconds:"
+  echo "$stats"
+
+  if [ "$2" == "compare_with_baseline" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"
+  elif [ "$2" == "compare_and_update" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
+  fi
+}
+
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_cpu_speed_mini_sequence_labeler "$@"
+fi
--- a/.ci/pytorch/perf_test/test_cpu_speed_mnist.sh
+++ b/.ci/pytorch/perf_test/test_cpu_speed_mnist.sh
@ -0,0 +1,45 @@
+#!/bin/bash
+set -e  # stop at the first failing command
+# CPU perf test: times repeated one-epoch runs of the MNIST example.
+. ./common.sh  # get_runtime_of_command and run_test are not defined in this file; presumably they come from here
+
+test_cpu_speed_mnist () {
+  echo "Testing: MNIST, CPU"
+  # Args: $1 = number of timed runs; $2 = optional mode, "compare_with_baseline" or "compare_and_update".
+  export OMP_NUM_THREADS=4
+  export MKL_NUM_THREADS=4
+
+  git clone https://github.com/pytorch/examples.git -b perftests  # checks out the perftests branch
+
+  cd examples/mnist
+
+  conda install -c pytorch torchvision-cpu
+
+  # Download data (a zero-epoch run; presumably main.py fetches the dataset on start-up)
+  python main.py --epochs 0
+
+  SAMPLE_ARRAY=()  # collects one measured sample per run
+  NUM_RUNS=$1
+
+  for (( i=1; i<=NUM_RUNS; i++ )) do
+    runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log)  # captures whatever the helper prints as this run's sample
+    echo "$runtime"
+    SAMPLE_ARRAY+=("${runtime}")
+  done
+
+  cd ../..  # back out of examples/mnist
+  # get_stats.py turns the samples into a JSON object holding their mean and sigma.
+  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
+  echo "Runtime stats in seconds:"
+  echo "$stats"
+  # Any other value of $2 skips the baseline comparison entirely.
+  if [ "$2" == "compare_with_baseline" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"  # ${FUNCNAME[0]} expands to this function's name
+  elif [ "$2" == "compare_and_update" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
+  fi
+}
+# Run the test only when this file is executed directly, not when it is sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_cpu_speed_mnist "$@"
+fi
--- a/.ci/pytorch/perf_test/test_cpu_speed_torch.sh
+++ b/.ci/pytorch/perf_test/test_cpu_speed_torch.sh
@ -0,0 +1,29 @@
+#!/bin/bash
+# CPU perf test: runs perf-tests/modules/test_cpu_torch.py from the cloned perf-tests repository.
+. ./common.sh  # run_test is not defined in this file; presumably it comes from here
+# NOTE(review): unlike the sibling perf tests that time a benchmark, this script does not `set -e` - confirm that is intended.
+test_cpu_speed_torch () {
+  echo "Testing: torch.*, CPU"
+  # Arg: $1 = mode, one of "compare_with_baseline", "compare_and_update" or "update_only".
+  export OMP_NUM_THREADS=4
+  export MKL_NUM_THREADS=4
+
+  git clone https://github.com/yf225/perf-tests.git
+  # Build the option list for the test script; for any other $1, ARGS is not assigned here.
+  if [ "$1" == "compare_with_baseline" ]; then
+    export ARGS=(--compare ../cpu_runtime.json)
+  elif [ "$1" == "compare_and_update" ]; then
+    export ARGS=(--compare ../cpu_runtime.json --update ../new_cpu_runtime.json)
+  elif [ "$1" == "update_only" ]; then
+    export ARGS=(--update ../new_cpu_runtime.json)
+  fi
+  # A non-zero exit status from the test script is reported as a regression and ends the script with status 1.
+  if ! python perf-tests/modules/test_cpu_torch.py "${ARGS[@]}"; then
+    echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
+    exit 1
+  fi
+}
+# Run the test only when this file is executed directly, not when it is sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_cpu_speed_torch "$@"
+fi
--- a/.ci/pytorch/perf_test/test_cpu_speed_torch_tensor.sh
+++ b/.ci/pytorch/perf_test/test_cpu_speed_torch_tensor.sh
@ -0,0 +1,29 @@
+#!/bin/bash
+# CPU perf test: runs perf-tests/modules/test_cpu_torch_tensor.py from the cloned perf-tests repository.
+. ./common.sh  # run_test is not defined in this file; presumably it comes from here
+# NOTE(review): unlike the sibling perf tests that time a benchmark, this script does not `set -e` - confirm that is intended.
+test_cpu_speed_torch_tensor () {
+  echo "Testing: torch.Tensor.*, CPU"
+  # Arg: $1 = mode, one of "compare_with_baseline", "compare_and_update" or "update_only".
+  export OMP_NUM_THREADS=4
+  export MKL_NUM_THREADS=4
+
+  git clone https://github.com/yf225/perf-tests.git
+  # Build the option list for the test script; for any other $1, ARGS is not assigned here.
+  if [ "$1" == "compare_with_baseline" ]; then
+    export ARGS=(--compare ../cpu_runtime.json)
+  elif [ "$1" == "compare_and_update" ]; then
+    export ARGS=(--compare ../cpu_runtime.json --update ../new_cpu_runtime.json)
+  elif [ "$1" == "update_only" ]; then
+    export ARGS=(--update ../new_cpu_runtime.json)
+  fi
+  # A non-zero exit status from the test script is reported as a regression and ends the script with status 1.
+  if ! python perf-tests/modules/test_cpu_torch_tensor.py "${ARGS[@]}"; then
+    echo "To reproduce this regression, run \`cd .ci/pytorch/perf_test/ && bash ${FUNCNAME[0]}.sh\` on your local machine and compare the runtime before/after your code change."
+    exit 1
+  fi
+}
+# Run the test only when this file is executed directly, not when it is sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_cpu_speed_torch_tensor "$@"
+fi
--- a/.ci/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh
+++ b/.ci/pytorch/perf_test/test_gpu_speed_cudnn_lstm.sh
@ -0,0 +1,44 @@
+#!/bin/bash
+set -e  # stop at the first failing command
+# GPU perf test: times repeated runs of cudnn_lstm.py from the benchmark repository.
+. ./common.sh  # get_runtime_of_command and run_test are not defined in this file; presumably they come from here
+
+test_gpu_speed_cudnn_lstm () {
+  echo "Testing: CuDNN LSTM, GPU"
+  # Args: $1 = number of timed runs; $2 = optional mode, "compare_with_baseline" or "compare_and_update".
+  export OMP_NUM_THREADS=4
+  export MKL_NUM_THREADS=4
+
+  git clone https://github.com/pytorch/benchmark.git
+
+  cd benchmark/
+  # Pin the benchmark repository to a fixed commit.
+  git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0
+
+  cd scripts/
+
+  SAMPLE_ARRAY=()  # collects one measured sample per run
+  NUM_RUNS=$1
+
+  for (( i=1; i<=NUM_RUNS; i++ )) do
+    runtime=$(get_runtime_of_command python cudnn_lstm.py --skip-cpu-governor-check)  # captures whatever the helper prints as this run's sample
+    echo "$runtime"
+    SAMPLE_ARRAY+=("${runtime}")
+  done
+
+  cd ../..  # back out of benchmark/scripts
+  # get_stats.py turns the samples into a JSON object holding their mean and sigma.
+  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
+  echo "Runtime stats in seconds:"
+  echo "$stats"
+  # Any other value of $2 skips the baseline comparison entirely.
+  if [ "$2" == "compare_with_baseline" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"  # ${FUNCNAME[0]} expands to this function's name
+  elif [ "$2" == "compare_and_update" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
+  fi
+}
+# Run the test only when this file is executed directly, not when it is sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_gpu_speed_cudnn_lstm "$@"
+fi
--- a/.ci/pytorch/perf_test/test_gpu_speed_lstm.sh
+++ b/.ci/pytorch/perf_test/test_gpu_speed_lstm.sh
@ -0,0 +1,44 @@
+#!/bin/bash
+set -e  # stop at the first failing command
+# GPU perf test: times repeated runs of lstm.py from the benchmark repository.
+. ./common.sh  # get_runtime_of_command and run_test are not defined in this file; presumably they come from here
+
+test_gpu_speed_lstm () {
+  echo "Testing: LSTM, GPU"
+  # Args: $1 = number of timed runs; $2 = optional mode, "compare_with_baseline" or "compare_and_update".
+  export OMP_NUM_THREADS=4
+  export MKL_NUM_THREADS=4
+
+  git clone https://github.com/pytorch/benchmark.git
+
+  cd benchmark/
+  # Pin the benchmark repository to a fixed commit.
+  git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0
+
+  cd scripts/
+
+  SAMPLE_ARRAY=()  # collects one measured sample per run
+  NUM_RUNS=$1
+
+  for (( i=1; i<=NUM_RUNS; i++ )) do
+    runtime=$(get_runtime_of_command python lstm.py --skip-cpu-governor-check)  # captures whatever the helper prints as this run's sample
+    echo "$runtime"
+    SAMPLE_ARRAY+=("${runtime}")
+  done
+
+  cd ../..  # back out of benchmark/scripts
+  # get_stats.py turns the samples into a JSON object holding their mean and sigma.
+  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
+  echo "Runtime stats in seconds:"
+  echo "$stats"
+  # Any other value of $2 skips the baseline comparison entirely.
+  if [ "$2" == "compare_with_baseline" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"  # ${FUNCNAME[0]} expands to this function's name
+  elif [ "$2" == "compare_and_update" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
+  fi
+}
+# Run the test only when this file is executed directly, not when it is sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_gpu_speed_lstm "$@"
+fi
--- a/.ci/pytorch/perf_test/test_gpu_speed_mlstm.sh
+++ b/.ci/pytorch/perf_test/test_gpu_speed_mlstm.sh
@ -0,0 +1,44 @@
+#!/bin/bash
+set -e  # stop at the first failing command
+# GPU perf test: times repeated runs of mlstm.py from the benchmark repository.
+. ./common.sh  # get_runtime_of_command and run_test are not defined in this file; presumably they come from here
+
+test_gpu_speed_mlstm () {
+  echo "Testing: MLSTM, GPU"
+  # Args: $1 = number of timed runs; $2 = optional mode, "compare_with_baseline" or "compare_and_update".
+  export OMP_NUM_THREADS=4
+  export MKL_NUM_THREADS=4
+
+  git clone https://github.com/pytorch/benchmark.git
+
+  cd benchmark/
+  # Pin the benchmark repository to a fixed commit.
+  git checkout 43dfb2c0370e70ef37f249dc09aff9f0ccd2ddb0
+
+  cd scripts/
+
+  SAMPLE_ARRAY=()  # collects one measured sample per run
+  NUM_RUNS=$1
+
+  for (( i=1; i<=NUM_RUNS; i++ )) do
+    runtime=$(get_runtime_of_command python mlstm.py --skip-cpu-governor-check)  # captures whatever the helper prints as this run's sample
+    echo "$runtime"
+    SAMPLE_ARRAY+=("${runtime}")
+  done
+
+  cd ../..  # back out of benchmark/scripts
+  # get_stats.py turns the samples into a JSON object holding their mean and sigma.
+  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
+  echo "Runtime stats in seconds:"
+  echo "$stats"
+  # Any other value of $2 skips the baseline comparison entirely.
+  if [ "$2" == "compare_with_baseline" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"  # ${FUNCNAME[0]} expands to this function's name
+  elif [ "$2" == "compare_and_update" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
+  fi
+}
+# Run the test only when this file is executed directly, not when it is sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_gpu_speed_mlstm "$@"
+fi
--- a/.ci/pytorch/perf_test/test_gpu_speed_mnist.sh
+++ b/.ci/pytorch/perf_test/test_gpu_speed_mnist.sh
@ -0,0 +1,48 @@
+#!/bin/bash
+set -e  # stop at the first failing command
+# GPU perf test: times repeated one-epoch runs of the MNIST example after one untimed warm-up run.
+. ./common.sh  # get_runtime_of_command and run_test are not defined in this file; presumably they come from here
+
+test_gpu_speed_mnist () {
+  echo "Testing: MNIST, GPU"
+  # Args: $1 = number of timed runs; $2 = optional mode, "compare_with_baseline" or "compare_and_update".
+  export OMP_NUM_THREADS=4
+  export MKL_NUM_THREADS=4
+
+  git clone https://github.com/pytorch/examples.git -b perftests  # checks out the perftests branch
+
+  cd examples/mnist
+
+  conda install -c pytorch torchvision
+
+  # Download data (a zero-epoch run; presumably main.py fetches the dataset on start-up)
+  python main.py --epochs 0
+
+  SAMPLE_ARRAY=()  # collects one measured sample per run
+  NUM_RUNS=$1
+
+  # Warm-up run whose runtime is not recorded, so the timed runs below give accurate numbers
+  python main.py --epochs 1 --no-log
+
+  for (( i=1; i<=NUM_RUNS; i++ )) do
+    runtime=$(get_runtime_of_command python main.py --epochs 1 --no-log)  # captures whatever the helper prints as this run's sample
+    echo "$runtime"
+    SAMPLE_ARRAY+=("${runtime}")
+  done
+
+  cd ../..  # back out of examples/mnist
+  # get_stats.py turns the samples into a JSON object holding their mean and sigma.
+  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
+  echo "Runtime stats in seconds:"
+  echo "$stats"
+  # Any other value of $2 skips the baseline comparison entirely.
+  if [ "$2" == "compare_with_baseline" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"  # ${FUNCNAME[0]} expands to this function's name
+  elif [ "$2" == "compare_and_update" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
+  fi
+}
+# Run the test only when this file is executed directly, not when it is sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_gpu_speed_mnist "$@"
+fi
--- a/.ci/pytorch/perf_test/test_gpu_speed_word_language_model.sh
+++ b/.ci/pytorch/perf_test/test_gpu_speed_word_language_model.sh
@ -0,0 +1,53 @@
+#!/bin/bash
+set -e  # stop at the first failing command
+# GPU perf test: times repeated one-epoch runs of the word_language_model example on a truncated Wikitext-2.
+. ./common.sh  # get_runtime_of_command and run_test are not defined in this file; presumably they come from here
+
+test_gpu_speed_word_language_model () {
+  echo "Testing: word language model on Wikitext-2, GPU"
+  # Args: $1 = number of timed runs; $2 = optional mode, "compare_with_baseline" or "compare_and_update".
+  export OMP_NUM_THREADS=4
+  export MKL_NUM_THREADS=4
+
+  git clone https://github.com/pytorch/examples.git -b perftests  # checks out the perftests branch
+
+  cd examples/word_language_model
+
+  cd data/wikitext-2
+
+  # Reduce dataset size, so that we can have more runs per test:
+  sed -n '1,200p' test.txt > test_tmp.txt  # keep only the first 200 lines
+  sed -n '1,1000p' train.txt > train_tmp.txt  # keep only the first 1000 lines
+  sed -n '1,200p' valid.txt > valid_tmp.txt  # keep only the first 200 lines
+  # Replace the original files with the truncated copies.
+  mv test_tmp.txt test.txt
+  mv train_tmp.txt train.txt
+  mv valid_tmp.txt valid.txt
+
+  cd ../..  # back to examples/word_language_model
+
+  SAMPLE_ARRAY=()  # collects one measured sample per run
+  NUM_RUNS=$1
+
+  for (( i=1; i<=NUM_RUNS; i++ )) do
+    runtime=$(get_runtime_of_command python main.py --cuda --epochs 1)  # captures whatever the helper prints as this run's sample
+    echo "$runtime"
+    SAMPLE_ARRAY+=("${runtime}")
+  done
+
+  cd ../..  # back out of examples/word_language_model
+  # get_stats.py turns the samples into a JSON object holding their mean and sigma.
+  stats=$(python ../get_stats.py "${SAMPLE_ARRAY[@]}")
+  echo "Runtime stats in seconds:"
+  echo "$stats"
+  # Any other value of $2 skips the baseline comparison entirely.
+  if [ "$2" == "compare_with_baseline" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}"  # ${FUNCNAME[0]} expands to this function's name
+  elif [ "$2" == "compare_and_update" ]; then
+    python ../compare_with_baseline.py --test-name "${FUNCNAME[0]}" --sample-stats "${stats}" --update
+  fi
+}
+# Run the test only when this file is executed directly, not when it is sourced.
+if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
+  run_test test_gpu_speed_word_language_model "$@"
+fi
--- a/.ci/pytorch/perf_test/update_commit_hash.py
+++ b/.ci/pytorch/perf_test/update_commit_hash.py
@ -0,0 +1,14 @@
+import json
+import sys
+
+# Stores a commit hash under the "commit" key of a JSON file, rewriting that file in place.
+data_file_path = sys.argv[1]  # first argument: path of the JSON file to update
+commit_hash = sys.argv[2]  # second argument: value written under "commit"
+
+with open(data_file_path) as data_file:
+    data = json.load(data_file)
+
+data["commit"] = commit_hash
+# Opening with "w" truncates the file before the updated object is written back.
+with open(data_file_path, "w") as data_file:
+    json.dump(data, data_file)
--- a/.ci/pytorch/python_doc_push_script.sh
+++ b/.ci/pytorch/python_doc_push_script.sh
@ -119,6 +119,12 @@ popd
 git rm -rf "$install_path" || true
 mv "$pt_checkout/docs/build/html" "$install_path"

+# Prevent Google from indexing $install_path/_modules. This folder contains
+# generated source files.
+# NB: the following only works on gnu sed. The sed shipped with mac os is different.
+# One can `brew install gnu-sed` on a mac and then use "gsed" instead of "sed".
+find "$install_path/_modules" -name "*.html" -print0 | xargs -0 sed -i '/<head>/a \ \ <meta name="robots" content="noindex">'
+
 git add "$install_path" || true
 git status
 git config user.email "soumith+bot@pytorch.org"
--- a/Show More
+++ b/Show More
 @ -1 +1 @@
 .3.1
 .2.0