[Dynamo] Support for proxying frozen dataclasses

ghstack-source-id: fb6556cd2f9424fe223147471fe95126441954d9 Pull Request resolved: https://github.com/pytorch/pytorch/pull/134846
2025-10-29 19:24:55 +08:00 · 2024-09-01 13:30:12 -07:00
6617 changed files with 194186 additions and 430138 deletions
--- a/.bazelversion
+++ b/.bazelversion
@ -1 +1 @@
-6.5.0
+6.1.1
--- a/.buckconfig.oss
+++ b/.buckconfig.oss
@ -0,0 +1,26 @@
 [pt]
  is_oss=1
 [buildfile]
  name = BUCK.oss
  includes = //tools/build_defs/select.bzl
 [repositories]
  bazel_skylib = third_party/bazel-skylib/
  ovr_config = .
 [download]
  in_build = true
 [cxx]
  cxxflags = -std=c++17
  ldflags = -Wl,--no-undefined
  should_remap_host_platform = true
  cpp = /usr/bin/clang
  cc = /usr/bin/clang
  cxx = /usr/bin/clang++
  cxxpp = /usr/bin/clang++
  ld = /usr/bin/clang++
 [project]
  default_flavors_mode=all
--- a/.ci/aarch64_linux/README.md
+++ b/.ci/aarch64_linux/README.md
@ -1,19 +0,0 @@
 # Aarch64 (ARM/Graviton) Support Scripts
 Scripts for building aarch64 PyTorch PIP Wheels. These scripts build the following wheels:
 * torch
 * torchvision
 * torchaudio
 * torchtext
 * torchdata
 ## Aarch64_ci_build.sh
 This script is designed to support CD operations within the PyPI manylinux aarch64 container, and to be executed in that container. It prepares the container and then executes __aarch64_wheel_ci_build.py__ to build the wheels. The script "assumes" the PyTorch repo is located at: ```/pytorch``` and will put the wheels into ```/artifacts```.
 ### Usage
 ```DESIRED_PYTHON=<PythonVersion> aarch64_ci_build.sh```
 __NOTE:__ CI build is currently __EXPERIMENTAL__
 ## Build_aarch64_wheel.py
 This app allows a person to build using AWS EC2 resources and requires AWS-CLI and Boto3 with AWS credentials to support building EC2 instances for the wheel builds. Can be used in a codebuild CD or from a local system.
 ### Usage
 ```build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>```
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -1,32 +0,0 @@
 #!/bin/bash
 # Builds the aarch64 PyTorch wheel: sources aarch64_ci_setup.sh to prepare the
 # Python toolchain, then runs aarch64_wheel_ci_build.py.
 # Environment inputs:
 #   GPU_ARCH_VERSION  optional; selects TORCH_CUDA_ARCH_LIST for 12.6 / 12.8.
 #   DESIRED_CUDA      required (the script runs under `set -u`); the value
 #                     "cpu" selects the CPU-only build.
 set -eux -o pipefail
 GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
 if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
    export TORCH_CUDA_ARCH_LIST="9.0"
 elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
    export TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0"
 fi
 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
 # Quoted so that a checkout path containing whitespace is not word-split.
 source "$SCRIPTPATH/aarch64_ci_setup.sh"
 ###############################################################################
 # Run aarch64 builder python
 ###############################################################################
 cd /
 # adding safe directory for git as the permissions will be
 # on the mounted pytorch repo
 git config --global --add safe.directory /pytorch
 pip install -r /pytorch/requirements.txt
 pip install auditwheel==6.2.0
 if [ "$DESIRED_CUDA" = "cpu" ]; then
    # The messages name DESIRED_CUDA because that is the variable tested here.
    echo "DESIRED_CUDA is set to cpu. Building cpu wheel."
    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
 else
    echo "DESIRED_CUDA is set to: $DESIRED_CUDA. Building cuda wheel."
    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
 fi
--- a/.ci/aarch64_linux/aarch64_ci_setup.sh
+++ b/.ci/aarch64_linux/aarch64_ci_setup.sh
@ -1,21 +0,0 @@
 #!/bin/bash
 set -eux -o pipefail
 # This script prepares the Docker container for the aarch64_wheel_ci_build.py
 # python script by installing the build tooling with pip and symlinking the
 # selected Python's tools into /usr/local/bin/.
 # Reads DESIRED_PYTHON from the environment (required under `set -u`).

 # numpy pin; Python 3.13 / 3.13t get a newer numpy than the default.
 NUMPY_VERSION=2.0.2
 if [[ "$DESIRED_PYTHON"  == "3.13" || "$DESIRED_PYTHON" == "3.13t" ]]; then
    NUMPY_VERSION=2.1.2
 fi
 # Absolute directory of this script, used to locate the sibling manywheel helper.
 SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
 # NOTE(review): presumably this helper exports DESIRED_PYTHON_BIN_DIR, which is
 # used below but not set anywhere in this script - confirm.
 source $SCRIPTPATH/../manywheel/set_desired_python.sh
 pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2
 # Expose the selected interpreter and the tools installed above on the default PATH.
 for tool in python python3 pip pip3 ninja scons patchelf; do
    ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
 done
 # Print the interpreter version that `python` now resolves to.
 python --version
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -1,256 +0,0 @@
 #!/usr/bin/env python3
 # encoding: UTF-8
 import os
 import shutil
 from subprocess import check_call, check_output
 def list_dir(path: str) -> list[str]:
    """
    Return the entries of ``path`` as printed by ``ls -1``.

    The raw output is split on newlines, so the returned list ends with an
    empty string (from the trailing newline that ``ls`` prints).
    """
    raw_listing = check_output(["ls", "-1", path])
    text = raw_listing.decode()
    return text.split("\n")
 def build_ArmComputeLibrary() -> None:
    """
    Clone and build Arm Compute Library (ACL) for the aarch64 PyTorch build.

    Creates the install directory, shallow-clones the pinned ACL tag into
    ``ComputeLibrary`` (relative to the current working directory), builds it
    with scons into ``<install dir>/build`` and finally copies the header and
    source directories into the install directory.
    """
    print("Building Arm Compute Library")
    acl_install_dir = "/acl"
    acl_checkout_dir = "ComputeLibrary"
    scons_options = [
        "debug=0",
        "neon=1",
        "opencl=0",
        "os=linux",
        "openmp=1",
        "cppthreads=0",
        "arch=armv8a",
        "multi_isa=1",
        "fixed_format_kernels=1",
        "build=native",
    ]
    clone_cmd = [
        "git",
        "clone",
        "https://github.com/ARM-software/ComputeLibrary.git",
        "-b",
        "v25.02",
        "--depth",
        "1",
        "--shallow-submodules",
    ]
    build_cmd = [
        "scons",
        "Werror=1",
        "-j8",
        f"build_dir=/{acl_install_dir}/build",
        *scons_options,
    ]
    # Fails if the install directory already exists (no exist_ok).
    os.makedirs(acl_install_dir)
    check_call(clone_cmd)
    check_call(build_cmd, cwd=acl_checkout_dir)
    for subdir in ("arm_compute", "include", "utils", "support", "src"):
        shutil.copytree(
            f"{acl_checkout_dir}/{subdir}", f"{acl_install_dir}/{subdir}"
        )
 def replace_tag(filename) -> None:
    """
    Rewrite the first ``Tag:`` line of a WHEEL metadata file in place.

    Only the first line starting with ``Tag:`` is touched: its ``-linux_``
    platform part becomes ``-manylinux_2_28_``. All other lines are written
    back unchanged.
    """
    with open(filename) as src:
        content = src.readlines()
    tag_index = next(
        (idx for idx, text in enumerate(content) if text.startswith("Tag:")),
        None,
    )
    if tag_index is not None:
        old_line = content[tag_index]
        content[tag_index] = old_line.replace("-linux_", "-manylinux_2_28_")
        print(f"Updated tag from {old_line} to {content[tag_index]}")
    with open(filename, "w") as dst:
        dst.writelines(content)
 def package_cuda_wheel(wheel_path, desired_cuda) -> None:
    """
    Repackage a built wheel so that it bundles the CUDA runtime libraries.

    The wheel is unzipped next to itself, the shared libraries listed below
    are copied into ``torch/lib`` with their rpath set to ``$ORIGIN``, the
    WHEEL metadata tag is rewritten by ``replace_tag``, and the tree is zipped
    back over the original wheel file.

    Args:
        wheel_path: Path of the wheel to repackage; it is replaced in place.
        desired_cuda: CUDA flavour string (e.g. the DESIRED_CUDA environment
            value). When it contains ``"128"`` the CUDA 12.8-only libraries
            are bundled as well. ``None`` is treated as "no extras".

    Raises:
        subprocess.CalledProcessError: if unzip, patchelf or zip exits with a
            non-zero status (previously such failures were silently ignored
            and a broken wheel could be produced).
        OSError: if a scratch directory already exists or a library to copy
            is missing.
    """
    folder = os.path.dirname(wheel_path)
    wheelname = os.path.basename(wheel_path)
    # os.path.join keeps the scratch directories beside the wheel even when
    # wheel_path has no directory part (f"{folder}/tmp" would become "/tmp").
    unpack_dir = os.path.join(folder, "tmp")
    staging_dir = os.path.join(folder, "cuda_wheel")
    lib_dir = os.path.join(unpack_dir, "torch", "lib")
    os.mkdir(unpack_dir)
    # Argument lists (no shell) so paths with spaces or metacharacters are safe.
    check_call(["unzip", wheel_path, "-d", unpack_dir])
    libs_to_copy = [
        "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
        "/usr/local/cuda/lib64/libcudnn.so.9",
        "/usr/local/cuda/lib64/libcublas.so.12",
        "/usr/local/cuda/lib64/libcublasLt.so.12",
        "/usr/local/cuda/lib64/libcudart.so.12",
        "/usr/local/cuda/lib64/libcufft.so.11",
        "/usr/local/cuda/lib64/libcusparse.so.12",
        "/usr/local/cuda/lib64/libcusparseLt.so.0",
        "/usr/local/cuda/lib64/libcusolver.so.11",
        "/usr/local/cuda/lib64/libcurand.so.10",
        "/usr/local/cuda/lib64/libnvToolsExt.so.1",
        "/usr/local/cuda/lib64/libnvJitLink.so.12",
        "/usr/local/cuda/lib64/libnvrtc.so.12",
        "/usr/local/cuda/lib64/libcudnn_adv.so.9",
        "/usr/local/cuda/lib64/libcudnn_cnn.so.9",
        "/usr/local/cuda/lib64/libcudnn_graph.so.9",
        "/usr/local/cuda/lib64/libcudnn_ops.so.9",
        "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
        "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
        "/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
        "/lib64/libgomp.so.1",
        "/usr/lib64/libgfortran.so.5",
        "/acl/build/libarm_compute.so",
        "/acl/build/libarm_compute_graph.so",
        "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
        "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
        "/usr/local/lib/libnvpl_lapack_core.so.0",
        "/usr/local/lib/libnvpl_blas_core.so.0",
    ]
    if desired_cuda and "128" in desired_cuda:
        libs_to_copy += [
            "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
            "/usr/local/cuda/lib64/libcufile.so.0",
            "/usr/local/cuda/lib64/libcufile_rdma.so.1",
        ]
    # Copy each library into the unpacked torch/lib and make it resolve its
    # own dependencies from that same directory.
    for lib_path in libs_to_copy:
        bundled_lib = os.path.join(lib_dir, os.path.basename(lib_path))
        shutil.copy2(lib_path, bundled_lib)
        check_call(
            ["patchelf", "--set-rpath", "$ORIGIN", "--force-rpath", bundled_lib]
        )
    # Make sure the wheel is tagged with manylinux_2_28
    for f in os.scandir(unpack_dir):
        if f.is_dir() and f.name.endswith(".dist-info"):
            replace_tag(f"{f.path}/WHEEL")
            break
    os.mkdir(staging_dir)
    # Absolute because zip runs with cwd=unpack_dir.
    staged_wheel = os.path.abspath(os.path.join(staging_dir, wheelname))
    # Same selection a shell "*" would make: non-hidden top-level entries.
    top_level = sorted(
        entry for entry in os.listdir(unpack_dir) if not entry.startswith(".")
    )
    check_call(["zip", "-r", staged_wheel, *top_level], cwd=unpack_dir)
    shutil.move(
        staged_wheel,
        os.path.join(folder, wheelname),
        copy_function=shutil.copy2,
    )
    # Best-effort cleanup, matching the previous `rm -rf`.
    shutil.rmtree(unpack_dir, ignore_errors=True)
    shutil.rmtree(staging_dir, ignore_errors=True)
 def complete_wheel(folder: str) -> str:
    """
    Finalize the built wheel and copy it to ``/artifacts``.

    Picks the first entry of ``/<folder>/dist``. For a non-CUDA build of a
    folder whose name contains "pytorch" the wheel is repaired with
    auditwheel; otherwise the file is only renamed to carry the
    ``manylinux_2_28_aarch64`` platform tag. Reads the module-level
    ``enable_cuda`` flag set by the script entry point.

    Returns:
        The file name of the finalized wheel.
    """
    dist_dir = f"/{folder}/dist"
    wheel_name = list_dir(dist_dir)[0]
    # CUDA wheels are not run through auditwheel: their dependencies are
    # bundled by package_cuda_wheel(). The file name still has to reflect the
    # correct manylinux platform.
    if "pytorch" not in folder or enable_cuda:
        repaired_wheel_name = wheel_name.replace(
            "linux_aarch64", "manylinux_2_28_aarch64"
        )
        print(f"Renaming {wheel_name} wheel to {repaired_wheel_name}")
        os.rename(f"{dist_dir}/{wheel_name}", f"{dist_dir}/{repaired_wheel_name}")
    else:
        print("Repairing Wheel with AuditWheel")
        check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
        wheelhouse_dir = f"/{folder}/wheelhouse"
        repaired_wheel_name = list_dir(wheelhouse_dir)[0]
        print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist")
        os.rename(
            f"{wheelhouse_dir}/{repaired_wheel_name}",
            f"{dist_dir}/{repaired_wheel_name}",
        )
    final_wheel_path = f"{dist_dir}/{repaired_wheel_name}"
    print(f"Copying {repaired_wheel_name} to artifacts")
    shutil.copy2(final_wheel_path, f"/artifacts/{repaired_wheel_name}")
    return repaired_wheel_name
 def parse_arguments():
    """
    Parse the command-line options of the aarch64 wheel builder.

    Returns:
        The ``argparse.Namespace`` with ``debug``, ``build_only``,
        ``test_only``, ``enable_mkldnn`` and ``enable_cuda``.
    """
    from argparse import ArgumentParser

    cli = ArgumentParser("AARCH64 wheels python CD")
    for switch in ("--debug", "--build-only"):
        cli.add_argument(switch, action="store_true")
    cli.add_argument("--test-only", type=str)
    for switch in ("--enable-mkldnn", "--enable-cuda"):
        cli.add_argument(switch, action="store_true")
    return cli.parse_args()
 if __name__ == "__main__":
    # Entry point: build the PyTorch aarch64 wheel from the /pytorch checkout,
    # optionally with the mkldnn+ACL backend and CUDA, then finalize it into
    # /artifacts. enable_cuda is read as a module global by complete_wheel().
    args = parse_arguments()
    enable_mkldnn = args.enable_mkldnn
    enable_cuda = args.enable_cuda
    # strip() drops the trailing newline printed by git; without it the
    # comparison against "nightly"/"main" below can never match.
    branch = (
        check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"], cwd="/pytorch")
        .decode()
        .strip()
    )

    print("Building PyTorch wheel")
    build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
    os.system("cd /pytorch; python setup.py clean")

    override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
    desired_cuda = os.getenv("DESIRED_CUDA")
    if override_package_version is not None:
        # An explicit version from the environment wins over branch detection.
        version = override_package_version
        build_vars += (
            f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
        )
    elif branch in ["nightly", "main"]:
        # Dev builds: <version>.dev<YYYYMMDD>[+<cuda>] from the last commit date.
        build_date = (
            check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
            .decode()
            .replace("-", "")
        )
        # Drops the last two characters of version.txt (presumably the "a0"
        # pre-release suffix - confirm against version.txt).
        version = (
            check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
        )
        if enable_cuda:
            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 "
        else:
            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
    elif branch.startswith(("v1.", "v2.")):
        # Release branches/tags: "v2.1.0-rc3" -> "2.1.0", "v2.1.0" -> "2.1.0".
        release_version = branch[1:].partition("-")[0]
        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={release_version} PYTORCH_BUILD_NUMBER=1 "

    if enable_mkldnn:
        build_ArmComputeLibrary()
        print("build pytorch with mkldnn+acl backend")
        build_vars += (
            "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
            "ACL_ROOT_DIR=/acl "
            "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH "
            "ACL_INCLUDE_DIR=/acl/build "
            "ACL_LIBRARY=/acl/build "
        )
        if enable_cuda:
            build_vars += "BLAS=NVPL "
        else:
            build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS "
    else:
        print("build pytorch without mkldnn backend")

    # build_vars is a shell prefix of VAR=value assignments (it relies on the
    # shell expanding $LD_LIBRARY_PATH), hence os.system rather than an argv list.
    os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel")
    if enable_cuda:
        print("Updating Cuda Dependency")
        filename = os.listdir("/pytorch/dist/")
        wheel_path = f"/pytorch/dist/{filename[0]}"
        package_cuda_wheel(wheel_path, desired_cuda)
    pytorch_wheel_name = complete_wheel("/pytorch/")
    print(f"Build Complete. Created {pytorch_wheel_name}..")
--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
--- a/.ci/aarch64_linux/embed_library.py
+++ b/.ci/aarch64_linux/embed_library.py
@ -1,87 +0,0 @@
 #!/usr/bin/env python3
 import os
 import shutil
 import sys
 from subprocess import check_call
 from tempfile import TemporaryDirectory
 from auditwheel.elfutils import elf_file_filter
 from auditwheel.lddtree import lddtree
 from auditwheel.patcher import Patchelf
 from auditwheel.repair import copylib
 from auditwheel.wheeltools import InWheelCtx
 def replace_tag(filename):
    """
    Rewrite every ``Tag: `` line of a WHEEL metadata file in place.

    Each line starting with ``Tag: `` has its ``-linux_`` platform part
    replaced by ``-manylinux2014_``; all other lines are written back
    unchanged.
    """
    with open(filename) as f:
        # Split on a real newline. The previous "\\n" was the two-character
        # sequence backslash + n, so the file was one "line" and the Tag
        # entries were never found.
        lines = f.read().split("\n")
    for i, line in enumerate(lines):
        if not line.startswith("Tag: "):
            continue
        lines[i] = line.replace("-linux_", "-manylinux2014_")
        print(f"Updated tag from {line} to {lines[i]}")
    with open(filename, "w") as f:
        f.write("\n".join(lines))
 class AlignedPatchelf(Patchelf):
    """Patchelf variant whose commands always pass ``--page-size 65536``."""

    # Command prefix shared by every patchelf invocation of this class.
    _ALIGNED_PATCHELF_CMD = ("patchelf", "--page-size", "65536")

    def set_soname(self, file_name: str, new_soname: str) -> None:
        """Run ``patchelf --set-soname new_soname`` on ``file_name``."""
        command = [*self._ALIGNED_PATCHELF_CMD, "--set-soname", new_soname, file_name]
        check_call(command)

    def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
        """Run ``patchelf --replace-needed soname new_soname`` on ``file_name``."""
        command = [
            *self._ALIGNED_PATCHELF_CMD,
            "--replace-needed",
            soname,
            new_soname,
            file_name,
        ]
        check_call(command)
 def embed_library(whl_path, lib_soname, update_tag=False):
    """
    Embed the shared library ``lib_soname`` into the wheel at ``whl_path``.

    For every ELF file under ``torch/lib`` that needs ``lib_soname`` from a
    location outside the wheel, the library is copied into ``torch/lib`` (once)
    and the file's needed entry is pointed at the copied library's soname.
    The rewritten wheel replaces ``whl_path``.

    Args:
        whl_path: Path of the wheel to modify; overwritten with the result.
        lib_soname: Soname of the dependency to embed (e.g. "libgomp.so.1").
        update_tag: When true, also run ``replace_tag`` on files named WHEEL.
    """
    patcher = AlignedPatchelf()
    # Kept as an object (not a `with`) so the directory holding the output
    # wheel is still referenced when shutil.move runs at the end.
    out_dir = TemporaryDirectory()
    whl_name = os.path.basename(whl_path)
    tmp_whl_name = os.path.join(out_dir.name, whl_name)
    # NOTE(review): presumably InWheelCtx unpacks the wheel on enter and writes
    # ctx.out_wheel on exit - confirm against the auditwheel version in use.
    with InWheelCtx(whl_path) as ctx:
        # Relies on auditwheel's private _tmpdir attribute for the unpack location.
        torchlib_path = os.path.join(ctx._tmpdir.name, "torch", "lib")
        ctx.out_wheel = tmp_whl_name
        new_lib_path, new_lib_soname = None, None
        for filename, _ in elf_file_filter(ctx.iter_files()):
            # Only libraries shipped in torch/lib are candidates for patching.
            if not filename.startswith("torch/lib"):
                continue
            libtree = lddtree(filename)
            if lib_soname not in libtree["needed"]:
                continue
            lib_path = libtree["libs"][lib_soname]["path"]
            if lib_path is None:
                # The dependency cannot be located: stop scanning further files.
                print(f"Can't embed {lib_soname} as it could not be found")
                break
            # Already resolved from inside the wheel's torch/lib; nothing to do.
            if lib_path.startswith(torchlib_path):
                continue
            # Copy the library into the wheel only the first time it is needed.
            if new_lib_path is None:
                new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
            patcher.replace_needed(filename, lib_soname, new_lib_soname)
            print(f"Replacing {lib_soname} with {new_lib_soname} for {filename}")
        if update_tag:
            # Add manylinux2014 tag
            for filename in ctx.iter_files():
                if os.path.basename(filename) != "WHEEL":
                    continue
                replace_tag(filename)
    # Overwrite the input wheel with the wheel produced in the temp directory.
    shutil.move(tmp_whl_name, whl_path)
 if __name__ == "__main__":
    # Usage: embed_library.py <wheel> [--update-tag]
    # Embeds libgomp.so.1 into the given wheel; the optional second argument
    # additionally rewrites the wheel's platform tag.
    wants_tag_update = len(sys.argv) > 2 and sys.argv[2] == "--update-tag"
    embed_library(sys.argv[1], "libgomp.so.1", wants_tag_update)
--- a/.ci/docker/android/AndroidManifest.xml
+++ b/.ci/docker/android/AndroidManifest.xml
@ -0,0 +1 @@
 <manifest package="org.pytorch.deps" />
--- a/.ci/docker/android/build.gradle
+++ b/.ci/docker/android/build.gradle
@ -0,0 +1,66 @@
 // Build configuration for the Android dependency library
 // (manifest: AndroidManifest.xml in this directory).
 buildscript {
    // Versions shared through rootProject.* by the android and dependencies
    // blocks of this file.
    ext {
        minSdkVersion = 21
        targetSdkVersion = 28
        compileSdkVersion = 28
        buildToolsVersion = '28.0.3'
        coreVersion = "1.2.0"
        extJUnitVersion = "1.1.1"
        runnerVersion = "1.2.0"
        rulesVersion = "1.2.0"
        junitVersion = "4.12"
    }
    // Repositories used to resolve the build plugins on the classpath below.
    repositories {
        google()
        mavenLocal()
        mavenCentral()
        jcenter()
    }
    dependencies {
        classpath 'com.android.tools.build:gradle:4.1.2'
        classpath 'com.vanniktech:gradle-maven-publish-plugin:0.14.2'
    }
 }
 // Repositories used to resolve the library dependencies declared in this file.
 repositories {
    google()
    jcenter()
 }
 apply plugin: 'com.android.library'
 android {
    compileSdkVersion rootProject.compileSdkVersion
    buildToolsVersion rootProject.buildToolsVersion
    defaultConfig {
        // NOTE(review): unlike the two lines above these are not qualified with
        // rootProject; presumably they resolve to the ext values - confirm.
        minSdkVersion minSdkVersion
        targetSdkVersion targetSdkVersion
    }
    sourceSets {
        main {
            manifest.srcFile 'AndroidManifest.xml'
        }
    }
 }
 // Library dependencies. Test-library versions come from the ext block of
 // buildscript; each artifact is declared once (the junit and
 // androidx.test:core entries were previously listed twice).
 dependencies {
    implementation 'com.android.support:appcompat-v7:28.0.0'
    implementation 'androidx.appcompat:appcompat:1.0.0'
    implementation 'com.facebook.fbjni:fbjni-java-only:0.2.2'
    implementation 'com.google.code.findbugs:jsr305:3.0.1'
    implementation 'com.facebook.soloader:nativeloader:0.10.5'
    implementation 'junit:junit:' + rootProject.junitVersion
    implementation 'androidx.test:core:' + rootProject.coreVersion
    implementation 'androidx.test.ext:junit:' + rootProject.extJUnitVersion
    implementation 'androidx.test:rules:' + rootProject.rulesVersion
    implementation 'androidx.test:runner:' + rootProject.runnerVersion
 }
--- a/.ci/docker/aotriton_version.txt
+++ b/.ci/docker/aotriton_version.txt
@ -0,0 +1,5 @@
 0.6b
 manylinux_2_17
 rocm6.2
 7f07e8a1cb1f99627eb6d77f5c0e9295c775f3c7
 e4ab195d2bd19e939c675a13280c29714c6ef9f2cf420690da150fa0cac043b1
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -1,8 +1,4 @@
 #!/bin/bash
 # The purpose of this script is to:
 # 1. Extract the set of parameters to be used for a docker build based on the provided image name.
 # 2. Run docker build with the parameters found in step 1.
 # 3. Run the built image and print out the expected and actual versions of packages installed.
 set -ex
@ -90,20 +86,30 @@ CMAKE_VERSION=3.18.5
 _UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
 _UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
 if [[ "$image" == *rocm* ]]; then
  _UCX_COMMIT=cc312eaa4655c0cc5c2bcd796db938f90563bcf6
  _UCC_COMMIT=0c0fc21559835044ab107199e334f7157d6a0d3d
 fi
 # It's annoying to rename jobs every time you want to rewrite a
 # configuration, so we hardcode everything here rather than do it
 # from scratch
 case "$image" in
-  pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc11)
+  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
-    CUDA_VERSION=12.6.3
+    CUDA_VERSION=12.4.1
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
-    GCC_VERSION=11
+    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
  pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
    CUDA_VERSION=12.1.1
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
@ -128,6 +134,36 @@ case "$image" in
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
  pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks)
    CUDA_VERSION=12.1.1
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
  pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks)
    CUDA_VERSION=12.1.1
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
  pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks)
    CUDA_VERSION=12.4.1
    CUDNN_VERSION=9
@ -143,80 +179,6 @@ case "$image" in
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
  pytorch-linux-focal-cuda12.4-cudnn9-py3.13-gcc9-inductor-benchmarks)
    CUDA_VERSION=12.4.1
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.13
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
  pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9)
    CUDA_VERSION=12.6.3
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
  pytorch-linux-focal-cuda12.6-cudnn9-py3-gcc9-inductor-benchmarks)
    CUDA_VERSION=12.6.3
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
  pytorch-linux-focal-cuda12.6-cudnn9-py3.12-gcc9-inductor-benchmarks)
    CUDA_VERSION=12.6.3
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
  pytorch-linux-focal-cuda12.6-cudnn9-py3.13-gcc9-inductor-benchmarks)
    CUDA_VERSION=12.6.3
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.13
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    CONDA_CMAKE=yes
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
  pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
    CUDA_VERSION=11.8.0
    CUDNN_VERSION=9
@ -231,8 +193,50 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
    CUDA_VERSION=12.4.1
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
  pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
    CUDA_VERSION=12.1.1
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
    CUDA_VERSION=12.4.1
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
  pytorch-linux-focal-py3-clang10-onnx)
-    ANACONDA_PYTHON_VERSION=3.9
+    ANACONDA_PYTHON_VERSION=3.8
    CLANG_VERSION=10
    PROTOBUF=yes
    DB=yes
@ -240,8 +244,18 @@ case "$image" in
    CONDA_CMAKE=yes
    ONNX=yes
    ;;
-  pytorch-linux-focal-py3.9-clang10)
+  pytorch-linux-focal-py3-clang9-android-ndk-r21e)
-    ANACONDA_PYTHON_VERSION=3.9
+    ANACONDA_PYTHON_VERSION=3.8
    CLANG_VERSION=9
    LLVMDEV=yes
    PROTOBUF=yes
    ANDROID=yes
    ANDROID_NDK_VERSION=r21e
    GRADLE_VERSION=6.8.3
    NINJA_VERSION=1.9.0
    ;;
  pytorch-linux-focal-py3.8-clang10)
    ANACONDA_PYTHON_VERSION=3.8
    CLANG_VERSION=10
    PROTOBUF=yes
    DB=yes
@ -262,8 +276,8 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-focal-py3.9-gcc9)
+  pytorch-linux-focal-py3.8-gcc9)
-    ANACONDA_PYTHON_VERSION=3.9
+    ANACONDA_PYTHON_VERSION=3.8
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
@ -272,34 +286,26 @@ case "$image" in
    TRITON=yes
    ;;
  pytorch-linux-focal-rocm-n-1-py3)
-    ANACONDA_PYTHON_VERSION=3.10
+    ANACONDA_PYTHON_VERSION=3.8
-    GCC_VERSION=11
+    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
-    ROCM_VERSION=6.2.4
+    ROCM_VERSION=6.0
    NINJA_VERSION=1.9.0
    CONDA_CMAKE=yes
    TRITON=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    INDUCTOR_BENCHMARKS=yes
    ;;
  pytorch-linux-focal-rocm-n-py3)
-    ANACONDA_PYTHON_VERSION=3.10
+    ANACONDA_PYTHON_VERSION=3.8
-    GCC_VERSION=11
+    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
    VISION=yes
-    ROCM_VERSION=6.3
+    ROCM_VERSION=6.1
    NINJA_VERSION=1.9.0
    CONDA_CMAKE=yes
    TRITON=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
    UCC_COMMIT=${_UCC_COMMIT}
    INDUCTOR_BENCHMARKS=yes
    ;;
  pytorch-linux-jammy-xpu-2024.0-py3)
    ANACONDA_PYTHON_VERSION=3.9
@ -312,19 +318,8 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-xpu-2025.0-py3)
+    pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks)
-    ANACONDA_PYTHON_VERSION=3.9
+    ANACONDA_PYTHON_VERSION=3.8
    GCC_VERSION=11
    PROTOBUF=yes
    DB=yes
    VISION=yes
    XPU_VERSION=2025.0
    NINJA_VERSION=1.9.0
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
    pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    PROTOBUF=yes
    DB=yes
@ -335,8 +330,8 @@ case "$image" in
    DOCS=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-clang12)
+  pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12)
-    ANACONDA_PYTHON_VERSION=3.9
+    ANACONDA_PYTHON_VERSION=3.8
    CUDA_VERSION=11.8
    CUDNN_VERSION=9
    CLANG_VERSION=12
@ -360,14 +355,8 @@ case "$image" in
    CONDA_CMAKE=yes
    VISION=yes
    ;;
-  pytorch-linux-jammy-py3-clang18-asan)
+  pytorch-linux-jammy-py3.8-gcc11)
-    ANACONDA_PYTHON_VERSION=3.10
+    ANACONDA_PYTHON_VERSION=3.8
    CLANG_VERSION=18
    CONDA_CMAKE=yes
    VISION=yes
    ;;
  pytorch-linux-jammy-py3.9-gcc11)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    PROTOBUF=yes
    DB=yes
@ -385,19 +374,11 @@ case "$image" in
    EXECUTORCH=yes
    ;;
  pytorch-linux-jammy-py3.12-halide)
-    CUDA_VERSION=12.6
+    CUDA_VERSION=12.4
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=11
    CONDA_CMAKE=yes
    HALIDE=yes
    TRITON=yes
    ;;
  pytorch-linux-jammy-py3.12-triton-cpu)
    CUDA_VERSION=12.6
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=11
    CONDA_CMAKE=yes
    TRITON_CPU=yes
    ;;
  pytorch-linux-focal-linter)
    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
@ -419,6 +400,9 @@ case "$image" in
    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    # snadampal: skipping sccache due to the following issue
    # https://github.com/pytorch/pytorch/issues/121559
    SKIP_SCCACHE_INSTALL=yes
    # snadampal: skipping llvm src build install because the current version
    # from pytorch/llvm:9.0.1 is x86 specific
    SKIP_LLVM_SRC_BUILD_INSTALL=yes
@ -431,6 +415,9 @@ case "$image" in
    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    # snadampal: skipping sccache due to the following issue
    # https://github.com/pytorch/pytorch/issues/121559
    SKIP_SCCACHE_INSTALL=yes
    # snadampal: skipping llvm src build install because the current version
    # from pytorch/llvm:9.0.1 is x86 specific
    SKIP_LLVM_SRC_BUILD_INSTALL=yes
@ -507,6 +494,8 @@ docker build \
       --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
       --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
       --build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \
       --build-arg "ANDROID=${ANDROID}" \
       --build-arg "ANDROID_NDK=${ANDROID_NDK_VERSION}" \
       --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
       --build-arg "VULKAN_SDK_VERSION=${VULKAN_SDK_VERSION}" \
       --build-arg "SWIFTSHADER=${SWIFTSHADER}" \
@ -514,13 +503,12 @@ docker build \
       --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
       --build-arg "KATEX=${KATEX:-}" \
       --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
-       --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
+       --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx906;gfx90a}" \
       --build-arg "IMAGE_NAME=${IMAGE_NAME}" \
       --build-arg "UCX_COMMIT=${UCX_COMMIT}" \
       --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
       --build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \
       --build-arg "TRITON=${TRITON}" \
       --build-arg "TRITON_CPU=${TRITON_CPU}" \
       --build-arg "ONNX=${ONNX}" \
       --build-arg "DOCS=${DOCS}" \
       --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
--- a/.ci/docker/centos-rocm/Dockerfile
+++ b/.ci/docker/centos-rocm/Dockerfile
@ -108,10 +108,17 @@ ENV CMAKE_C_COMPILER cc
 ENV CMAKE_CXX_COMPILER c++
 COPY ./common/install_triton.sh install_triton.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/triton.txt triton.txt
+COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
 COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
-RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
+RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
 # Install AOTriton (Early fail)
 COPY ./aotriton_version.txt aotriton_version.txt
 COPY ./common/common_utils.sh common_utils.sh
 COPY ./common/install_aotriton.sh install_aotriton.sh
 RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
 ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-ebe8522378c3f9944aaaef44868f5ececdd845fc
+69472e5c43481324ad923ceb29392ab72830acee
--- a/.ci/docker/ci_commit_pins/halide.txt
+++ b/.ci/docker/ci_commit_pins/halide.txt
@ -1 +1 @@
-461c12871f336fe6f57b55d6a297f13ef209161b
+340136fec6d3ebc73e7a19eba1663e9b0ba8ab2d
--- a/.ci/docker/ci_commit_pins/nccl-cu11.txt
+++ b/.ci/docker/ci_commit_pins/nccl-cu11.txt
@ -1 +0,0 @@
 v2.21.5-1
--- a/.ci/docker/ci_commit_pins/nccl-cu12.txt
+++ b/.ci/docker/ci_commit_pins/nccl-cu12.txt
@ -1 +0,0 @@
 v2.26.2-1
--- a/.ci/docker/ci_commit_pins/timm.txt
+++ b/.ci/docker/ci_commit_pins/timm.txt
@ -1 +1 @@
-5d535d7a2d4b435b1b5c1177fd8f04a12b942b9a
+ac3470188b914c5d7a5058a7e28b9eb685a62427
--- a/.ci/docker/ci_commit_pins/triton-cpu.txt
+++ b/.ci/docker/ci_commit_pins/triton-cpu.txt
@ -1 +0,0 @@
 c7711371cace304afe265c1ffa906415ab82fc66
--- a/.ci/docker/ci_commit_pins/triton-rocm.txt
+++ b/.ci/docker/ci_commit_pins/triton-rocm.txt
@ -0,0 +1 @@
 21eae954efa5bf584da70324b640288c3ee7aede
--- a/.ci/docker/ci_commit_pins/triton-xpu.txt
+++ b/.ci/docker/ci_commit_pins/triton-xpu.txt
@ -1 +1 @@
-0bcc8265e677e5321606a3311bf71470f14456a8
+1b2f15840e0d70eec50d84c7a0575cb835524def
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-96316ce50fade7e209553aba4898cd9b82aab83b
+dedb7bdf339a3546896d4820366ca562c586bfa0
--- a/.ci/docker/common/install_acl.sh
+++ b/.ci/docker/common/install_acl.sh
@ -1,7 +1,7 @@
 set -euo pipefail
-readonly version=v25.02
+readonly version=v24.04
-readonly src_host=https://github.com/ARM-software
+readonly src_host=https://review.mlplatform.org/ml
 readonly src_repo=ComputeLibrary
 # Clone ACL
--- a/.ci/docker/common/install_android.sh
+++ b/.ci/docker/common/install_android.sh
@ -0,0 +1,112 @@
 #!/bin/bash
 set -ex
 [ -n "${ANDROID_NDK}" ]
 _https_amazon_aws=https://ossci-android.s3.amazonaws.com
 apt-get update
 apt-get install -y --no-install-recommends autotools-dev autoconf unzip
 apt-get autoclean && apt-get clean
 rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 pushd /tmp
 curl -Os --retry 3 $_https_amazon_aws/android-ndk-${ANDROID_NDK}-linux-x86_64.zip
 popd
 _ndk_dir=/opt/ndk
 mkdir -p "$_ndk_dir"
 unzip -qo /tmp/android*.zip -d "$_ndk_dir"
 _versioned_dir=$(find "$_ndk_dir/" -mindepth 1 -maxdepth 1 -type d)
 mv "$_versioned_dir"/* "$_ndk_dir"/
 rmdir "$_versioned_dir"
 rm -rf /tmp/*
 # Install OpenJDK
 # https://hub.docker.com/r/picoded/ubuntu-openjdk-8-jdk/dockerfile/
 sudo apt-get update && \
    apt-get install -y openjdk-8-jdk && \
    apt-get install -y ant && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* && \
    rm -rf /var/cache/oracle-jdk8-installer;
 # Fix certificate issues, found as of
 # https://bugs.launchpad.net/ubuntu/+source/ca-certificates-java/+bug/983302
 sudo apt-get update && \
    apt-get install -y ca-certificates-java && \
    apt-get clean && \
    update-ca-certificates -f && \
    rm -rf /var/lib/apt/lists/* && \
    rm -rf /var/cache/oracle-jdk8-installer;
 export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
 # Installing android sdk
 # https://github.com/circleci/circleci-images/blob/staging/android/Dockerfile.m4
 _tmp_sdk_zip=/tmp/android-sdk-linux.zip
 _android_home=/opt/android/sdk
 rm -rf $_android_home
 sudo mkdir -p $_android_home
 curl --silent --show-error --location --fail --retry 3 --output /tmp/android-sdk-linux.zip $_https_amazon_aws/android-sdk-linux-tools3859397-build-tools2803-2902-platforms28-29.zip
 sudo unzip -q $_tmp_sdk_zip -d $_android_home
 rm $_tmp_sdk_zip
 sudo chmod -R 777 $_android_home
 export ANDROID_HOME=$_android_home
 export ADB_INSTALL_TIMEOUT=120
 export PATH="${ANDROID_HOME}/tools:${ANDROID_HOME}/tools/bin:${ANDROID_HOME}/platform-tools:${PATH}"
 echo "PATH:${PATH}"
 # Installing Gradle
 # Reads GRADLE_VERSION from the environment and downloads the matching
 # gradle-<version>-bin.zip from $_https_amazon_aws (defined near the top of
 # this script), unpacking it under /opt/gradle.
 echo "GRADLE_VERSION:${GRADLE_VERSION}"
 _gradle_home=/opt/gradle
 # Clear any previous install. This must name $_gradle_home (with the leading
 # underscore): an undefined $gradle_home expands to nothing, turning the
 # command into a no-op `rm -rf` that leaves the old directory in place.
 sudo rm -rf $_gradle_home
 sudo mkdir -p $_gradle_home
 curl --silent --output /tmp/gradle.zip --retry 3 $_https_amazon_aws/gradle-${GRADLE_VERSION}-bin.zip
 sudo unzip -q /tmp/gradle.zip -d $_gradle_home
 rm /tmp/gradle.zip
 sudo chmod -R 777 $_gradle_home
 export GRADLE_HOME=$_gradle_home/gradle-$GRADLE_VERSION
 alias gradle="${GRADLE_HOME}/bin/gradle"
 # Put the Gradle bin directory first on PATH so the `gradle --version` check
 # below resolves to the copy that was just unpacked.
 export PATH="${GRADLE_HOME}/bin/:${PATH}"
 echo "PATH:${PATH}"
 gradle --version
 mkdir /var/lib/jenkins/gradledeps
 cp build.gradle /var/lib/jenkins/gradledeps
 cp AndroidManifest.xml /var/lib/jenkins/gradledeps
 pushd /var/lib/jenkins
 export GRADLE_LOCAL_PROPERTIES=gradledeps/local.properties
 rm -f $GRADLE_LOCAL_PROPERTIES
 echo "sdk.dir=/opt/android/sdk" >> $GRADLE_LOCAL_PROPERTIES
 echo "ndk.dir=/opt/ndk" >> $GRADLE_LOCAL_PROPERTIES
 chown -R jenkins /var/lib/jenkins/gradledeps
 chgrp -R jenkins /var/lib/jenkins/gradledeps
 sudo -H -u jenkins $GRADLE_HOME/bin/gradle -Pandroid.useAndroidX=true -p /var/lib/jenkins/gradledeps -g /var/lib/jenkins/.gradle --refresh-dependencies --debug --stacktrace assemble
 chown -R jenkins /var/lib/jenkins/.gradle
 chgrp -R jenkins /var/lib/jenkins/.gradle
 popd
 rm -rf /var/lib/jenkins/.gradle/daemon
 # Cache vision models used by the test
 source "$(dirname "${BASH_SOURCE[0]}")/cache_vision_models.sh"
--- a/.ci/docker/common/install_aotriton.sh
+++ b/.ci/docker/common/install_aotriton.sh
@ -0,0 +1,23 @@
 #!/bin/bash
 # Download the AOTriton release described by aotriton_version.txt, verify its
 # SHA256 checksum, and unpack it into the install prefix given as $1.
 # The build fails (non-zero exit) when the checksum does not match.
 set -ex
 source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
 TARBALL='aotriton.tar.bz2'
 # This read command always returns with exit code 1, hence the `|| true`
 # so that `set -e` does not abort the script here.
 read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
 ARCH=$(uname -m)
 AOTRITON_INSTALL_PREFIX="$1"
 AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.bz2"
 cd "${AOTRITON_INSTALL_PREFIX}"
 # Must use -L to follow redirects
 curl -L --retry 3 -o "${TARBALL}" "${AOTRITON_URL}"
 ACTUAL_SHA256=$(sha256sum "${TARBALL}" | cut -d " " -f 1)
 if [ "${SHA256}" != "${ACTUAL_SHA256}" ]; then
  echo -n "Error: The SHA256 of downloaded tarball is ${ACTUAL_SHA256},"
  echo " which does not match the expected value ${SHA256}."
  # A bare `exit` would return the status of the echo above (0) and let the
  # calling build step succeed; exit non-zero so the mismatch is fatal.
  exit 1
 fi
 tar xf "${TARBALL}" && rm -rf "${TARBALL}"
--- a/.ci/docker/common/install_base.sh
+++ b/.ci/docker/common/install_base.sh
@ -32,12 +32,8 @@ install_ubuntu() {
  # HACK: UCC testing relies on libnccl library from NVIDIA repo, and version 2.16 crashes
  # See https://github.com/pytorch/pytorch/pull/105260#issuecomment-1673399729
  # TODO: Eliminate this hack, we should not rely on apt-get installation
  # See https://github.com/pytorch/pytorch/issues/144768
  if [[ "$UBUNTU_VERSION" == "20.04"* && "$CUDA_VERSION" == "11.8"* ]]; then
    maybe_libnccl_dev="libnccl2=2.15.5-1+cuda11.8 libnccl-dev=2.15.5-1+cuda11.8 --allow-downgrades --allow-change-held-packages"
  elif [[ "$UBUNTU_VERSION" == "20.04"* && "$CUDA_VERSION" == "12.4"* ]]; then
    maybe_libnccl_dev="libnccl2=2.26.2-1+cuda12.4 libnccl-dev=2.26.2-1+cuda12.4 --allow-downgrades --allow-change-held-packages"
  else
    maybe_libnccl_dev=""
  fi
@ -80,8 +76,7 @@ install_ubuntu() {
    vim \
    unzip \
    gpg-agent \
-    gdb \
+    gdb
    bc
  # Should resolve issues related to various apt package repository cert issues
  # see: https://github.com/pytorch/pytorch/issues/65931
--- a/.ci/docker/common/install_cache.sh
+++ b/.ci/docker/common/install_cache.sh
@ -9,7 +9,7 @@ install_ubuntu() {
  # Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh``
  apt-get install -y cargo
  echo "Checking out sccache repo"
-  git clone https://github.com/mozilla/sccache -b v0.9.1
+  git clone https://github.com/pytorch/sccache
  cd sccache
  echo "Building sccache"
  cargo build --release
@ -19,10 +19,6 @@ install_ubuntu() {
  rm -rf sccache
  apt-get remove -y cargo rustc
  apt-get autoclean && apt-get clean
  echo "Downloading old sccache binary from S3 repo for PCH builds"
  curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /opt/cache/bin/sccache-0.2.14a
  chmod 755 /opt/cache/bin/sccache-0.2.14a
 }
 install_binary() {
@ -36,42 +32,22 @@ sed -e 's|PATH="\(.*\)"|PATH="/opt/cache/bin:\1"|g' -i /etc/environment
 export PATH="/opt/cache/bin:$PATH"
 # Setup compiler cache
-install_ubuntu
+if [ -n "$ROCM_VERSION" ]; then
  curl --retry 3 http://repo.radeon.com/misc/.sccache_amd/sccache -o /opt/cache/bin/sccache
 else
  ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
  # TODO: Install the pre-built binary from S3 as building from source
  # https://github.com/pytorch/sccache has started failing mysteriously
  # in which sccache server couldn't start with the following error:
  #   sccache: error: Invalid argument (os error 22)
  install_binary
 fi
 chmod a+x /opt/cache/bin/sccache
 function write_sccache_stub() {
  # Unset LD_PRELOAD for ps because of asan + ps issues
  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
-  if [ $1 == "gcc" ]; then
+  printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n  exec sccache $(which $1) \"\$@\"\nelse\n  exec $(which $1) \"\$@\"\nfi" > "/opt/cache/bin/$1"
    # Do not call sccache recursively when dumping preprocessor argument
    # For some reason it's very important for the first cached nvcc invocation
    cat >"/opt/cache/bin/$1" <<EOF
 #!/bin/sh
 # sccache does not support -E flag, so we need to call the original compiler directly in order to avoid calling this wrapper recursively
 for arg in "\$@"; do
  if [ "\$arg" = "-E" ]; then
    exec $(which $1) "\$@"
  fi
 done
 if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
  exec sccache $(which $1) "\$@"
 else
  exec $(which $1) "\$@"
 fi
 EOF
  else
    cat >"/opt/cache/bin/$1" <<EOF
 #!/bin/sh
 if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
  exec sccache $(which $1) "\$@"
 else
  exec $(which $1) "\$@"
 fi
 EOF
  fi
  chmod a+x "/opt/cache/bin/$1"
 }
@ -112,7 +88,7 @@ if [ -n "$ROCM_VERSION" ]; then
    TOPDIR=$(dirname $OLDCOMP)
    WRAPPED="$TOPDIR/original/$COMPNAME"
    mv "$OLDCOMP" "$WRAPPED"
-    printf "#!/bin/sh\nexec sccache $WRAPPED \"\$@\"" >"$OLDCOMP"
+    printf "#!/bin/sh\nexec sccache $WRAPPED \"\$@\"" > "$OLDCOMP"
    chmod a+x "$OLDCOMP"
  }
--- a/.ci/docker/common/install_clang.sh
+++ b/.ci/docker/common/install_clang.sh
@ -13,18 +13,11 @@ if [ -n "$CLANG_VERSION" ]; then
  elif [[ $UBUNTU_VERSION == 22.04 ]]; then
    # work around ubuntu apt-get conflicts
    sudo apt-get -y -f install
    wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add  -
    if [[ $CLANG_VERSION == 18 ]]; then
      apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main"
    fi
  fi
  sudo apt-get update
-  if [[ $CLANG_VERSION -ge 18 ]]; then
+  apt-get install -y --no-install-recommends clang-"$CLANG_VERSION"
-    apt-get install -y libomp-${CLANG_VERSION}-dev libclang-rt-${CLANG_VERSION}-dev clang-"$CLANG_VERSION" llvm-"$CLANG_VERSION"
+  apt-get install -y --no-install-recommends llvm-"$CLANG_VERSION"
  else
    apt-get install -y --no-install-recommends clang-"$CLANG_VERSION" llvm-"$CLANG_VERSION"
  fi
  # Install dev version of LLVM.
  if [ -n "$LLVMDEV" ]; then
--- a/.ci/docker/common/install_conda.sh
+++ b/.ci/docker/common/install_conda.sh
@ -25,8 +25,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
  mkdir -p /opt/conda
  chown jenkins:jenkins /opt/conda
-  SCRIPT_FOLDER="$( cd "$(dirname "$0")" ; pwd -P )"
+  source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
  source "${SCRIPT_FOLDER}/common_utils.sh"
  pushd /tmp
  wget -q "${BASE_URL}/${CONDA_FILE}"
@ -66,10 +65,23 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
  # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
  if [[ $(uname -m) == "aarch64" ]]; then
-    conda_install "openblas==0.3.29=*openmp*"
+    CONDA_COMMON_DEPS="astunparse pyyaml setuptools openblas==0.3.25=*openmp* ninja==1.11.1 scons==4.5.2"
    if [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then
      NUMPY_VERSION=1.24.4
    else
      NUMPY_VERSION=1.26.2
    fi
  else
-    conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
+    CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
    if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.13" ]; then
      NUMPY_VERSION=1.26.0
    else
      NUMPY_VERSION=1.21.2
    fi
  fi
  conda_install ${CONDA_COMMON_DEPS}
  # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
  # and libpython-static for torch deploy
@ -85,13 +97,14 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
  # Magma package names are concatenation of CUDA major and minor ignoring revision
  # I.e. magma-cuda102 package corresponds to CUDA_VERSION=10.2 and CUDA_VERSION=10.2.89
  # Magma is installed from a tarball in the ossci-linux bucket into the conda env
  if [ -n "$CUDA_VERSION" ]; then
-    ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION}) ${ANACONDA_PYTHON_VERSION}
+    conda_install magma-cuda$(TMP=${CUDA_VERSION/./};echo ${TMP%.*[0-9]}) -c pytorch
  fi
  # Install some other packages, including those needed for Python test reporting
  pip_install -r /opt/conda/requirements-ci.txt
  pip_install numpy=="$NUMPY_VERSION"
  pip_install -U scikit-learn
  if [ -n "$DOCS" ]; then
    apt-get update
--- a/.ci/docker/common/install_cpython.sh
+++ b/.ci/docker/common/install_cpython.sh
@ -7,7 +7,7 @@ PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/hea
 GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py
 # Python versions to be installed in /opt/$VERSION_NO
-CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"}
+CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0"}
 function check_var {
    if [ -z "$1" ]; then
@ -22,13 +22,6 @@ function do_cpython_build {
    check_var $py_ver
    check_var $py_folder
    tar -xzf Python-$py_ver.tgz
    local additional_flags=""
    if [ "$py_ver" == "3.13.0t" ]; then
        additional_flags=" --disable-gil"
        mv cpython-3.13/ cpython-3.13t/
    fi
    pushd $py_folder
    local prefix="/opt/_internal/cpython-${py_ver}"
@ -44,10 +37,8 @@ function do_cpython_build {
        local openssl_flags="--with-openssl=${WITH_OPENSSL} --with-openssl-rpath=auto"
    fi
    # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
-    CFLAGS="-Wformat" ./configure --prefix=${prefix} ${openssl_flags} ${shared_flags} ${additional_flags} > /dev/null
+    CFLAGS="-Wformat" ./configure --prefix=${prefix} ${openssl_flags} ${shared_flags} > /dev/null
    make -j40 > /dev/null
    make install > /dev/null
@ -70,7 +61,7 @@ function do_cpython_build {
    # install setuptools since python 3.12 is required to use distutils
    ${prefix}/bin/pip install wheel==0.34.2 setuptools==68.2.2
    local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")
-    ln -sf ${prefix} /opt/python/${abi_tag}
+    ln -s ${prefix} /opt/python/${abi_tag}
 }
 function build_cpython {
@ -78,14 +69,7 @@ function build_cpython {
    check_var $py_ver
    check_var $PYTHON_DOWNLOAD_URL
    local py_ver_folder=$py_ver
-
+    if [ "$py_ver" = "3.13.0" ]; then
    if [ "$py_ver" = "3.13.0t" ]; then
        PY_VER_SHORT="3.13"
        PYT_VER_SHORT="3.13t"
        check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH
        wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz
        do_cpython_build $py_ver cpython-$PYT_VER_SHORT
    elif [ "$py_ver" = "3.13.0" ]; then
        PY_VER_SHORT="3.13"
        check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH
        wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -2,8 +2,8 @@
 set -ex
-NCCL_VERSION=v2.26.2-1
+NCCL_VERSION=v2.21.5-1
-CUDNN_VERSION=9.5.1.17
+CUDNN_VERSION=9.1.0.70
 function install_cusparselt_040 {
    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
@ -16,6 +16,17 @@ function install_cusparselt_040 {
    rm -rf tmp_cusparselt
 }
 function install_cusparselt_052 {
    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
    mkdir tmp_cusparselt && pushd tmp_cusparselt
    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.5.2.1-archive.tar.xz
    tar xf libcusparse_lt-linux-x86_64-0.5.2.1-archive.tar.xz
    cp -a libcusparse_lt-linux-x86_64-0.5.2.1-archive/include/* /usr/local/cuda/include/
    cp -a libcusparse_lt-linux-x86_64-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
    popd
    rm -rf tmp_cusparselt
 }
 function install_cusparselt_062 {
    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
    mkdir tmp_cusparselt && pushd tmp_cusparselt
@ -27,20 +38,7 @@ function install_cusparselt_062 {
    rm -rf tmp_cusparselt
 }
 function install_cusparselt_063 {
    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
    mkdir tmp_cusparselt && pushd tmp_cusparselt
    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz
    tar xf libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz
    cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/include/* /usr/local/cuda/include/
    cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/
    popd
    rm -rf tmp_cusparselt
 }
 function install_118 {
    CUDNN_VERSION=9.1.0.70
    NCCL_VERSION=v2.21.5-1
    echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
    rm -rf /usr/local/cuda-11.8 /usr/local/cuda
    # install CUDA 11.8.0 in the same container
@ -73,9 +71,41 @@ function install_118 {
    ldconfig
 }
 function install_121 {
    echo "Installing CUDA 12.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
    rm -rf /usr/local/cuda-12.1 /usr/local/cuda
    # install CUDA 12.1.0 in the same container
    wget -q https://developer.download.nvidia.com/compute/cuda/12.1.1/local_installers/cuda_12.1.1_530.30.02_linux.run
    chmod +x cuda_12.1.1_530.30.02_linux.run
    ./cuda_12.1.1_530.30.02_linux.run --toolkit --silent
    rm -f cuda_12.1.1_530.30.02_linux.run
    rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.1 /usr/local/cuda
    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
    mkdir tmp_cudnn && cd tmp_cudnn
    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
    tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
    cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
    cd ..
    rm -rf tmp_cudnn
    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
    cd nccl && make -j src.build
    cp -a build/include/* /usr/local/cuda/include/
    cp -a build/lib/* /usr/local/cuda/lib64/
    cd ..
    rm -rf nccl
    install_cusparselt_052
    ldconfig
 }
 function install_124 {
-  CUDNN_VERSION=9.1.0.70
+  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
  # install CUDA 12.4.1 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
@ -107,39 +137,6 @@ function install_124 {
  ldconfig
 }
 function install_126 {
  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
  rm -rf /usr/local/cuda-12.6 /usr/local/cuda
  # install CUDA 12.6.3 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run
  chmod +x cuda_12.6.3_560.35.05_linux.run
  ./cuda_12.6.3_560.35.05_linux.run --toolkit --silent
  rm -f cuda_12.6.3_560.35.05_linux.run
  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  mkdir tmp_cudnn && cd tmp_cudnn
  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
  tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
  cd ..
  rm -rf tmp_cudnn
  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
  cd nccl && make -j src.build
  cp -a build/include/* /usr/local/cuda/include/
  cp -a build/lib/* /usr/local/cuda/lib64/
  cd ..
  rm -rf nccl
  install_cusparselt_063
  ldconfig
 }
 function prune_118 {
    echo "Pruning CUDA 11.8 and cuDNN"
    #####################################################################################
@ -171,6 +168,37 @@ function prune_118 {
    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2022.3.0 $CUDA_BASE/nsight-systems-2022.4.2/
 }
 function prune_121 {
  echo "Pruning CUDA 12.1"
  #####################################################################################
  # CUDA 12.1 prune static libs
  #####################################################################################
    export NVPRUNE="/usr/local/cuda-12.1/bin/nvprune"
    export CUDA_LIB_DIR="/usr/local/cuda-12.1/lib64"
    export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
    export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
    if [[ -n "$OVERRIDE_GENCODE" ]]; then
        export GENCODE=$OVERRIDE_GENCODE
    fi
    # all CUDA libs except CuDNN and CuBLAS
    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
      | xargs -I {} bash -c \
                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
    # prune CuDNN and CuBLAS
    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
    #####################################################################################
    # CUDA 12.1 prune visual tools
    #####################################################################################
    export CUDA_BASE="/usr/local/cuda-12.1/"
    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2023.1.0 $CUDA_BASE/nsight-systems-2023.1.2/
 }
 function prune_124 {
  echo "Pruning CUDA 12.4"
  #####################################################################################
@ -199,92 +227,22 @@ function prune_124 {
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
  #####################################################################################
-  # CUDA 12.4 prune visual tools
+  # CUDA 12.1 prune visual tools
  #####################################################################################
  export CUDA_BASE="/usr/local/cuda-12.4/"
  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
 }
 function prune_126 {
  echo "Pruning CUDA 12.6"
  #####################################################################################
  # CUDA 12.6 prune static libs
  #####################################################################################
  export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
  export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
  if [[ -n "$OVERRIDE_GENCODE" ]]; then
      export GENCODE=$OVERRIDE_GENCODE
  fi
  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
  fi
  # all CUDA libs except CuDNN and CuBLAS
  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
      | xargs -I {} bash -c \
                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
  # prune CuDNN and CuBLAS
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
  #####################################################################################
  # CUDA 12.6 prune visual tools
  #####################################################################################
  export CUDA_BASE="/usr/local/cuda-12.6/"
  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
 }
 function install_128 {
  CUDNN_VERSION=9.7.1.26
  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
  rm -rf /usr/local/cuda-12.8 /usr/local/cuda
  # install CUDA 12.8.0 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux.run
  chmod +x cuda_12.8.0_570.86.10_linux.run
  ./cuda_12.8.0_570.86.10_linux.run --toolkit --silent
  rm -f cuda_12.8.0_570.86.10_linux.run
  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.8 /usr/local/cuda
  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  mkdir tmp_cudnn && cd tmp_cudnn
  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
  tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
  cd ..
  rm -rf tmp_cudnn
  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
  cd nccl && make -j src.build
  cp -a build/include/* /usr/local/cuda/include/
  cp -a build/lib/* /usr/local/cuda/lib64/
  cd ..
  rm -rf nccl
  install_cusparselt_063
  ldconfig
 }
 # idiomatic parameter and option handling in sh
 while test $# -gt 0
 do
    case "$1" in
    11.8) install_118; prune_118
        ;;
    12.1) install_121; prune_121
        ;;
    12.4) install_124; prune_124
        ;;
    12.6) install_126; prune_126
        ;;
    12.8) install_128;
        ;;
    *) echo "bad argument $1"; exit 1
        ;;
    esac
--- a/.ci/docker/common/install_cuda_aarch64.sh
+++ b/.ci/docker/common/install_cuda_aarch64.sh
@ -3,36 +3,35 @@
 set -ex
-NCCL_VERSION=v2.26.2-1
+NCCL_VERSION=v2.21.5-1
 CUDNN_VERSION=9.8.0.87
-function install_cusparselt_063 {
+function install_cusparselt_052 {
    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
    mkdir tmp_cusparselt && pushd tmp_cusparselt
-    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.6.3.2-archive.tar.xz
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
-    tar xf libcusparse_lt-linux-sbsa-0.6.3.2-archive.tar.xz
+    tar xf libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
-    cp -a libcusparse_lt-linux-sbsa-0.6.3.2-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/include/* /usr/local/cuda/include/
-    cp -a libcusparse_lt-linux-sbsa-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/
+    cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
    popd
    rm -rf tmp_cusparselt
 }
-function install_128 {
+function install_124 {
-  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
+  echo "Installing CUDA 12.4.1 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
-  rm -rf /usr/local/cuda-12.8 /usr/local/cuda
+  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
-  # install CUDA 12.8.0 in the same container
+  # install CUDA 12.4.1 in the same container
-  wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux_sbsa.run
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux_sbsa.run
-  chmod +x cuda_12.8.0_570.86.10_linux_sbsa.run
+  chmod +x cuda_12.4.1_550.54.15_linux_sbsa.run
-  ./cuda_12.8.0_570.86.10_linux_sbsa.run --toolkit --silent
+  ./cuda_12.4.1_550.54.15_linux_sbsa.run --toolkit --silent
-  rm -f cuda_12.8.0_570.86.10_linux_sbsa.run
+  rm -f cuda_12.4.1_550.54.15_linux_sbsa.run
-  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.8 /usr/local/cuda
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  mkdir tmp_cudnn && cd tmp_cudnn
-  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz -O cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
-  tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  tar xf cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
-  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/include/* /usr/local/cuda/include/
-  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/lib/* /usr/local/cuda/lib64/
  cd ..
  rm -rf tmp_cudnn
@ -45,16 +44,47 @@ function install_128 {
  cd ..
  rm -rf nccl
-  install_cusparselt_063
+  install_cusparselt_052
  ldconfig
 }
 function prune_124 {
  echo "Pruning CUDA 12.4"
  #####################################################################################
  # CUDA 12.4 prune static libs
  #####################################################################################
  export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
  export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
  if [[ -n "$OVERRIDE_GENCODE" ]]; then
      export GENCODE=$OVERRIDE_GENCODE
  fi
  # all CUDA libs except CuDNN and CuBLAS
  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
      | xargs -I {} bash -c \
                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
  # prune CuDNN and CuBLAS
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
  #####################################################################################
  # CUDA 12.4 prune visual tools
  #####################################################################################
  export CUDA_BASE="/usr/local/cuda-12.4/"
  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
 }
 # idiomatic parameter and option handling in sh
 while test $# -gt 0
 do
    case "$1" in
-    12.8) install_128;
+    12.4) install_124; prune_124
        ;;
    *) echo "bad argument $1"; exit 1
        ;;
--- a/.ci/docker/common/install_cudnn.sh
+++ b/.ci/docker/common/install_cudnn.sh
@ -4,11 +4,7 @@ if [[ -n "${CUDNN_VERSION}" ]]; then
    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
    mkdir tmp_cudnn
    pushd tmp_cudnn
-    if [[ ${CUDA_VERSION:0:4} == "12.8" ]]; then
+    if [[ ${CUDA_VERSION:0:2} == "12" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.7.1.26_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
--- a/.ci/docker/common/install_cusparselt.sh
+++ b/.ci/docker/common/install_cusparselt.sh
@ -5,15 +5,7 @@ set -ex
 # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
 mkdir tmp_cusparselt && cd tmp_cusparselt
-if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-8]$ ]]; then
+if [[ ${CUDA_VERSION:0:4} =~ ^12\.[2-4]$ ]]; then
    arch_path='sbsa'
    export TARGETARCH=${TARGETARCH:-$(uname -m)}
    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
        arch_path='x86_64'
    fi
    CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.6.3.2-archive"
    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
 elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then
    arch_path='sbsa'
    export TARGETARCH=${TARGETARCH:-$(uname -m)}
    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
@ -21,11 +13,17 @@ elif [[ ${CUDA_VERSION:0:4} == "12.4" ]]; then
    fi
    CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.6.2.3-archive"
    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
 elif [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
    arch_path='sbsa'
    export TARGETARCH=${TARGETARCH:-$(uname -m)}
    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
        arch_path='x86_64'
    fi
    CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.5.2.1-archive"
    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
 elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
    CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
 else
    echo "Not sure which libcusparselt version to install for this ${CUDA_VERSION}"
 fi
 tar xf ${CUSPARSELT_NAME}.tar.xz
--- a/.ci/docker/common/install_executorch.sh
+++ b/.ci/docker/common/install_executorch.sh
@ -36,23 +36,25 @@ install_conda_dependencies() {
 }
 install_pip_dependencies() {
-  pushd executorch
+  pushd executorch/.ci/docker
-  as_jenkins bash install_executorch.sh
+  # Install PyTorch CPU build beforehand to avoid installing the much bigger CUDA
-
+  # binaries later, ExecuTorch only needs CPU
-  # A workaround, ExecuTorch has moved to numpy 2.0 which is not compatible with the current
+  pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-  # numba and scipy version used in PyTorch CI
+  # Install all Python dependencies
-  conda_run pip uninstall -y numba scipy
+  pip_install -r requirements-ci.txt
  popd
 }
 setup_executorch() {
  pushd executorch
  # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
  as_jenkins bash .ci/scripts/setup-vulkan-linux-deps.sh
  export PYTHON_EXECUTABLE=python
-  export CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
+  export EXECUTORCH_BUILD_PYBIND=ON
  export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
-  as_jenkins .ci/scripts/setup-linux.sh --build-tool cmake || true
+  as_jenkins .ci/scripts/setup-linux.sh cmake
  popd
 }
--- a/.ci/docker/common/install_halide.sh
+++ b/.ci/docker/common/install_halide.sh
@ -35,9 +35,7 @@ git clone https://github.com/halide/Halide.git
 pushd Halide
 git checkout ${COMMIT} && git submodule update --init --recursive
 pip_install -r requirements.txt
-# NOTE: pybind has a requirement for cmake > 3.5 so set the minimum cmake version here with a flag
+cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
 #       Context: https://github.com/pytorch/pytorch/issues/150420
 cmake -G Ninja -DCMAKE_POLICY_VERSION_MINIMUM=3.5 -DCMAKE_BUILD_TYPE=Release -S . -B build
 cmake --build build
 test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
 cmake --install build --prefix ${CONDA_PREFIX}
--- a/.ci/docker/common/install_inductor_benchmark_deps.sh
+++ b/.ci/docker/common/install_inductor_benchmark_deps.sh
@ -7,13 +7,14 @@ source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
 # Install pandas 2.0.3 and HuggingFace transformers at the commit returned by
 # `get_pinned_commit huggingface`.
 function install_huggingface() {
  # Declared local (as in install_timm) so the pin does not leak into the
  # caller's scope.
  local commit
  commit=$(get_pinned_commit huggingface)
  pip_install pandas==2.0.3
  pip_install "git+https://github.com/huggingface/transformers@${commit}"
 }
 function install_timm() {
  local commit
  commit=$(get_pinned_commit timm)
-
+  pip_install pandas==2.0.3
  pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
  # Clean up
  conda_run pip uninstall -y cmake torch torchvision triton
--- a/.ci/docker/common/install_magma.sh
+++ b/.ci/docker/common/install_magma.sh
@ -3,6 +3,8 @@
 set -eou pipefail
 MAGMA_VERSION="2.5.2"
 function do_install() {
    cuda_version=$1
    cuda_version_nodot=${1/./}
@ -15,7 +17,7 @@ function do_install() {
        set -x
        tmp_dir=$(mktemp -d)
        pushd ${tmp_dir}
-        curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
+        curl -OLs https://anaconda.org/pytorch/magma-cuda${cuda_version_nodot}/${MAGMA_VERSION}/download/linux-64/${magma_archive}
        tar -xvf "${magma_archive}"
        mkdir -p "${cuda_dir}/magma"
        mv include "${cuda_dir}/magma/include"
--- a/.ci/docker/common/install_magma_conda.sh
+++ b/.ci/docker/common/install_magma_conda.sh
@ -1,26 +0,0 @@
 #!/usr/bin/env bash
 # Script that replaces the magma install from a conda package
 set -eou pipefail
 # Download a prebuilt magma-cuda archive and move its headers and libraries
 # into a conda environment.
 #   $1 - CUDA version; its first "." is stripped to form the archive name
 #   $2 - Python version suffix of the env at /opt/conda/envs/py_<version>
 function do_install() {
    cuda_version_nodot=${1/./}
    anaconda_python_version=$2
    MAGMA_VERSION="2.6.1"
    magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
    anaconda_dir="/opt/conda/envs/py_${anaconda_python_version}"
    # Subshell keeps the `set -x` tracing and the directory change from
    # leaking into the caller.
    (
        set -x
        tmp_dir=$(mktemp -d)
        pushd "${tmp_dir}"
        curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
        tar -xvf "${magma_archive}"
        mv include/* "${anaconda_dir}/include/"
        mv lib/* "${anaconda_dir}/lib"
        popd
        # Remove the scratch directory (downloaded archive and any leftover
        # extracted files) so it is not left behind after the install.
        rm -rf "${tmp_dir}"
    )
 }
 do_install $1 $2
--- a/.ci/docker/common/install_miopen.sh
+++ b/.ci/docker/common/install_miopen.sh
@ -10,21 +10,6 @@ if [[ -z $ROCM_VERSION ]]; then
    exit 1;
 fi
 IS_UBUNTU=0
 ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
 case "$ID" in
  ubuntu)
    IS_UBUNTU=1
    ;;
  centos|almalinux)
    IS_UBUNTU=0
    ;;
  *)
    echo "Unable to determine OS..."
    exit 1
    ;;
 esac
 # To make version comparison easier, create an integer representation.
 save_IFS="$IFS"
 IFS=. ROCM_VERSION_ARRAY=(${ROCM_VERSION})
@ -43,6 +28,12 @@ else
 fi
 ROCM_INT=$(($ROCM_VERSION_MAJOR * 10000 + $ROCM_VERSION_MINOR * 100 + $ROCM_VERSION_PATCH))
 # Install custom MIOpen + COMgr for ROCm >= 4.0.1
 if [[ $ROCM_INT -lt 40001 ]]; then
    echo "ROCm version < 4.0.1; will not install custom MIOpen"
    exit 0
 fi
 # Function to retry functions that sometimes timeout or have flaky failures
 retry () {
    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
@ -60,49 +51,70 @@ else
    ROCM_INSTALL_PATH="/opt/rocm-${ROCM_VERSION}"
 fi
 # MIOPEN_USE_HIP_KERNELS is a Workaround for COMgr issues
 MIOPEN_CMAKE_COMMON_FLAGS="
 -DMIOPEN_USE_COMGR=ON
 -DMIOPEN_BUILD_DRIVER=OFF
 "
-if [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60204 ]]; then
+# Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version
-    MIOPEN_BRANCH="release/rocm-rel-6.2-staging"
+if [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60300 ]]; then
-else
+    echo "ROCm 6.2 MIOpen does not need any patches, do not build from source"
    echo "ROCm ${ROCM_VERSION} does not need any patches, do not build from source"
    exit 0
-fi
+elif [[ $ROCM_INT -ge 60100 ]] && [[ $ROCM_INT -lt 60200 ]]; then
-
+    echo "ROCm 6.1 MIOpen does not need any patches, do not build from source"
-
+    exit 0
-if [[ ${IS_UBUNTU} == 1 ]]; then
+elif [[ $ROCM_INT -ge 60000 ]] && [[ $ROCM_INT -lt 60100 ]]; then
-  apt-get remove -y miopen-hip
+    echo "ROCm 6.0 MIOpen does not need any patches, do not build from source"
    exit 0
 elif [[ $ROCM_INT -ge 50700 ]] && [[ $ROCM_INT -lt 60000 ]]; then
    echo "ROCm 5.7 MIOpen does not need any patches, do not build from source"
    exit 0
 elif [[ $ROCM_INT -ge 50600 ]] && [[ $ROCM_INT -lt 50700 ]]; then
    MIOPEN_BRANCH="release/rocm-rel-5.6-staging"
 elif [[ $ROCM_INT -ge 50500 ]] && [[ $ROCM_INT -lt 50600 ]]; then
    MIOPEN_BRANCH="release/rocm-rel-5.5-gfx11"
 elif [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
    MIOPEN_BRANCH="release/rocm-rel-5.4-staging"
 elif [[ $ROCM_INT -ge 50300 ]] && [[ $ROCM_INT -lt 50400 ]]; then
    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
    MIOPEN_BRANCH="release/rocm-rel-5.3-staging"
 elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50300 ]]; then
    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
    MIOPEN_BRANCH="release/rocm-rel-5.2-staging"
 elif [[ $ROCM_INT -ge 50100 ]] && [[ $ROCM_INT -lt 50200 ]]; then
    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
    MIOPEN_BRANCH="release/rocm-rel-5.1-staging"
 elif [[ $ROCM_INT -ge 50000 ]] && [[ $ROCM_INT -lt 50100 ]]; then
    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
    MIOPEN_BRANCH="release/rocm-rel-5.0-staging"
 else
-  # Workaround since almalinux manylinux image already has this and cget doesn't like that
+    echo "Unhandled ROCM_VERSION ${ROCM_VERSION}"
-  rm -rf /usr/local/lib/pkgconfig/sqlite3.pc
+    exit 1
  # Versioned package name needs regex match
  # Use --noautoremove to prevent other rocm packages from being uninstalled
  yum remove -y miopen-hip* --noautoremove
 fi
 yum remove -y miopen-hip
 git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH}
 pushd MIOpen
 # remove .git to save disk space since CI runner was running out
 rm -rf .git
-# Don't build CK to save docker build time
+# Don't build MLIR to save docker build time
-sed -i '/composable_kernel/d' requirements.txt
+# since we are disabling MLIR backend for MIOpen anyway
 if [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
    sed -i '/rocMLIR/d' requirements.txt
 elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50400 ]]; then
    sed -i '/llvm-project-mlir/d' requirements.txt
 fi
 ## MIOpen minimum requirements
 cmake -P install_deps.cmake --minimum
 # clean up since CI runner was running out of disk space
 rm -rf /tmp/*
-if [[ ${IS_UBUNTU} == 1 ]]; then
+yum clean all
-  apt-get autoclean && apt-get clean
+rm -rf /var/cache/yum
-  rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+rm -rf /var/lib/yum/yumdb
-else
+rm -rf /var/lib/yum/history
  yum clean all
  rm -rf /var/cache/yum
  rm -rf /var/lib/yum/yumdb
  rm -rf /var/lib/yum/history
 fi
 ## Build MIOpen
 mkdir -p build
@ -110,7 +122,7 @@ cd build
 PKG_CONFIG_PATH=/usr/local/lib/pkgconfig CXX=${ROCM_INSTALL_PATH}/llvm/bin/clang++ cmake .. \
    ${MIOPEN_CMAKE_COMMON_FLAGS} \
    ${MIOPEN_CMAKE_DB_FLAGS} \
-    -DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}"
+    -DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}/hip;${ROCM_INSTALL_PATH}"
 make MIOpen -j $(nproc)
 # Build MIOpen package
@ -119,11 +131,7 @@ make -j $(nproc) package
 # clean up since CI runner was running out of disk space
 rm -rf /usr/local/cget
-if [[ ${IS_UBUNTU} == 1 ]]; then
+yum install -y miopen-*.rpm
  sudo dpkg -i miopen-hip*.deb
 else
  yum install -y miopen-*.rpm
 fi
 popd
 rm -rf MIOpen
--- a/.ci/docker/common/install_ninja.sh
+++ b/.ci/docker/common/install_ninja.sh
@ -4,15 +4,10 @@ set -ex
 [ -n "$NINJA_VERSION" ]
-arch=$(uname -m)
+url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip"
 if [ "$arch" == "aarch64" ]; then
    url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux-aarch64.zip"
 else
    url="https://github.com/ninja-build/ninja/releases/download/v${NINJA_VERSION}/ninja-linux.zip"
 fi
 pushd /tmp
 wget --no-verbose --output-document=ninja-linux.zip "$url"
 unzip ninja-linux.zip -d /usr/local/bin
 rm -f ninja-linux.zip
-popd
+popd
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -15,7 +15,7 @@ pip_install \
  flatbuffers==2.0 \
  mock==5.0.1 \
  ninja==1.10.2 \
-  networkx==2.5 \
+  networkx==2.0 \
  numpy==1.24.2
 # ONNXRuntime should be installed before installing
@ -30,16 +30,17 @@ pip_install \
 pip_install coloredlogs packaging
-pip_install onnxruntime==1.18.1
+pip_install onnxruntime==1.18
-pip_install onnx==1.17.0
+pip_install onnx==1.16.0
-pip_install onnxscript==0.2.2 --no-deps
+# pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps
 pip_install onnxscript==0.1.0.dev20240613 --no-deps
 # required by onnxscript
 pip_install ml_dtypes
 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
 # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
 IMPORT_SCRIPT_FILENAME="/tmp/onnx_import_script.py"
-as_jenkins echo 'import transformers; transformers.GPTJForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gptj");' > "${IMPORT_SCRIPT_FILENAME}"
+as_jenkins echo 'import transformers; transformers.AutoModel.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3");' > "${IMPORT_SCRIPT_FILENAME}"
 # Need a PyTorch version for transformers to work
 pip_install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu
--- a/.ci/docker/common/install_openblas.sh
+++ b/.ci/docker/common/install_openblas.sh
@ -4,7 +4,7 @@
 set -ex
 cd /
-git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.29 --depth 1 --shallow-submodules
+git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.25 --depth 1 --shallow-submodules
 OPENBLAS_BUILD_FLAGS="
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -62,22 +62,6 @@ install_ubuntu() {
        sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
    done
    # ROCm 6.3 had a regression where initializing static code objects had significant overhead
    if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then
        # clr build needs CppHeaderParser but can only find it using conda's python
        /opt/conda/bin/python -m pip install CppHeaderParser
        git clone https://github.com/ROCm/HIP -b rocm-6.3.x
        HIP_COMMON_DIR=$(readlink -f HIP)
        git clone https://github.com/jeffdaily/clr -b release/rocm-rel-6.3-statco-hotfix
        mkdir -p clr/build
        pushd clr/build
        cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
        make -j
        cp hipamd/lib/libamdhip64.so.6.3.* /opt/rocm/lib/libamdhip64.so.6.3.*
        popd
        rm -rf HIP clr
    fi
    # Cleanup
    apt-get autoclean && apt-get clean
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
--- a/.ci/docker/common/install_rocm_drm.sh
+++ b/.ci/docker/common/install_rocm_drm.sh
@ -12,7 +12,7 @@ case "$ID" in
    apt-get install -y libpciaccess-dev pkg-config
    apt-get clean
    ;;
-  centos|almalinux)
+  centos)
    yum install -y libpciaccess-devel pkgconfig
    ;;
  *)
@ -115,7 +115,7 @@ index a5007ffc..13fa07fc 100644
 	if (!fp) {
 -		fprintf(stderr, "%s: %s\n", AMDGPU_ASIC_ID_TABLE,
 -			strerror(errno));
-+		//fprintf(stderr, "amdgpu.ids: No such file or directory\n");
+		fprintf(stderr, "amdgpu.ids: No such file or directory\n");
 		return;
 	}
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -3,18 +3,6 @@
 set -ex
 # Magma build scripts need `python`
 ln -sf /usr/bin/python3 /usr/bin/python
 ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
 case "$ID" in
  almalinux)
    yum install -y gcc-gfortran
    ;;
  *)
    echo "No preinstalls to build magma..."
    ;;
 esac
 MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@ -12,14 +12,14 @@ conda_reinstall() {
  as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $*
 }
-if [ -n "${XPU_VERSION}" ]; then
+if [ -n "${ROCM_VERSION}" ]; then
  TRITON_REPO="https://github.com/openai/triton"
  TRITON_TEXT_FILE="triton-rocm"
 elif [ -n "${XPU_VERSION}" ]; then
  TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
  TRITON_TEXT_FILE="triton-xpu"
 elif [ -n "${TRITON_CPU}" ]; then
  TRITON_REPO="https://github.com/triton-lang/triton-cpu"
  TRITON_TEXT_FILE="triton-cpu"
 else
-  TRITON_REPO="https://github.com/triton-lang/triton"
+  TRITON_REPO="https://github.com/openai/triton"
  TRITON_TEXT_FILE="triton"
 fi
@ -47,10 +47,9 @@ chown -R jenkins /var/lib/jenkins/triton
 chgrp -R jenkins /var/lib/jenkins/triton
 pushd /var/lib/jenkins/
-as_jenkins git clone --recursive ${TRITON_REPO} triton
+as_jenkins git clone ${TRITON_REPO} triton
 cd triton
 as_jenkins git checkout ${TRITON_PINNED_COMMIT}
 as_jenkins git submodule update --init --recursive
 cd python
 # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
@ -60,15 +59,15 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}"
  # Triton needs at least gcc-9 to build
  apt-get install -y g++-9
-  CXX=g++-9 pip_install .
+  CXX=g++-9 pip_install -e .
 elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
  # Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
  add-apt-repository -y ppa:ubuntu-toolchain-r/test
  apt-get install -y g++-9
-  CXX=g++-9 pip_install .
+  CXX=g++-9 pip_install -e .
 else
-  pip_install .
+  pip_install -e .
 fi
 if [ -n "${CONDA_CMAKE}" ]; then
--- a/.ci/docker/common/install_ucc.sh
+++ b/.ci/docker/common/install_ucc.sh
@ -8,12 +8,6 @@ else
  with_cuda=no
 fi
 if [[ -d "/opt/rocm" ]]; then
  with_rocm=/opt/rocm
 else
  with_rocm=no
 fi
 function install_ucx() {
  set -ex
  git clone --recursive https://github.com/openucx/ucx.git
@ -25,7 +19,6 @@ function install_ucx() {
  ./configure --prefix=$UCX_HOME      \
      --enable-mt                     \
      --with-cuda=$with_cuda          \
      --with-rocm=$with_rocm          \
      --enable-profiling              \
      --enable-stats
  time make -j
@ -43,29 +36,12 @@ function install_ucc() {
  git submodule update --init --recursive
  ./autogen.sh
  # We only run distributed tests on Tesla M60 and A10G
  NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
  if [[ -n "$ROCM_VERSION" ]]; then
    if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
      amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'`
    else
      amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs`
    fi
    for arch in $amdgpu_targets; do
      HIP_OFFLOAD="$HIP_OFFLOAD --offload-arch=$arch"
    done
  else
    HIP_OFFLOAD="all-arch-no-native"
  fi
  ./configure --prefix=$UCC_HOME          \
    --with-ucx=$UCX_HOME                  \
    --with-cuda=$with_cuda                \
-    --with-nvcc-gencode="${NVCC_GENCODE}" \
+    --with-nvcc-gencode="${NVCC_GENCODE}"
    --with-rocm=$with_rocm                \
    --with-rocm-arch="${HIP_OFFLOAD}"
  time make -j
  sudo make install
--- a/.ci/docker/common/install_user.sh
+++ b/.ci/docker/common/install_user.sh
@ -2,13 +2,6 @@
 set -ex
 # Since version 24 the system ships with user 'ubuntu' that has id 1000
 # We need a work-around to enable id 1000 usage for this script
 if [[ $UBUNTU_VERSION == 24.04 ]]; then
    # touch is used to disable harmless error message
    touch /var/mail/ubuntu && chown ubuntu /var/mail/ubuntu && userdel -r ubuntu
 fi
 # Mirror jenkins user in container
 # jenkins user as ec2-user should have the same user-id
 echo "jenkins:x:1000:1000::/var/lib/jenkins:" >> /etc/passwd
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -24,10 +24,10 @@ function install_ubuntu() {
        | tee /etc/apt/sources.list.d/intel-gpu-${VERSION_CODENAME}.list
    # To add the online network package repository for the Intel Support Packages
    wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
-        | gpg --dearmor > /usr/share/keyrings/oneapi-archive-keyring.gpg.gpg
+        | gpg --dearmor > /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
-    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg.gpg] \
+    echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
-        https://apt.repos.intel.com/${XPU_REPO_NAME} all main" \
+        https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
-        | tee /etc/apt/sources.list.d/oneAPI.list
+        | tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list
    # Update the packages list and repository index
    apt-get update
@ -41,16 +41,14 @@ function install_ubuntu() {
        libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
        libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
        mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
    if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
        apt-get install -y intel-ocloc
    fi
    # Development Packages
    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
    # Install Intel Support Packages
-    if [[ "$XPU_VERSION" == "2025.0" ]]; then
+    if [ -n "$XPU_VERSION" ]; then
-        XPU_PACKAGES="${XPU_PACKAGES} intel-oneapi-dnnl=2025.0.1-6"
+        apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION} intel-pti-dev
    else
        apt-get install -y intel-for-pytorch-gpu-dev intel-pti-dev
    fi
    apt-get install -y ${XPU_PACKAGES}
    # Cleanup
    apt-get autoclean && apt-get clean
@ -60,13 +58,13 @@ function install_ubuntu() {
 function install_rhel() {
    . /etc/os-release
    if [[ "${ID}" == "rhel" ]]; then
-        if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
+        if [[ ! " 8.6 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
            echo "RHEL version ${VERSION_ID} not supported"
            exit
        fi
    elif [[ "${ID}" == "almalinux" ]]; then
        # Workaround for almalinux8, which is used by quay.io/pypa/manylinux_2_28_x86_64
-        VERSION_ID="8.8"
+        VERSION_ID="8.6"
    fi
    dnf install -y 'dnf-command(config-manager)'
@ -74,21 +72,16 @@ function install_rhel() {
    dnf config-manager --add-repo \
        https://repositories.intel.com/gpu/rhel/${VERSION_ID}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_ID}.repo
    # To add the online network package repository for the Intel Support Packages
-    tee > /etc/yum.repos.d/oneAPI.repo << EOF
+    tee > /etc/yum.repos.d/intel-for-pytorch-gpu-dev.repo << EOF
-[oneAPI]
+[intel-for-pytorch-gpu-dev]
 name=Intel for Pytorch GPU dev repository
-baseurl=https://yum.repos.intel.com/${XPU_REPO_NAME}
+baseurl=https://yum.repos.intel.com/intel-for-pytorch-gpu-dev
 enabled=1
 gpgcheck=1
 repo_gpgcheck=1
 gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
 EOF
    # Install Intel Support Packages
    if [[ "$XPU_VERSION" == "2025.0" ]]; then
        XPU_PACKAGES="${XPU_PACKAGES} intel-oneapi-dnnl-2025.0.1-6"
    fi
    yum install -y ${XPU_PACKAGES}
    # The xpu-smi packages
    dnf install -y xpu-smi
    # Compute and Media Runtimes
@ -103,6 +96,8 @@ EOF
    dnf install -y --refresh \
        intel-igc-opencl-devel level-zero-devel intel-gsc-devel libmetee-devel \
        level-zero-devel
    # Install Intel Support Packages
    yum install -y intel-for-pytorch-gpu-dev intel-pti-dev
    # Cleanup
    dnf clean all
@ -124,7 +119,7 @@ function install_sles() {
        https://repositories.intel.com/gpu/sles/${VERSION_SP}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_SP}.repo
    rpm --import https://repositories.intel.com/gpu/intel-graphics.key
    # To add the online network package repository for the Intel Support Packages
-    zypper addrepo https://yum.repos.intel.com/${XPU_REPO_NAME} oneAPI
+    zypper addrepo https://yum.repos.intel.com/intel-for-pytorch-gpu-dev intel-for-pytorch-gpu-dev
    rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
    # The xpu-smi packages
@ -136,7 +131,7 @@ function install_sles() {
    zypper install -y libigdfcl-devel intel-igc-cm libigfxcmrt-devel level-zero-devel
    # Install Intel Support Packages
-    zypper install -y ${XPU_PACKAGES}
+    zypper install -y intel-for-pytorch-gpu-dev intel-pti-dev
 }
@ -147,13 +142,6 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
    XPU_DRIVER_VERSION=""
 fi
 XPU_REPO_NAME="intel-for-pytorch-gpu-dev"
 XPU_PACKAGES="intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9"
 if [[ "$XPU_VERSION" == "2025.0" ]]; then
    XPU_REPO_NAME="oneapi"
    XPU_PACKAGES="intel-deep-learning-essentials-2025.0"
 fi
 # The installation depends on the base OS
 ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
 case "$ID" in
--- a/.ci/docker/almalinux/Dockerfile
+++ b/.ci/docker/almalinux/Dockerfile
@ -1,39 +1,47 @@
-ARG CUDA_VERSION=12.4
+ARG CUDA_VERSION=10.2
 ARG BASE_TARGET=cuda${CUDA_VERSION}
-FROM amd64/almalinux:8 as base
+FROM centos:7 as base
 ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8
-ARG DEVTOOLSET_VERSION=11
+ARG DEVTOOLSET_VERSION=9
-
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
-ENV LC_ALL en_US.UTF-8
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
-ENV LANG en_US.UTF-8
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
-ENV LANGUAGE en_US.UTF-8
+RUN yum update -y
-
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which unzip
 RUN yum -y update
 RUN yum -y install epel-release
 RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
 # Just add everything as a safe.directory for git since these will be used in multiple places with git
 RUN git config --global --add safe.directory '*'
-ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+RUN yum install -y yum-utils centos-release-scl
 RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
 RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
 RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
 RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
 RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
 # EPEL for cmake
 RUN yum --enablerepo=extras install -y epel-release
-# cmake-3.18.4 from pip
+# cmake
-RUN yum install -y python3-pip && \
+RUN yum install -y cmake3 && \
-    python3 -mpip install cmake==3.18.4 && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
-    ln -s /usr/local/bin/cmake /usr/bin/cmake3
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
 RUN yum install -y autoconf aclocal automake make sudo
 RUN rm -rf /usr/local/cuda-*
 FROM base as openssl
 ADD ./common/install_openssl.sh install_openssl.sh
 RUN bash ./install_openssl.sh && rm install_openssl.sh
 FROM base as patchelf
 # Install patchelf
 ADD ./common/install_patchelf.sh install_patchelf.sh
 RUN bash ./install_patchelf.sh && rm install_patchelf.sh && cp $(which patchelf) /patchelf
 FROM base as openssl
 # Install openssl
 ADD ./common/install_openssl.sh install_openssl.sh
 RUN bash ./install_openssl.sh && rm install_openssl.sh
 FROM base as conda
 # Install Anaconda
 ADD ./common/install_conda_docker.sh install_conda.sh
@ -41,7 +49,7 @@ RUN bash ./install_conda.sh && rm install_conda.sh
 # Install CUDA
 FROM base as cuda
-ARG CUDA_VERSION=12.4
+ARG CUDA_VERSION=10.2
 RUN rm -rf /usr/local/cuda-*
 ADD ./common/install_cuda.sh install_cuda.sh
 ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
@ -62,10 +70,6 @@ FROM cuda as cuda12.4
 RUN bash ./install_cuda.sh 12.4
 ENV DESIRED_CUDA=12.4
 FROM cuda as cuda12.6
 RUN bash ./install_cuda.sh 12.6
 ENV DESIRED_CUDA=12.6
 # Install MNIST test data
 FROM base as mnist
 ADD ./common/install_mnist.sh install_mnist.sh
@ -75,7 +79,6 @@ FROM base as all_cuda
 COPY --from=cuda11.8  /usr/local/cuda-11.8 /usr/local/cuda-11.8
 COPY --from=cuda12.1  /usr/local/cuda-12.1 /usr/local/cuda-12.1
 COPY --from=cuda12.4  /usr/local/cuda-12.4 /usr/local/cuda-12.4
 COPY --from=cuda12.6  /usr/local/cuda-12.6 /usr/local/cuda-12.6
 # Final step
 FROM ${BASE_TARGET} as final
@ -88,8 +91,7 @@ COPY ./common/install_jni.sh install_jni.sh
 COPY ./java/jni.h jni.h
 RUN bash ./install_jni.sh && rm install_jni.sh
-ENV PATH /opt/conda/bin:$PATH
+ENV  PATH /opt/conda/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
 COPY --from=mnist  /usr/local/mnist /usr/local/mnist
 RUN rm -rf /usr/local/cuda
 RUN chmod o+rw /usr/local
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -37,21 +37,15 @@ esac
 (
  set -x
  # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
  # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
  sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
  sudo systemctl daemon-reload
  sudo systemctl restart docker
  docker build \
    --target final \
    --progress plain \
    --build-arg "BASE_TARGET=${BASE_TARGET}" \
    --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
-    --build-arg "DEVTOOLSET_VERSION=11" \
+    --build-arg "DEVTOOLSET_VERSION=9" \
    -t ${DOCKER_IMAGE_NAME} \
    $@ \
-    -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
+    -f "${TOPDIR}/.ci/docker/conda/Dockerfile" \
    ${TOPDIR}/.ci/docker/
 )
--- a/.ci/docker/libtorch/Dockerfile
+++ b/.ci/docker/libtorch/Dockerfile
@ -56,21 +56,16 @@ RUN bash ./install_cuda.sh 11.8
 RUN bash ./install_magma.sh 11.8
 RUN ln -sf /usr/local/cuda-11.8 /usr/local/cuda
 FROM cuda as cuda12.1
 RUN bash ./install_cuda.sh 12.1
 RUN bash ./install_magma.sh 12.1
 RUN ln -sf /usr/local/cuda-12.1 /usr/local/cuda
 FROM cuda as cuda12.4
 RUN bash ./install_cuda.sh 12.4
 RUN bash ./install_magma.sh 12.4
 RUN ln -sf /usr/local/cuda-12.4 /usr/local/cuda
 FROM cuda as cuda12.6
 RUN bash ./install_cuda.sh 12.6
 RUN bash ./install_magma.sh 12.6
 RUN ln -sf /usr/local/cuda-12.6 /usr/local/cuda
 FROM cuda as cuda12.8
 RUN bash ./install_cuda.sh 12.8
 RUN bash ./install_magma.sh 12.8
 RUN ln -sf /usr/local/cuda-12.8 /usr/local/cuda
 FROM cpu as rocm
 ARG PYTORCH_ROCM_ARCH
 ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
@ -92,6 +87,13 @@ RUN apt-get update -y && \
 RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
 RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
 # Install AOTriton
 COPY ./common/common_utils.sh common_utils.sh
 COPY ./aotriton_version.txt aotriton_version.txt
 COPY ./common/install_aotriton.sh install_aotriton.sh
 RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt
 ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
 FROM ${BASE_TARGET} as final
 COPY --from=openssl            /opt/openssl           /opt/openssl
 # Install patchelf
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -39,7 +39,17 @@ case ${GPU_ARCH_TYPE} in
        BASE_TARGET=rocm
        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
        GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100"
        ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)"
        if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then
            ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0}))
        else
            echo "ERROR: rocm regex failed"
            exit 1
        fi
        if [[ $ROCM_VERSION_INT -ge 60000 ]]; then
            PYTORCH_ROCM_ARCH+=";gfx942"
        fi
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
        ;;
    *)
--- a/.ci/docker/linter-cuda/Dockerfile
+++ b/.ci/docker/linter-cuda/Dockerfile
@ -25,8 +25,7 @@ ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ./common/install_magma_conda.sh install_magma_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
 RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
 # Install cuda and cudnn
 ARG CUDA_VERSION
--- a/.ci/docker/manywheel/Dockerfile
+++ b/.ci/docker/manywheel/Dockerfile
@ -10,7 +10,6 @@ ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8
 ARG DEVTOOLSET_VERSION=9
 # Note: This is required patch since CentOS have reached EOL
 # otherwise any yum install step will fail
 RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
@ -144,10 +143,6 @@ COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/
 FROM common as cpu_final
 ARG BASE_CUDA_VERSION=10.1
 ARG DEVTOOLSET_VERSION=9
 # Install Anaconda
 ADD ./common/install_conda_docker.sh install_conda.sh
 RUN bash ./install_conda.sh && rm install_conda.sh
 ENV PATH /opt/conda/bin:$PATH
 RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
 RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
 RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
@ -198,3 +193,10 @@ ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
 RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
 ADD ./common/install_miopen.sh install_miopen.sh
 RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
 # Install AOTriton
 COPY ./common/common_utils.sh common_utils.sh
 COPY ./aotriton_version.txt aotriton_version.txt
 COPY ./common/install_aotriton.sh install_aotriton.sh
 RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt
 ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
--- a/.ci/docker/manywheel/Dockerfile_2014
+++ b/.ci/docker/manywheel/Dockerfile_2014
@ -0,0 +1,153 @@
 # syntax = docker/dockerfile:experimental
 ARG ROCM_VERSION=3.7
 ARG BASE_CUDA_VERSION=10.2
 ARG GPU_IMAGE=nvidia/cuda:${BASE_CUDA_VERSION}-devel-centos7
 FROM quay.io/pypa/manylinux2014_x86_64 as base
 ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8
 RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
 RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
 RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
 RUN yum install -y wget curl perl util-linux xz bzip2 git patch which perl zlib-devel
 RUN yum install -y yum-utils centos-release-scl sudo
 RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
 RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils
 ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH
 # cmake
 RUN yum install -y cmake3 && \
    ln -s /usr/bin/cmake3 /usr/bin/cmake
 FROM base as openssl
 # Install openssl (this must precede `build python` step)
 # (In order to have a proper SSL module, Python is compiled
 # against a recent openssl [see env vars above], which is linked
 # statically. We delete openssl afterwards.)
 ADD ./common/install_openssl.sh install_openssl.sh
 RUN bash ./install_openssl.sh && rm install_openssl.sh
 # remove unnecessary python versions
 RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
 FROM base as cuda
 ARG BASE_CUDA_VERSION=10.2
 # Install CUDA
 ADD ./common/install_cuda.sh install_cuda.sh
 RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh
 FROM base as intel
 # MKL
 ADD ./common/install_mkl.sh install_mkl.sh
 RUN bash ./install_mkl.sh && rm install_mkl.sh
 FROM base as magma
 ARG BASE_CUDA_VERSION=10.2
 # Install magma
 ADD ./common/install_magma.sh install_magma.sh
 RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh
 FROM base as jni
 # Install java jni header
 ADD ./common/install_jni.sh install_jni.sh
 ADD ./java/jni.h jni.h
 RUN bash ./install_jni.sh && rm install_jni.sh
 FROM base as libpng
 # Install libpng
 ADD ./common/install_libpng.sh install_libpng.sh
 RUN bash ./install_libpng.sh && rm install_libpng.sh
 FROM ${GPU_IMAGE} as common
 RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
 RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
 RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
 ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8
 RUN yum install -y \
        aclocal \
        autoconf \
        automake \
        bison \
        bzip2 \
        curl \
        diffutils \
        file \
        git \
        make \
        patch \
        perl \
        unzip \
        util-linux \
        wget \
        which \
        xz \
        yasm
 RUN yum install -y \
    https://repo.ius.io/ius-release-el7.rpm \
    https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm
 RUN yum swap -y git git236-core
 # git236+ would refuse to run git commands in repos owned by other users
 # Which causes version check to fail, as pytorch repo is bind-mounted into the image
 # Override this behaviour by treating every folder as safe
 # For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
 RUN git config --global --add safe.directory "*"
 ENV SSL_CERT_FILE=/opt/_internal/certs.pem
 # Install LLVM version
 COPY --from=openssl            /opt/openssl                          /opt/openssl
 COPY --from=base               /opt/python                           /opt/python
 COPY --from=base               /opt/_internal                        /opt/_internal
 COPY --from=base               /usr/local/bin/auditwheel             /usr/local/bin/auditwheel
 COPY --from=intel              /opt/intel                            /opt/intel
 COPY --from=base               /usr/local/bin/patchelf               /usr/local/bin/patchelf
 COPY --from=libpng             /usr/local/bin/png*                   /usr/local/bin/
 COPY --from=libpng             /usr/local/bin/libpng*                /usr/local/bin/
 COPY --from=libpng             /usr/local/include/png*               /usr/local/include/
 COPY --from=libpng             /usr/local/include/libpng*            /usr/local/include/
 COPY --from=libpng             /usr/local/lib/libpng*                /usr/local/lib/
 COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/lib/pkgconfig
 COPY --from=jni                /usr/local/include/jni.h              /usr/local/include/jni.h
 FROM common as cpu_final
 ARG BASE_CUDA_VERSION=10.2
 RUN yum install -y yum-utils centos-release-scl
 RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
 RUN yum install -y devtoolset-7-gcc devtoolset-7-gcc-c++ devtoolset-7-gcc-gfortran devtoolset-7-binutils
 ENV PATH=/opt/rh/devtoolset-7/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:$LD_LIBRARY_PATH
 # cmake
 RUN yum install -y cmake3 && \
    ln -s /usr/bin/cmake3 /usr/bin/cmake
 # ninja
 RUN yum install -y http://repo.okay.com.mx/centos/7/x86_64/release/okay-release-1-1.noarch.rpm
 RUN yum install -y ninja-build
 FROM cpu_final as cuda_final
 RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
 COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
 COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
 FROM common as rocm_final
 ARG ROCM_VERSION=3.7
 # Install ROCm
 ADD ./common/install_rocm.sh install_rocm.sh
 RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh
 # cmake is already installed inside the rocm base image, but both 2 and 3 exist
 # cmake3 is needed for the later MIOpen custom build, so that step is last.
 RUN yum install -y cmake3 && \
    rm -f /usr/bin/cmake && \
    ln -s /usr/bin/cmake3 /usr/bin/cmake
 ADD ./common/install_miopen.sh install_miopen.sh
 RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
--- a/.ci/docker/manywheel/Dockerfile_2_28
+++ b/.ci/docker/manywheel/Dockerfile_2_28
@ -1,4 +1,5 @@
 # syntax = docker/dockerfile:experimental
 ARG ROCM_VERSION=3.7
 ARG BASE_CUDA_VERSION=11.8
 ARG GPU_IMAGE=amd64/almalinux:8
 FROM quay.io/pypa/manylinux_2_28_x86_64 as base
@ -116,49 +117,30 @@ COPY --from=jni                /usr/local/include/jni.h              /usr/local/
 FROM common as cpu_final
 ARG BASE_CUDA_VERSION=11.8
 ARG DEVTOOLSET_VERSION=11
 # Install Anaconda
 ADD ./common/install_conda_docker.sh install_conda.sh
 RUN bash ./install_conda.sh && rm install_conda.sh
 ENV PATH /opt/conda/bin:$PATH
 # Ensure the expected devtoolset is used
 ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
 # Install setuptools and wheel for python 3.12/3.13
 RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \
    /opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \
    done;
-
+# cmake-3.18.4 from pip
 # cmake-3.18.4 from pip; force in case cmake3 already exists
 RUN yum install -y python3-pip && \
    python3 -mpip install cmake==3.18.4 && \
-    ln -sf /usr/local/bin/cmake /usr/bin/cmake3
+    ln -s /usr/local/bin/cmake /usr/bin/cmake3
 FROM cpu_final as cuda_final
 RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
 COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
 COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
 RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
 ENV PATH=/usr/local/cuda/bin:$PATH
-FROM cpu_final as rocm_final
+FROM common as rocm_final
-ARG ROCM_VERSION=6.0
+ARG ROCM_VERSION=3.7
-ARG PYTORCH_ROCM_ARCH
+# Install ROCm
-ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
+ADD ./common/install_rocm.sh install_rocm.sh
-ARG DEVTOOLSET_VERSION=11
+RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh
-ENV LDFLAGS="-Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64 -Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib"
+# cmake is already installed inside the rocm base image, but both 2 and 3 exist
-# Somewhere in ROCm stack, we still use non-existing /opt/rocm/hip path,
+# cmake3 is needed for the later MIOpen custom build, so that step is last.
-# below workaround helps avoid error
+RUN yum install -y cmake3 && \
-ENV ROCM_PATH /opt/rocm
+    rm -f /usr/bin/cmake && \
-# cmake-3.28.4 from pip to get enable_language(HIP)
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
 # and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker
 RUN python3 -m pip install --upgrade pip && \
    python3 -mpip install cmake==3.28.4
 ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
 RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
 ENV MKLROOT /opt/intel
 ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
 RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
 ADD ./common/install_miopen.sh install_miopen.sh
 RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
@ -168,7 +150,8 @@ ENV XPU_DRIVER_TYPE ROLLING
 # cmake-3.28.4 from pip
 RUN python3 -m pip install --upgrade pip && \
    python3 -mpip install cmake==3.28.4
 # Install setuptools and wheel for python 3.13
 RUN /opt/python/cp313-cp313/bin/python -m pip install setuptools wheel
 ADD ./common/install_xpu.sh install_xpu.sh
 ENV XPU_VERSION 2025.0
 RUN bash ./install_xpu.sh && rm install_xpu.sh
 RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd
--- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
@ -38,12 +38,6 @@ RUN yum install -y \
  sudo \
  gcc-toolset-${GCCTOOLSET_VERSION}-toolchain
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
 COPY ./common/install_ninja.sh install_ninja.sh
 RUN if [ -n "${NINJA_VERSION}" ]; then bash ./install_ninja.sh; fi
 RUN rm install_ninja.sh
 # Ensure the expected devtoolset is used
 ENV PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
@ -54,11 +48,6 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/op
 # For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
 RUN git config --global --add safe.directory "*"
 FROM base as openblas
 # Install openblas
 ADD ./common/install_openblas.sh install_openblas.sh
 RUN bash ./install_openblas.sh && rm install_openblas.sh
 FROM base as final
 # remove unnecessary python versions
@ -66,5 +55,3 @@ RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
 COPY --from=openblas     /opt/OpenBLAS/  /opt/OpenBLAS/
 ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
--- a/.ci/docker/manywheel/Dockerfile_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_aarch64
@ -61,7 +61,7 @@ RUN git config --global --add safe.directory "*"
 # NOTE: Need a better way to get this library as Ubuntu's package can be removed by the vendor, or changed
 ###############################################################################
 RUN cd ~/ \
-  && curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-4ubuntu2_arm64.deb \
+  && curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-1ubuntu1_arm64.deb \
  && ar x ~/libgfortran-10-dev.deb \
  && tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ \
  && cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/
--- a/.ci/docker/manywheel/Dockerfile_s390x
+++ b/.ci/docker/manywheel/Dockerfile_s390x
@ -1,20 +1,17 @@
-FROM quay.io/pypa/manylinux_2_28_s390x as base
+FROM --platform=linux/s390x docker.io/ubuntu:24.04 as base
 # Language variables
 ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
 ENV LANGUAGE=C.UTF-8
 ARG DEVTOOLSET_VERSION=13
 # Install the needed OS packages. This is to support all
 # the binary builds (torch, vision, audio, text, data)
-RUN yum -y install epel-release
+RUN apt update ; apt upgrade -y
-RUN yum -y update
+RUN apt install -y \
-RUN yum install -y \
+  build-essential \
  sudo \
  autoconf \
  automake \
  bison \
  bzip2 \
  curl \
  diffutils \
@ -27,40 +24,19 @@ RUN yum install -y \
  util-linux \
  wget \
  which \
-  xz \
+  xz-utils \
  yasm \
  less \
  zstd \
  libgomp \
  gcc-toolset-${DEVTOOLSET_VERSION}-gcc \
  gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \
  gcc-toolset-${DEVTOOLSET_VERSION}-binutils \
  gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
  cmake \
-  rust \
+  python3 \
-  cargo \
+  python3-dev \
-  llvm-devel \
+  python3-setuptools \
-  libzstd-devel \
+  python3-yaml \
-  python3.12-devel \
+  python3-typing-extensions \
-  python3.12-setuptools \
+  libblas-dev \
-  python3.12-pip \
+  libopenblas-dev \
-  python3-virtualenv \
+  liblapack-dev \
-  python3.12-pyyaml \
+  libatlas-base-dev
  python3.12-numpy \
  python3.12-wheel \
  python3.12-cryptography \
  blas-devel \
  openblas-devel \
  lapack-devel \
  atlas-devel \
  libjpeg-devel \
  libxslt-devel \
  libxml2-devel \
  openssl-devel \
  valgrind
 ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
 # git236+ would refuse to run git commands in repos owned by other users
 # Which causes version check to fail, as pytorch repo is bind-mounted into the image
@ -68,8 +44,14 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op
 # For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
 RUN git config --global --add safe.directory "*"
-# installed python doesn't have development parts. Rebuild it from scratch
+FROM base as openssl
-RUN /bin/rm -rf /opt/_internal /opt/python /usr/local/*/*
+# Install openssl (this must precede `build python` step)
 # (In order to have a proper SSL module, Python is compiled
 # against a recent openssl [see env vars above], which is linked
 # statically. We delete openssl afterwards.)
 ADD ./common/install_openssl.sh install_openssl.sh
 RUN bash ./install_openssl.sh && rm install_openssl.sh
 ENV SSL_CERT_FILE=/opt/_internal/certs.pem
 # EPEL for cmake
 FROM base as patchelf
@ -82,43 +64,10 @@ FROM patchelf as python
 # build python
 COPY manywheel/build_scripts /build_scripts
 ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh
 ENV SSL_CERT_FILE=
 RUN bash build_scripts/build.sh && rm -r build_scripts
-FROM base as final
+FROM openssl as final
 COPY --from=python             /opt/python                           /opt/python
 COPY --from=python             /opt/_internal                        /opt/_internal
-COPY --from=python             /opt/python/cp39-cp39/bin/auditwheel  /usr/local/bin/auditwheel
+COPY --from=python             /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel
 COPY --from=patchelf           /usr/local/bin/patchelf               /usr/local/bin/patchelf
 RUN alternatives --set python /usr/bin/python3.12
 RUN alternatives --set python3 /usr/bin/python3.12
 RUN pip-3.12 install typing_extensions
 ENTRYPOINT []
 CMD ["/bin/bash"]
 # install test dependencies:
 # - grpcio requires system openssl, bundled crypto fails to build
 # - ml_dtypes 0.4.0 requires some fixes provided in later commits to build
 RUN dnf install -y \
  protobuf-devel \
  protobuf-c-devel \
  protobuf-lite-devel \
  wget \
  patch
 RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio==1.65.4
 RUN cd ~ && \
  git clone https://github.com/jax-ml/ml_dtypes && \
  cd ml_dtypes && \
  git checkout v0.4.0 && \
  git submodule update --init --recursive && \
  wget https://github.com/jax-ml/ml_dtypes/commit/b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
  wget https://github.com/jax-ml/ml_dtypes/commit/d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
  patch -p1 < b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
  patch -p1 < d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
  python3 setup.py bdist_wheel && \
  pip3 install dist/*.whl && \
  rm -rf ml_dtypes
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -48,7 +48,7 @@ case ${GPU_ARCH_TYPE} in
        TARGET=final
        DOCKER_TAG=cpu-aarch64
        GPU_IMAGE=arm64v8/almalinux:8
-        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11 --build-arg NINJA_VERSION=1.12.1"
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="2_28_aarch64"
        ;;
    cpu-cxx11-abi)
@ -61,7 +61,7 @@ case ${GPU_ARCH_TYPE} in
    cpu-s390x)
        TARGET=final
        DOCKER_TAG=cpu-s390x
-        GPU_IMAGE=s390x/almalinux:8
+        GPU_IMAGE=redhat/ubi9
        DOCKER_GPU_BUILD_ARG=""
        MANY_LINUX_VERSION="s390x"
        ;;
@ -87,18 +87,22 @@ case ${GPU_ARCH_TYPE} in
        MANY_LINUX_VERSION="aarch64"
        DOCKERFILE_SUFFIX="_cuda_aarch64"
        ;;
-    rocm|rocm-manylinux_2_28)
+    rocm)
        TARGET=rocm_final
        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
        GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete
-        DEVTOOLSET_VERSION="9"
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100"
-        if [ ${GPU_ARCH_TYPE} == "rocm-manylinux_2_28" ]; then
+        ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)"
-            MANY_LINUX_VERSION="2_28"
+        if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then
-            DEVTOOLSET_VERSION="11"
+            ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0}))
-            GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
+        else
            echo "ERROR: rocm regex failed"
            exit 1
        fi
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
+        if [[ $ROCM_VERSION_INT -ge 60000 ]]; then
-        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
+            PYTORCH_ROCM_ARCH+=";gfx942"
        fi
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=9"
        ;;
    xpu)
        TARGET=xpu_final
@ -120,17 +124,7 @@ if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then
 fi
 (
    set -x
-
+    DOCKER_BUILDKIT=1 docker build \
    # Only activate this if in CI
    if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
        # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
        # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
        sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
        sudo systemctl daemon-reload
        sudo systemctl restart docker
    fi
    DOCKER_BUILDKIT=1 docker build  \
        ${DOCKER_GPU_BUILD_ARG} \
        --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
        --target "${TARGET}" \
@ -140,7 +134,7 @@ fi
        "${TOPDIR}/.ci/docker/"
 )
-GITHUB_REF=${GITHUB_REF:-"dev")}
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
 GIT_BRANCH_NAME=${GITHUB_REF##*/}
 GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
 DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
--- a/.ci/docker/manywheel/build_scripts/build.sh
+++ b/.ci/docker/manywheel/build_scripts/build.sh
@ -16,27 +16,37 @@ CURL_HASH=cf34fe0b07b800f1c01a499a6e8b2af548f6d0e044dca4a29d88a4bee146d131
 AUTOCONF_ROOT=autoconf-2.69
 AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969
 # Dependencies for compiling Python that we want to remove from
 # the final image after compiling Python
 PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel"
 if [ "$(uname -m)" != "s390x" ] ; then
    PYTHON_COMPILE_DEPS="${PYTHON_COMPILE_DEPS} db4-devel"
 else
    PYTHON_COMPILE_DEPS="${PYTHON_COMPILE_DEPS} libdb-devel"
 fi
 # Libraries that are allowed as part of the manylinux1 profile
 MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel  mesa-libGL-devel libICE-devel libSM-devel ncurses-devel"
 # Get build utilities
 MY_DIR=$(dirname "${BASH_SOURCE[0]}")
 source $MY_DIR/build_utils.sh
-# Development tools and libraries
+if [ "$(uname -m)" != "s390x" ] ; then
-yum -y install bzip2 make git patch unzip bison yasm diffutils \
+    # Dependencies for compiling Python that we want to remove from
-    automake which file \
+    # the final image after compiling Python
-    ${PYTHON_COMPILE_DEPS}
+    PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel"
    # Libraries that are allowed as part of the manylinux1 profile
    MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel  mesa-libGL-devel libICE-devel libSM-devel ncurses-devel"
    # Development tools and libraries
    yum -y install bzip2 make git patch unzip bison yasm diffutils \
        automake which file cmake28 \
        kernel-devel-`uname -r` \
        ${PYTHON_COMPILE_DEPS}
 else
    # Dependencies for compiling Python that we want to remove from
    # the final image after compiling Python
    PYTHON_COMPILE_DEPS="zlib1g-dev libbz2-dev libncurses-dev libsqlite3-dev libdb-dev libpcap-dev liblzma-dev libffi-dev"
    # Libraries that are allowed as part of the manylinux1 profile
    MANYLINUX1_DEPS="libglib2.0-dev libX11-dev libncurses-dev"
    # Development tools and libraries
    apt install -y bzip2 make git patch unzip diffutils \
        automake which file cmake \
        linux-headers-virtual \
        ${PYTHON_COMPILE_DEPS}
 fi
 # Install newest autoconf
 build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH
@ -82,13 +92,16 @@ ln -s $PY39_BIN/auditwheel /usr/local/bin/auditwheel
 # Clean up development headers and other unnecessary stuff for
 # final image
-yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
+if [ "$(uname -m)" != "s390x" ] ; then
-    avahi freetype bitstream-vera-fonts \
+    yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
-    ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1
+        avahi freetype bitstream-vera-fonts \
-yum -y install ${MANYLINUX1_DEPS}
+        ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1
-yum -y clean all > /dev/null 2>&1
+    yum -y install ${MANYLINUX1_DEPS}
-yum list installed
+    yum -y clean all > /dev/null 2>&1
-
+    yum list installed
 else
    apt purge -y ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1
 fi
 # we don't need libpython*.a, and they're many megabytes
 find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f
 # Strip what we can -- and ignore errors, because this just attempts to strip
--- a/.ci/docker/manywheel/build_scripts/build_utils.sh
+++ b/.ci/docker/manywheel/build_scripts/build_utils.sh
@ -3,7 +3,7 @@
 # Script used only in CD pipeline
 OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source/old/1.1.1/
-CURL_DOWNLOAD_URL=https://curl.se/download
+CURL_DOWNLOAD_URL=https://curl.askapache.com/download
 AUTOCONF_DOWNLOAD_URL=https://ftp.gnu.org/gnu/autoconf
--- a/.ci/docker/manywheel/build_scripts/ssl-check.py
+++ b/.ci/docker/manywheel/build_scripts/ssl-check.py
@ -1,12 +1,10 @@
 # cf. https://github.com/pypa/manylinux/issues/53
 import sys
 from urllib.request import urlopen
 GOOD_SSL = "https://google.com"
 BAD_SSL = "https://self-signed.badssl.com"
 import sys
 print("Testing SSL certificate checking for Python:", sys.version)
@ -14,8 +12,14 @@ if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4):
    print("This version never checks SSL certs; skipping tests")
    sys.exit(0)
 if sys.version_info[0] >= 3:
    from urllib.request import urlopen
-EXC = OSError
+    EXC = OSError
 else:
    from urllib import urlopen
    EXC = IOError
 print(f"Connecting to {GOOD_SSL} should work")
 urlopen(GOOD_SSL)
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -5,7 +5,7 @@
 #Pinned versions: 1.6
 #test that import:
-boto3==1.35.42
+boto3==1.19.12
 #Description: AWS SDK for python
 #Pinned versions: 1.19.12, 1.16.34
 #test that import:
@ -30,14 +30,9 @@ dill==0.3.7
 #Pinned versions: 0.3.7
 #test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py
-expecttest==0.3.0
+expecttest==0.1.6
 #Description: method for writing tests where test framework auto populates
 # the expected output based on previous runs
 #Pinned versions: 0.3.0
 #test that import:
 fbscribelogger==0.1.7
 #Description: write to scribe from authenticated jobs on CI
 #Pinned versions: 0.1.6
 #test that import:
@ -90,10 +85,10 @@ librosa>=0.6.2 ; python_version < "3.11"
 #Pinned versions:
 #test that import:
-mypy==1.14.0
+mypy==1.10.0
 # Pin MyPy version because new errors are likely to appear with each release
 #Description: linter
-#Pinned versions: 1.14.0
+#Pinned versions: 1.10.0
 #test that import: test_typing.py, test_type_hints.py
 networkx==2.8.8
@ -109,7 +104,7 @@ networkx==2.8.8
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py
 numba==0.49.0 ; python_version < "3.9"
-numba==0.55.2 ; python_version == "3.9"
+numba==0.54.1 ; python_version == "3.9"
 numba==0.55.2 ; python_version == "3.10"
 #Description: Just-In-Time Compiler for Numerical Functions
 #Pinned versions: 0.54.1, 0.49.0, <=0.49.1
@ -118,7 +113,7 @@ numba==0.55.2 ; python_version == "3.10"
 #numpy
 #Description: Provides N-dimensional arrays and linear algebra
-#Pinned versions: 1.26.2
+#Pinned versions: 1.20
 #test that import: test_view_ops.py, test_unary_ufuncs.py, test_type_promotion.py,
 #test_type_info.py, test_torch.py, test_tensorexpr_pybind.py, test_tensorexpr.py,
 #test_tensorboard.py, test_tensor_creation_ops.py, test_static_runtime.py,
@ -128,12 +123,6 @@ numba==0.55.2 ; python_version == "3.10"
 #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
 #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
 #test_binary_ufuncs.py
 numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
 numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
 numpy==2.1.2; python_version >= "3.13"
 pandas==2.0.3; python_version < "3.13"
 pandas==2.2.3; python_version >= "3.13"
 #onnxruntime
 #Description: scoring engine for Open Neural Network Exchange (ONNX) models
@ -145,9 +134,9 @@ opt-einsum==3.3
 #Pinned versions: 3.3
 #test that import: test_linalg.py
-optree==0.13.0
+optree==0.12.1
 #Description: A library for tree manipulation
-#Pinned versions: 0.13.0
+#Pinned versions: 0.12.1
 #test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
 #test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py,
 #common_utils.py, test_eager_transforms.py, test_python_dispatch.py,
@ -158,7 +147,7 @@ optree==0.13.0
 #test_pointwise_ops.py, test_dtensor_ops.py, test_torchinductor.py, test_fx.py,
 #test_fake_tensor.py, test_mps.py
-pillow==11.0.0
+pillow==10.3.0
 #Description:  Python Imaging Library fork
 #Pinned versions: 10.3.0
 #test that import:
@ -193,11 +182,6 @@ pytest-rerunfailures>=10.3
 #Pinned versions:
 #test that import:
 pytest-subtests==0.13.1
 #Description: plugin for subtest support
 #Pinned versions:
 #test that import:
 #pytest-benchmark
 #Description: fixture for benchmarking code
 #Pinned versions: 3.2.3
@ -245,7 +229,7 @@ scikit-image==0.22.0 ; python_version >= "3.10"
 #test that import:
 scipy==1.10.1 ; python_version <= "3.11"
-scipy==1.14.1 ; python_version >= "3.12"
+scipy==1.12.0 ; python_version == "3.12"
 # Pin SciPy because of failing distribution tests (see #60347)
 #Description: scientific python
 #Pinned versions: 1.10.1
@ -264,7 +248,7 @@ tb-nightly==2.13.0a20230426
 #test that import:
 # needed by torchgen utils
-typing-extensions>=4.10.0
+typing-extensions
 #Description: type hints for python
 #Pinned versions:
 #test that import:
@ -280,21 +264,26 @@ unittest-xml-reporting<=3.2.0,>=2.0.0
 #test that import:
 #lintrunner is supported on aarch64-linux only from 0.12.4 version
-lintrunner==0.12.7
+lintrunner==0.12.5
 #Description: all about linters!
-#Pinned versions: 0.12.7
+#Pinned versions: 0.12.5
 #test that import:
 redis>=4.0.0
 #Description: redis database
 #test that import: anything that tests OSS caching/mocking (inductor/test_codecache.py, inductor/test_max_autotune.py)
 rockset==1.0.3
 #Description: queries Rockset
 #Pinned versions: 1.0.3
 #test that import:
 ghstack==0.8.0
 #Description: ghstack tool
 #Pinned versions: 0.8.0
 #test that import:
-jinja2==3.1.6
+jinja2==3.1.4
 #Description: jinja2 template engine
 #Pinned versions: 3.1.4
 #test that import:
@ -304,76 +293,42 @@ pytest-cpp==2.3.0
 #Pinned versions: 2.3.0
 #test that import:
-z3-solver==4.12.6.0
+z3-solver==4.12.2.0
 #Description: The Z3 Theorem Prover Project
 #Pinned versions:
 #test that import:
-tensorboard==2.13.0 ; python_version < "3.13"
+tensorboard==2.13.0
 tensorboard==2.18.0 ; python_version >= "3.13"
 #Description: Also included in .ci/docker/requirements-docs.txt
 #Pinned versions:
 #test that import: test_tensorboard
 pywavelets==1.4.1 ; python_version < "3.12"
-pywavelets==1.7.0 ; python_version >= "3.12"
+pywavelets==1.5.0 ; python_version >= "3.12"
 #Description: This is a requirement of scikit-image, we need to pin
 # it here because 1.5.0 conflicts with numpy 1.21.2 used in CI
 #Pinned versions: 1.4.1
 #test that import:
-lxml==5.3.0
+lxml==5.0.0
 #Description: This is a requirement of unittest-xml-reporting
 # Python-3.9 binaries
 PyGithub==2.3.0
-sympy==1.13.3
+sympy==1.12.1 ; python_version == "3.8"
 sympy==1.13.1 ; python_version >= "3.9"
 #Description: Required by coremltools, also pinned in .github/requirements/pip-requirements-macOS.txt
 #Pinned versions:
 #test that import:
-onnx==1.17.0
+onnx==1.16.1
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
-onnxscript==0.2.2
+onnxscript==0.1.0.dev20240817
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
 parameterized==0.8.1
 #Description: Parameterizes unittests, both the tests themselves and the entire testing class
 #Pinned versions:
 #test that import:
 #Description: required for testing torch/distributed/_tools/sac_estimator.py
 #Pinned versions: 1.24.0
 #test that import: test_sac_estimator.py
 pwlf==2.2.1 ; python_version >= "3.8"
 #Description: required for testing torch/distributed/_tools/sac_estimator.py
 #Pinned versions: 2.2.1
 #test that import: test_sac_estimator.py
 # To build PyTorch itself
 astunparse
 PyYAML
 pyzstd
 setuptools
 ninja==1.11.1 ; platform_machine == "aarch64"
 scons==4.5.2 ; platform_machine == "aarch64"
 pulp==2.9.0 ; python_version >= "3.8"
 #Description: required for testing ILP formulation under torch/distributed/_tools
 #Pinned versions: 2.9.0
 #test that import: test_sac_ilp.py
 dataclasses_json==0.6.7
 #Description: required for data pipeline and scripts under tools/stats
 #Pinned versions: 0.6.7
 #test that import:
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -14,8 +14,7 @@ matplotlib==3.5.3
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 3.5.3
-tensorboard==2.13.0 ; python_version < "3.13"
+tensorboard==2.13.0
 tensorboard==2.18.0 ; python_version >= "3.13"
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 2.13.0
--- a/.ci/docker/triton_version.txt
+++ b/.ci/docker/triton_version.txt
@ -1 +1 @@
-3.3.1
+3.0.0
--- a/.ci/docker/ubuntu-cuda/Dockerfile
+++ b/.ci/docker/ubuntu-cuda/Dockerfile
@ -30,8 +30,7 @@ ARG CONDA_CMAKE
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ./common/install_magma_conda.sh install_magma_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
 RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
 # Install gcc
 ARG GCC_VERSION
@ -81,8 +80,6 @@ RUN bash ./install_openssl.sh
 ENV OPENSSL_DIR /opt/openssl
 ARG INDUCTOR_BENCHMARKS
 ARG ANACONDA_PYTHON_VERSION
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface.txt huggingface.txt
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@ -14,20 +14,21 @@ ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
 COPY ./common/install_base.sh install_base.sh
 RUN bash ./install_base.sh && rm install_base.sh
 # Install clang
 ARG LLVMDEV
 ARG CLANG_VERSION
 COPY ./common/install_clang.sh install_clang.sh
 RUN bash ./install_clang.sh && rm install_clang.sh
 # Install user
 COPY ./common/install_user.sh install_user.sh
 RUN bash ./install_user.sh && rm install_user.sh
 # Install katex
 ARG KATEX
 COPY ./common/install_docs_reqs.sh install_docs_reqs.sh
 RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh
 # Install conda and other packages (e.g., numpy, pytest)
 ARG ANACONDA_PYTHON_VERSION
 ARG CONDA_CMAKE
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
 ARG CONDA_CMAKE
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
@ -38,11 +39,6 @@ ARG GCC_VERSION
 COPY ./common/install_gcc.sh install_gcc.sh
 RUN bash ./install_gcc.sh && rm install_gcc.sh
 # Install clang
 ARG CLANG_VERSION
 COPY ./common/install_clang.sh install_clang.sh
 RUN bash ./install_clang.sh && rm install_clang.sh
 # (optional) Install protobuf for ONNX
 ARG PROTOBUF
 COPY ./common/install_protobuf.sh install_protobuf.sh
@ -72,8 +68,6 @@ RUN rm install_rocm.sh
 COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
 RUN bash ./install_rocm_magma.sh
 RUN rm install_rocm_magma.sh
 ADD ./common/install_miopen.sh install_miopen.sh
 RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
 ENV ROCM_PATH /opt/rocm
 ENV PATH /opt/rocm/bin:$PATH
 ENV PATH /opt/rocm/hcc/bin:$PATH
@ -89,32 +83,6 @@ COPY ./common/install_amdsmi.sh install_amdsmi.sh
 RUN bash ./install_amdsmi.sh
 RUN rm install_amdsmi.sh
 # (optional) Install UCC
 ARG UCX_COMMIT
 ARG UCC_COMMIT
 ENV UCX_COMMIT $UCX_COMMIT
 ENV UCC_COMMIT $UCC_COMMIT
 ENV UCX_HOME /usr
 ENV UCC_HOME /usr
 ADD ./common/install_ucc.sh install_ucc.sh
 RUN if [ -n "${UCX_COMMIT}" ] && [ -n "${UCC_COMMIT}" ]; then bash ./install_ucc.sh; fi
 RUN rm install_ucc.sh
 COPY ./common/install_openssl.sh install_openssl.sh
 ENV OPENSSL_ROOT_DIR /opt/openssl
 RUN bash ./install_openssl.sh
 ENV OPENSSL_DIR /opt/openssl
 ARG INDUCTOR_BENCHMARKS
 ARG ANACONDA_PYTHON_VERSION
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface.txt huggingface.txt
 COPY ci_commit_pins/timm.txt timm.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
 # (optional) Install non-default CMake version
 ARG CMAKE_VERSION
 COPY ./common/install_cmake.sh install_cmake.sh
@ -132,28 +100,26 @@ ARG TRITON
 # try to reach out to S3, which docker build runners don't have access
 COPY ./common/install_triton.sh install_triton.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/triton.txt triton.txt
+COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt
 COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
-RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
+RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
 # Install AOTriton
 COPY ./aotriton_version.txt aotriton_version.txt
 COPY ./common/common_utils.sh common_utils.sh
 COPY ./common/install_aotriton.sh install_aotriton.sh
 RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
 ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
 RUN bash ./install_cache.sh && rm install_cache.sh
 # Install Open MPI for ROCm
 COPY ./common/install_openmpi.sh install_openmpi.sh
 RUN if [ -n "${CUDA_VERSION}" ]; then bash install_openmpi.sh; fi
 RUN rm install_openmpi.sh
 # Include BUILD_ENVIRONMENT environment variable in image
 ARG BUILD_ENVIRONMENT
 ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
 # Install LLVM dev version (Defined in the pytorch/builder github repository)
 COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
 USER jenkins
 CMD ["bash"]
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -36,8 +36,7 @@ ENV DOCS=$DOCS
 COPY requirements-ci.txt requirements-docs.txt /opt/conda/
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ./common/install_magma_conda.sh install_magma_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt
 RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt
 RUN if [ -n "${UNINSTALL_DILL}" ]; then pip uninstall -y dill; fi
 # Install gcc
@ -88,6 +87,19 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
 RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
 ENV INSTALLED_VISION ${VISION}
 # (optional) Install Android NDK
 ARG ANDROID
 ARG ANDROID_NDK
 ARG GRADLE_VERSION
 COPY ./common/install_android.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
 COPY ./android/AndroidManifest.xml AndroidManifest.xml
 COPY ./android/build.gradle build.gradle
 RUN if [ -n "${ANDROID}" ]; then bash ./install_android.sh; fi
 RUN rm install_android.sh cache_vision_models.sh common_utils.sh
 RUN rm AndroidManifest.xml
 RUN rm build.gradle
 ENV INSTALLED_ANDROID ${ANDROID}
 # (optional) Install Vulkan SDK
 ARG VULKAN_SDK_VERSION
 COPY ./common/install_vulkan_sdk.sh install_vulkan_sdk.sh
@ -135,13 +147,6 @@ COPY ci_commit_pins/triton.txt triton.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton.txt
 ARG TRITON_CPU
 COPY ./common/install_triton.sh install_triton.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt
 RUN if [ -n "${TRITON_CPU}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton-cpu.txt
 ARG EXECUTORCH
 # Build and install executorch
 COPY ./common/install_executorch.sh install_executorch.sh
--- a/.ci/libtorch/build.sh
+++ b/.ci/libtorch/build.sh
@ -1,10 +0,0 @@
 #!/usr/bin/env bash
 # This is mostly just a shim to manywheel/build.sh
 # TODO: Make this a dedicated script to build just libtorch
 # -e: abort on the first failing command; -x: echo each command as it runs.
 set -ex
 # Absolute path of the directory containing this script, so the sibling
 # manywheel/build.sh can be located regardless of the caller's working directory.
 SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 # Delegate to the manywheel build with USE_CUSPARSELT=0, BUILD_PYTHONLESS=1 and
 # DESIRED_PYTHON="3.9" set for that single invocation only.
 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
--- a/.ci/magma/.gitignore
+++ b/.ci/magma/.gitignore
@ -1,2 +0,0 @@
 output/
 magma-cuda*/
--- a/.ci/magma/Makefile
+++ b/.ci/magma/Makefile
@ -1,49 +0,0 @@
 # Builds statically linked MAGMA tarballs, one make target per CUDA version.
 # Each target runs magma/build_magma.sh inside the matching
 # pytorch/manylinux2_28-builder image.
 SHELL=/usr/bin/env bash
 # Container runtime; override on the command line (e.g. DOCKER_CMD=podman).
 DOCKER_CMD ?= docker
 DESIRED_CUDA ?= 11.8
 # DESIRED_CUDA with the dots removed, e.g. 11.8 -> 118; used as the package-name suffix.
 DESIRED_CUDA_SHORT = $(subst .,,$(DESIRED_CUDA))
 PACKAGE_NAME = magma-cuda
 # Baseline -gencode flags shared by every target; targets append to it with `+=`.
 CUDA_ARCH_LIST ?= -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90
 # Recipe shared by all targets: mounts the repository's .ci directory at /builder
 # in the container and runs magma/build_magma.sh there with PACKAGE_NAME,
 # DESIRED_CUDA and CUDA_ARCH_LIST exported into the container environment.
 DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
 	-w /builder \
 	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \
 	-e DESIRED_CUDA=${DESIRED_CUDA} \
 	-e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \
 	"pytorch/manylinux2_28-builder:cuda${DESIRED_CUDA}-main" \
 	magma/build_magma.sh
 .PHONY: all
 all: magma-cuda128
 all: magma-cuda126
 all: magma-cuda124
 all: magma-cuda118
 # `clean` must be listed as a prerequisite of .PHONY; a bare `.PHONY:` declares
 # nothing, so a file or directory named `clean` would make the target a no-op.
 .PHONY: clean
 clean:
 	$(RM) -r magma-*
 	$(RM) -r output
 # Per-version targets: each overrides DESIRED_CUDA with a target-specific
 # variable and may append extra architectures to CUDA_ARCH_LIST.
 .PHONY: magma-cuda128
 magma-cuda128: DESIRED_CUDA := 12.8
 magma-cuda128: CUDA_ARCH_LIST += -gencode arch=compute_100,code=sm_100 -gencode arch=compute_120,code=sm_120
 magma-cuda128:
 	$(DOCKER_RUN)
 .PHONY: magma-cuda126
 magma-cuda126: DESIRED_CUDA := 12.6
 magma-cuda126:
 	$(DOCKER_RUN)
 .PHONY: magma-cuda124
 magma-cuda124: DESIRED_CUDA := 12.4
 magma-cuda124:
 	$(DOCKER_RUN)
 .PHONY: magma-cuda118
 magma-cuda118: DESIRED_CUDA := 11.8
 magma-cuda118: CUDA_ARCH_LIST += -gencode arch=compute_37,code=sm_37
 magma-cuda118:
 	$(DOCKER_RUN)
--- a/.ci/magma/README.md
+++ b/.ci/magma/README.md
@ -1,50 +0,0 @@
 # Magma
 This folder contains the scripts and configurations to build magma, statically linked for various versions of CUDA.
 ## Building
 Look in the `Makefile` for available targets to build. To build any target, for example `magma-cuda118`, run
 ```
 # Using `docker`
 make magma-cuda118
 # Using `podman`
 DOCKER_CMD=podman make magma-cuda118
 ```
 This spawns a `pytorch/manylinux-cuda<version>` docker image, which has the required `devtoolset` and CUDA versions installed.
 Within the docker image, it runs `build_magma.sh` with the correct environment variables set, which packages the necessary files
 into a tarball, with the following structure:
 ```
 .
 ├── include       # header files
 ├── lib           # libmagma.a
 ├── info
 │   ├── licenses  # license file
 │   └── recipe    # build script and patches
 ```
 More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the CUDA version.
 Outputted binaries should be in the `output` folder.
 ## Pushing
 Packages can be uploaded to an S3 bucket using:
 ```
 aws s3 cp output/*/magma-cuda*.bz2 <bucket-with-path>
 ```
 If you do not have upload permissions, please ping @seemethere or @soumith to gain access
 ## New versions
 New CUDA versions can be added by creating a new make target with the next desired version. For CUDA version NN.n, the target should be named `magma-cudaNNn`.
 Make sure to edit the appropriate environment variables (e.g., DESIRED_CUDA, CUDA_ARCH_LIST) in the `Makefile` accordingly. Remember also to check `build_magma.sh` to ensure the logic for copying over the files remains correct.
 New patches can be added by editing `Makefile` and `build_magma.sh` the same way `getrf_nbparam.patch` is implemented.
--- a/.ci/magma/build_magma.sh
+++ b/.ci/magma/build_magma.sh
@ -1,50 +0,0 @@
 #!/usr/bin/env bash
 # Downloads the MAGMA sources, applies the patches from package_files/, builds
 # them via package_files/build.sh and packs include/, lib/ and info/ into a
 # .tar.bz2 under output/linux-64.
 set -eou pipefail
 # Environment variables
 # The script expects DESIRED_CUDA and PACKAGE_NAME to be set
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
 MAGMA_VERSION=2.6.1
 # Folders for the build
 PACKAGE_FILES=${ROOT_DIR}/magma/package_files # source patches and metadata
 PACKAGE_DIR=${ROOT_DIR}/magma/${PACKAGE_NAME} # build workspace
 PACKAGE_OUTPUT=${ROOT_DIR}/magma/output # where tarballs are stored
 PACKAGE_BUILD=${PACKAGE_DIR}/build # where the content of the tarball is prepared
 PACKAGE_RECIPE=${PACKAGE_BUILD}/info/recipe
 PACKAGE_LICENSE=${PACKAGE_BUILD}/info/licenses
 mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RECIPE} ${PACKAGE_LICENSE}
 # Fetch magma sources and verify checksum
 pushd ${PACKAGE_DIR}
 curl -LO http://icl.utk.edu/projectsfiles/magma/downloads/magma-${MAGMA_VERSION}.tar.gz
 # Verify the download BEFORE unpacking it: the archive comes over plain HTTP, so
 # an unverified tarball must never be extracted. `set -e` aborts on a mismatch.
 sha256sum --check < ${PACKAGE_FILES}/magma-${MAGMA_VERSION}.sha256
 tar zxf magma-${MAGMA_VERSION}.tar.gz
 popd
 # Apply patches and build
 pushd ${PACKAGE_DIR}/magma-${MAGMA_VERSION}
 patch < ${PACKAGE_FILES}/CMake.patch
 patch < ${PACKAGE_FILES}/cmakelists.patch
 patch -p0 < ${PACKAGE_FILES}/thread_queue.patch
 patch -p1 < ${PACKAGE_FILES}/getrf_shfl.patch
 patch -p1 < ${PACKAGE_FILES}/getrf_nbparam.patch
 # The build.sh script expects to be executed from the sources root folder
 INSTALL_DIR=${PACKAGE_BUILD} ${PACKAGE_FILES}/build.sh
 popd
 # Package recipe, license and tarball
 # Folder and package name are backward compatible for the build workflow
 cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh
 cp ${PACKAGE_FILES}/thread_queue.patch ${PACKAGE_RECIPE}/thread_queue.patch
 cp ${PACKAGE_FILES}/cmakelists.patch ${PACKAGE_RECIPE}/cmakelists.patch
 cp ${PACKAGE_FILES}/getrf_shfl.patch ${PACKAGE_RECIPE}/getrf_shfl.patch
 cp ${PACKAGE_FILES}/getrf_nbparam.patch ${PACKAGE_RECIPE}/getrf_nbparam.patch
 cp ${PACKAGE_FILES}/CMake.patch ${PACKAGE_RECIPE}/CMake.patch
 cp ${PACKAGE_FILES}/magma-${MAGMA_VERSION}.sha256 ${PACKAGE_RECIPE}/magma-${MAGMA_VERSION}.sha256
 cp ${PACKAGE_DIR}/magma-${MAGMA_VERSION}/COPYRIGHT ${PACKAGE_LICENSE}/COPYRIGHT
 # Archive only the install tree (headers, static library, recipe/license info).
 pushd ${PACKAGE_BUILD}
 tar cjf ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2 include lib info
 echo Built in ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2
 popd
--- a/.ci/magma/package_files/CMake.patch
+++ b/.ci/magma/package_files/CMake.patch
@ -1,40 +0,0 @@
 --- CMake.src.cuda	2023-03-29 10:05:32.136954140 +0000
 +++ CMake.src.cuda	2023-03-29 10:05:50.281318043 +0000
@@ -283,10 +283,10 @@
 magmablas/zgeadd.cu
 magmablas/zgeadd2.cu
 magmablas/zgeam.cu
 -magmablas/zgemm_fermi.cu
 +#magmablas/zgemm_fermi.cu
 magmablas/zgemm_reduce.cu
 magmablas/zgemv_conj.cu
 -magmablas/zgemv_fermi.cu
 +#magmablas/zgemv_fermi.cu
 magmablas/zgerbt.cu
 magmablas/zgerbt_kernels.cu
 magmablas/zgetmatrix_transpose.cpp
@@ -1009,18 +1009,18 @@
 magmablas/sgeam.cu
 magmablas/dgeam.cu
 magmablas/cgeam.cu
 -magmablas/sgemm_fermi.cu
 -magmablas/dgemm_fermi.cu
 -magmablas/cgemm_fermi.cu
 +#magmablas/sgemm_fermi.cu
 +#magmablas/dgemm_fermi.cu
 +#magmablas/cgemm_fermi.cu
 magmablas/sgemm_reduce.cu
 magmablas/dgemm_reduce.cu
 magmablas/cgemm_reduce.cu
 magmablas/sgemv_conj.cu
 magmablas/dgemv_conj.cu
 magmablas/cgemv_conj.cu
 -magmablas/sgemv_fermi.cu
 -magmablas/dgemv_fermi.cu
 -magmablas/cgemv_fermi.cu
 +#magmablas/sgemv_fermi.cu
 +#magmablas/dgemv_fermi.cu
 +#magmablas/cgemv_fermi.cu
 magmablas/sgerbt.cu
 magmablas/dgerbt.cu
 magmablas/cgerbt.cu
--- a/.ci/magma/package_files/build.sh
+++ b/.ci/magma/package_files/build.sh
@ -1,12 +0,0 @@
 # Configures, builds and installs MAGMA with CMake from the current directory.
 # Uses DESIRED_CUDA, INSTALL_DIR and CUDA_ARCH_LIST, none of which are set here;
 # they are expected to be provided by the caller's environment.
 # Take the release number from the 4th line of `nvcc --version`: the 5th
 # space-separated field, with everything from the first comma onwards dropped.
 CUDA__VERSION=$(nvcc --version|sed -n 4p|cut -f5 -d" "|cut -f1 -d",")
 # Refuse to build when the toolkit on PATH is not the requested CUDA version.
 if [ "$CUDA__VERSION" != "$DESIRED_CUDA" ]; then
    echo "CUDA Version is not $DESIRED_CUDA. CUDA Version found: $CUDA__VERSION"
    exit 1
 fi
 # Out-of-source build in ./build.
 mkdir build
 cd build
 # Fortran disabled; GPU_TARGET="All" together with CUDA_ARCH_LIST selects the
 # architectures; the result is installed under INSTALL_DIR.
 cmake .. -DUSE_FORTRAN=OFF -DGPU_TARGET="All" -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" -DCUDA_ARCH_LIST="$CUDA_ARCH_LIST"
 # One make job per configured processor.
 make -j$(getconf _NPROCESSORS_CONF)
 make install
 cd ..
--- a/.ci/magma/package_files/cmakelists.patch
+++ b/.ci/magma/package_files/cmakelists.patch
@ -1,388 +0,0 @@
 diff --git a/CMakeLists.txt b/CMakeLists.txt
 index d5d8d87d..8a507334 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required( VERSION 2.8.1 )
 # ----------------------------------------
 # to disable Fortran, set this to "off"
 # see also -DADD_ below
 -option( USE_FORTRAN "Fortran is required for some tester checks, but can be disabled with reduced functionality" ON )
 +option( USE_FORTRAN "Fortran is required for some tester checks, but can be disabled with reduced functionality" OFF )
 if (USE_FORTRAN)
     project( MAGMA C CXX Fortran )
@@ -75,6 +75,8 @@ else()
     message( WARNING "The compiler ${CMAKE_CXX_COMPILER} doesn't support the -std=c++11 flag. Some code may not compile.")
 endif()
 +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++ -fno-exceptions")
 +
 CHECK_C_COMPILER_FLAG("-std=c99" COMPILER_SUPPORTS_C99)
 if (COMPILER_SUPPORTS_C99)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
@@ -101,15 +103,15 @@ endif()
 # ----------------------------------------
 -# locate OpenMP
 -find_package( OpenMP )
 -if (OPENMP_FOUND)
 -    message( STATUS "Found OpenMP" )
 -    message( STATUS "    OpenMP_C_FLAGS   ${OpenMP_C_FLAGS}" )
 -    message( STATUS "    OpenMP_CXX_FLAGS ${OpenMP_CXX_FLAGS}" )
 -    set( CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}" )
 -    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}" )
 -endif()
 +# # locate OpenMP
 +# find_package( OpenMP )
 +# if (OPENMP_FOUND)
 +#     message( STATUS "Found OpenMP" )
 +#     message( STATUS "    OpenMP_C_FLAGS   ${OpenMP_C_FLAGS}" )
 +#     message( STATUS "    OpenMP_CXX_FLAGS ${OpenMP_CXX_FLAGS}" )
 +#     set( CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}" )
 +#     set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}" )
 +# endif()
 if (MAGMA_ENABLE_CUDA)
   # ----------------------------------------
@@ -132,7 +134,7 @@ if (MAGMA_ENABLE_CUDA)
     set( NV_SM    "" )
     set( NV_COMP  "" )
 -    set(CUDA_SEPARABLE_COMPILATION ON)
 +    set(CUDA_SEPARABLE_COMPILATION OFF)
     # nvcc >= 6.5 supports -std=c++11, so propagate CXXFLAGS to NVCCFLAGS.
     # Older nvcc didn't support -std=c++11, so previously we disabled propagation.
@@ -294,11 +296,18 @@ if (MAGMA_ENABLE_CUDA)
         message( STATUS "    compile for CUDA arch 8.0 (Ampere)" )
     endif()
 +    if ( ${GPU_TARGET} MATCHES "All")
 +        set( MIN_ARCH 370)
 +        SET( NV_SM ${CUDA_ARCH_LIST})
 +        SET( NV_COMP "")
 +    endif()
 +
     if (NOT MIN_ARCH)
         message( FATAL_ERROR "GPU_TARGET must contain one or more of Fermi, Kepler, Maxwell, Pascal, Volta, Turing, Ampere, or valid sm_[0-9][0-9]" )
     endif()
 -    set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC ${NV_SM} ${NV_COMP} ${FORTRAN_CONVENTION} )
 +    set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -DHAVE_CUBLAS -Xfatbin -compress-all -Xcompiler -fPIC -std=c++11 ${NV_SM} ${NV_COMP} ${FORTRAN_CONVENTION} )
 +    MESSAGE(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
     #add_definitions( "-DMAGMA_HAVE_CUDA -DMAGMA_CUDA_ARCH_MIN=${MIN_ARCH}" )
     set(MAGMA_HAVE_CUDA "1")
     set(MAGMA_CUDA_ARCH_MIN "${MIN_ARCH}")
@@ -413,7 +422,7 @@ set_property(CACHE BLA_VENDOR PROPERTY STRINGS
 set( LAPACK_LIBRARIES "" CACHE STRING "Libraries for LAPACK and BLAS, to manually override search" )
 if (LAPACK_LIBRARIES STREQUAL "")
     message( STATUS "Searching for BLAS and LAPACK. To override, set LAPACK_LIBRARIES using ccmake." )
 -    find_package( LAPACK )
 +    # find_package( LAPACK )
     # force showing updated LAPACK_LIBRARIES in ccmake / cmake-gui.
     set( LAPACK_LIBRARIES ${LAPACK_LIBRARIES} CACHE STRING "Libraries for LAPACK and BLAS, to manually override search" FORCE )
 else()
@@ -552,12 +561,12 @@ if (WIN32)
     #message( "libmagma_all_f   ${libmagma_all_f}"   )
     # on Windows, Fortran files aren't compiled if listed here...
 -    cuda_add_library( magma ${libmagma_all_cpp} )
 +    cuda_add_library( magma STATIC ${libmagma_all_cpp} OPTIONS --compiler-options "-fPIC")
     target_link_libraries( magma
         ${LAPACK_LIBRARIES}
         ${CUDA_CUDART_LIBRARY}
         ${CUDA_CUBLAS_LIBRARIES}
 -        ${CUDA_cusparse_LIBRARY}
 +        # ${CUDA_cusparse_LIBRARY}
     )
     # no Fortran files at the moment (how to test libmagma_all_f is not empty?),
@@ -575,13 +584,13 @@ if (WIN32)
 else()
     # Unix doesn't seem to have a problem with mixing C, CUDA, and Fortran files
     if (MAGMA_ENABLE_CUDA)
 -      cuda_add_library( magma ${libmagma_all} )
 +      cuda_add_library( magma STATIC ${libmagma_all} OPTIONS --compiler-options "-fPIC")
       target_link_libraries( magma
         ${blas_fix}
         ${LAPACK_LIBRARIES}
         ${CUDA_CUDART_LIBRARY}
         ${CUDA_CUBLAS_LIBRARIES}
 -        ${CUDA_cusparse_LIBRARY}
 +        # ${CUDA_cusparse_LIBRARY}
 	)
     else()
       find_package( hipBLAS )
@@ -614,138 +623,139 @@ else()
     endif()
 endif()
 add_custom_target( lib DEPENDS magma )
 -
 -
 -# ----------------------------------------
 -# compile lapacktest library
 -# If use fortran, compile only Fortran files, not magma_[sdcz]_no_fortran.cpp
 -# else,           compile only C++     files, not Fortran files
 -if (USE_FORTRAN)
 -    foreach( filename ${liblapacktest_all} )
 -        if (filename MATCHES "\\.(f|f90|F90)$")
 -            list( APPEND liblapacktest_all_f ${filename} )
 -        endif()
 -    endforeach()
 -    add_library( lapacktest ${liblapacktest_all_f} )
 -else()
 -    # alternatively, use only C/C++/CUDA files, including magma_[sdcz]_no_fortran.cpp
 -    foreach( filename ${liblapacktest_all} )
 -        if (filename MATCHES "\\.(c|cu|cpp)$")
 -            list( APPEND liblapacktest_all_cpp ${filename} )
 -        endif()
 -    endforeach()
 -    add_library( lapacktest ${liblapacktest_all_cpp} )
 -endif()
 -target_link_libraries( lapacktest
 -    ${blas_fix}
 -    ${LAPACK_LIBRARIES}
 -)
 -
 -
 -# ----------------------------------------
 -# compile tester library
 -add_library( tester ${libtest_all} )
 -target_link_libraries( tester
 -    magma
 -    lapacktest
 -    ${blas_fix}
 -    ${LAPACK_LIBRARIES}
 -)
 +set_target_properties(magma PROPERTIES POSITION_INDEPENDENT_CODE ON)
 +
 +
 +# # ----------------------------------------
 +# # compile lapacktest library
 +# # If use fortran, compile only Fortran files, not magma_[sdcz]_no_fortran.cpp
 +# # else,           compile only C++     files, not Fortran files
 +# if (USE_FORTRAN)
 +#     foreach( filename ${liblapacktest_all} )
 +#         if (filename MATCHES "\\.(f|f90|F90)$")
 +#             list( APPEND liblapacktest_all_f ${filename} )
 +#         endif()
 +#     endforeach()
 +#     add_library( lapacktest ${liblapacktest_all_f} )
 +# else()
 +#     # alternatively, use only C/C++/CUDA files, including magma_[sdcz]_no_fortran.cpp
 +#     foreach( filename ${liblapacktest_all} )
 +#         if (filename MATCHES "\\.(c|cu|cpp)$")
 +#             list( APPEND liblapacktest_all_cpp ${filename} )
 +#         endif()
 +#     endforeach()
 +#     add_library( lapacktest ${liblapacktest_all_cpp} )
 +# endif()
 +# target_link_libraries( lapacktest
 +#     ${blas_fix}
 +#     ${LAPACK_LIBRARIES}
 +# )
 +
 +
 +# # ----------------------------------------
 +# # compile tester library
 +# add_library( tester ${libtest_all} )
 +# target_link_libraries( tester
 +#     magma
 +#     lapacktest
 +#     ${blas_fix}
 +#     ${LAPACK_LIBRARIES}
 +# )
 # ----------------------------------------
 # compile MAGMA sparse library
 # sparse doesn't have Fortran at the moment, so no need for above shenanigans
 -if (MAGMA_ENABLE_CUDA)
 -  include_directories( sparse/include )
 -  include_directories( sparse/control )
 -else()
 -  include_directories( sparse_hip/include )
 -  include_directories( sparse_hip/control )
 -endif()
 -include_directories( testing )
 -
 -if (MAGMA_ENABLE_CUDA)
 -  cuda_add_library( magma_sparse ${libsparse_all} )
 -  target_link_libraries( magma_sparse
 -    magma
 -    ${blas_fix}
 -    ${LAPACK_LIBRARIES}
 -    ${CUDA_CUDART_LIBRARY}
 -    ${CUDA_CUBLAS_LIBRARIES}
 -    ${CUDA_cusparse_LIBRARY}
 -    )
 -else()
 -  add_library( magma_sparse ${libsparse_all} )
 -  target_link_libraries( magma_sparse
 -    magma
 -    ${blas_fix}
 -    ${LAPACK_LIBRARIES}
 -    hip::device
 -    roc::hipblas
 -    roc::hipsparse
 -    )
 -endif()
 -add_custom_target( sparse-lib DEPENDS magma_sparse )
 -
 -
 -# ----------------------------------------
 -# compile each tester
 -
 -# save testers to testing/
 -# save tester lib files to testing_lib/ to avoid cluttering lib/
 -set( CMAKE_RUNTIME_OUTPUT_DIRECTORY testing )
 -set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY testing_lib )
 -set( CMAKE_LIBRARY_OUTPUT_DIRECTORY testing_lib )
 -
 -# skip Fortran testers, which require an extra file from CUDA
 -foreach( filename ${testing_all} )
 -    if (filename MATCHES "\\.(c|cu|cpp)$")
 -        list( APPEND testing_all_cpp ${filename} )
 -    endif()
 -endforeach()
 -foreach( TEST ${testing_all_cpp} )
 -    string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} )
 -    string( REGEX REPLACE "testing/" "" EXE ${EXE} )
 -    #message( "${TEST} --> ${EXE}" )
 -    add_executable( ${EXE} ${TEST} )
 -    target_link_libraries( ${EXE} tester lapacktest magma )
 -    list( APPEND testing ${EXE} )
 -endforeach()
 -add_custom_target( testing DEPENDS ${testing} )
 -
 -
 -# ----------------------------------------
 -# compile each sparse tester
 -
 -if (MAGMA_ENABLE_CUDA)
 -  set(SPARSE_TEST_DIR "sparse/testing")
 -else()
 -  set(SPARSE_TEST_DIR "sparse_hip/testing")
 -endif()
 -
 -
 -set( CMAKE_RUNTIME_OUTPUT_DIRECTORY "${SPARSE_TEST_DIR}" )
 -cmake_policy( SET CMP0037 OLD)
 -foreach( TEST ${sparse_testing_all} )
 -    string( REGEX REPLACE "\\.(cpp|f90|F90)"     "" EXE ${TEST} )
 -    string( REGEX REPLACE "${SPARSE_TEST_DIR}/" "" EXE ${EXE} )
 -    #message( "${TEST} --> ${EXE}" )
 -    add_executable( ${EXE} ${TEST} )
 -    target_link_libraries( ${EXE} magma_sparse magma )
 -    list( APPEND sparse-testing ${EXE} )
 -endforeach()
 -add_custom_target( sparse-testing DEPENDS ${sparse-testing} )
 +# if (MAGMA_ENABLE_CUDA)
 +#   include_directories( sparse/include )
 +#   include_directories( sparse/control )
 +# else()
 +#   include_directories( sparse_hip/include )
 +#   include_directories( sparse_hip/control )
 +# endif()
 +# include_directories( testing )
 +
 +# if (MAGMA_ENABLE_CUDA)
 +#   cuda_add_library( magma_sparse ${libsparse_all} )
 +#   target_link_libraries( magma_sparse
 +#     magma
 +#     ${blas_fix}
 +#     ${LAPACK_LIBRARIES}
 +#     ${CUDA_CUDART_LIBRARY}
 +#     ${CUDA_CUBLAS_LIBRARIES}
 +#     ${CUDA_cusparse_LIBRARY}
 +#     )
 +# else()
 +#   add_library( magma_sparse ${libsparse_all} )
 +#   target_link_libraries( magma_sparse
 +#     magma
 +#     ${blas_fix}
 +#     ${LAPACK_LIBRARIES}
 +#     hip::device
 +#     roc::hipblas
 +#     roc::hipsparse
 +#     )
 +# endif()
 +# add_custom_target( sparse-lib DEPENDS magma_sparse )
 +
 +
 +# # ----------------------------------------
 +# # compile each tester
 +
 +# # save testers to testing/
 +# # save tester lib files to testing_lib/ to avoid cluttering lib/
 +# set( CMAKE_RUNTIME_OUTPUT_DIRECTORY testing )
 +# set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY testing_lib )
 +# set( CMAKE_LIBRARY_OUTPUT_DIRECTORY testing_lib )
 +
 +# # skip Fortran testers, which require an extra file from CUDA
 +# foreach( filename ${testing_all} )
 +#     if (filename MATCHES "\\.(c|cu|cpp)$")
 +#         list( APPEND testing_all_cpp ${filename} )
 +#     endif()
 +# endforeach()
 +# foreach( TEST ${testing_all_cpp} )
 +#     string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} )
 +#     string( REGEX REPLACE "testing/" "" EXE ${EXE} )
 +#     #message( "${TEST} --> ${EXE}" )
 +#     add_executable( ${EXE} ${TEST} )
 +#     target_link_libraries( ${EXE} tester lapacktest magma )
 +#     list( APPEND testing ${EXE} )
 +# endforeach()
 +# add_custom_target( testing DEPENDS ${testing} )
 +
 +
 +# # ----------------------------------------
 +# # compile each sparse tester
 +
 +# if (MAGMA_ENABLE_CUDA)
 +#   set(SPARSE_TEST_DIR "sparse/testing")
 +# else()
 +#   set(SPARSE_TEST_DIR "sparse_hip/testing")
 +# endif()
 +
 +
 +# set( CMAKE_RUNTIME_OUTPUT_DIRECTORY "${SPARSE_TEST_DIR}" )
 +# cmake_policy( SET CMP0037 OLD)
 +# foreach( TEST ${sparse_testing_all} )
 +#     string( REGEX REPLACE "\\.(cpp|f90|F90)"     "" EXE ${TEST} )
 +#     string( REGEX REPLACE "${SPARSE_TEST_DIR}/" "" EXE ${EXE} )
 +#     #message( "${TEST} --> ${EXE}" )
 +#     add_executable( ${EXE} ${TEST} )
 +#     target_link_libraries( ${EXE} magma_sparse magma )
 +#     list( APPEND sparse-testing ${EXE} )
 +# endforeach()
 +# add_custom_target( sparse-testing DEPENDS ${sparse-testing} )
 # ----------------------------------------
 # what to install
 -install( TARGETS magma magma_sparse ${blas_fix}
 +install( TARGETS magma ${blas_fix}
          RUNTIME DESTINATION bin
          LIBRARY DESTINATION lib
          ARCHIVE DESTINATION lib )
 -file( GLOB headers include/*.h sparse/include/*.h "${CMAKE_BINARY_DIR}/include/*.h" )
 +file( GLOB headers include/*.h "${CMAKE_BINARY_DIR}/include/*.h" )
 if (USE_FORTRAN)
     install( FILES ${headers} ${modules}
              DESTINATION include )
@@ -769,9 +779,9 @@ else()
     "${blas_fix_lib} ${LAPACK_LIBS} hip::device roc::hipblas roc::hipsparse" )
 endif()
 set( MAGMA_REQUIRED "" )
 -configure_file( "${pkgconfig}.in" "${pkgconfig}" @ONLY )
 -install( FILES "${CMAKE_BINARY_DIR}/${pkgconfig}"
 -         DESTINATION lib/pkgconfig )
 +# configure_file( "${pkgconfig}.in" "${pkgconfig}" @ONLY )
 +# install( FILES "${CMAKE_BINARY_DIR}/${pkgconfig}"
 +#          DESTINATION lib/pkgconfig )
 # ----------------------------------------
 get_directory_property( compile_definitions COMPILE_DEFINITIONS )
--- a/.ci/magma/package_files/getrf_nbparam.patch
+++ b/.ci/magma/package_files/getrf_nbparam.patch
@ -1,40 +0,0 @@
 diff --git a/control/get_batched_crossover.cpp b/control/get_batched_crossover.cpp
 index 4ec57306..912f8608 100644
 --- a/control/get_batched_crossover.cpp
 +++ b/control/get_batched_crossover.cpp
@@ -119,7 +119,7 @@ void magma_get_spotrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
 void magma_get_zgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
 {
     *nb    = 64;
 -    *recnb = 32;
 +    *recnb = 16;
     return;
 }
@@ -127,7 +127,7 @@ void magma_get_zgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
 void magma_get_cgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
 {
     *nb    = 128;
 -    *recnb =  32;
 +    *recnb =  16;
     return;
 }
@@ -135,7 +135,7 @@ void magma_get_cgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
 void magma_get_dgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
 {
     *nb    = 128;
 -    *recnb =  32;
 +    *recnb =  16;
     return;
 }
@@ -143,7 +143,7 @@ void magma_get_dgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
 void magma_get_sgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
 {
     *nb    = 128;
 -    *recnb =  32;
 +    *recnb =  16;
     return;
 }
--- a/.ci/magma/package_files/getrf_shfl.patch
+++ b/.ci/magma/package_files/getrf_shfl.patch
@ -1,15 +0,0 @@
 diff --git a/src/zgetrf_batched.cpp b/src/zgetrf_batched.cpp
 index 24a65a90..884d9352 100644
 --- a/src/zgetrf_batched.cpp
 +++ b/src/zgetrf_batched.cpp
@@ -116,7 +116,9 @@ magma_zgetrf_batched(
             return magma_zgetrf_batched_smallsq_noshfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
         }
         else{
 -            return magma_zgetrf_batched_smallsq_shfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
 +            // magma_cgetrf_batched_smallsq_shfl is broken, therefore let's call noshfl version for arch < 700
 +            // return magma_zgetrf_batched_smallsq_shfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
 +            return magma_zgetrf_batched_smallsq_noshfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
         }
         #else
         return magma_zgetrf_batched_smallsq_noshfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
--- a/.ci/magma/package_files/magma-2.6.1.sha256
+++ b/.ci/magma/package_files/magma-2.6.1.sha256
@ -1 +0,0 @@
 6cd83808c6e8bc7a44028e05112b3ab4e579bcc73202ed14733f66661127e213  magma-2.6.1.tar.gz
--- a/.ci/magma/package_files/thread_queue.patch
+++ b/.ci/magma/package_files/thread_queue.patch
@ -1,20 +0,0 @@
 --- control/thread_queue.cpp	2016-08-30 06:37:49.000000000 -0700
 +++ control/thread_queue.cpp	2016-10-10 19:47:28.911580965 -0700
@@ -15,7 +15,7 @@
 {
     if ( err != 0 ) {
         fprintf( stderr, "Error: %s (%d)\n", strerror(err), err );
 -        throw std::exception();
 +        // throw std::exception();
     }
 }
@@ -172,7 +172,7 @@
     check( pthread_mutex_lock( &mutex ));
     if ( quit_flag ) {
         fprintf( stderr, "Error: push_task() called after quit()\n" );
 -        throw std::exception();
 +        // throw std::exception();
     }
     q.push( task );
     ntask += 1;
--- a/.ci/manywheel/LICENSE
+++ b/.ci/manywheel/LICENSE
@ -1,21 +0,0 @@
 The MIT License (MIT)
 Copyright (c) 2016 manylinux
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/.ci/manywheel/build.sh
+++ b/.ci/manywheel/build.sh
@ -1,28 +0,0 @@
 #!/usr/bin/env bash
 set -ex
 SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 # Hand off to the build script that matches GPU_ARCH_TYPE. An unset or empty
 # GPU_ARCH_TYPE is treated as "BLANK" and takes the CUDA path; any value not
 # listed below aborts with exit status 1.
 case "${GPU_ARCH_TYPE:-BLANK}" in
    # BLANK: legacy behavior for CircleCI
    BLANK | cuda) bash "${SCRIPTPATH}/build_cuda.sh" ;;
    rocm) bash "${SCRIPTPATH}/build_rocm.sh" ;;
    cpu | cpu-cxx11-abi | cpu-s390x) bash "${SCRIPTPATH}/build_cpu.sh" ;;
    xpu) bash "${SCRIPTPATH}/build_xpu.sh" ;;
    *)
        echo "Un-recognized GPU_ARCH_TYPE '${GPU_ARCH_TYPE}', exiting..."
        exit 1
        ;;
 esac
--- a/.ci/manywheel/build_common.sh
+++ b/.ci/manywheel/build_common.sh
@ -1,498 +0,0 @@
 #!/usr/bin/env bash
 # meant to be called only from the neighboring build.sh and build_cpu.sh scripts
 set -ex
 SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
 source ${SOURCE_DIR}/set_desired_python.sh
 if [[ -n "$BUILD_PYTHONLESS" && -z "$LIBTORCH_VARIANT" ]]; then
    echo "BUILD_PYTHONLESS is set, so need LIBTORCH_VARIANT to also be set"
    echo "LIBTORCH_VARIANT should be one of shared-with-deps shared-without-deps static-with-deps static-without-deps"
    exit 1
 fi
 # Retry a command that sometimes times out or fails flakily.
 # The command is attempted up to five times, sleeping 1, 2, 4 and 8 seconds
 # before the second through fifth attempts; the exit status is that of the
 # last attempt made.
 # Arguments: the command and its arguments, exactly as they should be run.
 # "$@" (instead of an unquoted $*) keeps every argument intact, so arguments
 # containing spaces or glob characters reach the command unchanged.
 retry () {
    "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") || (sleep 4 && "$@") || (sleep 8 && "$@")
 }
 PLATFORM="manylinux2014_x86_64"
 # TODO move this into the Docker images
 OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
 if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
    retry yum install -q -y zip openssl
 elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    retry yum install -q -y zip openssl
    PLATFORM="manylinux_2_28_x86_64"
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
    retry dnf install -q -y zip openssl
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
    # TODO: Remove this once nvidia package repos are back online
    # Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968
    # shellcheck disable=SC2046
    sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list")
    retry apt-get update
    retry apt-get -y install zip openssl
 fi
 # We use the package name to test the package by passing this to 'pip install'
 # This is the env variable that setup.py uses to name the package. Note that
 # pip 'normalizes' the name first by changing all - to _
 if [[ -z "$TORCH_PACKAGE_NAME" ]]; then
    TORCH_PACKAGE_NAME='torch'
 fi
 if [[ -z "$TORCH_NO_PYTHON_PACKAGE_NAME" ]]; then
    TORCH_NO_PYTHON_PACKAGE_NAME='torch_no_python'
 fi
 TORCH_PACKAGE_NAME="$(echo $TORCH_PACKAGE_NAME | tr '-' '_')"
 TORCH_NO_PYTHON_PACKAGE_NAME="$(echo $TORCH_NO_PYTHON_PACKAGE_NAME | tr '-' '_')"
 echo "Expecting the built wheels to all be called '$TORCH_PACKAGE_NAME' or '$TORCH_NO_PYTHON_PACKAGE_NAME'"
 # Version: setup.py uses $PYTORCH_BUILD_VERSION.post$PYTORCH_BUILD_NUMBER if
 # PYTORCH_BUILD_NUMBER > 1
 build_version="$PYTORCH_BUILD_VERSION"
 build_number="$PYTORCH_BUILD_NUMBER"
 if [[ -n "$OVERRIDE_PACKAGE_VERSION" ]]; then
    # This will be the *exact* version, since build_number<1
    build_version="$OVERRIDE_PACKAGE_VERSION"
    build_number=0
 fi
 if [[ -z "$build_version" ]]; then
    build_version=1.0.0
 fi
 if [[ -z "$build_number" ]]; then
    build_number=1
 fi
 export PYTORCH_BUILD_VERSION=$build_version
 export PYTORCH_BUILD_NUMBER=$build_number
 export CMAKE_LIBRARY_PATH="/opt/intel/lib:/lib:$CMAKE_LIBRARY_PATH"
 export CMAKE_INCLUDE_PATH="/opt/intel/include:$CMAKE_INCLUDE_PATH"
 if [[ -e /opt/openssl ]]; then
    export OPENSSL_ROOT_DIR=/opt/openssl
    export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH
 fi
 mkdir -p /tmp/$WHEELHOUSE_DIR
 export PATCHELF_BIN=/usr/local/bin/patchelf
 patchelf_version=$($PATCHELF_BIN --version)
 echo "patchelf version: " $patchelf_version
 if [[ "$patchelf_version" == "patchelf 0.9" ]]; then
    echo "Your patchelf version is too old. Please use version >= 0.10."
    exit 1
 fi
 ########################################################
 # Compile wheels as well as libtorch
 #######################################################
 if [[ -z "$PYTORCH_ROOT" ]]; then
    echo "Need to set PYTORCH_ROOT env variable"
    exit 1
 fi
 pushd "$PYTORCH_ROOT"
 python setup.py clean
 retry pip install -qr requirements.txt
 case ${DESIRED_PYTHON} in
  cp31*)
    retry pip install -q --pre numpy==2.1.0
    ;;
  # Should catch 3.9+
  *)
    retry pip install -q --pre numpy==2.0.2
    ;;
 esac
 if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
    export _GLIBCXX_USE_CXX11_ABI=1
 else
    export _GLIBCXX_USE_CXX11_ABI=0
 fi
 if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
    echo "Calling build_amd.py at $(date)"
    python tools/amd_build/build_amd.py
 fi
 # This value comes from binary_linux_build.sh (and should only be set to true
 # for master / release branches)
 BUILD_DEBUG_INFO=${BUILD_DEBUG_INFO:=0}
 if [[ $BUILD_DEBUG_INFO == "1" ]]; then
    echo "Building wheel and debug info"
 else
    echo "BUILD_DEBUG_INFO was not set, skipping debug info"
 fi
 if [[ "$DISABLE_RCCL" = 1 ]]; then
    echo "Disabling NCCL/RCCL in pyTorch"
    USE_RCCL=0
    USE_NCCL=0
    USE_KINETO=0
 else
    USE_RCCL=1
    USE_NCCL=1
    USE_KINETO=1
 fi
 echo "Calling setup.py bdist at $(date)"
 if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
    echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
    time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
    BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 \
    BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
    USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
    python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
    echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
    echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
    time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
    BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \
    BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
    USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
    python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR --cmake
    echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
 else
    time CMAKE_ARGS=${CMAKE_ARGS[@]} \
        EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
        BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
        USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
        python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
 fi
 echo "Finished setup.py bdist at $(date)"
 # Build libtorch packages
 if [[ -n "$BUILD_PYTHONLESS" ]]; then
    # Now build pythonless libtorch
    # Note - just use whichever python we happen to be on
    python setup.py clean
    if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
        STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
    fi
    mkdir -p build
    pushd build
    echo "Calling tools/build_libtorch.py at $(date)"
    time CMAKE_ARGS=${CMAKE_ARGS[@]} \
         EXTRA_CAFFE2_CMAKE_FLAGS="${EXTRA_CAFFE2_CMAKE_FLAGS[@]} $STATIC_CMAKE_FLAG" \
         python ../tools/build_libtorch.py
    echo "Finished tools/build_libtorch.py at $(date)"
    popd
    mkdir -p libtorch/{lib,bin,include,share}
    cp -r build/build/lib libtorch/
    # for now, the headers for the libtorch package will just be copied in
    # from one of the wheels (this is from when this script built multiple
    # wheels at once)
    ANY_WHEEL=$(ls /tmp/$WHEELHOUSE_DIR/torch*.whl | head -n1)
    unzip -d any_wheel $ANY_WHEEL
    if [[ -d any_wheel/torch/include ]]; then
        cp -r any_wheel/torch/include libtorch/
    else
        cp -r any_wheel/torch/lib/include libtorch/
    fi
    cp -r any_wheel/torch/share/cmake libtorch/share/
    rm -rf any_wheel
    echo $PYTORCH_BUILD_VERSION > libtorch/build-version
    echo "$(pushd $PYTORCH_ROOT && git rev-parse HEAD)" > libtorch/build-hash
    mkdir -p /tmp/$LIBTORCH_HOUSE_DIR
    if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
        LIBTORCH_ABI="cxx11-abi-"
    else
        LIBTORCH_ABI=
    fi
    zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch
    cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \
       /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip
 fi
 popd
 #######################################################################
 # ADD DEPENDENCIES INTO THE WHEEL
 #
 # auditwheel repair doesn't work correctly and is buggy
 # so manually do the work of copying dependency libs and patchelfing
 # and fixing RECORDS entries correctly
 ######################################################################
 # Print the name a bundled library should be renamed to: the same directory
 # and file name with the first 8 hex digits of the file's SHA-256 inserted
 # before the first dot (dir/libfoo.so.1 -> dir/libfoo-<hash>.so.1).
 # Files named libnvrtc-builtins.s*, libcudnn* or libcublas* are printed
 # unchanged.
 # $1: path to the library file.
 fname_with_sha256() {
    # All expansions are quoted and all scratch variables are local, so paths
    # with spaces work and nothing leaks into the caller's scope.
    local path=$1
    local dir base hash
    dir=$(dirname "$path")
    base=$(basename "$path")
    # Do not rename nvrtc-builtins.so as they are dynamically loaded
    # by libnvrtc.so
    # Similarly don't mangle libcudnn and libcublas library names
    if [[ $base == "libnvrtc-builtins.s"* || $base == "libcudnn"* || $base == "libcublas"* ]]; then
        echo "$path"
        return
    fi
    hash=$(sha256sum "$path" | cut -c1-8)
    # ${base%%.*} is the part before the first dot, ${base#*.} the part after
    # it; for a name without any dot both expand to the whole name.
    echo "$dir/${base%%.*}-$hash.${base#*.}"
 }
 # Print $1 with everything after its first ".so" removed
 # (libfoo.so.1.2 -> libfoo.so). Names without ".so" are printed as-is.
 fname_without_so_number() {
    LINKNAME=$(sed -e 's/\.so.*/.so/g' <<< "$(echo $1)")
    echo "$LINKNAME"
 }
 # Print one wheel RECORD line for the file at $1:
 #   "<path>",sha256=<digest>,<size>
 # <digest> is the base64-encoded SHA-256 of the file with '+' replaced by '-',
 # '/' replaced by '_' and '=' padding removed; <size> is the size column
 # reported by ls.
 # A path containing "RECORD" is printed as  "<path>",,  with no hash or size.
 # $1: path of the file, relative to the directory the record is written for.
 make_wheel_record() {
    local fpath=$1
    local digest fsize
    if [[ "$fpath" == *RECORD* ]]; then
        # the RECORD file itself is listed without hash and size
        echo "\"$fpath\",,"
        return
    fi
    # The path is quoted everywhere: the caller passes it as a single quoted
    # argument, so it may legitimately contain spaces.
    digest=$(openssl dgst -sha256 -binary "$fpath" | openssl base64 | tr '+/' '-_' | tr -d '=')
    fsize=$(ls -nl "$fpath" | awk '{print $5}')
    echo "\"$fpath\",sha256=$digest,$fsize"
 }
 # Rewrite NEEDED entries of every shared object found under a directory.
 # $1: directory searched (recursively) for files matching '*.so*'
 # $2: original library name to look for among each file's NEEDED entries
 # $3: name the matching entry is replaced with
 # Uses $PATCHELF_BIN; reads $DESIRED_CUDA to force the rewrite on rocm builds.
 replace_needed_sofiles() {
    find $1 -name '*.so*' | while read sofile; do
        # Reset on every iteration: origname is overwritten below with the
        # entry that grep actually matched.
        origname=$2
        patchedname=$3
        # Skip files when the name is unchanged, except when DESIRED_CUDA
        # contains "rocm", where the lookup is always performed.
        if [[ "$origname" != "$patchedname" ]] || [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
            # grep exits non-zero when nothing matches; the script runs under
            # `set -e`, so errexit is switched off around the lookup.
            set +e
            origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
            ERRCODE=$?
            set -e
            # Only patch files that actually list the library as NEEDED.
            # NOTE(review): if several NEEDED entries match, origname holds
            # several lines and expands unquoted into multiple arguments
            # below - confirm this cannot happen for the DEPS_SONAME values.
            if [ "$ERRCODE" -eq "0" ]; then
                echo "patching $sofile entry $origname to $patchedname"
                $PATCHELF_BIN --replace-needed $origname $patchedname $sofile
            fi
        fi
    done
 }
 echo 'Built this wheel:'
 ls /tmp/$WHEELHOUSE_DIR
 mkdir -p "/$WHEELHOUSE_DIR"
 mv /tmp/$WHEELHOUSE_DIR/torch*linux*.whl /$WHEELHOUSE_DIR/
 if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
    mv /tmp/$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/ || true
 fi
 if [[ -n "$BUILD_PYTHONLESS" ]]; then
    mkdir -p /$LIBTORCH_HOUSE_DIR
    mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR
    rm -rf /tmp/$LIBTORCH_HOUSE_DIR
 fi
 rm -rf /tmp/$WHEELHOUSE_DIR
 rm -rf /tmp_dir
 mkdir /tmp_dir
 pushd /tmp_dir
 for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.whl /$LIBTORCH_HOUSE_DIR/libtorch*.zip; do
    # if the glob didn't match anything
    if [[ ! -e $pkg ]]; then
        continue
    fi
    rm -rf tmp
    mkdir -p tmp
    cd tmp
    cp $pkg .
    unzip -q $(basename $pkg)
    rm -f $(basename $pkg)
    if [[ -d torch ]]; then
        PREFIX=torch
    else
        PREFIX=libtorch
    fi
    if [[ $pkg != *"without-deps"* ]]; then
        # copy over needed dependent .so files over and tag them with their hash
        patched=()
        for filepath in "${DEPS_LIST[@]}"; do
            filename=$(basename $filepath)
            destpath=$PREFIX/lib/$filename
            if [[ "$filepath" != "$destpath" ]]; then
                cp $filepath $destpath
            fi
            # ROCm workaround for roctracer dlopens
            if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
                patchedpath=$(fname_without_so_number $destpath)
            # Keep the .so version number for XPU dependencies and libgomp.so.1 so they are not loaded twice
            elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
                patchedpath=$destpath
            else
                patchedpath=$(fname_with_sha256 $destpath)
            fi
            patchedname=$(basename $patchedpath)
            if [[ "$destpath" != "$patchedpath" ]]; then
                mv $destpath $patchedpath
            fi
            patched+=("$patchedname")
            echo "Copied $filepath to $patchedpath"
        done
        echo "patching to fix the so names to the hashed names"
        for ((i=0;i<${#DEPS_LIST[@]};++i)); do
            replace_needed_sofiles $PREFIX ${DEPS_SONAME[i]} ${patched[i]}
            # do the same for caffe2, if it exists
            if [[ -d caffe2 ]]; then
                replace_needed_sofiles caffe2 ${DEPS_SONAME[i]} ${patched[i]}
            fi
        done
        # copy over needed auxiliary files
        for ((i=0;i<${#DEPS_AUX_SRCLIST[@]};++i)); do
            srcpath=${DEPS_AUX_SRCLIST[i]}
            dstpath=$PREFIX/${DEPS_AUX_DSTLIST[i]}
            mkdir -p $(dirname $dstpath)
            cp $srcpath $dstpath
        done
    fi
    # set RPATH of _C.so and similar to $ORIGIN, $ORIGIN/lib
    find $PREFIX -maxdepth 1 -type f -name "*.so*" | while read sofile; do
        echo "Setting rpath of $sofile to ${C_SO_RPATH:-'$ORIGIN:$ORIGIN/lib'}"
        $PATCHELF_BIN --set-rpath ${C_SO_RPATH:-'$ORIGIN:$ORIGIN/lib'} ${FORCE_RPATH:-} $sofile
        $PATCHELF_BIN --print-rpath $sofile
    done
    # set RPATH of lib/ files to $ORIGIN
    find $PREFIX/lib -maxdepth 1 -type f -name "*.so*" | while read sofile; do
        echo "Setting rpath of $sofile to ${LIB_SO_RPATH:-'$ORIGIN'}"
        $PATCHELF_BIN --set-rpath ${LIB_SO_RPATH:-'$ORIGIN'} ${FORCE_RPATH:-} $sofile
        $PATCHELF_BIN --print-rpath $sofile
    done
    # Create the manylinux_2_28 tag; this needs to happen before regenerating the RECORD
    if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then
        wheel_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/WHEEL/g')
        sed -i -e s#linux_x86_64#"${PLATFORM}"# $wheel_file;
    fi
    # regenerate the RECORD file with new hashes
    record_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/RECORD/g')
    if [[ -e $record_file ]]; then
        echo "Generating new record file $record_file"
        : > "$record_file"
        # generate records for folders in wheel
        find * -type f | while read fname; do
            make_wheel_record "$fname" >>"$record_file"
        done
    fi
    if [[ $BUILD_DEBUG_INFO == "1" ]]; then
        pushd "$PREFIX/lib"
        # Duplicate library into debug lib
        cp libtorch_cpu.so libtorch_cpu.so.dbg
        # Keep debug symbols on debug lib
        strip --only-keep-debug libtorch_cpu.so.dbg
        # Remove debug info from release lib
        strip --strip-debug libtorch_cpu.so
        objcopy libtorch_cpu.so --add-gnu-debuglink=libtorch_cpu.so.dbg
        # Zip up debug info
        mkdir -p /tmp/debug
        mv libtorch_cpu.so.dbg /tmp/debug/libtorch_cpu.so.dbg
        CRC32=$(objcopy --dump-section .gnu_debuglink=>(tail -c4 | od -t x4 -An | xargs echo) libtorch_cpu.so)
        pushd /tmp
        PKG_NAME=$(basename "$pkg" | sed 's/\.whl$//g')
        zip /tmp/debug-whl-libtorch-"$PKG_NAME"-"$CRC32".zip /tmp/debug/libtorch_cpu.so.dbg
        cp /tmp/debug-whl-libtorch-"$PKG_NAME"-"$CRC32".zip "$PYTORCH_FINAL_PACKAGE_DIR"
        popd
        popd
    fi
    # Re-archive the unpacked package (the current directory is the scratch
    # dir holding only this package's contents) and put it back where $pkg
    # came from.
    # The archive members are selected with a plain "*". This step used to
    # glob on a misspelled, never-assigned variable ("PREIX" instead of
    # "PREFIX") and only worked because an unset variable expands to nothing.
    # "*" keeps exactly that file set - the package directory plus its sibling
    # entries such as the .dist-info directory - without depending on the
    # variable being unset. Do not narrow it to "$PREFIX*": that would only
    # keep entries whose names start with the package prefix.
    # Rename wheel for Manylinux 2_28
    if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then
        pkg_name=$(echo $(basename $pkg) | sed -e s#linux_x86_64#"${PLATFORM}"#)
        zip -rq $pkg_name *
        rm -f $pkg
        mv $pkg_name $(dirname $pkg)/$pkg_name
    else
        # zip up the wheel back
        zip -rq $(basename $pkg) *
        # remove original wheel
        rm -f $pkg
        mv $(basename $pkg) $pkg
    fi
    cd ..
    rm -rf tmp
 done
 # Copy wheels to host machine for persistence before testing
 if [[ -n "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
    mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
    if [[ -n "$BUILD_PYTHONLESS" ]]; then
        cp /$LIBTORCH_HOUSE_DIR/libtorch*.zip "$PYTORCH_FINAL_PACKAGE_DIR"
    else
        cp /$WHEELHOUSE_DIR/torch*.whl "$PYTORCH_FINAL_PACKAGE_DIR"
    fi
 fi
 # remove stuff before testing
 rm -rf /opt/rh
 if ls /usr/local/cuda* >/dev/null 2>&1; then
    rm -rf /usr/local/cuda*
 fi
 # Test that all the wheels work
 if [[ -z "$BUILD_PYTHONLESS" ]]; then
  export OMP_NUM_THREADS=4 # on NUMA machines this takes too long
  pushd $PYTORCH_ROOT/test
  # Install the wheel for this Python version
  if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
    pip uninstall -y "$TORCH_NO_PYTHON_PACKAGE_NAME" || true
  fi
  pip uninstall -y "$TORCH_PACKAGE_NAME"
  if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
    pip install "$TORCH_NO_PYTHON_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
  fi
  pip install "$TORCH_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
  # Print info on the libraries installed in this wheel
  # Rather than adjust find command to skip non-library files with an embedded *.so* in their name,
  # since this is only for reporting purposes, we add the || true to the ldd command.
  installed_libraries=($(find "$pydir/lib/python${py_majmin}/site-packages/torch/" -name '*.so*'))
  echo "The wheel installed all of the libraries: ${installed_libraries[@]}"
  for installed_lib in "${installed_libraries[@]}"; do
      ldd "$installed_lib" || true
  done
  # Run the tests
  echo "$(date) :: Running tests"
  pushd "$PYTORCH_ROOT"
  LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \
          "${PYTORCH_ROOT}/.ci/pytorch/run_tests.sh" manywheel "${py_majmin}" "$DESIRED_CUDA"
  popd
  echo "$(date) :: Finished tests"
 fi
--- a/.ci/manywheel/build_cpu.sh
+++ b/.ci/manywheel/build_cpu.sh
@ -1,60 +0,0 @@
 #!/usr/bin/env bash
 set -ex
 export TH_BINARY_BUILD=1
 export USE_CUDA=0
 # Keep an array of cmake variables to add to
 if [[ -z "$CMAKE_ARGS" ]]; then
    # These are passed to tools/build_pytorch_libs.sh::build()
    CMAKE_ARGS=()
 fi
 if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    # These are passed to tools/build_pytorch_libs.sh::build_caffe2()
    EXTRA_CAFFE2_CMAKE_FLAGS=()
 fi
 WHEELHOUSE_DIR="wheelhousecpu"
 LIBTORCH_HOUSE_DIR="libtorch_housecpu"
 if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
    if [[ -z "$BUILD_PYTHONLESS" ]]; then
        PYTORCH_FINAL_PACKAGE_DIR="/remote/wheelhousecpu"
    else
        PYTORCH_FINAL_PACKAGE_DIR="/remote/libtorch_housecpu"
    fi
 fi
 mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
 # Pick the location of libgomp.so.1 from the distribution name in
 # /etc/os-release. LIBGOMP_PATH is left untouched for any other distribution.
 OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
 case "$OS_NAME" in
    *"CentOS Linux"* | *"Red Hat Enterprise Linux"* | *"AlmaLinux"*)
        LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
        ;;
    *"Ubuntu"*)
        # Ubuntu uses a per-architecture library directory
        case "$(uname -m)" in
            s390x) LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1" ;;
            *) LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1" ;;
        esac
        ;;
 esac
 DEPS_LIST=(
    "$LIBGOMP_PATH"
 )
 DEPS_SONAME=(
    "libgomp.so.1"
 )
 rm -rf /usr/local/cuda*
 SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
 if [[ -z "$BUILD_PYTHONLESS" ]]; then
    BUILD_SCRIPT=build_common.sh
 else
    BUILD_SCRIPT=build_libtorch.sh
 fi
 source ${SOURCE_DIR}/${BUILD_SCRIPT}
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -1,309 +0,0 @@
 #!/usr/bin/env bash
 set -ex
 # Physical (pwd -P) path of the directory containing this script.
 # The command substitution is closed by exactly one ")"; an extra one inside
 # the quotes would become a literal ")" at the end of the path.
 SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
 export TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
 export NCCL_ROOT_DIR=/usr/local/cuda
 export TH_BINARY_BUILD=1
 export USE_STATIC_CUDNN=1
 export USE_STATIC_NCCL=1
 export ATEN_STATIC_CUDA=1
 export USE_CUDA_STATIC_LINK=1
 export INSTALL_TEST=0 # dont install test binaries into site-packages
 export USE_CUPTI_SO=0
 export USE_CUSPARSELT=${USE_CUSPARSELT:-1} # Enable if not disabled by libtorch build
 export USE_CUFILE=${USE_CUFILE:-1}
 # Keep an array of cmake variables to add to
 if [[ -z "$CMAKE_ARGS" ]]; then
    # These are passed to tools/build_pytorch_libs.sh::build()
    CMAKE_ARGS=()
 fi
 if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    # These are passed to tools/build_pytorch_libs.sh::build_caffe2()
    EXTRA_CAFFE2_CMAKE_FLAGS=()
 fi
 # Determine CUDA version and architectures to build for
 #
 # NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`,
 # because in some cases a single Docker image can have multiple CUDA versions
 # on it, and `nvcc --version` might not show the CUDA version we want.
 if [[ -n "$DESIRED_CUDA" ]]; then
    # If the DESIRED_CUDA already matches the format that we expect
    if [[ ${DESIRED_CUDA} =~ ^[0-9]+\.[0-9]+$ ]]; then
        CUDA_VERSION=${DESIRED_CUDA}
    else
        # cu90, cu92, cu100, cu101
        if [[ ${#DESIRED_CUDA} -eq 4 ]]; then
            CUDA_VERSION="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3:1}"
        elif [[ ${#DESIRED_CUDA} -eq 5 ]]; then
            CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}"
        fi
    fi
    echo "Using CUDA $CUDA_VERSION as determined by DESIRED_CUDA"
 else
    CUDA_VERSION=$(nvcc --version|grep release|cut -f5 -d" "|cut -f1 -d",")
    echo "CUDA $CUDA_VERSION Detected"
 fi
 cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
 TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
 case ${CUDA_VERSION} in
    12.8)
        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" #removing sm_50-sm_70 as these architectures are deprecated in CUDA 12.8 and will be removed in future releases
        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
        ;;
    12.6)
        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
        ;;
    12.4)
        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
        ;;
    11.8)
        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};3.7;9.0"
        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
        ;;
    *)
        echo "unknown cuda version $CUDA_VERSION"
        exit 1
        ;;
 esac
 export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
 echo "${TORCH_CUDA_ARCH_LIST}"
 # Package directories
 WHEELHOUSE_DIR="wheelhouse$cuda_version_nodot"
 LIBTORCH_HOUSE_DIR="libtorch_house$cuda_version_nodot"
 if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
    if [[ -z "$BUILD_PYTHONLESS" ]]; then
        PYTORCH_FINAL_PACKAGE_DIR="/remote/wheelhouse$cuda_version_nodot"
    else
        PYTORCH_FINAL_PACKAGE_DIR="/remote/libtorch_house$cuda_version_nodot"
    fi
 fi
 mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
 OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
 if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
    LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
 fi
 DEPS_LIST=(
    "$LIBGOMP_PATH"
 )
 DEPS_SONAME=(
    "libgomp.so.1"
 )
 # CUDA 11.8 builds have to ship libcusparseLt.so.0 with the binary
 # since nvidia-cusparselt-cu11 is not available on PyPI
 if [[ $USE_CUSPARSELT == "1" && $CUDA_VERSION == "11.8" ]]; then
        DEPS_SONAME+=(
            "libcusparseLt.so.0"
        )
        DEPS_LIST+=(
            "/usr/local/cuda/lib64/libcusparseLt.so.0"
        )
 fi
 # Turn USE_CUFILE off for CUDA 11.8, 12.4 since nvidia-cufile-cu11 and 1.9.0.20 are
 # not available in PYPI
 if [[ $CUDA_VERSION == "11.8" || $CUDA_VERSION == "12.4" ]]; then
    export USE_CUFILE=0
 fi
 # CUDA_VERSION 12.4, 12.6, 12.8
 if [[ $CUDA_VERSION == 12* ]]; then
    export USE_STATIC_CUDNN=0
    # Try parallelizing nvcc as well
    export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
        echo "Bundling with cudnn and cublas."
        DEPS_LIST+=(
            "/usr/local/cuda/lib64/libcudnn_adv.so.9"
            "/usr/local/cuda/lib64/libcudnn_cnn.so.9"
            "/usr/local/cuda/lib64/libcudnn_graph.so.9"
            "/usr/local/cuda/lib64/libcudnn_ops.so.9"
            "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9"
            "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9"
            "/usr/local/cuda/lib64/libcudnn_heuristic.so.9"
            "/usr/local/cuda/lib64/libcudnn.so.9"
            "/usr/local/cuda/lib64/libcublas.so.12"
            "/usr/local/cuda/lib64/libcublasLt.so.12"
            "/usr/local/cuda/lib64/libcusparseLt.so.0"
            "/usr/local/cuda/lib64/libcudart.so.12"
            "/usr/local/cuda/lib64/libnvToolsExt.so.1"
            "/usr/local/cuda/lib64/libnvrtc.so.12"
            "/usr/local/cuda/lib64/libnvrtc-builtins.so"
        )
        DEPS_SONAME+=(
            "libcudnn_adv.so.9"
            "libcudnn_cnn.so.9"
            "libcudnn_graph.so.9"
            "libcudnn_ops.so.9"
            "libcudnn_engines_runtime_compiled.so.9"
            "libcudnn_engines_precompiled.so.9"
            "libcudnn_heuristic.so.9"
            "libcudnn.so.9"
            "libcublas.so.12"
            "libcublasLt.so.12"
            "libcusparseLt.so.0"
            "libcudart.so.12"
            "libnvToolsExt.so.1"
            "libnvrtc.so.12"
            "libnvrtc-builtins.so"
        )
        if [[ $USE_CUFILE == 1 ]]; then
            DEPS_LIST+=(
                "/usr/local/cuda/lib64/libcufile.so.0"
                "/usr/local/cuda/lib64/libcufile_rdma.so.1"
            )
            DEPS_SONAME+=(
                "libcufile.so.0"
                "libcufile_rdma.so.1"
            )
        fi
    else
        echo "Using nvidia libs from pypi."
        CUDA_RPATHS=(
            '$ORIGIN/../../nvidia/cublas/lib'
            '$ORIGIN/../../nvidia/cuda_cupti/lib'
            '$ORIGIN/../../nvidia/cuda_nvrtc/lib'
            '$ORIGIN/../../nvidia/cuda_runtime/lib'
            '$ORIGIN/../../nvidia/cudnn/lib'
            '$ORIGIN/../../nvidia/cufft/lib'
            '$ORIGIN/../../nvidia/curand/lib'
            '$ORIGIN/../../nvidia/cusolver/lib'
            '$ORIGIN/../../nvidia/cusparse/lib'
            '$ORIGIN/../../cusparselt/lib'
            '$ORIGIN/../../nvidia/nccl/lib'
            '$ORIGIN/../../nvidia/nvtx/lib'
        )
        if [[ $USE_CUFILE == 1 ]]; then
            CUDA_RPATHS+=(
                '$ORIGIN/../../nvidia/cufile/lib'
            )
        fi
        CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
        export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
        export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
        export FORCE_RPATH="--force-rpath"
        export USE_STATIC_NCCL=0
        export USE_SYSTEM_NCCL=1
        export ATEN_STATIC_CUDA=0
        export USE_CUDA_STATIC_LINK=0
        export USE_CUPTI_SO=1
        export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
        export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
    fi
 elif [[ $CUDA_VERSION == "11.8" ]]; then
    export USE_STATIC_CUDNN=0
    # Try parallelizing nvcc as well
    export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
    # Bundle ptxas into the wheel, see https://github.com/pytorch/pytorch/pull/119750
    export BUILD_BUNDLE_PTXAS=1
    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
        echo "Bundling with cudnn and cublas."
        DEPS_LIST+=(
            "/usr/local/cuda/lib64/libcudnn_adv.so.9"
            "/usr/local/cuda/lib64/libcudnn_cnn.so.9"
            "/usr/local/cuda/lib64/libcudnn_graph.so.9"
            "/usr/local/cuda/lib64/libcudnn_ops.so.9"
            "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9"
            "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9"
            "/usr/local/cuda/lib64/libcudnn_heuristic.so.9"
            "/usr/local/cuda/lib64/libcudnn.so.9"
            "/usr/local/cuda/lib64/libcublas.so.11"
            "/usr/local/cuda/lib64/libcublasLt.so.11"
            "/usr/local/cuda/lib64/libcudart.so.11.0"
            "/usr/local/cuda/lib64/libnvToolsExt.so.1"
            "/usr/local/cuda/lib64/libnvrtc.so.11.2"    # this is not a mistake, it links to more specific cuda version
            "/usr/local/cuda/lib64/libnvrtc-builtins.so.11.8"
        )
        DEPS_SONAME+=(
            "libcudnn_adv.so.9"
            "libcudnn_cnn.so.9"
            "libcudnn_graph.so.9"
            "libcudnn_ops.so.9"
            "libcudnn_engines_runtime_compiled.so.9"
            "libcudnn_engines_precompiled.so.9"
            "libcudnn_heuristic.so.9"
            "libcudnn.so.9"
            "libcublas.so.11"
            "libcublasLt.so.11"
            "libcudart.so.11.0"
            "libnvToolsExt.so.1"
            "libnvrtc.so.11.2"
            "libnvrtc-builtins.so.11.8"
        )
    else
        echo "Using nvidia libs from pypi."
        CUDA_RPATHS=(
            '$ORIGIN/../../nvidia/cublas/lib'
            '$ORIGIN/../../nvidia/cuda_cupti/lib'
            '$ORIGIN/../../nvidia/cuda_nvrtc/lib'
            '$ORIGIN/../../nvidia/cuda_runtime/lib'
            '$ORIGIN/../../nvidia/cudnn/lib'
            '$ORIGIN/../../nvidia/cufft/lib'
            '$ORIGIN/../../nvidia/curand/lib'
            '$ORIGIN/../../nvidia/cusolver/lib'
            '$ORIGIN/../../nvidia/cusparse/lib'
            '$ORIGIN/../../nvidia/nccl/lib'
            '$ORIGIN/../../nvidia/nvtx/lib'
        )
        CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
        export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
        export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
        export FORCE_RPATH="--force-rpath"
        export USE_STATIC_NCCL=0
        export USE_SYSTEM_NCCL=1
        export ATEN_STATIC_CUDA=0
        export USE_CUDA_STATIC_LINK=0
        export USE_CUPTI_SO=1
        export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
        export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
    fi
 else
    echo "Unknown cuda version $CUDA_VERSION"
    exit 1
 fi
 # run_tests.sh requires DESIRED_CUDA to know what tests to exclude
 export DESIRED_CUDA="$cuda_version_nodot"
 # Switch `/usr/local/cuda` to the desired CUDA version
 rm -rf /usr/local/cuda || true
 ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda
 # Switch `/usr/local/magma` to the desired CUDA version
 rm -rf /usr/local/magma || true
 ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
 export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130
 export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0
 export CUDNN_VERSION=$(ls /usr/local/cuda/lib64/libcudnn.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev)
 SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
 if [[ -z "$BUILD_PYTHONLESS" ]]; then
    BUILD_SCRIPT=build_common.sh
 else
    BUILD_SCRIPT=build_libtorch.sh
 fi
 source $SCRIPTPATH/${BUILD_SCRIPT}
--- a/.ci/manywheel/build_libtorch.sh
+++ b/.ci/manywheel/build_libtorch.sh
@ -1,353 +0,0 @@
 #!/usr/bin/env bash
 # meant to be called only from the neighboring build.sh and build_cpu.sh scripts
 # Abort on the first failing command, including a failure in any stage of a
 # pipeline. The option must be given via `-o`: a bare `pipefail` word is
 # treated as a positional parameter ($1) and leaves the option switched off.
 set -eo pipefail
 SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
 # Require only one python installation
 if [[ -z "$DESIRED_PYTHON" ]]; then
    echo "Need to set DESIRED_PYTHON env variable"
    exit 1
 fi
 if [[ -n "$BUILD_PYTHONLESS" && -z "$LIBTORCH_VARIANT" ]]; then
    echo "BUILD_PYTHONLESS is set, so need LIBTORCH_VARIANT to also be set"
    echo "LIBTORCH_VARIANT should be one of shared-with-deps shared-without-deps static-with-deps static-without-deps"
    exit 1
 fi
 # Function to retry functions that sometimes timeout or have flaky failures
 # Run the given command, retrying up to four more times with an increasing
 # back-off (1, 2, 4, 8 seconds) until one attempt succeeds.
 #   $@ - the command and its arguments, passed through verbatim
 # Returns the exit status of the first successful attempt, or of the last
 # attempt when every try fails. "$@" (rather than $*) keeps arguments that
 # contain whitespace or glob characters intact.
 retry () {
    "$@"  || (sleep 1 && "$@") || (sleep 2 && "$@") || (sleep 4 && "$@") || (sleep 8 && "$@")
 }
 # TODO move this into the Docker images
 OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
 if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
    retry yum install -q -y zip openssl
 elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    retry yum install -q -y zip openssl
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
    retry dnf install -q -y zip openssl
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
    # TODO: Remove this once nvidia package repos are back online
    # Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968
    # shellcheck disable=SC2046
    sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list")
    retry apt-get update
    retry apt-get -y install zip openssl
 fi
 # Version: setup.py uses $PYTORCH_BUILD_VERSION.post$PYTORCH_BUILD_NUMBER if
 # PYTORCH_BUILD_NUMBER > 1
 build_version="$PYTORCH_BUILD_VERSION"
 build_number="$PYTORCH_BUILD_NUMBER"
 if [[ -n "$OVERRIDE_PACKAGE_VERSION" ]]; then
    # This will be the *exact* version, since build_number<1
    build_version="$OVERRIDE_PACKAGE_VERSION"
    build_number=0
 fi
 if [[ -z "$build_version" ]]; then
    build_version=1.0.0
 fi
 if [[ -z "$build_number" ]]; then
    build_number=1
 fi
 export PYTORCH_BUILD_VERSION=$build_version
 export PYTORCH_BUILD_NUMBER=$build_number
 export CMAKE_LIBRARY_PATH="/opt/intel/lib:/lib:$CMAKE_LIBRARY_PATH"
 export CMAKE_INCLUDE_PATH="/opt/intel/include:$CMAKE_INCLUDE_PATH"
 # set OPENSSL_ROOT_DIR=/opt/openssl if it exists
 if [[ -e /opt/openssl ]]; then
    export OPENSSL_ROOT_DIR=/opt/openssl
    export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH
 fi
 # If given a python version like 3.6m or 2.7mu, convert this to the format we
 # expect. The binary CI jobs pass in python versions like this; they also only
 # ever pass one python version, so we assume that DESIRED_PYTHON is not a list
 # in this case
 if [[ -n "$DESIRED_PYTHON" && "$DESIRED_PYTHON" != cp* ]]; then
    python_nodot="$(echo $DESIRED_PYTHON | tr -d m.u)"
    DESIRED_PYTHON="cp${python_nodot}-cp${python_nodot}"
 fi
 pydir="/opt/python/$DESIRED_PYTHON"
 export PATH="$pydir/bin:$PATH"
 export PATCHELF_BIN=/usr/local/bin/patchelf
 patchelf_version=`$PATCHELF_BIN --version`
 echo "patchelf version: " $patchelf_version
 if [[ "$patchelf_version" == "patchelf 0.9" ]]; then
    echo "Your patchelf version is too old. Please use version >= 0.10."
    exit 1
 fi
 ########################################################
 # Compile wheels as well as libtorch
 #######################################################
 if [[ -z "$PYTORCH_ROOT" ]]; then
    echo "Need to set PYTORCH_ROOT env variable"
    exit 1
 fi
 pushd "$PYTORCH_ROOT"
 python setup.py clean
 retry pip install -qr requirements.txt
 retry pip install -q numpy==2.0.1
 if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
    export _GLIBCXX_USE_CXX11_ABI=1
 else
    export _GLIBCXX_USE_CXX11_ABI=0
 fi
 if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
    echo "Calling build_amd.py at $(date)"
    python tools/amd_build/build_amd.py
    # TODO remove this work-around once pytorch sources are updated
    export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr
 fi
 echo "Calling setup.py install at $(date)"
 if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
    STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
 fi
 # Build PyTorch via setup.py and stage the result into ./libtorch, splitting
 # the debug symbols of libtorch_cpu.so into ./debug. Runs in a subshell so
 # that `set -x` and the `cd` calls do not leak into the rest of the script.
 (
    set -x
    mkdir -p build
    # NOTE: keep comments out of the continued command below. A `#` line in
    # the middle of a backslash continuation ends the command at that point,
    # so the variable assignments before it never reach setup.py.
    # TODO: Remove the CFLAGS flag once https://github.com/pytorch/pytorch/issues/55952 is closed
    time CMAKE_ARGS=${CMAKE_ARGS[@]} \
        EXTRA_CAFFE2_CMAKE_FLAGS="${EXTRA_CAFFE2_CMAKE_FLAGS[@]} $STATIC_CMAKE_FLAG" \
        CFLAGS='-Wno-deprecated-declarations' \
        BUILD_LIBTORCH_CPU_WITH_DEBUG=1 \
        python setup.py install
    mkdir -p libtorch/{lib,bin,include,share}
    # Make debug folder separate so it doesn't get zipped up with the rest of
    # libtorch
    mkdir debug
    # Copy over all lib files
    cp -rv build/lib/*                libtorch/lib/
    cp -rv build/lib*/torch/lib/*     libtorch/lib/
    # Copy over all include files
    cp -rv build/include/*            libtorch/include/
    cp -rv build/lib*/torch/include/* libtorch/include/
    # Copy over all of the cmake files
    cp -rv build/lib*/torch/share/*   libtorch/share/
    # Split libtorch into debug / release version
    cp libtorch/lib/libtorch_cpu.so libtorch/lib/libtorch_cpu.so.dbg
    # Keep debug symbols on debug lib
    strip --only-keep-debug libtorch/lib/libtorch_cpu.so.dbg
    # Remove debug info from release lib
    strip --strip-debug libtorch/lib/libtorch_cpu.so
    # Add a debug link to the release lib to the debug lib (debuggers will then
    # search for symbols in a file called libtorch_cpu.so.dbg in some
    # predetermined locations) and embed a CRC32 of the debug library into the .so
    cd libtorch/lib
    objcopy libtorch_cpu.so --add-gnu-debuglink=libtorch_cpu.so.dbg
    cd ../..
    # Move the debug symbols to its own directory so it doesn't get processed /
    # zipped with all the other libraries
    mv libtorch/lib/libtorch_cpu.so.dbg debug/libtorch_cpu.so.dbg
    echo "${PYTORCH_BUILD_VERSION}" > libtorch/build-version
    # Use `cd` rather than `pushd` inside the substitution: pushd prints the
    # directory stack on stdout, which would be written into build-hash
    # together with the commit hash.
    echo "$(cd "$PYTORCH_ROOT" && git rev-parse HEAD)" > libtorch/build-hash
 )
 if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
    LIBTORCH_ABI="cxx11-abi-"
 else
    LIBTORCH_ABI=
 fi
 (
    set -x
    mkdir -p /tmp/$LIBTORCH_HOUSE_DIR
    # objcopy embedded a CRC32 of the debug file into libtorch_cpu.so above, so add that CRC32 to the archive name here
    CRC32=$(objcopy --dump-section .gnu_debuglink=>(tail -c4 | od -t x4 -An | xargs echo) libtorch/lib/libtorch_cpu.so)
    # Zip debug symbols
    zip /tmp/$LIBTORCH_HOUSE_DIR/debug-libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION-$CRC32.zip debug/libtorch_cpu.so.dbg
    # Zip and copy libtorch
    zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch
    cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \
       /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip
 )
 popd
 #######################################################################
 # ADD DEPENDENCIES INTO THE WHEEL
 #
 # auditwheel repair doesn't work correctly and is buggy
 # so manually do the work of copying dependency libs and patchelfing
 # and fixing RECORDS entries correctly
 ######################################################################
 # Print the path a bundled library should be renamed to.
 #   $1 - path to an existing shared library
 # The first 8 hex digits of the file's SHA-256 are inserted before the first
 # "." of the basename (libfoo.so.1 -> libfoo-<hash>.so.1). Files named
 # libnvrtc-builtins.so, and every libcudnn* file, are printed unchanged.
 # All expansions are quoted so paths containing whitespace or glob
 # characters are handled as a single word.
 fname_with_sha256() {
    HASH=$(sha256sum "$1" | cut -c1-8)
    DIRNAME=$(dirname "$1")
    BASENAME=$(basename "$1")
    if [[ $BASENAME == "libnvrtc-builtins.so" || $BASENAME == "libcudnn"* ]]; then
        echo "$1"
    else
        INITNAME=$(echo "$BASENAME" | cut -f1 -d".")
        ENDNAME=$(echo "$BASENAME" | cut -f 2- -d".")
        echo "$DIRNAME/$INITNAME-$HASH.$ENDNAME"
    fi
 }
 # Print $1 with everything after the first ".so" removed, i.e. the library
 # path without its version suffix (/x/libfoo.so.1.2 -> /x/libfoo.so).
 #   $1 - library path or file name
 # The argument is quoted so whitespace and glob characters survive intact.
 fname_without_so_number() {
    LINKNAME=$(echo "$1" | sed -e 's/\.so.*/.so/g')
    echo "$LINKNAME"
 }
 # Print one wheel RECORD line for a file.
 #   $1 - path of the file, as it should appear in the RECORD
 # A path containing "RECORD" is emitted with empty hash and size fields.
 # Any other file is emitted as "<path>",sha256=<digest>,<size>, where
 # <digest> is the base64 SHA-256 with "+" and "/" replaced by "-" and "_"
 # and the "=" padding removed, and <size> is the size column of `ls -nl`.
 # The path is quoted everywhere so names with whitespace are handled.
 make_wheel_record() {
    FPATH=$1
    if echo "$FPATH" | grep RECORD >/dev/null 2>&1; then
        # if the RECORD file, then
        echo "\"$FPATH\",,"
    else
        HASH=$(openssl dgst -sha256 -binary "$FPATH" | openssl base64 | sed -e 's/+/-/g' | sed -e 's/\//_/g' | sed -e 's/=//g')
        FSIZE=$(ls -nl "$FPATH" | awk '{print $5}')
        echo "\"$FPATH\",sha256=$HASH,$FSIZE"
    fi
 }
 echo 'Built this package:'
 (
    set -x
    mkdir -p /$LIBTORCH_HOUSE_DIR
    mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR
    rm -rf /tmp/$LIBTORCH_HOUSE_DIR
 )
 TMP_DIR=$(mktemp -d)
 trap "rm -rf ${TMP_DIR}" EXIT
 pushd "${TMP_DIR}"
 for pkg in /$LIBTORCH_HOUSE_DIR/libtorch*.zip; do
    # if the glob didn't match anything
    if [[ ! -e $pkg ]]; then
        continue
    fi
    rm -rf tmp
    mkdir -p tmp
    cd tmp
    cp $pkg .
    unzip -q $(basename $pkg)
    rm -f $(basename $pkg)
    PREFIX=libtorch
    if [[ $pkg != *"without-deps"* ]]; then
        # copy over needed dependent .so files over and tag them with their hash
        patched=()
        for filepath in "${DEPS_LIST[@]}"; do
            filename=$(basename $filepath)
            destpath=$PREFIX/lib/$filename
            if [[ "$filepath" != "$destpath" ]]; then
                cp $filepath $destpath
            fi
            if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
                patchedpath=$(fname_without_so_number $destpath)
            else
                patchedpath=$(fname_with_sha256 $destpath)
            fi
            patchedname=$(basename $patchedpath)
            if [[ "$destpath" != "$patchedpath" ]]; then
                mv $destpath $patchedpath
            fi
            patched+=("$patchedname")
            echo "Copied $filepath to $patchedpath"
        done
        echo "patching to fix the so names to the hashed names"
        for ((i=0;i<${#DEPS_LIST[@]};++i)); do
            find $PREFIX -name '*.so*' | while read sofile; do
                origname=${DEPS_SONAME[i]}
                patchedname=${patched[i]}
                if [[ "$origname" != "$patchedname" ]] || [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
                    set +e
                    origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
                    ERRCODE=$?
                    set -e
                    if [ "$ERRCODE" -eq "0" ]; then
                        echo "patching $sofile entry $origname to $patchedname"
                        $PATCHELF_BIN --replace-needed $origname $patchedname $sofile
                    fi
                fi
            done
        done
        # copy over needed auxiliary files
        for ((i=0;i<${#DEPS_AUX_SRCLIST[@]};++i)); do
            srcpath=${DEPS_AUX_SRCLIST[i]}
            dstpath=$PREFIX/${DEPS_AUX_DSTLIST[i]}
            mkdir -p $(dirname $dstpath)
            cp $srcpath $dstpath
        done
    fi
    # set RPATH of _C.so and similar to $ORIGIN, $ORIGIN/lib
    find $PREFIX -maxdepth 1 -type f -name "*.so*" | while read sofile; do
        echo "Setting rpath of $sofile to " '$ORIGIN:$ORIGIN/lib'
        $PATCHELF_BIN --set-rpath '$ORIGIN:$ORIGIN/lib' $sofile
        $PATCHELF_BIN --print-rpath $sofile
    done
    # set RPATH of lib/ files to $ORIGIN
    find $PREFIX/lib -maxdepth 1 -type f -name "*.so*" | while read sofile; do
        echo "Setting rpath of $sofile to " '$ORIGIN'
        $PATCHELF_BIN --set-rpath '$ORIGIN' $sofile
        $PATCHELF_BIN --print-rpath $sofile
    done
    # regenerate the RECORD file with new hashes
    record_file=`echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/RECORD/g'`
    if [[ -e $record_file ]]; then
        echo "Generating new record file $record_file"
        rm -f $record_file
        # generate records for folders in wheel
        find * -type f | while read fname; do
            echo $(make_wheel_record $fname) >>$record_file
        done
    fi
    # zip up the wheel back
    zip -rq $(basename $pkg) $PREFIX*
    # replace original wheel
    rm -f $pkg
    mv $(basename $pkg) $pkg
    cd ..
    rm -rf tmp
 done
 # Copy wheels to host machine for persistence before testing
 if [[ -n "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
    cp /$LIBTORCH_HOUSE_DIR/libtorch*.zip "$PYTORCH_FINAL_PACKAGE_DIR"
    cp /$LIBTORCH_HOUSE_DIR/debug-libtorch*.zip "$PYTORCH_FINAL_PACKAGE_DIR"
 fi
--- a/.ci/manywheel/build_rocm.sh
+++ b/.ci/manywheel/build_rocm.sh
@ -1,268 +0,0 @@
 #!/usr/bin/env bash
 set -ex
 export ROCM_HOME=/opt/rocm
 export MAGMA_HOME=$ROCM_HOME/magma
 # TODO: libtorch_cpu.so is broken when building with Debug info
 export BUILD_DEBUG_INFO=0
 # TODO Are these all used/needed?
 export TH_BINARY_BUILD=1
 export USE_STATIC_CUDNN=1
 export USE_STATIC_NCCL=1
 export ATEN_STATIC_CUDA=1
 export USE_CUDA_STATIC_LINK=1
 export INSTALL_TEST=0 # dont install test binaries into site-packages
 # Set RPATH instead of RUNPATH when using patchelf to avoid LD_LIBRARY_PATH override
 export FORCE_RPATH="--force-rpath"
 # Keep an array of cmake variables to add to
 if [[ -z "$CMAKE_ARGS" ]]; then
    # These are passed to tools/build_pytorch_libs.sh::build()
    CMAKE_ARGS=()
 fi
 if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    # These are passed to tools/build_pytorch_libs.sh::build_caffe2()
    EXTRA_CAFFE2_CMAKE_FLAGS=()
 fi
 # Determine ROCm version and architectures to build for
 #
 # NOTE: We should first check `DESIRED_CUDA` when determining `ROCM_VERSION`
 if [[ -n "$DESIRED_CUDA" ]]; then
    if ! echo "${DESIRED_CUDA}"| grep "^rocm" >/dev/null 2>/dev/null; then
        export DESIRED_CUDA="rocm${DESIRED_CUDA}"
    fi
    # rocm3.7, rocm3.5.1
    ROCM_VERSION="$DESIRED_CUDA"
    echo "Using $ROCM_VERSION as determined by DESIRED_CUDA"
 else
    echo "Must set DESIRED_CUDA"
    exit 1
 fi
 # Package directories
 WHEELHOUSE_DIR="wheelhouse$ROCM_VERSION"
 LIBTORCH_HOUSE_DIR="libtorch_house$ROCM_VERSION"
 if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
    if [[ -z "$BUILD_PYTHONLESS" ]]; then
        PYTORCH_FINAL_PACKAGE_DIR="/remote/wheelhouse$ROCM_VERSION"
    else
        PYTORCH_FINAL_PACKAGE_DIR="/remote/libtorch_house$ROCM_VERSION"
    fi
 fi
 mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
 # To make version comparison easier, create an integer representation.
 ROCM_VERSION_CLEAN=$(echo ${ROCM_VERSION} | sed s/rocm//)
 save_IFS="$IFS"
 IFS=. ROCM_VERSION_ARRAY=(${ROCM_VERSION_CLEAN})
 IFS="$save_IFS"
 if [[ ${#ROCM_VERSION_ARRAY[@]} == 2 ]]; then
    ROCM_VERSION_MAJOR=${ROCM_VERSION_ARRAY[0]}
    ROCM_VERSION_MINOR=${ROCM_VERSION_ARRAY[1]}
    ROCM_VERSION_PATCH=0
 elif [[ ${#ROCM_VERSION_ARRAY[@]} == 3 ]]; then
    ROCM_VERSION_MAJOR=${ROCM_VERSION_ARRAY[0]}
    ROCM_VERSION_MINOR=${ROCM_VERSION_ARRAY[1]}
    ROCM_VERSION_PATCH=${ROCM_VERSION_ARRAY[2]}
 else
    echo "Unhandled ROCM_VERSION ${ROCM_VERSION}"
    exit 1
 fi
 ROCM_INT=$(($ROCM_VERSION_MAJOR * 10000 + $ROCM_VERSION_MINOR * 100 + $ROCM_VERSION_PATCH))
 # Required ROCm libraries
 ROCM_SO_FILES=(
    "libMIOpen.so"
    "libamdhip64.so"
    "libhipblas.so"
    "libhipfft.so"
    "libhiprand.so"
    "libhipsolver.so"
    "libhipsparse.so"
    "libhsa-runtime64.so"
    "libamd_comgr.so"
    "libmagma.so"
    "librccl.so"
    "librocblas.so"
    "librocfft.so"
    "librocm_smi64.so"
    "librocrand.so"
    "librocsolver.so"
    "librocsparse.so"
    "libroctracer64.so"
    "libroctx64.so"
    "libhipblaslt.so"
    "libhiprtc.so"
 )
 if [[ $ROCM_INT -ge 60100 ]]; then
    ROCM_SO_FILES+=("librocprofiler-register.so")
 fi
 if [[ $ROCM_INT -ge 60200 ]]; then
    ROCM_SO_FILES+=("librocm-core.so")
 fi
 OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
 if [[ "$OS_NAME" == *"CentOS Linux"* || "$OS_NAME" == *"AlmaLinux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
    LIBNUMA_PATH="/usr/lib64/libnuma.so.1"
    LIBELF_PATH="/usr/lib64/libelf.so.1"
    if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
        LIBTINFO_PATH="/usr/lib64/libtinfo.so.5"
    else
        LIBTINFO_PATH="/usr/lib64/libtinfo.so.6"
    fi
    LIBDRM_PATH="/opt/amdgpu/lib64/libdrm.so.2"
    LIBDRM_AMDGPU_PATH="/opt/amdgpu/lib64/libdrm_amdgpu.so.1"
    if [[ $ROCM_INT -ge 60100 && $ROCM_INT -lt 60300 ]]; then
        # Below libs are direct dependencies of libhipsolver
        LIBSUITESPARSE_CONFIG_PATH="/lib64/libsuitesparseconfig.so.4"
        if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
            LIBCHOLMOD_PATH="/lib64/libcholmod.so.2"
            # Below libs are direct dependencies of libsatlas
            LIBGFORTRAN_PATH="/lib64/libgfortran.so.3"
        else
            LIBCHOLMOD_PATH="/lib64/libcholmod.so.3"
            # Below libs are direct dependencies of libsatlas
            LIBGFORTRAN_PATH="/lib64/libgfortran.so.5"
        fi
        # Below libs are direct dependencies of libcholmod
        LIBAMD_PATH="/lib64/libamd.so.2"
        LIBCAMD_PATH="/lib64/libcamd.so.2"
        LIBCCOLAMD_PATH="/lib64/libccolamd.so.2"
        LIBCOLAMD_PATH="/lib64/libcolamd.so.2"
        LIBSATLAS_PATH="/lib64/atlas/libsatlas.so.3"
        # Below libs are direct dependencies of libsatlas
        LIBQUADMATH_PATH="/lib64/libquadmath.so.0"
    fi
    MAYBE_LIB64=lib64
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
    LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
    LIBNUMA_PATH="/usr/lib/x86_64-linux-gnu/libnuma.so.1"
    LIBELF_PATH="/usr/lib/x86_64-linux-gnu/libelf.so.1"
    if [[ $ROCM_INT -ge 50300 ]]; then
        LIBTINFO_PATH="/lib/x86_64-linux-gnu/libtinfo.so.6"
    else
        LIBTINFO_PATH="/lib/x86_64-linux-gnu/libtinfo.so.5"
    fi
    LIBDRM_PATH="/usr/lib/x86_64-linux-gnu/libdrm.so.2"
    LIBDRM_AMDGPU_PATH="/usr/lib/x86_64-linux-gnu/libdrm_amdgpu.so.1"
    if [[ $ROCM_INT -ge 60100 && $ROCM_INT -lt 60300 ]]; then
        # Below libs are direct dependencies of libhipsolver
        LIBCHOLMOD_PATH="/lib/x86_64-linux-gnu/libcholmod.so.3"
        # Below libs are direct dependencies of libcholmod
        LIBSUITESPARSE_CONFIG_PATH="/lib/x86_64-linux-gnu/libsuitesparseconfig.so.5"
        LIBAMD_PATH="/lib/x86_64-linux-gnu/libamd.so.2"
        LIBCAMD_PATH="/lib/x86_64-linux-gnu/libcamd.so.2"
        LIBCCOLAMD_PATH="/lib/x86_64-linux-gnu/libccolamd.so.2"
        LIBCOLAMD_PATH="/lib/x86_64-linux-gnu/libcolamd.so.2"
        LIBMETIS_PATH="/lib/x86_64-linux-gnu/libmetis.so.5"
        LIBLAPACK_PATH="/lib/x86_64-linux-gnu/liblapack.so.3"
        LIBBLAS_PATH="/lib/x86_64-linux-gnu/libblas.so.3"
        # Below libs are direct dependencies of libblas
        LIBGFORTRAN_PATH="/lib/x86_64-linux-gnu/libgfortran.so.5"
        LIBQUADMATH_PATH="/lib/x86_64-linux-gnu/libquadmath.so.0"
    fi
    MAYBE_LIB64=lib
 fi
 OS_SO_PATHS=($LIBGOMP_PATH $LIBNUMA_PATH\
             $LIBELF_PATH $LIBTINFO_PATH\
             $LIBDRM_PATH $LIBDRM_AMDGPU_PATH\
             $LIBSUITESPARSE_CONFIG_PATH\
             $LIBCHOLMOD_PATH $LIBAMD_PATH\
             $LIBCAMD_PATH $LIBCCOLAMD_PATH\
             $LIBCOLAMD_PATH $LIBSATLAS_PATH\
             $LIBGFORTRAN_PATH $LIBQUADMATH_PATH\
             $LIBMETIS_PATH $LIBLAPACK_PATH\
             $LIBBLAS_PATH)
 OS_SO_FILES=()
 for lib in "${OS_SO_PATHS[@]}"
 do
    file_name="${lib##*/}" # Substring removal of path to get filename
    OS_SO_FILES[${#OS_SO_FILES[@]}]=$file_name # Append lib to array
 done
 # rocBLAS library files
 ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library
 ROCBLAS_LIB_DST=lib/rocblas/library
 ARCH=$(echo $PYTORCH_ROCM_ARCH | sed 's/;/|/g') # Replace ; seperated arch list to bar for grep
 ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
 OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
 ROCBLAS_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES)
 # hipblaslt library files
 HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library
 HIPBLASLT_LIB_DST=lib/hipblaslt/library
 ARCH_SPECIFIC_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -E $ARCH)
 OTHER_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -v gfx)
 HIPBLASLT_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES)
 # ROCm library files
 ROCM_SO_PATHS=()
 for lib in "${ROCM_SO_FILES[@]}"
 do
    file_path=($(find $ROCM_HOME/lib/ -name "$lib")) # First search in lib
    if [[ -z $file_path ]]; then
        if [ -d "$ROCM_HOME/lib64/" ]; then
            file_path=($(find $ROCM_HOME/lib64/ -name "$lib")) # Then search in lib64
        fi
    fi
    if [[ -z $file_path ]]; then
        file_path=($(find $ROCM_HOME/ -name "$lib")) # Then search in ROCM_HOME
    fi
    if [[ -z $file_path ]]; then
        echo "Error: Library file $lib is not found." >&2
        exit 1
    fi
    ROCM_SO_PATHS[${#ROCM_SO_PATHS[@]}]="$file_path" # Append lib to array
 done
 DEPS_LIST=(
    ${ROCM_SO_PATHS[*]}
    ${OS_SO_PATHS[*]}
 )
 DEPS_SONAME=(
    ${ROCM_SO_FILES[*]}
    ${OS_SO_FILES[*]}
 )
 DEPS_AUX_SRCLIST=(
    "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_SRC/}"
    "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_SRC/}"
    "/opt/amdgpu/share/libdrm/amdgpu.ids"
 )
 DEPS_AUX_DSTLIST=(
    "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_DST/}"
    "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_DST/}"
    "share/libdrm/amdgpu.ids"
 )
 # MIOpen library files
 MIOPEN_SHARE_SRC=$ROCM_HOME/share/miopen/db
 MIOPEN_SHARE_DST=share/miopen/db
 MIOPEN_SHARE_FILES=($(ls $MIOPEN_SHARE_SRC | grep -E $ARCH))
 DEPS_AUX_SRCLIST+=(${MIOPEN_SHARE_FILES[@]/#/$MIOPEN_SHARE_SRC/})
 DEPS_AUX_DSTLIST+=(${MIOPEN_SHARE_FILES[@]/#/$MIOPEN_SHARE_DST/})
 # RCCL library files
 RCCL_SHARE_SRC=$ROCM_HOME/share/rccl/msccl-algorithms
 RCCL_SHARE_DST=share/rccl/msccl-algorithms
 RCCL_SHARE_FILES=($(ls $RCCL_SHARE_SRC))
 DEPS_AUX_SRCLIST+=(${RCCL_SHARE_FILES[@]/#/$RCCL_SHARE_SRC/})
 DEPS_AUX_DSTLIST+=(${RCCL_SHARE_FILES[@]/#/$RCCL_SHARE_DST/})
 echo "PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH}"
 SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
 if [[ -z "$BUILD_PYTHONLESS" ]]; then
    BUILD_SCRIPT=build_common.sh
 else
    BUILD_SCRIPT=build_libtorch.sh
 fi
 source $SCRIPTPATH/${BUILD_SCRIPT}
--- a/.ci/manywheel/build_xpu.sh
+++ b/.ci/manywheel/build_xpu.sh
@ -1,108 +0,0 @@
 #!/usr/bin/env bash
 set -ex
 export TH_BINARY_BUILD=1
 export USE_CUDA=0
 # Keep an array of cmake variables to add to
 if [[ -z "$CMAKE_ARGS" ]]; then
    # These are passed to tools/build_pytorch_libs.sh::build()
    CMAKE_ARGS=()
 fi
 if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
    # These are passed to tools/build_pytorch_libs.sh::build_caffe2()
    EXTRA_CAFFE2_CMAKE_FLAGS=()
 fi
 # Refer https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
 source /opt/intel/oneapi/compiler/latest/env/vars.sh
 source /opt/intel/oneapi/pti/latest/env/vars.sh
 source /opt/intel/oneapi/umf/latest/env/vars.sh
 export USE_STATIC_MKL=1
 WHEELHOUSE_DIR="wheelhousexpu"
 LIBTORCH_HOUSE_DIR="libtorch_housexpu"
 if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
    if [[ -z "$BUILD_PYTHONLESS" ]]; then
        PYTORCH_FINAL_PACKAGE_DIR="/remote/wheelhousexpu"
    else
        PYTORCH_FINAL_PACKAGE_DIR="/remote/libtorch_housexpu"
    fi
 fi
 mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
 OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
 if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
 elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
    if [[ "$(uname -m)" == "s390x" ]]; then
        LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1"
    else
        LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
    fi
 fi
 DEPS_LIST=(
    "$LIBGOMP_PATH"
    "/opt/intel/oneapi/compiler/latest/lib/libOpenCL.so.1"
 )
 DEPS_SONAME=(
    "libgomp.so.1"
    "libOpenCL.so.1"
 )
 if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
    echo "Bundling with xpu support package libs."
    DEPS_LIST+=(
        "/opt/intel/oneapi/compiler/latest/lib/libsycl.so.8"
        "/opt/intel/oneapi/compiler/latest/lib/libur_loader.so.0"
        "/opt/intel/oneapi/compiler/latest/lib/libur_adapter_level_zero.so.0"
        "/opt/intel/oneapi/compiler/latest/lib/libur_adapter_opencl.so.0"
        "/opt/intel/oneapi/compiler/latest/lib/libsvml.so"
        "/opt/intel/oneapi/compiler/latest/lib/libirng.so"
        "/opt/intel/oneapi/compiler/latest/lib/libimf.so"
        "/opt/intel/oneapi/compiler/latest/lib/libintlc.so.5"
        "/opt/intel/oneapi/pti/latest/lib/libpti_view.so.0.10"
        "/opt/intel/oneapi/umf/latest/lib/libumf.so.0"
        "/opt/intel/oneapi/tcm/latest/lib/libhwloc.so.15"
    )
    DEPS_SONAME+=(
        "libsycl.so.8"
        "libur_loader.so.0"
        "libur_adapter_level_zero.so.0"
        "libur_adapter_opencl.so.0"
        "libsvml.so"
        "libirng.so"
        "libimf.so"
        "libintlc.so.5"
        "libpti_view.so.0.10"
        "libumf.so.0"
        "libhwloc.so.15"
    )
 else
    echo "Using xpu runtime libs from pypi."
    XPU_RPATHS=(
        '$ORIGIN/../../../..'
    )
    XPU_RPATHS=$(IFS=: ; echo "${XPU_RPATHS[*]}")
    export C_SO_RPATH=$XPU_RPATHS':$ORIGIN:$ORIGIN/lib'
    export LIB_SO_RPATH=$XPU_RPATHS':$ORIGIN'
    export FORCE_RPATH="--force-rpath"
 fi
 rm -rf /usr/local/cuda*
 SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
 if [[ -z "$BUILD_PYTHONLESS" ]]; then
    BUILD_SCRIPT=build_common.sh
 else
    BUILD_SCRIPT=build_libtorch.sh
 fi
 source ${SOURCE_DIR}/${BUILD_SCRIPT}
--- a/.ci/manywheel/set_desired_python.sh
+++ b/.ci/manywheel/set_desired_python.sh
@ -1,30 +0,0 @@
 #!/usr/bin/env bash
 # Require only one python installation
 if [[ -z "$DESIRED_PYTHON" ]]; then
    echo "Need to set DESIRED_PYTHON env variable"
    exit 1
 fi
 # If given a python version like 3.6m or 2.7mu, convert this to the format we
 # expect. The binary CI jobs pass in python versions like this; they also only
 # ever pass one python version, so we assume that DESIRED_PYTHON is not a list
 # in this case
 if [[ -n "$DESIRED_PYTHON" && $DESIRED_PYTHON =~ ([0-9].[0-9]+)t ]]; then
    python_digits="$(echo $DESIRED_PYTHON | tr -cd [:digit:])"
    py_majmin="${DESIRED_PYTHON}"
    DESIRED_PYTHON="cp${python_digits}-cp${python_digits}t"
 elif [[ -n "$DESIRED_PYTHON" && "$DESIRED_PYTHON" != cp* ]]; then
    python_nodot="$(echo $DESIRED_PYTHON | tr -d m.u)"
    DESIRED_PYTHON="cp${python_nodot}-cp${python_nodot}"
    if [[ ${python_nodot} -ge 310 ]]; then
        py_majmin="${DESIRED_PYTHON:2:1}.${DESIRED_PYTHON:3:2}"
    else
        py_majmin="${DESIRED_PYTHON:2:1}.${DESIRED_PYTHON:3:1}"
    fi
 fi
 pydir="/opt/python/$DESIRED_PYTHON"
 export DESIRED_PYTHON_BIN_DIR="${pydir}/bin"
 export PATH="$DESIRED_PYTHON_BIN_DIR:$PATH"
 echo "Will build for Python version: ${DESIRED_PYTHON}"
--- a/.ci/manywheel/test_wheel.sh
+++ b/.ci/manywheel/test_wheel.sh
@ -1,26 +0,0 @@
 #!/usr/bin/env bash
 set -e
 yum install -y wget git
 rm -rf /usr/local/cuda*
 # Install Anaconda
 if ! ls /py
 then
    echo "Miniconda needs to be installed"
    wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
    bash ~/miniconda.sh -b -p /py
 else
    echo "Miniconda is already installed"
 fi
 export PATH="/py/bin:$PATH"
 # Anaconda token
 if ls /remote/token
 then
   source /remote/token
 fi
 conda install -y conda-build anaconda-client
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -1,6 +1,6 @@
 #!/bin/bash
-set -ex -o pipefail
+set -ex
 # Required environment variable: $BUILD_ENVIRONMENT
 # (This is set by default in the Docker images we build, so you don't
@ -49,8 +49,13 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
 fi
 # Enable LLVM dependency for TensorExpr testing
-export USE_LLVM=/opt/llvm
+if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
-export LLVM_DIR=/opt/llvm/lib/cmake/llvm
+  export USE_LLVM=/opt/rocm/llvm
  export LLVM_DIR=/opt/rocm/llvm/lib/cmake/llvm
 else
  export USE_LLVM=/opt/llvm
  export LLVM_DIR=/opt/llvm/lib/cmake/llvm
 fi
 if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then
  # To build test_edge_op_registration
@ -87,7 +92,7 @@ else
  # Workaround required for MKL library linkage
  # https://github.com/pytorch/pytorch/issues/119557
-  if [[ "$ANACONDA_PYTHON_VERSION" = "3.12" || "$ANACONDA_PYTHON_VERSION" = "3.13" ]]; then
+  if [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then
    export CMAKE_LIBRARY_PATH="/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/lib/"
    export CMAKE_INCLUDE_PATH="/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/include/"
  fi
@ -173,13 +178,12 @@ if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  source /opt/intel/oneapi/compiler/latest/env/vars.sh
  # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA
  export USE_KINETO=0
  export TORCH_XPU_ARCH_LIST=pvc
 fi
 # sccache will fail for CUDA builds if all cores are used for compiling
 # gcc 7 with sccache seems to have intermittent OOM issue if all cores are used
 if [ -z "$MAX_JOBS" ]; then
-  if { [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; } && which sccache > /dev/null; then
+  if { [[ "$BUILD_ENVIRONMENT" == *cuda* ]] || [[ "$BUILD_ENVIRONMENT" == *gcc7* ]]; } && which sccache > /dev/null; then
    export MAX_JOBS=$(($(nproc) - 1))
  fi
 fi
@ -192,7 +196,7 @@ fi
 # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
 # memory to build and will OOM
-if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]] && [ -z "$MAX_JOBS_OVERRIDE" ]; then
+if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ "$TORCH_CUDA_ARCH_LIST" == *"8.6"* || "$TORCH_CUDA_ARCH_LIST" == *"8.0"* ]]; then
  echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
  echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
  export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
@ -204,12 +208,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
 fi
 if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
-  if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
+  export LDSHARED="clang --shared"
-    export USE_CUDA=1
+  export USE_CUDA=0
  fi
  export USE_ASAN=1
-  export REL_WITH_DEB_INFO=1
+  export UBSAN_FLAGS="-fno-sanitize-recover=all;-fno-sanitize=float-divide-by-zero;-fno-sanitize=float-cast-overflow"
  export UBSAN_FLAGS="-fno-sanitize-recover=all"
  unset USE_LLVM
 fi
@ -221,6 +223,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then
    export USE_PRECOMPILED_HEADERS=1
 fi
 if [[ "${BUILD_ENVIRONMENT}" == *linux-focal-py3.7-gcc7-build*  ]]; then
  export USE_GLOO_WITH_OPENSSL=ON
 fi
 if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
  export BUILD_STATIC_RUNTIME_BENCHMARK=ON
 fi
@ -229,9 +235,9 @@ if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
  export CMAKE_BUILD_TYPE=RelWithAssert
 fi
-# Do not change workspace permissions for ROCm and s390x CI jobs
+# Do not change workspace permissions for ROCm CI jobs
 # as it can leave workspace with bad permissions for cancelled jobs
-if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
+if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
  # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
  WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
  cleanup_workspace() {
@ -248,9 +254,10 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
 fi
 if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
-  set -e -o pipefail
+  set -e
  get_bazel
  install_sccache_nvcc_for_bazel
  # Leave 1 CPU free and use only up to 80% of memory to reduce the chance of crashing
  # the runner
@ -278,14 +285,16 @@ else
    if [[ "$BUILD_ENVIRONMENT" != *rocm*  &&
          "$BUILD_ENVIRONMENT" != *xla* ]]; then
      if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
-        # Install numpy-2.0.2 for builds which are backward compatible with 1.X
+        # Install numpy-2.0 release candidate for builds
-        python -mpip install numpy==2.0.2
+        # Which should be backward compatible with Numpy-1.X
        python -mpip install --pre numpy==2.0.0rc1
      fi
      WERROR=1 python setup.py clean
      if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
-        python3 tools/packaging/split_wheel.py bdist_wheel
+        BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel
        BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 python setup.py bdist_wheel --cmake
      else
        WERROR=1 python setup.py bdist_wheel
      fi
@ -337,11 +346,11 @@ else
    CUSTOM_OP_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-op-build"
    CUSTOM_OP_TEST="$PWD/test/custom_operator"
    python --version
-    SITE_PACKAGES="$(python -c 'import site; print(";".join([x for x in site.getsitepackages()] + [x + "/torch" for x in site.getsitepackages()]))')"
+    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
    mkdir -p "$CUSTOM_OP_BUILD"
    pushd "$CUSTOM_OP_BUILD"
-    cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
+    cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
@ -351,10 +360,10 @@ else
    JIT_HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
    JIT_HOOK_TEST="$PWD/test/jit_hooks"
    python --version
-    SITE_PACKAGES="$(python -c 'import site; print(";".join([x for x in site.getsitepackages()] + [x + "/torch" for x in site.getsitepackages()]))')"
+    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
    mkdir -p "$JIT_HOOK_BUILD"
    pushd "$JIT_HOOK_BUILD"
-    cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
+    cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
@ -366,7 +375,7 @@ else
    python --version
    mkdir -p "$CUSTOM_BACKEND_BUILD"
    pushd "$CUSTOM_BACKEND_BUILD"
-    cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
+    cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
@ -378,10 +387,8 @@ else
    # This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization
    # is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has
    # 16 CPUs
-    if [ -z "$MAX_JOBS_OVERRIDE" ]; then
+    MAX_JOBS=$(nproc --ignore=4)
-      MAX_JOBS=$(nproc --ignore=4)
+    export MAX_JOBS
      export MAX_JOBS
    fi
    # NB: Install outside of source directory (at the same level as the root
    # pytorch folder) so that it doesn't get cleaned away prior to docker push.
@ -398,7 +405,9 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];
  # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
  python tools/stats/export_test_times.py
 fi
-# don't do this for bazel or s390x as they don't use sccache
+
-if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
+# snadampal: skipping it till sccache support added for aarch64
 # https://github.com/pytorch/pytorch/issues/121559
 if [[ "$BUILD_ENVIRONMENT" != *aarch64* ]]; then
  print_sccache_stats
 fi
--- a/.ci/pytorch/check_binary.sh
+++ b/.ci/pytorch/check_binary.sh
@ -1,384 +0,0 @@
 #!/bin/bash
 # shellcheck disable=SC2086,SC2006,SC2207,SC2076,SC2155,SC2046,SC1091,SC2143
 # TODO: Re-enable shellchecks above
 set -eux -o pipefail
 # This script checks the following things on binaries
 # 1. The gcc abi matches DESIRED_DEVTOOLSET
 # 2. MacOS binaries do not link against OpenBLAS
 # 3. There are no protobuf symbols of any sort anywhere (turned off, because
 #    this is currently not true)
 # 4. Standard Python imports work
 # 5. MKL is available everywhere except for MacOS wheels
 # 6. XNNPACK is available everywhere except for MacOS wheels
 # 7. CUDA is setup correctly and does not hang
 # 8. Magma is available for CUDA builds
 # 9. CuDNN is available for CUDA builds
 #
 # This script needs the env variables DESIRED_PYTHON, DESIRED_CUDA,
 # DESIRED_DEVTOOLSET and PACKAGE_TYPE
 #
 # This script expects PyTorch to be installed into the active Python (the
 # Python returned by `which python`). Or, if this is testing a libtorch
 # Pythonless binary, then it expects to be in the root folder of the unzipped
 # libtorch package.
 if [[ -z ${DESIRED_PYTHON:-} ]]; then
  export DESIRED_PYTHON=${MATRIX_PYTHON_VERSION:-}
 fi
 if [[ -z ${DESIRED_CUDA:-} ]]; then
  export DESIRED_CUDA=${MATRIX_DESIRED_CUDA:-}
 fi
 if [[ -z ${DESIRED_DEVTOOLSET:-} ]]; then
  export DESIRED_DEVTOOLSET=${MATRIX_DESIRED_DEVTOOLSET:-}
 fi
 if [[ -z ${PACKAGE_TYPE:-} ]]; then
  export PACKAGE_TYPE=${MATRIX_PACKAGE_TYPE:-}
 fi
 # The install root depends on both the package type and the os
 # All MacOS packages use conda, even for the wheel packages.
 if [[ "$PACKAGE_TYPE" == libtorch ]]; then
  # NOTE: Only $PWD works on both CentOS and Ubuntu
  export install_root="$PWD"
 else
  if [[ $DESIRED_PYTHON =~ ([0-9].[0-9]+)t ]]; then
    # For a version of the form maj.min followed by "t" (e.g. 3.13t), keep the original string
    py_dot="$DESIRED_PYTHON"
  elif [[ $DESIRED_PYTHON =~ ([0-9].[0-9]+) ]];  then
    # Strip everything but major.minor from DESIRED_PYTHON version
    py_dot="${BASH_REMATCH[0]}"
  else
    echo "Unexpected ${DESIRED_PYTHON} format"
    exit 1
  fi
  export install_root="$(dirname $(which python))/../lib/python${py_dot}/site-packages/torch/"
 fi
 ###############################################################################
 # Check GCC ABI
 ###############################################################################
 # NOTE [ Building libtorch with old vs. new gcc ABI ]
 #
 # Packages built with one version of ABI could not be linked against by client
 # C++ libraries that were compiled using the other version of ABI. Since both
 # gcc ABIs are still common in the wild, we need to support both ABIs. Currently:
 #
 # - All the nightlies built on CentOS 7 + devtoolset7 use the old gcc ABI.
 # - All the nightlies built on Ubuntu 16.04 + gcc 5.4 use the new gcc ABI.
 echo "Checking that the gcc ABI is what we expect"
 if [[ "$(uname)" != 'Darwin' ]]; then
  # Print "1" when the observed ABI value matches what the requested build
  # configuration implies; print nothing otherwise.
  #   $1 - observed value (may be empty)
  # Reads: DESIRED_DEVTOOLSET, DESIRED_CUDA
  function is_expected() {
    # "cxx11-abi" devtoolsets and rocm builds: accept a value greater than 0
    # (or the literal "ON ").
    # NOTE(review): the literal compared against is "ON " with a trailing
    # space -- confirm this is intentional.
    if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* || "$DESIRED_CUDA" == *"rocm"* ]]; then
      if [[ "$1" -gt 0 || "$1" == "ON " ]]; then
        echo 1
      fi
    else
      # All other configurations: accept an empty value, 0, or "OFF".
      if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then
        echo 1
      fi
    fi
  }
  # First we check that the env var in TorchConfig.cmake is correct
  # We search for D_GLIBCXX_USE_CXX11_ABI=1 in torch/TorchConfig.cmake
  torch_config="${install_root}/share/cmake/Torch/TorchConfig.cmake"
  if [[ ! -f "$torch_config" ]]; then
    echo "No TorchConfig.cmake found!"
    ls -lah "$install_root/share/cmake/Torch"
    exit 1
  fi
  echo "Checking the TorchConfig.cmake"
  cat "$torch_config"
  # The sed call below is
  #   don't print lines by default (only print the line we want)
  # -n
  #   execute the following expression
  # -e
  #   replace lines that match with the first capture group and print
  # s/.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/\1/p
  #   any characters, D_GLIBCXX_USE_CXX11_ABI=, exactly one any character, a
  #   quote, any characters
  #   Note the exactly one single character after the '='. In the case that the
  #     variable is not set the '=' will be followed by a '"' immediately and the
  #     line will fail the match and nothing will be printed; this is what we
  #     want.  Otherwise it will capture the 0 or 1 after the '='.
  # /.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/
  #   replace the matched line with the capture group and print
  # /\1/p
  actual_gcc_abi="$(sed -ne 's/.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/\1/p' < "$torch_config")"
  if [[ "$(is_expected "$actual_gcc_abi")" != 1 ]]; then
    echo "gcc ABI $actual_gcc_abi not as expected."
    exit 1
  fi
  # We also check that there are [not] cxx11 symbols in libtorch
  #
  echo "Checking that symbols in libtorch.so have the right gcc abi"
  python3 "$(dirname ${BASH_SOURCE[0]})/smoke_test/check_binary_symbols.py"
  echo "cxx11 symbols seem to be in order"
 fi # if on Darwin
 ###############################################################################
 # Check for no OpenBLAS
 # TODO Check for no Protobuf symbols (not finished)
 # Print *all* runtime dependencies
 ###############################################################################
 # We have to loop through all shared libraries for this
 if [[ "$(uname)" == 'Darwin' ]]; then
  all_dylibs=($(find "$install_root" -name '*.dylib'))
  for dylib in "${all_dylibs[@]}"; do
    echo "All dependencies of $dylib are $(otool -L $dylib) with rpath $(otool -l $dylib | grep LC_RPATH -A2)"
    # Check that OpenBlas is not linked to on Macs
    echo "Checking the OpenBLAS is not linked to"
    if [[ -n "$(otool -L $dylib | grep -i openblas)" ]]; then
      echo "ERROR: Found openblas as a dependency of $dylib"
      echo "Full dependencies is: $(otool -L $dylib)"
      exit 1
    fi
    # Check for protobuf symbols
    #proto_symbols="$(nm $dylib | grep protobuf)" || true
    #if [[ -n "$proto_symbols" ]]; then
    #  echo "ERROR: Detected protobuf symbols in $dylib"
    #  echo "Symbols are $proto_symbols"
    #  exit 1
    #fi
  done
 else
  all_libs=($(find "$install_root" -name '*.so'))
  for lib in "${all_libs[@]}"; do
    echo "All dependencies of $lib are $(ldd $lib) with runpath $(objdump -p $lib | grep RUNPATH)"
    # Check for protobuf symbols
    #proto_symbols=$(nm $lib | grep protobuf) || true
    #if [[ -n "$proto_symbols" ]]; then
    #  echo "ERROR: Detected protobuf symbols in $lib"
    #  echo "Symbols are $proto_symbols"
    #  exit 1
    #fi
  done
 fi
 # Populate the linker-flag globals used when compiling the example C++ programs.
 #   Reads: install_root
 #   Sets:  REF_LIB, ADDITIONAL_LINKER_FLAGS, C10_LINK_FLAGS,
 #          TORCH_CPU_LINK_FLAGS, TORCH_CUDA_LINK_FLAGS
 # Each *_LINK_FLAGS variable stays empty unless the matching library file
 # exists under ${install_root}/lib.
 setup_link_flags () {
  # Runtime search path option; Darwin uses the -rpath spelling.
  REF_LIB="-Wl,-R${install_root}/lib"
  if [[ "$(uname)" == 'Darwin' ]]; then
    REF_LIB="-Wl,-rpath ${install_root}/lib"
  fi
  ADDITIONAL_LINKER_FLAGS=""
  if [[ "$(uname)" == 'Linux' ]]; then
    ADDITIONAL_LINKER_FLAGS="-Wl,--no-as-needed"
  fi
  C10_LINK_FLAGS=""
  if [ -f "${install_root}/lib/libc10.so" ] || [ -f "${install_root}/lib/libc10.dylib" ]; then
    C10_LINK_FLAGS="-lc10"
  fi
  TORCH_CPU_LINK_FLAGS=""
  if [ -f "${install_root}/lib/libtorch_cpu.so" ] || [ -f "${install_root}/lib/libtorch_cpu.dylib" ]; then
    TORCH_CPU_LINK_FLAGS="-ltorch_cpu"
  fi
  TORCH_CUDA_LINK_FLAGS=""
  if [ -f "${install_root}/lib/libtorch_cuda.so" ] || [ -f "${install_root}/lib/libtorch_cuda.dylib" ]; then
    TORCH_CUDA_LINK_FLAGS="-ltorch_cuda"
  # Split CUDA build: both halves (cpp and cu) must be present for the same
  # platform suffix. The braces are required because && and || have equal
  # precedence in a shell list and would otherwise associate left to right.
  elif { [ -f "${install_root}/lib/libtorch_cuda_cpp.so" ] && [ -f "${install_root}/lib/libtorch_cuda_cu.so" ]; } || \
    { [ -f "${install_root}/lib/libtorch_cuda_cpp.dylib" ] && [ -f "${install_root}/lib/libtorch_cuda_cu.dylib" ]; }; then
    TORCH_CUDA_LINK_FLAGS="-ltorch_cuda_cpp -ltorch_cuda_cu"
  fi
 }
 TEST_CODE_DIR="$(dirname $(realpath ${BASH_SOURCE[0]}))/test_example_code"
 # Compile ${TEST_CODE_DIR}/$1.cpp against the installed libtorch using the ABI
 # setting that matches DESIRED_DEVTOOLSET, then run the resulting ./$1 binary.
 #   $1 - base name of the example source file (without the .cpp extension)
 build_and_run_example_cpp () {
  # Default to the old ABI; switch to the new one for cxx11-abi devtoolsets.
  GLIBCXX_USE_CXX11_ABI=0
  if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
    GLIBCXX_USE_CXX11_ABI=1
  fi
  setup_link_flags
  # The flag variables are deliberately unquoted so that multi-word values
  # split into separate arguments.
  g++ ${TEST_CODE_DIR}/$1.cpp \
    -I${install_root}/include \
    -I${install_root}/include/torch/csrc/api/include \
    -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI \
    -std=gnu++17 \
    -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} \
    -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS \
    -o $1
  ./$1
 }
 # Compile ${TEST_CODE_DIR}/$1.cpp with the ABI setting that is the opposite of
 # what DESIRED_DEVTOOLSET implies, and require that the build fails.
 #   $1 - base name of the example source file (without the .cpp extension)
 # Exits the script with status 1 if the mismatched build succeeds.
 build_example_cpp_with_incorrect_abi () {
  # Pick the mismatching ABI value on purpose.
  GLIBCXX_USE_CXX11_ABI=1
  if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
    GLIBCXX_USE_CXX11_ABI=0
  fi
  # Errexit is suspended so the expected compiler failure can be inspected.
  set +e
  setup_link_flags
  g++ ${TEST_CODE_DIR}/$1.cpp \
    -I${install_root}/include \
    -I${install_root}/include/torch/csrc/api/include \
    -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI \
    -std=gnu++17 \
    -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} \
    -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS \
    -o $1
  ERRCODE=$?
  set -e
  if [ "$ERRCODE" -ne "0" ]; then
    echo "Building example with incorrect ABI throws expected error. Proceeding."
  else
    echo "Building example with incorrect ABI didn't throw error. Aborting."
    exit 1
  fi
 }
 ###############################################################################
 # Check simple Python/C++ calls
 ###############################################################################
 if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
  # NS: Set LD_LIBRARY_PATH for CUDA builds, but perhaps it should be removed
  if [[ "$DESIRED_CUDA" == "cu"* ]]; then
    export LD_LIBRARY_PATH=/usr/local/cuda/lib64
  fi
  build_and_run_example_cpp simple-torch-test
  # `_GLIBCXX_USE_CXX11_ABI` is always ignored by gcc in devtoolset7, so we test
  # the expected failure case for Ubuntu 16.04 + gcc 5.4 only.
  if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
    build_example_cpp_with_incorrect_abi simple-torch-test
  fi
 else
  pushd /tmp
  python -c 'import torch'
  popd
 fi
 ###############################################################################
 # Check torch.git_version
 ###############################################################################
 if [[ "$PACKAGE_TYPE" != 'libtorch' ]]; then
  pushd /tmp
  python -c 'import torch; assert torch.version.git_version != "Unknown"'
  python -c 'import torch; assert torch.version.git_version != None'
  popd
 fi
 ###############################################################################
 # Check for MKL
 ###############################################################################
 if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
  echo "Checking that MKL is available"
  build_and_run_example_cpp check-torch-mkl
 elif [[ "$(uname -m)" != "arm64" && "$(uname -m)" != "s390x" ]]; then
  if [[ "$(uname)" != 'Darwin' || "$PACKAGE_TYPE" != *wheel ]]; then
    if [[ "$(uname -m)" == "aarch64" ]]; then
      echo "Checking that MKLDNN is available on aarch64"
      pushd /tmp
      python -c 'import torch; exit(0 if torch.backends.mkldnn.is_available() else 1)'
      popd
    else
      echo "Checking that MKL is available"
      pushd /tmp
      python -c 'import torch; exit(0 if torch.backends.mkl.is_available() else 1)'
      popd
    fi
  fi
 fi
 ###############################################################################
 # Check for XNNPACK
 ###############################################################################
 if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
  echo "Checking that XNNPACK is available"
  build_and_run_example_cpp check-torch-xnnpack
 else
  if [[ "$(uname)" != 'Darwin' || "$PACKAGE_TYPE" != *wheel ]] && [[ "$(uname -m)" != "s390x"  ]]; then
    echo "Checking that XNNPACK is available"
    pushd /tmp
    python -c 'import torch.backends.xnnpack; exit(0 if torch.backends.xnnpack.enabled else 1)'
    popd
  fi
 fi
 ###############################################################################
 # Check CUDA configured correctly
 ###############################################################################
 # Skip these for Windows machines without GPUs
 if [[ "$OSTYPE" == "msys" ]]; then
    GPUS=$(wmic path win32_VideoController get name)
    if [[ ! "$GPUS" == *NVIDIA* ]]; then
        echo "Skip CUDA tests for machines without a Nvidia GPU card"
        exit 0
    fi
 fi
 # Test that CUDA builds are setup correctly
 if [[ "$DESIRED_CUDA" != 'cpu' && "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'cpu-cxx11-abi' && "$DESIRED_CUDA" != *"rocm"* && "$(uname -m)" != "s390x" ]]; then
  if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
    build_and_run_example_cpp check-torch-cuda
  else
    pushd /tmp
    echo "Checking that CUDA archs are setup correctly"
    timeout 20 python -c 'import torch; torch.randn([3,5]).cuda()'
    # These have to run after CUDA is initialized
    echo "Checking that magma is available"
    python -c 'import torch; torch.rand(1).cuda(); exit(0 if torch.cuda.has_magma else 1)'
    echo "Checking that CuDNN is available"
    python -c 'import torch; exit(0 if torch.backends.cudnn.is_available() else 1)'
    # Validate that the build is free of the linker regressions reported in https://github.com/pytorch/pytorch/issues/57744
    echo "Checking that exception handling works"
    python -c "import torch; from unittest import TestCase;TestCase().assertRaises(RuntimeError, lambda:torch.eye(7, 7, device='cuda:7'))"
    echo "Checking that basic RNN works"
    python ${TEST_CODE_DIR}/rnn_smoke.py
    echo "Checking that basic CNN works"
    python "${TEST_CODE_DIR}/cnn_smoke.py"
    echo "Test that linalg works"
    python -c "import torch;x=torch.rand(3,3,device='cuda');print(torch.linalg.svd(torch.mm(x.t(), x)))"
    popd
  fi # if libtorch
 fi # if cuda
 ##########################
 # Run parts of smoke tests
 ##########################
 if [[ "$PACKAGE_TYPE" != 'libtorch' ]]; then
  pushd "$(dirname ${BASH_SOURCE[0]})/smoke_test"
  python -c "from smoke_test import test_linalg; test_linalg()"
  if [[ "$DESIRED_CUDA" == *cuda* ]]; then
    python -c "from smoke_test import test_linalg; test_linalg('cuda')"
  fi
  popd
 fi
 ###############################################################################
 # Check PyTorch supports TCP_TLS gloo transport
 ###############################################################################
 if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" != 'libtorch' ]]; then
  GLOO_CHECK="import torch.distributed as dist
 try:
    dist.init_process_group('gloo', rank=0, world_size=1)
 except RuntimeError as e:
    print(e)
 "
  RESULT=`GLOO_DEVICE_TRANSPORT=TCP_TLS MASTER_ADDR=localhost MASTER_PORT=63945 python -c "$GLOO_CHECK"`
  GLOO_TRANSPORT_IS_NOT_SUPPORTED='gloo transport is not supported'
  if [[ "$RESULT" =~ "$GLOO_TRANSPORT_IS_NOT_SUPPORTED" ]]; then
    echo "PyTorch doesn't support TLS_TCP transport, please build with USE_GLOO_WITH_OPENSSL=1"
    exit 1
  fi
 fi
 ###############################################################################
 # Check for C++ ABI compatibility between gcc7 and gcc9 compiled binaries
 ###############################################################################
 if [[ "$(uname)" == 'Linux' &&  "$PACKAGE_TYPE" == 'manywheel' ]]; then
  pushd /tmp
  python -c "import torch; exit(0 if torch.compiled_with_cxx11_abi() else (0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi1011' else 1))"
  popd
 fi
--- a/.ci/pytorch/common-build.sh
+++ b/.ci/pytorch/common-build.sh
@ -6,12 +6,6 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then
    # Save the absolute path in case later we chdir (as occurs in the gpu perf test)
    script_dir="$( cd "$(dirname "${BASH_SOURCE[0]}")" || exit ; pwd -P )"
    if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then
        # This is really weird, but newer sccache somehow produces broken binary
        # see https://github.com/pytorch/pytorch/issues/139188
        sudo mv /opt/cache/bin/sccache-0.2.14a /opt/cache/bin/sccache
    fi
    if which sccache > /dev/null; then
        # Save sccache logs to file
        sccache --stop-server > /dev/null  2>&1 || true
--- a/.ci/pytorch/common.sh
+++ b/.ci/pytorch/common.sh
@ -3,7 +3,7 @@
 # Common setup for all Jenkins scripts
 # shellcheck source=./common_utils.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
-set -ex -o pipefail
+set -ex
 # Required environment variables:
 #   $BUILD_ENVIRONMENT (should be set by your Docker image)
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -81,15 +81,14 @@ function pip_install_whl() {
 function pip_install() {
  # retry 3 times
-  pip_install_pkg="python3 -m pip install --progress-bar off"
+  # old versions of pip don't have the "--progress-bar" flag
-  ${pip_install_pkg} "$@" || \
+  pip install --progress-bar off "$@" || pip install --progress-bar off "$@" || pip install --progress-bar off "$@" ||\
-    ${pip_install_pkg} "$@" || \
+  pip install "$@" || pip install "$@" || pip install "$@"
    ${pip_install_pkg} "$@"
 }
 function pip_uninstall() {
  # uninstall 2 times
-  pip3 uninstall -y "$@" || pip3 uninstall -y "$@"
+  pip uninstall -y "$@" || pip uninstall -y "$@"
 }
 function get_exit_code() {
@ -105,12 +104,32 @@ function get_bazel() {
  # version of Bazelisk to fetch the platform specific version of
  # Bazel to use from .bazelversion.
  retry curl --location --output tools/bazel \
-    https://raw.githubusercontent.com/bazelbuild/bazelisk/v1.23.0/bazelisk.py
+    https://raw.githubusercontent.com/bazelbuild/bazelisk/v1.16.0/bazelisk.py
  shasum --algorithm=1 --check \
-    <(echo '01df9cf7f08dd80d83979ed0d0666a99349ae93c  tools/bazel')
+    <(echo 'd4369c3d293814d3188019c9f7527a948972d9f8  tools/bazel')
  chmod u+x tools/bazel
 }
 # This function is bazel specific because of the bug
 # in the bazel that requires some special paths massaging
 # as a workaround. See
 # https://github.com/bazelbuild/bazel/issues/10167
 # Replace /usr/local/cuda/bin/nvcc with a small sh wrapper and keep the
 # original binary next to it as nvcc-real. Takes no arguments; uses sudo.
 function install_sccache_nvcc_for_bazel() {
  # Rename the original nvcc so the wrapper written below can take its place.
  sudo mv /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc-real
  # Write the `/usr/local/cuda/bin/nvcc`
  # The heredoc delimiter is unquoted, so \$ keeps the expansions literal in
  # the generated script. The wrapper looks up its parent process name with ps:
  # if the parent is not sccache it re-executes itself through sccache,
  # otherwise it executes external/local_cuda/cuda/bin/nvcc-real with the
  # same arguments.
  cat << EOF | sudo tee /usr/local/cuda/bin/nvcc
 #!/bin/sh
 if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
  exec sccache /usr/local/cuda/bin/nvcc "\$@"
 else
  exec external/local_cuda/cuda/bin/nvcc-real "\$@"
 fi
 EOF
  # Make the generated wrapper executable.
  sudo chmod +x /usr/local/cuda/bin/nvcc
 }
 function install_monkeytype {
  # Install MonkeyType
  pip_install MonkeyType
@ -160,7 +179,7 @@ function install_torchvision() {
 }
 function install_tlparse() {
-  pip_install --user "tlparse==0.3.30"
+  pip_install --user "tlparse==0.3.25"
  PATH="$(python -m site --user-base)/bin:$PATH"
 }
@ -169,40 +188,17 @@ function install_torchrec_and_fbgemm() {
  torchrec_commit=$(get_pinned_commit torchrec)
  local fbgemm_commit
  fbgemm_commit=$(get_pinned_commit fbgemm)
  if [[ "$BUILD_ENVIRONMENT" == *rocm* ]] ; then
    fbgemm_commit=$(get_pinned_commit fbgemm_rocm)
  fi
  pip_uninstall torchrec-nightly
  pip_uninstall fbgemm-gpu-nightly
  pip_install setuptools-git-versioning scikit-build pyre-extensions
-
+  # See https://github.com/pytorch/pytorch/issues/106971
-  if [[ "$BUILD_ENVIRONMENT" == *rocm* ]] ; then
+  CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 --user "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu"
-    # install torchrec first because it installs fbgemm nightly on top of rocm fbgemm
+  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
    pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
    pip_uninstall fbgemm-gpu-nightly
    pip_install tabulate  # needed for newer fbgemm
    pip_install patchelf  # needed for rocm fbgemm
    git clone --recursive https://github.com/pytorch/fbgemm
    pushd fbgemm/fbgemm_gpu
    git checkout "${fbgemm_commit}"
    python setup.py install \
      --package_variant=rocm \
      -DHIP_ROOT_DIR="${ROCM_PATH}" \
      -DCMAKE_C_FLAGS="-DTORCH_USE_HIP_DSA" \
      -DCMAKE_CXX_FLAGS="-DTORCH_USE_HIP_DSA"
    popd
    rm -rf fbgemm
  else
    # See https://github.com/pytorch/pytorch/issues/106971
    CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 --user "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu"
    pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
  fi
 }
 function clone_pytorch_xla() {
  if [[ ! -d ./xla ]]; then
-    git clone --recursive -b r2.7 https://github.com/pytorch/xla.git
+    git clone --recursive --quiet https://github.com/pytorch/xla.git
    pushd xla
    # pin the xla hash so that we don't get broken by changes to xla
    git checkout "$(cat ../.github/ci_commit_pins/xla.txt)"
@ -226,22 +222,11 @@ function checkout_install_torchbench() {
    # to install and test other models
    python install.py --continue_on_fail
  fi
  # TODO (huydhn): transformers-4.44.2 added by https://github.com/pytorch/benchmark/pull/2488
  # is regressing speedup metric. This needs to be investigated further
  pip install transformers==4.38.1
  echo "Print all dependencies after TorchBench is installed"
  python -mpip freeze
  popd
 }
 # Install torchao from GitHub at the commit returned by
 # `get_pinned_commit torchao`.
 function install_torchao() {
  # Declared separately from the assignment so a failing lookup is not masked
  # by the exit status of `local`.
  local pinned_rev
  pinned_rev=$(get_pinned_commit torchao)
  local pkg_spec="git+https://github.com/pytorch/ao.git@${pinned_rev}"
  pip_install --no-use-pep517 --user "${pkg_spec}"
 }
 function print_sccache_stats() {
  echo 'PyTorch Build Statistics'
  sccache --show-stats
--- a/.ci/pytorch/cpp_doc_push_script.sh
+++ b/.ci/pytorch/cpp_doc_push_script.sh
@ -40,7 +40,7 @@ echo "Building PyTorch C++ API docs..."
 rm -rf cppdocs
 git clone https://github.com/pytorch/cppdocs
-set -ex -o pipefail
+set -ex
 # Generate ATen files
 pushd "${pt_checkout}"
--- a/.ci/pytorch/create_test_cert.py
+++ b/.ci/pytorch/create_test_cert.py
@ -1,4 +1,4 @@
-from datetime import datetime, timedelta, timezone
+from datetime import datetime, timedelta
 from tempfile import mkdtemp
 from cryptography import x509
@ -42,10 +42,11 @@ def create_cert(path, C, ST, L, O, key):
        .issuer_name(issuer)
        .public_key(key.public_key())
        .serial_number(x509.random_serial_number())
-        .not_valid_before(datetime.now(timezone.utc))
+        .not_valid_before(datetime.utcnow())
        .not_valid_after(
            # Our certificate will be valid for 10 days
-            datetime.now(timezone.utc) + timedelta(days=10)
+            datetime.utcnow()
            + timedelta(days=10)
        )
        .add_extension(
            x509.BasicConstraints(ca=True, path_length=None),
@ -87,10 +88,11 @@ def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key):
        .issuer_name(ca_cert.subject)
        .public_key(csr_cert.public_key())
        .serial_number(x509.random_serial_number())
-        .not_valid_before(datetime.now(timezone.utc))
+        .not_valid_before(datetime.utcnow())
        .not_valid_after(
            # Our certificate will be valid for 10 days
-            datetime.now(timezone.utc) + timedelta(days=10)
+            datetime.utcnow()
            + timedelta(days=10)
            # Sign our certificate with our private key
        )
        .sign(private_ca_key, hashes.SHA256())
--- a/Show More
+++ b/Show More
`@ -1 +1 @@`
	`ebe8522378c3f9944aaaef44868f5ececdd845fc`	`69472e5c43481324ad923ceb29392ab72830acee`
		`@ -1 +1 @@`
			`461c12871f336fe6f57b55d6a297f13ef209161b`				`340136fec6d3ebc73e7a19eba1663e9b0ba8ab2d`
`@ -1 +1 @@`
	`5d535d7a2d4b435b1b5c1177fd8f04a12b942b9a`	`ac3470188b914c5d7a5058a7e28b9eb685a62427`
`@ -1 +1 @@`
	`0bcc8265e677e5321606a3311bf71470f14456a8`	`1b2f15840e0d70eec50d84c7a0575cb835524def`
`@ -1 +1 @@`
	`96316ce50fade7e209553aba4898cd9b82aab83b`	`dedb7bdf339a3546896d4820366ca562c586bfa0`
		`@ -1 +0,0 @@`
			`6cd83808c6e8bc7a44028e05112b3ab4e579bcc73202ed14733f66661127e213 magma-2.6.1.tar.gz`