[POC] "Python Compiled Autograd"

This is a "re-implementation" of compiled autograd. The idea is that: - we leverage the existing autograd graph to construct a Python function that is able to run the autograd graph - then, we run torch.compile over this function This resolves some of the issues we have with the existing compiled autograd. - We're able to graph break in unsupported C++ autograd nodes - The existing compiled autograd uses make_fx to construct the autograd graph before applying torch.compile over that autograd graph. This requires unsound assumptions about input strides and Tensor subclasses. By replicating what PyTorch autograd does in Python, this POC does not have this problem. More on the motivation over at https://docs.google.com/document/d/11KZw4MGoZOLDWQbv6NWxscNUC7lu97M4IVMqfcbkdqA/edit
2025-10-24 15:44:58 +08:00 · 2024-10-09 09:26:39 -04:00
6674 changed files with 114086 additions and 503844 deletions
--- a/.bazelversion
+++ b/.bazelversion
@ -1 +1 @@
-6.5.0
+6.1.1
--- a/.buckconfig.oss
+++ b/.buckconfig.oss
@ -0,0 +1,23 @@
+[pt]
+  is_oss=1
+
+[buildfile]
+  name = BUCK.oss
+  includes = //tools/build_defs/select.bzl
+
+[repositories]
+  bazel_skylib = third_party/bazel-skylib/
+  ovr_config = .
+
+[download]
+  in_build = true
+
+[cxx]
+  cxxflags = -std=c++17
+  ldflags = -Wl,--no-undefined
+  should_remap_host_platform = true
+  cpp = /usr/bin/clang
+  cc = /usr/bin/clang
+  cxx = /usr/bin/clang++
+  cxxpp = /usr/bin/clang++
+  ld = /usr/bin/clang++
--- a/.ci/aarch64_linux/README.md
+++ b/.ci/aarch64_linux/README.md
@ -1,19 +0,0 @@
-# Aarch64 (ARM/Graviton) Support Scripts
-Scripts for building aarch64 PyTorch PIP Wheels. These scripts build the following wheels:
-* torch
-* torchvision
-* torchaudio
-* torchtext
-* torchdata
-## Aarch64_ci_build.sh
-This script is design to support CD operations within PyPi manylinux aarch64 container, and be executed in the container. It prepares the container and then executes __aarch64_wheel_ci_build.py__ to build the wheels. The script "assumes" the PyTorch repo is located at: ```/pytorch``` and will put the wheels into ```/artifacts```.
-### Usage
-```DESIRED_PYTHON=<PythonVersion> aarch64_ci_build.sh```
-
-__NOTE:__ CI build is currently __EXPERMINTAL__
-
-## Build_aarch64_wheel.py
-This app allows a person to build using AWS EC3 resources and requires AWS-CLI and Boto3 with AWS credentials to support building EC2 instances for the wheel builds. Can be used in a codebuild CD or from a local system.
-
-### Usage
-```build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>```
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -1,29 +0,0 @@
-#!/bin/bash
-set -eux -o pipefail
-
-GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
-
-# cuda arm build for Grace Hopper solely
-export TORCH_CUDA_ARCH_LIST="9.0"
-
-SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
-source $SCRIPTPATH/aarch64_ci_setup.sh
-
-###############################################################################
-# Run aarch64 builder python
-###############################################################################
-cd /
-# adding safe directory for git as the permissions will be
-# on the mounted pytorch repo
-git config --global --add safe.directory /pytorch
-pip install -r /pytorch/requirements.txt
-pip install auditwheel
-if [ "$DESIRED_CUDA" = "cpu" ]; then
-    echo "BASE_CUDA_VERSION is not set. Building cpu wheel."
-    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
-    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
-else
-    echo "BASE_CUDA_VERSION is set to: $DESIRED_CUDA"
-    #USE_PRIORITIZED_TEXT_FOR_LD for enable linker script optimization https://github.com/pytorch/pytorch/pull/121975/files
-    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
-fi
--- a/.ci/aarch64_linux/aarch64_ci_setup.sh
+++ b/.ci/aarch64_linux/aarch64_ci_setup.sh
@ -1,23 +0,0 @@
-#!/bin/bash
-set -eux -o pipefail
-
-# This script is used to prepare the Docker container for aarch64_ci_wheel_build.py python script
-# By creating symlinks from desired /opt/python to /usr/local/bin/
-
-NUMPY_VERSION=2.0.2
-PYGIT2_VERSION=1.15.1
-if [[ "$DESIRED_PYTHON"  == "3.13" ]]; then
-    NUMPY_VERSION=2.1.2
-    PYGIT2_VERSION=1.16.0
-fi
-
-SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
-source $SCRIPTPATH/../manywheel/set_desired_python.sh
-
-pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2 pygit2==${PYGIT2_VERSION}
-
-for tool in python python3 pip pip3 ninja scons patchelf; do
-    ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
-done
-
-python --version
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -1,230 +0,0 @@
-#!/usr/bin/env python3
-# encoding: UTF-8
-
-import os
-import shutil
-from subprocess import check_call, check_output
-from typing import List
-
-from pygit2 import Repository
-
-
-def list_dir(path: str) -> List[str]:
-    """'
-    Helper for getting paths for Python
-    """
-    return check_output(["ls", "-1", path]).decode().split("\n")
-
-
-def build_ArmComputeLibrary() -> None:
-    """
-    Using ArmComputeLibrary for aarch64 PyTorch
-    """
-    print("Building Arm Compute Library")
-    acl_build_flags = [
-        "debug=0",
-        "neon=1",
-        "opencl=0",
-        "os=linux",
-        "openmp=1",
-        "cppthreads=0",
-        "arch=armv8a",
-        "multi_isa=1",
-        "fixed_format_kernels=1",
-        "build=native",
-    ]
-    acl_install_dir = "/acl"
-    acl_checkout_dir = "ComputeLibrary"
-    os.makedirs(acl_install_dir)
-    check_call(
-        [
-            "git",
-            "clone",
-            "https://github.com/ARM-software/ComputeLibrary.git",
-            "-b",
-            "v24.09",
-            "--depth",
-            "1",
-            "--shallow-submodules",
-        ]
-    )
-
-    check_call(
-        ["scons", "Werror=1", "-j8", f"build_dir=/{acl_install_dir}/build"]
-        + acl_build_flags,
-        cwd=acl_checkout_dir,
-    )
-    for d in ["arm_compute", "include", "utils", "support", "src"]:
-        shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")
-
-
-def update_wheel(wheel_path) -> None:
-    """
-    Update the cuda wheel libraries
-    """
-    folder = os.path.dirname(wheel_path)
-    wheelname = os.path.basename(wheel_path)
-    os.mkdir(f"{folder}/tmp")
-    os.system(f"unzip {wheel_path} -d {folder}/tmp")
-    libs_to_copy = [
-        "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
-        "/usr/local/cuda/lib64/libcudnn.so.9",
-        "/usr/local/cuda/lib64/libcublas.so.12",
-        "/usr/local/cuda/lib64/libcublasLt.so.12",
-        "/usr/local/cuda/lib64/libcudart.so.12",
-        "/usr/local/cuda/lib64/libcufft.so.11",
-        "/usr/local/cuda/lib64/libcusparse.so.12",
-        "/usr/local/cuda/lib64/libcusparseLt.so.0",
-        "/usr/local/cuda/lib64/libcusolver.so.11",
-        "/usr/local/cuda/lib64/libcurand.so.10",
-        "/usr/local/cuda/lib64/libnvToolsExt.so.1",
-        "/usr/local/cuda/lib64/libnvJitLink.so.12",
-        "/usr/local/cuda/lib64/libnvrtc.so.12",
-        "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.6",
-        "/usr/local/cuda/lib64/libcudnn_adv.so.9",
-        "/usr/local/cuda/lib64/libcudnn_cnn.so.9",
-        "/usr/local/cuda/lib64/libcudnn_graph.so.9",
-        "/usr/local/cuda/lib64/libcudnn_ops.so.9",
-        "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
-        "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
-        "/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
-        "/lib64/libgomp.so.1",
-        "/usr/lib64/libgfortran.so.5",
-        "/acl/build/libarm_compute.so",
-        "/acl/build/libarm_compute_graph.so",
-    ]
-    if enable_cuda:
-        libs_to_copy += [
-            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_lapack_core.so.0",
-            "/usr/local/lib/libnvpl_blas_core.so.0",
-        ]
-    else:
-        libs_to_copy += [
-            "/opt/OpenBLAS/lib/libopenblas.so.0",
-        ]
-    # Copy libraries to unzipped_folder/a/lib
-    for lib_path in libs_to_copy:
-        lib_name = os.path.basename(lib_path)
-        shutil.copy2(lib_path, f"{folder}/tmp/torch/lib/{lib_name}")
-        os.system(
-            f"cd {folder}/tmp/torch/lib/; "
-            f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}"
-        )
-    os.mkdir(f"{folder}/cuda_wheel")
-    os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *")
-    shutil.move(
-        f"{folder}/cuda_wheel/{wheelname}",
-        f"{folder}/{wheelname}",
-        copy_function=shutil.copy2,
-    )
-    os.system(f"rm -rf {folder}/tmp/ {folder}/cuda_wheel/")
-
-
-def complete_wheel(folder: str) -> str:
-    """
-    Complete wheel build and put in artifact location
-    """
-    wheel_name = list_dir(f"/{folder}/dist")[0]
-
-    if "pytorch" in folder and not enable_cuda:
-        print("Repairing Wheel with AuditWheel")
-        check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
-        repaired_wheel_name = list_dir(f"/{folder}/wheelhouse")[0]
-
-        print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist")
-        os.rename(
-            f"/{folder}/wheelhouse/{repaired_wheel_name}",
-            f"/{folder}/dist/{repaired_wheel_name}",
-        )
-    else:
-        repaired_wheel_name = wheel_name
-
-    print(f"Copying {repaired_wheel_name} to artifacts")
-    shutil.copy2(
-        f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}"
-    )
-
-    return repaired_wheel_name
-
-
-def parse_arguments():
-    """
-    Parse inline arguments
-    """
-    from argparse import ArgumentParser
-
-    parser = ArgumentParser("AARCH64 wheels python CD")
-    parser.add_argument("--debug", action="store_true")
-    parser.add_argument("--build-only", action="store_true")
-    parser.add_argument("--test-only", type=str)
-    parser.add_argument("--enable-mkldnn", action="store_true")
-    parser.add_argument("--enable-cuda", action="store_true")
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    """
-    Entry Point
-    """
-    args = parse_arguments()
-    enable_mkldnn = args.enable_mkldnn
-    enable_cuda = args.enable_cuda
-    repo = Repository("/pytorch")
-    branch = repo.head.name
-    if branch == "HEAD":
-        branch = "master"
-
-    print("Building PyTorch wheel")
-    build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
-    os.system("cd /pytorch; python setup.py clean")
-
-    override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
-    if override_package_version is not None:
-        version = override_package_version
-        build_vars += (
-            f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
-        )
-    elif branch in ["nightly", "master"]:
-        build_date = (
-            check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
-            .decode()
-            .replace("-", "")
-        )
-        version = (
-            check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
-        )
-        if enable_cuda:
-            desired_cuda = os.getenv("DESIRED_CUDA")
-            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 "
-        else:
-            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
-    elif branch.startswith(("v1.", "v2.")):
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
-
-    if enable_mkldnn:
-        build_ArmComputeLibrary()
-        print("build pytorch with mkldnn+acl backend")
-        build_vars += (
-            "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
-            "ACL_ROOT_DIR=/acl "
-            "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH "
-            "ACL_INCLUDE_DIR=/acl/build "
-            "ACL_LIBRARY=/acl/build "
-        )
-        if enable_cuda:
-            build_vars += "BLAS=NVPL "
-        else:
-            build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS "
-    else:
-        print("build pytorch without mkldnn backend")
-
-    os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel")
-    if enable_cuda:
-        print("Updating Cuda Dependency")
-        filename = os.listdir("/pytorch/dist/")
-        wheel_path = f"/pytorch/dist/{filename[0]}"
-        update_wheel(wheel_path)
-    pytorch_wheel_name = complete_wheel("/pytorch/")
-    print(f"Build Complete. Created {pytorch_wheel_name}..")
--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
--- a/.ci/aarch64_linux/embed_library.py
+++ b/.ci/aarch64_linux/embed_library.py
@ -1,87 +0,0 @@
-#!/usr/bin/env python3
-
-import os
-import shutil
-import sys
-from subprocess import check_call
-from tempfile import TemporaryDirectory
-
-from auditwheel.elfutils import elf_file_filter
-from auditwheel.lddtree import lddtree
-from auditwheel.patcher import Patchelf
-from auditwheel.repair import copylib
-from auditwheel.wheeltools import InWheelCtx
-
-
-def replace_tag(filename):
-    with open(filename) as f:
-        lines = f.read().split("\\n")
-    for i, line in enumerate(lines):
-        if not line.startswith("Tag: "):
-            continue
-        lines[i] = line.replace("-linux_", "-manylinux2014_")
-        print(f"Updated tag from {line} to {lines[i]}")
-
-    with open(filename, "w") as f:
-        f.write("\\n".join(lines))
-
-
-class AlignedPatchelf(Patchelf):
-    def set_soname(self, file_name: str, new_soname: str) -> None:
-        check_call(
-            ["patchelf", "--page-size", "65536", "--set-soname", new_soname, file_name]
-        )
-
-    def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
-        check_call(
-            [
-                "patchelf",
-                "--page-size",
-                "65536",
-                "--replace-needed",
-                soname,
-                new_soname,
-                file_name,
-            ]
-        )
-
-
-def embed_library(whl_path, lib_soname, update_tag=False):
-    patcher = AlignedPatchelf()
-    out_dir = TemporaryDirectory()
-    whl_name = os.path.basename(whl_path)
-    tmp_whl_name = os.path.join(out_dir.name, whl_name)
-    with InWheelCtx(whl_path) as ctx:
-        torchlib_path = os.path.join(ctx._tmpdir.name, "torch", "lib")
-        ctx.out_wheel = tmp_whl_name
-        new_lib_path, new_lib_soname = None, None
-        for filename, _ in elf_file_filter(ctx.iter_files()):
-            if not filename.startswith("torch/lib"):
-                continue
-            libtree = lddtree(filename)
-            if lib_soname not in libtree["needed"]:
-                continue
-            lib_path = libtree["libs"][lib_soname]["path"]
-            if lib_path is None:
-                print(f"Can't embed {lib_soname} as it could not be found")
-                break
-            if lib_path.startswith(torchlib_path):
-                continue
-
-            if new_lib_path is None:
-                new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
-            patcher.replace_needed(filename, lib_soname, new_lib_soname)
-            print(f"Replacing {lib_soname} with {new_lib_soname} for {filename}")
-        if update_tag:
-            # Add manylinux2014 tag
-            for filename in ctx.iter_files():
-                if os.path.basename(filename) != "WHEEL":
-                    continue
-                replace_tag(filename)
-    shutil.move(tmp_whl_name, whl_path)
-
-
-if __name__ == "__main__":
-    embed_library(
-        sys.argv[1], "libgomp.so.1", len(sys.argv) > 2 and sys.argv[2] == "--update-tag"
-    )
--- a/.ci/docker/android/AndroidManifest.xml
+++ b/.ci/docker/android/AndroidManifest.xml
@ -0,0 +1 @@
+<manifest package="org.pytorch.deps" />
--- a/.ci/docker/android/build.gradle
+++ b/.ci/docker/android/build.gradle
@ -0,0 +1,66 @@
+buildscript {
+    ext {
+        minSdkVersion = 21
+        targetSdkVersion = 28
+        compileSdkVersion = 28
+        buildToolsVersion = '28.0.3'
+
+        coreVersion = "1.2.0"
+        extJUnitVersion = "1.1.1"
+        runnerVersion = "1.2.0"
+        rulesVersion = "1.2.0"
+        junitVersion = "4.12"
+    }
+
+    repositories {
+        google()
+        mavenLocal()
+        mavenCentral()
+        jcenter()
+    }
+
+    dependencies {
+        classpath 'com.android.tools.build:gradle:4.1.2'
+        classpath 'com.vanniktech:gradle-maven-publish-plugin:0.14.2'
+    }
+}
+
+repositories {
+    google()
+    jcenter()
+}
+
+apply plugin: 'com.android.library'
+
+android {
+    compileSdkVersion rootProject.compileSdkVersion
+    buildToolsVersion rootProject.buildToolsVersion
+
+    defaultConfig {
+        minSdkVersion minSdkVersion
+        targetSdkVersion targetSdkVersion
+    }
+
+    sourceSets {
+        main {
+            manifest.srcFile 'AndroidManifest.xml'
+        }
+    }
+}
+
+dependencies {
+    implementation 'com.android.support:appcompat-v7:28.0.0'
+    implementation 'androidx.appcompat:appcompat:1.0.0'
+    implementation 'com.facebook.fbjni:fbjni-java-only:0.2.2'
+    implementation 'com.google.code.findbugs:jsr305:3.0.1'
+    implementation 'com.facebook.soloader:nativeloader:0.10.5'
+
+    implementation 'junit:junit:' + rootProject.junitVersion
+    implementation 'androidx.test:core:' + rootProject.coreVersion
+
+    implementation 'junit:junit:' + rootProject.junitVersion
+    implementation 'androidx.test:core:' + rootProject.coreVersion
+    implementation 'androidx.test.ext:junit:' + rootProject.extJUnitVersion
+    implementation 'androidx.test:rules:' + rootProject.rulesVersion
+    implementation 'androidx.test:runner:' + rootProject.runnerVersion
+}
--- a/.ci/docker/aotriton_version.txt
+++ b/.ci/docker/aotriton_version.txt
@ -0,0 +1,5 @@
+0.7b
+manylinux_2_17
+rocm6.2
+9be04068c3c0857a4cfd17d7e39e71d0423ebac2
+3e9e1959d23b93d78a08fcc5f868125dc3854dece32fd9458be9ef4467982291
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -179,10 +179,10 @@ case "$image" in
    TRITON=yes
    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-focal-cuda12.4-cudnn9-py3.13-gcc9-inductor-benchmarks)
-    CUDA_VERSION=12.4.1
+  pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
+    CUDA_VERSION=11.8.0
    CUDNN_VERSION=9
-    ANACONDA_PYTHON_VERSION=3.13
+    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
    DB=yes
@ -192,10 +192,9 @@ case "$image" in
    UCC_COMMIT=${_UCC_COMMIT}
    CONDA_CMAKE=yes
    TRITON=yes
-    INDUCTOR_BENCHMARKS=yes
    ;;
-  pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
-    CUDA_VERSION=11.8.0
+  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
+    CUDA_VERSION=12.4.1
    CUDNN_VERSION=9
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
@ -222,6 +221,20 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
+  pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
+    CUDA_VERSION=12.4.1
+    CUDNN_VERSION=9
+    ANACONDA_PYTHON_VERSION=3.10
+    GCC_VERSION=9
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    KATEX=yes
+    UCX_COMMIT=${_UCX_COMMIT}
+    UCC_COMMIT=${_UCC_COMMIT}
+    CONDA_CMAKE=yes
+    TRITON=yes
+    ;;
  pytorch-linux-focal-py3-clang10-onnx)
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=10
@ -231,6 +244,16 @@ case "$image" in
    CONDA_CMAKE=yes
    ONNX=yes
    ;;
+  pytorch-linux-focal-py3-clang9-android-ndk-r21e)
+    ANACONDA_PYTHON_VERSION=3.9
+    CLANG_VERSION=9
+    LLVMDEV=yes
+    PROTOBUF=yes
+    ANDROID=yes
+    ANDROID_NDK_VERSION=r21e
+    GRADLE_VERSION=6.8.3
+    NINJA_VERSION=1.9.0
+    ;;
  pytorch-linux-focal-py3.9-clang10)
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=10
@ -268,7 +291,7 @@ case "$image" in
    PROTOBUF=yes
    DB=yes
    VISION=yes
-    ROCM_VERSION=6.2.4
+    ROCM_VERSION=6.1
    NINJA_VERSION=1.9.0
    CONDA_CMAKE=yes
    TRITON=yes
@ -279,7 +302,7 @@ case "$image" in
    PROTOBUF=yes
    DB=yes
    VISION=yes
-    ROCM_VERSION=6.3
+    ROCM_VERSION=6.2
    NINJA_VERSION=1.9.0
    CONDA_CMAKE=yes
    TRITON=yes
@ -295,17 +318,6 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-xpu-2025.0-py3)
-    ANACONDA_PYTHON_VERSION=3.9
-    GCC_VERSION=11
-    PROTOBUF=yes
-    DB=yes
-    VISION=yes
-    XPU_VERSION=2025.0
-    NINJA_VERSION=1.9.0
-    CONDA_CMAKE=yes
-    TRITON=yes
-    ;;
    pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
@ -402,6 +414,9 @@ case "$image" in
    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
+    # snadampal: skipping sccache due to the following issue
+    # https://github.com/pytorch/pytorch/issues/121559
+    SKIP_SCCACHE_INSTALL=yes
    # snadampal: skipping llvm src build install because the current version
    # from pytorch/llvm:9.0.1 is x86 specific
    SKIP_LLVM_SRC_BUILD_INSTALL=yes
@ -414,6 +429,9 @@ case "$image" in
    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
+    # snadampal: skipping sccache due to the following issue
+    # https://github.com/pytorch/pytorch/issues/121559
+    SKIP_SCCACHE_INSTALL=yes
    # snadampal: skipping llvm src build install because the current version
    # from pytorch/llvm:9.0.1 is x86 specific
    SKIP_LLVM_SRC_BUILD_INSTALL=yes
@ -490,6 +508,8 @@ docker build \
       --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
       --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
       --build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \
+       --build-arg "ANDROID=${ANDROID}" \
+       --build-arg "ANDROID_NDK=${ANDROID_NDK_VERSION}" \
       --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
       --build-arg "VULKAN_SDK_VERSION=${VULKAN_SDK_VERSION}" \
       --build-arg "SWIFTSHADER=${SWIFTSHADER}" \
@ -497,7 +517,7 @@ docker build \
       --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
       --build-arg "KATEX=${KATEX:-}" \
       --build-arg "ROCM_VERSION=${ROCM_VERSION:-}" \
-       --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx90a;gfx942}" \
+       --build-arg "PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH:-gfx906;gfx90a}" \
       --build-arg "IMAGE_NAME=${IMAGE_NAME}" \
       --build-arg "UCX_COMMIT=${UCX_COMMIT}" \
       --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
--- a/.ci/docker/centos-rocm/Dockerfile
+++ b/.ci/docker/centos-rocm/Dockerfile
@ -113,6 +113,13 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt

+# Install AOTriton (Early fail)
+COPY ./aotriton_version.txt aotriton_version.txt
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./common/install_aotriton.sh install_aotriton.sh
+RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
+ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
+
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-a29b208a06ab378bb29ab1aa68932e412f8e09f1
+cd1c833b079adb324871dcbbe75b43d42ffc0ade
--- a/.ci/docker/ci_commit_pins/triton-cpu.txt
+++ b/.ci/docker/ci_commit_pins/triton-cpu.txt
@ -1 +1 @@
-c7711371cace304afe265c1ffa906415ab82fc66
+6a333f1b05671f6fada4ba7bbfae4a02a9d96f4f
--- a/.ci/docker/ci_commit_pins/triton-xpu.txt
+++ b/.ci/docker/ci_commit_pins/triton-xpu.txt
@ -1 +1 @@
-e98b6fcb8df5b44eb0d0addb6767c573d37ba024
+91b14bf5593cf58a8541f3e6b9125600a867d4ef
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-0d4682f073ded4d1a8260dd4208a43d735ae3a2b
+cf34004b8a67d290a962da166f5aa2fc66751326
--- a/.ci/docker/common/install_acl.sh
+++ b/.ci/docker/common/install_acl.sh
@ -1,7 +1,7 @@
 set -euo pipefail

 readonly version=v24.04
-readonly src_host=https://github.com/ARM-software
+readonly src_host=https://review.mlplatform.org/ml
 readonly src_repo=ComputeLibrary

 # Clone ACL
--- a/.ci/docker/common/install_android.sh
+++ b/.ci/docker/common/install_android.sh
@ -0,0 +1,112 @@
+#!/bin/bash
+
+set -ex
+
+[ -n "${ANDROID_NDK}" ]
+
+_https_amazon_aws=https://ossci-android.s3.amazonaws.com
+
+apt-get update
+apt-get install -y --no-install-recommends autotools-dev autoconf unzip
+apt-get autoclean && apt-get clean
+rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+pushd /tmp
+curl -Os --retry 3 $_https_amazon_aws/android-ndk-${ANDROID_NDK}-linux-x86_64.zip
+popd
+_ndk_dir=/opt/ndk
+mkdir -p "$_ndk_dir"
+unzip -qo /tmp/android*.zip -d "$_ndk_dir"
+_versioned_dir=$(find "$_ndk_dir/" -mindepth 1 -maxdepth 1 -type d)
+mv "$_versioned_dir"/* "$_ndk_dir"/
+rmdir "$_versioned_dir"
+rm -rf /tmp/*
+
+# Install OpenJDK
+# https://hub.docker.com/r/picoded/ubuntu-openjdk-8-jdk/dockerfile/
+
+sudo apt-get update && \
+    apt-get install -y openjdk-8-jdk && \
+    apt-get install -y ant && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* && \
+    rm -rf /var/cache/oracle-jdk8-installer;
+
+# Fix certificate issues, found as of
+# https://bugs.launchpad.net/ubuntu/+source/ca-certificates-java/+bug/983302
+
+sudo apt-get update && \
+    apt-get install -y ca-certificates-java && \
+    apt-get clean && \
+    update-ca-certificates -f && \
+    rm -rf /var/lib/apt/lists/* && \
+    rm -rf /var/cache/oracle-jdk8-installer;
+
+export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
+
+# Installing android sdk
+# https://github.com/circleci/circleci-images/blob/staging/android/Dockerfile.m4
+
+_tmp_sdk_zip=/tmp/android-sdk-linux.zip
+_android_home=/opt/android/sdk
+
+rm -rf $_android_home
+sudo mkdir -p $_android_home
+curl --silent --show-error --location --fail --retry 3 --output /tmp/android-sdk-linux.zip $_https_amazon_aws/android-sdk-linux-tools3859397-build-tools2803-2902-platforms28-29.zip
+sudo unzip -q $_tmp_sdk_zip -d $_android_home
+rm $_tmp_sdk_zip
+
+sudo chmod -R 777 $_android_home
+
+export ANDROID_HOME=$_android_home
+export ADB_INSTALL_TIMEOUT=120
+
+export PATH="${ANDROID_HOME}/tools:${ANDROID_HOME}/tools/bin:${ANDROID_HOME}/platform-tools:${PATH}"
+echo "PATH:${PATH}"
+
+# Installing Gradle: wipe any previous $_gradle_home, then unpack the requested version into it
+echo "GRADLE_VERSION:${GRADLE_VERSION}"
+_gradle_home=/opt/gradle
+sudo rm -rf $_gradle_home
+sudo mkdir -p $_gradle_home
+
+curl --silent --output /tmp/gradle.zip --retry 3 $_https_amazon_aws/gradle-${GRADLE_VERSION}-bin.zip
+
+sudo unzip -q /tmp/gradle.zip -d $_gradle_home
+rm /tmp/gradle.zip
+
+sudo chmod -R 777 $_gradle_home
+
+export GRADLE_HOME=$_gradle_home/gradle-$GRADLE_VERSION
+alias gradle="${GRADLE_HOME}/bin/gradle"
+
+export PATH="${GRADLE_HOME}/bin/:${PATH}"
+echo "PATH:${PATH}"
+
+gradle --version
+
+mkdir /var/lib/jenkins/gradledeps
+cp build.gradle /var/lib/jenkins/gradledeps
+cp AndroidManifest.xml /var/lib/jenkins/gradledeps
+
+pushd /var/lib/jenkins
+
+export GRADLE_LOCAL_PROPERTIES=gradledeps/local.properties
+rm -f $GRADLE_LOCAL_PROPERTIES
+echo "sdk.dir=/opt/android/sdk" >> $GRADLE_LOCAL_PROPERTIES
+echo "ndk.dir=/opt/ndk" >> $GRADLE_LOCAL_PROPERTIES
+
+chown -R jenkins /var/lib/jenkins/gradledeps
+chgrp -R jenkins /var/lib/jenkins/gradledeps
+
+sudo -H -u jenkins $GRADLE_HOME/bin/gradle -Pandroid.useAndroidX=true -p /var/lib/jenkins/gradledeps -g /var/lib/jenkins/.gradle --refresh-dependencies --debug --stacktrace assemble
+
+chown -R jenkins /var/lib/jenkins/.gradle
+chgrp -R jenkins /var/lib/jenkins/.gradle
+
+popd
+
+rm -rf /var/lib/jenkins/.gradle/daemon
+
+# Cache vision models used by the test
+source "$(dirname "${BASH_SOURCE[0]}")/cache_vision_models.sh"
--- a/.ci/docker/common/install_aotriton.sh
+++ b/.ci/docker/common/install_aotriton.sh
@ -0,0 +1,23 @@
+#!/bin/bash
+# Download the AOTriton release named in aotriton_version.txt into the prefix given as $1, verify its SHA256, and unpack it.
+set -ex
+
+source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
+
+TARBALL='aotriton.tar.gz'
+# This read command always returns with exit code 1, so `|| true` keeps `set -e` from aborting here
+read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
+ARCH=$(uname -m)
+AOTRITON_INSTALL_PREFIX="$1"
+AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.gz"
+
+cd "${AOTRITON_INSTALL_PREFIX}"
+# Must use -L to follow redirects
+curl -L --retry 3 -o "${TARBALL}" "${AOTRITON_URL}"
+ACTUAL_SHA256=$(sha256sum "${TARBALL}" | cut -d " " -f 1)
+if [ "${SHA256}" != "${ACTUAL_SHA256}" ]; then
+  echo -n "Error: The SHA256 of downloaded tarball is ${ACTUAL_SHA256},"
+  echo " which does not match the expected value ${SHA256}."
+  exit 1
+fi
+tar xf "${TARBALL}" && rm -rf "${TARBALL}"
--- a/.ci/docker/common/install_base.sh
+++ b/.ci/docker/common/install_base.sh
@ -76,8 +76,7 @@ install_ubuntu() {
    vim \
    unzip \
    gpg-agent \
-    gdb \
-    bc
+    gdb

  # Should resolve issues related to various apt package repository cert issues
  # see: https://github.com/pytorch/pytorch/issues/65931
--- a/.ci/docker/common/install_cache.sh
+++ b/.ci/docker/common/install_cache.sh
@ -9,7 +9,7 @@ install_ubuntu() {
  # Instead use lib and headers from OpenSSL1.1 installed in `install_openssl.sh``
  apt-get install -y cargo
  echo "Checking out sccache repo"
-  git clone https://github.com/mozilla/sccache -b v0.9.0
+  git clone https://github.com/pytorch/sccache
  cd sccache
  echo "Building sccache"
  cargo build --release
@ -19,10 +19,6 @@ install_ubuntu() {
  rm -rf sccache
  apt-get remove -y cargo rustc
  apt-get autoclean && apt-get clean
-
-  echo "Downloading old sccache binary from S3 repo for PCH builds"
-  curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /opt/cache/bin/sccache-0.2.14a
-  chmod 755 /opt/cache/bin/sccache-0.2.14a
 }

 install_binary() {
@ -36,42 +32,22 @@ sed -e 's|PATH="\(.*\)"|PATH="/opt/cache/bin:\1"|g' -i /etc/environment
 export PATH="/opt/cache/bin:$PATH"

 # Setup compiler cache
-install_ubuntu
+if [ -n "$ROCM_VERSION" ]; then
+  curl --retry 3 http://repo.radeon.com/misc/.sccache_amd/sccache -o /opt/cache/bin/sccache
+else
+  ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+  # TODO: Install the pre-built binary from S3 as building from source
+  # https://github.com/pytorch/sccache has started failing mysteriously
+  # in which sccache server couldn't start with the following error:
+  #   sccache: error: Invalid argument (os error 22)
+  install_binary
+fi
 chmod a+x /opt/cache/bin/sccache

 function write_sccache_stub() {
  # Unset LD_PRELOAD for ps because of asan + ps issues
  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
-  if [ $1 == "gcc" ]; then
-    # Do not call sccache recursively when dumping preprocessor argument
-    # For some reason it's very important for the first cached nvcc invocation
-    cat >"/opt/cache/bin/$1" <<EOF
-#!/bin/sh
-
-# sccache does not support -E flag, so we need to call the original compiler directly in order to avoid calling this wrapper recursively
-for arg in "\$@"; do
-  if [ "\$arg" = "-E" ]; then
-    exec $(which $1) "\$@"
-  fi
-done
-
-if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
-  exec sccache $(which $1) "\$@"
-else
-  exec $(which $1) "\$@"
-fi
-EOF
-  else
-    cat >"/opt/cache/bin/$1" <<EOF
-#!/bin/sh
-
-if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
-  exec sccache $(which $1) "\$@"
-else
-  exec $(which $1) "\$@"
-fi
-EOF
-  fi
+  printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n  exec sccache $(which $1) \"\$@\"\nelse\n  exec $(which $1) \"\$@\"\nfi" > "/opt/cache/bin/$1"
  chmod a+x "/opt/cache/bin/$1"
 }

@ -112,7 +88,7 @@ if [ -n "$ROCM_VERSION" ]; then
    TOPDIR=$(dirname $OLDCOMP)
    WRAPPED="$TOPDIR/original/$COMPNAME"
    mv "$OLDCOMP" "$WRAPPED"
-    printf "#!/bin/sh\nexec sccache $WRAPPED \"\$@\"" >"$OLDCOMP"
+    printf "#!/bin/sh\nexec sccache $WRAPPED \"\$@\"" > "$OLDCOMP"
    chmod a+x "$OLDCOMP"
  }

--- a/.ci/docker/common/install_clang.sh
+++ b/.ci/docker/common/install_clang.sh
@ -20,10 +20,9 @@ if [ -n "$CLANG_VERSION" ]; then
  fi

  sudo apt-get update
-  if [[ $CLANG_VERSION -ge 18 ]]; then
-    apt-get install -y libomp-${CLANG_VERSION}-dev libclang-rt-${CLANG_VERSION}-dev clang-"$CLANG_VERSION" llvm-"$CLANG_VERSION"
-  else
-    apt-get install -y --no-install-recommends clang-"$CLANG_VERSION" llvm-"$CLANG_VERSION"
+  apt-get install -y --no-install-recommends clang-"$CLANG_VERSION" llvm-"$CLANG_VERSION"
+  if [[ $CLANG_VERSION == 18 ]]; then
+    apt-get install -y --no-install-recommends libomp-18-dev
  fi

  # Install dev version of LLVM.
--- a/.ci/docker/common/install_conda.sh
+++ b/.ci/docker/common/install_conda.sh
@ -25,8 +25,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
  mkdir -p /opt/conda
  chown jenkins:jenkins /opt/conda

-  SCRIPT_FOLDER="$( cd "$(dirname "$0")" ; pwd -P )"
-  source "${SCRIPT_FOLDER}/common_utils.sh"
+  source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

  pushd /tmp
  wget -q "${BASE_URL}/${CONDA_FILE}"
@ -66,10 +65,23 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then

  # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
  if [[ $(uname -m) == "aarch64" ]]; then
-    conda_install "openblas==0.3.28=*openmp*"
+    CONDA_COMMON_DEPS="astunparse pyyaml setuptools openblas==0.3.25=*openmp* ninja==1.11.1 scons==4.5.2"
+
+    if [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then
+      NUMPY_VERSION=1.24.4
+    else
+      NUMPY_VERSION=1.26.2
+    fi
  else
-    conda_install "mkl=2021.4.0 mkl-include=2021.4.0"
+    CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
+
+    if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.13" ]; then
+      NUMPY_VERSION=1.26.0
+    else
+      NUMPY_VERSION=1.21.2
+    fi
  fi
+  conda_install ${CONDA_COMMON_DEPS}

  # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source
  # and libpython-static for torch deploy
@ -85,13 +97,14 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then

  # Magma package names are concatenation of CUDA major and minor ignoring revision
  # I.e. magma-cuda102 package corresponds to CUDA_VERSION=10.2 and CUDA_VERSION=10.2.89
-  # Magma is installed from a tarball in the ossci-linux bucket into the conda env
  if [ -n "$CUDA_VERSION" ]; then
-    ${SCRIPT_FOLDER}/install_magma_conda.sh $(cut -f1-2 -d'.' <<< ${CUDA_VERSION}) ${ANACONDA_PYTHON_VERSION}
+    conda_install magma-cuda$(TMP=${CUDA_VERSION/./};echo ${TMP%.*[0-9]}) -c pytorch
  fi

  # Install some other packages, including those needed for Python test reporting
  pip_install -r /opt/conda/requirements-ci.txt
+  pip_install numpy=="$NUMPY_VERSION"
+  pip_install -U scikit-learn

  if [ -n "$DOCS" ]; then
    apt-get update
--- a/.ci/docker/common/install_cpython.sh
+++ b/.ci/docker/common/install_cpython.sh
@ -70,7 +70,7 @@ function do_cpython_build {
    # install setuptools since python 3.12 is required to use distutils
    ${prefix}/bin/pip install wheel==0.34.2 setuptools==68.2.2
    local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))")
-    ln -sf ${prefix} /opt/python/${abi_tag}
+    ln -s ${prefix} /opt/python/${abi_tag}
 }

 function build_cpython {
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -3,7 +3,7 @@
 set -ex

 NCCL_VERSION=v2.21.5-1
-CUDNN_VERSION=9.5.1.17
+CUDNN_VERSION=9.1.0.70

 function install_cusparselt_040 {
    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
@ -38,19 +38,7 @@ function install_cusparselt_062 {
    rm -rf tmp_cusparselt
 }

-function install_cusparselt_063 {
-    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
-    mkdir tmp_cusparselt && pushd tmp_cusparselt
-    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz
-    tar xf libcusparse_lt-linux-x86_64-0.6.3.2-archive.tar.xz
-    cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/include/* /usr/local/cuda/include/
-    cp -a libcusparse_lt-linux-x86_64-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/
-    popd
-    rm -rf tmp_cusparselt
-}
-
 function install_118 {
-    CUDNN_VERSION=9.1.0.70
    echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
    rm -rf /usr/local/cuda-11.8 /usr/local/cuda
    # install CUDA 11.8.0 in the same container
@ -117,7 +105,6 @@ function install_121 {
 }

 function install_124 {
-  CUDNN_VERSION=9.1.0.70
  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
  # install CUDA 12.4.1 in the same container
@ -150,39 +137,6 @@ function install_124 {
  ldconfig
 }

-function install_126 {
-  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
-  rm -rf /usr/local/cuda-12.6 /usr/local/cuda
-  # install CUDA 12.6.3 in the same container
-  wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run
-  chmod +x cuda_12.6.3_560.35.05_linux.run
-  ./cuda_12.6.3_560.35.05_linux.run --toolkit --silent
-  rm -f cuda_12.6.3_560.35.05_linux.run
-  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
-
-  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-  mkdir tmp_cudnn && cd tmp_cudnn
-  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  tar xf cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
-  cp -a cudnn-linux-x86_64-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
-  cd ..
-  rm -rf tmp_cudnn
-
-  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
-  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
-  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
-  cd nccl && make -j src.build
-  cp -a build/include/* /usr/local/cuda/include/
-  cp -a build/lib/* /usr/local/cuda/lib64/
-  cd ..
-  rm -rf nccl
-
-  install_cusparselt_063
-
-  ldconfig
-}
-
 function prune_118 {
    echo "Pruning CUDA 11.8 and cuDNN"
    #####################################################################################
@ -273,46 +227,12 @@ function prune_124 {
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a

  #####################################################################################
-  # CUDA 12.4 prune visual tools
+  # CUDA 12.1 prune visual tools
  #####################################################################################
  export CUDA_BASE="/usr/local/cuda-12.4/"
  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
 }

-function prune_126 {
-  echo "Pruning CUDA 12.6"
-  #####################################################################################
-  # CUDA 12.6 prune static libs
-  #####################################################################################
-  export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
-  export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
-
-  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
-  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
-
-  if [[ -n "$OVERRIDE_GENCODE" ]]; then
-      export GENCODE=$OVERRIDE_GENCODE
-  fi
-  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
-      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
-  fi
-
-  # all CUDA libs except CuDNN and CuBLAS
-  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
-      | xargs -I {} bash -c \
-                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
-
-  # prune CuDNN and CuBLAS
-  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
-  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
-
-  #####################################################################################
-  # CUDA 12.6 prune visual tools
-  #####################################################################################
-  export CUDA_BASE="/usr/local/cuda-12.6/"
-  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
-}
-
 # idiomatic parameter and option handling in sh
 while test $# -gt 0
 do
@ -323,8 +243,6 @@ do
        ;;
    12.4) install_124; prune_124
        ;;
-    12.6) install_126; prune_126
-        ;;
    *) echo "bad argument $1"; exit 1
        ;;
    esac
--- a/.ci/docker/common/install_cuda_aarch64.sh
+++ b/.ci/docker/common/install_cuda_aarch64.sh
@ -4,7 +4,6 @@
 set -ex

 NCCL_VERSION=v2.21.5-1
-CUDNN_VERSION=9.5.1.17

 function install_cusparselt_062 {
    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
@ -17,20 +16,8 @@ function install_cusparselt_062 {
    rm -rf tmp_cusparselt
 }

-function install_cusparselt_063 {
-    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
-    mkdir tmp_cusparselt && pushd tmp_cusparselt
-    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.6.3.2-archive.tar.xz
-    tar xf libcusparse_lt-linux-sbsa-0.6.3.2-archive.tar.xz
-    cp -a libcusparse_lt-linux-sbsa-0.6.3.2-archive/include/* /usr/local/cuda/include/
-    cp -a libcusparse_lt-linux-sbsa-0.6.3.2-archive/lib/* /usr/local/cuda/lib64/
-    popd
-    rm -rf tmp_cusparselt
-}
-
 function install_124 {
-  CUDNN_VERSION=9.1.0.70
-  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
+  echo "Installing CUDA 12.4.1 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
  # install CUDA 12.4.1 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux_sbsa.run
@ -41,10 +28,10 @@ function install_124 {

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  mkdir tmp_cudnn && cd tmp_cudnn
-  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
-  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz -O cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
+  tar xf cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
+  cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/lib/* /usr/local/cuda/lib64/
  cd ..
  rm -rf tmp_cudnn

@ -87,87 +74,18 @@ function prune_124 {
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a

  #####################################################################################
-  # CUDA 12.4 prune visual tools
+  # CUDA 12.1 prune visual tools
  #####################################################################################
  export CUDA_BASE="/usr/local/cuda-12.4/"
  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
 }

-function install_126 {
-  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
-  rm -rf /usr/local/cuda-12.6 /usr/local/cuda
-  # install CUDA 12.6.3 in the same container
-  wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux_sbsa.run
-  chmod +x cuda_12.6.3_560.35.05_linux_sbsa.run
-  ./cuda_12.6.3_560.35.05_linux_sbsa.run --toolkit --silent
-  rm -f cuda_12.6.3_560.35.05_linux_sbsa.run
-  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
-
-  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
-  mkdir tmp_cudnn && cd tmp_cudnn
-  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
-  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
-  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
-  cd ..
-  rm -rf tmp_cudnn
-
-  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
-  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
-  git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
-  cd nccl && make -j src.build
-  cp -a build/include/* /usr/local/cuda/include/
-  cp -a build/lib/* /usr/local/cuda/lib64/
-  cd ..
-  rm -rf nccl
-
-  install_cusparselt_063
-
-  ldconfig
-}
-
-function prune_126 {
-  echo "Pruning CUDA 12.6"
-  #####################################################################################
-  # CUDA 12.6 prune static libs
-  #####################################################################################
-  export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
-  export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
-
-  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
-  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
-
-  if [[ -n "$OVERRIDE_GENCODE" ]]; then
-      export GENCODE=$OVERRIDE_GENCODE
-  fi
-  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
-      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
-  fi
-
-  # all CUDA libs except CuDNN and CuBLAS
-  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
-      | xargs -I {} bash -c \
-                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
-
-  # prune CuDNN and CuBLAS
-  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
-  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
-
-  #####################################################################################
-  # CUDA 12.6 prune visual tools
-  #####################################################################################
-  export CUDA_BASE="/usr/local/cuda-12.6/"
-  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
-}
-
 # idiomatic parameter and option handling in sh
 while test $# -gt 0
 do
    case "$1" in
    12.4) install_124; prune_124
        ;;
-    12.6) install_126; prune_126
-        ;;
    *) echo "bad argument $1"; exit 1
        ;;
    esac
--- a/.ci/docker/common/install_cudnn.sh
+++ b/.ci/docker/common/install_cudnn.sh
@ -4,9 +4,7 @@ if [[ -n "${CUDNN_VERSION}" ]]; then
    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
    mkdir tmp_cudnn
    pushd tmp_cudnn
-    if [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
-        CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive"
-    elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
+    if [[ ${CUDA_VERSION:0:2} == "12" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
--- a/.ci/docker/common/install_executorch.sh
+++ b/.ci/docker/common/install_executorch.sh
@ -36,19 +36,25 @@ install_conda_dependencies() {
 }

 install_pip_dependencies() {
-  pushd executorch
-  as_jenkins bash install_requirements.sh --pybind xnnpack
+  pushd executorch/.ci/docker
+  # Install PyTorch CPU build beforehand to avoid installing the much bigger CUDA
+  # binaries later, ExecuTorch only needs CPU
+  pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+  # Install all Python dependencies
+  pip_install -r requirements-ci.txt
  popd
 }

 setup_executorch() {
  pushd executorch
+  # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
+  as_jenkins bash .ci/scripts/setup-vulkan-linux-deps.sh

  export PYTHON_EXECUTABLE=python
  export EXECUTORCH_BUILD_PYBIND=ON
  export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

-  as_jenkins .ci/scripts/setup-linux.sh cmake || true
+  as_jenkins .ci/scripts/setup-linux.sh cmake
  popd
 }

--- a/.ci/docker/common/install_inductor_benchmark_deps.sh
+++ b/.ci/docker/common/install_inductor_benchmark_deps.sh
@ -7,20 +7,14 @@ source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
 function install_huggingface() {
  local version
  commit=$(get_pinned_commit huggingface)
+  pip_install pandas==2.0.3
  pip_install "git+https://github.com/huggingface/transformers@${commit}"
 }

 function install_timm() {
  local commit
  commit=$(get_pinned_commit timm)
-
-  # TODO (huydhn): There is no torchvision release on 3.13 when I write this, so
-  # I'm using nightly here instead. We just need to package to be able to install
-  # TIMM. Removing this once vision has a release on 3.13
-  if [[ "${ANACONDA_PYTHON_VERSION}" == "3.13" ]]; then
-    pip_install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
-  fi
-
+  pip_install pandas==2.0.3
  pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
  # Clean up
  conda_run pip uninstall -y cmake torch torchvision triton
--- a/.ci/docker/common/install_magma.sh
+++ b/.ci/docker/common/install_magma.sh
@ -3,6 +3,8 @@

 set -eou pipefail

+MAGMA_VERSION="2.5.2"
+
 function do_install() {
    cuda_version=$1
    cuda_version_nodot=${1/./}
@ -15,7 +17,7 @@ function do_install() {
        set -x
        tmp_dir=$(mktemp -d)
        pushd ${tmp_dir}
-        curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
+        curl -OLs https://anaconda.org/pytorch/magma-cuda${cuda_version_nodot}/${MAGMA_VERSION}/download/linux-64/${magma_archive}
        tar -xvf "${magma_archive}"
        mkdir -p "${cuda_dir}/magma"
        mv include "${cuda_dir}/magma/include"
--- a/.ci/docker/common/install_magma_conda.sh
+++ b/.ci/docker/common/install_magma_conda.sh
@ -1,26 +0,0 @@
-#!/usr/bin/env bash
-# Script that replaces the magma install from a conda package
-
-set -eou pipefail
-
-function do_install() {
-    cuda_version_nodot=${1/./}
-    anaconda_python_version=$2
-
-    MAGMA_VERSION="2.6.1"
-    magma_archive="magma-cuda${cuda_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
-
-    anaconda_dir="/opt/conda/envs/py_${anaconda_python_version}"
-    (
-        set -x
-        tmp_dir=$(mktemp -d)
-        pushd ${tmp_dir}
-        curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
-        tar -xvf "${magma_archive}"
-        mv include/* "${anaconda_dir}/include/"
-        mv lib/* "${anaconda_dir}/lib"
-        popd
-    )
-}
-
-do_install $1 $2
--- a/.ci/docker/common/install_miopen.sh
+++ b/.ci/docker/common/install_miopen.sh
@ -16,7 +16,7 @@ case "$ID" in
  ubuntu)
    IS_UBUNTU=1
    ;;
-  centos|almalinux)
+  centos)
    IS_UBUNTU=0
    ;;
  *)
@ -43,6 +43,12 @@ else
 fi
 ROCM_INT=$(($ROCM_VERSION_MAJOR * 10000 + $ROCM_VERSION_MINOR * 100 + $ROCM_VERSION_PATCH))

+# Install custom MIOpen + COMgr for ROCm >= 4.0.1
+if [[ $ROCM_INT -lt 40001 ]]; then
+    echo "ROCm version < 4.0.1; will not install custom MIOpen"
+    exit 0
+fi
+
 # Function to retry functions that sometimes timeout or have flaky failures
 retry () {
    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
@ -60,27 +66,55 @@ else
    ROCM_INSTALL_PATH="/opt/rocm-${ROCM_VERSION}"
 fi

+# MIOPEN_USE_HIP_KERNELS is a Workaround for COMgr issues
 MIOPEN_CMAKE_COMMON_FLAGS="
 -DMIOPEN_USE_COMGR=ON
 -DMIOPEN_BUILD_DRIVER=OFF
 "
-if [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60204 ]]; then
-    MIOPEN_BRANCH="release/rocm-rel-6.2-staging"
-else
-    echo "ROCm ${ROCM_VERSION} does not need any patches, do not build from source"
+# Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version
+if [[ $ROCM_INT -ge 60300 ]]; then
+    echo "ROCm 6.3+ MIOpen does not need any patches, do not build from source"
    exit 0
+elif [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60300 ]]; then
+    MIOPEN_BRANCH="release/rocm-rel-6.2-staging"
+elif [[ $ROCM_INT -ge 60100 ]] && [[ $ROCM_INT -lt 60200 ]]; then
+    echo "ROCm 6.1 MIOpen does not need any patches, do not build from source"
+    exit 0
+elif [[ $ROCM_INT -ge 60000 ]] && [[ $ROCM_INT -lt 60100 ]]; then
+    echo "ROCm 6.0 MIOpen does not need any patches, do not build from source"
+    exit 0
+elif [[ $ROCM_INT -ge 50700 ]] && [[ $ROCM_INT -lt 60000 ]]; then
+    echo "ROCm 5.7 MIOpen does not need any patches, do not build from source"
+    exit 0
+elif [[ $ROCM_INT -ge 50600 ]] && [[ $ROCM_INT -lt 50700 ]]; then
+    MIOPEN_BRANCH="release/rocm-rel-5.6-staging"
+elif [[ $ROCM_INT -ge 50500 ]] && [[ $ROCM_INT -lt 50600 ]]; then
+    MIOPEN_BRANCH="release/rocm-rel-5.5-gfx11"
+elif [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
+    MIOPEN_BRANCH="release/rocm-rel-5.4-staging"
+elif [[ $ROCM_INT -ge 50300 ]] && [[ $ROCM_INT -lt 50400 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
+    MIOPEN_BRANCH="release/rocm-rel-5.3-staging"
+elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50300 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36 -DMIOPEN_USE_MLIR=Off"
+    MIOPEN_BRANCH="release/rocm-rel-5.2-staging"
+elif [[ $ROCM_INT -ge 50100 ]] && [[ $ROCM_INT -lt 50200 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
+    MIOPEN_BRANCH="release/rocm-rel-5.1-staging"
+elif [[ $ROCM_INT -ge 50000 ]] && [[ $ROCM_INT -lt 50100 ]]; then
+    MIOPEN_CMAKE_DB_FLAGS="-DMIOPEN_EMBED_DB=gfx900_56;gfx906_60;gfx90878;gfx90a6e;gfx1030_36"
+    MIOPEN_BRANCH="release/rocm-rel-5.0-staging"
+else
+    echo "Unhandled ROCM_VERSION ${ROCM_VERSION}"
+    exit 1
 fi


 if [[ ${IS_UBUNTU} == 1 ]]; then
  apt-get remove -y miopen-hip
 else
-  # Workaround since almalinux manylinux image already has this and cget doesn't like that
-  rm -rf /usr/local/lib/pkgconfig/sqlite3.pc
-
-  # Versioned package name needs regex match
-  # Use --noautoremove to prevent other rocm packages from being uninstalled
-  yum remove -y miopen-hip* --noautoremove
+  yum remove -y miopen-hip
 fi

 git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH}
@ -88,7 +122,16 @@ pushd MIOpen
 # remove .git to save disk space since CI runner was running out
 rm -rf .git
 # Don't build CK to save docker build time
-sed -i '/composable_kernel/d' requirements.txt
+if [[ $ROCM_INT -ge 60200 ]]; then
+    sed -i '/composable_kernel/d' requirements.txt
+fi
+# Don't build MLIR to save docker build time
+# since we are disabling MLIR backend for MIOpen anyway
+if [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
+    sed -i '/rocMLIR/d' requirements.txt
+elif [[ $ROCM_INT -ge 50200 ]] && [[ $ROCM_INT -lt 50400 ]]; then
+    sed -i '/llvm-project-mlir/d' requirements.txt
+fi
 ## MIOpen minimum requirements
 cmake -P install_deps.cmake --minimum

@ -110,7 +153,7 @@ cd build
 PKG_CONFIG_PATH=/usr/local/lib/pkgconfig CXX=${ROCM_INSTALL_PATH}/llvm/bin/clang++ cmake .. \
    ${MIOPEN_CMAKE_COMMON_FLAGS} \
    ${MIOPEN_CMAKE_DB_FLAGS} \
-    -DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}"
+    -DCMAKE_PREFIX_PATH="${ROCM_INSTALL_PATH}/hip;${ROCM_INSTALL_PATH}"
 make MIOpen -j $(nproc)

 # Build MIOpen package
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -32,7 +32,7 @@ pip_install coloredlogs packaging

 pip_install onnxruntime==1.18.1
 pip_install onnx==1.16.2
-pip_install onnxscript==0.1.0.dev20241124 --no-deps
+pip_install onnxscript==0.1.0.dev20240831 --no-deps
 # required by onnxscript
 pip_install ml_dtypes

--- a/.ci/docker/common/install_openblas.sh
+++ b/.ci/docker/common/install_openblas.sh
@ -4,7 +4,7 @@
 set -ex

 cd /
-git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.28 --depth 1 --shallow-submodules
+git clone https://github.com/OpenMathLib/OpenBLAS.git -b v0.3.25 --depth 1 --shallow-submodules


 OPENBLAS_BUILD_FLAGS="
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -62,22 +62,6 @@ install_ubuntu() {
        sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
    done

-    # ROCm 6.3 had a regression where initializing static code objects had significant overhead
-    if [[ $(ver $ROCM_VERSION) -eq $(ver 6.3) ]]; then
-        # clr build needs CppHeaderParser but can only find it using conda's python
-        /opt/conda/bin/python -m pip install CppHeaderParser
-        git clone https://github.com/ROCm/HIP -b rocm-6.3.x
-        HIP_COMMON_DIR=$(readlink -f HIP)
-        git clone https://github.com/jeffdaily/clr -b release/rocm-rel-6.3-statco-hotfix
-        mkdir -p clr/build
-        pushd clr/build
-        cmake .. -DCLR_BUILD_HIP=ON -DHIP_COMMON_DIR=$HIP_COMMON_DIR
-        make -j
-        cp hipamd/lib/libamdhip64.so.6.3.* /opt/rocm/lib/libamdhip64.so.6.3.*
-        popd
-        rm -rf HIP clr
-    fi
-
    # Cleanup
    apt-get autoclean && apt-get clean
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
--- a/.ci/docker/common/install_rocm_drm.sh
+++ b/.ci/docker/common/install_rocm_drm.sh
@ -12,7 +12,7 @@ case "$ID" in
    apt-get install -y libpciaccess-dev pkg-config
    apt-get clean
    ;;
-  centos|almalinux)
+  centos)
    yum install -y libpciaccess-devel pkgconfig
    ;;
  *)
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -3,18 +3,6 @@

 set -ex

-# Magma build scripts need `python`
-ln -sf /usr/bin/python3 /usr/bin/python
-
-ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
-case "$ID" in
-  almalinux)
-    yum install -y gcc-gfortran
-    ;;
-  *)
-    echo "No preinstalls to build magma..."
-    ;;
-esac

 MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}

--- a/.ci/docker/common/install_user.sh
+++ b/.ci/docker/common/install_user.sh
@ -2,13 +2,6 @@

 set -ex

-# Since version 24 the system ships with user 'ubuntu' that has id 1000
-# We need a work-around to enable id 1000 usage for this script
-if [[ $UBUNTU_VERSION == 24.04 ]]; then
-    # touch is used to disable harmless error message
-    touch /var/mail/ubuntu && chown ubuntu /var/mail/ubuntu && userdel -r ubuntu
-fi
-
 # Mirror jenkins user in container
 # jenkins user as ec2-user should have the same user-id
 echo "jenkins:x:1000:1000::/var/lib/jenkins:" >> /etc/passwd
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -24,10 +24,10 @@ function install_ubuntu() {
        | tee /etc/apt/sources.list.d/intel-gpu-${VERSION_CODENAME}.list
    # To add the online network network package repository for the Intel Support Packages
    wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
-        | gpg --dearmor > /usr/share/keyrings/oneapi-archive-keyring.gpg.gpg
-    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg.gpg] \
-        https://apt.repos.intel.com/${XPU_REPO_NAME} all main" \
-        | tee /etc/apt/sources.list.d/oneAPI.list
+        | gpg --dearmor > /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
+    echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
+        https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
+        | tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list

    # Update the packages list and repository index
    apt-get update
@ -41,13 +41,14 @@ function install_ubuntu() {
        libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
        libglapi-mesa libgles2-mesa-dev libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
        mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
-    if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
-        apt-get install -y intel-ocloc
-    fi
    # Development Packages
    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
    # Install Intel Support Packages
-    apt-get install -y ${XPU_PACKAGES}
+    if [ -n "$XPU_VERSION" ]; then
+        apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION} intel-pti-dev
+    else
+        apt-get install -y intel-for-pytorch-gpu-dev intel-pti-dev
+    fi

    # Cleanup
    apt-get autoclean && apt-get clean
@ -57,13 +58,13 @@ function install_ubuntu() {
 function install_rhel() {
    . /etc/os-release
    if [[ "${ID}" == "rhel" ]]; then
-        if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
+        if [[ ! " 8.6 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
            echo "RHEL version ${VERSION_ID} not supported"
            exit
        fi
    elif [[ "${ID}" == "almalinux" ]]; then
        # Workaround for almalinux8 which used by quay.io/pypa/manylinux_2_28_x86_64
-        VERSION_ID="8.8"
+        VERSION_ID="8.6"
    fi

    dnf install -y 'dnf-command(config-manager)'
@ -71,18 +72,16 @@ function install_rhel() {
    dnf config-manager --add-repo \
        https://repositories.intel.com/gpu/rhel/${VERSION_ID}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_ID}.repo
    # To add the online network package repository for the Intel Support Packages
-    tee > /etc/yum.repos.d/oneAPI.repo << EOF
-[oneAPI]
+    tee > /etc/yum.repos.d/intel-for-pytorch-gpu-dev.repo << EOF
+[intel-for-pytorch-gpu-dev]
 name=Intel for Pytorch GPU dev repository
-baseurl=https://yum.repos.intel.com/${XPU_REPO_NAME}
+baseurl=https://yum.repos.intel.com/intel-for-pytorch-gpu-dev
 enabled=1
 gpgcheck=1
 repo_gpgcheck=1
 gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
 EOF

-    # Install Intel Support Packages
-    yum install -y ${XPU_PACKAGES}
    # The xpu-smi packages
    dnf install -y xpu-smi
    # Compute and Media Runtimes
@ -97,6 +96,8 @@ EOF
    dnf install -y --refresh \
        intel-igc-opencl-devel level-zero-devel intel-gsc-devel libmetee-devel \
        level-zero-devel
+    # Install Intel Support Packages
+    yum install -y intel-for-pytorch-gpu-dev intel-pti-dev

    # Cleanup
    dnf clean all
@ -118,7 +119,7 @@ function install_sles() {
        https://repositories.intel.com/gpu/sles/${VERSION_SP}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_SP}.repo
    rpm --import https://repositories.intel.com/gpu/intel-graphics.key
    # To add the online network package repository for the Intel Support Packages
-    zypper addrepo https://yum.repos.intel.com/${XPU_REPO_NAME} oneAPI
+    zypper addrepo https://yum.repos.intel.com/intel-for-pytorch-gpu-dev intel-for-pytorch-gpu-dev
    rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB

    # The xpu-smi packages
@ -130,7 +131,7 @@ function install_sles() {
    zypper install -y libigdfcl-devel intel-igc-cm libigfxcmrt-devel level-zero-devel

    # Install Intel Support Packages
-    zypper install -y ${XPU_PACKAGES}
+    zypper install -y intel-for-pytorch-gpu-dev intel-pti-dev

 }

@ -141,13 +142,6 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
    XPU_DRIVER_VERSION=""
 fi

-XPU_REPO_NAME="intel-for-pytorch-gpu-dev"
-XPU_PACKAGES="intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9"
-if [[ "$XPU_VERSION" == "2025.0" ]]; then
-    XPU_REPO_NAME="oneapi"
-    XPU_PACKAGES="intel-deep-learning-essentials-2025.0"
-fi
-
 # The installation depends on the base OS
 ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
 case "$ID" in
--- a/.ci/docker/almalinux/Dockerfile
+++ b/.ci/docker/almalinux/Dockerfile
@ -1,39 +1,47 @@
-ARG CUDA_VERSION=12.4
+ARG CUDA_VERSION=10.2
 ARG BASE_TARGET=cuda${CUDA_VERSION}
-FROM amd64/almalinux:8 as base
+FROM centos:7 as base

 ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8

-ARG DEVTOOLSET_VERSION=11
-
-ENV LC_ALL en_US.UTF-8
-ENV LANG en_US.UTF-8
-ENV LANGUAGE en_US.UTF-8
-
-RUN yum -y update
-RUN yum -y install epel-release
-RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
+ARG DEVTOOLSET_VERSION=9
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum update -y
+RUN yum install -y wget curl perl util-linux xz bzip2 git patch which unzip
 # Just add everything as a safe.directory for git since these will be used in multiple places with git
 RUN git config --global --add safe.directory '*'
-ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+RUN yum install -y yum-utils centos-release-scl
+RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
+RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
+RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
+RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
+# EPEL for cmake
+RUN yum --enablerepo=extras install -y epel-release

-# cmake-3.18.4 from pip
-RUN yum install -y python3-pip && \
-    python3 -mpip install cmake==3.18.4 && \
-    ln -s /usr/local/bin/cmake /usr/bin/cmake3
+# cmake
+RUN yum install -y cmake3 && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
+ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+
+RUN yum install -y autoconf aclocal automake make sudo
 RUN rm -rf /usr/local/cuda-*

-FROM base as openssl
-ADD ./common/install_openssl.sh install_openssl.sh
-RUN bash ./install_openssl.sh && rm install_openssl.sh
-
 FROM base as patchelf
 # Install patchelf
 ADD ./common/install_patchelf.sh install_patchelf.sh
 RUN bash ./install_patchelf.sh && rm install_patchelf.sh && cp $(which patchelf) /patchelf

+FROM base as openssl
+# Install openssl
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
 FROM base as conda
 # Install Anaconda
 ADD ./common/install_conda_docker.sh install_conda.sh
@ -41,7 +49,7 @@ RUN bash ./install_conda.sh && rm install_conda.sh

 # Install CUDA
 FROM base as cuda
-ARG CUDA_VERSION=12.4
+ARG CUDA_VERSION=10.2
 RUN rm -rf /usr/local/cuda-*
 ADD ./common/install_cuda.sh install_cuda.sh
 ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
@ -62,10 +70,6 @@ FROM cuda as cuda12.4
 RUN bash ./install_cuda.sh 12.4
 ENV DESIRED_CUDA=12.4

-FROM cuda as cuda12.6
-RUN bash ./install_cuda.sh 12.6
-ENV DESIRED_CUDA=12.6
-
 # Install MNIST test data
 FROM base as mnist
 ADD ./common/install_mnist.sh install_mnist.sh
@ -75,7 +79,6 @@ FROM base as all_cuda
 COPY --from=cuda11.8  /usr/local/cuda-11.8 /usr/local/cuda-11.8
 COPY --from=cuda12.1  /usr/local/cuda-12.1 /usr/local/cuda-12.1
 COPY --from=cuda12.4  /usr/local/cuda-12.4 /usr/local/cuda-12.4
-COPY --from=cuda12.6  /usr/local/cuda-12.6 /usr/local/cuda-12.6

 # Final step
 FROM ${BASE_TARGET} as final
@ -88,8 +91,7 @@ COPY ./common/install_jni.sh install_jni.sh
 COPY ./java/jni.h jni.h
 RUN bash ./install_jni.sh && rm install_jni.sh

-ENV PATH /opt/conda/bin:$PATH
-ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+ENV  PATH /opt/conda/bin:$PATH
 COPY --from=mnist  /usr/local/mnist /usr/local/mnist
 RUN rm -rf /usr/local/cuda
 RUN chmod o+rw /usr/local
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -48,10 +48,10 @@ esac
    --progress plain \
    --build-arg "BASE_TARGET=${BASE_TARGET}" \
    --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
-    --build-arg "DEVTOOLSET_VERSION=11" \
+    --build-arg "DEVTOOLSET_VERSION=9" \
    -t ${DOCKER_IMAGE_NAME} \
    $@ \
-    -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
+    -f "${TOPDIR}/.ci/docker/conda/Dockerfile" \
    ${TOPDIR}/.ci/docker/
 )

--- a/.ci/docker/libtorch/Dockerfile
+++ b/.ci/docker/libtorch/Dockerfile
@ -66,11 +66,6 @@ RUN bash ./install_cuda.sh 12.4
 RUN bash ./install_magma.sh 12.4
 RUN ln -sf /usr/local/cuda-12.4 /usr/local/cuda

-FROM cuda as cuda12.6
-RUN bash ./install_cuda.sh 12.6
-RUN bash ./install_magma.sh 12.6
-RUN ln -sf /usr/local/cuda-12.6 /usr/local/cuda
-
 FROM cpu as rocm
 ARG PYTORCH_ROCM_ARCH
 ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
@ -92,6 +87,13 @@ RUN apt-get update -y && \
 RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
 RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh

+# Install AOTriton
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./aotriton_version.txt aotriton_version.txt
+COPY ./common/install_aotriton.sh install_aotriton.sh
+RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt
+ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
+
 FROM ${BASE_TARGET} as final
 COPY --from=openssl            /opt/openssl           /opt/openssl
 # Install patchelf
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -39,7 +39,17 @@ case ${GPU_ARCH_TYPE} in
        BASE_TARGET=rocm
        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
        GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100;gfx1101;gfx942"
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100"
+        ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)"
+        if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then
+            ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0}))
+        else
+            echo "ERROR: rocm regex failed"
+            exit 1
+        fi
+        if [[ $ROCM_VERSION_INT -ge 60000 ]]; then
+            PYTORCH_ROCM_ARCH+=";gfx942"
+        fi
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
        ;;
    *)
--- a/.ci/docker/linter-cuda/Dockerfile
+++ b/.ci/docker/linter-cuda/Dockerfile
@ -25,8 +25,7 @@ ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ./common/install_magma_conda.sh install_magma_conda.sh
-RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt

 # Install cuda and cudnn
 ARG CUDA_VERSION
--- a/.ci/docker/manywheel/Dockerfile
+++ b/.ci/docker/manywheel/Dockerfile
@ -144,10 +144,6 @@ COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/
 FROM common as cpu_final
 ARG BASE_CUDA_VERSION=10.1
 ARG DEVTOOLSET_VERSION=9
-# Install Anaconda
-ADD ./common/install_conda_docker.sh install_conda.sh
-RUN bash ./install_conda.sh && rm install_conda.sh
-ENV PATH /opt/conda/bin:$PATH
 RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
 RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
 RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
@ -198,3 +194,10 @@ ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
 RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
 ADD ./common/install_miopen.sh install_miopen.sh
 RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
+
+# Install AOTriton
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./aotriton_version.txt aotriton_version.txt
+COPY ./common/install_aotriton.sh install_aotriton.sh
+RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt
+ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
--- a/.ci/docker/manywheel/Dockerfile_2_28
+++ b/.ci/docker/manywheel/Dockerfile_2_28
@ -1,4 +1,5 @@
 # syntax = docker/dockerfile:experimental
+ARG ROCM_VERSION=3.7
 ARG BASE_CUDA_VERSION=11.8
 ARG GPU_IMAGE=amd64/almalinux:8
 FROM quay.io/pypa/manylinux_2_28_x86_64 as base
@ -116,49 +117,30 @@ COPY --from=jni                /usr/local/include/jni.h              /usr/local/
 FROM common as cpu_final
 ARG BASE_CUDA_VERSION=11.8
 ARG DEVTOOLSET_VERSION=11
-# Install Anaconda
-ADD ./common/install_conda_docker.sh install_conda.sh
-RUN bash ./install_conda.sh && rm install_conda.sh
-ENV PATH /opt/conda/bin:$PATH
 # Ensure the expected devtoolset is used
 ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
-# Install setuptools and wheel for python 3.12/3.13
-RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \
-    /opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \
-    done;

-
-# cmake-3.18.4 from pip; force in case cmake3 already exists
+# cmake-3.18.4 from pip
 RUN yum install -y python3-pip && \
    python3 -mpip install cmake==3.18.4 && \
-    ln -sf /usr/local/bin/cmake /usr/bin/cmake3
+    ln -s /usr/local/bin/cmake /usr/bin/cmake3

 FROM cpu_final as cuda_final
 RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
 COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
 COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
-RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
-ENV PATH=/usr/local/cuda/bin:$PATH

-FROM cpu_final as rocm_final
-ARG ROCM_VERSION=6.0
-ARG PYTORCH_ROCM_ARCH
-ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
-ARG DEVTOOLSET_VERSION=11
-ENV LDFLAGS="-Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64 -Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib"
-# Somewhere in ROCm stack, we still use non-existing /opt/rocm/hip path,
-# below workaround helps avoid error
-ENV ROCM_PATH /opt/rocm
-# cmake-3.28.4 from pip to get enable_language(HIP)
-# and avoid 3.21.0 cmake+ninja issues with ninja inserting "-Wl,--no-as-needed" in LINK_FLAGS for static linker
-RUN python3 -m pip install --upgrade pip && \
-    python3 -mpip install cmake==3.28.4
-ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
-RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
-ENV MKLROOT /opt/intel
-ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
-RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
+FROM common as rocm_final
+ARG ROCM_VERSION=3.7
+# Install ROCm
+ADD ./common/install_rocm.sh install_rocm.sh
+RUN bash ./install_rocm.sh ${ROCM_VERSION} && rm install_rocm.sh
+# cmake is already installed inside the rocm base image, but both 2 and 3 exist
+# cmake3 is needed for the later MIOpen custom build, so that step is last.
+RUN yum install -y cmake3 && \
+    rm -f /usr/bin/cmake && \
+    ln -s /usr/bin/cmake3 /usr/bin/cmake
 ADD ./common/install_miopen.sh install_miopen.sh
 RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh

@ -168,7 +150,8 @@ ENV XPU_DRIVER_TYPE ROLLING
 # cmake-3.28.4 from pip
 RUN python3 -m pip install --upgrade pip && \
    python3 -mpip install cmake==3.28.4
+# Install setuptools and wheel for python 3.13
+RUN /opt/python/cp313-cp313/bin/python -m pip install setuptools wheel
 ADD ./common/install_xpu.sh install_xpu.sh
-ENV XPU_VERSION 2025.0
 RUN bash ./install_xpu.sh && rm install_xpu.sh
 RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd
--- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
@ -48,11 +48,6 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/op
 # For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
 RUN git config --global --add safe.directory "*"

-FROM base as openblas
-# Install openblas
-ADD ./common/install_openblas.sh install_openblas.sh
-RUN bash ./install_openblas.sh && rm install_openblas.sh
-
 FROM base as final

 # remove unnecessary python versions
@ -60,5 +55,3 @@ RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
-COPY --from=openblas     /opt/OpenBLAS/  /opt/OpenBLAS/
-ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
--- a/.ci/docker/manywheel/Dockerfile_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_aarch64
@ -61,7 +61,7 @@ RUN git config --global --add safe.directory "*"
 # NOTE: Need a better way to get this library as Ubuntu's package can be removed by the vendor, or changed
 ###############################################################################
 RUN cd ~/ \
-  && curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-4ubuntu2_arm64.deb \
+  && curl -L -o ~/libgfortran-10-dev.deb http://ports.ubuntu.com/ubuntu-ports/pool/universe/g/gcc-10/libgfortran-10-dev_10.5.0-1ubuntu1_arm64.deb \
  && ar x ~/libgfortran-10-dev.deb \
  && tar --use-compress-program=unzstd -xvf data.tar.zst -C ~/ \
  && cp -f ~/usr/lib/gcc/aarch64-linux-gnu/10/libgfortran.a /opt/rh/devtoolset-10/root/usr/lib/gcc/aarch64-redhat-linux/10/
--- a/.ci/docker/manywheel/Dockerfile_s390x
+++ b/.ci/docker/manywheel/Dockerfile_s390x
@ -1,20 +1,17 @@
-FROM quay.io/pypa/manylinux_2_28_s390x as base
+FROM --platform=linux/s390x docker.io/ubuntu:24.04 as base

 # Language variables
 ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
 ENV LANGUAGE=C.UTF-8

-ARG DEVTOOLSET_VERSION=13
 # Installed needed OS packages. This is to support all
 # the binary builds (torch, vision, audio, text, data)
-RUN yum -y install epel-release
-RUN yum -y update
-RUN yum install -y \
-  sudo \
+RUN apt update ; apt upgrade -y
+RUN apt install -y \
+  build-essential \
  autoconf \
  automake \
-  bison \
  bzip2 \
  curl \
  diffutils \
@ -27,40 +24,19 @@ RUN yum install -y \
  util-linux \
  wget \
  which \
-  xz \
-  yasm \
+  xz-utils \
  less \
  zstd \
-  libgomp \
-  gcc-toolset-${DEVTOOLSET_VERSION}-gcc \
-  gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \
-  gcc-toolset-${DEVTOOLSET_VERSION}-binutils \
-  gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
  cmake \
-  rust \
-  cargo \
-  llvm-devel \
-  libzstd-devel \
-  python3.12-devel \
-  python3.12-setuptools \
-  python3.12-pip \
-  python3-virtualenv \
-  python3.12-pyyaml \
-  python3.12-numpy \
-  python3.12-wheel \
-  python3.12-cryptography \
-  blas-devel \
-  openblas-devel \
-  lapack-devel \
-  atlas-devel \
-  libjpeg-devel \
-  libxslt-devel \
-  libxml2-devel \
-  openssl-devel \
-  valgrind
-
-ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
-ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+  python3 \
+  python3-dev \
+  python3-setuptools \
+  python3-yaml \
+  python3-typing-extensions \
+  libblas-dev \
+  libopenblas-dev \
+  liblapack-dev \
+  libatlas-base-dev

 # git236+ would refuse to run git commands in repos owned by other users
 # Which causes version check to fail, as pytorch repo is bind-mounted into the image
@ -68,8 +44,14 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op
 # For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
 RUN git config --global --add safe.directory "*"

-# installed python doesn't have development parts. Rebuild it from scratch
-RUN /bin/rm -rf /opt/_internal /opt/python /usr/local/*/*
+FROM base as openssl
+# Install openssl (this must precede `build python` step)
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem

 # EPEL for cmake
 FROM base as patchelf
@ -82,43 +64,10 @@ FROM patchelf as python
 # build python
 COPY manywheel/build_scripts /build_scripts
 ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh
-ENV SSL_CERT_FILE=
 RUN bash build_scripts/build.sh && rm -r build_scripts

-FROM base as final
+FROM openssl as final
 COPY --from=python             /opt/python                           /opt/python
 COPY --from=python             /opt/_internal                        /opt/_internal
-COPY --from=python             /opt/python/cp39-cp39/bin/auditwheel  /usr/local/bin/auditwheel
+COPY --from=python             /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel
 COPY --from=patchelf           /usr/local/bin/patchelf               /usr/local/bin/patchelf
-
-RUN alternatives --set python /usr/bin/python3.12
-RUN alternatives --set python3 /usr/bin/python3.12
-
-RUN pip-3.12 install typing_extensions
-
-ENTRYPOINT []
-CMD ["/bin/bash"]
-
-# install test dependencies:
-# - grpcio requires system openssl, bundled crypto fails to build
-# - ml_dtypes 0.4.0 requires some fixes provided in later commits to build
-RUN dnf install -y \
-  protobuf-devel \
-  protobuf-c-devel \
-  protobuf-lite-devel \
-  wget \
-  patch
-
-RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio==1.65.4
-RUN cd ~ && \
-  git clone https://github.com/jax-ml/ml_dtypes && \
-  cd ml_dtypes && \
-  git checkout v0.4.0 && \
-  git submodule update --init --recursive && \
-  wget https://github.com/jax-ml/ml_dtypes/commit/b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
-  wget https://github.com/jax-ml/ml_dtypes/commit/d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
-  patch -p1 < b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
-  patch -p1 < d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
-  python3 setup.py bdist_wheel && \
-  pip3 install dist/*.whl && \
-  rm -rf ml_dtypes
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -61,7 +61,7 @@ case ${GPU_ARCH_TYPE} in
    cpu-s390x)
        TARGET=final
        DOCKER_TAG=cpu-s390x
-        GPU_IMAGE=s390x/almalinux:8
+        GPU_IMAGE=redhat/ubi9
        DOCKER_GPU_BUILD_ARG=""
        MANY_LINUX_VERSION="s390x"
        ;;
@ -87,18 +87,22 @@ case ${GPU_ARCH_TYPE} in
        MANY_LINUX_VERSION="aarch64"
        DOCKERFILE_SUFFIX="_cuda_aarch64"
        ;;
-    rocm|rocm-manylinux_2_28)
+    rocm)
        TARGET=rocm_final
        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
        GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete
-        DEVTOOLSET_VERSION="9"
-        if [ ${GPU_ARCH_TYPE} == "rocm-manylinux_2_28" ]; then
-            MANY_LINUX_VERSION="2_28"
-            DEVTOOLSET_VERSION="11"
-            GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
+        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1100"
+        ROCM_REGEX="([0-9]+)\.([0-9]+)[\.]?([0-9]*)"
+        if [[ $GPU_ARCH_VERSION =~ $ROCM_REGEX ]]; then
+            ROCM_VERSION_INT=$((${BASH_REMATCH[1]}*10000 + ${BASH_REMATCH[2]}*100 + ${BASH_REMATCH[3]:-0}))
+        else
+            echo "ERROR: rocm regex failed"
+            exit 1
        fi
-        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101"
-        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
+        if [[ $ROCM_VERSION_INT -ge 60000 ]]; then
+            PYTORCH_ROCM_ARCH+=";gfx942"
+        fi
+        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=9"
        ;;
    xpu)
        TARGET=xpu_final
@ -121,13 +125,11 @@ fi
 (
    set -x

-    if [ "$(uname -m)" != "s390x" ]; then
-        # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
-        # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
-        sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
-        sudo systemctl daemon-reload
-        sudo systemctl restart docker
-    fi
+    # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
+    # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
+    sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
+    sudo systemctl daemon-reload
+    sudo systemctl restart docker

    DOCKER_BUILDKIT=1 docker build  \
        ${DOCKER_GPU_BUILD_ARG} \
--- a/.ci/docker/manywheel/build_scripts/build.sh
+++ b/.ci/docker/manywheel/build_scripts/build.sh
@ -16,27 +16,37 @@ CURL_HASH=cf34fe0b07b800f1c01a499a6e8b2af548f6d0e044dca4a29d88a4bee146d131
 AUTOCONF_ROOT=autoconf-2.69
 AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969

-# Dependencies for compiling Python that we want to remove from
-# the final image after compiling Python
-PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel"
-
-if [ "$(uname -m)" != "s390x" ] ; then
-    PYTHON_COMPILE_DEPS="${PYTHON_COMPILE_DEPS} db4-devel"
-else
-    PYTHON_COMPILE_DEPS="${PYTHON_COMPILE_DEPS} libdb-devel"
-fi
-
-# Libraries that are allowed as part of the manylinux1 profile
-MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel  mesa-libGL-devel libICE-devel libSM-devel ncurses-devel"
-
 # Get build utilities
 MY_DIR=$(dirname "${BASH_SOURCE[0]}")
 source $MY_DIR/build_utils.sh

-# Development tools and libraries
-yum -y install bzip2 make git patch unzip bison yasm diffutils \
-    automake which file \
-    ${PYTHON_COMPILE_DEPS}
+if [ "$(uname -m)" != "s390x" ] ; then
+    # Dependencies for compiling Python that we want to remove from
+    # the final image after compiling Python
+    PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel"
+
+    # Libraries that are allowed as part of the manylinux1 profile
+    MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel  mesa-libGL-devel libICE-devel libSM-devel ncurses-devel"
+
+    # Development tools and libraries
+    yum -y install bzip2 make git patch unzip bison yasm diffutils \
+        automake which file cmake28 \
+        kernel-devel-`uname -r` \
+        ${PYTHON_COMPILE_DEPS}
+else
+    # Dependencies for compiling Python that we want to remove from
+    # the final image after compiling Python
+    PYTHON_COMPILE_DEPS="zlib1g-dev libbz2-dev libncurses-dev libsqlite3-dev libdb-dev libpcap-dev liblzma-dev libffi-dev"
+
+    # Libraries that are allowed as part of the manylinux1 profile
+    MANYLINUX1_DEPS="libglib2.0-dev libX11-dev libncurses-dev"
+
+    # Development tools and libraries
+    apt install -y bzip2 make git patch unzip diffutils \
+        automake which file cmake \
+        linux-headers-virtual \
+        ${PYTHON_COMPILE_DEPS}
+fi

 # Install newest autoconf
 build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH
@ -82,13 +92,16 @@ ln -s $PY39_BIN/auditwheel /usr/local/bin/auditwheel

 # Clean up development headers and other unnecessary stuff for
 # final image
-yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
-    avahi freetype bitstream-vera-fonts \
-    ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1
-yum -y install ${MANYLINUX1_DEPS}
-yum -y clean all > /dev/null 2>&1
-yum list installed
-
+if [ "$(uname -m)" != "s390x" ] ; then
+    yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
+        avahi freetype bitstream-vera-fonts \
+        ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1
+    yum -y install ${MANYLINUX1_DEPS}
+    yum -y clean all > /dev/null 2>&1
+    yum list installed
+else
+    apt purge -y ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1
+fi
 # we don't need libpython*.a, and they're many megabytes
 find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f
 # Strip what we can -- and ignore errors, because this just attempts to strip
--- a/.ci/docker/manywheel/build_scripts/ssl-check.py
+++ b/.ci/docker/manywheel/build_scripts/ssl-check.py
@ -1,12 +1,10 @@
 # cf. https://github.com/pypa/manylinux/issues/53

-import sys
-from urllib.request import urlopen
-
-
 GOOD_SSL = "https://google.com"
 BAD_SSL = "https://self-signed.badssl.com"

+import sys
+

 print("Testing SSL certificate checking for Python:", sys.version)

@ -14,8 +12,14 @@ if sys.version_info[:2] < (2, 7) or sys.version_info[:2] < (3, 4):
    print("This version never checks SSL certs; skipping tests")
    sys.exit(0)

+if sys.version_info[0] >= 3:
+    from urllib.request import urlopen

-EXC = OSError
+    EXC = OSError
+else:
+    from urllib import urlopen
+
+    EXC = IOError

 print(f"Connecting to {GOOD_SSL} should work")
 urlopen(GOOD_SSL)
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -5,7 +5,7 @@
 #Pinned versions: 1.6
 #test that import:

-boto3==1.35.42
+boto3==1.19.12
 #Description: AWS SDK for python
 #Pinned versions: 1.19.12, 1.16.34
 #test that import:
@ -30,13 +30,13 @@ dill==0.3.7
 #Pinned versions: 0.3.7
 #test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py

-expecttest==0.3.0
+expecttest==0.2.1
 #Description: method for writing tests where test framework auto populates
 # the expected output based on previous runs
-#Pinned versions: 0.3.0
+#Pinned versions: 0.2.1
 #test that import:

-fbscribelogger==0.1.7
+fbscribelogger==0.1.6
 #Description: write to scribe from authenticated jobs on CI
 #Pinned versions: 0.1.6
 #test that import:
@ -90,7 +90,7 @@ librosa>=0.6.2 ; python_version < "3.11"
 #Pinned versions:
 #test that import:

-mypy==1.13.0
+mypy==1.11.2
 # Pin MyPy version because new errors are likely to appear with each release
 #Description: linter
 #Pinned versions: 1.10.0
@ -118,7 +118,7 @@ numba==0.55.2 ; python_version == "3.10"

 #numpy
 #Description: Provides N-dimensional arrays and linear algebra
-#Pinned versions: 1.26.2
+#Pinned versions: 1.20
 #test that import: test_view_ops.py, test_unary_ufuncs.py, test_type_promotion.py,
 #test_type_info.py, test_torch.py, test_tensorexpr_pybind.py, test_tensorexpr.py,
 #test_tensorboard.py, test_tensor_creation_ops.py, test_static_runtime.py,
@ -128,12 +128,6 @@ numba==0.55.2 ; python_version == "3.10"
 #test_nn.py, test_namedtensor.py, test_linalg.py, test_jit_cuda_fuser.py,
 #test_jit.py, test_indexing.py, test_datapipe.py, test_dataloader.py,
 #test_binary_ufuncs.py
-numpy==1.22.4; python_version == "3.9" or python_version == "3.10"
-numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
-numpy==2.1.2; python_version >= "3.13"
-
-pandas==2.0.3; python_version < "3.13"
-pandas==2.2.3; python_version >= "3.13"

 #onnxruntime
 #Description: scoring engine for Open Neural Network Exchange (ONNX) models
@ -145,9 +139,9 @@ opt-einsum==3.3
 #Pinned versions: 3.3
 #test that import: test_linalg.py

-optree==0.13.0
+optree==0.12.1
 #Description: A library for tree manipulation
-#Pinned versions: 0.13.0
+#Pinned versions: 0.12.1
 #test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
 #test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py,
 #common_utils.py, test_eager_transforms.py, test_python_dispatch.py,
@ -158,7 +152,7 @@ optree==0.13.0
 #test_pointwise_ops.py, test_dtensor_ops.py, test_torchinductor.py, test_fx.py,
 #test_fake_tensor.py, test_mps.py

-pillow==11.0.0
+pillow==10.3.0
 #Description:  Python Imaging Library fork
 #Pinned versions: 10.3.0
 #test that import:
@ -193,11 +187,6 @@ pytest-rerunfailures>=10.3
 #Pinned versions:
 #test that import:

-pytest-subtests==0.13.1
-#Description: plugin for subtest support
-#Pinned versions:
-#test that import:
-
 #pytest-benchmark
 #Description: fixture for benchmarking code
 #Pinned versions: 3.2.3
@ -245,7 +234,7 @@ scikit-image==0.22.0 ; python_version >= "3.10"
 #test that import:

 scipy==1.10.1 ; python_version <= "3.11"
-scipy==1.14.1 ; python_version >= "3.12"
+scipy==1.12.0 ; python_version == "3.12"
 # Pin SciPy because of failing distribution tests (see #60347)
 #Description: scientific python
 #Pinned versions: 1.10.1
@ -264,7 +253,7 @@ tb-nightly==2.13.0a20230426
 #test that import:

 # needed by torchgen utils
-typing-extensions>=4.10.0
+typing-extensions
 #Description: type hints for python
 #Pinned versions:
 #test that import:
@ -280,21 +269,26 @@ unittest-xml-reporting<=3.2.0,>=2.0.0
 #test that import:

 #lintrunner is supported on aarch64-linux only from 0.12.4 version
-lintrunner==0.12.7
+lintrunner==0.12.5
 #Description: all about linters!
-#Pinned versions: 0.12.7
+#Pinned versions: 0.12.5
 #test that import:

 redis>=4.0.0
 #Description: redis database
 #test that import: anything that tests OSS caching/mocking (inductor/test_codecache.py, inductor/test_max_autotune.py)

+rockset==1.0.3
+#Description: queries Rockset
+#Pinned versions: 1.0.3
+#test that import:
+
 ghstack==0.8.0
 #Description: ghstack tool
 #Pinned versions: 0.8.0
 #test that import:

-jinja2==3.1.5
+jinja2==3.1.4
 #Description: jinja2 template engine
 #Pinned versions: 3.1.4
 #test that import:
@ -304,37 +298,37 @@ pytest-cpp==2.3.0
 #Pinned versions: 2.3.0
 #test that import:

-z3-solver==4.12.6.0
+z3-solver==4.12.2.0
 #Description: The Z3 Theorem Prover Project
 #Pinned versions:
 #test that import:

-tensorboard==2.13.0 ; python_version < "3.13"
-tensorboard==2.18.0 ; python_version >= "3.13"
+tensorboard==2.13.0
 #Description: Also included in .ci/docker/requirements-docs.txt
 #Pinned versions:
 #test that import: test_tensorboard

 pywavelets==1.4.1 ; python_version < "3.12"
-pywavelets==1.7.0 ; python_version >= "3.12"
+pywavelets==1.5.0 ; python_version >= "3.12"
 #Description: This is a requirement of scikit-image, we need to pin
 # it here because 1.5.0 conflicts with numpy 1.21.2 used in CI
 #Pinned versions: 1.4.1
 #test that import:

-lxml==5.3.0
+lxml==5.0.0
 #Description: This is a requirement of unittest-xml-reporting

 # Python-3.9 binaries

 PyGithub==2.3.0

+sympy==1.12.1 ; python_version == "3.8"
 sympy==1.13.1 ; python_version >= "3.9"
 #Description: Required by coremltools, also pinned in .github/requirements/pip-requirements-macOS.txt
 #Pinned versions:
 #test that import:

-onnx==1.17.0
+onnx==1.16.1
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
@ -348,26 +342,3 @@ parameterized==0.8.1
 #Description: Parameterizes unittests, both the tests themselves and the entire testing class
 #Pinned versions:
 #test that import:
-
-#Description: required for testing torch/distributed/_tools/sac_estimator.py
-#Pinned versions: 1.24.0
-#test that import: test_sac_estimator.py
-
-pwlf==2.2.1 ; python_version >= "3.8"
-#Description: required for testing torch/distributed/_tools/sac_estimator.py
-#Pinned versions: 2.2.1
-#test that import: test_sac_estimator.py
-
-
-# To build PyTorch itself
-astunparse
-PyYAML
-setuptools
-
-ninja==1.11.1 ; platform_machine == "aarch64"
-scons==4.5.2 ; platform_machine == "aarch64"
-
-pulp==2.9.0 ; python_version >= "3.8"
-#Description: required for testing ILP formulation under torch/distributed/_tools
-#Pinned versions: 2.9.0
-#test that import: test_sac_ilp.py
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -14,8 +14,7 @@ matplotlib==3.5.3
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 3.5.3

-tensorboard==2.13.0 ; python_version < "3.13"
-tensorboard==2.18.0 ; python_version >= "3.13"
+tensorboard==2.13.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 2.13.0

--- a/.ci/docker/triton_version.txt
+++ b/.ci/docker/triton_version.txt
@ -1 +1 @@
-3.2.0
+3.1.0
--- a/.ci/docker/ubuntu-cuda/Dockerfile
+++ b/.ci/docker/ubuntu-cuda/Dockerfile
@ -30,8 +30,7 @@ ARG CONDA_CMAKE
 COPY requirements-ci.txt /opt/conda/requirements-ci.txt
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ./common/install_magma_conda.sh install_magma_conda.sh
-RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt

 # Install gcc
 ARG GCC_VERSION
@ -81,8 +80,6 @@ RUN bash ./install_openssl.sh
 ENV OPENSSL_DIR /opt/openssl

 ARG INDUCTOR_BENCHMARKS
-ARG ANACONDA_PYTHON_VERSION
-ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface.txt huggingface.txt
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@ -107,11 +107,12 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt

-# This is needed by sccache
-COPY ./common/install_openssl.sh install_openssl.sh
-ENV OPENSSL_ROOT_DIR /opt/openssl
-RUN bash ./install_openssl.sh
-ENV OPENSSL_DIR /opt/openssl
+# Install AOTriton
+COPY ./aotriton_version.txt aotriton_version.txt
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./common/install_aotriton.sh install_aotriton.sh
+RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
+ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton

 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -36,8 +36,7 @@ ENV DOCS=$DOCS
 COPY requirements-ci.txt requirements-docs.txt /opt/conda/
 COPY ./common/install_conda.sh install_conda.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ./common/install_magma_conda.sh install_magma_conda.sh
-RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt
 RUN if [ -n "${UNINSTALL_DILL}" ]; then pip uninstall -y dill; fi

 # Install gcc
@ -88,6 +87,19 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
 RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
 ENV INSTALLED_VISION ${VISION}

+# (optional) Install Android NDK
+ARG ANDROID
+ARG ANDROID_NDK
+ARG GRADLE_VERSION
+COPY ./common/install_android.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
+COPY ./android/AndroidManifest.xml AndroidManifest.xml
+COPY ./android/build.gradle build.gradle
+RUN if [ -n "${ANDROID}" ]; then bash ./install_android.sh; fi
+RUN rm install_android.sh cache_vision_models.sh common_utils.sh
+RUN rm AndroidManifest.xml
+RUN rm build.gradle
+ENV INSTALLED_ANDROID ${ANDROID}
+
 # (optional) Install Vulkan SDK
 ARG VULKAN_SDK_VERSION
 COPY ./common/install_vulkan_sdk.sh install_vulkan_sdk.sh
--- a/.ci/libtorch/build.sh
+++ b/.ci/libtorch/build.sh
@ -1,10 +0,0 @@
-#!/usr/bin/env bash
-
-# This is mostly just a shim to manywheel/build.sh
-# TODO: Make this a dedicated script to build just libtorch
-
-set -ex
-
-SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
--- a/.ci/magma/.gitignore
+++ b/.ci/magma/.gitignore
@ -1,2 +0,0 @@
-output/
-magma-cuda*/
--- a/.ci/magma/Makefile
+++ b/.ci/magma/Makefile
@ -1,48 +0,0 @@
-SHELL=/usr/bin/env bash
-
-DOCKER_CMD ?= docker
-DESIRED_CUDA ?= 11.8
-DESIRED_CUDA_SHORT = $(subst .,,$(DESIRED_CUDA))
-PACKAGE_NAME = magma-cuda
-CUDA_ARCH_LIST ?= -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90
-
-DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
-	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
-	-w /builder \
-	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \
-	-e DESIRED_CUDA=${DESIRED_CUDA} \
-	-e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \
-	"pytorch/manylinux-builder:cuda${DESIRED_CUDA}-main" \
-	magma/build_magma.sh
-
-.PHONY: all
-all: magma-cuda126
-all: magma-cuda124
-all: magma-cuda121
-all: magma-cuda118
-
-.PHONY:
-clean:
-	$(RM) -r magma-*
-	$(RM) -r output
-
-.PHONY: magma-cuda126
-magma-cuda126: DESIRED_CUDA := 12.6
-magma-cuda126:
-	$(DOCKER_RUN)
-
-.PHONY: magma-cuda124
-magma-cuda124: DESIRED_CUDA := 12.4
-magma-cuda124:
-	$(DOCKER_RUN)
-
-.PHONY: magma-cuda121
-magma-cuda121: DESIRED_CUDA := 12.1
-magma-cuda121:
-	$(DOCKER_RUN)
-
-.PHONY: magma-cuda118
-magma-cuda118: DESIRED_CUDA := 11.8
-magma-cuda118: CUDA_ARCH_LIST += -gencode arch=compute_37,code=sm_37
-magma-cuda118:
-	$(DOCKER_RUN)
--- a/.ci/magma/README.md
+++ b/.ci/magma/README.md
@ -1,50 +0,0 @@
-# Magma
-
-This folder contains the scripts and configurations to build magma, statically linked for various versions of CUDA.
-
-## Building
-
-Look in the `Makefile` for available targets to build. To build any target, for example `magma-cuda118`, run
-
-```
-# Using `docker`
-make magma-cuda118
-
-# Using `podman`
-DOCKER_CMD=podman make magma-cuda118
-```
-
-This spawns a `pytorch/manylinux-builder:cuda<version>-main` docker image, which has the required `devtoolset` and CUDA versions installed.
-Within the docker image, it runs `build_magma.sh` with the correct environment variables set, which packages the necessary files
-into a tarball, with the following structure:
-
-```
-.
-├── include       # header files
-├── lib           # libmagma.a
-├── info
-│   ├── licenses  # license file
-│   └── recipe    # build script and patches
-```
-
-More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the CUDA version.
-Outputted binaries should be in the `output` folder.
-
-
-## Pushing
-
-Packages can be uploaded to an S3 bucket using:
-
-```
-aws s3 cp output/*/magma-cuda*.bz2 <bucket-with-path>
-```
-
-If you do not have upload permissions, please ping @seemethere or @soumith to gain access
-
-## New versions
-
-New CUDA versions can be added by creating a new make target with the next desired version. For CUDA version NN.n, the target should be named `magma-cudaNNn`.
-
-Make sure to edit the appropriate environment variables (e.g., DESIRED_CUDA, CUDA_ARCH_LIST) in the `Makefile` accordingly. Remember also to check `build_magma.sh` to ensure the logic for copying over the files remains correct.
-
-New patches can be added by editing `Makefile` and `build_magma.sh` the same way `getrf_nbparam.patch` is implemented.
--- a/.ci/magma/build_magma.sh
+++ b/.ci/magma/build_magma.sh
@ -1,50 +0,0 @@
-#!/usr/bin/env bash
-
-set -eou pipefail
-
-# Environment variables
-# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
-ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-MAGMA_VERSION=2.6.1
-
-# Folders for the build
-PACKAGE_FILES=${ROOT_DIR}/magma/package_files # source patches and metadata
-PACKAGE_DIR=${ROOT_DIR}/magma/${PACKAGE_NAME} # build workspace
-PACKAGE_OUTPUT=${ROOT_DIR}/magma/output # where tarballs are stored
-PACKAGE_BUILD=${PACKAGE_DIR}/build # where the content of the tarball is prepared
-PACKAGE_RECIPE=${PACKAGE_BUILD}/info/recipe
-PACKAGE_LICENSE=${PACKAGE_BUILD}/info/licenses
-mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RECIPE} ${PACKAGE_LICENSE}
-
-# Fetch magma sources and verify checksum
-pushd ${PACKAGE_DIR}
-curl -LO http://icl.utk.edu/projectsfiles/magma/downloads/magma-${MAGMA_VERSION}.tar.gz
-tar zxf magma-${MAGMA_VERSION}.tar.gz
-sha256sum --check < ${PACKAGE_FILES}/magma-${MAGMA_VERSION}.sha256
-popd
-
-# Apply patches and build
-pushd ${PACKAGE_DIR}/magma-${MAGMA_VERSION}
-patch < ${PACKAGE_FILES}/CMake.patch
-patch < ${PACKAGE_FILES}/cmakelists.patch
-patch -p0 < ${PACKAGE_FILES}/thread_queue.patch
-patch -p1 < ${PACKAGE_FILES}/getrf_shfl.patch
-patch -p1 < ${PACKAGE_FILES}/getrf_nbparam.patch
-# The build.sh script expects to be executed from the sources root folder
-INSTALL_DIR=${PACKAGE_BUILD} ${PACKAGE_FILES}/build.sh
-popd
-
-# Package recipe, license and tarball
-# Folder and package name are backward compatible for the build workflow
-cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh
-cp ${PACKAGE_FILES}/thread_queue.patch ${PACKAGE_RECIPE}/thread_queue.patch
-cp ${PACKAGE_FILES}/cmakelists.patch ${PACKAGE_RECIPE}/cmakelists.patch
-cp ${PACKAGE_FILES}/getrf_shfl.patch ${PACKAGE_RECIPE}/getrf_shfl.patch
-cp ${PACKAGE_FILES}/getrf_nbparam.patch ${PACKAGE_RECIPE}/getrf_nbparam.patch
-cp ${PACKAGE_FILES}/CMake.patch ${PACKAGE_RECIPE}/CMake.patch
-cp ${PACKAGE_FILES}/magma-${MAGMA_VERSION}.sha256 ${PACKAGE_RECIPE}/magma-${MAGMA_VERSION}.sha256
-cp ${PACKAGE_DIR}/magma-${MAGMA_VERSION}/COPYRIGHT ${PACKAGE_LICENSE}/COPYRIGHT
-pushd ${PACKAGE_BUILD}
-tar cjf ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2 include lib info
-echo Built in ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2
-popd
--- a/.ci/magma/package_files/CMake.patch
+++ b/.ci/magma/package_files/CMake.patch
@ -1,40 +0,0 @@
--- CMake.src.cuda	2023-03-29 10:05:32.136954140 +0000
-+++ CMake.src.cuda	2023-03-29 10:05:50.281318043 +0000
-@@ -283,10 +283,10 @@
- magmablas/zgeadd.cu
- magmablas/zgeadd2.cu
- magmablas/zgeam.cu
-magmablas/zgemm_fermi.cu
-+#magmablas/zgemm_fermi.cu
- magmablas/zgemm_reduce.cu
- magmablas/zgemv_conj.cu
-magmablas/zgemv_fermi.cu
-+#magmablas/zgemv_fermi.cu
- magmablas/zgerbt.cu
- magmablas/zgerbt_kernels.cu
- magmablas/zgetmatrix_transpose.cpp
-@@ -1009,18 +1009,18 @@
- magmablas/sgeam.cu
- magmablas/dgeam.cu
- magmablas/cgeam.cu
-magmablas/sgemm_fermi.cu
-magmablas/dgemm_fermi.cu
-magmablas/cgemm_fermi.cu
-+#magmablas/sgemm_fermi.cu
-+#magmablas/dgemm_fermi.cu
-+#magmablas/cgemm_fermi.cu
- magmablas/sgemm_reduce.cu
- magmablas/dgemm_reduce.cu
- magmablas/cgemm_reduce.cu
- magmablas/sgemv_conj.cu
- magmablas/dgemv_conj.cu
- magmablas/cgemv_conj.cu
-magmablas/sgemv_fermi.cu
-magmablas/dgemv_fermi.cu
-magmablas/cgemv_fermi.cu
-+#magmablas/sgemv_fermi.cu
-+#magmablas/dgemv_fermi.cu
-+#magmablas/cgemv_fermi.cu
- magmablas/sgerbt.cu
- magmablas/dgerbt.cu
- magmablas/cgerbt.cu
--- a/.ci/magma/package_files/build.sh
+++ b/.ci/magma/package_files/build.sh
@ -1,12 +0,0 @@
-CUDA__VERSION=$(nvcc --version|sed -n 4p|cut -f5 -d" "|cut -f1 -d",")
-if [ "$CUDA__VERSION" != "$DESIRED_CUDA" ]; then
-    echo "CUDA Version is not $DESIRED_CUDA. CUDA Version found: $CUDA__VERSION"
-    exit 1
-fi
-
-mkdir build
-cd build
-cmake .. -DUSE_FORTRAN=OFF -DGPU_TARGET="All" -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" -DCUDA_ARCH_LIST="$CUDA_ARCH_LIST"
-make -j$(getconf _NPROCESSORS_CONF)
-make install
-cd ..
--- a/.ci/magma/package_files/cmakelists.patch
+++ b/.ci/magma/package_files/cmakelists.patch
@ -1,388 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index d5d8d87d..8a507334 100644
--- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -3,7 +3,7 @@ cmake_minimum_required( VERSION 2.8.1 )
- # ----------------------------------------
- # to disable Fortran, set this to "off"
- # see also -DADD_ below
-option( USE_FORTRAN "Fortran is required for some tester checks, but can be disabled with reduced functionality" ON )
-+option( USE_FORTRAN "Fortran is required for some tester checks, but can be disabled with reduced functionality" OFF )
-
- if (USE_FORTRAN)
-     project( MAGMA C CXX Fortran )
-@@ -75,6 +75,8 @@ else()
-     message( WARNING "The compiler ${CMAKE_CXX_COMPILER} doesn't support the -std=c++11 flag. Some code may not compile.")
- endif()
-
-+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++ -fno-exceptions")
-+
- CHECK_C_COMPILER_FLAG("-std=c99" COMPILER_SUPPORTS_C99)
- if (COMPILER_SUPPORTS_C99)
-     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
-@@ -101,15 +103,15 @@ endif()
-
-
- # ----------------------------------------
-# locate OpenMP
-find_package( OpenMP )
-if (OPENMP_FOUND)
-    message( STATUS "Found OpenMP" )
-    message( STATUS "    OpenMP_C_FLAGS   ${OpenMP_C_FLAGS}" )
-    message( STATUS "    OpenMP_CXX_FLAGS ${OpenMP_CXX_FLAGS}" )
-    set( CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}" )
-    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}" )
-endif()
-+# # locate OpenMP
-+# find_package( OpenMP )
-+# if (OPENMP_FOUND)
-+#     message( STATUS "Found OpenMP" )
-+#     message( STATUS "    OpenMP_C_FLAGS   ${OpenMP_C_FLAGS}" )
-+#     message( STATUS "    OpenMP_CXX_FLAGS ${OpenMP_CXX_FLAGS}" )
-+#     set( CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}" )
-+#     set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}" )
-+# endif()
-
- if (MAGMA_ENABLE_CUDA)
-   # ----------------------------------------
-@@ -132,7 +134,7 @@ if (MAGMA_ENABLE_CUDA)
-     set( NV_SM    "" )
-     set( NV_COMP  "" )
-
-    set(CUDA_SEPARABLE_COMPILATION ON)
-+    set(CUDA_SEPARABLE_COMPILATION OFF)
-
-     # nvcc >= 6.5 supports -std=c++11, so propagate CXXFLAGS to NVCCFLAGS.
-     # Older nvcc didn't support -std=c++11, so previously we disabled propagation.
-@@ -294,11 +296,18 @@ if (MAGMA_ENABLE_CUDA)
-         message( STATUS "    compile for CUDA arch 8.0 (Ampere)" )
-     endif()
-
-+    if ( ${GPU_TARGET} MATCHES "All")
-+        set( MIN_ARCH 370)
-+        SET( NV_SM ${CUDA_ARCH_LIST})
-+        SET( NV_COMP "")
-+    endif()
-+
-     if (NOT MIN_ARCH)
-         message( FATAL_ERROR "GPU_TARGET must contain one or more of Fermi, Kepler, Maxwell, Pascal, Volta, Turing, Ampere, or valid sm_[0-9][0-9]" )
-     endif()
-
-    set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC ${NV_SM} ${NV_COMP} ${FORTRAN_CONVENTION} )
-+    set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -DHAVE_CUBLAS -Xfatbin -compress-all -Xcompiler -fPIC -std=c++11 ${NV_SM} ${NV_COMP} ${FORTRAN_CONVENTION} )
-+    MESSAGE(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
-     #add_definitions( "-DMAGMA_HAVE_CUDA -DMAGMA_CUDA_ARCH_MIN=${MIN_ARCH}" )
-     set(MAGMA_HAVE_CUDA "1")
-     set(MAGMA_CUDA_ARCH_MIN "${MIN_ARCH}")
-@@ -413,7 +422,7 @@ set_property(CACHE BLA_VENDOR PROPERTY STRINGS
- set( LAPACK_LIBRARIES "" CACHE STRING "Libraries for LAPACK and BLAS, to manually override search" )
- if (LAPACK_LIBRARIES STREQUAL "")
-     message( STATUS "Searching for BLAS and LAPACK. To override, set LAPACK_LIBRARIES using ccmake." )
-    find_package( LAPACK )
-+    # find_package( LAPACK )
-     # force showing updated LAPACK_LIBRARIES in ccmake / cmake-gui.
-     set( LAPACK_LIBRARIES ${LAPACK_LIBRARIES} CACHE STRING "Libraries for LAPACK and BLAS, to manually override search" FORCE )
- else()
-@@ -552,12 +561,12 @@ if (WIN32)
-     #message( "libmagma_all_f   ${libmagma_all_f}"   )
-
-     # on Windows, Fortran files aren't compiled if listed here...
-    cuda_add_library( magma ${libmagma_all_cpp} )
-+    cuda_add_library( magma STATIC ${libmagma_all_cpp} OPTIONS --compiler-options "-fPIC")
-     target_link_libraries( magma
-         ${LAPACK_LIBRARIES}
-         ${CUDA_CUDART_LIBRARY}
-         ${CUDA_CUBLAS_LIBRARIES}
-        ${CUDA_cusparse_LIBRARY}
-+        # ${CUDA_cusparse_LIBRARY}
-     )
-
-     # no Fortran files at the moment (how to test libmagma_all_f is not empty?),
-@@ -575,13 +584,13 @@ if (WIN32)
- else()
-     # Unix doesn't seem to have a problem with mixing C, CUDA, and Fortran files
-     if (MAGMA_ENABLE_CUDA)
-      cuda_add_library( magma ${libmagma_all} )
-+      cuda_add_library( magma STATIC ${libmagma_all} OPTIONS --compiler-options "-fPIC")
-       target_link_libraries( magma
-         ${blas_fix}
-         ${LAPACK_LIBRARIES}
-         ${CUDA_CUDART_LIBRARY}
-         ${CUDA_CUBLAS_LIBRARIES}
-        ${CUDA_cusparse_LIBRARY}
-+        # ${CUDA_cusparse_LIBRARY}
- 	)
-     else()
-       find_package( hipBLAS )
-@@ -614,138 +623,139 @@ else()
-     endif()
- endif()
- add_custom_target( lib DEPENDS magma )
-
-
-# ----------------------------------------
-# compile lapacktest library
-# If use fortran, compile only Fortran files, not magma_[sdcz]_no_fortran.cpp
-# else,           compile only C++     files, not Fortran files
-if (USE_FORTRAN)
-    foreach( filename ${liblapacktest_all} )
-        if (filename MATCHES "\\.(f|f90|F90)$")
-            list( APPEND liblapacktest_all_f ${filename} )
-        endif()
-    endforeach()
-    add_library( lapacktest ${liblapacktest_all_f} )
-else()
-    # alternatively, use only C/C++/CUDA files, including magma_[sdcz]_no_fortran.cpp
-    foreach( filename ${liblapacktest_all} )
-        if (filename MATCHES "\\.(c|cu|cpp)$")
-            list( APPEND liblapacktest_all_cpp ${filename} )
-        endif()
-    endforeach()
-    add_library( lapacktest ${liblapacktest_all_cpp} )
-endif()
-target_link_libraries( lapacktest
-    ${blas_fix}
-    ${LAPACK_LIBRARIES}
-)
-
-
-# ----------------------------------------
-# compile tester library
-add_library( tester ${libtest_all} )
-target_link_libraries( tester
-    magma
-    lapacktest
-    ${blas_fix}
-    ${LAPACK_LIBRARIES}
-)
-+set_target_properties(magma PROPERTIES POSITION_INDEPENDENT_CODE ON)
-+
-+
-+# # ----------------------------------------
-+# # compile lapacktest library
-+# # If use fortran, compile only Fortran files, not magma_[sdcz]_no_fortran.cpp
-+# # else,           compile only C++     files, not Fortran files
-+# if (USE_FORTRAN)
-+#     foreach( filename ${liblapacktest_all} )
-+#         if (filename MATCHES "\\.(f|f90|F90)$")
-+#             list( APPEND liblapacktest_all_f ${filename} )
-+#         endif()
-+#     endforeach()
-+#     add_library( lapacktest ${liblapacktest_all_f} )
-+# else()
-+#     # alternatively, use only C/C++/CUDA files, including magma_[sdcz]_no_fortran.cpp
-+#     foreach( filename ${liblapacktest_all} )
-+#         if (filename MATCHES "\\.(c|cu|cpp)$")
-+#             list( APPEND liblapacktest_all_cpp ${filename} )
-+#         endif()
-+#     endforeach()
-+#     add_library( lapacktest ${liblapacktest_all_cpp} )
-+# endif()
-+# target_link_libraries( lapacktest
-+#     ${blas_fix}
-+#     ${LAPACK_LIBRARIES}
-+# )
-+
-+
-+# # ----------------------------------------
-+# # compile tester library
-+# add_library( tester ${libtest_all} )
-+# target_link_libraries( tester
-+#     magma
-+#     lapacktest
-+#     ${blas_fix}
-+#     ${LAPACK_LIBRARIES}
-+# )
-
-
- # ----------------------------------------
- # compile MAGMA sparse library
-
- # sparse doesn't have Fortran at the moment, so no need for above shenanigans
-if (MAGMA_ENABLE_CUDA)
-  include_directories( sparse/include )
-  include_directories( sparse/control )
-else()
-  include_directories( sparse_hip/include )
-  include_directories( sparse_hip/control )
-endif()
-include_directories( testing )
-
-if (MAGMA_ENABLE_CUDA)
-  cuda_add_library( magma_sparse ${libsparse_all} )
-  target_link_libraries( magma_sparse
-    magma
-    ${blas_fix}
-    ${LAPACK_LIBRARIES}
-    ${CUDA_CUDART_LIBRARY}
-    ${CUDA_CUBLAS_LIBRARIES}
-    ${CUDA_cusparse_LIBRARY}
-    )
-else()
-  add_library( magma_sparse ${libsparse_all} )
-  target_link_libraries( magma_sparse
-    magma
-    ${blas_fix}
-    ${LAPACK_LIBRARIES}
-    hip::device
-    roc::hipblas
-    roc::hipsparse
-    )
-endif()
-add_custom_target( sparse-lib DEPENDS magma_sparse )
-
-
-# ----------------------------------------
-# compile each tester
-
-# save testers to testing/
-# save tester lib files to testing_lib/ to avoid cluttering lib/
-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY testing )
-set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY testing_lib )
-set( CMAKE_LIBRARY_OUTPUT_DIRECTORY testing_lib )
-
-# skip Fortran testers, which require an extra file from CUDA
-foreach( filename ${testing_all} )
-    if (filename MATCHES "\\.(c|cu|cpp)$")
-        list( APPEND testing_all_cpp ${filename} )
-    endif()
-endforeach()
-foreach( TEST ${testing_all_cpp} )
-    string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} )
-    string( REGEX REPLACE "testing/" "" EXE ${EXE} )
-    #message( "${TEST} --> ${EXE}" )
-    add_executable( ${EXE} ${TEST} )
-    target_link_libraries( ${EXE} tester lapacktest magma )
-    list( APPEND testing ${EXE} )
-endforeach()
-add_custom_target( testing DEPENDS ${testing} )
-
-
-# ----------------------------------------
-# compile each sparse tester
-
-if (MAGMA_ENABLE_CUDA)
-  set(SPARSE_TEST_DIR "sparse/testing")
-else()
-  set(SPARSE_TEST_DIR "sparse_hip/testing")
-endif()
-
-
-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY "${SPARSE_TEST_DIR}" )
-cmake_policy( SET CMP0037 OLD)
-foreach( TEST ${sparse_testing_all} )
-    string( REGEX REPLACE "\\.(cpp|f90|F90)"     "" EXE ${TEST} )
-    string( REGEX REPLACE "${SPARSE_TEST_DIR}/" "" EXE ${EXE} )
-    #message( "${TEST} --> ${EXE}" )
-    add_executable( ${EXE} ${TEST} )
-    target_link_libraries( ${EXE} magma_sparse magma )
-    list( APPEND sparse-testing ${EXE} )
-endforeach()
-add_custom_target( sparse-testing DEPENDS ${sparse-testing} )
-+# if (MAGMA_ENABLE_CUDA)
-+#   include_directories( sparse/include )
-+#   include_directories( sparse/control )
-+# else()
-+#   include_directories( sparse_hip/include )
-+#   include_directories( sparse_hip/control )
-+# endif()
-+# include_directories( testing )
-+
-+# if (MAGMA_ENABLE_CUDA)
-+#   cuda_add_library( magma_sparse ${libsparse_all} )
-+#   target_link_libraries( magma_sparse
-+#     magma
-+#     ${blas_fix}
-+#     ${LAPACK_LIBRARIES}
-+#     ${CUDA_CUDART_LIBRARY}
-+#     ${CUDA_CUBLAS_LIBRARIES}
-+#     ${CUDA_cusparse_LIBRARY}
-+#     )
-+# else()
-+#   add_library( magma_sparse ${libsparse_all} )
-+#   target_link_libraries( magma_sparse
-+#     magma
-+#     ${blas_fix}
-+#     ${LAPACK_LIBRARIES}
-+#     hip::device
-+#     roc::hipblas
-+#     roc::hipsparse
-+#     )
-+# endif()
-+# add_custom_target( sparse-lib DEPENDS magma_sparse )
-+
-+
-+# # ----------------------------------------
-+# # compile each tester
-+
-+# # save testers to testing/
-+# # save tester lib files to testing_lib/ to avoid cluttering lib/
-+# set( CMAKE_RUNTIME_OUTPUT_DIRECTORY testing )
-+# set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY testing_lib )
-+# set( CMAKE_LIBRARY_OUTPUT_DIRECTORY testing_lib )
-+
-+# # skip Fortran testers, which require an extra file from CUDA
-+# foreach( filename ${testing_all} )
-+#     if (filename MATCHES "\\.(c|cu|cpp)$")
-+#         list( APPEND testing_all_cpp ${filename} )
-+#     endif()
-+# endforeach()
-+# foreach( TEST ${testing_all_cpp} )
-+#     string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} )
-+#     string( REGEX REPLACE "testing/" "" EXE ${EXE} )
-+#     #message( "${TEST} --> ${EXE}" )
-+#     add_executable( ${EXE} ${TEST} )
-+#     target_link_libraries( ${EXE} tester lapacktest magma )
-+#     list( APPEND testing ${EXE} )
-+# endforeach()
-+# add_custom_target( testing DEPENDS ${testing} )
-+
-+
-+# # ----------------------------------------
-+# # compile each sparse tester
-+
-+# if (MAGMA_ENABLE_CUDA)
-+#   set(SPARSE_TEST_DIR "sparse/testing")
-+# else()
-+#   set(SPARSE_TEST_DIR "sparse_hip/testing")
-+# endif()
-+
-+
-+# set( CMAKE_RUNTIME_OUTPUT_DIRECTORY "${SPARSE_TEST_DIR}" )
-+# cmake_policy( SET CMP0037 OLD)
-+# foreach( TEST ${sparse_testing_all} )
-+#     string( REGEX REPLACE "\\.(cpp|f90|F90)"     "" EXE ${TEST} )
-+#     string( REGEX REPLACE "${SPARSE_TEST_DIR}/" "" EXE ${EXE} )
-+#     #message( "${TEST} --> ${EXE}" )
-+#     add_executable( ${EXE} ${TEST} )
-+#     target_link_libraries( ${EXE} magma_sparse magma )
-+#     list( APPEND sparse-testing ${EXE} )
-+# endforeach()
-+# add_custom_target( sparse-testing DEPENDS ${sparse-testing} )
-
-
- # ----------------------------------------
- # what to install
-install( TARGETS magma magma_sparse ${blas_fix}
-+install( TARGETS magma ${blas_fix}
-          RUNTIME DESTINATION bin
-          LIBRARY DESTINATION lib
-          ARCHIVE DESTINATION lib )
-file( GLOB headers include/*.h sparse/include/*.h "${CMAKE_BINARY_DIR}/include/*.h" )
-+file( GLOB headers include/*.h "${CMAKE_BINARY_DIR}/include/*.h" )
- if (USE_FORTRAN)
-     install( FILES ${headers} ${modules}
-              DESTINATION include )
-@@ -769,9 +779,9 @@ else()
-     "${blas_fix_lib} ${LAPACK_LIBS} hip::device roc::hipblas roc::hipsparse" )
- endif()
- set( MAGMA_REQUIRED "" )
-configure_file( "${pkgconfig}.in" "${pkgconfig}" @ONLY )
-install( FILES "${CMAKE_BINARY_DIR}/${pkgconfig}"
-         DESTINATION lib/pkgconfig )
-+# configure_file( "${pkgconfig}.in" "${pkgconfig}" @ONLY )
-+# install( FILES "${CMAKE_BINARY_DIR}/${pkgconfig}"
-+#          DESTINATION lib/pkgconfig )
-
- # ----------------------------------------
- get_directory_property( compile_definitions COMPILE_DEFINITIONS )
--- a/.ci/magma/package_files/getrf_nbparam.patch
+++ b/.ci/magma/package_files/getrf_nbparam.patch
@ -1,40 +0,0 @@
-diff --git a/control/get_batched_crossover.cpp b/control/get_batched_crossover.cpp
-index 4ec57306..912f8608 100644
--- a/control/get_batched_crossover.cpp
-+++ b/control/get_batched_crossover.cpp
-@@ -119,7 +119,7 @@ void magma_get_spotrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
- void magma_get_zgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
- {
-     *nb    = 64;
-    *recnb = 32;
-+    *recnb = 16;
-     return;
- }
- 
-@@ -127,7 +127,7 @@ void magma_get_zgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
- void magma_get_cgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
- {
-     *nb    = 128;
-    *recnb =  32;
-+    *recnb =  16;
-     return;
- }
- 
-@@ -135,7 +135,7 @@ void magma_get_cgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
- void magma_get_dgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
- {
-     *nb    = 128;
-    *recnb =  32;
-+    *recnb =  16;
-     return;
- }
- 
-@@ -143,7 +143,7 @@ void magma_get_dgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
- void magma_get_sgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
- {
-     *nb    = 128;
-    *recnb =  32;
-+    *recnb =  16;
-     return;
- }
- 
--- a/.ci/magma/package_files/getrf_shfl.patch
+++ b/.ci/magma/package_files/getrf_shfl.patch
@ -1,15 +0,0 @@
-diff --git a/src/zgetrf_batched.cpp b/src/zgetrf_batched.cpp
-index 24a65a90..884d9352 100644
--- a/src/zgetrf_batched.cpp
-+++ b/src/zgetrf_batched.cpp
-@@ -116,7 +116,9 @@ magma_zgetrf_batched(
-             return magma_zgetrf_batched_smallsq_noshfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
-         }
-         else{
-            return magma_zgetrf_batched_smallsq_shfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
-+            // magma_cgetrf_batched_smallsq_shfl is broken, therefore let's call noshfl version for arch < 700
-+            // return magma_zgetrf_batched_smallsq_shfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
-+            return magma_zgetrf_batched_smallsq_noshfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
-         }
-         #else
-         return magma_zgetrf_batched_smallsq_noshfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
--- a/.ci/magma/package_files/magma-2.6.1.sha256
+++ b/.ci/magma/package_files/magma-2.6.1.sha256
@ -1 +0,0 @@
-6cd83808c6e8bc7a44028e05112b3ab4e579bcc73202ed14733f66661127e213  magma-2.6.1.tar.gz
--- a/.ci/magma/package_files/thread_queue.patch
+++ b/.ci/magma/package_files/thread_queue.patch
@ -1,20 +0,0 @@
--- control/thread_queue.cpp	2016-08-30 06:37:49.000000000 -0700
-+++ control/thread_queue.cpp	2016-10-10 19:47:28.911580965 -0700
-@@ -15,7 +15,7 @@
- {
-     if ( err != 0 ) {
-         fprintf( stderr, "Error: %s (%d)\n", strerror(err), err );
-        throw std::exception();
-+        // throw std::exception();
-     }
- }
- 
-@@ -172,7 +172,7 @@
-     check( pthread_mutex_lock( &mutex ));
-     if ( quit_flag ) {
-         fprintf( stderr, "Error: push_task() called after quit()\n" );
-        throw std::exception();
-+        // throw std::exception();
-     }
-     q.push( task );
-     ntask += 1;
--- a/.ci/manywheel/LICENSE
+++ b/.ci/manywheel/LICENSE
@ -1,21 +0,0 @@
-The MIT License (MIT)
-
-Copyright (c) 2016 manylinux
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
--- a/.ci/manywheel/build.sh
+++ b/.ci/manywheel/build.sh
@ -1,28 +0,0 @@
-#!/usr/bin/env bash
-
-set -ex
-
-SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
-
-case "${GPU_ARCH_TYPE:-BLANK}" in
-    BLANK)
-        # Legacy behavior for CircleCI
-        bash "${SCRIPTPATH}/build_cuda.sh"
-        ;;
-    cuda)
-        bash "${SCRIPTPATH}/build_cuda.sh"
-        ;;
-    rocm)
-        bash "${SCRIPTPATH}/build_rocm.sh"
-        ;;
-    cpu | cpu-cxx11-abi | cpu-s390x)
-        bash "${SCRIPTPATH}/build_cpu.sh"
-        ;;
-    xpu)
-        bash "${SCRIPTPATH}/build_xpu.sh"
-        ;;
-    *)
-        echo "Un-recognized GPU_ARCH_TYPE '${GPU_ARCH_TYPE}', exiting..."
-        exit 1
-        ;;
-esac
--- a/.ci/manywheel/build_common.sh
+++ b/.ci/manywheel/build_common.sh
@ -1,498 +0,0 @@
-#!/usr/bin/env bash
-# meant to be called only from the neighboring build.sh and build_cpu.sh scripts
-
-set -ex
-SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
-
-source ${SOURCE_DIR}/set_desired_python.sh
-
-
-if [[ -n "$BUILD_PYTHONLESS" && -z "$LIBTORCH_VARIANT" ]]; then
-    echo "BUILD_PYTHONLESS is set, so need LIBTORCH_VARIANT to also be set"
-    echo "LIBTORCH_VARIANT should be one of shared-with-deps shared-without-deps static-with-deps static-without-deps"
-    exit 1
-fi
-
-# Function to retry functions that sometimes timeout or have flaky failures
-retry () {
-    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
-}
-
-PLATFORM="manylinux2014_x86_64"
-# TODO move this into the Docker images
-OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
-if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
-    retry yum install -q -y zip openssl
-elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
-    retry yum install -q -y zip openssl
-    PLATFORM="manylinux_2_28_x86_64"
-elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
-    retry dnf install -q -y zip openssl
-elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
-    # TODO: Remove this once nvidia package repos are back online
-    # Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968
-    # shellcheck disable=SC2046
-    sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list")
-
-    retry apt-get update
-    retry apt-get -y install zip openssl
-fi
-
-# We use the package name to test the package by passing this to 'pip install'
-# This is the env variable that setup.py uses to name the package. Note that
-# pip 'normalizes' the name first by changing all - to _
-if [[ -z "$TORCH_PACKAGE_NAME" ]]; then
-    TORCH_PACKAGE_NAME='torch'
-fi
-
-if [[ -z "$TORCH_NO_PYTHON_PACKAGE_NAME" ]]; then
-    TORCH_NO_PYTHON_PACKAGE_NAME='torch_no_python'
-fi
-
-TORCH_PACKAGE_NAME="$(echo $TORCH_PACKAGE_NAME | tr '-' '_')"
-TORCH_NO_PYTHON_PACKAGE_NAME="$(echo $TORCH_NO_PYTHON_PACKAGE_NAME | tr '-' '_')"
-echo "Expecting the built wheels to all be called '$TORCH_PACKAGE_NAME' or '$TORCH_NO_PYTHON_PACKAGE_NAME'"
-
-# Version: setup.py uses $PYTORCH_BUILD_VERSION.post$PYTORCH_BUILD_NUMBER if
-# PYTORCH_BUILD_NUMBER > 1
-build_version="$PYTORCH_BUILD_VERSION"
-build_number="$PYTORCH_BUILD_NUMBER"
-if [[ -n "$OVERRIDE_PACKAGE_VERSION" ]]; then
-    # This will be the *exact* version, since build_number<1
-    build_version="$OVERRIDE_PACKAGE_VERSION"
-    build_number=0
-fi
-if [[ -z "$build_version" ]]; then
-    build_version=1.0.0
-fi
-if [[ -z "$build_number" ]]; then
-    build_number=1
-fi
-export PYTORCH_BUILD_VERSION=$build_version
-export PYTORCH_BUILD_NUMBER=$build_number
-
-export CMAKE_LIBRARY_PATH="/opt/intel/lib:/lib:$CMAKE_LIBRARY_PATH"
-export CMAKE_INCLUDE_PATH="/opt/intel/include:$CMAKE_INCLUDE_PATH"
-
-if [[ -e /opt/openssl ]]; then
-    export OPENSSL_ROOT_DIR=/opt/openssl
-    export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH
-fi
-
-
-
-mkdir -p /tmp/$WHEELHOUSE_DIR
-
-export PATCHELF_BIN=/usr/local/bin/patchelf
-patchelf_version=$($PATCHELF_BIN --version)
-echo "patchelf version: " $patchelf_version
-if [[ "$patchelf_version" == "patchelf 0.9" ]]; then
-    echo "Your patchelf version is too old. Please use version >= 0.10."
-    exit 1
-fi
-
-########################################################
-# Compile wheels as well as libtorch
-#######################################################
-if [[ -z "$PYTORCH_ROOT" ]]; then
-    echo "Need to set PYTORCH_ROOT env variable"
-    exit 1
-fi
-pushd "$PYTORCH_ROOT"
-python setup.py clean
-retry pip install -qr requirements.txt
-case ${DESIRED_PYTHON} in
-  cp31*)
-    retry pip install -q --pre numpy==2.1.0
-    ;;
-  # Should catch 3.9+
-  *)
-    retry pip install -q --pre numpy==2.0.2
-    ;;
-esac
-
-if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
-    export _GLIBCXX_USE_CXX11_ABI=1
-else
-    export _GLIBCXX_USE_CXX11_ABI=0
-fi
-
-if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
-    echo "Calling build_amd.py at $(date)"
-    python tools/amd_build/build_amd.py
-fi
-
-# This value comes from binary_linux_build.sh (and should only be set to true
-# for master / release branches)
-BUILD_DEBUG_INFO=${BUILD_DEBUG_INFO:=0}
-
-if [[ $BUILD_DEBUG_INFO == "1" ]]; then
-    echo "Building wheel and debug info"
-else
-    echo "BUILD_DEBUG_INFO was not set, skipping debug info"
-fi
-
-if [[ "$DISABLE_RCCL" = 1 ]]; then
-    echo "Disabling NCCL/RCCL in pyTorch"
-    USE_RCCL=0
-    USE_NCCL=0
-    USE_KINETO=0
-else
-    USE_RCCL=1
-    USE_NCCL=1
-    USE_KINETO=1
-fi
-
-echo "Calling setup.py bdist at $(date)"
-
-if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
-    echo "Calling setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
-    time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
-    BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 \
-    BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
-    USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
-    python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
-    echo "Finished setup.py bdist_wheel for split build (BUILD_LIBTORCH_WHL)"
-    echo "Calling setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
-    time EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
-    BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 \
-    BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
-    USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
-    python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR --cmake
-    echo "Finished setup.py bdist_wheel for split build (BUILD_PYTHON_ONLY)"
-else
-    time CMAKE_ARGS=${CMAKE_ARGS[@]} \
-        EXTRA_CAFFE2_CMAKE_FLAGS=${EXTRA_CAFFE2_CMAKE_FLAGS[@]} \
-        BUILD_LIBTORCH_CPU_WITH_DEBUG=$BUILD_DEBUG_INFO \
-        USE_NCCL=${USE_NCCL} USE_RCCL=${USE_RCCL} USE_KINETO=${USE_KINETO} \
-        python setup.py bdist_wheel -d /tmp/$WHEELHOUSE_DIR
-fi
-echo "Finished setup.py bdist at $(date)"
-
-# Build libtorch packages
-if [[ -n "$BUILD_PYTHONLESS" ]]; then
-    # Now build pythonless libtorch
-    # Note - just use whichever python we happen to be on
-    python setup.py clean
-
-    if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
-        STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
-    fi
-
-    mkdir -p build
-    pushd build
-    echo "Calling tools/build_libtorch.py at $(date)"
-    time CMAKE_ARGS=${CMAKE_ARGS[@]} \
-         EXTRA_CAFFE2_CMAKE_FLAGS="${EXTRA_CAFFE2_CMAKE_FLAGS[@]} $STATIC_CMAKE_FLAG" \
-         python ../tools/build_libtorch.py
-    echo "Finished tools/build_libtorch.py at $(date)"
-    popd
-
-    mkdir -p libtorch/{lib,bin,include,share}
-    cp -r build/build/lib libtorch/
-
-    # for now, the headers for the libtorch package will just be copied in
-    # from one of the wheels (this is from when this script built multiple
-    # wheels at once)
-    ANY_WHEEL=$(ls /tmp/$WHEELHOUSE_DIR/torch*.whl | head -n1)
-    unzip -d any_wheel $ANY_WHEEL
-    if [[ -d any_wheel/torch/include ]]; then
-        cp -r any_wheel/torch/include libtorch/
-    else
-        cp -r any_wheel/torch/lib/include libtorch/
-    fi
-    cp -r any_wheel/torch/share/cmake libtorch/share/
-    rm -rf any_wheel
-
-    echo $PYTORCH_BUILD_VERSION > libtorch/build-version
-    echo "$(pushd $PYTORCH_ROOT && git rev-parse HEAD)" > libtorch/build-hash
-
-    mkdir -p /tmp/$LIBTORCH_HOUSE_DIR
-
-    if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
-        LIBTORCH_ABI="cxx11-abi-"
-    else
-        LIBTORCH_ABI=
-    fi
-
-    zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch
-    cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \
-       /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip
-fi
-
-popd
-
-#######################################################################
-# ADD DEPENDENCIES INTO THE WHEEL
-#
-# auditwheel repair doesn't work correctly and is buggy
-# so manually do the work of copying dependency libs and patchelfing
-# and fixing RECORDS entries correctly
-######################################################################
-
-fname_with_sha256() {
-    HASH=$(sha256sum $1 | cut -c1-8)
-    DIRNAME=$(dirname $1)
-    BASENAME=$(basename $1)
-    # Do not rename nvrtc-builtins.so as they are dynamically loaded
-    # by libnvrtc.so
-    # Similarly don't mangle libcudnn and libcublas library names
-    if [[ $BASENAME == "libnvrtc-builtins.s"* || $BASENAME == "libcudnn"* || $BASENAME == "libcublas"*  ]]; then
-        echo $1
-    else
-        INITNAME=$(echo $BASENAME | cut -f1 -d".")
-        ENDNAME=$(echo $BASENAME | cut -f 2- -d".")
-        echo "$DIRNAME/$INITNAME-$HASH.$ENDNAME"
-    fi
-}
-
-fname_without_so_number() {
-    LINKNAME=$(echo $1 | sed -e 's/\.so.*/.so/g')
-    echo "$LINKNAME"
-}
-
-make_wheel_record() {
-    FPATH=$1
-    if echo $FPATH | grep RECORD >/dev/null 2>&1; then
-        # if the RECORD file, then
-        echo "\"$FPATH\",,"
-    else
-        HASH=$(openssl dgst -sha256 -binary $FPATH | openssl base64 | sed -e 's/+/-/g' | sed -e 's/\//_/g' | sed -e 's/=//g')
-        FSIZE=$(ls -nl $FPATH | awk '{print $5}')
-        echo "\"$FPATH\",sha256=$HASH,$FSIZE"
-    fi
-}
-
-replace_needed_sofiles() {
-    find $1 -name '*.so*' | while read sofile; do
-        origname=$2
-        patchedname=$3
-        if [[ "$origname" != "$patchedname" ]] || [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
-            set +e
-            origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
-            ERRCODE=$?
-            set -e
-            if [ "$ERRCODE" -eq "0" ]; then
-                echo "patching $sofile entry $origname to $patchedname"
-                $PATCHELF_BIN --replace-needed $origname $patchedname $sofile
-            fi
-        fi
-    done
-}
-
-echo 'Built this wheel:'
-ls /tmp/$WHEELHOUSE_DIR
-mkdir -p "/$WHEELHOUSE_DIR"
-mv /tmp/$WHEELHOUSE_DIR/torch*linux*.whl /$WHEELHOUSE_DIR/
-
-if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
-    mv /tmp/$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/ || true
-fi
-
-if [[ -n "$BUILD_PYTHONLESS" ]]; then
-    mkdir -p /$LIBTORCH_HOUSE_DIR
-    mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR
-    rm -rf /tmp/$LIBTORCH_HOUSE_DIR
-fi
-rm -rf /tmp/$WHEELHOUSE_DIR
-rm -rf /tmp_dir
-mkdir /tmp_dir
-pushd /tmp_dir
-
-for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.whl /$LIBTORCH_HOUSE_DIR/libtorch*.zip; do
-
-    # if the glob didn't match anything
-    if [[ ! -e $pkg ]]; then
-        continue
-    fi
-
-    rm -rf tmp
-    mkdir -p tmp
-    cd tmp
-    cp $pkg .
-
-    unzip -q $(basename $pkg)
-    rm -f $(basename $pkg)
-
-    if [[ -d torch ]]; then
-        PREFIX=torch
-    else
-        PREFIX=libtorch
-    fi
-
-    if [[ $pkg != *"without-deps"* ]]; then
-        # copy over needed dependent .so files over and tag them with their hash
-        patched=()
-        for filepath in "${DEPS_LIST[@]}"; do
-            filename=$(basename $filepath)
-            destpath=$PREFIX/lib/$filename
-            if [[ "$filepath" != "$destpath" ]]; then
-                cp $filepath $destpath
-            fi
-
-            # ROCm workaround for roctracer dlopens
-            if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
-                patchedpath=$(fname_without_so_number $destpath)
-            # Keep the so number for XPU dependencies
-            elif [[ "$DESIRED_CUDA" == *"xpu"* ]]; then
-                patchedpath=$destpath
-            else
-                patchedpath=$(fname_with_sha256 $destpath)
-            fi
-            patchedname=$(basename $patchedpath)
-            if [[ "$destpath" != "$patchedpath" ]]; then
-                mv $destpath $patchedpath
-            fi
-            patched+=("$patchedname")
-            echo "Copied $filepath to $patchedpath"
-        done
-
-        echo "patching to fix the so names to the hashed names"
-        for ((i=0;i<${#DEPS_LIST[@]};++i)); do
-            replace_needed_sofiles $PREFIX ${DEPS_SONAME[i]} ${patched[i]}
-            # do the same for caffe2, if it exists
-            if [[ -d caffe2 ]]; then
-                replace_needed_sofiles caffe2 ${DEPS_SONAME[i]} ${patched[i]}
-            fi
-        done
-
-        # copy over needed auxiliary files
-        for ((i=0;i<${#DEPS_AUX_SRCLIST[@]};++i)); do
-            srcpath=${DEPS_AUX_SRCLIST[i]}
-            dstpath=$PREFIX/${DEPS_AUX_DSTLIST[i]}
-            mkdir -p $(dirname $dstpath)
-            cp $srcpath $dstpath
-        done
-    fi
-
-    # set RPATH of _C.so and similar to $ORIGIN, $ORIGIN/lib
-    find $PREFIX -maxdepth 1 -type f -name "*.so*" | while read sofile; do
-        echo "Setting rpath of $sofile to ${C_SO_RPATH:-'$ORIGIN:$ORIGIN/lib'}"
-        $PATCHELF_BIN --set-rpath ${C_SO_RPATH:-'$ORIGIN:$ORIGIN/lib'} ${FORCE_RPATH:-} $sofile
-        $PATCHELF_BIN --print-rpath $sofile
-    done
-
-    # set RPATH of lib/ files to $ORIGIN
-    find $PREFIX/lib -maxdepth 1 -type f -name "*.so*" | while read sofile; do
-        echo "Setting rpath of $sofile to ${LIB_SO_RPATH:-'$ORIGIN'}"
-        $PATCHELF_BIN --set-rpath ${LIB_SO_RPATH:-'$ORIGIN'} ${FORCE_RPATH:-} $sofile
-        $PATCHELF_BIN --print-rpath $sofile
-    done
-
-    # create Manylinux 2_28 tag this needs to happen before regenerate the RECORD
-    if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then
-        wheel_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/WHEEL/g')
-        sed -i -e s#linux_x86_64#"${PLATFORM}"# $wheel_file;
-    fi
-
-    # regenerate the RECORD file with new hashes
-    record_file=$(echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/RECORD/g')
-    if [[ -e $record_file ]]; then
-        echo "Generating new record file $record_file"
-        : > "$record_file"
-        # generate records for folders in wheel
-        find * -type f | while read fname; do
-            make_wheel_record "$fname" >>"$record_file"
-        done
-    fi
-
-    if [[ $BUILD_DEBUG_INFO == "1" ]]; then
-        pushd "$PREFIX/lib"
-
-        # Duplicate library into debug lib
-        cp libtorch_cpu.so libtorch_cpu.so.dbg
-
-        # Keep debug symbols on debug lib
-        strip --only-keep-debug libtorch_cpu.so.dbg
-
-        # Remove debug info from release lib
-        strip --strip-debug libtorch_cpu.so
-
-        objcopy libtorch_cpu.so --add-gnu-debuglink=libtorch_cpu.so.dbg
-
-        # Zip up debug info
-        mkdir -p /tmp/debug
-        mv libtorch_cpu.so.dbg /tmp/debug/libtorch_cpu.so.dbg
-        CRC32=$(objcopy --dump-section .gnu_debuglink=>(tail -c4 | od -t x4 -An | xargs echo) libtorch_cpu.so)
-
-        pushd /tmp
-        PKG_NAME=$(basename "$pkg" | sed 's/\.whl$//g')
-        zip /tmp/debug-whl-libtorch-"$PKG_NAME"-"$CRC32".zip /tmp/debug/libtorch_cpu.so.dbg
-        cp /tmp/debug-whl-libtorch-"$PKG_NAME"-"$CRC32".zip "$PYTORCH_FINAL_PACKAGE_DIR"
-        popd
-
-        popd
-    fi
-
-    # Rename wheel for Manylinux 2_28
-    if [[ $PLATFORM == "manylinux_2_28_x86_64" && $GPU_ARCH_TYPE != "cpu-s390x" && $GPU_ARCH_TYPE != "xpu" ]]; then
-        pkg_name=$(echo $(basename $pkg) | sed -e s#linux_x86_64#"${PLATFORM}"#)
-        zip -rq $pkg_name $PREIX*
-        rm -f $pkg
-        mv $pkg_name $(dirname $pkg)/$pkg_name
-    else
-        # zip up the wheel back
-        zip -rq $(basename $pkg) $PREIX*
-        # remove original wheel
-        rm -f $pkg
-        mv $(basename $pkg) $pkg
-    fi
-
-    cd ..
-    rm -rf tmp
-done
-
-# Copy wheels to host machine for persistence before testing
-if [[ -n "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
-    mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
-    if [[ -n "$BUILD_PYTHONLESS" ]]; then
-        cp /$LIBTORCH_HOUSE_DIR/libtorch*.zip "$PYTORCH_FINAL_PACKAGE_DIR"
-    else
-        cp /$WHEELHOUSE_DIR/torch*.whl "$PYTORCH_FINAL_PACKAGE_DIR"
-    fi
-fi
-
-# remove stuff before testing
-rm -rf /opt/rh
-if ls /usr/local/cuda* >/dev/null 2>&1; then
-    rm -rf /usr/local/cuda*
-fi
-
-
-# Test that all the wheels work
-if [[ -z "$BUILD_PYTHONLESS" ]]; then
-  export OMP_NUM_THREADS=4 # on NUMA machines this takes too long
-  pushd $PYTORCH_ROOT/test
-
-  # Install the wheel for this Python version
-  if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
-    pip uninstall -y "$TORCH_NO_PYTHON_PACKAGE_NAME" || true
-  fi
-
-  pip uninstall -y "$TORCH_PACKAGE_NAME"
-
-  if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
-    pip install "$TORCH_NO_PYTHON_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
-  fi
-
-  pip install "$TORCH_PACKAGE_NAME" --no-index -f /$WHEELHOUSE_DIR --no-dependencies -v
-
-  # Print info on the libraries installed in this wheel
-  # Rather than adjust find command to skip non-library files with an embedded *.so* in their name,
-  # since this is only for reporting purposes, we add the || true to the ldd command.
-  installed_libraries=($(find "$pydir/lib/python${py_majmin}/site-packages/torch/" -name '*.so*'))
-  echo "The wheel installed all of the libraries: ${installed_libraries[@]}"
-  for installed_lib in "${installed_libraries[@]}"; do
-      ldd "$installed_lib" || true
-  done
-
-  # Run the tests
-  echo "$(date) :: Running tests"
-  pushd "$PYTORCH_ROOT"
-
-
-  LD_LIBRARY_PATH=/usr/local/nvidia/lib64 \
-          "${PYTORCH_ROOT}/.ci/pytorch/run_tests.sh" manywheel "${py_majmin}" "$DESIRED_CUDA"
-  popd
-  echo "$(date) :: Finished tests"
-fi
--- a/.ci/manywheel/build_cpu.sh
+++ b/.ci/manywheel/build_cpu.sh
@ -1,60 +0,0 @@
-#!/usr/bin/env bash
-
-set -ex
-
-export TH_BINARY_BUILD=1
-export USE_CUDA=0
-
-# Keep an array of cmake variables to add to
-if [[ -z "$CMAKE_ARGS" ]]; then
-    # These are passed to tools/build_pytorch_libs.sh::build()
-    CMAKE_ARGS=()
-fi
-if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
-    # These are passed to tools/build_pytorch_libs.sh::build_caffe2()
-    EXTRA_CAFFE2_CMAKE_FLAGS=()
-fi
-
-WHEELHOUSE_DIR="wheelhousecpu"
-LIBTORCH_HOUSE_DIR="libtorch_housecpu"
-if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
-    if [[ -z "$BUILD_PYTHONLESS" ]]; then
-        PYTORCH_FINAL_PACKAGE_DIR="/remote/wheelhousecpu"
-    else
-        PYTORCH_FINAL_PACKAGE_DIR="/remote/libtorch_housecpu"
-    fi
-fi
-mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
-
-OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
-if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
-    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
-elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
-    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
-elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
-    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
-elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
-    if [[ "$(uname -m)" == "s390x" ]]; then
-        LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1"
-    else
-        LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
-    fi
-fi
-
-DEPS_LIST=(
-    "$LIBGOMP_PATH"
-)
-
-DEPS_SONAME=(
-    "libgomp.so.1"
-)
-
-rm -rf /usr/local/cuda*
-
-SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
-if [[ -z "$BUILD_PYTHONLESS" ]]; then
-    BUILD_SCRIPT=build_common.sh
-else
-    BUILD_SCRIPT=build_libtorch.sh
-fi
-source ${SOURCE_DIR}/${BUILD_SCRIPT}
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -1,280 +0,0 @@
-#!/usr/bin/env bash
-
-set -ex
-
-SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P ))"
-
-export TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
-export NCCL_ROOT_DIR=/usr/local/cuda
-export TH_BINARY_BUILD=1
-export USE_STATIC_CUDNN=1
-export USE_STATIC_NCCL=1
-export ATEN_STATIC_CUDA=1
-export USE_CUDA_STATIC_LINK=1
-export INSTALL_TEST=0 # dont install test binaries into site-packages
-export USE_CUPTI_SO=0
-export USE_CUSPARSELT=${USE_CUSPARSELT:-1} # Enable if not disabled by libtorch build
-
-# Keep an array of cmake variables to add to
-if [[ -z "$CMAKE_ARGS" ]]; then
-    # These are passed to tools/build_pytorch_libs.sh::build()
-    CMAKE_ARGS=()
-fi
-if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
-    # These are passed to tools/build_pytorch_libs.sh::build_caffe2()
-    EXTRA_CAFFE2_CMAKE_FLAGS=()
-fi
-
-# Determine CUDA version and architectures to build for
-#
-# NOTE: We should first check `DESIRED_CUDA` when determining `CUDA_VERSION`,
-# because in some cases a single Docker image can have multiple CUDA versions
-# on it, and `nvcc --version` might not show the CUDA version we want.
-if [[ -n "$DESIRED_CUDA" ]]; then
-    # If the DESIRED_CUDA already matches the format that we expect
-    if [[ ${DESIRED_CUDA} =~ ^[0-9]+\.[0-9]+$ ]]; then
-        CUDA_VERSION=${DESIRED_CUDA}
-    else
-        # cu90, cu92, cu100, cu101
-        if [[ ${#DESIRED_CUDA} -eq 4 ]]; then
-            CUDA_VERSION="${DESIRED_CUDA:2:1}.${DESIRED_CUDA:3:1}"
-        elif [[ ${#DESIRED_CUDA} -eq 5 ]]; then
-            CUDA_VERSION="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4:1}"
-        fi
-    fi
-    echo "Using CUDA $CUDA_VERSION as determined by DESIRED_CUDA"
-else
-    CUDA_VERSION=$(nvcc --version|grep release|cut -f5 -d" "|cut -f1 -d",")
-    echo "CUDA $CUDA_VERSION Detected"
-fi
-
-cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
-
-TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
-case ${CUDA_VERSION} in
-    12.6)
-        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0+PTX"
-        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
-        ;;
-    12.4)
-        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
-        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
-        ;;
-    11.8)
-        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};3.7;9.0"
-        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
-        ;;
-    *)
-        echo "unknown cuda version $CUDA_VERSION"
-        exit 1
-        ;;
-esac
-
-export TORCH_CUDA_ARCH_LIST=${TORCH_CUDA_ARCH_LIST}
-echo "${TORCH_CUDA_ARCH_LIST}"
-
-# Package directories
-WHEELHOUSE_DIR="wheelhouse$cuda_version_nodot"
-LIBTORCH_HOUSE_DIR="libtorch_house$cuda_version_nodot"
-if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
-    if [[ -z "$BUILD_PYTHONLESS" ]]; then
-        PYTORCH_FINAL_PACKAGE_DIR="/remote/wheelhouse$cuda_version_nodot"
-    else
-        PYTORCH_FINAL_PACKAGE_DIR="/remote/libtorch_house$cuda_version_nodot"
-    fi
-fi
-mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
-
-OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
-if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
-    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
-elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
-    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
-elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
-    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
-elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
-    LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
-fi
-
-DEPS_LIST=(
-    "$LIBGOMP_PATH"
-)
-DEPS_SONAME=(
-    "libgomp.so.1"
-)
-
-# CUDA 11.8 have to ship the libcusparseLt.so.0 with the binary
-# since nvidia-cusparselt-cu11 is not available in PYPI
-if [[ $USE_CUSPARSELT == "1" && $CUDA_VERSION == "11.8" ]]; then
-        DEPS_SONAME+=(
-            "libcusparseLt.so.0"
-        )
-        DEPS_LIST+=(
-            "/usr/local/cuda/lib64/libcusparseLt.so.0"
-        )
-fi
-
-if [[ $CUDA_VERSION == "12.4" || $CUDA_VERSION == "12.6" ]]; then
-    export USE_STATIC_CUDNN=0
-    # Try parallelizing nvcc as well
-    export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
-
-    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
-        echo "Bundling with cudnn and cublas."
-        DEPS_LIST+=(
-            "/usr/local/cuda/lib64/libcudnn_adv.so.9"
-            "/usr/local/cuda/lib64/libcudnn_cnn.so.9"
-            "/usr/local/cuda/lib64/libcudnn_graph.so.9"
-            "/usr/local/cuda/lib64/libcudnn_ops.so.9"
-            "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9"
-            "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9"
-            "/usr/local/cuda/lib64/libcudnn_heuristic.so.9"
-            "/usr/local/cuda/lib64/libcudnn.so.9"
-            "/usr/local/cuda/lib64/libcublas.so.12"
-            "/usr/local/cuda/lib64/libcublasLt.so.12"
-            "/usr/local/cuda/lib64/libcusparseLt.so.0"
-            "/usr/local/cuda/lib64/libcudart.so.12"
-            "/usr/local/cuda/lib64/libnvToolsExt.so.1"
-            "/usr/local/cuda/lib64/libnvrtc.so.12"
-            "/usr/local/cuda/lib64/libnvrtc-builtins.so"
-        )
-        DEPS_SONAME+=(
-            "libcudnn_adv.so.9"
-            "libcudnn_cnn.so.9"
-            "libcudnn_graph.so.9"
-            "libcudnn_ops.so.9"
-            "libcudnn_engines_runtime_compiled.so.9"
-            "libcudnn_engines_precompiled.so.9"
-            "libcudnn_heuristic.so.9"
-            "libcudnn.so.9"
-            "libcublas.so.12"
-            "libcublasLt.so.12"
-            "libcusparseLt.so.0"
-            "libcudart.so.12"
-            "libnvToolsExt.so.1"
-            "libnvrtc.so.12"
-            "libnvrtc-builtins.so"
-        )
-    else
-        echo "Using nvidia libs from pypi."
-        CUDA_RPATHS=(
-            '$ORIGIN/../../nvidia/cublas/lib'
-            '$ORIGIN/../../nvidia/cuda_cupti/lib'
-            '$ORIGIN/../../nvidia/cuda_nvrtc/lib'
-            '$ORIGIN/../../nvidia/cuda_runtime/lib'
-            '$ORIGIN/../../nvidia/cudnn/lib'
-            '$ORIGIN/../../nvidia/cufft/lib'
-            '$ORIGIN/../../nvidia/curand/lib'
-            '$ORIGIN/../../nvidia/cusolver/lib'
-            '$ORIGIN/../../nvidia/cusparse/lib'
-            '$ORIGIN/../../cusparselt/lib'
-            '$ORIGIN/../../nvidia/nccl/lib'
-            '$ORIGIN/../../nvidia/nvtx/lib'
-        )
-        CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
-        export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
-        export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
-        export FORCE_RPATH="--force-rpath"
-        export USE_STATIC_NCCL=0
-        export USE_SYSTEM_NCCL=1
-        export ATEN_STATIC_CUDA=0
-        export USE_CUDA_STATIC_LINK=0
-        export USE_CUPTI_SO=1
-        export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
-        export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
-    fi
-elif [[ $CUDA_VERSION == "11.8" ]]; then
-    export USE_STATIC_CUDNN=0
-    # Try parallelizing nvcc as well
-    export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
-    # Bundle ptxas into the wheel, see https://github.com/pytorch/pytorch/pull/119750
-    export BUILD_BUNDLE_PTXAS=1
-
-    if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
-        echo "Bundling with cudnn and cublas."
-        DEPS_LIST+=(
-            "/usr/local/cuda/lib64/libcudnn_adv.so.9"
-            "/usr/local/cuda/lib64/libcudnn_cnn.so.9"
-            "/usr/local/cuda/lib64/libcudnn_graph.so.9"
-            "/usr/local/cuda/lib64/libcudnn_ops.so.9"
-            "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9"
-            "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9"
-            "/usr/local/cuda/lib64/libcudnn_heuristic.so.9"
-            "/usr/local/cuda/lib64/libcudnn.so.9"
-            "/usr/local/cuda/lib64/libcublas.so.11"
-            "/usr/local/cuda/lib64/libcublasLt.so.11"
-            "/usr/local/cuda/lib64/libcudart.so.11.0"
-            "/usr/local/cuda/lib64/libnvToolsExt.so.1"
-            "/usr/local/cuda/lib64/libnvrtc.so.11.2"    # this is not a mistake, it links to more specific cuda version
-            "/usr/local/cuda/lib64/libnvrtc-builtins.so.11.8"
-        )
-        DEPS_SONAME+=(
-            "libcudnn_adv.so.9"
-            "libcudnn_cnn.so.9"
-            "libcudnn_graph.so.9"
-            "libcudnn_ops.so.9"
-            "libcudnn_engines_runtime_compiled.so.9"
-            "libcudnn_engines_precompiled.so.9"
-            "libcudnn_heuristic.so.9"
-            "libcudnn.so.9"
-            "libcublas.so.11"
-            "libcublasLt.so.11"
-            "libcudart.so.11.0"
-            "libnvToolsExt.so.1"
-            "libnvrtc.so.11.2"
-            "libnvrtc-builtins.so.11.8"
-        )
-    else
-        echo "Using nvidia libs from pypi."
-        CUDA_RPATHS=(
-            '$ORIGIN/../../nvidia/cublas/lib'
-            '$ORIGIN/../../nvidia/cuda_cupti/lib'
-            '$ORIGIN/../../nvidia/cuda_nvrtc/lib'
-            '$ORIGIN/../../nvidia/cuda_runtime/lib'
-            '$ORIGIN/../../nvidia/cudnn/lib'
-            '$ORIGIN/../../nvidia/cufft/lib'
-            '$ORIGIN/../../nvidia/curand/lib'
-            '$ORIGIN/../../nvidia/cusolver/lib'
-            '$ORIGIN/../../nvidia/cusparse/lib'
-            '$ORIGIN/../../nvidia/nccl/lib'
-            '$ORIGIN/../../nvidia/nvtx/lib'
-        )
-        CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
-        export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
-        export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
-        export FORCE_RPATH="--force-rpath"
-        export USE_STATIC_NCCL=0
-        export USE_SYSTEM_NCCL=1
-        export ATEN_STATIC_CUDA=0
-        export USE_CUDA_STATIC_LINK=0
-        export USE_CUPTI_SO=1
-        export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
-        export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
-    fi
-else
-    echo "Unknown cuda version $CUDA_VERSION"
-    exit 1
-fi
-
-# run_tests.sh requires DESIRED_CUDA to know what tests to exclude
-export DESIRED_CUDA="$cuda_version_nodot"
-
-# Switch `/usr/local/cuda` to the desired CUDA version
-rm -rf /usr/local/cuda || true
-ln -s "/usr/local/cuda-${CUDA_VERSION}" /usr/local/cuda
-
-# Switch `/usr/local/magma` to the desired CUDA version
-rm -rf /usr/local/magma || true
-ln -s /usr/local/cuda-${CUDA_VERSION}/magma /usr/local/magma
-
-export CUDA_VERSION=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev) # 10.0.130
-export CUDA_VERSION_SHORT=$(ls /usr/local/cuda/lib64/libcudart.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev | cut -f1,2 -d".") # 10.0
-export CUDNN_VERSION=$(ls /usr/local/cuda/lib64/libcudnn.so.*|sort|tac | head -1 | rev | cut -d"." -f -3 | rev)
-
-SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
-if [[ -z "$BUILD_PYTHONLESS" ]]; then
-    BUILD_SCRIPT=build_common.sh
-else
-    BUILD_SCRIPT=build_libtorch.sh
-fi
-source $SCRIPTPATH/${BUILD_SCRIPT}
--- a/.ci/manywheel/build_libtorch.sh
+++ b/.ci/manywheel/build_libtorch.sh
@ -1,353 +0,0 @@
-#!/usr/bin/env bash
-# meant to be called only from the neighboring build.sh and build_cpu.sh scripts
-
-set -e pipefail
-SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
-
-# Require only one python installation
-if [[ -z "$DESIRED_PYTHON" ]]; then
-    echo "Need to set DESIRED_PYTHON env variable"
-    exit 1
-fi
-if [[ -n "$BUILD_PYTHONLESS" && -z "$LIBTORCH_VARIANT" ]]; then
-    echo "BUILD_PYTHONLESS is set, so need LIBTORCH_VARIANT to also be set"
-    echo "LIBTORCH_VARIANT should be one of shared-with-deps shared-without-deps static-with-deps static-without-deps"
-    exit 1
-fi
-
-# Function to retry functions that sometimes timeout or have flaky failures
-retry () {
-    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
-}
-
-# TODO move this into the Docker images
-OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
-if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
-    retry yum install -q -y zip openssl
-elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
-    retry yum install -q -y zip openssl
-elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
-    retry dnf install -q -y zip openssl
-elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
-    # TODO: Remove this once nvidia package repos are back online
-    # Comment out nvidia repositories to prevent them from getting apt-get updated, see https://github.com/pytorch/pytorch/issues/74968
-    # shellcheck disable=SC2046
-    sed -i 's/.*nvidia.*/# &/' $(find /etc/apt/ -type f -name "*.list")
-    retry apt-get update
-    retry apt-get -y install zip openssl
-fi
-
-# Version: setup.py uses $PYTORCH_BUILD_VERSION.post$PYTORCH_BUILD_NUMBER if
-# PYTORCH_BUILD_NUMBER > 1
-build_version="$PYTORCH_BUILD_VERSION"
-build_number="$PYTORCH_BUILD_NUMBER"
-if [[ -n "$OVERRIDE_PACKAGE_VERSION" ]]; then
-    # This will be the *exact* version, since build_number<1
-    build_version="$OVERRIDE_PACKAGE_VERSION"
-    build_number=0
-fi
-if [[ -z "$build_version" ]]; then
-    build_version=1.0.0
-fi
-if [[ -z "$build_number" ]]; then
-    build_number=1
-fi
-export PYTORCH_BUILD_VERSION=$build_version
-export PYTORCH_BUILD_NUMBER=$build_number
-
-export CMAKE_LIBRARY_PATH="/opt/intel/lib:/lib:$CMAKE_LIBRARY_PATH"
-export CMAKE_INCLUDE_PATH="/opt/intel/include:$CMAKE_INCLUDE_PATH"
-
-# set OPENSSL_ROOT_DIR=/opt/openssl if it exists
-if [[ -e /opt/openssl ]]; then
-    export OPENSSL_ROOT_DIR=/opt/openssl
-    export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH
-fi
-
-# If given a python version like 3.6m or 2.7mu, convert this to the format we
-# expect. The binary CI jobs pass in python versions like this; they also only
-# ever pass one python version, so we assume that DESIRED_PYTHON is not a list
-# in this case
-if [[ -n "$DESIRED_PYTHON" && "$DESIRED_PYTHON" != cp* ]]; then
-    python_nodot="$(echo $DESIRED_PYTHON | tr -d m.u)"
-    DESIRED_PYTHON="cp${python_nodot}-cp${python_nodot}"
-fi
-pydir="/opt/python/$DESIRED_PYTHON"
-export PATH="$pydir/bin:$PATH"
-
-export PATCHELF_BIN=/usr/local/bin/patchelf
-patchelf_version=`$PATCHELF_BIN --version`
-echo "patchelf version: " $patchelf_version
-if [[ "$patchelf_version" == "patchelf 0.9" ]]; then
-    echo "Your patchelf version is too old. Please use version >= 0.10."
-    exit 1
-fi
-
-########################################################
-# Compile wheels as well as libtorch
-#######################################################
-if [[ -z "$PYTORCH_ROOT" ]]; then
-    echo "Need to set PYTORCH_ROOT env variable"
-    exit 1
-fi
-pushd "$PYTORCH_ROOT"
-python setup.py clean
-retry pip install -qr requirements.txt
-retry pip install -q numpy==2.0.1
-
-if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
-    export _GLIBCXX_USE_CXX11_ABI=1
-else
-    export _GLIBCXX_USE_CXX11_ABI=0
-fi
-
-if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
-    echo "Calling build_amd.py at $(date)"
-    python tools/amd_build/build_amd.py
-    # TODO remove this work-around once pytorch sources are updated
-    export ROCclr_DIR=/opt/rocm/rocclr/lib/cmake/rocclr
-fi
-
-echo "Calling setup.py install at $(date)"
-
-if [[ $LIBTORCH_VARIANT = *"static"* ]]; then
-    STATIC_CMAKE_FLAG="-DTORCH_STATIC=1"
-fi
-
-(
-    set -x
-
-    mkdir -p build
-
-    time CMAKE_ARGS=${CMAKE_ARGS[@]} \
-        EXTRA_CAFFE2_CMAKE_FLAGS="${EXTRA_CAFFE2_CMAKE_FLAGS[@]} $STATIC_CMAKE_FLAG" \
-        # TODO: Remove this flag once https://github.com/pytorch/pytorch/issues/55952 is closed
-        CFLAGS='-Wno-deprecated-declarations' \
-        BUILD_LIBTORCH_CPU_WITH_DEBUG=1 \
-        python setup.py install
-
-    mkdir -p libtorch/{lib,bin,include,share}
-
-    # Make debug folder separate so it doesn't get zipped up with the rest of
-    # libtorch
-    mkdir debug
-
-    # Copy over all lib files
-    cp -rv build/lib/*                libtorch/lib/
-    cp -rv build/lib*/torch/lib/*     libtorch/lib/
-
-    # Copy over all include files
-    cp -rv build/include/*            libtorch/include/
-    cp -rv build/lib*/torch/include/* libtorch/include/
-
-    # Copy over all of the cmake files
-    cp -rv build/lib*/torch/share/*   libtorch/share/
-
-    # Split libtorch into debug / release version
-    cp libtorch/lib/libtorch_cpu.so libtorch/lib/libtorch_cpu.so.dbg
-
-    # Keep debug symbols on debug lib
-    strip --only-keep-debug libtorch/lib/libtorch_cpu.so.dbg
-
-    # Remove debug info from release lib
-    strip --strip-debug libtorch/lib/libtorch_cpu.so
-
-    # Add a debug link to the release lib to the debug lib (debuggers will then
-    # search for symbols in a file called libtorch_cpu.so.dbg in some
-    # predetermined locations) and embed a CRC32 of the debug library into the .so
-    cd libtorch/lib
-
-    objcopy libtorch_cpu.so --add-gnu-debuglink=libtorch_cpu.so.dbg
-    cd ../..
-
-    # Move the debug symbols to its own directory so it doesn't get processed /
-    # zipped with all the other libraries
-    mv libtorch/lib/libtorch_cpu.so.dbg debug/libtorch_cpu.so.dbg
-
-    echo "${PYTORCH_BUILD_VERSION}" > libtorch/build-version
-    echo "$(pushd $PYTORCH_ROOT && git rev-parse HEAD)" > libtorch/build-hash
-
-)
-
-if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
-    LIBTORCH_ABI="cxx11-abi-"
-else
-    LIBTORCH_ABI=
-fi
-
-(
-    set -x
-
-    mkdir -p /tmp/$LIBTORCH_HOUSE_DIR
-
-    # objcopy installs a CRC32 into libtorch_cpu above so, so add that to the name here
-    CRC32=$(objcopy --dump-section .gnu_debuglink=>(tail -c4 | od -t x4 -An | xargs echo) libtorch/lib/libtorch_cpu.so)
-
-    # Zip debug symbols
-    zip /tmp/$LIBTORCH_HOUSE_DIR/debug-libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION-$CRC32.zip debug/libtorch_cpu.so.dbg
-
-    # Zip and copy libtorch
-    zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch
-    cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \
-       /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip
-)
-
-
-popd
-
-#######################################################################
-# ADD DEPENDENCIES INTO THE WHEEL
-#
-# auditwheel repair doesn't work correctly and is buggy
-# so manually do the work of copying dependency libs and patchelfing
-# and fixing RECORDS entries correctly
-######################################################################
-
-fname_with_sha256() {
-    HASH=$(sha256sum $1 | cut -c1-8)
-    DIRNAME=$(dirname $1)
-    BASENAME=$(basename $1)
-    if [[ $BASENAME == "libnvrtc-builtins.so" || $BASENAME == "libcudnn"* ]]; then
-        echo $1
-    else
-        INITNAME=$(echo $BASENAME | cut -f1 -d".")
-        ENDNAME=$(echo $BASENAME | cut -f 2- -d".")
-        echo "$DIRNAME/$INITNAME-$HASH.$ENDNAME"
-    fi
-}
-
-fname_without_so_number() {
-    LINKNAME=$(echo $1 | sed -e 's/\.so.*/.so/g')
-    echo "$LINKNAME"
-}
-
-make_wheel_record() {
-    FPATH=$1
-    if echo $FPATH | grep RECORD >/dev/null 2>&1; then
-        # if the RECORD file, then
-        echo "\"$FPATH\",,"
-    else
-        HASH=$(openssl dgst -sha256 -binary $FPATH | openssl base64 | sed -e 's/+/-/g' | sed -e 's/\//_/g' | sed -e 's/=//g')
-        FSIZE=$(ls -nl $FPATH | awk '{print $5}')
-        echo "\"$FPATH\",sha256=$HASH,$FSIZE"
-    fi
-}
-
-echo 'Built this package:'
-(
-    set -x
-    mkdir -p /$LIBTORCH_HOUSE_DIR
-    mv /tmp/$LIBTORCH_HOUSE_DIR/*.zip /$LIBTORCH_HOUSE_DIR
-    rm -rf /tmp/$LIBTORCH_HOUSE_DIR
-)
-TMP_DIR=$(mktemp -d)
-trap "rm -rf ${TMP_DIR}" EXIT
-pushd "${TMP_DIR}"
-
-for pkg in /$LIBTORCH_HOUSE_DIR/libtorch*.zip; do
-
-    # if the glob didn't match anything
-    if [[ ! -e $pkg ]]; then
-        continue
-    fi
-
-    rm -rf tmp
-    mkdir -p tmp
-    cd tmp
-    cp $pkg .
-
-    unzip -q $(basename $pkg)
-    rm -f $(basename $pkg)
-
-    PREFIX=libtorch
-
-    if [[ $pkg != *"without-deps"* ]]; then
-        # copy over needed dependent .so files over and tag them with their hash
-        patched=()
-        for filepath in "${DEPS_LIST[@]}"; do
-            filename=$(basename $filepath)
-            destpath=$PREFIX/lib/$filename
-            if [[ "$filepath" != "$destpath" ]]; then
-                cp $filepath $destpath
-            fi
-
-            if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
-                patchedpath=$(fname_without_so_number $destpath)
-            else
-                patchedpath=$(fname_with_sha256 $destpath)
-            fi
-            patchedname=$(basename $patchedpath)
-            if [[ "$destpath" != "$patchedpath" ]]; then
-                mv $destpath $patchedpath
-            fi
-            patched+=("$patchedname")
-            echo "Copied $filepath to $patchedpath"
-        done
-
-        echo "patching to fix the so names to the hashed names"
-        for ((i=0;i<${#DEPS_LIST[@]};++i)); do
-            find $PREFIX -name '*.so*' | while read sofile; do
-                origname=${DEPS_SONAME[i]}
-                patchedname=${patched[i]}
-                if [[ "$origname" != "$patchedname" ]] || [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
-                    set +e
-                    origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
-                    ERRCODE=$?
-                    set -e
-                    if [ "$ERRCODE" -eq "0" ]; then
-                        echo "patching $sofile entry $origname to $patchedname"
-                        $PATCHELF_BIN --replace-needed $origname $patchedname $sofile
-                    fi
-                fi
-            done
-        done
-
-        # copy over needed auxiliary files
-        for ((i=0;i<${#DEPS_AUX_SRCLIST[@]};++i)); do
-            srcpath=${DEPS_AUX_SRCLIST[i]}
-            dstpath=$PREFIX/${DEPS_AUX_DSTLIST[i]}
-            mkdir -p $(dirname $dstpath)
-            cp $srcpath $dstpath
-        done
-    fi
-
-    # set RPATH of _C.so and similar to $ORIGIN, $ORIGIN/lib
-    find $PREFIX -maxdepth 1 -type f -name "*.so*" | while read sofile; do
-        echo "Setting rpath of $sofile to " '$ORIGIN:$ORIGIN/lib'
-        $PATCHELF_BIN --set-rpath '$ORIGIN:$ORIGIN/lib' $sofile
-        $PATCHELF_BIN --print-rpath $sofile
-    done
-
-    # set RPATH of lib/ files to $ORIGIN
-    find $PREFIX/lib -maxdepth 1 -type f -name "*.so*" | while read sofile; do
-        echo "Setting rpath of $sofile to " '$ORIGIN'
-        $PATCHELF_BIN --set-rpath '$ORIGIN' $sofile
-        $PATCHELF_BIN --print-rpath $sofile
-    done
-
-    # regenerate the RECORD file with new hashes
-    record_file=`echo $(basename $pkg) | sed -e 's/-cp.*$/.dist-info\/RECORD/g'`
-    if [[ -e $record_file ]]; then
-        echo "Generating new record file $record_file"
-        rm -f $record_file
-        # generate records for folders in wheel
-        find * -type f | while read fname; do
-            echo $(make_wheel_record $fname) >>$record_file
-        done
-    fi
-
-    # zip up the wheel back
-    zip -rq $(basename $pkg) $PREFIX*
-
-    # replace original wheel
-    rm -f $pkg
-    mv $(basename $pkg) $pkg
-    cd ..
-    rm -rf tmp
-done
-
-# Copy wheels to host machine for persistence before testing
-if [[ -n "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
-    cp /$LIBTORCH_HOUSE_DIR/libtorch*.zip "$PYTORCH_FINAL_PACKAGE_DIR"
-    cp /$LIBTORCH_HOUSE_DIR/debug-libtorch*.zip "$PYTORCH_FINAL_PACKAGE_DIR"
-fi
--- a/.ci/manywheel/build_rocm.sh
+++ b/.ci/manywheel/build_rocm.sh
@ -1,268 +0,0 @@
-#!/usr/bin/env bash
-
-set -ex
-
-export ROCM_HOME=/opt/rocm
-export MAGMA_HOME=$ROCM_HOME/magma
-# TODO: libtorch_cpu.so is broken when building with Debug info
-export BUILD_DEBUG_INFO=0
-
-# TODO Are these all used/needed?
-export TH_BINARY_BUILD=1
-export USE_STATIC_CUDNN=1
-export USE_STATIC_NCCL=1
-export ATEN_STATIC_CUDA=1
-export USE_CUDA_STATIC_LINK=1
-export INSTALL_TEST=0 # dont install test binaries into site-packages
-# Set RPATH instead of RUNPATH when using patchelf to avoid LD_LIBRARY_PATH override
-export FORCE_RPATH="--force-rpath"
-
-# Keep an array of cmake variables to add to
-if [[ -z "$CMAKE_ARGS" ]]; then
-    # These are passed to tools/build_pytorch_libs.sh::build()
-    CMAKE_ARGS=()
-fi
-if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
-    # These are passed to tools/build_pytorch_libs.sh::build_caffe2()
-    EXTRA_CAFFE2_CMAKE_FLAGS=()
-fi
-
-# Determine ROCm version and architectures to build for
-#
-# NOTE: We should first check `DESIRED_CUDA` when determining `ROCM_VERSION`
-if [[ -n "$DESIRED_CUDA" ]]; then
-    if ! echo "${DESIRED_CUDA}"| grep "^rocm" >/dev/null 2>/dev/null; then
-        export DESIRED_CUDA="rocm${DESIRED_CUDA}"
-    fi
-    # rocm3.7, rocm3.5.1
-    ROCM_VERSION="$DESIRED_CUDA"
-    echo "Using $ROCM_VERSION as determined by DESIRED_CUDA"
-else
-    echo "Must set DESIRED_CUDA"
-    exit 1
-fi
-
-# Package directories
-WHEELHOUSE_DIR="wheelhouse$ROCM_VERSION"
-LIBTORCH_HOUSE_DIR="libtorch_house$ROCM_VERSION"
-if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
-    if [[ -z "$BUILD_PYTHONLESS" ]]; then
-        PYTORCH_FINAL_PACKAGE_DIR="/remote/wheelhouse$ROCM_VERSION"
-    else
-        PYTORCH_FINAL_PACKAGE_DIR="/remote/libtorch_house$ROCM_VERSION"
-    fi
-fi
-mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
-
-# To make version comparison easier, create an integer representation.
-ROCM_VERSION_CLEAN=$(echo ${ROCM_VERSION} | sed s/rocm//)
-save_IFS="$IFS"
-IFS=. ROCM_VERSION_ARRAY=(${ROCM_VERSION_CLEAN})
-IFS="$save_IFS"
-if [[ ${#ROCM_VERSION_ARRAY[@]} == 2 ]]; then
-    ROCM_VERSION_MAJOR=${ROCM_VERSION_ARRAY[0]}
-    ROCM_VERSION_MINOR=${ROCM_VERSION_ARRAY[1]}
-    ROCM_VERSION_PATCH=0
-elif [[ ${#ROCM_VERSION_ARRAY[@]} == 3 ]]; then
-    ROCM_VERSION_MAJOR=${ROCM_VERSION_ARRAY[0]}
-    ROCM_VERSION_MINOR=${ROCM_VERSION_ARRAY[1]}
-    ROCM_VERSION_PATCH=${ROCM_VERSION_ARRAY[2]}
-else
-    echo "Unhandled ROCM_VERSION ${ROCM_VERSION}"
-    exit 1
-fi
-ROCM_INT=$(($ROCM_VERSION_MAJOR * 10000 + $ROCM_VERSION_MINOR * 100 + $ROCM_VERSION_PATCH))
-
-# Required ROCm libraries
-ROCM_SO_FILES=(
-    "libMIOpen.so"
-    "libamdhip64.so"
-    "libhipblas.so"
-    "libhipfft.so"
-    "libhiprand.so"
-    "libhipsolver.so"
-    "libhipsparse.so"
-    "libhsa-runtime64.so"
-    "libamd_comgr.so"
-    "libmagma.so"
-    "librccl.so"
-    "librocblas.so"
-    "librocfft.so"
-    "librocm_smi64.so"
-    "librocrand.so"
-    "librocsolver.so"
-    "librocsparse.so"
-    "libroctracer64.so"
-    "libroctx64.so"
-    "libhipblaslt.so"
-    "libhiprtc.so"
-)
-
-if [[ $ROCM_INT -ge 60100 ]]; then
-    ROCM_SO_FILES+=("librocprofiler-register.so")
-fi
-
-if [[ $ROCM_INT -ge 60200 ]]; then
-    ROCM_SO_FILES+=("librocm-core.so")
-fi
-
-OS_NAME=`awk -F= '/^NAME/{print $2}' /etc/os-release`
-if [[ "$OS_NAME" == *"CentOS Linux"* || "$OS_NAME" == *"AlmaLinux"* ]]; then
-    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
-    LIBNUMA_PATH="/usr/lib64/libnuma.so.1"
-    LIBELF_PATH="/usr/lib64/libelf.so.1"
-    if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
-        LIBTINFO_PATH="/usr/lib64/libtinfo.so.5"
-    else
-        LIBTINFO_PATH="/usr/lib64/libtinfo.so.6"
-    fi
-    LIBDRM_PATH="/opt/amdgpu/lib64/libdrm.so.2"
-    LIBDRM_AMDGPU_PATH="/opt/amdgpu/lib64/libdrm_amdgpu.so.1"
-    if [[ $ROCM_INT -ge 60100 && $ROCM_INT -lt 60300 ]]; then
-        # Below libs are direct dependencies of libhipsolver
-        LIBSUITESPARSE_CONFIG_PATH="/lib64/libsuitesparseconfig.so.4"
-        if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
-            LIBCHOLMOD_PATH="/lib64/libcholmod.so.2"
-            # Below libs are direct dependencies of libsatlas
-            LIBGFORTRAN_PATH="/lib64/libgfortran.so.3"
-        else
-            LIBCHOLMOD_PATH="/lib64/libcholmod.so.3"
-            # Below libs are direct dependencies of libsatlas
-            LIBGFORTRAN_PATH="/lib64/libgfortran.so.5"
-        fi
-        # Below libs are direct dependencies of libcholmod
-        LIBAMD_PATH="/lib64/libamd.so.2"
-        LIBCAMD_PATH="/lib64/libcamd.so.2"
-        LIBCCOLAMD_PATH="/lib64/libccolamd.so.2"
-        LIBCOLAMD_PATH="/lib64/libcolamd.so.2"
-        LIBSATLAS_PATH="/lib64/atlas/libsatlas.so.3"
-        # Below libs are direct dependencies of libsatlas
-        LIBQUADMATH_PATH="/lib64/libquadmath.so.0"
-    fi
-    MAYBE_LIB64=lib64
-elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
-    LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
-    LIBNUMA_PATH="/usr/lib/x86_64-linux-gnu/libnuma.so.1"
-    LIBELF_PATH="/usr/lib/x86_64-linux-gnu/libelf.so.1"
-    if [[ $ROCM_INT -ge 50300 ]]; then
-        LIBTINFO_PATH="/lib/x86_64-linux-gnu/libtinfo.so.6"
-    else
-        LIBTINFO_PATH="/lib/x86_64-linux-gnu/libtinfo.so.5"
-    fi
-    LIBDRM_PATH="/usr/lib/x86_64-linux-gnu/libdrm.so.2"
-    LIBDRM_AMDGPU_PATH="/usr/lib/x86_64-linux-gnu/libdrm_amdgpu.so.1"
-    if [[ $ROCM_INT -ge 60100 && $ROCM_INT -lt 60300 ]]; then
-        # Below libs are direct dependencies of libhipsolver
-        LIBCHOLMOD_PATH="/lib/x86_64-linux-gnu/libcholmod.so.3"
-        # Below libs are direct dependencies of libcholmod
-        LIBSUITESPARSE_CONFIG_PATH="/lib/x86_64-linux-gnu/libsuitesparseconfig.so.5"
-        LIBAMD_PATH="/lib/x86_64-linux-gnu/libamd.so.2"
-        LIBCAMD_PATH="/lib/x86_64-linux-gnu/libcamd.so.2"
-        LIBCCOLAMD_PATH="/lib/x86_64-linux-gnu/libccolamd.so.2"
-        LIBCOLAMD_PATH="/lib/x86_64-linux-gnu/libcolamd.so.2"
-        LIBMETIS_PATH="/lib/x86_64-linux-gnu/libmetis.so.5"
-        LIBLAPACK_PATH="/lib/x86_64-linux-gnu/liblapack.so.3"
-        LIBBLAS_PATH="/lib/x86_64-linux-gnu/libblas.so.3"
-        # Below libs are direct dependencies of libblas
-        LIBGFORTRAN_PATH="/lib/x86_64-linux-gnu/libgfortran.so.5"
-        LIBQUADMATH_PATH="/lib/x86_64-linux-gnu/libquadmath.so.0"
-    fi
-    MAYBE_LIB64=lib
-fi
-OS_SO_PATHS=($LIBGOMP_PATH $LIBNUMA_PATH\
-             $LIBELF_PATH $LIBTINFO_PATH\
-             $LIBDRM_PATH $LIBDRM_AMDGPU_PATH\
-             $LIBSUITESPARSE_CONFIG_PATH\
-             $LIBCHOLMOD_PATH $LIBAMD_PATH\
-             $LIBCAMD_PATH $LIBCCOLAMD_PATH\
-             $LIBCOLAMD_PATH $LIBSATLAS_PATH\
-             $LIBGFORTRAN_PATH $LIBQUADMATH_PATH\
-             $LIBMETIS_PATH $LIBLAPACK_PATH\
-             $LIBBLAS_PATH)
-OS_SO_FILES=()
-for lib in "${OS_SO_PATHS[@]}"
-do
-    file_name="${lib##*/}" # Substring removal of path to get filename
-    OS_SO_FILES[${#OS_SO_FILES[@]}]=$file_name # Append lib to array
-done
-
-# rocBLAS library files
-ROCBLAS_LIB_SRC=$ROCM_HOME/lib/rocblas/library
-ROCBLAS_LIB_DST=lib/rocblas/library
-ARCH=$(echo $PYTORCH_ROCM_ARCH | sed 's/;/|/g') # Replace ; seperated arch list to bar for grep
-ARCH_SPECIFIC_FILES=$(ls $ROCBLAS_LIB_SRC | grep -E $ARCH)
-OTHER_FILES=$(ls $ROCBLAS_LIB_SRC | grep -v gfx)
-ROCBLAS_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES)
-
-# hipblaslt library files
-HIPBLASLT_LIB_SRC=$ROCM_HOME/lib/hipblaslt/library
-HIPBLASLT_LIB_DST=lib/hipblaslt/library
-ARCH_SPECIFIC_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -E $ARCH)
-OTHER_FILES=$(ls $HIPBLASLT_LIB_SRC | grep -v gfx)
-HIPBLASLT_LIB_FILES=($ARCH_SPECIFIC_FILES $OTHER_FILES)
-
-# ROCm library files
-ROCM_SO_PATHS=()
-for lib in "${ROCM_SO_FILES[@]}"
-do
-    file_path=($(find $ROCM_HOME/lib/ -name "$lib")) # First search in lib
-    if [[ -z $file_path ]]; then
-        if [ -d "$ROCM_HOME/lib64/" ]; then
-            file_path=($(find $ROCM_HOME/lib64/ -name "$lib")) # Then search in lib64
-        fi
-    fi
-    if [[ -z $file_path ]]; then
-        file_path=($(find $ROCM_HOME/ -name "$lib")) # Then search in ROCM_HOME
-    fi
-    if [[ -z $file_path ]]; then
-        echo "Error: Library file $lib is not found." >&2
-        exit 1
-    fi
-    ROCM_SO_PATHS[${#ROCM_SO_PATHS[@]}]="$file_path" # Append lib to array
-done
-
-DEPS_LIST=(
-    ${ROCM_SO_PATHS[*]}
-    ${OS_SO_PATHS[*]}
-)
-
-DEPS_SONAME=(
-    ${ROCM_SO_FILES[*]}
-    ${OS_SO_FILES[*]}
-)
-
-DEPS_AUX_SRCLIST=(
-    "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_SRC/}"
-    "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_SRC/}"
-    "/opt/amdgpu/share/libdrm/amdgpu.ids"
-)
-
-DEPS_AUX_DSTLIST=(
-    "${ROCBLAS_LIB_FILES[@]/#/$ROCBLAS_LIB_DST/}"
-    "${HIPBLASLT_LIB_FILES[@]/#/$HIPBLASLT_LIB_DST/}"
-    "share/libdrm/amdgpu.ids"
-)
-
-# MIOpen library files
-MIOPEN_SHARE_SRC=$ROCM_HOME/share/miopen/db
-MIOPEN_SHARE_DST=share/miopen/db
-MIOPEN_SHARE_FILES=($(ls $MIOPEN_SHARE_SRC | grep -E $ARCH))
-DEPS_AUX_SRCLIST+=(${MIOPEN_SHARE_FILES[@]/#/$MIOPEN_SHARE_SRC/})
-DEPS_AUX_DSTLIST+=(${MIOPEN_SHARE_FILES[@]/#/$MIOPEN_SHARE_DST/})
-
-# RCCL library files
-RCCL_SHARE_SRC=$ROCM_HOME/share/rccl/msccl-algorithms
-RCCL_SHARE_DST=share/rccl/msccl-algorithms
-RCCL_SHARE_FILES=($(ls $RCCL_SHARE_SRC))
-DEPS_AUX_SRCLIST+=(${RCCL_SHARE_FILES[@]/#/$RCCL_SHARE_SRC/})
-DEPS_AUX_DSTLIST+=(${RCCL_SHARE_FILES[@]/#/$RCCL_SHARE_DST/})
-
-echo "PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH}"
-
-SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
-if [[ -z "$BUILD_PYTHONLESS" ]]; then
-    BUILD_SCRIPT=build_common.sh
-else
-    BUILD_SCRIPT=build_libtorch.sh
-fi
-source $SCRIPTPATH/${BUILD_SCRIPT}
--- a/.ci/manywheel/build_xpu.sh
+++ b/.ci/manywheel/build_xpu.sh
@ -1,108 +0,0 @@
-#!/usr/bin/env bash
-
-set -ex
-
-export TH_BINARY_BUILD=1
-export USE_CUDA=0
-
-# Keep an array of cmake variables to add to
-if [[ -z "$CMAKE_ARGS" ]]; then
-    # These are passed to tools/build_pytorch_libs.sh::build()
-    CMAKE_ARGS=()
-fi
-if [[ -z "$EXTRA_CAFFE2_CMAKE_FLAGS" ]]; then
-    # These are passed to tools/build_pytorch_libs.sh::build_caffe2()
-    EXTRA_CAFFE2_CMAKE_FLAGS=()
-fi
-
-
-# Refer https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
-source /opt/intel/oneapi/compiler/latest/env/vars.sh
-source /opt/intel/oneapi/pti/latest/env/vars.sh
-source /opt/intel/oneapi/umf/latest/env/vars.sh
-export USE_STATIC_MKL=1
-
-WHEELHOUSE_DIR="wheelhousexpu"
-LIBTORCH_HOUSE_DIR="libtorch_housexpu"
-if [[ -z "$PYTORCH_FINAL_PACKAGE_DIR" ]]; then
-    if [[ -z "$BUILD_PYTHONLESS" ]]; then
-        PYTORCH_FINAL_PACKAGE_DIR="/remote/wheelhousexpu"
-    else
-        PYTORCH_FINAL_PACKAGE_DIR="/remote/libtorch_housexpu"
-    fi
-fi
-mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" || true
-
-OS_NAME=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
-if [[ "$OS_NAME" == *"CentOS Linux"* ]]; then
-    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
-elif [[ "$OS_NAME" == *"Red Hat Enterprise Linux"* ]]; then
-    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
-elif [[ "$OS_NAME" == *"AlmaLinux"* ]]; then
-    LIBGOMP_PATH="/usr/lib64/libgomp.so.1"
-elif [[ "$OS_NAME" == *"Ubuntu"* ]]; then
-    if [[ "$(uname -m)" == "s390x" ]]; then
-        LIBGOMP_PATH="/usr/lib/s390x-linux-gnu/libgomp.so.1"
-    else
-        LIBGOMP_PATH="/usr/lib/x86_64-linux-gnu/libgomp.so.1"
-    fi
-fi
-
-DEPS_LIST=(
-    "$LIBGOMP_PATH"
-    "/opt/intel/oneapi/compiler/latest/lib/libOpenCL.so.1"
-)
-
-DEPS_SONAME=(
-    "libgomp.so.1"
-    "libOpenCL.so.1"
-)
-
-if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
-    echo "Bundling with xpu support package libs."
-    DEPS_LIST+=(
-        "/opt/intel/oneapi/compiler/latest/lib/libsycl.so.8"
-        "/opt/intel/oneapi/compiler/latest/lib/libur_loader.so.0"
-        "/opt/intel/oneapi/compiler/latest/lib/libur_adapter_level_zero.so.0"
-        "/opt/intel/oneapi/compiler/latest/lib/libur_adapter_opencl.so.0"
-        "/opt/intel/oneapi/compiler/latest/lib/libsvml.so"
-        "/opt/intel/oneapi/compiler/latest/lib/libirng.so"
-        "/opt/intel/oneapi/compiler/latest/lib/libimf.so"
-        "/opt/intel/oneapi/compiler/latest/lib/libintlc.so.5"
-        "/opt/intel/oneapi/pti/latest/lib/libpti_view.so.0.10"
-        "/opt/intel/oneapi/umf/latest/lib/libumf.so.0"
-        "/opt/intel/oneapi/tcm/latest/lib/libhwloc.so.15"
-    )
-    DEPS_SONAME+=(
-        "libsycl.so.8"
-        "libur_loader.so.0"
-        "libur_adapter_level_zero.so.0"
-        "libur_adapter_opencl.so.0"
-        "libsvml.so"
-        "libirng.so"
-        "libimf.so"
-        "libintlc.so.5"
-        "libpti_view.so.0.10"
-        "libumf.so.0"
-        "libhwloc.so.15"
-    )
-else
-    echo "Using xpu runtime libs from pypi."
-    XPU_RPATHS=(
-        '$ORIGIN/../../../..'
-    )
-    XPU_RPATHS=$(IFS=: ; echo "${XPU_RPATHS[*]}")
-    export C_SO_RPATH=$XPU_RPATHS':$ORIGIN:$ORIGIN/lib'
-    export LIB_SO_RPATH=$XPU_RPATHS':$ORIGIN'
-    export FORCE_RPATH="--force-rpath"
-fi
-
-rm -rf /usr/local/cuda*
-
-SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
-if [[ -z "$BUILD_PYTHONLESS" ]]; then
-    BUILD_SCRIPT=build_common.sh
-else
-    BUILD_SCRIPT=build_libtorch.sh
-fi
-source ${SOURCE_DIR}/${BUILD_SCRIPT}
--- a/.ci/manywheel/set_desired_python.sh
+++ b/.ci/manywheel/set_desired_python.sh
@ -1,30 +0,0 @@
-#!/usr/bin/env bash
-
-# Require only one python installation
-if [[ -z "$DESIRED_PYTHON" ]]; then
-    echo "Need to set DESIRED_PYTHON env variable"
-    exit 1
-fi
-
-# If given a python version like 3.6m or 2.7mu, convert this to the format we
-# expect. The binary CI jobs pass in python versions like this; they also only
-# ever pass one python version, so we assume that DESIRED_PYTHON is not a list
-# in this case
-if [[ -n "$DESIRED_PYTHON" && $DESIRED_PYTHON =~ ([0-9].[0-9]+)t ]]; then
-    python_digits="$(echo $DESIRED_PYTHON | tr -cd [:digit:])"
-    py_majmin="${DESIRED_PYTHON}"
-    DESIRED_PYTHON="cp${python_digits}-cp${python_digits}t"
-elif [[ -n "$DESIRED_PYTHON" && "$DESIRED_PYTHON" != cp* ]]; then
-    python_nodot="$(echo $DESIRED_PYTHON | tr -d m.u)"
-    DESIRED_PYTHON="cp${python_nodot}-cp${python_nodot}"
-    if [[ ${python_nodot} -ge 310 ]]; then
-        py_majmin="${DESIRED_PYTHON:2:1}.${DESIRED_PYTHON:3:2}"
-    else
-        py_majmin="${DESIRED_PYTHON:2:1}.${DESIRED_PYTHON:3:1}"
-    fi
-fi
-
-pydir="/opt/python/$DESIRED_PYTHON"
-export DESIRED_PYTHON_BIN_DIR="${pydir}/bin"
-export PATH="$DESIRED_PYTHON_BIN_DIR:$PATH"
-echo "Will build for Python version: ${DESIRED_PYTHON}"
--- a/.ci/manywheel/test_wheel.sh
+++ b/.ci/manywheel/test_wheel.sh
@ -1,26 +0,0 @@
-#!/usr/bin/env bash
-set -e
-
-yum install -y wget git
-
-rm -rf /usr/local/cuda*
-
-# Install Anaconda
-if ! ls /py
-then
-    echo "Miniconda needs to be installed"
-    wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh
-    bash ~/miniconda.sh -b -p /py
-else
-    echo "Miniconda is already installed"
-fi
-
-export PATH="/py/bin:$PATH"
-
-# Anaconda token
-if ls /remote/token
-then
-   source /remote/token
-fi
-
-conda install -y conda-build anaconda-client
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -1,6 +1,6 @@
 #!/bin/bash

-set -ex -o pipefail
+set -ex

 # Required environment variable: $BUILD_ENVIRONMENT
 # (This is set by default in the Docker images we build, so you don't
@ -87,7 +87,7 @@ else

  # Workaround required for MKL library linkage
  # https://github.com/pytorch/pytorch/issues/119557
-  if [[ "$ANACONDA_PYTHON_VERSION" = "3.12" || "$ANACONDA_PYTHON_VERSION" = "3.13" ]]; then
+  if [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then
    export CMAKE_LIBRARY_PATH="/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/lib/"
    export CMAKE_INCLUDE_PATH="/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/include/"
  fi
@ -178,7 +178,7 @@ fi
 # sccache will fail for CUDA builds if all cores are used for compiling
 # gcc 7 with sccache seems to have intermittent OOM issue if all cores are used
 if [ -z "$MAX_JOBS" ]; then
-  if { [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; } && which sccache > /dev/null; then
+  if { [[ "$BUILD_ENVIRONMENT" == *cuda* ]] || [[ "$BUILD_ENVIRONMENT" == *gcc7* ]]; } && which sccache > /dev/null; then
    export MAX_JOBS=$(($(nproc) - 1))
  fi
 fi
@ -191,7 +191,7 @@ fi

 # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
 # memory to build and will OOM
-if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
+if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ "$TORCH_CUDA_ARCH_LIST" == *"8.6"* || "$TORCH_CUDA_ARCH_LIST" == *"8.0"* ]]; then
  echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
  echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
  export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
@ -203,12 +203,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
 fi

 if [[ "$BUILD_ENVIRONMENT" == *-clang*-asan* ]]; then
-  if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
-    export USE_CUDA=1
-  fi
+  export LDSHARED="clang --shared"
+  export USE_CUDA=0
  export USE_ASAN=1
-  export REL_WITH_DEB_INFO=1
-  export UBSAN_FLAGS="-fno-sanitize-recover=all"
+  export UBSAN_FLAGS="-fno-sanitize-recover=all;-fno-sanitize=float-divide-by-zero;-fno-sanitize=float-cast-overflow"
  unset USE_LLVM
 fi

@ -220,6 +218,10 @@ if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then
    export USE_PRECOMPILED_HEADERS=1
 fi

+if [[ "${BUILD_ENVIRONMENT}" == *linux-focal-py3.7-gcc7-build*  ]]; then
+  export USE_GLOO_WITH_OPENSSL=ON
+fi
+
 if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
  export BUILD_STATIC_RUNTIME_BENCHMARK=ON
 fi
@ -228,9 +230,9 @@ if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
  export CMAKE_BUILD_TYPE=RelWithAssert
 fi

-# Do not change workspace permissions for ROCm and s390x CI jobs
+# Do not change workspace permissions for ROCm CI jobs
 # as it can leave workspace with bad permissions for cancelled jobs
-if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
+if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* ]]; then
  # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
  WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
  cleanup_workspace() {
@ -247,9 +249,10 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
 fi

 if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
-  set -e -o pipefail
+  set -e

  get_bazel
+  install_sccache_nvcc_for_bazel

  # Leave 1 CPU free and use only up to 80% of memory to reduce the change of crashing
  # the runner
@ -275,16 +278,18 @@ else
    # set only when building other architectures
    # or building non-XLA tests.
    if [[ "$BUILD_ENVIRONMENT" != *rocm*  &&
+          "$BUILD_ENVIRONMENT" != *s390x*   &&
          "$BUILD_ENVIRONMENT" != *xla* ]]; then
      if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
        # Install numpy-2.0.2 for builds which are backward compatible with 1.X
-        python -mpip install numpy==2.0.2
+        python -mpip install --pre numpy==2.0.2
      fi

      WERROR=1 python setup.py clean

      if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
-        python3 tools/packaging/split_wheel.py bdist_wheel
+        BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel
+        BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 python setup.py bdist_wheel --cmake
      else
        WERROR=1 python setup.py bdist_wheel
      fi
@ -395,7 +400,9 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];
  # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
  python tools/stats/export_test_times.py
 fi
-# don't do this for bazel or s390x as they don't use sccache
-if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
+
+# snadampal: skipping it till sccache support added for aarch64
+# https://github.com/pytorch/pytorch/issues/121559
+if [[ "$BUILD_ENVIRONMENT" != *aarch64* &&  "$BUILD_ENVIRONMENT" != *s390x* ]]; then
  print_sccache_stats
 fi
--- a/.ci/pytorch/check_binary.sh
+++ b/.ci/pytorch/check_binary.sh
@ -1,394 +0,0 @@
-#!/bin/bash
-
-# shellcheck disable=SC2086,SC2006,SC2207,SC2076,SC2155,SC2046,SC1091,SC2143
-# TODO: Re-enable shellchecks above
-
-set -eux -o pipefail
-
-# This script checks the following things on binaries
-# 1. The gcc abi matches DESIRED_DEVTOOLSET
-# 2. MacOS binaries do not link against OpenBLAS
-# 3. There are no protobuf symbols of any sort anywhere (turned off, because
-#    this is currently not true)
-# 4. Standard Python imports work
-# 5. MKL is available everywhere except for MacOS wheels
-# 6. XNNPACK is available everywhere except for MacOS wheels
-# 7. CUDA is setup correctly and does not hang
-# 8. Magma is available for CUDA builds
-# 9. CuDNN is available for CUDA builds
-#
-# This script needs the env variables DESIRED_PYTHON, DESIRED_CUDA,
-# DESIRED_DEVTOOLSET and PACKAGE_TYPE
-#
-# This script expects PyTorch to be installed into the active Python (the
-# Python returned by `which python`). Or, if this is testing a libtorch
-# Pythonless binary, then it expects to be in the root folder of the unzipped
-# libtorch package.
-
-
-if [[ -z ${DESIRED_PYTHON:-} ]]; then
-  export DESIRED_PYTHON=${MATRIX_PYTHON_VERSION:-}
-fi
-if [[ -z ${DESIRED_CUDA:-} ]]; then
-  export DESIRED_CUDA=${MATRIX_DESIRED_CUDA:-}
-fi
-if [[ -z ${DESIRED_DEVTOOLSET:-} ]]; then
-  export DESIRED_DEVTOOLSET=${MATRIX_DESIRED_DEVTOOLSET:-}
-fi
-if [[ -z ${PACKAGE_TYPE:-} ]]; then
-  export PACKAGE_TYPE=${MATRIX_PACKAGE_TYPE:-}
-fi
-
-# The install root depends on both the package type and the os
-# All MacOS packages use conda, even for the wheel packages.
-if [[ "$PACKAGE_TYPE" == libtorch ]]; then
-  # NOTE: Only $PWD works on both CentOS and Ubuntu
-  export install_root="$PWD"
-else
-
-  if [[ $DESIRED_PYTHON =~ ([0-9].[0-9]+)t ]]; then
-    # For python that is maj.mint keep original version
-    py_dot="$DESIRED_PYTHON"
-  elif [[ $DESIRED_PYTHON =~ ([0-9].[0-9]+) ]];  then
-    # Strip everything but major.minor from DESIRED_PYTHON version
-    py_dot="${BASH_REMATCH[0]}"
-  else
-    echo "Unexpected ${DESIRED_PYTHON} format"
-    exit 1
-  fi
-  export install_root="$(dirname $(which python))/../lib/python${py_dot}/site-packages/torch/"
-fi
-
-###############################################################################
-# Setup XPU ENV
-###############################################################################
-if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
-  set +u
-  # Refer https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
-  source /opt/intel/oneapi/compiler/latest/env/vars.sh
-  source /opt/intel/oneapi/pti/latest/env/vars.sh
-fi
-
-###############################################################################
-# Check GCC ABI
-###############################################################################
-
-# NOTE [ Building libtorch with old vs. new gcc ABI ]
-#
-# Packages built with one version of ABI could not be linked against by client
-# C++ libraries that were compiled using the other version of ABI. Since both
-# gcc ABIs are still common in the wild, we need to support both ABIs. Currently:
-#
-# - All the nightlies built on CentOS 7 + devtoolset7 use the old gcc ABI.
-# - All the nightlies built on Ubuntu 16.04 + gcc 5.4 use the new gcc ABI.
-
-echo "Checking that the gcc ABI is what we expect"
-if [[ "$(uname)" != 'Darwin' ]]; then
-  function is_expected() {
-    if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* || "$DESIRED_CUDA" == *"rocm"* ]]; then
-      if [[ "$1" -gt 0 || "$1" == "ON " ]]; then
-        echo 1
-      fi
-    else
-      if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then
-        echo 1
-      fi
-    fi
-  }
-
-  # First we check that the env var in TorchConfig.cmake is correct
-
-  # We search for D_GLIBCXX_USE_CXX11_ABI=1 in torch/TorchConfig.cmake
-  torch_config="${install_root}/share/cmake/Torch/TorchConfig.cmake"
-  if [[ ! -f "$torch_config" ]]; then
-    echo "No TorchConfig.cmake found!"
-    ls -lah "$install_root/share/cmake/Torch"
-    exit 1
-  fi
-  echo "Checking the TorchConfig.cmake"
-  cat "$torch_config"
-
-  # The sed call below is
-  #   don't print lines by default (only print the line we want)
-  # -n
-  #   execute the following expression
-  # e
-  #   replace lines that match with the first capture group and print
-  # s/.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/\1/p
-  #   any characters, D_GLIBCXX_USE_CXX11_ABI=, exactly one any character, a
-  #   quote, any characters
-  #   Note the exactly one single character after the '='. In the case that the
-  #     variable is not set the '=' will be followed by a '"' immediately and the
-  #     line will fail the match and nothing will be printed; this is what we
-  #     want.  Otherwise it will capture the 0 or 1 after the '='.
-  # /.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/
-  #   replace the matched line with the capture group and print
-  # /\1/p
-  actual_gcc_abi="$(sed -ne 's/.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/\1/p' < "$torch_config")"
-  if [[ "$(is_expected "$actual_gcc_abi")" != 1 ]]; then
-    echo "gcc ABI $actual_gcc_abi not as expected."
-    exit 1
-  fi
-
-  # We also check that there are [not] cxx11 symbols in libtorch
-  #
-  echo "Checking that symbols in libtorch.so have the right gcc abi"
-  python3 "$(dirname ${BASH_SOURCE[0]})/smoke_test/check_binary_symbols.py"
-
-  echo "cxx11 symbols seem to be in order"
-fi # if on Darwin
-
-###############################################################################
-# Check for no OpenBLAS
-# TODO Check for no Protobuf symbols (not finished)
-# Print *all* runtime dependencies
-###############################################################################
-# We have to loop through all shared libraries for this
-if [[ "$(uname)" == 'Darwin' ]]; then
-  all_dylibs=($(find "$install_root" -name '*.dylib'))
-  for dylib in "${all_dylibs[@]}"; do
-    echo "All dependencies of $dylib are $(otool -L $dylib) with rpath $(otool -l $dylib | grep LC_RPATH -A2)"
-
-    # Check that OpenBlas is not linked to on Macs
-    echo "Checking the OpenBLAS is not linked to"
-    if [[ -n "$(otool -L $dylib | grep -i openblas)" ]]; then
-      echo "ERROR: Found openblas as a dependency of $dylib"
-      echo "Full dependencies is: $(otool -L $dylib)"
-      exit 1
-    fi
-
-    # Check for protobuf symbols
-    #proto_symbols="$(nm $dylib | grep protobuf)" || true
-    #if [[ -n "$proto_symbols" ]]; then
-    #  echo "ERROR: Detected protobuf symbols in $dylib"
-    #  echo "Symbols are $proto_symbols"
-    #  exit 1
-    #fi
-  done
-else
-  all_libs=($(find "$install_root" -name '*.so'))
-  for lib in "${all_libs[@]}"; do
-    echo "All dependencies of $lib are $(ldd $lib) with runpath $(objdump -p $lib | grep RUNPATH)"
-
-    # Check for protobuf symbols
-    #proto_symbols=$(nm $lib | grep protobuf) || true
-    #if [[ -n "$proto_symbols" ]]; then
-    #  echo "ERROR: Detected protobuf symbols in $lib"
-    #  echo "Symbols are $proto_symbols"
-    #  exit 1
-    #fi
-  done
-fi
-
-setup_link_flags () {
-  REF_LIB="-Wl,-R${install_root}/lib"
-  if [[ "$(uname)" == 'Darwin' ]]; then
-    REF_LIB="-Wl,-rpath ${install_root}/lib"
-  fi
-  ADDITIONAL_LINKER_FLAGS=""
-  if [[ "$(uname)" == 'Linux' ]]; then
-    ADDITIONAL_LINKER_FLAGS="-Wl,--no-as-needed"
-  fi
-  C10_LINK_FLAGS=""
-  if [ -f "${install_root}/lib/libc10.so" ] || [ -f "${install_root}/lib/libc10.dylib" ]; then
-    C10_LINK_FLAGS="-lc10"
-  fi
-  TORCH_CPU_LINK_FLAGS=""
-  if [ -f "${install_root}/lib/libtorch_cpu.so" ] || [ -f "${install_root}/lib/libtorch_cpu.dylib" ]; then
-    TORCH_CPU_LINK_FLAGS="-ltorch_cpu"
-  fi
-  TORCH_CUDA_LINK_FLAGS=""
-  if [ -f "${install_root}/lib/libtorch_cuda.so" ] || [ -f "${install_root}/lib/libtorch_cuda.dylib" ]; then
-    TORCH_CUDA_LINK_FLAGS="-ltorch_cuda"
-  elif [ -f "${install_root}/lib/libtorch_cuda_cpp.so" ] && [ -f "${install_root}/lib/libtorch_cuda_cpp.so" ] || \
-    [ -f "${install_root}/lib/libtorch_cuda_cu.dylib" ] && [ -f "${install_root}/lib/libtorch_cuda_cu.dylib" ]; then
-    TORCH_CUDA_LINK_FLAGS="-ltorch_cuda_cpp -ltorch_cuda_cu"
-  fi
-}
-
-TEST_CODE_DIR="$(dirname $(realpath ${BASH_SOURCE[0]}))/test_example_code"
-build_and_run_example_cpp () {
-  if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
-    GLIBCXX_USE_CXX11_ABI=1
-  else
-    GLIBCXX_USE_CXX11_ABI=0
-  fi
-  setup_link_flags
-  g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1
-  ./$1
-}
-
-build_example_cpp_with_incorrect_abi () {
-  if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
-    GLIBCXX_USE_CXX11_ABI=0
-  else
-    GLIBCXX_USE_CXX11_ABI=1
-  fi
-  set +e
-  setup_link_flags
-  g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1
-  ERRCODE=$?
-  set -e
-  if [ "$ERRCODE" -eq "0" ]; then
-    echo "Building example with incorrect ABI didn't throw error. Aborting."
-    exit 1
-  else
-    echo "Building example with incorrect ABI throws expected error. Proceeding."
-  fi
-}
-
-###############################################################################
-# Check simple Python/C++ calls
-###############################################################################
-if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
-  # NS: Set LD_LIBRARY_PATH for CUDA builds, but perhaps it should be removed
-  if [[ "$DESIRED_CUDA" == "cu"* ]]; then
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64
-  fi
-  build_and_run_example_cpp simple-torch-test
-  # `_GLIBCXX_USE_CXX11_ABI` is always ignored by gcc in devtoolset7, so we test
-  # the expected failure case for Ubuntu 16.04 + gcc 5.4 only.
-  if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
-    build_example_cpp_with_incorrect_abi simple-torch-test
-  fi
-else
-  pushd /tmp
-  python -c 'import torch'
-  popd
-fi
-
-###############################################################################
-# Check torch.git_version
-###############################################################################
-if [[ "$PACKAGE_TYPE" != 'libtorch' ]]; then
-  pushd /tmp
-  python -c 'import torch; assert torch.version.git_version != "Unknown"'
-  python -c 'import torch; assert torch.version.git_version != None'
-  popd
-fi
-
-
-###############################################################################
-# Check for MKL
-###############################################################################
-
-if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
-  echo "Checking that MKL is available"
-  build_and_run_example_cpp check-torch-mkl
-elif [[ "$(uname -m)" != "arm64" && "$(uname -m)" != "s390x" ]]; then
-  if [[ "$(uname)" != 'Darwin' || "$PACKAGE_TYPE" != *wheel ]]; then
-    if [[ "$(uname -m)" == "aarch64" ]]; then
-      echo "Checking that MKLDNN is available on aarch64"
-      pushd /tmp
-      python -c 'import torch; exit(0 if torch.backends.mkldnn.is_available() else 1)'
-      popd
-    else
-      echo "Checking that MKL is available"
-      pushd /tmp
-      python -c 'import torch; exit(0 if torch.backends.mkl.is_available() else 1)'
-      popd
-    fi
-  fi
-fi
-
-###############################################################################
-# Check for XNNPACK
-###############################################################################
-
-if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
-  echo "Checking that XNNPACK is available"
-  build_and_run_example_cpp check-torch-xnnpack
-else
-  if [[ "$(uname)" != 'Darwin' || "$PACKAGE_TYPE" != *wheel ]] && [[ "$(uname -m)" != "s390x"  ]]; then
-    echo "Checking that XNNPACK is available"
-    pushd /tmp
-    python -c 'import torch.backends.xnnpack; exit(0 if torch.backends.xnnpack.enabled else 1)'
-    popd
-  fi
-fi
-
-###############################################################################
-# Check CUDA configured correctly
-###############################################################################
-# Skip these for Windows machines without GPUs
-if [[ "$OSTYPE" == "msys" ]]; then
-    GPUS=$(wmic path win32_VideoController get name)
-    if [[ ! "$GPUS" == *NVIDIA* ]]; then
-        echo "Skip CUDA tests for machines without a Nvidia GPU card"
-        exit 0
-    fi
-fi
-
-# Test that CUDA builds are setup correctly
-if [[ "$DESIRED_CUDA" != 'cpu' && "$DESIRED_CUDA" != 'xpu' && "$DESIRED_CUDA" != 'cpu-cxx11-abi' && "$DESIRED_CUDA" != *"rocm"* && "$(uname -m)" != "s390x" ]]; then
-  if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
-    build_and_run_example_cpp check-torch-cuda
-  else
-    pushd /tmp
-    echo "Checking that CUDA archs are setup correctly"
-    timeout 20 python -c 'import torch; torch.randn([3,5]).cuda()'
-
-    # These have to run after CUDA is initialized
-
-    echo "Checking that magma is available"
-    python -c 'import torch; torch.rand(1).cuda(); exit(0 if torch.cuda.has_magma else 1)'
-
-    echo "Checking that CuDNN is available"
-    python -c 'import torch; exit(0 if torch.backends.cudnn.is_available() else 1)'
-
-    # Validates builds is free of linker regressions reported in https://github.com/pytorch/pytorch/issues/57744
-    echo "Checking that exception handling works"
-    python -c "import torch; from unittest import TestCase;TestCase().assertRaises(RuntimeError, lambda:torch.eye(7, 7, device='cuda:7'))"
-
-    echo "Checking that basic RNN works"
-    python ${TEST_CODE_DIR}/rnn_smoke.py
-
-    echo "Checking that basic CNN works"
-    python "${TEST_CODE_DIR}/cnn_smoke.py"
-
-    echo "Test that linalg works"
-    python -c "import torch;x=torch.rand(3,3,device='cuda');print(torch.linalg.svd(torch.mm(x.t(), x)))"
-
-    popd
-  fi # if libtorch
-fi # if cuda
-
-##########################
-# Run parts of smoke tests
-##########################
-if [[ "$PACKAGE_TYPE" != 'libtorch' ]]; then
-  pushd "$(dirname ${BASH_SOURCE[0]})/smoke_test"
-  python -c "from smoke_test import test_linalg; test_linalg()"
-  if [[ "$DESIRED_CUDA" == *cuda* ]]; then
-    python -c "from smoke_test import test_linalg; test_linalg('cuda')"
-  fi
-  popd
-fi
-
-###############################################################################
-# Check PyTorch supports TCP_TLS gloo transport
-###############################################################################
-
-if [[ "$(uname)" == 'Linux' && "$PACKAGE_TYPE" != 'libtorch' ]]; then
-  GLOO_CHECK="import torch.distributed as dist
-try:
-    dist.init_process_group('gloo', rank=0, world_size=1)
-except RuntimeError as e:
-    print(e)
-"
-  RESULT=`GLOO_DEVICE_TRANSPORT=TCP_TLS MASTER_ADDR=localhost MASTER_PORT=63945 python -c "$GLOO_CHECK"`
-  GLOO_TRANSPORT_IS_NOT_SUPPORTED='gloo transport is not supported'
-  if [[ "$RESULT" =~ "$GLOO_TRANSPORT_IS_NOT_SUPPORTED" ]]; then
-    echo "PyTorch doesn't support TLS_TCP transport, please build with USE_GLOO_WITH_OPENSSL=1"
-    exit 1
-  fi
-fi
-
-###############################################################################
-# Check for C++ ABI compatibility between gcc7 and gcc9 compiled binaries
-###############################################################################
-if [[ "$(uname)" == 'Linux' && ("$PACKAGE_TYPE" == 'conda' || "$PACKAGE_TYPE" == 'manywheel')]]; then
-  pushd /tmp
-  python -c "import torch; exit(0 if torch.compiled_with_cxx11_abi() else (0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi1011' else 1))"
-  popd
-fi
--- a/.ci/pytorch/common-build.sh
+++ b/.ci/pytorch/common-build.sh
@ -6,12 +6,6 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then
    # Save the absolute path in case later we chdir (as occurs in the gpu perf test)
    script_dir="$( cd "$(dirname "${BASH_SOURCE[0]}")" || exit ; pwd -P )"

-    if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then
-        # This is really weird, but newer sccache somehow produces broken binary
-        # see https://github.com/pytorch/pytorch/issues/139188
-        sudo mv /opt/cache/bin/sccache-0.2.14a /opt/cache/bin/sccache
-    fi
-
    if which sccache > /dev/null; then
        # Save sccache logs to file
        sccache --stop-server > /dev/null  2>&1 || true
--- a/.ci/pytorch/common.sh
+++ b/.ci/pytorch/common.sh
@ -3,7 +3,7 @@
 # Common setup for all Jenkins scripts
 # shellcheck source=./common_utils.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
-set -ex -o pipefail
+set -ex

 # Required environment variables:
 #   $BUILD_ENVIRONMENT (should be set by your Docker image)
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -81,15 +81,14 @@ function pip_install_whl() {

 function pip_install() {
  # retry 3 times
-  pip_install_pkg="python3 -m pip install --progress-bar off"
-  ${pip_install_pkg} "$@" || \
-    ${pip_install_pkg} "$@" || \
-    ${pip_install_pkg} "$@"
+  # old versions of pip don't have the "--progress-bar" flag
+  pip install --progress-bar off "$@" || pip install --progress-bar off "$@" || pip install --progress-bar off "$@" ||\
+  pip install "$@" || pip install "$@" || pip install "$@"
 }

 function pip_uninstall() {
  # uninstall 2 times
-  pip3 uninstall -y "$@" || pip3 uninstall -y "$@"
+  pip uninstall -y "$@" || pip uninstall -y "$@"
 }

 function get_exit_code() {
@ -105,12 +104,32 @@ function get_bazel() {
  # version of Bazelisk to fetch the platform specific version of
  # Bazel to use from .bazelversion.
  retry curl --location --output tools/bazel \
-    https://raw.githubusercontent.com/bazelbuild/bazelisk/v1.23.0/bazelisk.py
+    https://raw.githubusercontent.com/bazelbuild/bazelisk/v1.16.0/bazelisk.py
  shasum --algorithm=1 --check \
-    <(echo '01df9cf7f08dd80d83979ed0d0666a99349ae93c  tools/bazel')
+    <(echo 'd4369c3d293814d3188019c9f7527a948972d9f8  tools/bazel')
  chmod u+x tools/bazel
 }

+# Bazel-specific installer for an sccache-aware nvcc wrapper. It exists
+# because of a Bazel bug that requires some special path handling as a
+# workaround; see https://github.com/bazelbuild/bazel/issues/10167
+#
+# Moves the real compiler to /usr/local/cuda/bin/nvcc-real and writes a small
+# /bin/sh wrapper in its place. The wrapper re-invokes itself through sccache
+# unless its parent process is already named "sccache", in which case it
+# executes nvcc-real through a relative path (presumably resolved from
+# Bazel's working directory -- confirm against the linked issue).
+# Takes no arguments; requires sudo.
+function install_sccache_nvcc_for_bazel() {
+  sudo mv /usr/local/cuda/bin/nvcc /usr/local/cuda/bin/nvcc-real
+
+  # Write the `/usr/local/cuda/bin/nvcc` wrapper. The heredoc delimiter is
+  # unquoted, so every `$` that must reach the wrapper is escaped as `\$`.
+  # The command substitution is double-quoted so that an empty `ps` result is
+  # compared as an empty string instead of turning the `[` test into a
+  # syntax error.
+  cat << EOF | sudo tee /usr/local/cuda/bin/nvcc
+#!/bin/sh
+if [ "\$(env -u LD_PRELOAD ps -p \$PPID -o comm=)" != sccache ]; then
+  exec sccache /usr/local/cuda/bin/nvcc "\$@"
+else
+  exec external/local_cuda/cuda/bin/nvcc-real "\$@"
+fi
+EOF
+
+  sudo chmod +x /usr/local/cuda/bin/nvcc
+}
+
 function install_monkeytype {
  # Install MonkeyType
  pip_install MonkeyType
@ -160,7 +179,7 @@ function install_torchvision() {
 }

 function install_tlparse() {
-  pip_install --user "tlparse==0.3.30"
+  pip_install --user "tlparse==0.3.25"
  PATH="$(python -m site --user-base)/bin:$PATH"
 }

@ -221,12 +240,6 @@ function checkout_install_torchbench() {
  popd
 }

-function install_torchao() {
-  local commit
-  commit=$(get_pinned_commit torchao)
-  pip_install --no-use-pep517 --user "git+https://github.com/pytorch/ao.git@${commit}"
-}
-
 function print_sccache_stats() {
  echo 'PyTorch Build Statistics'
  sccache --show-stats
--- a/.ci/pytorch/cpp_doc_push_script.sh
+++ b/.ci/pytorch/cpp_doc_push_script.sh
@ -40,7 +40,7 @@ echo "Building PyTorch C++ API docs..."
 rm -rf cppdocs
 git clone https://github.com/pytorch/cppdocs

-set -ex -o pipefail
+set -ex

 # Generate ATen files
 pushd "${pt_checkout}"
--- a/.ci/pytorch/create_test_cert.py
+++ b/.ci/pytorch/create_test_cert.py
@ -45,7 +45,8 @@ def create_cert(path, C, ST, L, O, key):
        .not_valid_before(datetime.now(timezone.utc))
        .not_valid_after(
            # Our certificate will be valid for 10 days
-            datetime.now(timezone.utc) + timedelta(days=10)
+            datetime.now(timezone.utc)
+            + timedelta(days=10)
        )
        .add_extension(
            x509.BasicConstraints(ca=True, path_length=None),
@ -90,7 +91,8 @@ def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key):
        .not_valid_before(datetime.now(timezone.utc))
        .not_valid_after(
            # Our certificate will be valid for 10 days
-            datetime.now(timezone.utc) + timedelta(days=10)
+            datetime.now(timezone.utc)
+            + timedelta(days=10)
            # Sign our certificate with our private key
        )
        .sign(private_ca_key, hashes.SHA256())
--- a/.ci/pytorch/functorch_doc_push_script.sh
+++ b/.ci/pytorch/functorch_doc_push_script.sh
@ -5,7 +5,7 @@ pt_checkout="/var/lib/jenkins/workspace"
 source "$pt_checkout/.ci/pytorch/common_utils.sh"
 echo "functorch_doc_push_script.sh: Invoked with $*"

-set -ex -o pipefail
+set -ex

 version=${DOCS_VERSION:-nightly}
 echo "version: $version"
--- a/.ci/pytorch/install_cache_xla.sh
+++ b/.ci/pytorch/install_cache_xla.sh
@ -6,7 +6,7 @@
 # return the same thing, ex checks for rocm, CUDA, and changing the path
 # where sccache is installed, and not changing /etc/environment.

-set -ex -o pipefail
+set -ex

 install_binary() {
  echo "Downloading sccache binary from S3 repo"
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -1,5 +1,4 @@
 #!/bin/bash
-set -x

 # shellcheck disable=SC2034
 # shellcheck source=./macos-common.sh
@ -149,146 +148,9 @@ test_jit_hooks() {
  assert_git_not_dirty
 }

-torchbench_setup_macos() {
-  git clone --recursive https://github.com/pytorch/vision torchvision
-  git clone --recursive https://github.com/pytorch/audio torchaudio
-
-  pushd torchvision
-  git fetch
-  git checkout "$(cat ../.github/ci_commit_pins/vision.txt)"
-  git submodule update --init --recursive
-  python setup.py clean
-  python setup.py develop
-  popd
-
-  pushd torchaudio
-  git fetch
-  git checkout "$(cat ../.github/ci_commit_pins/audio.txt)"
-  git submodule update --init --recursive
-  python setup.py clean
-  python setup.py develop
-  popd
-
-  # Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
-  # shellcheck disable=SC2119,SC2120
-  checkout_install_torchbench
-}
-
-conda_benchmark_deps() {
-  conda install -y astunparse numpy scipy ninja pyyaml setuptools cmake typing-extensions requests protobuf numba cython scikit-learn
-  conda install -y -c conda-forge librosa
-}
-
-
-test_torchbench_perf() {
-  print_cmake_info
-
-  echo "Launching torchbench setup"
-  conda_benchmark_deps
-  torchbench_setup_macos
-
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-
-  local backend=eager
-  local dtype=notset
-  local device=mps
-
-  echo "Setup complete, launching torchbench training performance run"
-  PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-    --performance --backend "$backend" --training --devices "$device" \
-    --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
-
-  echo "Launching torchbench inference performance run"
-  PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-    --performance --backend "$backend" --inference --devices "$device" \
-    --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
-
-  echo "Pytorch benchmark on mps device completed"
-}
-
-test_torchbench_smoketest() {
-  print_cmake_info
-
-  echo "Launching torchbench setup"
-  conda_benchmark_deps
-  # shellcheck disable=SC2119,SC2120
-  torchbench_setup_macos
-
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-
-  local backend=eager
-  local dtype=notset
-  local device=mps
-
-  touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
-  touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
-
-  echo "Setup complete, launching torchbench training performance run"
-  for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
-    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-      --performance --only "$model" --backend "$backend" --training --devices "$device" \
-      --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
-  done
-
-  echo "Launching torchbench inference performance run"
-  for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
-    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-      --performance --only "$model" --backend "$backend" --inference --devices "$device" \
-      --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
-  done
-
-  echo "Pytorch benchmark on mps device completed"
-}
-
-test_hf_perf() {
-  print_cmake_info
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-  conda_benchmark_deps
-  torchbench_setup_macos
-
-  echo "Launching HuggingFace training perf run"
-  python "$(pwd)"/benchmarks/dynamo/huggingface.py --backend eager --device mps --performance --training --output="${TEST_REPORTS_DIR}"/hf_training.csv
-
-  echo "Launching HuggingFace inference perf run"
-  python "$(pwd)"/benchmarks/dynamo/huggingface.py --backend eager --device mps --performance --training --output="${TEST_REPORTS_DIR}"/hf_inference.csv
-
-  echo "HuggingFace benchmark on mps device completed"
-}
-
-test_timm_perf() {
-  print_cmake_info
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-  conda_benchmark_deps
-  torchbench_setup_macos
-
-  echo "Launching timm training perf run"
-  python "$(pwd)"/benchmarks/dynamo/timm_models.py --backend eager --device mps --performance --training --output="${TEST_REPORTS_DIR}"/timm_training.csv
-
-  echo "Launching timm inference perf run"
-  python "$(pwd)"/benchmarks/dynamo/timm_models.py --backend eager --device mps --performance --training --output="${TEST_REPORTS_DIR}"/timm_inference.csv
-
-  echo "timm benchmark on mps device completed"
-}
-
 install_tlparse

-if [[ $TEST_CONFIG == *"perf_all"* ]]; then
-  test_torchbench_perf
-  test_hf_perf
-  test_timm_perf
-elif [[ $TEST_CONFIG == *"perf_torchbench"* ]]; then
-  test_torchbench_perf
-elif [[ $TEST_CONFIG == *"perf_hf"* ]]; then
-  test_hf_perf
-elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then
-  test_timm_perf
-elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then
-  test_torchbench_smoketest
-elif [[ $NUM_TEST_SHARDS -gt 1 ]]; then
+if [[ $NUM_TEST_SHARDS -gt 1 ]]; then
  test_python_shard "${SHARD_NUMBER}"
  if [[ "${SHARD_NUMBER}" == 1 ]]; then
    test_libtorch
--- a/.ci/pytorch/multigpu-test.sh
+++ b/.ci/pytorch/multigpu-test.sh
@ -8,62 +8,55 @@
 source "$(dirname "${BASH_SOURCE[0]}")/common.sh"

 echo "Testing pytorch"
-# When adding more tests, please use HUD to see which shard is shorter
-if [[ "${SHARD_NUMBER:-1}" == "1" ]]; then
-    # FSDP tests
-    for f in test/distributed/fsdp/*.py ; do time python test/run_test.py --verbose -i "${f#*/}" ; done
-fi
+time python test/run_test.py --include test_cuda_multigpu test_cuda_primary_ctx --verbose

-if [[ "${SHARD_NUMBER:-2}" == "2" ]]; then
-    time python test/run_test.py --include test_cuda_multigpu test_cuda_primary_ctx --verbose
+# Disabling tests to see if they solve timeout issues; see https://github.com/pytorch/pytorch/issues/70015
+# python tools/download_mnist.py --quiet -d test/cpp/api/mnist
+# OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" build/bin/test_api
+time python test/run_test.py --verbose -i distributed/test_c10d_common
+time python test/run_test.py --verbose -i distributed/test_c10d_gloo
+time python test/run_test.py --verbose -i distributed/test_c10d_nccl
+time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
+time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
+time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
+time python test/run_test.py --verbose -i distributed/test_store
+time python test/run_test.py --verbose -i distributed/test_symmetric_memory
+time python test/run_test.py --verbose -i distributed/test_pg_wrapper
+time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent
+# FSDP tests
+for f in test/distributed/fsdp/*.py ; do time python test/run_test.py --verbose -i "${f#*/}" ; done
+# ShardedTensor tests
+time python test/run_test.py --verbose -i distributed/checkpoint/test_checkpoint
+time python test/run_test.py --verbose -i distributed/checkpoint/test_file_system_checkpoint
+time python test/run_test.py --verbose -i distributed/_shard/sharding_spec/test_sharding_spec
+time python test/run_test.py --verbose -i distributed/_shard/sharding_plan/test_sharding_plan
+time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/test_sharded_tensor
+time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/test_sharded_tensor_reshard

-    # Disabling tests to see if they solve timeout issues; see https://github.com/pytorch/pytorch/issues/70015
-    # python tools/download_mnist.py --quiet -d test/cpp/api/mnist
-    # OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" build/bin/test_api
-    time python test/run_test.py --verbose -i distributed/test_c10d_common
-    time python test/run_test.py --verbose -i distributed/test_c10d_gloo
-    time python test/run_test.py --verbose -i distributed/test_c10d_nccl
-    time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
-    time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
-    time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
-    time python test/run_test.py --verbose -i distributed/test_store
-    time python test/run_test.py --verbose -i distributed/test_symmetric_memory
-    time python test/run_test.py --verbose -i distributed/test_pg_wrapper
-    time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent
+# functional collective tests
+time python test/run_test.py --verbose -i distributed/test_functional_api

-    # ShardedTensor tests
-    time python test/run_test.py --verbose -i distributed/checkpoint/test_checkpoint
-    time python test/run_test.py --verbose -i distributed/checkpoint/test_file_system_checkpoint
-    time python test/run_test.py --verbose -i distributed/_shard/sharding_spec/test_sharding_spec
-    time python test/run_test.py --verbose -i distributed/_shard/sharding_plan/test_sharding_plan
-    time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/test_sharded_tensor
-    time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/test_sharded_tensor_reshard
+# DTensor tests
+time python test/run_test.py --verbose -i distributed/_tensor/test_random_ops
+time python test/run_test.py --verbose -i distributed/_tensor/test_dtensor_compile

-    # functional collective tests
-    time python test/run_test.py --verbose -i distributed/test_functional_api
+# DeviceMesh test
+time python test/run_test.py --verbose -i distributed/test_device_mesh

-    # DTensor tests
-    time python test/run_test.py --verbose -i distributed/tensor/test_random_ops
-    time python test/run_test.py --verbose -i distributed/tensor/test_dtensor_compile
+# DTensor/TP tests
+time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_examples
+time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state

-    # DeviceMesh test
-    time python test/run_test.py --verbose -i distributed/test_device_mesh
+# FSDP2 tests
+time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh

-    # DTensor/TP tests
-    time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_examples
-    time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state
+# ND composability tests
+time python test/run_test.py --verbose -i distributed/_composable/test_composability/test_2d_composability
+time python test/run_test.py --verbose -i distributed/_composable/test_composability/test_pp_composability

-    # FSDP2 tests
-    time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh
-
-    # ND composability tests
-    time python test/run_test.py --verbose -i distributed/_composable/test_composability/test_2d_composability
-    time python test/run_test.py --verbose -i distributed/_composable/test_composability/test_pp_composability
-
-    # Other tests
-    time python test/run_test.py --verbose -i test_cuda_primary_ctx
-    time python test/run_test.py --verbose -i test_optim -- -k test_forloop_goes_right_direction_multigpu
-    time python test/run_test.py --verbose -i test_optim -- -k test_mixed_device_dtype
-    time python test/run_test.py --verbose -i test_foreach -- -k test_tensors_grouping
-fi
+# Other tests
+time python test/run_test.py --verbose -i test_cuda_primary_ctx
+time python test/run_test.py --verbose -i test_optim -- -k test_forloop_goes_right_direction_multigpu
+time python test/run_test.py --verbose -i test_optim -- -k test_mixed_device_dtype
+time python test/run_test.py --verbose -i test_foreach -- -k test_tensors_grouping
 assert_git_not_dirty
--- a/.ci/pytorch/python_doc_push_script.sh
+++ b/.ci/pytorch/python_doc_push_script.sh
@ -7,7 +7,7 @@ source "$pt_checkout/.ci/pytorch/common_utils.sh"

 echo "python_doc_push_script.sh: Invoked with $*"

-set -ex -o pipefail
+set -ex

 # for statements like ${1:-${DOCS_INSTALL_PATH:-docs/}}
 # the order of operations goes:
@ -63,7 +63,7 @@ build_docs () {
    echo "(tried to echo the WARNINGS above the ==== line)"
    echo =========================
  fi
-  set -ex -o pipefail
+  set -ex
  return $code
 }

--- a/.ci/pytorch/run_tests.sh
+++ b/.ci/pytorch/run_tests.sh
@ -1,436 +0,0 @@
-#!/bin/bash
-# shellcheck disable=SC2086,SC2048,SC2068,SC2145,SC2034,SC2207,SC2143
-# TODO: Re-enable shellchecks above
-
-set -eux -o pipefail
-
-# Essentially runs pytorch/test/run_test.py, but keeps track of which tests to
-# skip in a centralized place.
-#
-# TODO Except for a few tests, this entire file is a giant TODO. Why are these
-# tests # failing?
-# TODO deal with Windows
-
-# This script expects to be in the pytorch root folder
-if [[ ! -d 'test' || ! -f 'test/run_test.py' ]]; then
-    echo "run_tests.sh expects to be run from the Pytorch root directory " \
-         "but I'm actually in $(pwd)"
-    exit 2
-fi
-
-# Allow master skip of all tests
-if [[ -n "${SKIP_ALL_TESTS:-}" ]]; then
-    exit 0
-fi
-
-# If given specific test params then just run those
-if [[ -n "${RUN_TEST_PARAMS:-}" ]]; then
-    echo "$(date) :: Calling user-command $(pwd)/test/run_test.py ${RUN_TEST_PARAMS[@]}"
-    python test/run_test.py ${RUN_TEST_PARAMS[@]}
-    exit 0
-fi
-
-# Function to retry functions that sometimes timeout or have flaky failures
-retry () {
-    $*  || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)
-}
-
-# Parameters
-##############################################################################
-if [[ "$#" != 3 ]]; then
-  if [[ -z "${DESIRED_PYTHON:-}" || -z "${DESIRED_CUDA:-}" || -z "${PACKAGE_TYPE:-}" ]]; then
-    echo "USAGE: run_tests.sh  PACKAGE_TYPE  DESIRED_PYTHON  DESIRED_CUDA"
-    echo "The env variable PACKAGE_TYPE must be set to 'conda' or 'manywheel' or 'libtorch'"
-    echo "The env variable DESIRED_PYTHON must be set like '2.7mu' or '3.6m' etc"
-    echo "The env variable DESIRED_CUDA must be set like 'cpu' or 'cu80' etc"
-    exit 1
-  fi
-  package_type="$PACKAGE_TYPE"
-  py_ver="$DESIRED_PYTHON"
-  cuda_ver="$DESIRED_CUDA"
-else
-  package_type="$1"
-  py_ver="$2"
-  cuda_ver="$3"
-fi
-
-if [[ "$cuda_ver" == 'cpu-cxx11-abi' ]]; then
-    cuda_ver="cpu"
-fi
-
-# cu80, cu90, cu100, cpu
-if [[ ${#cuda_ver} -eq 4 ]]; then
-    cuda_ver_majmin="${cuda_ver:2:1}.${cuda_ver:3:1}"
-elif [[ ${#cuda_ver} -eq 5 ]]; then
-    cuda_ver_majmin="${cuda_ver:2:2}.${cuda_ver:4:1}"
-fi
-
-NUMPY_PACKAGE=""
-if [[ ${py_ver} == "3.10" ]]; then
-    PROTOBUF_PACKAGE="protobuf>=3.17.2"
-    NUMPY_PACKAGE="numpy>=1.21.2"
-else
-    PROTOBUF_PACKAGE="protobuf=3.14.0"
-fi
-
-# Environment initialization
-if [[ "$(uname)" == Darwin ]]; then
-    # Install the testing dependencies
-    retry conda install -yq future hypothesis ${NUMPY_PACKAGE} ${PROTOBUF_PACKAGE} pytest setuptools six typing_extensions pyyaml
-else
-    retry pip install -qr requirements.txt || true
-    retry pip install -q hypothesis protobuf pytest setuptools || true
-    numpy_ver=1.15
-    case "$(python --version 2>&1)" in
-      *2* | *3.5* | *3.6*)
-        numpy_ver=1.11
-        ;;
-    esac
-    retry pip install -q "numpy==${numpy_ver}" || true
-fi
-
-echo "Testing with:"
-pip freeze
-conda list || true
-
-##############################################################################
-# Smoke tests
-##############################################################################
-# TODO use check_binary.sh, which requires making sure it runs on Windows
-pushd /
-echo "Smoke testing imports"
-python -c 'import torch'
-
-# Test that MKL is there
-if [[ "$(uname)" == 'Darwin' && "$package_type" == *wheel ]]; then
-    echo 'Not checking for MKL on Darwin wheel packages'
-else
-    echo "Checking that MKL is available"
-    python -c 'import torch; exit(0 if torch.backends.mkl.is_available() else 1)'
-fi
-
-if [[ "$OSTYPE" == "msys" ]]; then
-    GPUS=$(wmic path win32_VideoController get name)
-    if [[ ! "$GPUS" == *NVIDIA* ]]; then
-        echo "Skip CUDA tests for machines without a Nvidia GPU card"
-        exit 0
-    fi
-fi
-
-# Test that the version number is consistent during building and testing
-if [[ "$PYTORCH_BUILD_NUMBER" -gt 1 ]]; then
-    expected_version="${PYTORCH_BUILD_VERSION}.post${PYTORCH_BUILD_NUMBER}"
-else
-    expected_version="${PYTORCH_BUILD_VERSION}"
-fi
-echo "Checking that we are testing the package that is just built"
-python -c "import torch; exit(0 if torch.__version__ == '$expected_version' else 1)"
-
-# Test that CUDA builds are setup correctly
-if [[ "$cuda_ver" != 'cpu' ]]; then
-    cuda_installed=1
-    nvidia-smi || cuda_installed=0
-    if [[ "$cuda_installed" == 0 ]]; then
-      echo "Skip CUDA tests for machines without a Nvidia GPU card"
-    else
-      # Test CUDA archs
-      echo "Checking that CUDA archs are setup correctly"
-      timeout 20 python -c 'import torch; torch.randn([3,5]).cuda()'
-
-      # These have to run after CUDA is initialized
-      echo "Checking that magma is available"
-      python -c 'import torch; torch.rand(1).cuda(); exit(0 if torch.cuda.has_magma else 1)'
-      echo "Checking that CuDNN is available"
-      python -c 'import torch; exit(0 if torch.backends.cudnn.is_available() else 1)'
-    fi
-fi
-
-# Check that OpenBlas is not linked to on MacOS
-if [[ "$(uname)" == 'Darwin' ]]; then
-    echo "Checking the OpenBLAS is not linked to"
-    all_dylibs=($(find "$(python -c "import site; print(site.getsitepackages()[0])")"/torch -name '*.dylib'))
-    for dylib in "${all_dylibs[@]}"; do
-        if [[ -n "$(otool -L $dylib | grep -i openblas)" ]]; then
-            echo "Found openblas as a dependency of $dylib"
-            echo "Full dependencies is: $(otool -L $dylib)"
-            exit 1
-        fi
-    done
-
-    echo "Checking that OpenMP is available"
-    python -c "import torch; exit(0 if torch.backends.openmp.is_available() else 1)"
-fi
-
-popd
-
-# TODO re-enable the other tests after the nightlies are moved to CI. This is
-# because the binaries keep breaking, often from additional tests, that aren't
-# real problems. Once these are on circleci and a smoke-binary-build is added
-# to PRs then this should stop happening and these can be re-enabled.
-echo "Not running unit tests. Hopefully these problems are caught by CI"
-exit 0
-
-
-##############################################################################
-# Running unit tests (except not right now)
-##############################################################################
-echo "$(date) :: Starting tests for $package_type package for python$py_ver and $cuda_ver"
-
-# We keep track of exact tests to skip, as otherwise we would be hardly running
-# any tests. But b/c of issues working with pytest/normal-python-test/ and b/c
-# of special snowflake tests in test/run_test.py we also take special care of
-# those
-tests_to_skip=()
-
-#
-# Entire file exclusions
-##############################################################################
-entire_file_exclusions=("-x")
-
-# cpp_extensions doesn't work with pytest, so we exclude it from the pytest run
-# here and then manually run it later. Note that this is only because this
-# entire_fil_exclusions flag is only passed to the pytest run
-entire_file_exclusions+=("cpp_extensions")
-
-# TODO temporary line to fix next days nightlies, but should be removed when
-# issue is fixed
-entire_file_exclusions+=('type_info')
-
-if [[ "$cuda_ver" == 'cpu' ]]; then
-    # test/test_cuda.py exits early if the installed torch is not built with
-    # CUDA, but the exit doesn't work when running with pytest, so pytest will
-    # still try to run all the CUDA tests and then fail
-    entire_file_exclusions+=("cuda")
-    entire_file_exclusions+=("nccl")
-fi
-
-if [[ "$(uname)" == 'Darwin' || "$OSTYPE" == "msys" ]]; then
-    # pytest on Mac doesn't like the exits in these files
-    entire_file_exclusions+=('c10d')
-    entire_file_exclusions+=('distributed')
-
-    # pytest doesn't mind the exit but fails the tests. On Mac we run this
-    # later without pytest
-    entire_file_exclusions+=('thd_distributed')
-fi
-
-
-#
-# Universal flaky tests
-##############################################################################
-
-# RendezvousEnvTest sometimes hangs forever
-# Otherwise it will fail on CUDA with
-#   Traceback (most recent call last):
-#     File "test_c10d.py", line 179, in test_common_errors
-#       next(gen)
-#   AssertionError: ValueError not raised
-tests_to_skip+=('RendezvousEnvTest and test_common_errors')
-
-# This hung forever once on conda_3.5_cu92
-tests_to_skip+=('TestTorch and test_sum_dim')
-
-# test_trace_warn isn't actually flaky, but it doesn't work with pytest so we
-# just skip it
-tests_to_skip+=('TestJit and test_trace_warn')
-#
-# Python specific flaky tests
-##############################################################################
-
-# test_dataloader.py:721: AssertionError
-# looks like a timeout, but interestingly only appears on python 3
-if [[ "$py_ver" == 3* ]]; then
-    tests_to_skip+=('TestDataLoader and test_proper_exit')
-fi
-
-#
-# CUDA flaky tests, all package types
-##############################################################################
-if [[ "$cuda_ver" != 'cpu' ]]; then
-
-    #
-    # DistributedDataParallelTest
-    # All of these seem to fail
-    tests_to_skip+=('DistributedDataParallelTest')
-
-    #
-    # RendezvousEnvTest
-    # Traceback (most recent call last):
-    #   File "test_c10d.py", line 201, in test_nominal
-    #     store0, rank0, size0 = next(gen0)
-    #   File "/opt/python/cp36-cp36m/lib/python3.6/site-packages/torch/distributed/rendezvous.py", line 131, in _env_rendezvous_handler
-    #     store = TCPStore(master_addr, master_port, start_daemon)
-    # RuntimeError: Address already in use
-    tests_to_skip+=('RendezvousEnvTest and test_nominal')
-
-    #
-    # TestCppExtension
-    #
-    # Traceback (most recent call last):
-    #   File "test_cpp_extensions.py", line 134, in test_jit_cudnn_extension
-    #     with_cuda=True)
-    #   File "/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 552, in load
-    #     with_cuda)
-    #   File "/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 729, in _jit_compile
-    #     return _import_module_from_library(name, build_directory)
-    #   File "/opt/python/cp35-cp35m/lib/python3.5/site-packages/torch/utils/cpp_extension.py", line 867, in _import_module_from_library
-    #     return imp.load_module(module_name, file, path, description)
-    #   File "/opt/python/cp35-cp35m/lib/python3.5/imp.py", line 243, in load_module
-    #     return load_dynamic(name, filename, file)
-    #   File "/opt/python/cp35-cp35m/lib/python3.5/imp.py", line 343, in load_dynamic
-    #     return _load(spec)
-    #   File "<frozen importlib._bootstrap>", line 693, in _load
-    #   File "<frozen importlib._bootstrap>", line 666, in _load_unlocked
-    #   File "<frozen importlib._bootstrap>", line 577, in module_from_spec
-    #   File "<frozen importlib._bootstrap_external>", line 938, in create_module
-    #   File "<frozen importlib._bootstrap>", line 222, in _call_with_frames_removed
-    # ImportError: libcudnn.so.7: cannot open shared object file: No such file or directory
-    tests_to_skip+=('TestCppExtension and test_jit_cudnn_extension')
-
-    #
-    # TestCuda
-    #
-
-    # 3.7_cu80
-    #  RuntimeError: CUDA error: out of memory
-    tests_to_skip+=('TestCuda and test_arithmetic_large_tensor')
-
-    # 3.7_cu80
-    # RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch-nightly_1538097262541/work/aten/src/THC/THCTensorCopy.cu:205
-    tests_to_skip+=('TestCuda and test_autogpu')
-
-    #
-    # TestDistBackend
-    #
-
-    # Traceback (most recent call last):
-    #   File "test_thd_distributed.py", line 1046, in wrapper
-    #     self._join_and_reduce(fn)
-    #   File "test_thd_distributed.py", line 1108, in _join_and_reduce
-    #     self.assertEqual(p.exitcode, first_process.exitcode)
-    #   File "/pytorch/test/common.py", line 399, in assertEqual
-    #     super(TestCase, self).assertEqual(x, y, message)
-    # AssertionError: None != 77 :
-    tests_to_skip+=('TestDistBackend and test_all_gather_group')
-    tests_to_skip+=('TestDistBackend and test_all_reduce_group_max')
-    tests_to_skip+=('TestDistBackend and test_all_reduce_group_min')
-    tests_to_skip+=('TestDistBackend and test_all_reduce_group_sum')
-    tests_to_skip+=('TestDistBackend and test_all_reduce_group_product')
-    tests_to_skip+=('TestDistBackend and test_barrier_group')
-    tests_to_skip+=('TestDistBackend and test_broadcast_group')
-
-    # Traceback (most recent call last):
-    #   File "test_thd_distributed.py", line 1046, in wrapper
-    #     self._join_and_reduce(fn)
-    #   File "test_thd_distributed.py", line 1108, in _join_and_reduce
-    #     self.assertEqual(p.exitcode, first_process.exitcode)
-    #   File "/pytorch/test/common.py", line 397, in assertEqual
-    #     super(TestCase, self).assertLessEqual(abs(x - y), prec, message)
-    # AssertionError: 12 not less than or equal to 1e-05
-    tests_to_skip+=('TestDistBackend and test_barrier')
-
-    # Traceback (most recent call last):
-    #   File "test_distributed.py", line 1267, in wrapper
-    #     self._join_and_reduce(fn)
-    #   File "test_distributed.py", line 1350, in _join_and_reduce
-    #     self.assertEqual(p.exitcode, first_process.exitcode)
-    #   File "/pytorch/test/common.py", line 399, in assertEqual
-    #     super(TestCase, self).assertEqual(x, y, message)
-    # AssertionError: None != 1
-    tests_to_skip+=('TestDistBackend and test_broadcast')
-
-    # Memory leak very similar to all the conda ones below, but appears on manywheel
-    # 3.6m_cu80
-    # AssertionError: 1605632 not less than or equal to 1e-05 : __main__.TestEndToEndHybridFrontendModels.test_vae_cuda leaked 1605632 bytes CUDA memory on device 0
-    tests_to_skip+=('TestEndToEndHybridFrontendModels and test_vae_cuda')
-
-    # ________________________ TestNN.test_embedding_bag_cuda ________________________
-    #
-    # self = <test_nn.TestNN testMethod=test_embedding_bag_cuda>
-    # dtype = torch.float32
-    #
-    #     @unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
-    #     @repeat_test_for_types(ALL_TENSORTYPES)
-    #     @skipIfRocm
-    #     def test_embedding_bag_cuda(self, dtype=torch.float):
-    #         self._test_EmbeddingBag(True, 'sum', False, dtype)
-    #         self._test_EmbeddingBag(True, 'mean', False, dtype)
-    #         self._test_EmbeddingBag(True, 'max', False, dtype)
-    #         if dtype != torch.half:
-    #             # torch.cuda.sparse.HalfTensor is not enabled.
-    #             self._test_EmbeddingBag(True, 'sum', True, dtype)
-    # >           self._test_EmbeddingBag(True, 'mean', True, dtype)
-    #
-    # test_nn.py:2144:
-    # _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
-    # test_nn.py:2062: in _test_EmbeddingBag
-    #     _test_vs_Embedding(N, D, B, L)
-    # test_nn.py:2059: in _test_vs_Embedding
-    #     self.assertEqual(es_weight_grad, e.weight.grad, needed_prec)
-    # common.py:373: in assertEqual
-    #     assertTensorsEqual(x, y)
-    # common.py:365: in assertTensorsEqual
-    #     self.assertLessEqual(max_err, prec, message)
-    # E   AssertionError: tensor(0.0000, device='cuda:0', dtype=torch.float32) not less than or equal to 2e-05 :
-    #  1 failed, 1202 passed, 19 skipped, 2 xfailed, 796 warnings in 1166.73 seconds =
-    # Traceback (most recent call last):
-    #   File "test/run_test.py", line 391, in <module>
-    #     main()
-    #   File "test/run_test.py", line 383, in main
-    #     raise RuntimeError(message)
-    tests_to_skip+=('TestNN and test_embedding_bag_cuda')
-fi
-
-##############################################################################
-# MacOS specific flaky tests
-##############################################################################
-
-if [[ "$(uname)" == 'Darwin' ]]; then
-    # TestCppExtensions by default uses a temp folder in /tmp. This doesn't
-    # work for this Mac machine cause there is only one machine and /tmp is
-    # shared. (All the linux builds are on docker so have their own /tmp).
-    tests_to_skip+=('TestCppExtension')
-fi
-
-# Turn the set of tests to skip into an invocation that pytest understands
-excluded_tests_logic=''
-for exclusion in "${tests_to_skip[@]}"; do
-    if [[ -z "$excluded_tests_logic" ]]; then
-        # Only true for i==0
-        excluded_tests_logic="not ($exclusion)"
-    else
-        excluded_tests_logic="$excluded_tests_logic and not ($exclusion)"
-    fi
-done
-
-
-##############################################################################
-# Run the tests
-##############################################################################
-echo
-echo "$(date) :: Calling 'python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k '$excluded_tests_logic'"
-
-python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k "'" "$excluded_tests_logic" "'"
-
-echo
-echo "$(date) :: Finished 'python test/run_test.py -v -p pytest ${entire_file_exclusions[@]} -- --disable-pytest-warnings -k '$excluded_tests_logic'"
-
-# cpp_extensions don't work with pytest, so we run them without pytest here,
-# except there's a failure on CUDA builds (documented above), and
-# cpp_extensions doesn't work on a shared mac machine (also documented above)
-if [[ "$cuda_ver" == 'cpu' && "$(uname)" != 'Darwin' ]]; then
-    echo
-    echo "$(date) :: Calling 'python test/run_test.py -v -i cpp_extensions'"
-    python test/run_test.py -v -i cpp_extensions
-    echo
-    echo "$(date) :: Finished 'python test/run_test.py -v -i cpp_extensions'"
-fi
-
-# thd_distributed can run on Mac but not in pytest
-if [[ "$(uname)" == 'Darwin' ]]; then
-    echo
-    echo "$(date) :: Calling 'python test/run_test.py -v -i thd_distributed'"
-    python test/run_test.py -v -i thd_distributed
-    echo
-    echo "$(date) :: Finished 'python test/run_test.py -v -i thd_distributed'"
-fi
--- a/.ci/pytorch/smoke_test/check_binary_symbols.py
+++ b/.ci/pytorch/smoke_test/check_binary_symbols.py
@ -1,130 +0,0 @@
-#!/usr/bin/env python3
-import concurrent.futures
-import distutils.sysconfig
-import functools
-import itertools
-import os
-import re
-from pathlib import Path
-from typing import Any, List, Tuple
-
-
-# We also check that there are [not] cxx11 symbols in libtorch
-#
-# To check whether it is using cxx11 ABI, check non-existence of symbol:
-PRE_CXX11_SYMBOLS = (
-    "std::basic_string<",
-    "std::list",
-)
-# To check whether it is using pre-cxx11 ABI, check non-existence of symbol:
-CXX11_SYMBOLS = (
-    "std::__cxx11::basic_string",
-    "std::__cxx11::list",
-)
-# NOTE: Checking the above symbols in all namespaces doesn't work, because
-# devtoolset7 always produces some cxx11 symbols even if we build with old ABI,
-# and CuDNN always has pre-cxx11 symbols even if we build with new ABI using gcc 5.4.
-# Instead, we *only* check the above symbols in the following namespaces:
-LIBTORCH_NAMESPACE_LIST = (
-    "c10::",
-    "at::",
-    "caffe2::",
-    "torch::",
-)
-
-
-def _apply_libtorch_symbols(symbols):
-    return [
-        re.compile(f"{x}.*{y}")
-        for (x, y) in itertools.product(LIBTORCH_NAMESPACE_LIST, symbols)
-    ]
-
-
-LIBTORCH_CXX11_PATTERNS = _apply_libtorch_symbols(CXX11_SYMBOLS)
-
-LIBTORCH_PRE_CXX11_PATTERNS = _apply_libtorch_symbols(PRE_CXX11_SYMBOLS)
-
-
-@functools.lru_cache(100)
-def get_symbols(lib: str) -> List[Tuple[str, str, str]]:
-    from subprocess import check_output
-
-    lines = check_output(f'nm "{lib}"|c++filt', shell=True)
-    return [x.split(" ", 2) for x in lines.decode("latin1").split("\n")[:-1]]
-
-
-def grep_symbols(lib: str, patterns: List[Any]) -> List[str]:
-    def _grep_symbols(
-        symbols: List[Tuple[str, str, str]], patterns: List[Any]
-    ) -> List[str]:
-        rc = []
-        for _s_addr, _s_type, s_name in symbols:
-            for pattern in patterns:
-                if pattern.match(s_name):
-                    rc.append(s_name)
-                    continue
-        return rc
-
-    all_symbols = get_symbols(lib)
-    num_workers = 32
-    chunk_size = (len(all_symbols) + num_workers - 1) // num_workers
-
-    def _get_symbols_chunk(i):
-        return all_symbols[i * chunk_size : (i + 1) * chunk_size]
-
-    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
-        tasks = [
-            executor.submit(_grep_symbols, _get_symbols_chunk(i), patterns)
-            for i in range(num_workers)
-        ]
-        return functools.reduce(list.__add__, (x.result() for x in tasks), [])
-
-
-def check_lib_symbols_for_abi_correctness(lib: str, pre_cxx11_abi: bool = True) -> None:
-    print(f"lib: {lib}")
-    cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS)
-    pre_cxx11_symbols = grep_symbols(lib, LIBTORCH_PRE_CXX11_PATTERNS)
-    num_cxx11_symbols = len(cxx11_symbols)
-    num_pre_cxx11_symbols = len(pre_cxx11_symbols)
-    print(f"num_cxx11_symbols: {num_cxx11_symbols}")
-    print(f"num_pre_cxx11_symbols: {num_pre_cxx11_symbols}")
-    if pre_cxx11_abi:
-        if num_cxx11_symbols > 0:
-            raise RuntimeError(
-                f"Found cxx11 symbols, but there shouldn't be any, see: {cxx11_symbols[:100]}"
-            )
-        if num_pre_cxx11_symbols < 1000:
-            raise RuntimeError("Didn't find enough pre-cxx11 symbols.")
-        # Check for no recursive iterators, regression test for https://github.com/pytorch/pytorch/issues/133437
-        rec_iter_symbols = grep_symbols(
-            lib, [re.compile("std::filesystem::recursive_directory_iterator.*")]
-        )
-        if len(rec_iter_symbols) > 0:
-            raise RuntimeError(
-                f"recursive_directory_iterator in used pre-CXX11 binaries, see; {rec_iter_symbols}"
-            )
-    else:
-        if num_pre_cxx11_symbols > 0:
-            raise RuntimeError(
-                f"Found pre-cxx11 symbols, but there shouldn't be any, see: {pre_cxx11_symbols[:100]}"
-            )
-        if num_cxx11_symbols < 100:
-            raise RuntimeError("Didn't find enought cxx11 symbols")
-
-
-def main() -> None:
-    if "install_root" in os.environ:
-        install_root = Path(os.getenv("install_root"))  # noqa: SIM112
-    else:
-        if os.getenv("PACKAGE_TYPE") == "libtorch":
-            install_root = Path(os.getcwd())
-        else:
-            install_root = Path(distutils.sysconfig.get_python_lib()) / "torch"
-
-    libtorch_cpu_path = install_root / "lib" / "libtorch_cpu.so"
-    pre_cxx11_abi = "cxx11-abi" not in os.getenv("DESIRED_DEVTOOLSET", "")
-    check_lib_symbols_for_abi_correctness(libtorch_cpu_path, pre_cxx11_abi)
-
-
-if __name__ == "__main__":
-    main()
--- a/.ci/pytorch/smoke_test/max_autotune.py
+++ b/.ci/pytorch/smoke_test/max_autotune.py
@ -1,205 +0,0 @@
-import argparse
-
-from torchvision import datasets, transforms
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-from torch.optim.lr_scheduler import StepLR
-
-
-class Net(nn.Module):
-    def __init__(self):
-        super(Net, self).__init__()  # noqa: UP008
-        self.conv1 = nn.Conv2d(1, 32, 3, 1)
-        self.conv2 = nn.Conv2d(32, 64, 3, 1)
-        self.dropout1 = nn.Dropout(0.25)
-        self.dropout2 = nn.Dropout(0.5)
-        self.fc1 = nn.Linear(9216, 128)
-        self.fc2 = nn.Linear(128, 10)
-
-    def forward(self, x):
-        x = self.conv1(x)
-        x = F.relu(x)
-        x = self.conv2(x)
-        x = F.relu(x)
-        x = F.max_pool2d(x, 2)
-        x = self.dropout1(x)
-        x = torch.flatten(x, 1)
-        x = self.fc1(x)
-        x = F.relu(x)
-        x = self.dropout2(x)
-        x = self.fc2(x)
-        output = F.log_softmax(x, dim=1)
-        return output
-
-
-def train(args, model, device, train_loader, optimizer, epoch):
-    model.train()
-    for batch_idx, (data, target) in enumerate(train_loader):
-        data, target = data.to(device), target.to(device)
-        optimizer.zero_grad()
-        output = model(data)
-        loss = F.nll_loss(output, target)
-        loss.backward()
-        optimizer.step()
-        if batch_idx % args.log_interval == 0:
-            print(
-                f"Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}"  # noqa: B950
-            )
-            if args.dry_run:
-                break
-
-
-def test(model, device, test_loader):
-    model.eval()
-    test_loss = 0
-    correct = 0
-    with torch.no_grad():
-        for data, target in test_loader:
-            data, target = data.to(device), target.to(device)
-            output = model(data)
-            test_loss += F.nll_loss(
-                output, target, reduction="sum"
-            ).item()  # sum up batch loss
-            pred = output.argmax(
-                dim=1, keepdim=True
-            )  # get the index of the max log-probability
-            correct += pred.eq(target.view_as(pred)).sum().item()
-
-    test_loss /= len(test_loader.dataset)
-
-    print(
-        f"\nTest set: Average loss: {test_loss:.4f}, Accuracy: {correct}/{len(test_loader.dataset)} ({100. * correct / len(test_loader.dataset):.0f}%)\n"  # noqa: B950
-    )
-
-
-def timed(fn):
-    start = torch.cuda.Event(enable_timing=True)
-    end = torch.cuda.Event(enable_timing=True)
-    start.record()
-    result = fn()
-    end.record()
-    torch.cuda.synchronize()
-    return result, start.elapsed_time(end) / 1000
-
-
-def main():
-    # Training settings
-    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=64,
-        metavar="N",
-        help="input batch size for training (default: 64)",
-    )
-    parser.add_argument(
-        "--test-batch-size",
-        type=int,
-        default=1000,
-        metavar="N",
-        help="input batch size for testing (default: 1000)",
-    )
-    parser.add_argument(
-        "--epochs",
-        type=int,
-        default=4,
-        metavar="N",
-        help="number of epochs to train (default: 14)",
-    )
-    parser.add_argument(
-        "--lr",
-        type=float,
-        default=1.0,
-        metavar="LR",
-        help="learning rate (default: 1.0)",
-    )
-    parser.add_argument(
-        "--gamma",
-        type=float,
-        default=0.7,
-        metavar="M",
-        help="Learning rate step gamma (default: 0.7)",
-    )
-    parser.add_argument(
-        "--no-cuda", action="store_true", default=False, help="disables CUDA training"
-    )
-    parser.add_argument(
-        "--no-mps",
-        action="store_true",
-        default=False,
-        help="disables macOS GPU training",
-    )
-    parser.add_argument(
-        "--dry-run",
-        action="store_true",
-        default=False,
-        help="quickly check a single pass",
-    )
-    parser.add_argument(
-        "--seed", type=int, default=1, metavar="S", help="random seed (default: 1)"
-    )
-    parser.add_argument(
-        "--log-interval",
-        type=int,
-        default=100,
-        metavar="N",
-        help="how many batches to wait before logging training status",
-    )
-    parser.add_argument(
-        "--save-model",
-        action="store_true",
-        default=False,
-        help="For Saving the current Model",
-    )
-    args = parser.parse_args()
-    use_cuda = not args.no_cuda and torch.cuda.is_available()
-    use_mps = not args.no_mps and torch.backends.mps.is_available()
-
-    torch.manual_seed(args.seed)
-    torch.backends.cuda.matmul.allow_tf32 = True
-
-    if use_cuda:
-        device = torch.device("cuda")
-    elif use_mps:
-        device = torch.device("mps")
-    else:
-        device = torch.device("cpu")
-
-    train_kwargs = {"batch_size": args.batch_size}
-    test_kwargs = {"batch_size": args.test_batch_size}
-    if use_cuda:
-        cuda_kwargs = {"num_workers": 1, "pin_memory": True, "shuffle": True}
-        train_kwargs.update(cuda_kwargs)
-        test_kwargs.update(cuda_kwargs)
-
-    transform = transforms.Compose(
-        [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
-    )
-    dataset1 = datasets.MNIST("../data", train=True, download=True, transform=transform)
-    dataset2 = datasets.MNIST("../data", train=False, transform=transform)
-    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
-    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
-
-    model = Net().to(device)
-    opt_model = torch.compile(model, mode="max-autotune")
-    optimizer = optim.Adadelta(opt_model.parameters(), lr=args.lr)
-
-    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
-    for epoch in range(1, args.epochs + 1):
-        print(
-            f"Training Time: {timed(lambda: train(args, opt_model, device, train_loader, optimizer, epoch))[1]}"
-        )
-        print(
-            f"Evaluation Time: {timed(lambda: test(opt_model, device, test_loader))[1]}"
-        )
-        scheduler.step()
-
-    if args.save_model:
-        torch.save(opt_model.state_dict(), "mnist_cnn.pt")
-
-
-if __name__ == "__main__":
-    main()
--- a/.ci/pytorch/smoke_test/smoke_test.py
+++ b/.ci/pytorch/smoke_test/smoke_test.py
@ -1,394 +0,0 @@
-import argparse
-import importlib
-import json
-import os
-import re
-import subprocess
-import sys
-from pathlib import Path
-
-import torch
-import torch._dynamo
-import torch.nn as nn
-import torch.nn.functional as F
-
-
-if "MATRIX_GPU_ARCH_VERSION" in os.environ:
-    gpu_arch_ver = os.getenv("MATRIX_GPU_ARCH_VERSION")
-else:
-    gpu_arch_ver = os.getenv("GPU_ARCH_VERSION")  # Use fallback if available
-gpu_arch_type = os.getenv("MATRIX_GPU_ARCH_TYPE")
-channel = os.getenv("MATRIX_CHANNEL")
-package_type = os.getenv("MATRIX_PACKAGE_TYPE")
-target_os = os.getenv("TARGET_OS", sys.platform)
-BASE_DIR = Path(__file__).parent.parent.parent
-
-is_cuda_system = gpu_arch_type == "cuda"
-NIGHTLY_ALLOWED_DELTA = 3
-
-MODULES = [
-    {
-        "name": "torchvision",
-        "repo": "https://github.com/pytorch/vision.git",
-        "smoke_test": "./vision/test/smoke_test.py",
-        "extension": "extension",
-        "repo_name": "vision",
-    },
-    {
-        "name": "torchaudio",
-        "repo": "https://github.com/pytorch/audio.git",
-        "smoke_test": "./audio/test/smoke_test/smoke_test.py --no-ffmpeg",
-        "extension": "_extension",
-        "repo_name": "audio",
-    },
-]
-
-
-class Net(nn.Module):
-    def __init__(self):
-        super().__init__()
-        self.conv1 = nn.Conv2d(1, 32, 3, 1)
-        self.conv2 = nn.Conv2d(32, 64, 3, 1)
-        self.fc1 = nn.Linear(9216, 1)
-
-    def forward(self, x):
-        x = self.conv1(x)
-        x = self.conv2(x)
-        x = F.max_pool2d(x, 2)
-        x = torch.flatten(x, 1)
-        output = self.fc1(x)
-        return output
-
-
-def load_json_from_basedir(filename: str):
-    try:
-        with open(BASE_DIR / filename) as fptr:
-            return json.load(fptr)
-    except FileNotFoundError as exc:
-        raise ImportError(f"File {filename} not found error: {exc.strerror}") from exc
-    except json.JSONDecodeError as exc:
-        raise ImportError(f"Invalid JSON {filename}") from exc
-
-
-def read_release_matrix():
-    return load_json_from_basedir("release_matrix.json")
-
-
-def test_numpy():
-    import numpy as np
-
-    x = np.arange(5)
-    torch.tensor(x)
-
-
-def check_version(package: str) -> None:
-    release_version = os.getenv("RELEASE_VERSION")
-    # if release_version is specified, use it to validate the packages
-    if release_version:
-        release_matrix = read_release_matrix()
-        stable_version = release_matrix["torch"]
-    else:
-        stable_version = os.getenv("MATRIX_STABLE_VERSION")
-
-    # only makes sense to check nightly package where dates are known
-    if channel == "nightly":
-        check_nightly_binaries_date(package)
-    elif stable_version is not None:
-        if not torch.__version__.startswith(stable_version):
-            raise RuntimeError(
-                f"Torch version mismatch, expected {stable_version} for channel {channel}. But its {torch.__version__}"
-            )
-
-        if release_version and package == "all":
-            for module in MODULES:
-                imported_module = importlib.import_module(module["name"])
-                module_version = imported_module.__version__
-                if not module_version.startswith(release_matrix[module["name"]]):
-                    raise RuntimeError(
-                        f"{module['name']} version mismatch, expected: \
-                            {release_matrix[module['name']]} for channel {channel}. But its {module_version}"
-                    )
-                else:
-                    print(
-                        f"{module['name']} version actual: {module_version} expected: \
-                        {release_matrix[module['name']]} for channel {channel}."
-                    )
-
-    else:
-        print(f"Skip version check for channel {channel} as stable version is None")
-
-
-def check_nightly_binaries_date(package: str) -> None:
-    from datetime import datetime
-
-    format_dt = "%Y%m%d"
-
-    date_t_str = re.findall("dev\\d+", torch.__version__)
-    date_t_delta = datetime.now() - datetime.strptime(date_t_str[0][3:], format_dt)
-    if date_t_delta.days >= NIGHTLY_ALLOWED_DELTA:
-        raise RuntimeError(
-            f"the binaries are from {date_t_str} and are more than {NIGHTLY_ALLOWED_DELTA} days old!"
-        )
-
-    if package == "all":
-        for module in MODULES:
-            imported_module = importlib.import_module(module["name"])
-            module_version = imported_module.__version__
-            date_m_str = re.findall("dev\\d+", module_version)
-            date_m_delta = datetime.now() - datetime.strptime(
-                date_m_str[0][3:], format_dt
-            )
-            print(f"Nightly date check for {module['name']} version {module_version}")
-            if date_m_delta.days > NIGHTLY_ALLOWED_DELTA:
-                raise RuntimeError(
-                    f"Expected {module['name']} to be less then {NIGHTLY_ALLOWED_DELTA} days. But its {date_m_delta}"
-                )
-
-
-def test_cuda_runtime_errors_captured() -> None:
-    cuda_exception_missed = True
-    try:
-        print("Testing test_cuda_runtime_errors_captured")
-        torch._assert_async(torch.tensor(0, device="cuda"))
-        torch._assert_async(torch.tensor(0 + 0j, device="cuda"))
-    except RuntimeError as e:
-        if re.search("CUDA", f"{e}"):
-            print(f"Caught CUDA exception with success: {e}")
-            cuda_exception_missed = False
-        else:
-            raise e
-    if cuda_exception_missed:
-        raise RuntimeError("Expected CUDA RuntimeError but have not received!")
-
-
-def smoke_test_cuda(
-    package: str, runtime_error_check: str, torch_compile_check: str
-) -> None:
-    if not torch.cuda.is_available() and is_cuda_system:
-        raise RuntimeError(f"Expected CUDA {gpu_arch_ver}. However CUDA is not loaded.")
-
-    if package == "all" and is_cuda_system:
-        for module in MODULES:
-            imported_module = importlib.import_module(module["name"])
-            # TBD for vision move extension module to private so it will
-            # be _extention.
-            version = "N/A"
-            if module["extension"] == "extension":
-                version = imported_module.extension._check_cuda_version()
-            else:
-                version = imported_module._extension._check_cuda_version()
-            print(f"{module['name']} CUDA: {version}")
-
-    # torch.compile is available on macos-arm64 and Linux for python 3.8-3.13
-    if (
-        torch_compile_check == "enabled"
-        and sys.version_info < (3, 14, 0)
-        and target_os in ["linux", "linux-aarch64", "macos-arm64", "darwin"]
-    ):
-        smoke_test_compile("cuda" if torch.cuda.is_available() else "cpu")
-
-    if torch.cuda.is_available():
-        if torch.version.cuda != gpu_arch_ver:
-            raise RuntimeError(
-                f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}"
-            )
-        print(f"torch cuda: {torch.version.cuda}")
-        # todo add cudnn version validation
-        print(f"torch cudnn: {torch.backends.cudnn.version()}")
-        print(f"cuDNN enabled? {torch.backends.cudnn.enabled}")
-
-        torch.cuda.init()
-        print("CUDA initialized successfully")
-        print(f"Number of CUDA devices: {torch.cuda.device_count()}")
-        for i in range(torch.cuda.device_count()):
-            print(f"Device {i}: {torch.cuda.get_device_name(i)}")
-
-        # nccl is availbale only on Linux
-        if sys.platform in ["linux", "linux2"]:
-            print(f"torch nccl version: {torch.cuda.nccl.version()}")
-
-        if runtime_error_check == "enabled":
-            test_cuda_runtime_errors_captured()
-
-
-def smoke_test_conv2d() -> None:
-    import torch.nn as nn
-
-    print("Testing smoke_test_conv2d")
-    # With square kernels and equal stride
-    m = nn.Conv2d(16, 33, 3, stride=2)
-    # non-square kernels and unequal stride and with padding
-    m = nn.Conv2d(16, 33, (3, 5), stride=(2, 1), padding=(4, 2))
-    assert m is not None
-    # non-square kernels and unequal stride and with padding and dilation
-    basic_conv = nn.Conv2d(
-        16, 33, (3, 5), stride=(2, 1), padding=(4, 2), dilation=(3, 1)
-    )
-    input = torch.randn(20, 16, 50, 100)
-    output = basic_conv(input)
-
-    if is_cuda_system:
-        print("Testing smoke_test_conv2d with cuda")
-        conv = nn.Conv2d(3, 3, 3).cuda()
-        x = torch.randn(1, 3, 24, 24, device="cuda")
-        with torch.cuda.amp.autocast():
-            out = conv(x)
-        assert out is not None
-
-        supported_dtypes = [torch.float16, torch.float32, torch.float64]
-        for dtype in supported_dtypes:
-            print(f"Testing smoke_test_conv2d with cuda for {dtype}")
-            conv = basic_conv.to(dtype).cuda()
-            input = torch.randn(20, 16, 50, 100, device="cuda").type(dtype)
-            output = conv(input)
-            assert output is not None
-
-
-def test_linalg(device="cpu") -> None:
-    print(f"Testing smoke_test_linalg on {device}")
-    A = torch.randn(5, 3, device=device)
-    U, S, Vh = torch.linalg.svd(A, full_matrices=False)
-    assert (
-        U.shape == A.shape
-        and S.shape == torch.Size([3])
-        and Vh.shape == torch.Size([3, 3])
-    )
-    torch.dist(A, U @ torch.diag(S) @ Vh)
-
-    U, S, Vh = torch.linalg.svd(A)
-    assert (
-        U.shape == torch.Size([5, 5])
-        and S.shape == torch.Size([3])
-        and Vh.shape == torch.Size([3, 3])
-    )
-    torch.dist(A, U[:, :3] @ torch.diag(S) @ Vh)
-
-    A = torch.randn(7, 5, 3, device=device)
-    U, S, Vh = torch.linalg.svd(A, full_matrices=False)
-    torch.dist(A, U @ torch.diag_embed(S) @ Vh)
-
-    if device == "cuda":
-        supported_dtypes = [torch.float32, torch.float64]
-        for dtype in supported_dtypes:
-            print(f"Testing smoke_test_linalg with cuda for {dtype}")
-            A = torch.randn(20, 16, 50, 100, device=device, dtype=dtype)
-            torch.linalg.svd(A)
-
-
-def smoke_test_compile(device: str = "cpu") -> None:
-    supported_dtypes = [torch.float16, torch.float32, torch.float64]
-
-    def foo(x: torch.Tensor) -> torch.Tensor:
-        return torch.sin(x) + torch.cos(x)
-
-    for dtype in supported_dtypes:
-        print(f"Testing smoke_test_compile for {device} and {dtype}")
-        x = torch.rand(3, 3, device=device).type(dtype)
-        x_eager = foo(x)
-        x_pt2 = torch.compile(foo)(x)
-        torch.testing.assert_close(x_eager, x_pt2)
-
-    # Check that SIMD were detected for the architecture
-    if device == "cpu":
-        from torch._inductor.codecache import pick_vec_isa
-
-        isa = pick_vec_isa()
-        if not isa:
-            raise RuntimeError("Can't detect vectorized ISA for CPU")
-        print(f"Picked CPU ISA {type(isa).__name__} bit width {isa.bit_width()}")
-
-    # Reset torch dynamo since we are changing mode
-    torch._dynamo.reset()
-    dtype = torch.float32
-    torch.set_float32_matmul_precision("high")
-    print(f"Testing smoke_test_compile with mode 'max-autotune' for {dtype}")
-    x = torch.rand(64, 1, 28, 28, device=device).type(torch.float32)
-    model = Net().to(device=device)
-    x_pt2 = torch.compile(model, mode="max-autotune")(x)
-
-
-def smoke_test_modules():
-    cwd = os.getcwd()
-    for module in MODULES:
-        if module["repo"]:
-            if not os.path.exists(f"{cwd}/{module['repo_name']}"):
-                print(f"Path does not exist: {cwd}/{module['repo_name']}")
-                try:
-                    subprocess.check_output(
-                        f"git clone --depth 1 {module['repo']}",
-                        stderr=subprocess.STDOUT,
-                        shell=True,
-                    )
-                except subprocess.CalledProcessError as exc:
-                    raise RuntimeError(
-                        f"Cloning {module['repo']} FAIL: {exc.returncode} Output: {exc.output}"
-                    ) from exc
-            try:
-                smoke_test_command = f"python3 {module['smoke_test']}"
-                if target_os == "windows":
-                    smoke_test_command = f"python {module['smoke_test']}"
-                output = subprocess.check_output(
-                    smoke_test_command,
-                    stderr=subprocess.STDOUT,
-                    shell=True,
-                    universal_newlines=True,
-                )
-            except subprocess.CalledProcessError as exc:
-                raise RuntimeError(
-                    f"Module {module['name']} FAIL: {exc.returncode} Output: {exc.output}"
-                ) from exc
-            else:
-                print(f"Output: \n{output}\n")
-
-
-def parse_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--package",
-        help="Package to include in smoke testing",
-        type=str,
-        choices=["all", "torchonly"],
-        default="all",
-    )
-    parser.add_argument(
-        "--runtime-error-check",
-        help="No Runtime Error check",
-        type=str,
-        choices=["enabled", "disabled"],
-        default="enabled",
-    )
-    parser.add_argument(
-        "--torch-compile-check",
-        help="Check torch compile",
-        type=str,
-        choices=["enabled", "disabled"],
-        default="enabled",
-    )
-    return parser.parse_args()
-
-
-def main() -> None:
-    options = parse_args()
-    print(f"torch: {torch.__version__}")
-    print(torch.__config__.parallel_info())
-    # All PyTorch binary builds should be built with OpenMP
-    if not torch.backends.openmp.is_available():
-        raise RuntimeError("PyTorch must be built with OpenMP support")
-
-    check_version(options.package)
-    smoke_test_conv2d()
-    test_linalg()
-    test_numpy()
-    if is_cuda_system:
-        test_linalg("cuda")
-
-    if options.package == "all":
-        smoke_test_modules()
-
-    smoke_test_cuda(
-        options.package, options.runtime_error_check, options.torch_compile_check
-    )
-
-
-if __name__ == "__main__":
-    main()
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -4,7 +4,7 @@
 # (This is set by default in the Docker images we build, so you don't
 # need to set it yourself.

-set -ex -o pipefail
+set -ex

 # Suppress ANSI color escape sequences
 export TERM=vt100
@ -12,9 +12,9 @@ export TERM=vt100
 # shellcheck source=./common.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common.sh"

-# Do not change workspace permissions for ROCm and s390x CI jobs
+# Do not change workspace permissions for ROCm CI jobs
 # as it can leave workspace with bad permissions for cancelled jobs
-if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
+if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
  # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
  WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
  cleanup_workspace() {
@ -48,17 +48,17 @@ NUM_TEST_SHARDS="${NUM_TEST_SHARDS:=1}"

 export VALGRIND=ON
 # export TORCH_INDUCTOR_INSTALL_GXX=ON
-if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then
-  # clang9 appears to miscompile code involving std::optional<c10::SymInt>,
+if [[ "$BUILD_ENVIRONMENT" == *clang9* ]]; then
+  # clang9 appears to miscompile code involving c10::optional<c10::SymInt>,
  # such that valgrind complains along these lines:
  #
  # Conditional jump or move depends on uninitialised value(s)
  #    at 0x40303A: ~optional_base (Optional.h:281)
  #    by 0x40303A: call (Dispatcher.h:448)
-  #    by 0x40303A: call(at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::optional<c10::SymInt>) (basic.cpp:10)
+  #    by 0x40303A: call(at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, c10::optional<c10::SymInt>) (basic.cpp:10)
  #    by 0x403700: main (basic.cpp:16)
  #  Uninitialised value was created by a stack allocation
-  #    at 0x402AAA: call(at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, std::optional<c10::SymInt>) (basic.cpp:6)
+  #    at 0x402AAA: call(at::Tensor const&, c10::ArrayRef<c10::SymInt>, c10::ArrayRef<c10::SymInt>, c10::optional<c10::SymInt>) (basic.cpp:6)
  #
  # The problem does not appear with gcc or newer versions of clang (we tested
  # clang14).  So we suppress valgrind testing for clang9 specifically.
@ -72,7 +72,7 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  #
  # using namespace at;
  #
-  # Tensor call(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, std::optional<c10::SymInt> storage_offset) {
+  # Tensor call(const at::Tensor & self, c10::SymIntArrayRef size, c10::SymIntArrayRef stride, c10::optional<c10::SymInt> storage_offset) {
  #   auto op = c10::Dispatcher::singleton()
  #       .findSchemaOrThrow(at::_ops::as_strided::name, at::_ops::as_strided::overload_name)
  #       .typed<at::_ops::as_strided::schema>();
@ -81,18 +81,11 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  #
  # int main(int argv) {
  #   Tensor b = empty({3, 4});
-  #   auto z = call(b, b.sym_sizes(), b.sym_strides(), std::nullopt);
+  #   auto z = call(b, b.sym_sizes(), b.sym_strides(), c10::nullopt);
  # }
  export VALGRIND=OFF
 fi

-
-if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then
-  # There are additional warnings on s390x, maybe due to newer gcc.
-  # Skip this check for now
-  export VALGRIND=OFF
-fi
-
 if [[ "${PYTORCH_TEST_RERUN_DISABLED_TESTS}" == "1" ]] || [[ "${CONTINUE_THROUGH_ERROR}" == "1" ]]; then
  # When rerunning disable tests, do not generate core dumps as it could consume
  # the runner disk space when crashed tests are run multiple times. Running out
@ -136,7 +129,7 @@ if [[ "$TEST_CONFIG" == 'default' ]]; then
 fi

 if [[ "$TEST_CONFIG" == 'distributed' ]] && [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
-  export HIP_VISIBLE_DEVICES=0,1,2,3
+  export HIP_VISIBLE_DEVICES=0,1
 fi

 if [[ "$TEST_CONFIG" == 'slow' ]]; then
@ -160,8 +153,6 @@ elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu"
  # setting PYTHON_TEST_EXTRA_OPTION
  export PYTHON_TEST_EXTRA_OPTION="--xpu"
-  # Disable sccache for xpu test due to flaky issue https://github.com/pytorch/pytorch/issues/143585
-  sudo rm -rf /opt/cache
 fi

 if [[ "$TEST_CONFIG" == *crossref* ]]; then
@ -178,13 +169,9 @@ fi

 if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  # Source Intel oneAPI envrioment script to enable xpu runtime related libraries
-  # refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
+  # refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-5.html
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/compiler/latest/env/vars.sh
-  if [ -f /opt/intel/oneapi/umf/latest/env/vars.sh ]; then
-    # shellcheck disable=SC1091
-    source /opt/intel/oneapi/umf/latest/env/vars.sh
-  fi
  # Check XPU status before testing
  xpu-smi discovery
 fi
@ -209,9 +196,6 @@ install_tlparse
 # ASAN test is not working
 if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
    export ASAN_OPTIONS=detect_leaks=0:symbolize=1:detect_stack_use_after_return=true:strict_init_order=true:detect_odr_violation=1:detect_container_overflow=0:check_initialization_order=true:debug=true
-    if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
-        export ASAN_OPTIONS="${ASAN_OPTIONS}:protect_shadow_gap=0"
-    fi
    export UBSAN_OPTIONS=print_stacktrace=1:suppressions=$PWD/ubsan.supp
    export PYTORCH_TEST_WITH_ASAN=1
    export PYTORCH_TEST_WITH_UBSAN=1
@ -249,8 +233,8 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
    # it depends on a ton of dynamic libraries that most programs aren't gonna
    # have, and it applies to child processes.

-    LD_PRELOAD=$(clang --print-file-name=libclang_rt.asan-x86_64.so)
-    export LD_PRELOAD
+    # TODO: get rid of the hardcoded path
+    export LD_PRELOAD=/usr/lib/llvm-15/lib/clang/15.0.7/lib/linux/libclang_rt.asan-x86_64.so
    # Disable valgrind for asan
    export VALGRIND=OFF

@ -297,7 +281,7 @@ test_python_shard() {

  # modify LD_LIBRARY_PATH to ensure it has the conda env.
  # This set of tests has been shown to be buggy without it for the split-build
-  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION

  assert_git_not_dirty
 }
@ -309,7 +293,7 @@ test_python() {
 }


-test_dynamo_wrapped_shard() {
+test_dynamo_shard() {
  if [[ -z "$NUM_TEST_SHARDS" ]]; then
    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
    exit 1
@ -322,10 +306,8 @@ test_dynamo_wrapped_shard() {
    --exclude-jit-executor \
    --exclude-distributed-tests \
    --exclude-torch-export-tests \
-    --exclude-aot-dispatch-tests \
    --shard "$1" "$NUM_TEST_SHARDS" \
-    --verbose \
-    --upload-artifacts-while-running
+    --verbose
  assert_git_not_dirty
 }

@ -336,9 +318,8 @@ test_inductor_distributed() {
  python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_cuda_device --verbose
  python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose
  python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
-  python test/run_test.py -i distributed/tensor/test_dtensor_compile.py --verbose
+  python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
  python test/run_test.py -i distributed/tensor/parallel/test_micro_pipeline_tp.py --verbose
-  python test/run_test.py -i distributed/_composable/test_replicate_with_compiler.py --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
@ -350,12 +331,11 @@ test_inductor_distributed() {
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
-  python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_compile.py --verbose
  python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose

  # this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
  # with if required # gpus aren't available
-  python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives distributed/test_compute_comm_reordering --verbose
+  python test/run_test.py --include distributed/test_dynamo_distributed distributed/test_inductor_collectives --verbose
  assert_git_not_dirty
 }

@ -389,53 +369,21 @@ test_inductor_aoti() {
  CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
 }

-test_inductor_cpp_wrapper_shard() {
-  if [[ -z "$NUM_TEST_SHARDS" ]]; then
-    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
-    exit 1
-  fi
-
-  export TORCHINDUCTOR_CPP_WRAPPER=1
+test_inductor_cpp_wrapper_abi_compatible() {
+  export TORCHINDUCTOR_ABI_COMPATIBLE=1
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

-  if [[ "$1" -eq "2" ]]; then
-    # For now, manually put the opinfo tests in shard 2, and all other tests in
-    # shard 1.  Test specific things triggering past bugs, for now.
-    python test/run_test.py \
-      --include inductor/test_torchinductor_opinfo \
-      -k 'linalg or to_sparse' \
-      --verbose
-    exit
-  fi
+  echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
+  PYTORCH_TESTING_DEVICE_ONLY_FOR="" python test/run_test.py --include inductor/test_cpu_cpp_wrapper
+  python test/run_test.py --include inductor/test_cuda_cpp_wrapper inductor/test_cpu_repro inductor/test_extension_backend

-  # Run certain inductor unit tests with cpp wrapper. In the end state, we
-  # should be able to run all the inductor unit tests with cpp_wrapper.
-  python test/run_test.py --include inductor/test_torchinductor --verbose
-
-  # Run inductor benchmark tests with cpp wrapper.
-  # Skip benchmark tests if it's in rerun-disabled-mode.
-  if [[ "${PYTORCH_TEST_RERUN_DISABLED_TESTS}" == "1" ]]; then
-    echo "skip dynamo benchmark tests for rerun-disabled-test"
-  else
-    echo "run dynamo benchmark tests with cpp wrapper"
-    python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
+  TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
    --training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
    --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
-    python benchmarks/dynamo/check_accuracy.py \
-      --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_timm_training.csv"
-
-    python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-      --bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-    python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-      --bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-    python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-      --bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-    python benchmarks/dynamo/check_accuracy.py \
-      --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
-  fi
+  python benchmarks/dynamo/check_accuracy.py \
+    --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
+    --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_timm_training.csv"
 }

 # "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@ -455,7 +403,7 @@ pr_time_benchmarks() {
  PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks source benchmarks/dynamo/pr_time_benchmarks/benchmark_runner.sh "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" "benchmarks/dynamo/pr_time_benchmarks/benchmarks"
  echo "benchmark results on current PR: "
  cat  "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv"
-  PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks python benchmarks/dynamo/pr_time_benchmarks/check_results.py "benchmarks/dynamo/pr_time_benchmarks/expected_results.csv" "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" "$TEST_REPORTS_DIR/new_expected_results.csv"
+  PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks python benchmarks/dynamo/pr_time_benchmarks/check_results.py "benchmarks/dynamo/pr_time_benchmarks/expected_results.csv" "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv"
 }

 if [[ "${TEST_CONFIG}" == *pr_time_benchmarks* ]]; then
@ -541,7 +489,7 @@ test_perf_for_dashboard() {
            --dynamic-batch-only "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_dynamic_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
-      if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]]; then
+      if [[ "$DASHBOARD_TAG" == *cppwrapper-true* ]] && [[ "$mode" == "inference" ]]; then
        TORCHINDUCTOR_CPP_WRAPPER=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_cpp_wrapper_${suite}_${dtype}_${mode}_${device}_${target}.csv"
@ -563,7 +511,7 @@ test_perf_for_dashboard() {
              "${target_flag[@]}" --"$mode" --"$dtype" --export --disable-cudagraphs "$@" \
              --output "$TEST_REPORTS_DIR/${backend}_export_${suite}_${dtype}_${mode}_${device}_${target}.csv"
        fi
-        $TASKSET python "benchmarks/dynamo/$suite.py" \
+        TORCHINDUCTOR_ABI_COMPATIBLE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \
            --output "$TEST_REPORTS_DIR/${backend}_aot_inductor_${suite}_${dtype}_${mode}_${device}_${target}.csv"
      fi
@ -618,6 +566,13 @@ test_single_dynamo_benchmark() {
    test_perf_for_dashboard "$suite" \
      "${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}"
  else
+    if [[ "${TEST_CONFIG}" == *aot_inductor* && "${TEST_CONFIG}" != *cpu_aot_inductor* ]]; then
+      # Test AOTInductor with the ABI-compatible mode on CI
+      # This can be removed once the ABI-compatible mode becomes default.
+      # For CPU device, we prefer non-ABI-compatible mode on CI when testing AOTInductor.
+      export TORCHINDUCTOR_ABI_COMPATIBLE=1
+    fi
+
    if [[ "${TEST_CONFIG}" == *_avx2* ]]; then
      TEST_CONFIG=${TEST_CONFIG//_avx2/}
    fi
@ -639,11 +594,6 @@ test_single_dynamo_benchmark() {
 }

 test_inductor_micro_benchmark() {
-  # torchao requires cuda 8.0 or above for bfloat16 support
-  if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
-    export TORCH_CUDA_ARCH_LIST="8.0;8.6"
-  fi
-  install_torchao
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  if [[ "${TEST_CONFIG}" == *cpu* ]]; then
    test_inductor_set_cpu_affinity
@ -698,6 +648,17 @@ test_inductor_torchbench_smoketest_perf() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

+  # Test some models in the cpp wrapper mode
+  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
+    --bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
+  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
+    --bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
+  TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
+    --bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
+  python benchmarks/dynamo/check_accuracy.py \
+    --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
+    --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
+
  python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
    --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
    --output "$TEST_REPORTS_DIR/inductor_training_smoketest.csv"
@ -787,9 +748,19 @@ test_inductor_torchbench_cpu_smoketest_perf(){
    fi
    cat "$output_name"
    # The threshold value needs to be actively maintained to make this check useful.
-    # Allow 1% variance for CPU perf to accommodate perf fluctuation
-    python benchmarks/dynamo/check_perf_csv.py -f "$output_name" -t "$speedup_target" -s 0.99
+    python benchmarks/dynamo/check_perf_csv.py -f "$output_name" -t "$speedup_target"
  done
+
+  # Add a few ABI-compatible accuracy tests for CPU. These can be removed once we turn on ABI-compatible as default.
+  TORCHINDUCTOR_ABI_COMPATIBLE=1 python benchmarks/dynamo/timm_models.py --device cpu --accuracy \
+    --bfloat16 --inference --export-aot-inductor --disable-cudagraphs --only adv_inception_v3 \
+    --output "$TEST_REPORTS_DIR/aot_inductor_smoke_test.csv"
+  TORCHINDUCTOR_ABI_COMPATIBLE=1 python benchmarks/dynamo/timm_models.py --device cpu --accuracy \
+    --bfloat16 --inference --export-aot-inductor --disable-cudagraphs --only beit_base_patch16_224 \
+    --output "$TEST_REPORTS_DIR/aot_inductor_smoke_test.csv"
+  python benchmarks/dynamo/check_accuracy.py \
+    --actual "$TEST_REPORTS_DIR/aot_inductor_smoke_test.csv" \
+    --expected "benchmarks/dynamo/ci_expected_accuracy/aot_inductor_timm_inference.csv"
 }

 test_torchbench_gcp_smoketest(){
@ -847,7 +818,7 @@ test_without_numpy() {
  # Regression test for https://github.com/pytorch/pytorch/issues/66353
  python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;print(torch.tensor([torch.tensor(0.), torch.tensor(1.)]))"
  # Regression test for https://github.com/pytorch/pytorch/issues/109387
-  if [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
+  if [[ "${TEST_CONFIG}" == *dynamo* ]]; then
    python -c "import sys;sys.path.insert(0, 'fake_numpy');import torch;torch.compile(lambda x:print(x))('Hello World')"
  fi
  popd
@ -917,20 +888,10 @@ test_libtorch_api() {
  else
    # Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
    OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
-
-    # On s390x, pytorch is built without llvm.
-    # Even if it would be built with llvm, llvm currently doesn't support used features on s390x and
-    # test fails with errors like:
-    # JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
-    # unknown file: Failure
-    # C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
-    if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
-      python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
-    fi
+    python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
  fi

-  # quantization is not fully supported on s390x yet
-  if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* && "${BUILD_ENVIRONMENT}" != *asan* && "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
+  if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* && "${BUILD_ENVIRONMENT}" != *asan* ]]; then
    # NB: This test is not under TORCH_BIN_DIR but under BUILD_BIN_DIR
    export CPP_TESTS_DIR="${BUILD_BIN_DIR}"
    python test/run_test.py --cpp --verbose -i cpp/static_runtime_test
@ -991,9 +952,6 @@ test_distributed() {
    python test/run_test.py --cpp --verbose -i cpp/HashStoreTest
    python test/run_test.py --cpp --verbose -i cpp/TCPStoreTest

-    echo "Testing multi-GPU linalg tests"
-    python test/run_test.py -i test_linalg.py -k test_matmul_offline_mgpu_tunable --verbose
-
    if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
      MPIEXEC=$(command -v mpiexec)
      if [[ -n "$MPIEXEC" ]]; then
@ -1243,7 +1201,7 @@ EOF
  git reset --hard "${SHA_TO_COMPARE}"
  git submodule sync && git submodule update --init --recursive
  echo "::group::Installing Torch From Base Commit"
-  pip3 install -r requirements.txt
+  pip install -r requirements.txt
  # shellcheck source=./common-build.sh
  source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
  python setup.py bdist_wheel --bdist-dir="base_bdist_tmp" --dist-dir="base_dist"
@ -1277,7 +1235,7 @@ EOF
 }

 test_bazel() {
-  set -e -o pipefail
+  set -e

  # bazel test needs sccache setup.
  # shellcheck source=./common-build.sh
@ -1400,11 +1358,10 @@ test_executorch() {
  export EXECUTORCH_BUILD_PYBIND=ON
  export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

-  # For llama3
-  bash examples/models/llama3_2_vision/install_requirements.sh
  # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
  # from the PR
-  bash .ci/scripts/setup-linux.sh cmake
+  # shellcheck disable=SC1091
+  source .ci/scripts/setup-linux.sh cmake

  echo "Run ExecuTorch unit tests"
  pytest -v -n auto
@ -1414,7 +1371,7 @@ test_executorch() {
  echo "Run ExecuTorch regression tests for some models"
  # TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
  # shellcheck disable=SC1091
-  source .ci/scripts/test_model.sh mv3 cmake xnnpack-quantization-delegation ''
+  source .ci/scripts/test.sh mv3 cmake xnnpack-quantization-delegation ''

  popd

@ -1427,8 +1384,7 @@ test_executorch() {

 test_linux_aarch64() {
  python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
-        test_transformers test_multiprocessing test_numpy_interop test_autograd test_binary_ufuncs test_complex test_spectral_ops \
-        test_foreach test_reductions test_unary_ufuncs \
+        test_transformers test_multiprocessing test_numpy_interop \
        --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose

  # Dynamo tests
@ -1446,7 +1402,6 @@ test_linux_aarch64() {
       inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \
       inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \
       inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes inductor/test_memory \
-       inductor/test_triton_cpu_backend inductor/test_triton_extension_backend inductor/test_mkldnn_pattern_matcher inductor/test_cpu_cpp_wrapper \
       --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
 }

@ -1454,11 +1409,7 @@ if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-baze
  (cd test && python -c "import torch; print(torch.__config__.show())")
  (cd test && python -c "import torch; print(torch.__config__.parallel_info())")
 fi
-if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then
-  # Install numpy-2.0.2 and compatible scipy & numba versions
-  python -mpip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0
-  python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py
-elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then
+if [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then
  test_linux_aarch64
 elif [[ "${TEST_CONFIG}" == *backward* ]]; then
  test_forward_backward_compatibility
@ -1502,6 +1453,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
  else
    install_torchaudio cuda
  fi
+  install_torchtext
  install_torchvision
  TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install git+https://github.com/pytorch/ao.git
  id=$((SHARD_NUMBER-1))
@ -1527,11 +1479,9 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
    fi
    PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
  fi
-elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
-  install_torchaudio cuda
+elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper_abi_compatible* ]]; then
  install_torchvision
-  checkout_install_torchbench hf_T5 llama moco
-  PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
+  test_inductor_cpp_wrapper_abi_compatible
 elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
  install_torchvision
  test_inductor_shard "${SHARD_NUMBER}"
@ -1540,9 +1490,9 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
      test_inductor_distributed
    fi
  fi
-elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
+elif [[ "${TEST_CONFIG}" == *dynamo* ]]; then
  install_torchvision
-  test_dynamo_wrapped_shard "${SHARD_NUMBER}"
+  test_dynamo_shard "${SHARD_NUMBER}"
  if [[ "${SHARD_NUMBER}" == 1 ]]; then
    test_aten
  fi
--- a/.ci/pytorch/test_example_code/CMakeLists.txt
+++ b/.ci/pytorch/test_example_code/CMakeLists.txt
@ -1,26 +0,0 @@
-cmake_minimum_required(VERSION 3.0 FATAL_ERROR)
-project(simple-torch-test)
-
-find_package(Torch REQUIRED)
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TORCH_CXX_FLAGS}")
-
-
-add_executable(simple-torch-test simple-torch-test.cpp)
-target_include_directories(simple-torch-test PRIVATE  ${TORCH_INCLUDE_DIRS})
-target_link_libraries(simple-torch-test "${TORCH_LIBRARIES}")
-set_property(TARGET simple-torch-test PROPERTY CXX_STANDARD 17)
-
-find_package(CUDAToolkit 11.8)
-
-target_link_libraries(simple-torch-test CUDA::cudart CUDA::cufft CUDA::cusparse CUDA::cublas CUDA::cusolver)
-find_library(CUDNN_LIBRARY NAMES cudnn)
-target_link_libraries(simple-torch-test  ${CUDNN_LIBRARY} )
-if(MSVC)
-  file(GLOB TORCH_DLLS  "$ENV{CUDA_PATH}/bin/cudnn64_8.dll" "$ENV{NVTOOLSEXT_PATH}/bin/x64/*.dll")
-  message("dlls to copy "  ${TORCH_DLLS})
-  add_custom_command(TARGET simple-torch-test
-                     POST_BUILD
-                     COMMAND ${CMAKE_COMMAND} -E copy_if_different
-                     ${TORCH_DLLS}
-                     $<TARGET_FILE_DIR:simple-torch-test>)
-endif(MSVC)
--- a/.ci/pytorch/test_example_code/check-torch-cuda.cpp
+++ b/.ci/pytorch/test_example_code/check-torch-cuda.cpp
@ -1,15 +0,0 @@
-#include <torch/torch.h>
-
-int main(int argc, const char* argv[]) {
-    std::cout << "Checking that CUDA archs are setup correctly" << std::endl;
-    TORCH_CHECK(torch::rand({ 3, 5 }, torch::Device(torch::kCUDA)).defined(), "CUDA archs are not setup correctly");
-
-    // These have to run after CUDA is initialized
-
-    std::cout << "Checking that magma is available" << std::endl;
-    TORCH_CHECK(torch::hasMAGMA(), "MAGMA is not available");
-
-    std::cout << "Checking that CuDNN is available" << std::endl;
-    TORCH_CHECK(torch::cuda::cudnn_is_available(), "CuDNN is not available");
-    return 0;
-}
--- a/Show More
+++ b/Show More
 @ -1 +1 @@
 .5.0
 .1.1
 @ -1 +1 @@
 .2.0
 .1.0
				`@ -1 +0,0 @@`
				`6cd83808c6e8bc7a44028e05112b3ab4e579bcc73202ed14733f66661127e213 magma-2.6.1.tar.gz`