only run hf_T5

Add ExportedProgram type annotation (#141247 )
Signed-off-by: Edward Z. Yang <ezyang@meta.com> Pull Request resolved: https://github.com/pytorch/pytorch/pull/141247 Approved by: https://github.com/Skylion007
2025-11-02 06:24:59 +08:00 · 2024-11-26 16:24:46 -08:00 · 2024-11-22 10:40:42 +00:00 · 2024-11-22 07:44:50 +00:00 · 2024-11-22 07:27:28 +00:00 · 2024-11-22 07:02:30 +00:00
1655 changed files with 59611 additions and 31988 deletions
--- a/.bazelversion
+++ b/.bazelversion
@ -1 +1 @@
-6.1.1
+6.5.0
--- a/.buckconfig.oss
+++ b/.buckconfig.oss
@ -1,23 +0,0 @@
-[pt]
-  is_oss=1
-
-[buildfile]
-  name = BUCK.oss
-  includes = //tools/build_defs/select.bzl
-
-[repositories]
-  bazel_skylib = third_party/bazel-skylib/
-  ovr_config = .
-
-[download]
-  in_build = true
-
-[cxx]
-  cxxflags = -std=c++17
-  ldflags = -Wl,--no-undefined
-  should_remap_host_platform = true
-  cpp = /usr/bin/clang
-  cc = /usr/bin/clang
-  cxx = /usr/bin/clang++
-  cxxpp = /usr/bin/clang++
-  ld = /usr/bin/clang++
--- a/.ci/aarch64_linux/README.md
+++ b/.ci/aarch64_linux/README.md
@ -0,0 +1,19 @@
+# Aarch64 (ARM/Graviton) Support Scripts
+Scripts for building aarch64 PyTorch PIP Wheels. These scripts build the following wheels:
+* torch
+* torchvision
+* torchaudio
+* torchtext
+* torchdata
+## Aarch64_ci_build.sh
+This script is designed to support CD operations within the PyPI manylinux aarch64 container, and to be executed inside that container. It prepares the container and then executes __aarch64_wheel_ci_build.py__ to build the wheels. The script assumes the PyTorch repo is located at ```/pytorch``` and will put the wheels into ```/artifacts```.
+### Usage
+```DESIRED_PYTHON=<PythonVersion> aarch64_ci_build.sh```
+
+__NOTE:__ CI build is currently __EXPERIMENTAL__
+
+## Build_aarch64_wheel.py
+This app allows a person to build using AWS EC2 resources and requires AWS-CLI and Boto3 with AWS credentials to support building EC2 instances for the wheel builds. Can be used in a codebuild CD or from a local system.
+
+### Usage
+```build_aarch64_wheel.py --key-name <YourPemKey> --use-docker --python 3.8 --branch <RCtag>```
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -0,0 +1,39 @@
+#!/bin/bash
+# Builds the aarch64 PyTorch wheel inside the CI container: prepares the
+# Python toolchain via aarch64_ci_setup.sh, picks up a release version when
+# HEAD of /pytorch is exactly on a release tag, then runs
+# aarch64_wheel_ci_build.py.
+# Reads DESIRED_CUDA ("cpu" selects the CPU-only wheel, anything else the
+# CUDA wheel). Because of `set -u` the script aborts if DESIRED_CUDA is unset.
+set -eux -o pipefail
+
+GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
+
+SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
+# Quoted so a script directory containing spaces does not break the source.
+source "$SCRIPTPATH/aarch64_ci_setup.sh"
+
+# Prints the tag matching v<num>*.<num>*.<num>* that describes HEAD of
+# /pytorch; returns 1 when `git describe --exact` finds no such tag.
+tagged_version() {
+  # Expanded unquoted on purpose so the string word-splits into a command line.
+  GIT_DESCRIBE="git --git-dir /pytorch/.git describe --tags --match v[0-9]*.[0-9]*.[0-9]*"
+  if ${GIT_DESCRIBE} --exact >/dev/null; then
+    ${GIT_DESCRIBE}
+  else
+    return 1
+  fi
+}
+
+if tagged_version >/dev/null; then
+  # Strip the leading "v" and everything from the first "-" onwards.
+  export OVERRIDE_PACKAGE_VERSION="$(tagged_version | sed -e 's/^v//' -e 's/-.*$//')"
+fi
+
+###############################################################################
+# Run aarch64 builder python
+###############################################################################
+cd /
+# adding safe directory for git as the permissions will be
+# on the mounted pytorch repo
+git config --global --add safe.directory /pytorch
+pip install -r /pytorch/requirements.txt
+pip install auditwheel
+if [ "$DESIRED_CUDA" = "cpu" ]; then
+    # The message names the variable that is actually tested (DESIRED_CUDA).
+    echo "DESIRED_CUDA is cpu. Building cpu wheel."
+    # USE_PRIORITIZED_TEXT_FOR_LD enables the linker script optimization: https://github.com/pytorch/pytorch/pull/121975/files
+    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
+else
+    echo "DESIRED_CUDA is set to: $DESIRED_CUDA. Building cuda wheel."
+    # USE_PRIORITIZED_TEXT_FOR_LD enables the linker script optimization: https://github.com/pytorch/pytorch/pull/121975/files
+    USE_PRIORITIZED_TEXT_FOR_LD=1 python /pytorch/.ci/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
+fi
--- a/.ci/aarch64_linux/aarch64_ci_setup.sh
+++ b/.ci/aarch64_linux/aarch64_ci_setup.sh
@ -0,0 +1,23 @@
+#!/bin/bash
+set -eux -o pipefail
+
+# This script is used to prepare the Docker container for the aarch64_wheel_ci_build.py python script
+# by installing pinned build tools with pip and symlinking them, together with
+# the desired Python, from ${DESIRED_PYTHON_BIN_DIR} into /usr/local/bin/
+
+# Default pins; overridden below for Python 3.13.
+NUMPY_VERSION=2.0.2
+PYGIT2_VERSION=1.15.1
+if [[ "$DESIRED_PYTHON"  == "3.13" ]]; then
+    NUMPY_VERSION=2.1.2
+    PYGIT2_VERSION=1.16.0
+fi
+
+SCRIPTPATH="$( cd "$(dirname "$0")" ; pwd -P )"
+# Presumably defines DESIRED_PYTHON_BIN_DIR (used below) and puts the desired
+# Python on PATH; that script is outside this file, so verify there.
+source $SCRIPTPATH/../manywheel/set_desired_python.sh
+
+pip install -q numpy==${NUMPY_VERSION} pyyaml==6.0.2 scons==4.7.0 ninja==1.11.1 patchelf==0.17.2 pygit2==${PYGIT2_VERSION}
+
+# Expose the selected interpreter and the freshly installed tools under
+# /usr/local/bin; -f replaces any symlink that is already there.
+for tool in python python3 pip pip3 ninja scons patchelf; do
+    ln -sf ${DESIRED_PYTHON_BIN_DIR}/${tool} /usr/local/bin;
+done
+
+python --version
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -0,0 +1,230 @@
+#!/usr/bin/env python3
+# encoding: UTF-8
+
+import os
+import shutil
+from subprocess import check_call, check_output
+from typing import List
+
+from pygit2 import Repository
+
+
+def list_dir(path: str) -> List[str]:
+    """
+    Return the entry names of ``path`` as printed by ``ls -1``.
+
+    The names come back in ``ls`` order, one per list element. The newline
+    that terminates the ``ls`` output is not turned into a bogus trailing
+    empty entry, so an empty directory yields an empty list.
+
+    Raises ``subprocess.CalledProcessError`` if ``ls`` fails (for example
+    when ``path`` does not exist).
+    """
+    listing = check_output(["ls", "-1", path]).decode()
+    return [name for name in listing.split("\n") if name]
+
+
+def build_ArmComputeLibrary() -> None:
+    """
+    Clone Arm Compute Library v24.09 into ./ComputeLibrary, build it with
+    scons, and copy its header/source directories under /acl.
+    """
+    print("Building Arm Compute Library")
+    install_root = "/acl"
+    checkout = "ComputeLibrary"
+    # Raises if /acl already exists (no exist_ok), as in a fresh container.
+    os.makedirs(install_root)
+
+    clone_cmd = [
+        "git",
+        "clone",
+        "https://github.com/ARM-software/ComputeLibrary.git",
+        "-b",
+        "v24.09",
+        "--depth",
+        "1",
+        "--shallow-submodules",
+    ]
+    check_call(clone_cmd)
+
+    scons_cmd = ["scons", "Werror=1", "-j8", f"build_dir=/{install_root}/build"]
+    scons_cmd.extend(
+        [
+            "debug=0",
+            "neon=1",
+            "opencl=0",
+            "os=linux",
+            "openmp=1",
+            "cppthreads=0",
+            "arch=armv8a",
+            "multi_isa=1",
+            "fixed_format_kernels=1",
+            "build=native",
+        ]
+    )
+    check_call(scons_cmd, cwd=checkout)
+
+    # Copy the directories of the checkout that are needed next to the build.
+    for subdir in ("arm_compute", "include", "utils", "support", "src"):
+        shutil.copytree(f"{checkout}/{subdir}", f"{install_root}/{subdir}")
+
+
+def update_wheel(wheel_path: str) -> None:
+    """
+    Update the cuda wheel libraries
+
+    Unzips the wheel at ``wheel_path`` into ``<dir>/tmp``, copies a fixed list
+    of shared libraries into its ``torch/lib`` directory, sets each copied
+    library's rpath to ``$ORIGIN`` with patchelf, then re-zips the tree and
+    replaces the original wheel file in place.
+
+    Reads the module-level ``enable_cuda`` flag, which is only assigned in the
+    ``__main__`` block of this file.
+    """
+    folder = os.path.dirname(wheel_path)
+    wheelname = os.path.basename(wheel_path)
+    # os.mkdir raises if a previous run left <folder>/tmp behind.
+    os.mkdir(f"{folder}/tmp")
+    # NOTE(review): os.system return codes are ignored throughout this function.
+    os.system(f"unzip {wheel_path} -d {folder}/tmp")
+    libs_to_copy = [
+        "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
+        "/usr/local/cuda/lib64/libcudnn.so.9",
+        "/usr/local/cuda/lib64/libcublas.so.12",
+        "/usr/local/cuda/lib64/libcublasLt.so.12",
+        "/usr/local/cuda/lib64/libcudart.so.12",
+        "/usr/local/cuda/lib64/libcufft.so.11",
+        "/usr/local/cuda/lib64/libcusparse.so.12",
+        "/usr/local/cuda/lib64/libcusparseLt.so.0",
+        "/usr/local/cuda/lib64/libcusolver.so.11",
+        "/usr/local/cuda/lib64/libcurand.so.10",
+        "/usr/local/cuda/lib64/libnvToolsExt.so.1",
+        "/usr/local/cuda/lib64/libnvJitLink.so.12",
+        "/usr/local/cuda/lib64/libnvrtc.so.12",
+        "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.4",
+        "/usr/local/cuda/lib64/libcudnn_adv.so.9",
+        "/usr/local/cuda/lib64/libcudnn_cnn.so.9",
+        "/usr/local/cuda/lib64/libcudnn_graph.so.9",
+        "/usr/local/cuda/lib64/libcudnn_ops.so.9",
+        "/usr/local/cuda/lib64/libcudnn_engines_runtime_compiled.so.9",
+        "/usr/local/cuda/lib64/libcudnn_engines_precompiled.so.9",
+        "/usr/local/cuda/lib64/libcudnn_heuristic.so.9",
+        "/lib64/libgomp.so.1",
+        "/usr/lib64/libgfortran.so.5",
+        "/acl/build/libarm_compute.so",
+        "/acl/build/libarm_compute_graph.so",
+    ]
+    # BLAS libraries differ by build flavour: NVPL for CUDA, OpenBLAS otherwise.
+    # NOTE(review): the only visible caller invokes this function when
+    # enable_cuda is true, so the else branch looks unused from here.
+    if enable_cuda:
+        libs_to_copy += [
+            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
+            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
+            "/usr/local/lib/libnvpl_lapack_core.so.0",
+            "/usr/local/lib/libnvpl_blas_core.so.0",
+        ]
+    else:
+        libs_to_copy += [
+            "/opt/OpenBLAS/lib/libopenblas.so.0",
+        ]
+    # Copy libraries into the unzipped wheel's torch/lib directory
+    for lib_path in libs_to_copy:
+        lib_name = os.path.basename(lib_path)
+        shutil.copy2(lib_path, f"{folder}/tmp/torch/lib/{lib_name}")
+        # Point the copied library's rpath at its own directory.
+        os.system(
+            f"cd {folder}/tmp/torch/lib/; "
+            f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}"
+        )
+    # Re-zip into a scratch directory, then move the result over the original.
+    os.mkdir(f"{folder}/cuda_wheel")
+    os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *")
+    shutil.move(
+        f"{folder}/cuda_wheel/{wheelname}",
+        f"{folder}/{wheelname}",
+        copy_function=shutil.copy2,
+    )
+    os.system(f"rm -rf {folder}/tmp/ {folder}/cuda_wheel/")
+
+
+def complete_wheel(folder: str) -> str:
+    """
+    Finish the wheel found in ``/{folder}/dist`` and copy it to /artifacts.
+
+    For a non-CUDA build whose ``folder`` contains "pytorch", the wheel is
+    first repaired with auditwheel and the repaired file is moved from
+    ``wheelhouse`` back into ``dist``. Returns the name of the wheel that was
+    copied to /artifacts.
+    """
+    dist_dir = f"/{folder}/dist"
+    built_wheel = list_dir(dist_dir)[0]
+
+    needs_repair = "pytorch" in folder and not enable_cuda
+    if not needs_repair:
+        final_wheel = built_wheel
+    else:
+        print("Repairing Wheel with AuditWheel")
+        check_call(["auditwheel", "repair", f"dist/{built_wheel}"], cwd=folder)
+        final_wheel = list_dir(f"/{folder}/wheelhouse")[0]
+
+        print(f"Moving {final_wheel} wheel to /{folder}/dist")
+        repaired_src = f"/{folder}/wheelhouse/{final_wheel}"
+        os.rename(repaired_src, f"{dist_dir}/{final_wheel}")
+
+    print(f"Copying {final_wheel} to artifacts")
+    shutil.copy2(f"{dist_dir}/{final_wheel}", f"/artifacts/{final_wheel}")
+
+    return final_wheel
+
+
+def parse_arguments():
+    """
+    Parse the command-line options of the aarch64 wheel build script.
+
+    Returns the ``argparse.Namespace`` with the boolean flags ``debug``,
+    ``build_only``, ``enable_mkldnn`` and ``enable_cuda`` plus the optional
+    string ``test_only``.
+    """
+    from argparse import ArgumentParser
+
+    cli = ArgumentParser("AARCH64 wheels python CD")
+    # Options are registered in the same order as before so --help is unchanged.
+    for flag in ("--debug", "--build-only"):
+        cli.add_argument(flag, action="store_true")
+    cli.add_argument("--test-only", type=str)
+    for flag in ("--enable-mkldnn", "--enable-cuda"):
+        cli.add_argument(flag, action="store_true")
+    return cli.parse_args()
+
+
+if __name__ == "__main__":
+    """
+    Entry Point
+    """
+    # Build flow: parse flags, work out the version string, optionally build
+    # Arm Compute Library, run `setup.py bdist_wheel` in /pytorch, then
+    # post-process the wheel and copy it to /artifacts.
+    args = parse_arguments()
+    # These two become module-level globals that update_wheel() and
+    # complete_wheel() read.
+    enable_mkldnn = args.enable_mkldnn
+    enable_cuda = args.enable_cuda
+    repo = Repository("/pytorch")
+    branch = repo.head.name
+    # presumably "HEAD" is what pygit2 reports for a detached checkout — TODO confirm
+    if branch == "HEAD":
+        branch = "master"
+
+    print("Building PyTorch wheel")
+    # build_vars is a space-separated list of VAR=value assignments that is
+    # prefixed to the setup.py command line further down.
+    build_vars = "MAX_JOBS=5 CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
+    os.system("cd /pytorch; python setup.py clean")
+
+    # An explicit OVERRIDE_PACKAGE_VERSION wins over any branch-derived version.
+    override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
+    if override_package_version is not None:
+        version = override_package_version
+        build_vars += (
+            f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
+        )
+    elif branch in ["nightly", "master"]:
+        # Commit date of HEAD as YYYYMMDD (%cs is the short committer date).
+        build_date = (
+            check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
+            .decode()
+            .replace("-", "")
+        )
+        # Drops the last two characters of version.txt.
+        # NOTE(review): presumably a two-character pre-release suffix — verify.
+        version = (
+            check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
+        )
+        if enable_cuda:
+            # NOTE(review): if DESIRED_CUDA is unset this formats as "+None".
+            desired_cuda = os.getenv("DESIRED_CUDA")
+            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date}+{desired_cuda} PYTORCH_BUILD_NUMBER=1 "
+        else:
+            build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
+    elif branch.startswith(("v1.", "v2.")):
+        # Version is the branch name without the leading "v", up to the first "-".
+        # NOTE(review): when the name has no "-", find() returns -1 and the
+        # slice drops the last character instead.
+        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
+
+    if enable_mkldnn:
+        build_ArmComputeLibrary()
+        print("build pytorch with mkldnn+acl backend")
+        build_vars += (
+            "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
+            "ACL_ROOT_DIR=/acl "
+            "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH "
+            "ACL_INCLUDE_DIR=/acl/build "
+            "ACL_LIBRARY=/acl/build "
+        )
+        # The BLAS selection is only appended on the mkldnn path.
+        if enable_cuda:
+            build_vars += "BLAS=NVPL "
+        else:
+            build_vars += "BLAS=OpenBLAS OpenBLAS_HOME=/OpenBLAS "
+    else:
+        print("build pytorch without mkldnn backend")
+
+    # NOTE(review): the exit status of the build is not checked here.
+    os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel")
+    if enable_cuda:
+        print("Updating Cuda Dependency")
+        # Takes the first entry of /pytorch/dist as the wheel to update.
+        filename = os.listdir("/pytorch/dist/")
+        wheel_path = f"/pytorch/dist/{filename[0]}"
+        update_wheel(wheel_path)
+    pytorch_wheel_name = complete_wheel("/pytorch/")
+    print(f"Build Complete. Created {pytorch_wheel_name}..")
--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
--- a/.ci/aarch64_linux/embed_library.py
+++ b/.ci/aarch64_linux/embed_library.py
@ -0,0 +1,87 @@
+#!/usr/bin/env python3
+
+import os
+import shutil
+import sys
+from subprocess import check_call
+from tempfile import TemporaryDirectory
+
+from auditwheel.elfutils import elf_file_filter
+from auditwheel.lddtree import lddtree
+from auditwheel.patcher import Patchelf
+from auditwheel.repair import copylib
+from auditwheel.wheeltools import InWheelCtx
+
+
+def replace_tag(filename):
+    """
+    Rewrite the ``Tag:`` lines of a wheel metadata file in place.
+
+    Every line starting with ``"Tag: "`` has ``-linux_`` replaced by
+    ``-manylinux2014_``; all other lines are written back unchanged.
+
+    The file is split and re-joined on a real newline. Splitting on the
+    two-character sequence backslash + "n" treated the whole file as one
+    line, so no ``Tag:`` line was ever matched or updated.
+    """
+    with open(filename) as f:
+        lines = f.read().split("\n")
+    for i, line in enumerate(lines):
+        if not line.startswith("Tag: "):
+            continue
+        lines[i] = line.replace("-linux_", "-manylinux2014_")
+        print(f"Updated tag from {line} to {lines[i]}")
+
+    with open(filename, "w") as f:
+        f.write("\n".join(lines))
+
+
+class AlignedPatchelf(Patchelf):
+    """Patchelf wrapper whose commands always pass ``--page-size 65536``."""
+
+    def set_soname(self, file_name: str, new_soname: str) -> None:
+        """Run patchelf to set the SONAME of ``file_name`` to ``new_soname``."""
+        cmd = ["patchelf", "--page-size", "65536"]
+        cmd += ["--set-soname", new_soname, file_name]
+        check_call(cmd)
+
+    def replace_needed(self, file_name: str, soname: str, new_soname: str) -> None:
+        """Run patchelf to replace the needed entry ``soname`` with ``new_soname``."""
+        cmd = ["patchelf", "--page-size", "65536"]
+        cmd += ["--replace-needed", soname, new_soname, file_name]
+        check_call(cmd)
+
+
+def embed_library(whl_path, lib_soname, update_tag=False):
+    """
+    Embed the shared library ``lib_soname`` into the wheel at ``whl_path``.
+
+    For each ELF file under ``torch/lib`` in the wheel that needs
+    ``lib_soname`` from outside the wheel, the library is copied into
+    ``torch/lib`` (once) and the file's needed entry is pointed at the copy.
+    With ``update_tag`` set, every file named ``WHEEL`` is passed to
+    ``replace_tag``. The rebuilt wheel then replaces ``whl_path``.
+    """
+    patcher = AlignedPatchelf()
+    # Scratch directory for the rewritten wheel; never cleaned up explicitly.
+    out_dir = TemporaryDirectory()
+    whl_name = os.path.basename(whl_path)
+    tmp_whl_name = os.path.join(out_dir.name, whl_name)
+    with InWheelCtx(whl_path) as ctx:
+        # NOTE(review): reaches into the private ctx._tmpdir attribute of auditwheel.
+        torchlib_path = os.path.join(ctx._tmpdir.name, "torch", "lib")
+        # presumably makes the context write the repacked wheel here on exit — TODO confirm
+        ctx.out_wheel = tmp_whl_name
+        new_lib_path, new_lib_soname = None, None
+        for filename, _ in elf_file_filter(ctx.iter_files()):
+            if not filename.startswith("torch/lib"):
+                continue
+            libtree = lddtree(filename)
+            if lib_soname not in libtree["needed"]:
+                continue
+            lib_path = libtree["libs"][lib_soname]["path"]
+            if lib_path is None:
+                print(f"Can't embed {lib_soname} as it could not be found")
+                # Gives up on all remaining files, not only this one.
+                break
+            # Already resolved to a copy inside the wheel's torch/lib.
+            if lib_path.startswith(torchlib_path):
+                continue
+
+            # Copy the library once; later files reuse the same new soname.
+            if new_lib_path is None:
+                new_lib_soname, new_lib_path = copylib(lib_path, torchlib_path, patcher)
+            patcher.replace_needed(filename, lib_soname, new_lib_soname)
+            print(f"Replacing {lib_soname} with {new_lib_soname} for {filename}")
+        if update_tag:
+            # Add manylinux2014 tag
+            for filename in ctx.iter_files():
+                if os.path.basename(filename) != "WHEEL":
+                    continue
+                replace_tag(filename)
+    # Overwrite the input wheel with the rewritten one.
+    shutil.move(tmp_whl_name, whl_path)
+
+
+if __name__ == "__main__":
+    # Usage: embed_library.py <wheel> [--update-tag]
+    wants_tag_update = len(sys.argv) > 2 and sys.argv[2] == "--update-tag"
+    embed_library(sys.argv[1], "libgomp.so.1", wants_tag_update)
--- a/.ci/docker/almalinux/Dockerfile
+++ b/.ci/docker/almalinux/Dockerfile
@ -1,47 +1,39 @@
-ARG CUDA_VERSION=10.2
+ARG CUDA_VERSION=12.4
 ARG BASE_TARGET=cuda${CUDA_VERSION}
-FROM centos:7 as base
+FROM amd64/almalinux:8 as base

 ENV LC_ALL en_US.UTF-8
 ENV LANG en_US.UTF-8
 ENV LANGUAGE en_US.UTF-8

-ARG DEVTOOLSET_VERSION=9
-RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
-RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
-RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
-RUN yum update -y
-RUN yum install -y wget curl perl util-linux xz bzip2 git patch which unzip
+ARG DEVTOOLSET_VERSION=11
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+
+RUN yum -y update
+RUN yum -y install epel-release
+RUN yum install -y sudo wget curl perl util-linux xz bzip2 git patch which perl zlib-devel openssl-devel yum-utils autoconf automake make gcc-toolset-${DEVTOOLSET_VERSION}-toolchain
 # Just add everything as a safe.directory for git since these will be used in multiple places with git
 RUN git config --global --add safe.directory '*'
-RUN yum install -y yum-utils centos-release-scl
-RUN yum-config-manager --enable rhel-server-rhscl-7-rpms
-RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
-RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
-RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
-RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils
-# EPEL for cmake
-RUN yum --enablerepo=extras install -y epel-release
+ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH

-# cmake
-RUN yum install -y cmake3 && \
-    ln -s /usr/bin/cmake3 /usr/bin/cmake
-ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
-ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
-
-RUN yum install -y autoconf aclocal automake make sudo
+# cmake-3.18.4 from pip
+RUN yum install -y python3-pip && \
+    python3 -mpip install cmake==3.18.4 && \
+    ln -s /usr/local/bin/cmake /usr/bin/cmake3
 RUN rm -rf /usr/local/cuda-*

+FROM base as openssl
+ADD ./common/install_openssl.sh install_openssl.sh
+RUN bash ./install_openssl.sh && rm install_openssl.sh
+
 FROM base as patchelf
 # Install patchelf
 ADD ./common/install_patchelf.sh install_patchelf.sh
 RUN bash ./install_patchelf.sh && rm install_patchelf.sh && cp $(which patchelf) /patchelf

-FROM base as openssl
-# Install openssl
-ADD ./common/install_openssl.sh install_openssl.sh
-RUN bash ./install_openssl.sh && rm install_openssl.sh
-
 FROM base as conda
 # Install Anaconda
 ADD ./common/install_conda_docker.sh install_conda.sh
@ -49,7 +41,7 @@ RUN bash ./install_conda.sh && rm install_conda.sh

 # Install CUDA
 FROM base as cuda
-ARG CUDA_VERSION=10.2
+ARG CUDA_VERSION=12.4
 RUN rm -rf /usr/local/cuda-*
 ADD ./common/install_cuda.sh install_cuda.sh
 ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
@ -96,7 +88,8 @@ COPY ./common/install_jni.sh install_jni.sh
 COPY ./java/jni.h jni.h
 RUN bash ./install_jni.sh && rm install_jni.sh

-ENV  PATH /opt/conda/bin:$PATH
+ENV PATH /opt/conda/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
 COPY --from=mnist  /usr/local/mnist /usr/local/mnist
 RUN rm -rf /usr/local/cuda
 RUN chmod o+rw /usr/local
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -48,10 +48,10 @@ esac
    --progress plain \
    --build-arg "BASE_TARGET=${BASE_TARGET}" \
    --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
-    --build-arg "DEVTOOLSET_VERSION=9" \
+    --build-arg "DEVTOOLSET_VERSION=11" \
    -t ${DOCKER_IMAGE_NAME} \
    $@ \
-    -f "${TOPDIR}/.ci/docker/conda/Dockerfile" \
+    -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
    ${TOPDIR}/.ci/docker/
 )

--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -308,6 +308,17 @@ case "$image" in
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
+  pytorch-linux-jammy-xpu-2025.0-py3)
+    ANACONDA_PYTHON_VERSION=3.9
+    GCC_VERSION=11
+    PROTOBUF=yes
+    DB=yes
+    VISION=yes
+    XPU_VERSION=2025.0
+    NINJA_VERSION=1.9.0
+    CONDA_CMAKE=yes
+    TRITON=yes
+    ;;
    pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks)
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-16b633b4daa7f3d3442be62a3589bd60b2f7fdc7
+6f638937d64e3396793956d75ee3e14802022745
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -3,7 +3,7 @@
 set -ex

 NCCL_VERSION=v2.21.5-1
-CUDNN_VERSION=9.1.0.70
+CUDNN_VERSION=9.5.1.17

 function install_cusparselt_040 {
    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
@ -39,6 +39,7 @@ function install_cusparselt_062 {
 }

 function install_118 {
+    CUDNN_VERSION=9.1.0.70
    echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
    rm -rf /usr/local/cuda-11.8 /usr/local/cuda
    # install CUDA 11.8.0 in the same container
@ -105,6 +106,7 @@ function install_121 {
 }

 function install_124 {
+  CUDNN_VERSION=9.1.0.70
  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
  # install CUDA 12.4.1 in the same container
--- a/.ci/docker/common/install_cuda_aarch64.sh
+++ b/.ci/docker/common/install_cuda_aarch64.sh
@ -4,6 +4,7 @@
 set -ex

 NCCL_VERSION=v2.21.5-1
+CUDNN_VERSION=9.5.1.17

 function install_cusparselt_062 {
    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
@ -17,7 +18,8 @@ function install_cusparselt_062 {
 }

 function install_124 {
-  echo "Installing CUDA 12.4.1 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
+  CUDNN_VERSION=9.1.0.70
+  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
  # install CUDA 12.4.1 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux_sbsa.run
@ -28,10 +30,10 @@ function install_124 {

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  mkdir tmp_cudnn && cd tmp_cudnn
-  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz -O cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
-  tar xf cudnn-linux-sbsa-9.1.0.70_cuda12-archive.tar.xz
-  cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/include/* /usr/local/cuda/include/
-  cp -a cudnn-linux-sbsa-9.1.0.70_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
  cd ..
  rm -rf tmp_cudnn

@ -74,18 +76,87 @@ function prune_124 {
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a

  #####################################################################################
-  # CUDA 12.1 prune visual tools
+  # CUDA 12.4 prune visual tools
  #####################################################################################
  export CUDA_BASE="/usr/local/cuda-12.4/"
  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
 }

+# Installs the CUDA 12.6.2 toolkit (sbsa), cuDNN, NCCL and cuSparseLt under
+# /usr/local/cuda. Uses the file-level CUDNN_VERSION and NCCL_VERSION; this
+# function does not override CUDNN_VERSION locally.
+function install_126 {
+  echo "Installing CUDA 12.6.2 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
+  rm -rf /usr/local/cuda-12.6 /usr/local/cuda
+  # install CUDA 12.6.2 in the same container
+  wget -q https://developer.download.nvidia.com/compute/cuda/12.6.2/local_installers/cuda_12.6.2_560.35.03_linux_sbsa.run
+  chmod +x cuda_12.6.2_560.35.03_linux_sbsa.run
+  ./cuda_12.6.2_560.35.03_linux_sbsa.run --toolkit --silent
+  rm -f cuda_12.6.2_560.35.03_linux_sbsa.run
+  # Make /usr/local/cuda point at the 12.6 installation.
+  rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.6 /usr/local/cuda
+
+  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+  # Download the cuDNN archive and copy its headers and libraries into the CUDA tree.
+  mkdir tmp_cudnn && cd tmp_cudnn
+  wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz -O cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  tar xf cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive.tar.xz
+  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/include/* /usr/local/cuda/include/
+  cp -a cudnn-linux-sbsa-${CUDNN_VERSION}_cuda12-archive/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf tmp_cudnn
+
+  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+  # NCCL is built from source at the pinned tag and copied into the CUDA tree.
+  git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl && make -j src.build
+  cp -a build/include/* /usr/local/cuda/include/
+  cp -a build/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf nccl
+
+  install_cusparselt_062
+
+  ldconfig
+}
+
+# Shrinks the CUDA 12.6 installation: runs nvprune over the static libraries
+# to keep only the listed GPU architectures and removes the visual tools.
+# OVERRIDE_GENCODE / OVERRIDE_GENCODE_CUDNN, when non-empty, replace the
+# default architecture lists.
+function prune_126 {
+  echo "Pruning CUDA 12.6"
+  #####################################################################################
+  # CUDA 12.6 prune static libs
+  #####################################################################################
+  export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
+  export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
+
+  # GENCODE_CUDNN is GENCODE plus sm_61.
+  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+  if [[ -n "$OVERRIDE_GENCODE" ]]; then
+      export GENCODE=$OVERRIDE_GENCODE
+  fi
+  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
+      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
+  fi
+
+  # all CUDA libs except CuDNN and CuBLAS
+  # (culibos, cudart and metis archives are skipped as well); each archive is
+  # pruned in place.
+  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
+      | xargs -I {} bash -c \
+                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+  # prune CuDNN and CuBLAS
+  # NOTE(review): only the two cuBLAS static archives are pruned here.
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+  #####################################################################################
+  # CUDA 12.6 prune visual tools
+  #####################################################################################
+  export CUDA_BASE="/usr/local/cuda-12.6/"
+  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
+}
+
 # idiomatic parameter and option handling in sh
 while test $# -gt 0
 do
    case "$1" in
    12.4) install_124; prune_124
        ;;
+    12.6) install_126; prune_126
+        ;;
    *) echo "bad argument $1"; exit 1
        ;;
    esac
--- a/.ci/docker/common/install_cudnn.sh
+++ b/.ci/docker/common/install_cudnn.sh
@ -4,7 +4,9 @@ if [[ -n "${CUDNN_VERSION}" ]]; then
    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
    mkdir tmp_cudnn
    pushd tmp_cudnn
-    if [[ ${CUDA_VERSION:0:2} == "12" ]]; then
+    if [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
+        CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive"
+    elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
--- a/.ci/docker/common/install_executorch.sh
+++ b/.ci/docker/common/install_executorch.sh
@ -36,12 +36,8 @@ install_conda_dependencies() {
 }

 install_pip_dependencies() {
-  pushd executorch/.ci/docker
-  # Install PyTorch CPU build beforehand to avoid installing the much bigger CUDA
-  # binaries later, ExecuTorch only needs CPU
-  pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-  # Install all Python dependencies
-  pip_install -r requirements-ci.txt
+  pushd executorch
+  as_jenkins bash install_requirements.sh --pybind xnnpack
  popd
 }

@ -54,7 +50,7 @@ setup_executorch() {
  export EXECUTORCH_BUILD_PYBIND=ON
  export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

-  as_jenkins .ci/scripts/setup-linux.sh cmake
+  as_jenkins .ci/scripts/setup-linux.sh cmake || true
  popd
 }

--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -24,10 +24,10 @@ function install_ubuntu() {
        | tee /etc/apt/sources.list.d/intel-gpu-${VERSION_CODENAME}.list
    # To add the online network network package repository for the Intel Support Packages
    wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
-        | gpg --dearmor > /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
-    echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
-        https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
-        | tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list
+        | gpg --dearmor > /usr/share/keyrings/oneapi-archive-keyring.gpg.gpg
+    echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg.gpg] \
+        https://apt.repos.intel.com/${XPU_REPO_NAME} all main" \
+        | tee /etc/apt/sources.list.d/oneAPI.list

    # Update the packages list and repository index
    apt-get update
@ -47,11 +47,7 @@ function install_ubuntu() {
    # Development Packages
    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
    # Install Intel Support Packages
-    if [ -n "$XPU_VERSION" ]; then
-        apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION} intel-pti-dev-0.9
-    else
-        apt-get install -y intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9
-    fi
+    apt-get install -y ${XPU_PACKAGES}

    # Cleanup
    apt-get autoclean && apt-get clean
@ -61,13 +57,13 @@ function install_ubuntu() {
 function install_rhel() {
    . /etc/os-release
    if [[ "${ID}" == "rhel" ]]; then
-        if [[ ! " 8.6 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
+        if [[ ! " 8.8 8.9 9.0 9.2 9.3 " =~ " ${VERSION_ID} " ]]; then
            echo "RHEL version ${VERSION_ID} not supported"
            exit
        fi
    elif [[ "${ID}" == "almalinux" ]]; then
        # Workaround for almalinux8 which used by quay.io/pypa/manylinux_2_28_x86_64
-        VERSION_ID="8.6"
+        VERSION_ID="8.8"
    fi

    dnf install -y 'dnf-command(config-manager)'
@ -75,16 +71,18 @@ function install_rhel() {
    dnf config-manager --add-repo \
        https://repositories.intel.com/gpu/rhel/${VERSION_ID}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_ID}.repo
    # To add the online network network package repository for the Intel Support Packages
-    tee > /etc/yum.repos.d/intel-for-pytorch-gpu-dev.repo << EOF
-[intel-for-pytorch-gpu-dev]
+    tee > /etc/yum.repos.d/oneAPI.repo << EOF
+[oneAPI]
 name=Intel for Pytorch GPU dev repository
-baseurl=https://yum.repos.intel.com/intel-for-pytorch-gpu-dev
+baseurl=https://yum.repos.intel.com/${XPU_REPO_NAME}
 enabled=1
 gpgcheck=1
 repo_gpgcheck=1
 gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB
 EOF

+    # Install Intel Support Packages
+    yum install -y ${XPU_PACKAGES}
    # The xpu-smi packages
    dnf install -y xpu-smi
    # Compute and Media Runtimes
@ -99,8 +97,6 @@ EOF
    dnf install -y --refresh \
        intel-igc-opencl-devel level-zero-devel intel-gsc-devel libmetee-devel \
        level-zero-devel
-    # Install Intel Support Packages
-    yum install -y intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9

    # Cleanup
    dnf clean all
@ -122,7 +118,7 @@ function install_sles() {
        https://repositories.intel.com/gpu/sles/${VERSION_SP}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_SP}.repo
    rpm --import https://repositories.intel.com/gpu/intel-graphics.key
    # To add the online network network package repository for the Intel Support Packages
-    zypper addrepo https://yum.repos.intel.com/intel-for-pytorch-gpu-dev intel-for-pytorch-gpu-dev
+    zypper addrepo https://yum.repos.intel.com/${XPU_REPO_NAME} oneAPI
    rpm --import https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB

    # The xpu-smi packages
@ -134,7 +130,7 @@ function install_sles() {
    zypper install -y libigdfcl-devel intel-igc-cm libigfxcmrt-devel level-zero-devel

    # Install Intel Support Packages
-    zypper install -y intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9
+    zypper install -y ${XPU_PACKAGES}

 }

@ -145,6 +141,13 @@ if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
    XPU_DRIVER_VERSION=""
 fi

+XPU_REPO_NAME="intel-for-pytorch-gpu-dev"
+XPU_PACKAGES="intel-for-pytorch-gpu-dev-0.5 intel-pti-dev-0.9"
+if [[ "$XPU_VERSION" == "2025.0" ]]; then
+    XPU_REPO_NAME="oneapi"
+    XPU_PACKAGES="intel-deep-learning-essentials-2025.0"
+fi
+
 # The installation depends on the base OS
 ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
 case "$ID" in
--- a/.ci/docker/manywheel/Dockerfile
+++ b/.ci/docker/manywheel/Dockerfile
@ -144,6 +144,10 @@ COPY --from=libpng             /usr/local/lib/pkgconfig              /usr/local/
 FROM common as cpu_final
 ARG BASE_CUDA_VERSION=10.1
 ARG DEVTOOLSET_VERSION=9
+# Install Anaconda
+ADD ./common/install_conda_docker.sh install_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh
+ENV PATH /opt/conda/bin:$PATH
 RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo
 RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo
 RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo
--- a/.ci/docker/manywheel/Dockerfile_2_28
+++ b/.ci/docker/manywheel/Dockerfile_2_28
@ -117,9 +117,18 @@ COPY --from=jni                /usr/local/include/jni.h              /usr/local/
 FROM common as cpu_final
 ARG BASE_CUDA_VERSION=11.8
 ARG DEVTOOLSET_VERSION=11
+# Install Anaconda
+ADD ./common/install_conda_docker.sh install_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh
+ENV PATH /opt/conda/bin:$PATH
 # Ensure the expected devtoolset is used
 ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
 ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH
+# Install setuptools and wheel for python 3.12/3.13
+RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \
+    /opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \
+    done;
+

 # cmake-3.18.4 from pip
 RUN yum install -y python3-pip && \
@ -130,6 +139,9 @@ FROM cpu_final as cuda_final
 RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION}
 COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
 COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION}
+RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
+ENV PATH=/usr/local/cuda/bin:$PATH
+

 FROM common as rocm_final
 ARG ROCM_VERSION=3.7
@ -150,8 +162,7 @@ ENV XPU_DRIVER_TYPE ROLLING
 # cmake-3.28.4 from pip
 RUN python3 -m pip install --upgrade pip && \
    python3 -mpip install cmake==3.28.4
-# Install setuptools and wheel for python 3.13
-RUN /opt/python/cp313-cp313/bin/python -m pip install setuptools wheel
 ADD ./common/install_xpu.sh install_xpu.sh
+ENV XPU_VERSION 2025.0
 RUN bash ./install_xpu.sh && rm install_xpu.sh
 RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd
--- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
@ -48,6 +48,11 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${GCCTOOLSET_VERSION}/root/usr/lib64:/op
 # For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
 RUN git config --global --add safe.directory "*"

+FROM base as openblas
+# Install openblas
+ADD ./common/install_openblas.sh install_openblas.sh
+RUN bash ./install_openblas.sh && rm install_openblas.sh
+
 FROM base as final

 # remove unncessary python versions
@ -55,3 +60,5 @@ RUN rm -rf /opt/python/cp26-cp26m /opt/_internal/cpython-2.6.9-ucs2
 RUN rm -rf /opt/python/cp26-cp26mu /opt/_internal/cpython-2.6.9-ucs4
 RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
+COPY --from=openblas     /opt/OpenBLAS/  /opt/OpenBLAS/
+ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
--- a/.ci/docker/manywheel/Dockerfile_s390x
+++ b/.ci/docker/manywheel/Dockerfile_s390x
@ -1,17 +1,20 @@
-FROM --platform=linux/s390x docker.io/ubuntu:24.04 as base
+FROM quay.io/pypa/manylinux_2_28_s390x as base

 # Language variables
 ENV LC_ALL=C.UTF-8
 ENV LANG=C.UTF-8
 ENV LANGUAGE=C.UTF-8

+ARG DEVTOOLSET_VERSION=13
 # Installed needed OS packages. This is to support all
 # the binary builds (torch, vision, audio, text, data)
-RUN apt update ; apt upgrade -y
-RUN apt install -y \
-  build-essential \
+RUN yum -y install epel-release
+RUN yum -y update
+RUN yum install -y \
+  sudo \
  autoconf \
  automake \
+  bison \
  bzip2 \
  curl \
  diffutils \
@ -24,19 +27,40 @@ RUN apt install -y \
  util-linux \
  wget \
  which \
-  xz-utils \
+  xz \
+  yasm \
  less \
  zstd \
+  libgomp \
+  gcc-toolset-${DEVTOOLSET_VERSION}-gcc \
+  gcc-toolset-${DEVTOOLSET_VERSION}-gcc-c++ \
+  gcc-toolset-${DEVTOOLSET_VERSION}-binutils \
+  gcc-toolset-${DEVTOOLSET_VERSION}-gcc-gfortran \
  cmake \
-  python3 \
-  python3-dev \
-  python3-setuptools \
-  python3-yaml \
-  python3-typing-extensions \
-  libblas-dev \
-  libopenblas-dev \
-  liblapack-dev \
-  libatlas-base-dev
+  rust \
+  cargo \
+  llvm-devel \
+  libzstd-devel \
+  python3.12-devel \
+  python3.12-setuptools \
+  python3.12-pip \
+  python3-virtualenv \
+  python3.12-pyyaml \
+  python3.12-numpy \
+  python3.12-wheel \
+  python3.12-cryptography \
+  blas-devel \
+  openblas-devel \
+  lapack-devel \
+  atlas-devel \
+  libjpeg-devel \
+  libxslt-devel \
+  libxml2-devel \
+  openssl-devel \
+  valgrind
+
+ENV PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH

 # git236+ would refuse to run git commands in repos owned by other users
 # Which causes version check to fail, as pytorch repo is bind-mounted into the image
@ -44,14 +68,8 @@ RUN apt install -y \
 # For more details see https://github.com/pytorch/pytorch/issues/78659#issuecomment-1144107327
 RUN git config --global --add safe.directory "*"

-FROM base as openssl
-# Install openssl (this must precede `build python` step)
-# (In order to have a proper SSL module, Python is compiled
-# against a recent openssl [see env vars above], which is linked
-# statically. We delete openssl afterwards.)
-ADD ./common/install_openssl.sh install_openssl.sh
-RUN bash ./install_openssl.sh && rm install_openssl.sh
-ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+# installed python doesn't have development parts. Rebuild it from scratch
+RUN /bin/rm -rf /opt/_internal /opt/python /usr/local/*/*

 # EPEL for cmake
 FROM base as patchelf
@ -64,10 +82,43 @@ FROM patchelf as python
 # build python
 COPY manywheel/build_scripts /build_scripts
 ADD ./common/install_cpython.sh /build_scripts/install_cpython.sh
+ENV SSL_CERT_FILE=
 RUN bash build_scripts/build.sh && rm -r build_scripts

-FROM openssl as final
+FROM base as final
 COPY --from=python             /opt/python                           /opt/python
 COPY --from=python             /opt/_internal                        /opt/_internal
-COPY --from=python             /opt/python/cp39-cp39/bin/auditwheel /usr/local/bin/auditwheel
+COPY --from=python             /opt/python/cp39-cp39/bin/auditwheel  /usr/local/bin/auditwheel
 COPY --from=patchelf           /usr/local/bin/patchelf               /usr/local/bin/patchelf
+
+RUN alternatives --set python /usr/bin/python3.12
+RUN alternatives --set python3 /usr/bin/python3.12
+
+RUN pip-3.12 install typing_extensions
+
+ENTRYPOINT []
+CMD ["/bin/bash"]
+
+# install test dependencies:
+# - grpcio requires system openssl, bundled crypto fails to build
+# - ml_dtypes 0.4.0 requires some fixes provided in later commits to build
+RUN dnf install -y \
+  protobuf-devel \
+  protobuf-c-devel \
+  protobuf-lite-devel \
+  wget \
+  patch
+
+RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio==1.65.4
+RUN cd ~ && \
+  git clone https://github.com/jax-ml/ml_dtypes && \
+  cd ml_dtypes && \
+  git checkout v0.4.0 && \
+  git submodule update --init --recursive && \
+  wget https://github.com/jax-ml/ml_dtypes/commit/b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
+  wget https://github.com/jax-ml/ml_dtypes/commit/d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
+  patch -p1 < b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
+  patch -p1 < d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
+  python3 setup.py bdist_wheel && \
+  pip3 install dist/*.whl && \
+  cd ~ && rm -rf ml_dtypes
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -61,7 +61,7 @@ case ${GPU_ARCH_TYPE} in
    cpu-s390x)
        TARGET=final
        DOCKER_TAG=cpu-s390x
-        GPU_IMAGE=redhat/ubi9
+        GPU_IMAGE=s390x/almalinux:8
        DOCKER_GPU_BUILD_ARG=""
        MANY_LINUX_VERSION="s390x"
        ;;
@ -125,11 +125,13 @@ fi
 (
    set -x

-    # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
-    # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
-    sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
-    sudo systemctl daemon-reload
-    sudo systemctl restart docker
+    if [ "$(uname -m)" != "s390x" ]; then
+        # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
+        # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
+        sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
+        sudo systemctl daemon-reload
+        sudo systemctl restart docker
+    fi

    DOCKER_BUILDKIT=1 docker build  \
        ${DOCKER_GPU_BUILD_ARG} \
--- a/.ci/docker/manywheel/build_scripts/build.sh
+++ b/.ci/docker/manywheel/build_scripts/build.sh
@ -16,37 +16,27 @@ CURL_HASH=cf34fe0b07b800f1c01a499a6e8b2af548f6d0e044dca4a29d88a4bee146d131
 AUTOCONF_ROOT=autoconf-2.69
 AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969

+# Dependencies for compiling Python that we want to remove from
+# the final image after compiling Python
+PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel libpcap-devel xz-devel libffi-devel"
+
+if [ "$(uname -m)" != "s390x" ] ; then
+    PYTHON_COMPILE_DEPS="${PYTHON_COMPILE_DEPS} db4-devel"
+else
+    PYTHON_COMPILE_DEPS="${PYTHON_COMPILE_DEPS} libdb-devel"
+fi
+
+# Libraries that are allowed as part of the manylinux1 profile
+MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel  mesa-libGL-devel libICE-devel libSM-devel ncurses-devel"
+
 # Get build utilities
 MY_DIR=$(dirname "${BASH_SOURCE[0]}")
 source $MY_DIR/build_utils.sh

-if [ "$(uname -m)" != "s390x" ] ; then
-    # Dependencies for compiling Python that we want to remove from
-    # the final image after compiling Python
-    PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel"
-
-    # Libraries that are allowed as part of the manylinux1 profile
-    MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel  mesa-libGL-devel libICE-devel libSM-devel ncurses-devel"
-
-    # Development tools and libraries
-    yum -y install bzip2 make git patch unzip bison yasm diffutils \
-        automake which file cmake28 \
-        kernel-devel-`uname -r` \
-        ${PYTHON_COMPILE_DEPS}
-else
-    # Dependencies for compiling Python that we want to remove from
-    # the final image after compiling Python
-    PYTHON_COMPILE_DEPS="zlib1g-dev libbz2-dev libncurses-dev libsqlite3-dev libdb-dev libpcap-dev liblzma-dev libffi-dev"
-
-    # Libraries that are allowed as part of the manylinux1 profile
-    MANYLINUX1_DEPS="libglib2.0-dev libX11-dev libncurses-dev"
-
-    # Development tools and libraries
-    apt install -y bzip2 make git patch unzip diffutils \
-        automake which file cmake \
-        linux-headers-virtual \
-        ${PYTHON_COMPILE_DEPS}
-fi
+# Development tools and libraries
+yum -y install bzip2 make git patch unzip bison yasm diffutils \
+    automake which file \
+    ${PYTHON_COMPILE_DEPS}

 # Install newest autoconf
 build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH
@ -92,16 +82,13 @@ ln -s $PY39_BIN/auditwheel /usr/local/bin/auditwheel

 # Clean up development headers and other unnecessary stuff for
 # final image
-if [ "$(uname -m)" != "s390x" ] ; then
-    yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
-        avahi freetype bitstream-vera-fonts \
-        ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1
-    yum -y install ${MANYLINUX1_DEPS}
-    yum -y clean all > /dev/null 2>&1
-    yum list installed
-else
-    apt purge -y ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1
-fi
+yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
+    avahi freetype bitstream-vera-fonts \
+    ${PYTHON_COMPILE_DEPS} || true > /dev/null 2>&1
+yum -y install ${MANYLINUX1_DEPS}
+yum -y clean all > /dev/null 2>&1
+yum list installed
+
 # we don't need libpython*.a, and they're many megabytes
 find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f
 # Strip what we can -- and ignore errors, because this just attempts to strip
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -36,7 +36,7 @@ expecttest==0.2.1
 #Pinned versions: 0.2.1
 #test that import:

-fbscribelogger==0.1.6
+fbscribelogger==0.1.7
 #Description: write to scribe from authenticated jobs on CI
 #Pinned versions: 0.1.6
 #test that import:
@ -281,11 +281,6 @@ redis>=4.0.0
 #Description: redis database
 #test that import: anything that tests OSS caching/mocking (inductor/test_codecache.py, inductor/test_max_autotune.py)

-rockset==1.0.3
-#Description: queries Rockset
-#Pinned versions: 1.0.3
-#test that import:
-
 ghstack==0.8.0
 #Description: ghstack tool
 #Pinned versions: 0.8.0
--- a/.ci/magma/.gitignore
+++ b/.ci/magma/.gitignore
@ -0,0 +1,2 @@
+output/
+magma-cuda*/
--- a/.ci/magma/Makefile
+++ b/.ci/magma/Makefile
@ -0,0 +1,48 @@
+SHELL=/usr/bin/env bash
+
+DOCKER_CMD ?= docker
+DESIRED_CUDA ?= 11.8
+DESIRED_CUDA_SHORT = $(subst .,,$(DESIRED_CUDA))
+PACKAGE_NAME = magma-cuda
+CUDA_ARCH_LIST ?= -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90
+
+DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
+	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
+	-w /builder \
+	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_CUDA_SHORT} \
+	-e DESIRED_CUDA=${DESIRED_CUDA} \
+	-e CUDA_ARCH_LIST="${CUDA_ARCH_LIST}" \
+	"pytorch/manylinux-builder:cuda${DESIRED_CUDA}-main" \
+	magma/build_magma.sh
+
+.PHONY: all
+all: magma-cuda126
+all: magma-cuda124
+all: magma-cuda121
+all: magma-cuda118
+
+.PHONY: clean
+clean:
+	$(RM) -r magma-*
+	$(RM) -r output
+
+.PHONY: magma-cuda126
+magma-cuda126: DESIRED_CUDA := 12.6
+magma-cuda126:
+	$(DOCKER_RUN)
+
+.PHONY: magma-cuda124
+magma-cuda124: DESIRED_CUDA := 12.4
+magma-cuda124:
+	$(DOCKER_RUN)
+
+.PHONY: magma-cuda121
+magma-cuda121: DESIRED_CUDA := 12.1
+magma-cuda121:
+	$(DOCKER_RUN)
+
+.PHONY: magma-cuda118
+magma-cuda118: DESIRED_CUDA := 11.8
+magma-cuda118: CUDA_ARCH_LIST += -gencode arch=compute_37,code=sm_37
+magma-cuda118:
+	$(DOCKER_RUN)
--- a/.ci/magma/README.md
+++ b/.ci/magma/README.md
@ -0,0 +1,50 @@
+# Magma
+
+This folder contains the scripts and configurations to build magma, statically linked for various versions of CUDA.
+
+## Building
+
+Look in the `Makefile` for available targets to build. To build any target, for example `magma-cuda118`, run
+
+```
+# Using `docker`
+make magma-cuda118
+
+# Using `podman`
+DOCKER_CMD=podman make magma-cuda118
+```
+
+This spawns a `pytorch/manylinux-builder:cuda<version>-main` docker image, which has the required `devtoolset` and CUDA versions installed.
+Within the docker image, it runs `build_magma.sh` with the correct environment variables set, which packages the necessary files
+into a tarball, with the following structure:
+
+```
+.
+├── include       # header files
+├── lib           # libmagma.a
+├── info
+│   ├── licenses  # license file
+│   └── recipe    # build script and patches
+```
+
+More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the CUDA version.
+Outputted binaries should be in the `output` folder.
+
+
+## Pushing
+
+Packages can be uploaded to an S3 bucket using:
+
+```
+aws s3 cp output/*/magma-cuda*.bz2 <bucket-with-path>
+```
+
+If you do not have upload permissions, please ping @seemethere or @soumith to gain access
+
+## New versions
+
+New CUDA versions can be added by creating a new make target with the next desired version. For CUDA version NN.n, the target should be named `magma-cudaNNn`.
+
+Make sure to edit the appropriate environment variables (e.g., DESIRED_CUDA, CUDA_ARCH_LIST) in the `Makefile` accordingly. Remember also to check `build_magma.sh` to ensure the logic for copying over the files remains correct.
+
+New patches can be added by editing `Makefile` and `build_magma.sh` the same way `getrf_nbparam.patch` is implemented.
--- a/.ci/magma/build_magma.sh
+++ b/.ci/magma/build_magma.sh
@ -0,0 +1,50 @@
+#!/usr/bin/env bash
+
+set -eou pipefail
+
+# Environment variables
+# The script expects DESIRED_CUDA and PACKAGE_NAME to be set
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+MAGMA_VERSION=2.6.1
+
+# Folders for the build
+PACKAGE_FILES=${ROOT_DIR}/magma/package_files # source patches and metadata
+PACKAGE_DIR=${ROOT_DIR}/magma/${PACKAGE_NAME} # build workspace
+PACKAGE_OUTPUT=${ROOT_DIR}/magma/output # where tarballs are stored
+PACKAGE_BUILD=${PACKAGE_DIR}/build # where the content of the tarball is prepared
+PACKAGE_RECIPE=${PACKAGE_BUILD}/info/recipe
+PACKAGE_LICENSE=${PACKAGE_BUILD}/info/licenses
+mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RECIPE} ${PACKAGE_LICENSE}
+
+# Fetch magma sources; verify the checksum before unpacking anything
+pushd ${PACKAGE_DIR}
+curl -LO http://icl.utk.edu/projectsfiles/magma/downloads/magma-${MAGMA_VERSION}.tar.gz
+sha256sum --check < ${PACKAGE_FILES}/magma-${MAGMA_VERSION}.sha256
+tar zxf magma-${MAGMA_VERSION}.tar.gz
+popd
+
+# Apply patches and build
+pushd ${PACKAGE_DIR}/magma-${MAGMA_VERSION}
+patch < ${PACKAGE_FILES}/CMake.patch
+patch < ${PACKAGE_FILES}/cmakelists.patch
+patch -p0 < ${PACKAGE_FILES}/thread_queue.patch
+patch -p1 < ${PACKAGE_FILES}/getrf_shfl.patch
+patch -p1 < ${PACKAGE_FILES}/getrf_nbparam.patch
+# The build.sh script expects to be executed from the sources root folder
+INSTALL_DIR=${PACKAGE_BUILD} ${PACKAGE_FILES}/build.sh
+popd
+
+# Package recipe, license and tarball
+# Folder and package name are backward compatible for the build workflow
+cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh
+cp ${PACKAGE_FILES}/thread_queue.patch ${PACKAGE_RECIPE}/thread_queue.patch
+cp ${PACKAGE_FILES}/cmakelists.patch ${PACKAGE_RECIPE}/cmakelists.patch
+cp ${PACKAGE_FILES}/getrf_shfl.patch ${PACKAGE_RECIPE}/getrf_shfl.patch
+cp ${PACKAGE_FILES}/getrf_nbparam.patch ${PACKAGE_RECIPE}/getrf_nbparam.patch
+cp ${PACKAGE_FILES}/CMake.patch ${PACKAGE_RECIPE}/CMake.patch
+cp ${PACKAGE_FILES}/magma-${MAGMA_VERSION}.sha256 ${PACKAGE_RECIPE}/magma-${MAGMA_VERSION}.sha256
+cp ${PACKAGE_DIR}/magma-${MAGMA_VERSION}/COPYRIGHT ${PACKAGE_LICENSE}/COPYRIGHT
+pushd ${PACKAGE_BUILD}
+tar cjf ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2 include lib info
+echo Built in ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2
+popd
--- a/.ci/magma/package_files/CMake.patch
+++ b/.ci/magma/package_files/CMake.patch
@ -0,0 +1,40 @@
+--- CMake.src.cuda	2023-03-29 10:05:32.136954140 +0000
+++ CMake.src.cuda	2023-03-29 10:05:50.281318043 +0000
+@@ -283,10 +283,10 @@
+ magmablas/zgeadd.cu
+ magmablas/zgeadd2.cu
+ magmablas/zgeam.cu
+-magmablas/zgemm_fermi.cu
+#magmablas/zgemm_fermi.cu
+ magmablas/zgemm_reduce.cu
+ magmablas/zgemv_conj.cu
+-magmablas/zgemv_fermi.cu
+#magmablas/zgemv_fermi.cu
+ magmablas/zgerbt.cu
+ magmablas/zgerbt_kernels.cu
+ magmablas/zgetmatrix_transpose.cpp
+@@ -1009,18 +1009,18 @@
+ magmablas/sgeam.cu
+ magmablas/dgeam.cu
+ magmablas/cgeam.cu
+-magmablas/sgemm_fermi.cu
+-magmablas/dgemm_fermi.cu
+-magmablas/cgemm_fermi.cu
+#magmablas/sgemm_fermi.cu
+#magmablas/dgemm_fermi.cu
+#magmablas/cgemm_fermi.cu
+ magmablas/sgemm_reduce.cu
+ magmablas/dgemm_reduce.cu
+ magmablas/cgemm_reduce.cu
+ magmablas/sgemv_conj.cu
+ magmablas/dgemv_conj.cu
+ magmablas/cgemv_conj.cu
+-magmablas/sgemv_fermi.cu
+-magmablas/dgemv_fermi.cu
+-magmablas/cgemv_fermi.cu
+#magmablas/sgemv_fermi.cu
+#magmablas/dgemv_fermi.cu
+#magmablas/cgemv_fermi.cu
+ magmablas/sgerbt.cu
+ magmablas/dgerbt.cu
+ magmablas/cgerbt.cu
--- a/.ci/magma/package_files/build.sh
+++ b/.ci/magma/package_files/build.sh
@ -0,0 +1,12 @@
+CUDA__VERSION=$(nvcc --version|sed -n 4p|cut -f5 -d" "|cut -f1 -d",")
+if [ "$CUDA__VERSION" != "$DESIRED_CUDA" ]; then
+    echo "CUDA Version is not $DESIRED_CUDA. CUDA Version found: $CUDA__VERSION"
+    exit 1
+fi
+# Abort on the first failing step; otherwise the exit status is that of the final `cd ..`
+mkdir build
+cd build || exit 1
+cmake .. -DUSE_FORTRAN=OFF -DGPU_TARGET="All" -DCMAKE_INSTALL_PREFIX="$INSTALL_DIR" -DCUDA_ARCH_LIST="$CUDA_ARCH_LIST" || exit 1
+make -j$(getconf _NPROCESSORS_CONF) || exit 1
+make install || exit 1
+cd ..
--- a/.ci/magma/package_files/cmakelists.patch
+++ b/.ci/magma/package_files/cmakelists.patch
@ -0,0 +1,388 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index d5d8d87d..8a507334 100644
+--- a/CMakeLists.txt
+++ b/CMakeLists.txt
+@@ -3,7 +3,7 @@ cmake_minimum_required( VERSION 2.8.1 )
+ # ----------------------------------------
+ # to disable Fortran, set this to "off"
+ # see also -DADD_ below
+-option( USE_FORTRAN "Fortran is required for some tester checks, but can be disabled with reduced functionality" ON )
+option( USE_FORTRAN "Fortran is required for some tester checks, but can be disabled with reduced functionality" OFF )
+
+ if (USE_FORTRAN)
+     project( MAGMA C CXX Fortran )
+@@ -75,6 +75,8 @@ else()
+     message( WARNING "The compiler ${CMAKE_CXX_COMPILER} doesn't support the -std=c++11 flag. Some code may not compile.")
+ endif()
+
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -static-libstdc++ -fno-exceptions")
+
+ CHECK_C_COMPILER_FLAG("-std=c99" COMPILER_SUPPORTS_C99)
+ if (COMPILER_SUPPORTS_C99)
+     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99")
+@@ -101,15 +103,15 @@ endif()
+
+
+ # ----------------------------------------
+-# locate OpenMP
+-find_package( OpenMP )
+-if (OPENMP_FOUND)
+-    message( STATUS "Found OpenMP" )
+-    message( STATUS "    OpenMP_C_FLAGS   ${OpenMP_C_FLAGS}" )
+-    message( STATUS "    OpenMP_CXX_FLAGS ${OpenMP_CXX_FLAGS}" )
+-    set( CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}" )
+-    set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}" )
+-endif()
+# # locate OpenMP
+# find_package( OpenMP )
+# if (OPENMP_FOUND)
+#     message( STATUS "Found OpenMP" )
+#     message( STATUS "    OpenMP_C_FLAGS   ${OpenMP_C_FLAGS}" )
+#     message( STATUS "    OpenMP_CXX_FLAGS ${OpenMP_CXX_FLAGS}" )
+#     set( CMAKE_C_FLAGS   "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}" )
+#     set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}" )
+# endif()
+
+ if (MAGMA_ENABLE_CUDA)
+   # ----------------------------------------
+@@ -132,7 +134,7 @@ if (MAGMA_ENABLE_CUDA)
+     set( NV_SM    "" )
+     set( NV_COMP  "" )
+
+-    set(CUDA_SEPARABLE_COMPILATION ON)
+    set(CUDA_SEPARABLE_COMPILATION OFF)
+
+     # nvcc >= 6.5 supports -std=c++11, so propagate CXXFLAGS to NVCCFLAGS.
+     # Older nvcc didn't support -std=c++11, so previously we disabled propagation.
+@@ -294,11 +296,18 @@ if (MAGMA_ENABLE_CUDA)
+         message( STATUS "    compile for CUDA arch 8.0 (Ampere)" )
+     endif()
+
+    if ( ${GPU_TARGET} MATCHES "All")
+        set( MIN_ARCH 370)
+        SET( NV_SM ${CUDA_ARCH_LIST})
+        SET( NV_COMP "")
+    endif()
+
+     if (NOT MIN_ARCH)
+         message( FATAL_ERROR "GPU_TARGET must contain one or more of Fermi, Kepler, Maxwell, Pascal, Volta, Turing, Ampere, or valid sm_[0-9][0-9]" )
+     endif()
+
+-    set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -Xcompiler -fPIC ${NV_SM} ${NV_COMP} ${FORTRAN_CONVENTION} )
+    set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} -DHAVE_CUBLAS -Xfatbin -compress-all -Xcompiler -fPIC -std=c++11 ${NV_SM} ${NV_COMP} ${FORTRAN_CONVENTION} )
+    MESSAGE(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}")
+     #add_definitions( "-DMAGMA_HAVE_CUDA -DMAGMA_CUDA_ARCH_MIN=${MIN_ARCH}" )
+     set(MAGMA_HAVE_CUDA "1")
+     set(MAGMA_CUDA_ARCH_MIN "${MIN_ARCH}")
+@@ -413,7 +422,7 @@ set_property(CACHE BLA_VENDOR PROPERTY STRINGS
+ set( LAPACK_LIBRARIES "" CACHE STRING "Libraries for LAPACK and BLAS, to manually override search" )
+ if (LAPACK_LIBRARIES STREQUAL "")
+     message( STATUS "Searching for BLAS and LAPACK. To override, set LAPACK_LIBRARIES using ccmake." )
+-    find_package( LAPACK )
+    # find_package( LAPACK )
+     # force showing updated LAPACK_LIBRARIES in ccmake / cmake-gui.
+     set( LAPACK_LIBRARIES ${LAPACK_LIBRARIES} CACHE STRING "Libraries for LAPACK and BLAS, to manually override search" FORCE )
+ else()
+@@ -552,12 +561,12 @@ if (WIN32)
+     #message( "libmagma_all_f   ${libmagma_all_f}"   )
+
+     # on Windows, Fortran files aren't compiled if listed here...
+-    cuda_add_library( magma ${libmagma_all_cpp} )
+    cuda_add_library( magma STATIC ${libmagma_all_cpp} OPTIONS --compiler-options "-fPIC")
+     target_link_libraries( magma
+         ${LAPACK_LIBRARIES}
+         ${CUDA_CUDART_LIBRARY}
+         ${CUDA_CUBLAS_LIBRARIES}
+-        ${CUDA_cusparse_LIBRARY}
+        # ${CUDA_cusparse_LIBRARY}
+     )
+
+     # no Fortran files at the moment (how to test libmagma_all_f is not empty?),
+@@ -575,13 +584,13 @@ if (WIN32)
+ else()
+     # Unix doesn't seem to have a problem with mixing C, CUDA, and Fortran files
+     if (MAGMA_ENABLE_CUDA)
+-      cuda_add_library( magma ${libmagma_all} )
+      cuda_add_library( magma STATIC ${libmagma_all} OPTIONS --compiler-options "-fPIC")
+       target_link_libraries( magma
+         ${blas_fix}
+         ${LAPACK_LIBRARIES}
+         ${CUDA_CUDART_LIBRARY}
+         ${CUDA_CUBLAS_LIBRARIES}
+-        ${CUDA_cusparse_LIBRARY}
+        # ${CUDA_cusparse_LIBRARY}
+ 	)
+     else()
+       find_package( hipBLAS )
+@@ -614,138 +623,139 @@ else()
+     endif()
+ endif()
+ add_custom_target( lib DEPENDS magma )
+-
+-
+-# ----------------------------------------
+-# compile lapacktest library
+-# If use fortran, compile only Fortran files, not magma_[sdcz]_no_fortran.cpp
+-# else,           compile only C++     files, not Fortran files
+-if (USE_FORTRAN)
+-    foreach( filename ${liblapacktest_all} )
+-        if (filename MATCHES "\\.(f|f90|F90)$")
+-            list( APPEND liblapacktest_all_f ${filename} )
+-        endif()
+-    endforeach()
+-    add_library( lapacktest ${liblapacktest_all_f} )
+-else()
+-    # alternatively, use only C/C++/CUDA files, including magma_[sdcz]_no_fortran.cpp
+-    foreach( filename ${liblapacktest_all} )
+-        if (filename MATCHES "\\.(c|cu|cpp)$")
+-            list( APPEND liblapacktest_all_cpp ${filename} )
+-        endif()
+-    endforeach()
+-    add_library( lapacktest ${liblapacktest_all_cpp} )
+-endif()
+-target_link_libraries( lapacktest
+-    ${blas_fix}
+-    ${LAPACK_LIBRARIES}
+-)
+-
+-
+-# ----------------------------------------
+-# compile tester library
+-add_library( tester ${libtest_all} )
+-target_link_libraries( tester
+-    magma
+-    lapacktest
+-    ${blas_fix}
+-    ${LAPACK_LIBRARIES}
+-)
+set_target_properties(magma PROPERTIES POSITION_INDEPENDENT_CODE ON)
+
+
+# # ----------------------------------------
+# # compile lapacktest library
+# # If use fortran, compile only Fortran files, not magma_[sdcz]_no_fortran.cpp
+# # else,           compile only C++     files, not Fortran files
+# if (USE_FORTRAN)
+#     foreach( filename ${liblapacktest_all} )
+#         if (filename MATCHES "\\.(f|f90|F90)$")
+#             list( APPEND liblapacktest_all_f ${filename} )
+#         endif()
+#     endforeach()
+#     add_library( lapacktest ${liblapacktest_all_f} )
+# else()
+#     # alternatively, use only C/C++/CUDA files, including magma_[sdcz]_no_fortran.cpp
+#     foreach( filename ${liblapacktest_all} )
+#         if (filename MATCHES "\\.(c|cu|cpp)$")
+#             list( APPEND liblapacktest_all_cpp ${filename} )
+#         endif()
+#     endforeach()
+#     add_library( lapacktest ${liblapacktest_all_cpp} )
+# endif()
+# target_link_libraries( lapacktest
+#     ${blas_fix}
+#     ${LAPACK_LIBRARIES}
+# )
+
+
+# # ----------------------------------------
+# # compile tester library
+# add_library( tester ${libtest_all} )
+# target_link_libraries( tester
+#     magma
+#     lapacktest
+#     ${blas_fix}
+#     ${LAPACK_LIBRARIES}
+# )
+
+
+ # ----------------------------------------
+ # compile MAGMA sparse library
+
+ # sparse doesn't have Fortran at the moment, so no need for above shenanigans
+-if (MAGMA_ENABLE_CUDA)
+-  include_directories( sparse/include )
+-  include_directories( sparse/control )
+-else()
+-  include_directories( sparse_hip/include )
+-  include_directories( sparse_hip/control )
+-endif()
+-include_directories( testing )
+-
+-if (MAGMA_ENABLE_CUDA)
+-  cuda_add_library( magma_sparse ${libsparse_all} )
+-  target_link_libraries( magma_sparse
+-    magma
+-    ${blas_fix}
+-    ${LAPACK_LIBRARIES}
+-    ${CUDA_CUDART_LIBRARY}
+-    ${CUDA_CUBLAS_LIBRARIES}
+-    ${CUDA_cusparse_LIBRARY}
+-    )
+-else()
+-  add_library( magma_sparse ${libsparse_all} )
+-  target_link_libraries( magma_sparse
+-    magma
+-    ${blas_fix}
+-    ${LAPACK_LIBRARIES}
+-    hip::device
+-    roc::hipblas
+-    roc::hipsparse
+-    )
+-endif()
+-add_custom_target( sparse-lib DEPENDS magma_sparse )
+-
+-
+-# ----------------------------------------
+-# compile each tester
+-
+-# save testers to testing/
+-# save tester lib files to testing_lib/ to avoid cluttering lib/
+-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY testing )
+-set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY testing_lib )
+-set( CMAKE_LIBRARY_OUTPUT_DIRECTORY testing_lib )
+-
+-# skip Fortran testers, which require an extra file from CUDA
+-foreach( filename ${testing_all} )
+-    if (filename MATCHES "\\.(c|cu|cpp)$")
+-        list( APPEND testing_all_cpp ${filename} )
+-    endif()
+-endforeach()
+-foreach( TEST ${testing_all_cpp} )
+-    string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} )
+-    string( REGEX REPLACE "testing/" "" EXE ${EXE} )
+-    #message( "${TEST} --> ${EXE}" )
+-    add_executable( ${EXE} ${TEST} )
+-    target_link_libraries( ${EXE} tester lapacktest magma )
+-    list( APPEND testing ${EXE} )
+-endforeach()
+-add_custom_target( testing DEPENDS ${testing} )
+-
+-
+-# ----------------------------------------
+-# compile each sparse tester
+-
+-if (MAGMA_ENABLE_CUDA)
+-  set(SPARSE_TEST_DIR "sparse/testing")
+-else()
+-  set(SPARSE_TEST_DIR "sparse_hip/testing")
+-endif()
+-
+-
+-set( CMAKE_RUNTIME_OUTPUT_DIRECTORY "${SPARSE_TEST_DIR}" )
+-cmake_policy( SET CMP0037 OLD)
+-foreach( TEST ${sparse_testing_all} )
+-    string( REGEX REPLACE "\\.(cpp|f90|F90)"     "" EXE ${TEST} )
+-    string( REGEX REPLACE "${SPARSE_TEST_DIR}/" "" EXE ${EXE} )
+-    #message( "${TEST} --> ${EXE}" )
+-    add_executable( ${EXE} ${TEST} )
+-    target_link_libraries( ${EXE} magma_sparse magma )
+-    list( APPEND sparse-testing ${EXE} )
+-endforeach()
+-add_custom_target( sparse-testing DEPENDS ${sparse-testing} )
+# if (MAGMA_ENABLE_CUDA)
+#   include_directories( sparse/include )
+#   include_directories( sparse/control )
+# else()
+#   include_directories( sparse_hip/include )
+#   include_directories( sparse_hip/control )
+# endif()
+# include_directories( testing )
+
+# if (MAGMA_ENABLE_CUDA)
+#   cuda_add_library( magma_sparse ${libsparse_all} )
+#   target_link_libraries( magma_sparse
+#     magma
+#     ${blas_fix}
+#     ${LAPACK_LIBRARIES}
+#     ${CUDA_CUDART_LIBRARY}
+#     ${CUDA_CUBLAS_LIBRARIES}
+#     ${CUDA_cusparse_LIBRARY}
+#     )
+# else()
+#   add_library( magma_sparse ${libsparse_all} )
+#   target_link_libraries( magma_sparse
+#     magma
+#     ${blas_fix}
+#     ${LAPACK_LIBRARIES}
+#     hip::device
+#     roc::hipblas
+#     roc::hipsparse
+#     )
+# endif()
+# add_custom_target( sparse-lib DEPENDS magma_sparse )
+
+
+# # ----------------------------------------
+# # compile each tester
+
+# # save testers to testing/
+# # save tester lib files to testing_lib/ to avoid cluttering lib/
+# set( CMAKE_RUNTIME_OUTPUT_DIRECTORY testing )
+# set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY testing_lib )
+# set( CMAKE_LIBRARY_OUTPUT_DIRECTORY testing_lib )
+
+# # skip Fortran testers, which require an extra file from CUDA
+# foreach( filename ${testing_all} )
+#     if (filename MATCHES "\\.(c|cu|cpp)$")
+#         list( APPEND testing_all_cpp ${filename} )
+#     endif()
+# endforeach()
+# foreach( TEST ${testing_all_cpp} )
+#     string( REGEX REPLACE "\\.(cpp|f90|F90)" "" EXE ${TEST} )
+#     string( REGEX REPLACE "testing/" "" EXE ${EXE} )
+#     #message( "${TEST} --> ${EXE}" )
+#     add_executable( ${EXE} ${TEST} )
+#     target_link_libraries( ${EXE} tester lapacktest magma )
+#     list( APPEND testing ${EXE} )
+# endforeach()
+# add_custom_target( testing DEPENDS ${testing} )
+
+
+# # ----------------------------------------
+# # compile each sparse tester
+
+# if (MAGMA_ENABLE_CUDA)
+#   set(SPARSE_TEST_DIR "sparse/testing")
+# else()
+#   set(SPARSE_TEST_DIR "sparse_hip/testing")
+# endif()
+
+
+# set( CMAKE_RUNTIME_OUTPUT_DIRECTORY "${SPARSE_TEST_DIR}" )
+# cmake_policy( SET CMP0037 OLD)
+# foreach( TEST ${sparse_testing_all} )
+#     string( REGEX REPLACE "\\.(cpp|f90|F90)"     "" EXE ${TEST} )
+#     string( REGEX REPLACE "${SPARSE_TEST_DIR}/" "" EXE ${EXE} )
+#     #message( "${TEST} --> ${EXE}" )
+#     add_executable( ${EXE} ${TEST} )
+#     target_link_libraries( ${EXE} magma_sparse magma )
+#     list( APPEND sparse-testing ${EXE} )
+# endforeach()
+# add_custom_target( sparse-testing DEPENDS ${sparse-testing} )
+
+
+ # ----------------------------------------
+ # what to install
+-install( TARGETS magma magma_sparse ${blas_fix}
+install( TARGETS magma ${blas_fix}
+          RUNTIME DESTINATION bin
+          LIBRARY DESTINATION lib
+          ARCHIVE DESTINATION lib )
+-file( GLOB headers include/*.h sparse/include/*.h "${CMAKE_BINARY_DIR}/include/*.h" )
+file( GLOB headers include/*.h "${CMAKE_BINARY_DIR}/include/*.h" )
+ if (USE_FORTRAN)
+     install( FILES ${headers} ${modules}
+              DESTINATION include )
+@@ -769,9 +779,9 @@ else()
+     "${blas_fix_lib} ${LAPACK_LIBS} hip::device roc::hipblas roc::hipsparse" )
+ endif()
+ set( MAGMA_REQUIRED "" )
+-configure_file( "${pkgconfig}.in" "${pkgconfig}" @ONLY )
+-install( FILES "${CMAKE_BINARY_DIR}/${pkgconfig}"
+-         DESTINATION lib/pkgconfig )
+# configure_file( "${pkgconfig}.in" "${pkgconfig}" @ONLY )
+# install( FILES "${CMAKE_BINARY_DIR}/${pkgconfig}"
+#          DESTINATION lib/pkgconfig )
+
+ # ----------------------------------------
+ get_directory_property( compile_definitions COMPILE_DEFINITIONS )
--- a/.ci/magma/package_files/getrf_nbparam.patch
+++ b/.ci/magma/package_files/getrf_nbparam.patch
@ -0,0 +1,40 @@
+diff --git a/control/get_batched_crossover.cpp b/control/get_batched_crossover.cpp
+index 4ec57306..912f8608 100644
+--- a/control/get_batched_crossover.cpp
+++ b/control/get_batched_crossover.cpp
+@@ -119,7 +119,7 @@ void magma_get_spotrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
+ void magma_get_zgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
+ {
+     *nb    = 64;
+-    *recnb = 32;
+    *recnb = 16;
+     return;
+ }
+ 
+@@ -127,7 +127,7 @@ void magma_get_zgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
+ void magma_get_cgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
+ {
+     *nb    = 128;
+-    *recnb =  32;
+    *recnb =  16;
+     return;
+ }
+ 
+@@ -135,7 +135,7 @@ void magma_get_cgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
+ void magma_get_dgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
+ {
+     *nb    = 128;
+-    *recnb =  32;
+    *recnb =  16;
+     return;
+ }
+ 
+@@ -143,7 +143,7 @@ void magma_get_dgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_
+ void magma_get_sgetrf_batched_nbparam(magma_int_t n, magma_int_t *nb, magma_int_t *recnb)
+ {
+     *nb    = 128;
+-    *recnb =  32;
+    *recnb =  16;
+     return;
+ }
+ 
--- a/.ci/magma/package_files/getrf_shfl.patch
+++ b/.ci/magma/package_files/getrf_shfl.patch
@ -0,0 +1,15 @@
+diff --git a/src/zgetrf_batched.cpp b/src/zgetrf_batched.cpp
+index 24a65a90..884d9352 100644
+--- a/src/zgetrf_batched.cpp
+++ b/src/zgetrf_batched.cpp
+@@ -116,7 +116,9 @@ magma_zgetrf_batched(
+             return magma_zgetrf_batched_smallsq_noshfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
+         }
+         else{
+-            return magma_zgetrf_batched_smallsq_shfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
+            // magma_cgetrf_batched_smallsq_shfl is broken, therefore let's call noshfl version for arch < 700
+            // return magma_zgetrf_batched_smallsq_shfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
+            return magma_zgetrf_batched_smallsq_noshfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
+         }
+         #else
+         return magma_zgetrf_batched_smallsq_noshfl( m, dA_array, ldda, ipiv_array, info_array, batchCount, queue );
--- a/.ci/magma/package_files/magma-2.6.1.sha256
+++ b/.ci/magma/package_files/magma-2.6.1.sha256
@ -0,0 +1 @@
+6cd83808c6e8bc7a44028e05112b3ab4e579bcc73202ed14733f66661127e213  magma-2.6.1.tar.gz
--- a/.ci/magma/package_files/thread_queue.patch
+++ b/.ci/magma/package_files/thread_queue.patch
@ -0,0 +1,20 @@
+--- control/thread_queue.cpp	2016-08-30 06:37:49.000000000 -0700
+++ control/thread_queue.cpp	2016-10-10 19:47:28.911580965 -0700
+@@ -15,7 +15,7 @@
+ {
+     if ( err != 0 ) {
+         fprintf( stderr, "Error: %s (%d)\n", strerror(err), err );
+-        throw std::exception();
+        // throw std::exception();
+     }
+ }
+ 
+@@ -172,7 +172,7 @@
+     check( pthread_mutex_lock( &mutex ));
+     if ( quit_flag ) {
+         fprintf( stderr, "Error: push_task() called after quit()\n" );
+-        throw std::exception();
+        // throw std::exception();
+     }
+     q.push( task );
+     ntask += 1;
--- a/.ci/manywheel/build_common.sh
+++ b/.ci/manywheel/build_common.sh
@ -4,12 +4,9 @@
 set -ex
 SOURCE_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"

+source ${SOURCE_DIR}/set_desired_python.sh
+

-# Require only one python installation
-if [[ -z "$DESIRED_PYTHON" ]]; then
-    echo "Need to set DESIRED_PYTHON env variable"
-    exit 1
-fi
 if [[ -n "$BUILD_PYTHONLESS" && -z "$LIBTORCH_VARIANT" ]]; then
    echo "BUILD_PYTHONLESS is set, so need LIBTORCH_VARIANT to also be set"
    echo "LIBTORCH_VARIANT should be one of shared-with-deps shared-without-deps static-with-deps static-without-deps"
@ -80,27 +77,7 @@ if [[ -e /opt/openssl ]]; then
    export CMAKE_INCLUDE_PATH="/opt/openssl/include":$CMAKE_INCLUDE_PATH
 fi

-# If given a python version like 3.6m or 2.7mu, convert this to the format we
-# expect. The binary CI jobs pass in python versions like this; they also only
-# ever pass one python version, so we assume that DESIRED_PYTHON is not a list
-# in this case
-if [[ -n "$DESIRED_PYTHON" && $DESIRED_PYTHON =~ ([0-9].[0-9]+)t ]]; then
-    python_digits="$(echo $DESIRED_PYTHON | tr -cd [:digit:])"
-    py_majmin="${DESIRED_PYTHON}"
-    DESIRED_PYTHON="cp${python_digits}-cp${python_digits}t"
-elif [[ -n "$DESIRED_PYTHON" && "$DESIRED_PYTHON" != cp* ]]; then
-    python_nodot="$(echo $DESIRED_PYTHON | tr -d m.u)"
-    DESIRED_PYTHON="cp${python_nodot}-cp${python_nodot}"
-    if [[ ${python_nodot} -ge 310 ]]; then
-        py_majmin="${DESIRED_PYTHON:2:1}.${DESIRED_PYTHON:3:2}"
-    else
-        py_majmin="${DESIRED_PYTHON:2:1}.${DESIRED_PYTHON:3:1}"
-    fi
-fi

-pydir="/opt/python/$DESIRED_PYTHON"
-export PATH="$pydir/bin:$PATH"
-echo "Will build for Python version: ${DESIRED_PYTHON} with ${python_installation}"

 mkdir -p /tmp/$WHEELHOUSE_DIR

--- a/.ci/manywheel/build_cpu.sh
+++ b/.ci/manywheel/build_cpu.sh
@ -20,8 +20,8 @@ fi
 DIR_SUFFIX=cpu
 if [[ "$GPU_ARCH_TYPE" == "xpu" ]]; then
    DIR_SUFFIX=xpu
-    # Refer https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-5.html
-    source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh
+    # Refer https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
+    source /opt/intel/oneapi/compiler/latest/env/vars.sh
    source /opt/intel/oneapi/pti/latest/env/vars.sh
    export USE_STATIC_MKL=1
 fi
@ -63,28 +63,18 @@ DEPS_SONAME=(
 if [[ "$GPU_ARCH_TYPE" == "xpu" ]]; then
    echo "Bundling with xpu support package libs."
    DEPS_LIST+=(
-        "/opt/intel/oneapi/compiler/latest/lib/libsycl-preview.so.7"
        "/opt/intel/oneapi/compiler/latest/lib/libOpenCL.so.1"
-        "/opt/intel/oneapi/compiler/latest/lib/libxptifw.so"
        "/opt/intel/oneapi/compiler/latest/lib/libsvml.so"
        "/opt/intel/oneapi/compiler/latest/lib/libirng.so"
        "/opt/intel/oneapi/compiler/latest/lib/libimf.so"
        "/opt/intel/oneapi/compiler/latest/lib/libintlc.so.5"
-        "/opt/intel/oneapi/compiler/latest/lib/libpi_level_zero.so"
-        "/opt/intel/oneapi/pti/latest/lib/libpti_view.so.0.9"
-        "/opt/intel/oneapi/pti/latest/lib/libpti.so.0.9"
    )
    DEPS_SONAME+=(
-        "libsycl-preview.so.7"
        "libOpenCL.so.1"
-        "libxptifw.so"
        "libsvml.so"
        "libirng.so"
        "libimf.so"
        "libintlc.so.5"
-        "libpi_level_zero.so"
-        "libpti_view.so.0.9"
-        "libpti.so.0.9"
    )
 fi

--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -59,7 +59,7 @@ cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')

 TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
 case ${CUDA_VERSION} in
-    12.4)
+    12.6)
        if [[ "$GPU_ARCH_TYPE" = "cuda-aarch64" ]]; then
            TORCH_CUDA_ARCH_LIST="9.0"
        else
@ -67,6 +67,14 @@ case ${CUDA_VERSION} in
        fi
        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
        ;;
+    12.4)
+        if [[ "$GPU_ARCH_TYPE" = "cuda-aarch64" ]]; then
+            TORCH_CUDA_ARCH_LIST="9.0"
+        else
+            TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
+        fi
+        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
+        ;;
    12.1)
        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
@ -75,10 +83,6 @@ case ${CUDA_VERSION} in
        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};3.7;9.0"
        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
        ;;
-    11.[67])
-        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};3.7"
-        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
-        ;;
    *)
        echo "unknown cuda version $CUDA_VERSION"
        exit 1
@ -118,7 +122,9 @@ DEPS_SONAME=(
    "libgomp.so.1"
 )

-if [[ $USE_CUSPARSELT == "1" ]]; then
+# CUDA 11.8 builds have to ship libcusparseLt.so.0 inside the binary,
+# since nvidia-cusparselt-cu11 is not available on PyPI
+if [[ $USE_CUSPARSELT == "1" && $CUDA_VERSION == "11.8" ]]; then
        DEPS_SONAME+=(
            "libcusparseLt.so.0"
        )
@ -127,7 +133,7 @@ if [[ $USE_CUSPARSELT == "1" ]]; then
        )
 fi

-if [[ $CUDA_VERSION == "12.1" || $CUDA_VERSION == "12.4" ]]; then
+if [[ $CUDA_VERSION == "12.4" || $CUDA_VERSION == "12.6" ]]; then
    export USE_STATIC_CUDNN=0
    # Try parallelizing nvcc as well
    export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
@ -145,6 +151,7 @@ if [[ $CUDA_VERSION == "12.1" || $CUDA_VERSION == "12.4" ]]; then
            "/usr/local/cuda/lib64/libcudnn.so.9"
            "/usr/local/cuda/lib64/libcublas.so.12"
            "/usr/local/cuda/lib64/libcublasLt.so.12"
+            "/usr/local/cuda/lib64/libcusparseLt.so.0"
            "/usr/local/cuda/lib64/libcudart.so.12"
            "/usr/local/cuda/lib64/libnvToolsExt.so.1"
            "/usr/local/cuda/lib64/libnvrtc.so.12"
@ -161,6 +168,7 @@ if [[ $CUDA_VERSION == "12.1" || $CUDA_VERSION == "12.4" ]]; then
            "libcudnn.so.9"
            "libcublas.so.12"
            "libcublasLt.so.12"
+            "libcusparseLt.so.0"
            "libcudart.so.12"
            "libnvToolsExt.so.1"
            "libnvrtc.so.12"
@ -178,6 +186,7 @@ if [[ $CUDA_VERSION == "12.1" || $CUDA_VERSION == "12.4" ]]; then
            '$ORIGIN/../../nvidia/curand/lib'
            '$ORIGIN/../../nvidia/cusolver/lib'
            '$ORIGIN/../../nvidia/cusparse/lib'
+            '$ORIGIN/../../cusparselt/lib'
            '$ORIGIN/../../nvidia/nccl/lib'
            '$ORIGIN/../../nvidia/nvtx/lib'
        )
--- a/.ci/manywheel/set_desired_python.sh
+++ b/.ci/manywheel/set_desired_python.sh
@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+# Normalize DESIRED_PYTHON into a CPython tag such as cp310-cp310 and put the
+# matching /opt/python/<tag>/bin directory first on PATH.
+# NOTE: `exit 1` ends whichever shell runs this file, so when the file is
+# sourced an unset or empty DESIRED_PYTHON aborts the sourcing script too.
+
+# Require only one python installation
+if [[ -z "$DESIRED_PYTHON" ]]; then
+    echo "Need to set DESIRED_PYTHON env variable"
+    exit 1
+fi
+
+# If given a python version like 3.6m or 2.7mu, convert this to the format we
+# expect. The binary CI jobs pass in python versions like this; they also only
+# ever pass one python version, so we assume that DESIRED_PYTHON is not a list
+# in this case
+if [[ -n "$DESIRED_PYTHON" && $DESIRED_PYTHON =~ ([0-9].[0-9]+)t ]]; then
+    # Quote the character class so the shell cannot glob-expand it against
+    # single-character file names in the working directory.
+    python_digits="$(echo "$DESIRED_PYTHON" | tr -cd '[:digit:]')"
+    py_majmin="${DESIRED_PYTHON}"
+    DESIRED_PYTHON="cp${python_digits}-cp${python_digits}t"
+elif [[ -n "$DESIRED_PYTHON" && "$DESIRED_PYTHON" != cp* ]]; then
+    python_nodot="$(echo "$DESIRED_PYTHON" | tr -d m.u)"
+    DESIRED_PYTHON="cp${python_nodot}-cp${python_nodot}"
+    # 3.10 and newer have a two-digit minor version.
+    if [[ ${python_nodot} -ge 310 ]]; then
+        py_majmin="${DESIRED_PYTHON:2:1}.${DESIRED_PYTHON:3:2}"
+    else
+        py_majmin="${DESIRED_PYTHON:2:1}.${DESIRED_PYTHON:3:1}"
+    fi
+fi
+
+pydir="/opt/python/$DESIRED_PYTHON"
+export DESIRED_PYTHON_BIN_DIR="${pydir}/bin"
+export PATH="$DESIRED_PYTHON_BIN_DIR:$PATH"
+echo "Will build for Python version: ${DESIRED_PYTHON}"
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -81,14 +81,15 @@ function pip_install_whl() {

 function pip_install() {
  # retry 3 times
-  # old versions of pip don't have the "--progress-bar" flag
-  pip install --progress-bar off "$@" || pip install --progress-bar off "$@" || pip install --progress-bar off "$@" ||\
-  pip install "$@" || pip install "$@" || pip install "$@"
+  # Hold the command in a function-local array so it does not leak into the
+  # calling shell as a global variable and needs no word-splitting to run.
+  local -a pip_install_pkg=(python3 -m pip install --progress-bar off)
+  "${pip_install_pkg[@]}" "$@" || \
+    "${pip_install_pkg[@]}" "$@" || \
+    "${pip_install_pkg[@]}" "$@"
 }

 function pip_uninstall() {
  # uninstall 2 times
-  pip uninstall -y "$@" || pip uninstall -y "$@"
+  pip3 uninstall -y "$@" || pip3 uninstall -y "$@"
 }

 function get_exit_code() {
@ -104,9 +105,9 @@ function get_bazel() {
  # version of Bazelisk to fetch the platform specific version of
  # Bazel to use from .bazelversion.
  retry curl --location --output tools/bazel \
-    https://raw.githubusercontent.com/bazelbuild/bazelisk/v1.16.0/bazelisk.py
+    https://raw.githubusercontent.com/bazelbuild/bazelisk/v1.23.0/bazelisk.py
  shasum --algorithm=1 --check \
-    <(echo 'd4369c3d293814d3188019c9f7527a948972d9f8  tools/bazel')
+    <(echo '01df9cf7f08dd80d83979ed0d0666a99349ae93c  tools/bazel')
  chmod u+x tools/bazel
 }

@ -227,6 +228,9 @@ function checkout_install_torchbench() {
  git clone https://github.com/pytorch/benchmark torchbench
  pushd torchbench
  git checkout "$commit"
+  rm -rf torchbenchmark/models/*
+  git checkout -- torchbenchmark/models/__init__.py
+  git checkout -- torchbenchmark/models/hf_T5

  if [ "$1" ]; then
    python install.py --continue_on_fail models "$@"
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -1,4 +1,5 @@
 #!/bin/bash
+set -x

 # shellcheck disable=SC2034
 # shellcheck source=./macos-common.sh
@ -148,21 +149,153 @@ test_jit_hooks() {
  assert_git_not_dirty
 }

+torchbench_setup_macos() {
+  # Clone both domain libraries first, then build each one at its pinned commit.
+  git clone --recursive https://github.com/pytorch/vision torchvision
+  git clone --recursive https://github.com/pytorch/audio torchaudio
+
+  local domain
+  for domain in vision audio; do
+    pushd "torch${domain}"
+    git fetch
+    # The commit to check out is read from .github/ci_commit_pins/<domain>.txt
+    git checkout "$(cat "../.github/ci_commit_pins/${domain}.txt")"
+    git submodule update --init --recursive
+    python setup.py clean
+    python setup.py develop
+    popd
+  done
+
+  # Shellcheck doesn't like it when you pass no arguments to a function that can take args. See https://www.shellcheck.net/wiki/SC2120
+  # shellcheck disable=SC2119,SC2120
+  checkout_install_torchbench
+}
+
+conda_benchmark_deps() {
+  # Packages installed from the configured conda channels.
+  local -a benchmark_pkgs=(
+    astunparse numpy scipy ninja pyyaml setuptools cmake typing-extensions
+    requests protobuf numba cython scikit-learn
+  )
+  conda install -y "${benchmark_pkgs[@]}"
+  # librosa is requested from the conda-forge channel explicitly.
+  conda install -y -c conda-forge librosa
+}
+
+
+test_torchbench_perf() {
+  # Runs benchmarks/dynamo/torchbench.py (no --only filter) on the mps device
+  # with the eager backend, once for training and once for inference, and
+  # writes one CSV per mode into test/test-reports.
+  print_cmake_info
+
+  echo "Launching torchbench setup"
+  conda_benchmark_deps
+  torchbench_setup_macos
+
+  TEST_REPORTS_DIR=$(pwd)/test/test-reports
+  mkdir -p "$TEST_REPORTS_DIR"
+
+  echo "Setup complete, launching torchbench training performance run"
+  PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py --performance --backend eager --training --devices mps --output "$TEST_REPORTS_DIR/torchbench_training.csv"
+
+  echo "Launching torchbench inference performance run"
+  # Inference results go to their own file rather than into the training CSV.
+  PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py --performance --backend eager --inference --devices mps --output "$TEST_REPORTS_DIR/torchbench_inference.csv"
+
+  echo "Pytorch benchmark on mps device completed"
+}
+
+test_torchbench_smoketest() {
+  print_cmake_info
+
+  echo "Launching torchbench setup"
+  conda_benchmark_deps
+  # shellcheck disable=SC2119,SC2120
+  torchbench_setup_macos
+
+  TEST_REPORTS_DIR=$(pwd)/test/test-reports
+  mkdir -p "$TEST_REPORTS_DIR"
+  # Create both report files up front so they exist before any model runs.
+  touch "$TEST_REPORTS_DIR"/torchbench_training.csv
+  touch "$TEST_REPORTS_DIR"/torchbench_inference.csv
+
+  # Models exercised by the smoke test, in the order they are run.
+  local -a smoke_models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152)
+  local model
+
+  echo "Setup complete, launching torchbench training performance run"
+  for model in "${smoke_models[@]}"; do
+    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py --performance --only "$model" --backend eager --training --devices mps --output "$TEST_REPORTS_DIR/torchbench_training.csv"
+  done
+
+  echo "Launching torchbench inference performance run"
+  for model in "${smoke_models[@]}"; do
+    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py --performance --only "$model" --backend eager --inference --devices mps --output "$TEST_REPORTS_DIR/torchbench_inference.csv"
+  done
+
+  echo "Pytorch benchmark on mps device completed"
+}
+
+test_hf_perf() {
+  # Runs benchmarks/dynamo/huggingface.py on the mps device with the eager
+  # backend, once for training and once for inference, writing hf_training.csv
+  # and hf_inference.csv into test/test-reports.
+  print_cmake_info
+  TEST_REPORTS_DIR=$(pwd)/test/test-reports
+  mkdir -p "$TEST_REPORTS_DIR"
+  conda_benchmark_deps
+  torchbench_setup_macos
+
+  echo "Launching HuggingFace training perf run"
+  python "$(pwd)"/benchmarks/dynamo/huggingface.py --backend eager --device mps --performance --training --output="${TEST_REPORTS_DIR}"/hf_training.csv
+
+  echo "Launching HuggingFace inference perf run"
+  # Use --inference here so hf_inference.csv holds inference measurements.
+  python "$(pwd)"/benchmarks/dynamo/huggingface.py --backend eager --device mps --performance --inference --output="${TEST_REPORTS_DIR}"/hf_inference.csv
+
+  echo "HuggingFace benchmark on mps device completed"
+}
+
+test_timm_perf() {
+  # Runs benchmarks/dynamo/timm_models.py on the mps device with the eager
+  # backend, once for training and once for inference, writing
+  # timm_training.csv and timm_inference.csv into test/test-reports.
+  print_cmake_info
+  TEST_REPORTS_DIR=$(pwd)/test/test-reports
+  mkdir -p "$TEST_REPORTS_DIR"
+  conda_benchmark_deps
+  torchbench_setup_macos
+
+  echo "Launching timm training perf run"
+  python "$(pwd)"/benchmarks/dynamo/timm_models.py --backend eager --device mps --performance --training --output="${TEST_REPORTS_DIR}"/timm_training.csv
+
+  echo "Launching timm inference perf run"
+  # Use --inference here so timm_inference.csv holds inference measurements.
+  python "$(pwd)"/benchmarks/dynamo/timm_models.py --backend eager --device mps --performance --inference --output="${TEST_REPORTS_DIR}"/timm_inference.csv
+
+  echo "timm benchmark on mps device completed"
+}
+
 install_tlparse

-if [[ $NUM_TEST_SHARDS -gt 1 ]]; then
-  test_python_shard "${SHARD_NUMBER}"
-  if [[ "${SHARD_NUMBER}" == 1 ]]; then
+# The regular test suites run only when TEST_CONFIG contains "test_mps".
+# With more than one shard, shard 1 also runs the libtorch and custom script
+# op tests and shard 2 the JIT hook and custom backend tests; without
+# sharding every suite runs in this job.
+if [[ $TEST_CONFIG == *"test_mps"* ]]; then
+  if [[ $NUM_TEST_SHARDS -gt 1 ]]; then
+    test_python_shard "${SHARD_NUMBER}"
+    if [[ "${SHARD_NUMBER}" == 1 ]]; then
+      test_libtorch
+      test_custom_script_ops
+    elif [[ "${SHARD_NUMBER}" == 2 ]]; then
+      test_jit_hooks
+      test_custom_backend
+    fi
+  else
+    test_python_all
    test_libtorch
    test_custom_script_ops
-  elif [[ "${SHARD_NUMBER}" == 2 ]]; then
    test_jit_hooks
    test_custom_backend
  fi
-else
-  test_python_all
-  test_libtorch
-  test_custom_script_ops
-  test_jit_hooks
-  test_custom_backend
+fi
+
+# Benchmark configurations, matched as substrings of TEST_CONFIG: perf_all
+# runs the torchbench, HuggingFace and timm perf functions in turn, the other
+# perf_* values run a single one, and perf_smoketest runs
+# test_torchbench_smoketest. Only the first matching branch is taken.
+if [[ $TEST_CONFIG == *"perf_all"* ]]; then
+  test_torchbench_perf
+  test_hf_perf
+  test_timm_perf
+elif [[ $TEST_CONFIG == *"perf_torchbench"* ]]; then
+  test_torchbench_perf
+elif [[ $TEST_CONFIG == *"perf_hf"* ]]; then
+  test_hf_perf
+elif [[ $TEST_CONFIG == *"perf_timm"* ]]; then
+  test_timm_perf
+elif [[ $TEST_CONFIG == *"perf_smoketest"* ]]; then
+  test_torchbench_smoketest
 fi
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -169,9 +169,13 @@ fi

 if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  # Source Intel oneAPI envrioment script to enable xpu runtime related libraries
-  # refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-5.html
+  # refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
  # shellcheck disable=SC1091
  source /opt/intel/oneapi/compiler/latest/env/vars.sh
+  if [ -f /opt/intel/oneapi/umf/latest/env/vars.sh ]; then
+    # shellcheck disable=SC1091
+    source /opt/intel/oneapi/umf/latest/env/vars.sh
+  fi
  # Check XPU status before testing
  xpu-smi discovery
 fi
@ -496,6 +500,7 @@ test_perf_for_dashboard() {
      elif [[ "$target" == "accuracy" ]]; then
        target_flag+=( --no-translation-validation)
      fi
+      target_flag+=( --only hf_T5)

      if [[ "$DASHBOARD_TAG" == *default-true* ]]; then
        $TASKSET python "benchmarks/dynamo/$suite.py" \
@ -1197,7 +1202,7 @@ EOF
  git reset --hard "${SHA_TO_COMPARE}"
  git submodule sync && git submodule update --init --recursive
  echo "::group::Installing Torch From Base Commit"
-  pip install -r requirements.txt
+  pip3 install -r requirements.txt
  # shellcheck source=./common-build.sh
  source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"
  python setup.py bdist_wheel --bdist-dir="base_bdist_tmp" --dist-dir="base_dist"
@ -1354,10 +1359,11 @@ test_executorch() {
  export EXECUTORCH_BUILD_PYBIND=ON
  export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

+  # For llama3
+  bash examples/models/llama3_2_vision/install_requirements.sh
  # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
  # from the PR
-  # shellcheck disable=SC1091
-  source .ci/scripts/setup-linux.sh cmake
+  bash .ci/scripts/setup-linux.sh cmake

  echo "Run ExecuTorch unit tests"
  pytest -v -n auto
@ -1405,7 +1411,11 @@ if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-baze
  (cd test && python -c "import torch; print(torch.__config__.show())")
  (cd test && python -c "import torch; print(torch.__config__.parallel_info())")
 fi
-if [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then
+if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then
+  # Install numpy-2.0.2 and test inductor tracing
+  python -mpip install --pre numpy==2.0.2
+  python test/run_test.py --include dynamo/test_unspec.py
+elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then
  test_linux_aarch64
 elif [[ "${TEST_CONFIG}" == *backward* ]]; then
  test_forward_backward_compatibility
--- a/.ci/pytorch/win-test-helpers/installation-helpers/install_xpu.bat
+++ b/.ci/pytorch/win-test-helpers/installation-helpers/install_xpu.bat
@ -1,6 +1,6 @@
@echo on
 REM Description: Install Intel Support Packages on Windows
-REM BKM reference: https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-5.html
+REM BKM reference: https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html

 set XPU_INSTALL_MODE=%~1
 if "%XPU_INSTALL_MODE%"=="" goto xpu_bundle_install_start
@ -28,15 +28,28 @@ if "%XPU_INSTALL_MODE%"=="driver" goto xpu_install_end

 set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI
 set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d1a91e2-e8b8-40a5-8c7f-5db768a6a60c/w_intel-for-pytorch-gpu-dev_p_0.5.3.37_offline.exe
-set XPU_PTI_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d1a91e2-e8b8-40a5-8c7f-5db768a6a60c/w_intel-pti-dev_p_0.9.0.37_offline.exe
-set XPU_BUNDLE_VERSION=0.5.3+31
-set XPU_PTI_VERSION=0.9.0+36
 set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.intel-for-pytorch-gpu-dev.product
-set XPU_PTI_PRODUCT_NAME=intel.oneapi.win.intel-pti-dev.product
+set XPU_BUNDLE_VERSION=0.5.3+31
 set XPU_BUNDLE_INSTALLED=0
-set XPU_PTI_INSTALLED=0
 set XPU_BUNDLE_UNINSTALL=0
-set XPU_PTI_UNINSTALL=0
+set XPU_EXTRA_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d1a91e2-e8b8-40a5-8c7f-5db768a6a60c/w_intel-pti-dev_p_0.9.0.37_offline.exe
+set XPU_EXTRA_PRODUCT_NAME=intel.oneapi.win.intel-pti-dev.product
+set XPU_EXTRA_VERSION=0.9.0+36
+set XPU_EXTRA_INSTALLED=0
+set XPU_EXTRA_UNINSTALL=0
+
+if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.0] (
+    set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/efc86abd-cb77-452e-a03f-a741895b8ece/intel-deep-learning-essentials-2025.0.0.336_offline.exe
+    set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product
+    set XPU_BUNDLE_VERSION=2025.0.0+335
+    set XPU_BUNDLE_INSTALLED=0
+    set XPU_BUNDLE_UNINSTALL=0
+    set XPU_EXTRA_URL=NULL
+    set XPU_EXTRA_PRODUCT_NAME=intel.oneapi.win.compiler.product
+    set XPU_EXTRA_VERSION=2025.0.1+1226
+    set XPU_EXTRA_INSTALLED=0
+    set XPU_EXTRA_UNINSTALL=0
+)

 :: Check if XPU bundle is target version or already installed
 if exist "%XPU_BUNDLE_PARENT_DIR%\Installer\installer.exe" goto xpu_bundle_ver_check
@ -51,25 +64,34 @@ for /f "tokens=1,2" %%a in (xpu_bundle_installed_ver.log) do (
        echo %%a Installed Version: %%b
        set XPU_BUNDLE_INSTALLED=1
        if not "%XPU_BUNDLE_VERSION%"=="%%b" (
-            start /wait "Installer Title" "%XPU_BUNDLE_PARENT_DIR%\Installer\installer.exe" --action=remove --eula=accept --silent --product-id %XPU_BUNDLE_PRODUCT_NAME% --product-ver %%b --log-dir uninstall_bundle
+            start /wait "Installer Title" "%XPU_BUNDLE_PARENT_DIR%\Installer\installer.exe" --action=remove --eula=accept --silent --product-id %%a --product-ver %%b --log-dir uninstall_bundle
            set XPU_BUNDLE_UNINSTALL=1
        )
    )
-    if "%%a"=="%XPU_PTI_PRODUCT_NAME%" (
+    if "%%a"=="%XPU_EXTRA_PRODUCT_NAME%" (
        echo %%a Installed Version: %%b
-        set XPU_PTI_INSTALLED=1
-        if not "%XPU_PTI_VERSION%"=="%%b" (
-            start /wait "Installer Title" "%XPU_BUNDLE_PARENT_DIR%\Installer\installer.exe" --action=remove --eula=accept --silent --product-id %XPU_PTI_PRODUCT_NAME% --product-ver %%b --log-dir uninstall_bundle
-            set XPU_PTI_UNINSTALL=1
+        set XPU_EXTRA_INSTALLED=1
+        if not "%XPU_EXTRA_VERSION%"=="%%b" (
+            start /wait "Installer Title" "%XPU_BUNDLE_PARENT_DIR%\Installer\installer.exe" --action=remove --eula=accept --silent --product-id %%a --product-ver %%b --log-dir uninstall_bundle
+            set XPU_EXTRA_UNINSTALL=1
        )
    )
+    if not "%%b" == "Version" if not [%%b]==[] if not "%%a"=="%XPU_BUNDLE_PRODUCT_NAME%" if not "%%a"=="%XPU_EXTRA_PRODUCT_NAME%" (
+        echo "Uninstalling...."
+        start /wait "Installer Title" "%XPU_BUNDLE_PARENT_DIR%\Installer\installer.exe" --action=remove --eula=accept --silent --product-id %%a --product-ver %%b --log-dir uninstall_bundle
+    )
 )
 if errorlevel 1 exit /b 1
 if exist xpu_bundle_installed_ver.log del xpu_bundle_installed_ver.log
+if exist uninstall_bundle rmdir /s /q uninstall_bundle
 if "%XPU_BUNDLE_INSTALLED%"=="0" goto xpu_bundle_install
 if "%XPU_BUNDLE_UNINSTALL%"=="1" goto xpu_bundle_install
-if "%XPU_PTI_INSTALLED%"=="0" goto xpu_pti_install
-if "%XPU_PTI_UNINSTALL%"=="1" goto xpu_pti_install
+
+:xpu_extra_check
+
+if "%XPU_EXTRA_URL%"=="NULL" goto xpu_install_end
+if "%XPU_EXTRA_INSTALLED%"=="0" goto xpu_extra_install
+if "%XPU_EXTRA_UNINSTALL%"=="1" goto xpu_extra_install
 goto xpu_install_end

 :xpu_bundle_install
@ -79,13 +101,14 @@ echo "XPU Bundle installing..."
 start /wait "Intel Pytorch Bundle Installer" "xpu_bundle.exe" --action=install --eula=accept --silent --log-dir install_bundle
 if errorlevel 1 exit /b 1
 del xpu_bundle.exe
+goto xpu_extra_check

-:xpu_pti_install
+:xpu_extra_install

-curl -o xpu_pti.exe --retry 3 --retry-all-errors -k %XPU_PTI_URL%
-echo "XPU PTI installing..."
-start /wait "Intel PTI Installer" "xpu_pti.exe" --action=install --eula=accept --silent --log-dir install_bundle
+curl -o xpu_extra.exe --retry 3 --retry-all-errors -k %XPU_EXTRA_URL%
+echo "Intel XPU EXTRA installing..."
+start /wait "Intel XPU EXTRA Installer" "xpu_extra.exe" --action=install --eula=accept --silent --log-dir install_bundle
 if errorlevel 1 exit /b 1
-del xpu_pti.exe
+del xpu_extra.exe

 :xpu_install_end
--- a/.circleci/scripts/binary_ios_upload.sh
+++ b/.circleci/scripts/binary_ios_upload.sh
@ -14,7 +14,7 @@ mkdir -p ${ZIP_DIR}/src
 cp -R ${ARTIFACTS_DIR}/arm64/include ${ZIP_DIR}/install/
 # build a FAT binary
 cd ${ZIP_DIR}/install/lib
-target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a)
+target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a libmicrokernels-prod.a)
 for lib in ${target_libs[*]}
 do
    if [ -f "${ARTIFACTS_DIR}/x86_64/lib/${lib}" ] && [ -f "${ARTIFACTS_DIR}/arm64/lib/${lib}" ]; then
--- a/.circleci/scripts/binary_windows_build.sh
+++ b/.circleci/scripts/binary_windows_build.sh
@ -13,6 +13,7 @@ export VC_YEAR=2019
 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
    export VC_YEAR=2022
    export USE_SCCACHE=0
+    export XPU_VERSION=2025.0
 fi

 echo "Free space on filesystem before build:"
--- a/.clang-format
+++ b/.clang-format
@ -101,9 +101,17 @@ SpacesInParentheses: false
 SpacesInSquareBrackets: false
 Standard:        c++17
 StatementMacros:
+  - C10_DEFINE_bool
+  - C10_DEFINE_int
+  - C10_DEFINE_int32
+  - C10_DEFINE_int64
+  - C10_DEFINE_string
+  - DEFINE_BINARY
  - PyObject_HEAD
  - PyObject_VAR_HEAD
  - PyException_HEAD
+  - TORCH_DECLARE_bool
+
 TabWidth:        8
 UseTab:          Never
 ---
--- a/.clang-tidy
+++ b/.clang-tidy
@ -29,19 +29,19 @@ cppcoreguidelines-*,
 -cppcoreguidelines-pro-type-static-cast-downcast,
 -cppcoreguidelines-pro-type-union-access,
 -cppcoreguidelines-pro-type-vararg,
-cppcoreguidelines-special-member-functions,
 -cppcoreguidelines-non-private-member-variables-in-classes,
 -facebook-hte-RelativeInclude,
 hicpp-exception-baseclass,
 hicpp-avoid-goto,
 misc-*,
+-misc-confusable-identifiers,
 -misc-const-correctness,
 -misc-include-cleaner,
 -misc-use-anonymous-namespace,
 -misc-unused-parameters,
 -misc-no-recursion,
 -misc-non-private-member-variables-in-classes,
-misc-confusable-identifiers,
+-misc-unused-using-decls,
 modernize-*,
 -modernize-macro-to-enum,
 -modernize-return-braced-init-list,
@ -63,5 +63,7 @@ readability-string-compare,
 HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
 WarningsAsErrors: '*'
 CheckOptions:
-  misc-header-include-cycle.IgnoredFilesList: 'format.h;ivalue.h;custom_class.h;Dict.h;List.h'
+  cppcoreguidelines-special-member-functions.AllowSoleDefaultDtor: true
+  cppcoreguidelines-special-member-functions.AllowImplicitlyDeletedCopyOrMove: true
+  misc-header-include-cycle.IgnoredFilesList: 'format.h;ivalue.h;custom_class.h;Dict.h;List.h;IListRef.h'
 ...
--- a/.gitattributes
+++ b/.gitattributes
@ -5,3 +5,4 @@
 .github/scripts/gql_mocks.json linguist-generated=true
 third_party/LICENSES_BUNDLED.txt linguist-generated=true
 tools/build/bazel/requirements.txt linguist-generated=true
+torch/csrc/utils/generated_serialization_types.h linguist-generated=true
--- a/.github/ISSUE_TEMPLATE/ci-sev.md
+++ b/.github/ISSUE_TEMPLATE/ci-sev.md
@ -5,8 +5,7 @@ about: Tracking incidents for PyTorch's CI infra.

 > NOTE: Remember to label this issue with "`ci: sev`"

- <!-- uncomment the below line if you don't want this SEV to block merges -->
- <!--  **MERGE BLOCKING** -->
+ <!-- Add the `merge blocking` label to this issue to prevent PRs from being merged while this issue is open -->

 ## Current Status
 *Status could be: preemptive, ongoing, mitigated, closed. Also tell people if they need to take action to fix it (i.e. rebase)*.
--- a/.github/ISSUE_TEMPLATE/pt2-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/pt2-bug-report.yml
@ -14,7 +14,7 @@ body:

        - Ensure rtol/atol are at default tolerances

-        - Dont compare indices of max/min etc, because that avoids the above requirement
+        - Don't compare indices of max/min etc, because that avoids the above requirement

        - If comparing eager and torch.compile at fp16/bf16, you should use fp32 as baseline

@ -25,6 +25,14 @@ body:
      label: 🐛 Describe the bug
      description: |
        Please provide a clear and concise description of what the bug is.
+
+        See https://pytorch.org/docs/main/torch.compiler_troubleshooting.html#reporting-issues
+        for guidance on what to additionally include. In particular, consider including:
+
+        - The `tlparse` for your program
+        - Ablation - which `torch.compile` backend/mode/settings cause the bug
+        - A minimal reproducer
+
      placeholder: |
        A clear and concise description of what the bug is.
    validations:
@ -39,25 +47,7 @@ body:
        Error...
    validations:
      required: false
-  - type: textarea
-    attributes:
-      label: Minified repro
-      description: |
-        Please run the minifier on your example and paste the minified code below
-        Learn more here https://pytorch.org/docs/main/torch.compiler_troubleshooting.html
-      placeholder: |
-        env TORCHDYNAMO_REPRO_AFTER="aot" python your_model.py
-        or
-        env TORCHDYNAMO_REPRO_AFTER="dynamo" python your_model.py

-        import torch
-        ...
-
-        # torch version: 2.0.....
-
-        class Repro(torch.nn.Module)
-    validations:
-      required: false
  - type: textarea
    attributes:
      label: Versions
--- a/.github/actions/build-android/action.yml
+++ b/.github/actions/build-android/action.yml
@ -48,8 +48,6 @@ runs:
      run: |
        # detached container should get cleaned up by teardown_ec2_linux
        set -exo pipefail
-        # Fetch aws credential from IMDs
-        eval "$(python3 .github/scripts/get_aws_session_tokens.py)"
        export container_name
        container_name=$(docker run \
          -e BUILD_ENVIRONMENT \
--- a/.github/actions/download-build-artifacts/action.yml
+++ b/.github/actions/download-build-artifacts/action.yml
@ -26,7 +26,7 @@ runs:

    - name: Download PyTorch Build Artifacts from GHA
      if: ${{ inputs.use-gha }}
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
      with:
        name: ${{ inputs.name }}

--- a/.github/actions/download-td-artifacts/action.yml
+++ b/.github/actions/download-td-artifacts/action.yml
@ -18,7 +18,7 @@ runs:

    - name: Download TD Artifacts from GHA
      if: inputs.use-gha
-      uses: actions/download-artifact@v3
+      uses: actions/download-artifact@v4
      with:
        name: td_results.json

--- a/.github/actions/linux-test/action.yml
+++ b/.github/actions/linux-test/action.yml
@ -47,7 +47,14 @@ inputs:
  GITHUB_TOKEN:
    description: GitHub token
    required: true
-
+  disable-monitor:
+    description: |
+      [Experimental] Disable utilization monitoring for tests.
+      Currently, by default we disable the monitor job and only look for specific tests,
+      since we are investigating the behaviour of the monitor script with different tests.
+    required: false
+    type: boolean
+    default: true
 #env:
 #  GIT_DEFAULT_BRANCH: ${{ inputs.default_branch }}

@ -115,6 +122,7 @@ runs:

    - name: Start monitoring script
      id: monitor-script
+      if: ${{ !inputs.disable-monitor }}
      shell: bash
      continue-on-error: true
      run: |
@ -289,7 +297,7 @@ runs:
        cat test/**/*_toprint.log || true

    - name: Stop monitoring script
-      if: always() && steps.monitor-script.outputs.monitor-script-pid
+      if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
      shell: bash
      continue-on-error: true
      env:
--- a/.github/actions/upload-sccache-stats/action.yml
+++ b/.github/actions/upload-sccache-stats/action.yml
@ -0,0 +1,39 @@
+# Composite action that publishes the sccache statistics of a build.
+# Uploads the sccache stats to artifacts, and also as benchmark data when on an
+# aws linux or windows machine.  Does not currently handle mac builds.
+name: Upload sccache stats
+
+description: Upload sccache stats to artifacts
+
+inputs:
+  # Forwarded unchanged to the "Upload sccache stats as benchmark" step below.
+  github-token:
+    description: GITHUB_TOKEN
+    required: true
+  # Exposed to the "Format sccache stats" step as the BUILD_TIME environment
+  # variable. No `required` key is set for this input.
+  build-time:
+    description: Build time in seconds
+
+runs:
+  using: composite
+  steps:
+    # Store every file matching sccache-stats-*.json in S3 under a prefix built
+    # from the repository name, run id and run attempt, kept for 14 days.
+    # Missing files are configured to produce a warning (if-no-files-found: warn).
+    - name: Upload sccache to s3
+      uses: seemethere/upload-artifact-s3@v5
+      with:
+        s3-prefix: |
+          ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
+        retention-days: 14
+        if-no-files-found: warn
+        path: sccache-stats-*.json
+
+    # Run the repository's converter module with the build time passed through
+    # the BUILD_TIME environment variable.
+    # NOTE(review): presumably the module writes benchmark-format files into
+    # test/test-reports, which the next step uploads - confirm against the module.
+    - name: Format sccache stats
+      shell: bash
+      run: |
+        python3 -m tools.stats.sccache_stats_to_benchmark_format
+      env:
+        BUILD_TIME: ${{ inputs.build-time }}
+
+    # Upload the contents of test/test-reports as benchmark results using schema
+    # version v3 with dry-run disabled. The action is referenced at @main, i.e.
+    # it tracks the main branch of pytorch/test-infra instead of a pinned version.
+    - name: Upload sccache stats as benchmark
+      uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+      with:
+        benchmark-results-dir: test/test-reports
+        dry-run: false
+        schema-version: v3
+        github-token: ${{ inputs.github-token }}
--- a/.github/actions/upload-test-artifacts/action.yml
+++ b/.github/actions/upload-test-artifacts/action.yml
@ -147,7 +147,7 @@ runs:

    # GHA upload
    - name: Store Test Downloaded JSONs on Github
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
      if: inputs.use-gha
      continue-on-error: true
      with:
@ -158,7 +158,7 @@ runs:
        path: test/**/*.json

    - name: Store Test Reports on Github
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
      if: inputs.use-gha
      continue-on-error: true
      with:
@ -172,7 +172,7 @@ runs:
          test/**/*.csv

    - name: Store Usage Logs on Github
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
      if: inputs.use-gha
      continue-on-error: true
      with:
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-fa44bdab1fe49bab58389e7b6a33061ffced9bc7
+332760d4b300f00a0d862e3cfe1495db3b1a14f9
--- a/.github/ci_commit_pins/torchbench.txt
+++ b/.github/ci_commit_pins/torchbench.txt
@ -1 +1 @@
-e522b45cd4535b9dfe067aa68d7315755df38f48
+766a5e3a189384659fd35a68c3b17b88c761aaac
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-2eb4a60ed14a38260b85b0c765161f0ce45be6d1
+2ec22641e390cda25ec7c61fcbce07507727d584
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -35,8 +35,11 @@
 - torch/distributed/_tensor/**
 - torch/distributed/fsdp/**
 - torch/csrc/inductor/**
+- torch/csrc/dynamo/**
 - test/cpp/aoti_abi_check/**
 - test/cpp/aoti_inference/**
+- test/inductor/**
+- test/dynamo/**

 "module: cpu":
 - aten/src/ATen/cpu/**
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@ -329,6 +329,7 @@

 - name: DCP
  patterns:
+  - docs/source/distributed.checkpoint.rst
  - torch/distributed/checkpoint/**
  approved_by:
  - LucasLLC
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -2,7 +2,6 @@ tracking_issue: 24422
 ciflow_tracking_issue: 64124
 ciflow_push_tags:
 - ciflow/binaries
- ciflow/binaries_conda
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
 - ciflow/inductor
--- a/.github/requirements-gha-cache.txt
+++ b/.github/requirements-gha-cache.txt
@ -12,4 +12,3 @@ nvidia-ml-py==11.525.84
 pyyaml==6.0
 requests==2.32.2
 rich==10.9.0
-rockset==1.0.3
--- a/.github/requirements/pip-requirements-macOS.txt
+++ b/.github/requirements/pip-requirements-macOS.txt
@ -1,7 +1,7 @@
 boto3==1.35.42
 hypothesis==6.56.4
 expecttest==0.2.1
-fbscribelogger==0.1.6
+fbscribelogger==0.1.7
 librosa>=0.6.2
 mpmath==1.3.0
 networkx==2.8.7
@ -24,7 +24,6 @@ unittest-xml-reporting<=3.2.0,>=2.0.0
 xdoctest==1.1.0
 filelock==3.6.0
 pytest-cpp==2.3.0
-rockset==1.0.3
 z3-solver==4.12.2.0
 tensorboard==2.13.0
 optree==0.13.0
--- a/.github/scripts/check_labels.py
+++ b/.github/scripts/check_labels.py
@ -45,15 +45,15 @@ def main() -> None:

    try:
        if not has_required_labels(pr):
-            print(LABEL_ERR_MSG)
+            print(LABEL_ERR_MSG, flush=True)
            add_label_err_comment(pr)
            if args.exit_non_zero:
-                sys.exit(1)
+                raise RuntimeError("PR does not have required labels")
        else:
            delete_all_label_err_comments(pr)
    except Exception as e:
        if args.exit_non_zero:
-            sys.exit(1)
+            raise RuntimeError(f"Error checking labels: {e}") from e

    sys.exit(0)

--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -15,13 +15,13 @@ import os
 from typing import Dict, List, Optional, Tuple


-CUDA_ARCHES = ["11.8", "12.1", "12.4"]
+CUDA_ARCHES = ["11.8", "12.4", "12.6"]


-CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.1"}
+CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.4": "12.4.1", "12.6": "12.6.2"}


-CUDA_ARCHES_CUDNN_VERSION = {"11.8": "9", "12.1": "9", "12.4": "9"}
+CUDA_ARCHES_CUDNN_VERSION = {"11.8": "9", "12.4": "9", "12.6": "9"}


 ROCM_ARCHES = ["6.1", "6.2"]
@ -54,19 +54,6 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
-    "12.1": (
-        "nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "  # noqa: B950
-        "nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
-    ),
    "12.4": (
        "nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@ -82,6 +69,21 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
+    "12.6": (
+        "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cublas-cu12==12.6.3.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvjitlink-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64'"
+    ),
 }


@ -155,7 +157,7 @@ DEFAULT_TAG = os.getenv("RELEASE_VERSION_TAG", "main")

 WHEEL_CONTAINER_IMAGES = {
    **{
-        gpu_arch: f"pytorch/manylinux-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
+        gpu_arch: f"pytorch/manylinux2_28-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
        for gpu_arch in CUDA_ARCHES
    },
    **{
@ -163,20 +165,13 @@ WHEEL_CONTAINER_IMAGES = {
        for gpu_arch in ROCM_ARCHES
    },
    "xpu": f"pytorch/manylinux2_28-builder:xpu-{DEFAULT_TAG}",
-    "cpu": f"pytorch/manylinux-builder:cpu-{DEFAULT_TAG}",
+    "cpu": f"pytorch/manylinux2_28-builder:cpu-{DEFAULT_TAG}",
    "cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}",
-    "cpu-aarch64": f"pytorch/manylinuxaarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
+    "cpu-aarch64": f"pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
    "cpu-s390x": f"pytorch/manylinuxs390x-builder:cpu-s390x-{DEFAULT_TAG}",
    "cuda-aarch64": f"pytorch/manylinuxaarch64-builder:cuda12.4-{DEFAULT_TAG}",
 }

-CONDA_CONTAINER_IMAGES = {
-    **{
-        gpu_arch: f"pytorch/conda-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
-        for gpu_arch in CUDA_ARCHES
-    },
-    "cpu": f"pytorch/conda-builder:cpu-{DEFAULT_TAG}",
-}

 PRE_CXX11_ABI = "pre-cxx11"
 CXX11_ABI = "cxx11-abi"
@ -236,35 +231,6 @@ def list_without(in_list: List[str], without: List[str]) -> List[str]:
    return [item for item in in_list if item not in without]


-def generate_conda_matrix(os: str) -> List[Dict[str, str]]:
-    ret: List[Dict[str, str]] = []
-    arches = ["cpu"]
-    python_versions = FULL_PYTHON_VERSIONS
-    if os == "linux" or os == "windows":
-        arches += CUDA_ARCHES
-    for python_version in python_versions:
-        # We don't currently build conda packages for rocm
-        for arch_version in arches:
-            gpu_arch_type = arch_type(arch_version)
-            gpu_arch_version = "" if arch_version == "cpu" else arch_version
-            ret.append(
-                {
-                    "python_version": python_version,
-                    "gpu_arch_type": gpu_arch_type,
-                    "gpu_arch_version": gpu_arch_version,
-                    "desired_cuda": translate_desired_cuda(
-                        gpu_arch_type, gpu_arch_version
-                    ),
-                    "container_image": CONDA_CONTAINER_IMAGES[arch_version],
-                    "package_type": "conda",
-                    "build_name": f"conda-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
-                        ".", "_"
-                    ),
-                }
-            )
-    return ret
-
-
 def generate_libtorch_matrix(
    os: str,
    abi_version: str,
@ -278,7 +244,9 @@ def generate_libtorch_matrix(
            arches += ROCM_ARCHES
        elif os == "windows":
            arches += CUDA_ARCHES
-
+            # skip CUDA 12.6 builds on Windows
+            if "12.6" in arches:
+                arches.remove("12.6")
    if libtorch_variants is None:
        libtorch_variants = [
            "shared-with-deps",
@ -343,6 +311,9 @@ def generate_wheels_matrix(
            arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
        elif os == "windows":
            arches += CUDA_ARCHES + XPU_ARCHES
+            # skip CUDA 12.6 builds on Windows
+            if "12.6" in arches:
+                arches.remove("12.6")
        elif os == "linux-aarch64":
            # Only want the one arch as the CPU type is different and
            # uses different build/test scripts
@ -370,29 +341,39 @@ def generate_wheels_matrix(
            # TODO: Enable python 3.13 on rocm
            if (
                gpu_arch_type == "rocm"
-                or os not in ["linux", "linux-s390x", "macos-arm64"]
+                or os
+                not in [
+                    "linux",
+                    "linux-s390x",
+                    "linux-aarch64",
+                    "macos-arm64",
+                    "windows",
+                ]
            ) and python_version in ["3.13", "3.13t"]:
                continue

-            # TODO: Enable python 3.13t on xpu and cpu-s390x or MacOS
+            # TODO: Enable python 3.13t on xpu and cpu-s390x or MacOS or Windows
            if (
-                gpu_arch_type in ["xpu", "cpu-s390x"] or os == "macos-arm64"
+                gpu_arch_type in ["xpu", "cpu-s390x"]
+                or os == "macos-arm64"
+                or os == "linux-aarch64"
+                or os == "windows"
            ) and python_version == "3.13t":
                continue

            if use_split_build and (
-                arch_version not in ["12.4", "12.1", "11.8", "cpu"] or os != "linux"
+                arch_version not in ["12.6", "12.4", "11.8", "cpu"] or os != "linux"
            ):
                raise RuntimeError(
-                    "Split build is only supported on linux with cuda 12.4, 12.1, 11.8, and cpu.\n"
+                    "Split build is only supported on linux with cuda 12.6, 12.4, 11.8, and cpu.\n"
                    f"Currently attempting to build on arch version {arch_version} and os {os}.\n"
                    "Please modify the matrix generation to exclude this combination."
                )

-            # 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
+            # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install

            if (
-                arch_version in ["12.4", "12.1", "11.8"]
+                arch_version in ["12.6", "12.4", "11.8"]
                and os == "linux"
                or arch_version == "cuda-aarch64"
            ):
@ -420,8 +401,8 @@ def generate_wheels_matrix(
                        ),
                    }
                )
-                # Special build building to use on Colab. Python 3.11 for 12.1 CUDA
-                if python_version == "3.11" and arch_version == "12.1":
+                # Special build building to use on Colab. Python 3.11 for 12.4 CUDA
+                if python_version == "3.11" and arch_version == "12.4":
                    ret.append(
                        {
                            "python_version": python_version,
@ -451,7 +432,9 @@ def generate_wheels_matrix(
                        ),
                        "use_split_build": "True" if use_split_build else "False",
                        "devtoolset": (
-                            "cxx11-abi" if arch_version == "cpu-cxx11-abi" else ""
+                            "cxx11-abi"
+                            if arch_version in ["cpu-cxx11-abi", "cpu-aarch64"]
+                            else ""
                        ),
                        "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
                        "package_type": package_type,
@ -469,6 +452,6 @@ def generate_wheels_matrix(
    return ret


+validate_nccl_dep_consistency("12.6")
 validate_nccl_dep_consistency("12.4")
-validate_nccl_dep_consistency("12.1")
 validate_nccl_dep_consistency("11.8")
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@ -20,7 +20,6 @@ LABEL_CIFLOW_UNSTABLE = "ciflow/unstable"
 LABEL_CIFLOW_BINARIES = "ciflow/binaries"
 LABEL_CIFLOW_PERIODIC = "ciflow/periodic"
 LABEL_CIFLOW_BINARIES_LIBTORCH = "ciflow/binaries_libtorch"
-LABEL_CIFLOW_BINARIES_CONDA = "ciflow/binaries_conda"
 LABEL_CIFLOW_BINARIES_WHEEL = "ciflow/binaries_wheel"


@ -129,17 +128,6 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
    #     ),
    #     use_split_build=True,
    # ),
-    BinaryBuildWorkflow(
-        os=OperatingSystem.LINUX,
-        package_type="conda",
-        build_configs=generate_binary_build_matrix.generate_conda_matrix(
-            OperatingSystem.LINUX
-        ),
-        ciflow_config=CIFlowConfig(
-            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA},
-            isolated_workflow=True,
-        ),
-    ),
    BinaryBuildWorkflow(
        os=OperatingSystem.LINUX,
        package_type="libtorch",
@ -176,7 +164,7 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
        package_type="manywheel",
        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
            OperatingSystem.LINUX,
-            arches=["11.8", "12.1", "12.4"],
+            arches=["11.8", "12.4", "12.6"],
            python_versions=["3.9"],
        ),
        branches="main",
@ -235,17 +223,6 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
            isolated_workflow=True,
        ),
    ),
-    BinaryBuildWorkflow(
-        os=OperatingSystem.WINDOWS,
-        package_type="conda",
-        build_configs=generate_binary_build_matrix.generate_conda_matrix(
-            OperatingSystem.WINDOWS
-        ),
-        ciflow_config=CIFlowConfig(
-            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA},
-            isolated_workflow=True,
-        ),
-    ),
    BinaryBuildWorkflow(
        os=OperatingSystem.WINDOWS,
        package_type="libtorch",
@ -339,19 +316,6 @@ MACOS_BINARY_BUILD_WORKFLOWS = [
            isolated_workflow=True,
        ),
    ),
-    BinaryBuildWorkflow(
-        os=OperatingSystem.MACOS_ARM64,
-        package_type="conda",
-        cross_compile_arm64=False,
-        macos_runner="macos-14-xlarge",
-        build_configs=generate_binary_build_matrix.generate_conda_matrix(
-            OperatingSystem.MACOS_ARM64
-        ),
-        ciflow_config=CIFlowConfig(
-            labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA},
-            isolated_workflow=True,
-        ),
-    ),
 ]

 AARCH64_BINARY_BUILD_WORKFLOWS = [
--- a/.github/scripts/github_utils.py
+++ b/.github/scripts/github_utils.py
@ -73,10 +73,10 @@ def gh_fetch_url(
    headers: Optional[Dict[str, str]] = None,
    data: Union[Optional[Dict[str, Any]], str] = None,
    method: Optional[str] = None,
-    reader: Callable[[Any], Any] = lambda x: x.read(),
+    reader: Callable[[Any], Any] = json.load,
 ) -> Any:
    return gh_fetch_url_and_headers(
-        url, headers=headers, data=data, reader=json.load, method=method
+        url, headers=headers, data=data, reader=reader, method=method
    )[1]


@ -178,7 +178,7 @@ def gh_close_pr(org: str, repo: str, pr_num: int, dry_run: bool = False) -> None

 def gh_delete_comment(org: str, repo: str, comment_id: int) -> None:
    url = f"{GITHUB_API_URL}/repos/{org}/{repo}/issues/comments/{comment_id}"
-    gh_fetch_url(url, method="DELETE")
+    gh_fetch_url(url, method="DELETE", reader=lambda x: x.read())


 def gh_fetch_merge_base(org: str, repo: str, base: str, head: str) -> str:
--- a/.github/scripts/runner_determinator.py
+++ b/.github/scripts/runner_determinator.py
@ -46,19 +46,25 @@ Example config:
    # Opt-ins:
    # Users can opt into the LF fleet by adding their GitHub username to this list
    # and specifying experiments to enable in a comma-separated list.
+    # To always opt out of an experiment, prefix it with a "-".
    # Experiments should be from the above list.

-    @User1,lf,split_build
+    @User1,-lf,split_build
    @User2,lf
    @User3,split_build
 """

+import json
 import logging
 import os
 import random
+import re
+import sys
 from argparse import ArgumentParser
+from functools import lru_cache
 from logging import LogRecord
-from typing import Any, Dict, FrozenSet, Iterable, List, NamedTuple, Tuple
+from typing import Any, Dict, FrozenSet, Iterable, List, NamedTuple, Set, Tuple
+from urllib.request import Request, urlopen

 import yaml
 from github import Auth, Github
@ -72,7 +78,7 @@ WORKFLOW_LABEL_LF_CANARY = "lf.c."  # use canary runners from the linux foundati
 GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
 GH_OUTPUT_KEY_AMI = "runner-ami"
 GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
-
+OPT_OUT_LABEL = "no-runner-experiments"

 SETTING_EXPERIMENTS = "experiments"

@ -191,6 +197,13 @@ def parse_args() -> Any:
        default="",
        help="comma separated list of experiments to check, if omitted all experiments marked with default=True are checked",
    )
+    parser.add_argument(
+        "--pr-number",
+        type=str,
+        required=False,
+        default="",
+        help="the optional PR number where this is run",
+    )

    return parser.parse_args()

@ -296,6 +309,27 @@ def parse_user_opt_in_from_text(user_optin_text: str) -> UserOptins:
    return optins


+def is_valid_experiment_name(experiment_name: str) -> bool:
+    """
+    Check if the experiment name is valid.
+    A valid name:
+        - Contains only alphanumeric characters and the special characters "_" & "-"
+        - The special characters "_" & "-" shouldn't be the first or last characters
+        - Cannot contain spaces
+    """
+
+    valid_char_regex = r"^[a-zA-Z0-9]([\w-]*[a-zA-Z0-9])?$"
+    valid = bool(re.match(valid_char_regex, experiment_name))
+
+    if valid:
+        return True
+
+    log.error(
+        f"Invalid experiment name: {experiment_name}. Experiment names should only contain alphanumeric characters, '_', and '-'. They cannot contain spaces, and the special characters '_' and '-' cannot be the first or last characters."
+    )
+    return False
+
+
 def parse_settings_from_text(settings_text: str) -> Settings:
    """
    Parse the experiments from the issue body into a list of ExperimentSettings
@ -314,6 +348,10 @@ def parse_settings_from_text(settings_text: str) -> Settings:
            experiments = {}

            for exp_name, exp_settings in settings.get(SETTING_EXPERIMENTS).items():
+                if not is_valid_experiment_name(exp_name):
+                    # Exclude invalid experiments from the list. We log an error, but don't raise an exception so that other experiments can still be processed.
+                    continue
+
                valid_settings = {}
                for setting in exp_settings:
                    if setting not in Experiment._fields:
@ -361,6 +399,23 @@ def is_user_opted_in(user: str, user_optins: UserOptins, experiment_name: str) -
    return experiment_name in user_optins.get(user, [])


+def is_user_opted_out(user: str, user_optins: UserOptins, experiment_name: str) -> bool:
+    """
+    Check if a user explicitly opted out of an experiment
+    """
+    # if the experiment is prefixed with a "-", then it's an opt-out
+    experiment_optout = "-" + experiment_name
+    if experiment_optout not in user_optins.get(user, []):
+        return False
+
+    if is_user_opted_in(user, user_optins, experiment_name):
+        log.warning(
+            f"User {user} is opted into experiment {experiment_name}, but also opted out of it. Defaulting to opting out"
+        )
+
+    return True
+
+
 def get_runner_prefix(
    rollout_state: str,
    workflow_requestors: Iterable[str],
@ -393,6 +448,19 @@ def get_runner_prefix(
            )
            continue

+        # Is any workflow_requestor opted out to this experiment?
+        opted_out_users = [
+            requestor
+            for requestor in workflow_requestors
+            if is_user_opted_out(requestor, user_optins, experiment_name)
+        ]
+
+        if opted_out_users:
+            log.info(
+                f"{', '.join(opted_out_users)} have opted out of experiment {experiment_name}."
+            )
+            continue
+
        # Is any workflow_requestor opted in to this experiment?
        opted_in_users = [
            requestor
@ -451,11 +519,66 @@ def get_rollout_state_from_issue(github_token: str, repo: str, issue_num: int) -
    return str(issue.get_comments()[0].body.strip("\n\t "))


+def download_json(url: str, headers: Dict[str, str], num_retries: int = 3) -> Any:
+    for _ in range(num_retries):
+        try:
+            req = Request(url=url, headers=headers)
+            content = urlopen(req, timeout=5).read().decode("utf-8")
+            return json.loads(content)
+        except Exception as e:
+            log.warning(f"Could not download {url}: {e}")
+
+    log.warning(f"All {num_retries} retries exhausted, downloading {url} failed")
+    return {}
+
+
+@lru_cache(maxsize=None)
+def get_pr_info(github_repo: str, github_token: str, pr_number: int) -> Dict[str, Any]:
+    """
+    Dynamically get PR information
+    """
+    github_api = f"https://api.github.com/repos/{github_repo}"
+    headers = {
+        "Accept": "application/vnd.github.v3+json",
+        "Authorization": f"token {github_token}",
+    }
+    json_response: Dict[str, Any] = download_json(
+        url=f"{github_api}/issues/{pr_number}",
+        headers=headers,
+    )
+
+    if not json_response:
+        log.warning(f"Failed to get the labels for #{pr_number}")
+        return {}
+
+    return json_response
+
+
+def get_labels(github_repo: str, github_token: str, pr_number: int) -> Set[str]:
+    """
+    Dynamically get the latest list of labels from the pull request
+    """
+    pr_info = get_pr_info(github_repo, github_token, pr_number)
+    return {
+        label.get("name") for label in pr_info.get("labels", []) if label.get("name")
+    }
+
+
 def main() -> None:
    args = parse_args()

    runner_label_prefix = DEFAULT_LABEL_PREFIX

+    # Check if the PR is opt-out
+    if args.pr_number:
+        labels = get_labels(args.github_repo, args.github_token, int(args.pr_number))
+        if OPT_OUT_LABEL in labels:
+            log.info(
+                f"Opt-out runner determinator because #{args.pr_number} has {OPT_OUT_LABEL} label"
+            )
+            set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix)
+            sys.exit()
+
    try:
        rollout_state = get_rollout_state_from_issue(
            args.github_token, args.github_issue_repo, args.github_issue
--- a/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile
+++ b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile
@ -69,3 +69,6 @@ RUN curl -L https://github.com/actions/runner/releases/download/v2.317.0/actions

 ENTRYPOINT ["/usr/bin/entrypoint"]
 CMD ["/usr/bin/actions-runner"]
+
+# podman requires additional settings to use docker.io by default
+RUN mkdir -pv .config/containers ; echo 'unqualified-search-registries = ["docker.io"]' > .config/containers/registries.conf
--- a/.github/scripts/s390x-ci/self-hosted-builder/actions-runner@.service
+++ b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner@.service
@ -9,9 +9,10 @@ Type=simple
 Restart=always
 ExecStartPre=-/usr/bin/docker rm --force actions-runner.%i
 ExecStartPre=-/usr/local/bin/gh_token_generator.sh /etc/actions-runner/%i/appid.env /etc/actions-runner/%i/installid.env /etc/actions-runner/%i/key_private.pem /etc/actions-runner/%i/ghtoken.env
+ExecStartPre=-/usr/local/bin/gh_cat_token.sh /etc/actions-runner/%i/ghtoken.env /etc/actions-runner/%i/ghtoken.socket
 ExecStart=/usr/bin/docker run \
              --env-file=/etc/actions-runner/%i/env \
-              --env-file=/etc/actions-runner/%i/ghtoken.env \
+              --volume /etc/actions-runner/%i/ghtoken.socket:/run/runner_secret \
              --init \
              --interactive \
              --name=actions-runner.%i \
@ -21,6 +22,7 @@ ExecStart=/usr/bin/docker run \
 ExecStop=/bin/sh -c "docker exec actions-runner.%i kill -INT -- -1"
 ExecStop=/bin/sh -c "docker wait actions-runner.%i"
 ExecStop=/bin/sh -c "docker rm actions-runner.%i"
+ExecStop=/usr/bin/env rm -f /etc/actions-runner/%i/ghtoken.env /etc/actions-runner/%i/ghtoken.socket

 [Install]
 WantedBy=multi-user.target
--- a/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner
+++ b/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner
@ -11,6 +11,8 @@ fi

 token_file=registration-token.json

+ACCESS_TOKEN="$(cat /run/runner_secret)"
+
 # Generate registration token
 curl \
        -X POST \
--- a/.github/scripts/s390x-ci/self-hosted-builder/helpers/gh_cat_token.sh
+++ b/.github/scripts/s390x-ci/self-hosted-builder/helpers/gh_cat_token.sh
@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+
+TOKEN_FILE=$1
+TOKEN_PIPE=$2
+
+mkfifo "${TOKEN_PIPE}"
+cat "${TOKEN_FILE}" > "${TOKEN_PIPE}" &
--- a/.github/scripts/s390x-ci/self-hosted-builder/helpers/gh_token_generator.sh
+++ b/.github/scripts/s390x-ci/self-hosted-builder/helpers/gh_token_generator.sh
@ -7,4 +7,4 @@ APP_PRIVATE_KEY=$3
 DST_FILE="$4"

 ACCESS_TOKEN="$(APP_ID="$(<"${APP_ID}")" INSTALL_ID="$(<"${INSTALL_ID}")" APP_PRIVATE_KEY="$(<"${APP_PRIVATE_KEY}")" "${SCRIPT_DIR}/app_token.sh")"
-echo "ACCESS_TOKEN=${ACCESS_TOKEN}" > "${DST_FILE}"
+echo "${ACCESS_TOKEN}" > "${DST_FILE}"
--- a/.github/scripts/tag_docker_images_for_release.py
+++ b/.github/scripts/tag_docker_images_for_release.py
@ -45,7 +45,6 @@ def main() -> None:
    platform_images = [
        generate_binary_build_matrix.WHEEL_CONTAINER_IMAGES,
        generate_binary_build_matrix.LIBTORCH_CONTAINER_IMAGES,
-        generate_binary_build_matrix.CONDA_CONTAINER_IMAGES,
    ]
    default_tag = generate_binary_build_matrix.DEFAULT_TAG

--- a/.github/scripts/test_runner_determinator.py
+++ b/.github/scripts/test_runner_determinator.py
@ -38,6 +38,31 @@ class TestRunnerDeterminatorIssueParser(TestCase):
            "otherExp settings not parsed correctly",
        )

+    def test_parse_settings_with_invalid_experiment_name_skips_experiment(self) -> None:
+        settings_text = """
+        experiments:
+            lf:
+                rollout_perc: 25
+            -badExp:
+                rollout_perc: 0
+                default: false
+        ---
+
+        Users:
+        @User1,lf
+        @User2,lf,-badExp
+
+        """
+
+        settings = rd.parse_settings(settings_text)
+
+        self.assertTupleEqual(
+            rd.Experiment(rollout_perc=25),
+            settings.experiments["lf"],
+            "lf settings not parsed correctly",
+        )
+        self.assertNotIn("-badExp", settings.experiments)
+
    def test_parse_settings_in_code_block(self) -> None:
        settings_text = """

@ -161,6 +186,40 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
        prefix = rd.get_runner_prefix(settings_text, ["User1"], USER_BRANCH)
        self.assertEqual("lf.", prefix, "Runner prefix not correct for User1")

+    def test_explicitly_opted_out_user(self) -> None:
+        settings_text = """
+        experiments:
+            lf:
+                rollout_perc: 100
+            otherExp:
+                rollout_perc: 0
+        ---
+
+        Users:
+        @User1,-lf
+        @User2,lf,otherExp
+
+        """
+        prefix = rd.get_runner_prefix(settings_text, ["User1"], USER_BRANCH)
+        self.assertEqual("", prefix, "Runner prefix not correct for User1")
+
+    def test_explicitly_opted_in_and_out_user_should_opt_out(self) -> None:
+        settings_text = """
+        experiments:
+            lf:
+                rollout_perc: 100
+            otherExp:
+                rollout_perc: 0
+        ---
+
+        Users:
+        @User1,-lf,lf
+        @User2,lf,otherExp
+
+        """
+        prefix = rd.get_runner_prefix(settings_text, ["User1"], USER_BRANCH)
+        self.assertEqual("", prefix, "Runner prefix not correct for User1")
+
    def test_opted_in_user_two_experiments(self) -> None:
        settings_text = """
        experiments:
--- a/.github/scripts/test_trymerge.py
+++ b/.github/scripts/test_trymerge.py
@ -559,8 +559,8 @@ class TestTryMerge(TestCase):
                "expected": "lintrunner / linux-job",
            },
            {
-                "name": "Test `run_test.py` is usable without boto3/rockset",
-                "expected": "Test `run_test.py` is usable without boto3/rockset",
+                "name": "Test `run_test.py` is usable without boto3",
+                "expected": "Test `run_test.py` is usable without boto3",
            },
        ]

--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -2005,17 +2005,18 @@ def check_for_sev(org: str, project: str, skip_mandatory_checks: bool) -> None:
        Dict[str, Any],
        gh_fetch_json_list(
            "https://api.github.com/search/issues",
-            params={"q": f'repo:{org}/{project} is:open is:issue label:"ci: sev"'},
+            # Having two label: queries is an AND operation
+            params={
+                "q": f'repo:{org}/{project} is:open is:issue label:"ci: sev" label:"merge blocking"'
+            },
        ),
    )
    if response["total_count"] != 0:
-        for item in response["items"]:
-            if "MERGE BLOCKING" in item["body"]:
-                raise RuntimeError(
-                    "Not merging any PRs at the moment because there is a "
-                    + "merge blocking https://github.com/pytorch/pytorch/labels/ci:%20sev issue open at: \n"
-                    + f"{item['html_url']}"
-                )
+        raise RuntimeError(
+            "Not merging any PRs at the moment because there is a "
+            + "merge blocking https://github.com/pytorch/pytorch/labels/ci:%20sev issue open at: \n"
+            + f"{response['items'][0]['html_url']}"
+        )
    return


--- a/.github/scripts/upload_aws_ossci.sh
+++ b/.github/scripts/upload_aws_ossci.sh
@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+
+# Upload a binary to a bucket, supports dry-run mode
+
+set -euo pipefail
+
+# Optional inputs. By default upload to s3://ossci-linux
+TARGET_OS=${TARGET_OS:-linux}
+UPLOAD_BUCKET=${UPLOAD_BUCKET:-s3://ossci-${TARGET_OS}}
+UPLOAD_SUBFOLDER=${UPLOAD_SUBFOLDER:-}
+
+# Download to ${{ runner.temp }}/artifacts to match the default
+PKG_DIR=${PKG_DIR:-/tmp/workspace/artifacts}
+
+# Optional package include.
+# By default looks for and uploads *.tar.bz2 files only
+PKG_INCLUDE=${PKG_INCLUDE:-'*.tar.bz2'}
+
+# Dry-run logs the upload command without actually executing it
+# Dry-run is enabled by default, it has to be disabled to upload
+DRY_RUN=${DRY_RUN:-enabled}
+# Don't actually do work unless explicit
+AWS_S3_CP="aws s3 cp --dryrun"
+if [[ "${DRY_RUN}" = "disabled" ]]; then
+  AWS_S3_CP="aws s3 cp"
+fi
+
+# Install dependencies (should be a no-op if previously installed)
+pip install -q awscli
+
+# Handle subfolders, if provided
+s3_root_dir="${UPLOAD_BUCKET}"
+if [[ -z ${UPLOAD_SUBFOLDER:-} ]]; then
+    s3_upload_dir="${s3_root_dir}/"
+else
+    s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/"
+fi
+
+# Upload all packages that match PKG_INCLUDE within PKG_DIR and subdirs
+set -x
+${AWS_S3_CP} --no-progress --acl public-read --exclude="*" --include="${PKG_INCLUDE}" --recursive "${PKG_DIR}" "${s3_upload_dir}"
--- a/.github/scripts/windows/build_magma.bat
+++ b/.github/scripts/windows/build_magma.bat
@ -0,0 +1,66 @@
+@setlocal
+
+:: Build MAGMA for the CUDA version given in CUDA_VERSION and the build type
+:: given in CONFIG, then pack the install tree into a .7z archive in the
+:: directory the script was started from.
+
+set MAGMA_VERSION=2.5.4
+
+:: CUDA_VERSION carries no dot, e.g. 118; CUVER is the dotted form, e.g. 11.8
+set CUVER_NODOT=%CUDA_VERSION%
+set CUVER=%CUVER_NODOT:~0,-1%.%CUVER_NODOT:~-1,1%
+
+:: Lower-case the letters D, R and M of CONFIG; used in the archive name
+set CONFIG_LOWERCASE=%CONFIG:D=d%
+set CONFIG_LOWERCASE=%CONFIG_LOWERCASE:R=r%
+set CONFIG_LOWERCASE=%CONFIG_LOWERCASE:M=m%
+
+echo Building for configuration: %CONFIG_LOWERCASE%, %CUVER%
+
+:: Download Ninja
+curl -k https://s3.amazonaws.com/ossci-windows/ninja_1.8.2.exe --output C:\Tools\ninja.exe
+if errorlevel 1 exit /b 1
+
+set "PATH=C:\Tools;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%\libnvvp;%PATH%"
+set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%
+set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
+
+mkdir magma_cuda%CUVER_NODOT%
+cd magma_cuda%CUVER_NODOT%
+
+:: Clone the sources on the first run, otherwise drop the previous build output.
+:: Comments inside the parenthesized block use rem because "::" is a label
+:: and is fragile inside blocks.
+if not exist magma (
+  rem MAGMA 2.5.4 from http://icl.utk.edu/projectsfiles/magma/downloads/ with applied patches from our magma folder
+  git clone https://github.com/ptrblck/magma_win.git magma
+  if errorlevel 1 exit /b 1
+) else (
+  rmdir /S /Q magma\build
+  rmdir /S /Q magma\install
+)
+
+cd magma
+mkdir build && cd build
+
+:: GPU architectures to compile for, depending on the CUDA version
+set GPU_TARGET=All
+if "%CUVER_NODOT:~0,2%" == "12" (
+  set CUDA_ARCH_LIST=-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90
+)
+if "%CUVER_NODOT%" == "118" (
+  set CUDA_ARCH_LIST= -gencode arch=compute_37,code=sm_37 -gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90
+)
+
+set CC=cl.exe
+set CXX=cl.exe
+
+cmake .. -DGPU_TARGET="%GPU_TARGET%" ^
+            -DUSE_FORTRAN=0 ^
+            -DCMAKE_CXX_FLAGS="/FS /Zf" ^
+            -DCMAKE_BUILD_TYPE=%CONFIG% ^
+            -DCMAKE_GENERATOR=Ninja ^
+            -DCMAKE_INSTALL_PREFIX=..\install\ ^
+            -DCUDA_ARCH_LIST="%CUDA_ARCH_LIST%"
+if errorlevel 1 exit /b 1
+
+cmake --build . --target install --config %CONFIG% -- -j%NUMBER_OF_PROCESSORS%
+if errorlevel 1 exit /b 1
+
+:: Back to the directory the script was started from
+cd ..\..\..
+
+:: Create the archive. Stop on failure so the build tree is not deleted
+:: without an archive having been produced.
+7z a magma_%MAGMA_VERSION%_cuda%CUVER_NODOT%_%CONFIG_LOWERCASE%.7z %cd%\magma_cuda%CUVER_NODOT%\magma\install\*
+if errorlevel 1 exit /b 1
+
+rmdir /S /Q magma_cuda%CUVER_NODOT%\
+@endlocal
--- a/.github/scripts/windows/cuda_install.bat
+++ b/.github/scripts/windows/cuda_install.bat
@ -0,0 +1,218 @@
+:: Install the CUDA toolkit, cuDNN and NvToolsExt for the version given in
+:: CUDA_VERSION (digits without a dot, e.g. 118 or 124) unless that toolkit is
+:: already installed, then set up the matching environment variables.
+@echo on
+
+:: Nothing to install for non-CUDA builds
+if "%CUDA_VERSION%" == "cpu" (
+    echo Skipping for CPU builds
+    exit /b 0
+)
+if "%CUDA_VERSION%" == "xpu" (
+    echo Skipping for XPU builds
+    exit /b 0
+)
+
+:: Parent of the directory containing this script; downloads go to its temp_build folder
+set SRC_DIR=%~dp0\..
+
+if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build"
+
+:: CUDA_VER is the numeric form used for dispatch below; the last digit of
+:: CUDA_VERSION is the minor version and everything before it the major
+:: version, e.g. 118 gives 11.8
+set /a CUDA_VER=%CUDA_VERSION%
+set CUDA_VER_MAJOR=%CUDA_VERSION:~0,-1%
+set CUDA_VER_MINOR=%CUDA_VERSION:~-1,1%
+set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR%
+:: Defaults; each version section below overrides both values
+set CUDNN_FOLDER="cuda"
+set CUDNN_LIB_FOLDER="lib\x64"
+
+:: Skip all of this if we already have cuda installed
+if exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" goto set_cuda_env_vars
+
+:: Jump to the section that downloads the installers for the requested version
+if %CUDA_VER% EQU 118 goto cuda118
+if %CUDA_VER% EQU 121 goto cuda121
+if %CUDA_VER% EQU 124 goto cuda124
+if %CUDA_VER% EQU 126 goto cuda126
+
+echo CUDA %CUDA_VERSION_STR% is not supported
+exit /b 1
+
+:cuda118
+
+:: Installer location and the components to install. These are set
+:: unconditionally so that they are also defined when the installer is already
+:: present in temp_build and the download is skipped.
+set CUDA_INSTALL_EXE=cuda_11.8.0_522.06_windows.exe
+set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
+set "ARGS=cuda_profiler_api_11.8 thrust_11.8 nvcc_11.8 cuobjdump_11.8 nvprune_11.8 nvprof_11.8 cupti_11.8 cublas_11.8 cublas_dev_11.8 cudart_11.8 cufft_11.8 cufft_dev_11.8 curand_11.8 curand_dev_11.8 cusolver_11.8 cusolver_dev_11.8 cusparse_11.8 cusparse_dev_11.8 npp_11.8 npp_dev_11.8 nvrtc_11.8 nvrtc_dev_11.8 nvml_dev_11.8 nvtx_11.8"
+if not exist "%CUDA_SETUP_FILE%" (
+    curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%CUDA_SETUP_FILE%"
+    if errorlevel 1 exit /b 1
+)
+
+:: cuDNN archive matching this CUDA major version; same reasoning as above
+:: for setting CUDNN_SETUP_FILE outside the download block.
+set CUDNN_FOLDER=cudnn-windows-x86_64-9.5.0.50_cuda11-archive
+set CUDNN_LIB_FOLDER="lib"
+set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
+set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
+if not exist "%CUDNN_SETUP_FILE%" (
+    curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%CUDNN_SETUP_FILE%"
+    if errorlevel 1 exit /b 1
+)
+
+@REM cuDNN 8.3+ required zlib to be installed on the path
+echo Installing ZLIB dlls
+curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
+7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib"
+xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"
+
+goto cuda_common
+
+:cuda121
+
+:: Installer location and the components to install. These are set
+:: unconditionally so that they are also defined when the installer is already
+:: present in temp_build and the download is skipped.
+set CUDA_INSTALL_EXE=cuda_12.1.1_531.14_windows.exe
+set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
+set "ARGS=cuda_profiler_api_12.1 thrust_12.1 nvcc_12.1 cuobjdump_12.1 nvprune_12.1 nvprof_12.1 cupti_12.1 cublas_12.1 cublas_dev_12.1 cudart_12.1 cufft_12.1 cufft_dev_12.1 curand_12.1 curand_dev_12.1 cusolver_12.1 cusolver_dev_12.1 cusparse_12.1 cusparse_dev_12.1 npp_12.1 npp_dev_12.1 nvrtc_12.1 nvrtc_dev_12.1 nvml_dev_12.1 nvjitlink_12.1 nvtx_12.1"
+if not exist "%CUDA_SETUP_FILE%" (
+    curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%CUDA_SETUP_FILE%"
+    if errorlevel 1 exit /b 1
+)
+
+:: cuDNN archive matching this CUDA major version; same reasoning as above
+:: for setting CUDNN_SETUP_FILE outside the download block.
+set CUDNN_FOLDER=cudnn-windows-x86_64-9.5.0.50_cuda12-archive
+set CUDNN_LIB_FOLDER="lib"
+set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
+set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
+if not exist "%CUDNN_SETUP_FILE%" (
+    curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%CUDNN_SETUP_FILE%"
+    if errorlevel 1 exit /b 1
+)
+
+@REM cuDNN 8.3+ required zlib to be installed on the path
+echo Installing ZLIB dlls
+curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
+7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib"
+xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"
+
+goto cuda_common
+
+:cuda124
+
+:: Installer location and the components to install. These are set
+:: unconditionally so that they are also defined when the installer is already
+:: present in temp_build and the download is skipped.
+set CUDA_INSTALL_EXE=cuda_12.4.0_551.61_windows.exe
+set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
+set "ARGS=cuda_profiler_api_12.4 thrust_12.4 nvcc_12.4 cuobjdump_12.4 nvprune_12.4 nvprof_12.4 cupti_12.4 cublas_12.4 cublas_dev_12.4 cudart_12.4 cufft_12.4 cufft_dev_12.4 curand_12.4 curand_dev_12.4 cusolver_12.4 cusolver_dev_12.4 cusparse_12.4 cusparse_dev_12.4 npp_12.4 npp_dev_12.4 nvrtc_12.4 nvrtc_dev_12.4 nvml_dev_12.4 nvjitlink_12.4 nvtx_12.4"
+if not exist "%CUDA_SETUP_FILE%" (
+    curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%CUDA_SETUP_FILE%"
+    if errorlevel 1 exit /b 1
+)
+
+:: cuDNN archive matching this CUDA major version; same reasoning as above
+:: for setting CUDNN_SETUP_FILE outside the download block.
+set CUDNN_FOLDER=cudnn-windows-x86_64-9.5.0.50_cuda12-archive
+set CUDNN_LIB_FOLDER="lib"
+set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
+set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
+if not exist "%CUDNN_SETUP_FILE%" (
+    curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%CUDNN_SETUP_FILE%"
+    if errorlevel 1 exit /b 1
+)
+
+@REM cuDNN 8.3+ required zlib to be installed on the path
+echo Installing ZLIB dlls
+curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
+7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib"
+xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"
+
+goto cuda_common
+
+:cuda126
+
+:: Installer location and the components to install. These are set
+:: unconditionally so that they are also defined when the installer is already
+:: present in temp_build and the download is skipped.
+set CUDA_INSTALL_EXE=cuda_12.6.2_560.94_windows.exe
+set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%"
+set "ARGS=cuda_profiler_api_12.6 thrust_12.6 nvcc_12.6 cuobjdump_12.6 nvprune_12.6 nvprof_12.6 cupti_12.6 cublas_12.6 cublas_dev_12.6 cudart_12.6 cufft_12.6 cufft_dev_12.6 curand_12.6 curand_dev_12.6 cusolver_12.6 cusolver_dev_12.6 cusparse_12.6 cusparse_dev_12.6 npp_12.6 npp_dev_12.6 nvrtc_12.6 nvrtc_dev_12.6 nvml_dev_12.6 nvjitlink_12.6 nvtx_12.6"
+if not exist "%CUDA_SETUP_FILE%" (
+    curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%CUDA_SETUP_FILE%"
+    if errorlevel 1 exit /b 1
+)
+
+:: cuDNN archive matching this CUDA major version; same reasoning as above
+:: for setting CUDNN_SETUP_FILE outside the download block.
+set CUDNN_FOLDER=cudnn-windows-x86_64-9.5.0.50_cuda12-archive
+set CUDNN_LIB_FOLDER="lib"
+set "CUDNN_INSTALL_ZIP=%CUDNN_FOLDER%.zip"
+set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%"
+if not exist "%CUDNN_SETUP_FILE%" (
+    curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%CUDNN_SETUP_FILE%"
+    if errorlevel 1 exit /b 1
+)
+
+@REM cuDNN 8.3+ required zlib to be installed on the path
+echo Installing ZLIB dlls
+curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip"
+7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib"
+xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32"
+
+goto cuda_common
+
+:cuda_common
+:: NOTE: We only install CUDA if we don't have it installed already.
+:: With GHA runners these should be pre-installed as part of our AMI process
+:: If you cannot find the CUDA version you want to build for here then please
+:: add it @ https://github.com/pytorch/test-infra/tree/main/aws/ami/windows
+::
+:: Everything below runs inside one parenthesized block, so plain percent
+:: variables are expanded once, when the block is parsed. Values that only
+:: exist at run time are therefore read through call, and the installer log
+:: directory is given as an explicit path instead of the current directory.
+if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" (
+    if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" (
+        curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output "%SRC_DIR%\temp_build\NvToolsExt.7z"
+        if errorlevel 1 exit /b 1
+    )
+
+    if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" (
+        curl -k -L "https://ossci-windows.s3.us-east-1.amazonaws.com/builder/additional_dlls.zip" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip"
+        if errorlevel 1 exit /b 1
+    )
+
+    echo Installing CUDA toolkit...
+    7z x "%CUDA_SETUP_FILE%" -o"%SRC_DIR%\temp_build\cuda"
+    pushd "%SRC_DIR%\temp_build\cuda"
+
+    sc config wuauserv start= disabled
+    sc stop wuauserv
+    sc query wuauserv
+
+    rem Write the logs where the failure report below reads them from
+    start /wait setup.exe -s %ARGS% -loglevel:6 -log:"%SRC_DIR%\temp_build\cuda\cuda_install_logs"
+    rem Print the exit code of setup.exe as it is at run time
+    call echo %%errorlevel%%
+
+    popd
+
+    echo Installing VS integration...
+    if "%VC_YEAR%" == "2019" (
+        xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\BuildTools\MSBuild\Microsoft\VC\v160\BuildCustomizations"
+    )
+    if "%VC_YEAR%" == "2022" (
+        xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations"
+    )
+
+    echo Installing NvToolsExt...
+    7z x "%SRC_DIR%\temp_build\NvToolsExt.7z" -o"%SRC_DIR%\temp_build\NvToolsExt"
+    mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64"
+    mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include"
+    mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64"
+    xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\bin\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64"
+    xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\include\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include"
+    xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\lib\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64"
+
+    echo Installing cuDNN...
+    7z x "%CUDNN_SETUP_FILE%" -o"%SRC_DIR%\temp_build\cudnn"
+    xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin"
+    xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64"
+    xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include"
+
+    echo Installing GPU driver DLLs
+    7z x "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" -o"C:\Windows\System32"
+
+    rem Verify before cleaning up so the installer logs still exist when reported
+    if not exist "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" (
+        echo CUDA %CUDA_VERSION_STR% installed failed.
+        echo --------- setup.exe.log -------
+        type "%SRC_DIR%\temp_build\cuda\cuda_install_logs\LOG.setup.exe.log"
+        echo --------- RunDll32.exe.log
+        type "%SRC_DIR%\temp_build\cuda\cuda_install_logs\LOG.RunDll32.exe.log"
+        echo Cleaning temp files
+        rd /s /q "%SRC_DIR%\temp_build" || ver > nul
+        exit /b 1
+    )
+
+    echo Cleaning temp files
+    rd /s /q "%SRC_DIR%\temp_build" || ver > nul
+)
+
+goto set_cuda_env_vars
+
+:set_cuda_env_vars
+
+:: Point PATH, CUDA_PATH, the versioned CUDA_PATH_V<major>_<minor> variable and
+:: NVTOOLSEXT_PATH at the install locations under %ProgramFiles%. This section
+:: is reached both after a fresh install and when the toolkit was already present.
+echo Setting up environment...
+set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%"
+set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%"
+set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%"
+set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt"
--- a/.github/workflows/_bazel-build-test.yml
+++ b/.github/workflows/_bazel-build-test.yml
@ -143,9 +143,6 @@ jobs:
          OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          CUDA_VERSION: ${{ inputs.cuda-version }}
        run: |
-          python3 -m pip install boto3==1.19.12
-          # Fetch aws credential from IMDs
-          eval "$(python3 .github/scripts/get_aws_session_tokens.py)"
          export SHARD_NUMBER=0
          # detached container should get cleaned up by teardown_ec2_linux
          # TODO: Stop building test binaries as part of the build phase
@ -154,6 +151,7 @@ jobs:
          # shellcheck disable=SC2086
          container_name=$(docker run \
            ${GPU_FLAG:-} \
+            -e AWS_DEFAULT_REGION \
            -e BUILD_ENVIRONMENT \
            -e GITHUB_ACTIONS \
            -e GITHUB_REPOSITORY \
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@ -271,7 +271,7 @@ jobs:
          )
          docker exec -t -w "${PYTORCH_ROOT}" "${container_name}" bash -c "bash .circleci/scripts/binary_populate_env.sh"
          if [[ ${BUILD_ENVIRONMENT} == *"aarch64"* ]]; then
-            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /builder/aarch64_linux/aarch64_ci_build.sh"
+            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/aarch64_linux/aarch64_ci_build.sh"
          elif [[ ${{ inputs.PACKAGE_TYPE }} == "manywheel" || ${{ inputs.PACKAGE_TYPE }} == "libtorch" ]]; then
            docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash /pytorch/.ci/${{ inputs.PACKAGE_TYPE }}/build.sh"
          else
--- a/.github/workflows/_buck-build-test.yml
+++ b/.github/workflows/_buck-build-test.yml
@ -1,134 +0,0 @@
-name: buck
-
-on:
-  workflow_call:
-    inputs:
-      test-matrix:
-        required: true
-        type: string
-        description: |
-          A JSON description of what configs to run later on.
-      runner_prefix:
-        required: false
-        type: string
-        description: |
-          Prefix for runner label
-
-defaults:
-  run:
-    shell: bash -e -l {0}
-
-jobs:
-  filter:
-    if: github.repository_owner == 'pytorch'
-    runs-on: [self-hosted, "${{ inputs.runner_prefix }}linux.large"]
-    outputs:
-      test-matrix: ${{ steps.filter.outputs.test-matrix }}
-      is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
-      keep-going: ${{ steps.filter.outputs.keep-going }}
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          fetch-depth: 1
-          submodules: false
-
-      - name: Select all requested test configurations
-        id: filter
-        uses: ./.github/actions/filter-test-configs
-        with:
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          test-matrix: ${{ inputs.test-matrix }}
-
-  buck-build-test:
-    needs: filter
-    if: github.repository_owner == 'pytorch' && needs.filter.outputs.is-test-matrix-empty == 'False'
-    strategy:
-      matrix: ${{ fromJSON(needs.filter.outputs.test-matrix) }}
-      fail-fast: false
-    runs-on: ${{ matrix.runner }}
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-
-      - name: Set up JDK 8
-        uses: actions/setup-java@v3
-        with:
-          java-version: '8'
-          distribution: 'temurin'
-
-      - name: Setup miniconda
-        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
-        with:
-          python-version: 3.9
-          environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
-
-      - name: Install Buck
-        uses: nick-fields/retry@v3.0.0
-        with:
-          timeout_minutes: 10
-          max_attempts: 5
-          command: |
-            sudo apt update -q
-            wget -q https://github.com/facebook/buck/releases/download/v2021.01.12.01/buck.2021.01.12.01_all.deb
-            sudo apt install ./buck.2021.01.12.01_all.deb
-
-      - name: Download third party libraries and generate wrappers
-        uses: nick-fields/retry@v3.0.0
-        with:
-          timeout_minutes: 10
-          max_attempts: 5
-          command: |
-            bash scripts/buck_setup.sh
-
-      - name: Build tools
-        run: |
-          buck build tools: --keep-going
-
-      - name: Run tools tests
-        run: |
-          buck test tools:selective_build_test tools:gen_oplist_test tools:gen_operators_yaml_test
-
-      - name: Build c10
-        run: |
-          buck build c10:c10
-
-      - name: Build XNNPACK
-        run: |
-          buck build third_party:XNNPACK
-
-      - name: Build QNNPACK
-        run: |
-          buck build aten/src/ATen/native/quantized/cpu/qnnpack:pytorch_qnnpack
-
-      - name: Test QNNPACK
-        run: |
-          buck test aten/src/ATen/native/quantized/cpu/qnnpack:pytorch_qnnpack_test
-
-      - name: Build aten_cpu
-        run: |
-          buck build :aten_cpu
-
-      - name: Build torch_mobile_core
-        run: |
-          buck build :torch_mobile_core
-
-      - name: Build pt_ops_full
-        run: |
-          buck build :pt_ops_full
-
-      - name: Build mobile benchmark
-        run: |
-          buck build :ptmobile_benchmark
-
-      - name: Run lite interpreter model
-        run: |
-          buck run :ptmobile_benchmark -- --model=ios/TestApp/models/mobilenet_v2.ptl --input_dims=1,3,224,224 --input_type=float
-
-      - name: Build everything
-        run: |
-          buck build //... --keep-going
-
-      - name: Build aten_cpu@shared
-        run: |
-          buck build :aten_cpu#linux-x86_64,shared
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@ -120,7 +120,7 @@ jobs:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
-          no-sudo: ${{ inputs.build-environment == 'linux-s390x-binary-manywheel' }}
+          no-sudo: true

      - name: Setup Linux
        uses: ./.github/actions/setup-linux
@ -212,6 +212,7 @@ jobs:
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
          USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
        run: |
+          START_TIME=$(date +%s)
          if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
            JENKINS_USER=
            USED_IMAGE="${DOCKER_IMAGE_S390X}"
@ -222,8 +223,6 @@ jobs:
          else
            JENKINS_USER="--user jenkins"
            USED_IMAGE="${DOCKER_IMAGE}"
-            # Fetch aws credential from IMDs
-            eval "$(python3 .github/scripts/get_aws_session_tokens.py)"
          fi
          # detached container should get cleaned up by teardown_ec2_linux
          # Used for JENKINS_USER, which can be empty
@ -232,9 +231,6 @@ jobs:
            -e BUILD_ENVIRONMENT \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e AWS_DEFAULT_REGION \
-            -e AWS_ACCESS_KEY_ID \
-            -e AWS_SECRET_ACCESS_KEY \
-            -e AWS_SESSION_TOKEN \
            -e PR_NUMBER \
            -e SHA1 \
            -e BRANCH \
@ -261,6 +257,9 @@ jobs:
          )
          docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh'

+          END_TIME=$(date +%s)
+          echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"
+
      - name: Archive artifacts into zip
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
        run: |
@ -287,7 +286,7 @@ jobs:
          s3-bucket: ${{ inputs.s3-bucket }}

      - name: Store PyTorch Build Artifacts for s390x
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
        with:
          name: ${{ inputs.build-environment }}
@ -296,7 +295,7 @@ jobs:
          path: artifacts.zip

      - name: Store PyTorch Build Artifacts for s390x for split build
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
        with:
          name: ${{ inputs.build-environment }}-experimental-split-build
@ -306,14 +305,10 @@ jobs:

      - name: Upload sccache stats
        if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
-        uses: seemethere/upload-artifact-s3@v5
+        uses: ./.github/actions/upload-sccache-stats
        with:
-          s3-prefix: |
-            ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
-          retention-days: 365
-          if-no-files-found: warn
-          path: sccache-stats-*.json
-          s3-bucket: ${{ inputs.s3-bucket }}
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          build-time: ${{ steps.build.outputs.build_time }}

      - name: Teardown Linux
        uses: pytorch/test-infra/.github/actions/teardown-linux@main
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@ -47,6 +47,14 @@ on:
        required: false
        type: string
        default: ""
+      disable-monitor:
+        description: |
+          [Experimental] Disable utilization monitoring for tests.
+          Currently, by default we disable the monitor job and only look for specific tests,
+          since we are investigating the behaviour of the monitor script with different tests.
+        required: false
+        type: boolean
+        default: true
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
@ -82,6 +90,8 @@ jobs:

      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          no-sudo: true

      - name: Setup Linux
        uses: ./.github/actions/setup-linux
@ -143,6 +153,7 @@ jobs:

      - name: Start monitoring script
        id: monitor-script
+        if: ${{ !inputs.disable-monitor }}
        shell: bash
        continue-on-error: true
        run: |
@ -232,8 +243,6 @@ jobs:
          IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
          ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
        run: |
-          # Fetch aws credential from IMDs
-          eval "$(python3 .github/scripts/get_aws_session_tokens.py)"
          set -x

          if [[ $TEST_CONFIG == 'multigpu' ]]; then
@ -266,9 +275,6 @@ jobs:
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
-            -e AWS_ACCESS_KEY_ID \
-            -e AWS_SECRET_ACCESS_KEY \
-            -e AWS_SESSION_TOKEN \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e TEST_CONFIG \
@ -311,7 +317,7 @@ jobs:
          # Propagate download.pytorch.org IP to container
          grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
          echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
-          docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
+          docker exec -t "${container_name}" sh -c "python3 -m pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"

      - name: Upload pytest cache if tests failed
        uses: ./.github/actions/pytest-cache-upload
@ -324,6 +330,14 @@ jobs:
          test_config: ${{ matrix.config }}
          job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

+      - name: Upload the benchmark results
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        with:
+          benchmark-results-dir: test/test-reports
+          dry-run: false
+          schema-version: v3
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
      - name: Print remaining test logs
        shell: bash
        if: always() && steps.test.conclusion
@ -331,7 +345,7 @@ jobs:
          cat test/**/*_toprint.log || true

      - name: Stop monitoring script
-        if: always() && steps.monitor-script.outputs.monitor-script-pid
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
        continue-on-error: true
        env:
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@ -186,7 +186,7 @@ jobs:
          zip -1 -r artifacts.zip dist/ build/.ninja_log build/compile_commands.json .additional_ci_files

      - name: Store PyTorch Build Artifacts on GHA
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
        with:
          name: ${{ env.BUILD_ENVIRONMENT }}
@ -195,7 +195,7 @@ jobs:
          path: artifacts.zip

      - name: Upload sccache stats to GHA
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        # Only if sccache is installed, see above
        if: ${{ (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository) && steps.build.outcome != 'skipped' }}
        with:
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@ -30,6 +30,14 @@ on:
        default: 270
        description: |
          Set the maximum (in minutes) how long the workflow should take to finish
+      disable-monitor:
+        description: |
+          [Experimental] Disable utilization monitoring for tests.
+          Currently, by default we disable the monitor job and only look for specific tests,
+          since we are investigating the behaviour of the monitor script with different tests.
+        required: false
+        type: boolean
+        default: true

 jobs:
  test:
@ -101,6 +109,7 @@ jobs:

      - name: Start monitoring script
        id: monitor-script
+        if: ${{ !inputs.disable-monitor }}
        continue-on-error: true
        run: |
          ${CONDA_RUN} python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
@ -200,7 +209,7 @@ jobs:
          cat test/**/*_toprint.log || true

      - name: Stop monitoring script
-        if: always() && ${{ steps.monitor-script.outputs.monitor-script-pid }}
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        continue-on-error: true
        env:
          MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
@ -214,6 +223,14 @@ jobs:
          use-gha: true
          file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}

+      - name: Upload the benchmark results
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        with:
+          benchmark-results-dir: test/test-reports
+          dry-run: false
+          schema-version: v3
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+
      - name: Clean up disk space
        if: always()
        continue-on-error: true
--- a/.github/workflows/_rocm-test.yml
+++ b/.github/workflows/_rocm-test.yml
@ -38,6 +38,14 @@ on:
        default: ""
        description: |
          List of tests to include (empty string implies default list)
+      disable-monitor:
+        description: |
+          [Experimental] Disable utilization monitoring for tests.
+          Currently, by default we disable the monitor job and only look for specific tests,
+          since we are investigating the behaviour of the monitor script with different tests.
+        required: false
+        type: boolean
+        default: true

 env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -91,6 +99,7 @@ jobs:

      - name: Start monitoring script
        id: monitor-script
+        if: ${{ !inputs.disable-monitor }}
        shell: bash
        continue-on-error: true
        run: |
@ -247,7 +256,7 @@ jobs:
          cat test/**/*_toprint.log || true

      - name: Stop monitoring script
-        if: always() && steps.monitor-script.outputs.monitor-script-pid
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
        continue-on-error: true
        env:
@ -269,7 +278,7 @@ jobs:
          find . -iname "core.[1-9]*" -exec docker exec "${CONTAINER_NAME}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;

      - name: Store Core dumps on GitHub
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        if: failure()
        with:
          name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
--- a/.github/workflows/_runner-determinator.yml
+++ b/.github/workflows/_runner-determinator.yml
@ -51,6 +51,7 @@ jobs:
      TRIGGERING_ACTOR: ${{ inputs.triggering_actor }}
      ISSUE_OWNER: ${{ inputs.issue_owner }}
      CHECK_EXPERIMENTS: ${{ inputs.check_experiments }}
+      PR_NUMBER: ${{ github.event.pull_request.number }}
    steps:
      # - name: Checkout PyTorch
      #   uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
@ -113,19 +114,25 @@ jobs:
              # Opt-ins:
              # Users can opt into the LF fleet by adding their GitHub username to this list
              # and specifying experiments to enable in a comma-separated list.
+              # To always opt out of an experiment, prefix it with a "-".
              # Experiments should be from the above list.

-              @User1,lf,split_build
+              @User1,-lf,split_build
              @User2,lf
              @User3,split_build
          """

+          import json
          import logging
          import os
          import random
+          import re
+          import sys
          from argparse import ArgumentParser
+          from functools import lru_cache
          from logging import LogRecord
-          from typing import Any, Dict, FrozenSet, Iterable, List, NamedTuple, Tuple
+          from typing import Any, Dict, FrozenSet, Iterable, List, NamedTuple, Set, Tuple
+          from urllib.request import Request, urlopen

          import yaml
          from github import Auth, Github
@ -139,7 +146,7 @@ jobs:
          GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
          GH_OUTPUT_KEY_AMI = "runner-ami"
          GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
-
+          OPT_OUT_LABEL = "no-runner-experiments"

          SETTING_EXPERIMENTS = "experiments"

@ -258,6 +265,13 @@ jobs:
                  default="",
                  help="comma separated list of experiments to check, if omitted all experiments marked with default=True are checked",
              )
+              parser.add_argument(
+                  "--pr-number",
+                  type=str,
+                  required=False,
+                  default="",
+                  help="the optional PR number where this is run",
+              )

              return parser.parse_args()

@ -363,6 +377,27 @@ jobs:
              return optins


+          def is_valid_experiment_name(experiment_name: str) -> bool:
+              """
+              Check if the experiment name is valid.
+              A valid name:
+                  - Contains only alphanumeric characters and the special characters "_" & "-"
+                  - The special characters "_" & "-" shouldn't be the first or last characters
+                  - Cannot contain spaces
+              """
+
+              valid_char_regex = r"^[a-zA-Z0-9]([\w-]*[a-zA-Z0-9])?$"
+              valid = bool(re.match(valid_char_regex, experiment_name))
+
+              if valid:
+                  return True
+
+              log.error(
+                  f"Invalid experiment name: {experiment_name}. Experiment names should only contain alphanumeric characters, '_', and '-'. They cannot contain spaces, and the special characters '_' and '-' cannot be the first or last characters."
+              )
+              return False
+
+
          def parse_settings_from_text(settings_text: str) -> Settings:
              """
              Parse the experiments from the issue body into a list of ExperimentSettings
@ -381,6 +416,10 @@ jobs:
                      experiments = {}

                      for exp_name, exp_settings in settings.get(SETTING_EXPERIMENTS).items():
+                          if not is_valid_experiment_name(exp_name):
+                              # Exclude invalid experiments from the list. We log an error, but don't raise an exception so that other experiments can still be processed.
+                              continue
+
                          valid_settings = {}
                          for setting in exp_settings:
                              if setting not in Experiment._fields:
@ -428,6 +467,23 @@ jobs:
              return experiment_name in user_optins.get(user, [])


+          def is_user_opted_out(user: str, user_optins: UserOptins, experiment_name: str) -> bool:
+              """
+              Check if a user explicitly opted out of an experiment
+              """
+              # if the experiment is prefixed with a "-", then it's an opt-out
+              experiment_optout = "-" + experiment_name
+              if experiment_optout not in user_optins.get(user, []):
+                  return False
+
+              if is_user_opted_in(user, user_optins, experiment_name):
+                  log.warning(
+                      f"User {user} is opted into experiment {experiment_name}, but also opted out of it. Defaulting to opting out"
+                  )
+
+              return True
+
+
          def get_runner_prefix(
              rollout_state: str,
              workflow_requestors: Iterable[str],
@ -460,6 +516,19 @@ jobs:
                      )
                      continue

+                  # Is any workflow_requestor opted out of this experiment?
+                  opted_out_users = [
+                      requestor
+                      for requestor in workflow_requestors
+                      if is_user_opted_out(requestor, user_optins, experiment_name)
+                  ]
+
+                  if opted_out_users:
+                      log.info(
+                          f"{', '.join(opted_out_users)} have opted out of experiment {experiment_name}."
+                      )
+                      continue
+
                  # Is any workflow_requestor opted in to this experiment?
                  opted_in_users = [
                      requestor
@ -518,11 +587,66 @@ jobs:
              return str(issue.get_comments()[0].body.strip("\n\t "))


+          def download_json(url: str, headers: Dict[str, str], num_retries: int = 3) -> Any:
+              for _ in range(num_retries):
+                  try:
+                      req = Request(url=url, headers=headers)
+                      content = urlopen(req, timeout=5).read().decode("utf-8")
+                      return json.loads(content)
+                  except Exception as e:
+                      log.warning(f"Could not download {url}: {e}")
+
+              log.warning(f"All {num_retries} retries exhausted, downloading {url} failed")
+              return {}
+
+
+          @lru_cache(maxsize=None)
+          def get_pr_info(github_repo: str, github_token: str, pr_number: int) -> Dict[str, Any]:
+              """
+              Dynamically get PR information
+              """
+              github_api = f"https://api.github.com/repos/{github_repo}"
+              headers = {
+                  "Accept": "application/vnd.github.v3+json",
+                  "Authorization": f"token {github_token}",
+              }
+              json_response: Dict[str, Any] = download_json(
+                  url=f"{github_api}/issues/{pr_number}",
+                  headers=headers,
+              )
+
+              if not json_response:
+                  log.warning(f"Failed to get the labels for #{pr_number}")
+                  return {}
+
+              return json_response
+
+
+          def get_labels(github_repo: str, github_token: str, pr_number: int) -> Set[str]:
+              """
+              Dynamically get the latest list of labels from the pull request
+              """
+              pr_info = get_pr_info(github_repo, github_token, pr_number)
+              return {
+                  label.get("name") for label in pr_info.get("labels", []) if label.get("name")
+              }
+
+
          def main() -> None:
              args = parse_args()

              runner_label_prefix = DEFAULT_LABEL_PREFIX

+              # Check if the PR has opted out of runner experiments
+              if args.pr_number:
+                  labels = get_labels(args.github_repo, args.github_token, int(args.pr_number))
+                  if OPT_OUT_LABEL in labels:
+                      log.info(
+                          f"Opt-out runner determinator because #{args.pr_number} has {OPT_OUT_LABEL} label"
+                      )
+                      set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix)
+                      sys.exit()
+
              try:
                  rollout_state = get_rollout_state_from_issue(
                      args.github_token, args.github_issue_repo, args.github_issue
@ -580,3 +704,4 @@ jobs:
            --github-ref-type "$curr_ref_type" \
            --github-repo "$GITHUB_REPOSITORY" \
            --eligible-experiments "$CHECK_EXPERIMENTS" \
+            --pr-number "${PR_NUMBER}"
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@ -16,6 +16,10 @@ on:
        type: boolean
        default: false
        description: If set, build with XPU support.
+      xpu-version:
+        required: false
+        type: string
+        description: The version of XPU support package.
      vc-year:
        required: false
        type: string
@ -161,6 +165,7 @@ jobs:
          TORCH_CUDA_ARCH_LIST: "8.6"
          USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
          USE_XPU: ${{ inputs.use-xpu == true && '1' || '0' }}
+          XPU_VERSION: "${{ inputs.xpu-version }}"
          OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
        run: |
          .ci/pytorch/win-build.sh
@ -177,13 +182,9 @@ jobs:

      - name: Upload sccache stats
        if: steps.build.outcome != 'skipped'
-        uses: seemethere/upload-artifact-s3@v5
+        uses: ./.github/actions/upload-sccache-stats
        with:
-          s3-prefix: |
-            ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
-          retention-days: 14
-          if-no-files-found: warn
-          path: sccache-stats-*.json
+          github-token: ${{ secrets.GITHUB_TOKEN }}

      - name: Teardown Windows
        uses: ./.github/actions/teardown-win
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@ -28,6 +28,14 @@ on:
        default: 240
        description: |
          Set the maximum (in minutes) how long the workflow should take to finish
+      disable-monitor:
+        description: |
+          [Experimental] Disable utilization monitoring for tests.
+          Currently, by default we disable the monitor job and only look for specific tests,
+          since we are investigating the behaviour of the monitor script with different tests.
+        required: false
+        type: boolean
+        default: true

 env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -96,11 +104,12 @@ jobs:
          retry_wait_seconds: 30
          command: |
            set -eu
-            python3 -m pip install rockset==1.0.3 'xdoctest>=1.1.0'
+            python3 -m pip install 'xdoctest>=1.1.0'

      - name: Start monitoring script
        id: monitor-script
        shell: bash
+        if: ${{ !inputs.disable-monitor }}
        continue-on-error: true
        run: |
          # Windows conda doesn't have python3 binary, only python, but it's python3
@ -213,7 +222,7 @@ jobs:
          cat test/**/*_toprint.log || true

      - name: Stop monitoring script
-        if: always() && steps.monitor-script.outputs.monitor-script-pid
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
        continue-on-error: true
        env:
--- a/.github/workflows/_xpu-test.yml
+++ b/.github/workflows/_xpu-test.yml
@ -38,6 +38,14 @@ on:
        default: ""
        description: |
          List of tests to include (empty string implies default list)
+      disable-monitor:
+        description: |
+          [Experimental] Disable utilization monitoring for tests.
+          Currently, by default we disable the monitor job and only look for specific tests,
+          since we are investigating the behaviour of the monitor script with different tests.
+        required: false
+        type: boolean
+        default: true

 env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -83,6 +91,7 @@ jobs:

      - name: Start monitoring script
        id: monitor-script
+        if: ${{ !inputs.disable-monitor }}
        shell: bash
        continue-on-error: true
        run: |
@ -152,6 +161,8 @@ jobs:
          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
          REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
+          SCCACHE_REGION: us-east-1
+          SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
          DOCKER_IMAGE: ${{ inputs.docker-image }}
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
@ -159,6 +170,8 @@ jobs:
          TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
        run: |
+          # Fetch AWS credentials from IMDS
+          eval "$(python3 .github/scripts/get_aws_session_tokens.py)"
          set -x

          TEST_COMMAND=.ci/pytorch/test.sh
@ -181,6 +194,9 @@ jobs:
            -e BRANCH \
            -e SHA1 \
            -e AWS_DEFAULT_REGION \
+            -e AWS_ACCESS_KEY_ID \
+            -e AWS_SECRET_ACCESS_KEY \
+            -e AWS_SESSION_TOKEN \
            -e IN_WHEEL_TEST \
            -e SHARD_NUMBER \
            -e TEST_CONFIG \
@ -195,6 +211,8 @@ jobs:
            -e NO_TD \
            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e SCCACHE_BUCKET \
+            -e SCCACHE_REGION \
+            -e SCCACHE_S3_KEY_PREFIX \
            -e XLA_CLANG_CACHE_S3_BUCKET_NAME \
            -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
            -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
@ -233,7 +251,7 @@ jobs:
          cat test/**/*_toprint.log || true

      - name: Stop monitoring script
-        if: always() && steps.monitor-script.outputs.monitor-script-pid
+        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
        continue-on-error: true
        env:
@ -261,7 +279,7 @@ jobs:
          docker stop "${{ env.CONTAINER_NAME }}"

      - name: Store Core dumps on GitHub
-        uses: actions/upload-artifact@v3
+        uses: actions/upload-artifact@v4
        if: failure()
        with:
          name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
--- a/.github/workflows/build-almalinux-images.yml
+++ b/.github/workflows/build-almalinux-images.yml
@ -1,4 +1,4 @@
-name: Build conda docker images
+name: Build almalinux docker images

 on:
  workflow_dispatch:
@ -11,14 +11,14 @@ on:
      # Release candidate tags look like: v1.11.0-rc1
      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
    paths:
-      - '.ci/docker/conda/*'
+      - '.ci/docker/almalinux/*'
      - '.ci/docker/common/*'
-      - .github/workflows/build-conda-images.yml
+      - .github/workflows/build-almalinux-images.yml
  pull_request:
    paths:
-      - '.ci/docker/conda/*'
+      - '.ci/docker/almalinux/*'
      - '.ci/docker/common/*'
-      - .github/workflows/build-conda-images.yml
+      - .github/workflows/build-almalinux-images.yml

 env:
  DOCKER_REGISTRY: "docker.io"
@ -48,8 +48,8 @@ jobs:
        if: env.WITH_PUSH == 'false'
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
        with:
-            docker-image-name: conda-builder${{ matrix.cuda_version == 'cpu' && '-' || '-cuda' }}${{matrix.cuda_version}}
-            docker-build-dir:  .ci/docker/conda
+            docker-image-name: almalinux-builder${{ matrix.cuda_version == 'cpu' && '-' || '-cuda' }}${{matrix.cuda_version}}
+            docker-build-dir:  .ci/docker/almalinux
            always-rebuild: true
            push: true
      - name: Authenticate if WITH_PUSH
@ -70,4 +70,4 @@ jobs:
          max_attempts: 3
          retry_wait_seconds: 90
          command: |
-            .ci/docker/conda/build.sh conda-builder${{ matrix.cuda_version == 'cpu' && ':' || ':cuda' }}${{matrix.cuda_version}}
+            .ci/docker/almalinux/build.sh almalinux-builder${{ matrix.cuda_version == 'cpu' && ':' || ':cuda' }}${{matrix.cuda_version}}
--- a/.github/workflows/build-magma-linux.yml
+++ b/.github/workflows/build-magma-linux.yml
@ -0,0 +1,69 @@
+name: build-linux-magma
+
+on:
+  push:
+    branches:
+      main
+    paths:
+      - .ci/magma/*
+      - .ci/magma/package_files/*
+      - .github/workflows/build-magma-linux.yml
+  pull_request:
+    paths:
+      - .ci/magma/*
+      - .ci/magma/package_files/*
+      - .github/workflows/build-magma-linux.yml
+
+defaults:
+  run:
+    shell: bash -x -e -l {0}
+env:
+  BUILD_ENVIRONMENT: build-linux-magma
+  IN_CI: 1
+  IS_GHA: 1
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  build-linux-magma:
+    if: github.repository_owner == 'pytorch'
+    runs-on: linux.2xlarge
+    permissions:
+      id-token: write
+    strategy:
+      matrix:
+        cuda_version: ["126", "124", "121", "118"]  # There is no pytorch/manylinux-cuda126 yet
+    steps:
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+      - name: Build Magma Cuda
+        working-directory: .ci/magma
+        run: |
+          # Produces artifacts under magma/output/linux-64/magma-cuda*.bz2
+          make magma-cuda${{ matrix.cuda_version }}
+      - name: Save as artifact
+        uses: actions/upload-artifact@v4
+        with:
+          path: .ci/magma/output/linux-64/magma-cuda*.bz2
+          name: artifact_${{ matrix.cuda_version }}
+      - name: Configure AWS credentials(PyTorch account)
+        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+        uses: aws-actions/configure-aws-credentials@v3
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_ossci_linux_windows_read_write
+          aws-region: us-east-1
+      - name: Set DRY_RUN
+        if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }}
+        run: |
+            echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
+      - name: Upload binaries
+        shell: bash
+        env:
+            PKG_DIR: ".ci/magma/output/linux-64/"
+            TARGET_OS: "linux"
+            PKG_INCLUDE: "magma-cuda*.tar.bz2"
+        run: |
+            set -ex
+            bash .github/scripts/upload_aws_ossci.sh
--- a/.github/workflows/build-magma-windows.yml
+++ b/.github/workflows/build-magma-windows.yml
@ -0,0 +1,74 @@
+name: Build MAGMA for Windows
+
+on:
+  push:
+    branches:
+      main
+    paths:
+      - .github/scripts/windows/*
+      - .github/workflows/build-magma-windows.yml
+  pull_request:
+    paths:
+      - .github/scripts/windows/*
+      - .github/workflows/build-magma-windows.yml
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  build-windows-magma:
+    if: github.repository_owner == 'pytorch'
+    runs-on: windows-2019
+    strategy:
+      matrix:
+        cuda_version: ["126", "124", "118"]
+        config: ["Release", "Debug"]
+    env:
+      CUDA_VERSION: ${{ matrix.cuda_version }}
+      CONFIG: ${{ matrix.config }}
+    steps:
+      - name: Checkout pytorch/builder
+        uses: actions/checkout@v4
+      - name: Enable MSVC dev commands to enable cl.exe  # FYI incompatible with shell: bash
+        uses: ilammy/msvc-dev-cmd@dd5e2fa0a7de1e7929605d9ecc020e749d9856a3
+      - name: Install CUDA Toolkit
+        run: .github/scripts/windows/cuda_install.bat
+      - name: Build MAGMA and push to S3
+        run: .github/scripts/windows/build_magma.bat
+      - name: Save as artifact
+        uses: actions/upload-artifact@v4
+        with:
+          path: magma_*_cuda*_*.7z
+          name: artifact_${{ matrix.cuda_version }}_${{ matrix.config }}
+  push-windows-magma:
+    if: github.repository_owner == 'pytorch'
+    runs-on: ubuntu-22.04
+    permissions:
+      id-token: write
+    needs: build-windows-magma
+    steps:
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+      - name: Download all artifacts
+        uses: actions/download-artifact@v4
+      - name: Configure AWS credentials(PyTorch account)
+        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+        uses: aws-actions/configure-aws-credentials@v3
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_ossci_linux_windows_read_write
+          aws-region: us-east-1
+      - name: Set DRY_RUN
+        if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
+        shell: bash
+        run: |
+          echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
+      - name: Upload binaries
+        shell: bash
+        env:
+            PKG_DIR: "."
+            TARGET_OS: "windows"
+            PKG_INCLUDE: "magma_*_cuda*_*.7z"
+        run: |
+            set -ex
+            bash .github/scripts/upload_aws_ossci.sh
--- a/.github/workflows/build-manywheel-images-s390x.yml
+++ b/.github/workflows/build-manywheel-images-s390x.yml
@ -0,0 +1,59 @@
+name: Build manywheel docker images for s390x
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+      - release/*
+    tags:
+      # NOTE: Binary build pipelines should only get triggered on release candidate or nightly builds
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+    paths:
+      - '.ci/docker/manywheel/*'
+      - '.ci/docker/manywheel/build_scripts/*'
+      - '.ci/docker/common/*'
+      - .github/workflows/build-manywheel-images-s390x.yml
+  pull_request:
+    paths:
+      - '.ci/docker/manywheel/*'
+      - '.ci/docker/manywheel/build_scripts/*'
+      - '.ci/docker/common/*'
+      - .github/workflows/build-manywheel-images-s390x.yml
+
+
+env:
+  DOCKER_REGISTRY: "docker.io"
+  DOCKER_BUILDKIT: 1
+  WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }}
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true
+
+jobs:
+  build-docker-cpu-s390x:
+    if: github.repository_owner == 'pytorch'
+    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
+    runs-on: linux.s390x
+    env:
+      GPU_ARCH_TYPE: cpu-s390x
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+          no-sudo: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        env:
+          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+          DOCKER_ID: ${{ secrets.DOCKER_ID }}
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        run: |
+          .ci/docker/manywheel/build.sh manylinuxs390x-builder:cpu-s390x
--- a/.github/workflows/build-manywheel-images.yml
+++ b/.github/workflows/build-manywheel-images.yml
@ -94,7 +94,7 @@ jobs:
    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
    strategy:
      matrix:
-        cuda_version: ["12.4", "12.1", "11.8"]
+        cuda_version: ["12.6", "12.4", "12.1", "11.8"]
    env:
      GPU_ARCH_TYPE: cuda-manylinux_2_28
      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@ -44,7 +44,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        py_vers: [ "3.9", "3.10", "3.11", "3.12" ]
+        py_vers: [ "3.9", "3.10", "3.11", "3.12", "3.13" ]
        device: ["cuda", "rocm", "xpu"]
        include:
          - device: "rocm"
@ -104,6 +104,9 @@ jobs:
          3.12)
            PYTHON_EXECUTABLE=/opt/python/cp312-cp312/bin/python
            ;;
+          3.13)
+            PYTHON_EXECUTABLE=/opt/python/cp313-cp313/bin/python
+            ;;
          *)
            echo "Unsupported python version ${PY_VERS}"
            exit 1
@ -204,126 +207,3 @@ jobs:
        run: |
          set -ex
          bash .circleci/scripts/binary_upload.sh
-
-  build-conda:
-    name: "Build Triton Conda"
-    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
-    strategy:
-      fail-fast: false
-      matrix:
-        py_vers: [ "3.9", "3.10", "3.11", "3.12" ]
-    timeout-minutes: 40
-    env:
-      DOCKER_IMAGE: pytorch/conda-builder:cpu
-      PY_VERS: ${{ matrix.py_vers }}
-    steps:
-      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        with:
-          github-secret: ${{ secrets.GITHUB_TOKEN }}
-
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          submodules: false
-
-      - name: Setup Linux
-        uses: ./.github/actions/setup-linux
-
-      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: ${{ env.DOCKER_IMAGE }}
-
-      - name: Build Triton conda package
-        env:
-          IS_RELEASE_TAG: ${{ startsWith(github.event.ref, 'refs/tags/v') }}
-        run: |
-          set -x
-          mkdir -p "${RUNNER_TEMP}/artifacts/"
-          container_name=$(docker run \
-            --tty \
-            --detach \
-            -v "${GITHUB_WORKSPACE}:/pytorch" \
-            -v "${RUNNER_TEMP}/artifacts:/artifacts" \
-            -w /artifacts/ \
-            "${DOCKER_IMAGE}" \
-          )
-
-          RELEASE=""
-          if [[ "${IS_RELEASE_TAG}" == true ]]; then
-            RELEASE="--release"
-          fi
-
-          docker exec -t "${container_name}" yum install -y llvm11 llvm11-devel llvm11-static llvm11-libs zlib-devel
-          docker exec -t "${container_name}" python /pytorch/.github/scripts/build_triton_wheel.py --build-conda --py-version="${PY_VERS}" $RELEASE
-          docker exec -t "${container_name}" chown -R 1000.1000 /artifacts
-
-      - uses: actions/upload-artifact@v4.4.0
-        with:
-          name: pytorch-triton-conda-${{ matrix.py_vers }}
-          if-no-files-found: error
-          path: ${{ runner.temp }}/artifacts/*
-
-      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
-        if: always()
-
-  upload-conda:
-    runs-on: ubuntu-22.04
-    needs: build-conda
-    container:
-      image: continuumio/miniconda3:4.12.0
-    environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }}
-    steps:
-      - uses: actions/checkout@v3
-
-      - name: Download Build Artifacts
-        uses: actions/download-artifact@v4.1.7
-        with:
-          # Download all available artifacts
-          path: ${{ runner.temp }}/artifacts-all
-
-      - name: Select Conda Artifacts
-        shell: bash
-        run: |
-          set -x
-          mkdir -p "${RUNNER_TEMP}/artifacts/"
-          mv "${RUNNER_TEMP}"/artifacts-all/pytorch-triton-conda-*/* "${RUNNER_TEMP}/artifacts/"
-
-      - name: Set DRY_RUN (only for tagged pushes)
-        if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }}
-        shell: bash
-        run: |
-          echo "DRY_RUN=disabled" >> "$GITHUB_ENV"
-
-      - name: Set UPLOAD_CHANNEL (only for tagged pushes)
-        if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/v') }}
-        shell: bash
-        run: |
-          set -ex
-
-          # reference ends with an RC suffix
-          if [[ "${GITHUB_REF_NAME}" = *-rc[0-9]* ]]; then
-            echo "UPLOAD_CHANNEL=test" >> "$GITHUB_ENV"
-          fi
-
-      # NB: This step is gated by DRY_RUN, which is enabled everywhere except nightly and release branches
-      - name: Upload binaries to Anaconda
-        env:
-          PACKAGE_TYPE: conda
-          PKG_DIR: ${{ runner.temp }}/artifacts
-          # When running these on pull_request events these should be blank
-          CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
-          CONDA_PYTORCHBOT_TOKEN_TEST: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
-        shell: bash
-        run: |
-          set -ex
-
-          if [[ "${UPLOAD_CHANNEL:-nightly}" == "nightly" ]]; then
-            export ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN}"
-          else
-            export ANACONDA_API_TOKEN="${CONDA_PYTORCHBOT_TOKEN_TEST}"
-          fi
-          bash .circleci/scripts/binary_upload.sh
--- a/.github/workflows/check_mergeability_ghstack.yml
+++ b/.github/workflows/check_mergeability_ghstack.yml
@ -56,7 +56,7 @@ jobs:
          cache: pip
          architecture: x64

-      - run: pip install pyyaml==6.0 rockset==1.0.3
+      - run: pip install pyyaml==6.0
        shell: bash

      - name: Verify mergeability
--- a/Show More
+++ b/Show More
 @ -1 +1 @@
 .1.1
 .5.0
				`@ -0,0 +1 @@`
				`6cd83808c6e8bc7a44028e05112b3ab4e579bcc73202ed14733f66661127e213 magma-2.6.1.tar.gz`