set thread_work_size to 4 for unrolled kernel (#154541)

set thread_work_size to 4 for unrolled kernel (#152396)

Previous PRs enabling 8-vectorization inadvertently regressed unrolled kernel perf.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/152396
Approved by: https://github.com/BoyuanFeng, https://github.com/msaroufim, https://github.com/malfet, https://github.com/Aidyn-A, https://github.com/atalman

(cherry picked from commit adebb8b11226316d45ce114beb7ac239d0f3abdd)

Co-authored-by: Natalia Gimelshein <ngimel@meta.com>
[c10d] Fix extra CUDA context created by barrier (#152834)
2025-10-24 15:44:58 +08:00 · 2025-05-28 21:18:53 -04:00 · 2025-05-27 18:41:02 -04:00 · 2025-05-27 18:39:21 -04:00 · 2025-05-27 10:56:10 -04:00 · 2025-05-22 15:23:24 -04:00
1781 changed files with 47175 additions and 83216 deletions
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@@ -55,9 +55,22 @@ def build_ArmComputeLibrary() -> None:
        shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")


-def update_wheel(wheel_path, desired_cuda) -> None:
+def replace_tag(filename) -> None:
+    with open(filename) as f:
+        lines = f.readlines()
+    for i, line in enumerate(lines):
+        if line.startswith("Tag:"):
+            lines[i] = line.replace("-linux_", "-manylinux_2_28_")
+            print(f"Updated tag from {line} to {lines[i]}")
+            break
+
+    with open(filename, "w") as f:
+        f.writelines(lines)
+
+
+def package_cuda_wheel(wheel_path, desired_cuda) -> None:
    """
-    Update the cuda wheel libraries
+    Package the cuda wheel libraries
    """
    folder = os.path.dirname(wheel_path)
    wheelname = os.path.basename(wheel_path)
@@ -88,30 +101,19 @@ def update_wheel(wheel_path, desired_cuda) -> None:
        "/usr/lib64/libgfortran.so.5",
        "/acl/build/libarm_compute.so",
        "/acl/build/libarm_compute_graph.so",
+        "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
+        "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
+        "/usr/local/lib/libnvpl_lapack_core.so.0",
+        "/usr/local/lib/libnvpl_blas_core.so.0",
    ]
-    if enable_cuda:
+
+    if "128" in desired_cuda:
        libs_to_copy += [
-            "/usr/local/lib/libnvpl_lapack_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_blas_lp64_gomp.so.0",
-            "/usr/local/lib/libnvpl_lapack_core.so.0",
-            "/usr/local/lib/libnvpl_blas_core.so.0",
-        ]
-        if "126" in desired_cuda:
-            libs_to_copy += [
-                "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.6",
-                "/usr/local/cuda/lib64/libcufile.so.0",
-                "/usr/local/cuda/lib64/libcufile_rdma.so.1",
-            ]
-        elif "128" in desired_cuda:
-            libs_to_copy += [
-                "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
-                "/usr/local/cuda/lib64/libcufile.so.0",
-                "/usr/local/cuda/lib64/libcufile_rdma.so.1",
-            ]
-    else:
-        libs_to_copy += [
-            "/opt/OpenBLAS/lib/libopenblas.so.0",
+            "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.8",
+            "/usr/local/cuda/lib64/libcufile.so.0",
+            "/usr/local/cuda/lib64/libcufile_rdma.so.1",
        ]
+
    # Copy libraries to unzipped_folder/a/lib
    for lib_path in libs_to_copy:
        lib_name = os.path.basename(lib_path)
@@ -120,6 +122,13 @@ def update_wheel(wheel_path, desired_cuda) -> None:
            f"cd {folder}/tmp/torch/lib/; "
            f"patchelf --set-rpath '$ORIGIN' --force-rpath {folder}/tmp/torch/lib/{lib_name}"
        )
+
+    # Make sure the wheel is tagged with manylinux_2_28
+    for f in os.scandir(f"{folder}/tmp/"):
+        if f.is_dir() and f.name.endswith(".dist-info"):
+            replace_tag(f"{f.path}/WHEEL")
+            break
+
    os.mkdir(f"{folder}/cuda_wheel")
    os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *")
    shutil.move(
@@ -242,6 +251,6 @@ if __name__ == "__main__":
        print("Updating Cuda Dependency")
        filename = os.listdir("/pytorch/dist/")
        wheel_path = f"/pytorch/dist/{filename[0]}"
-        update_wheel(wheel_path, desired_cuda)
+        package_cuda_wheel(wheel_path, desired_cuda)
    pytorch_wheel_name = complete_wheel("/pytorch/")
    print(f"Build Complete. Created {pytorch_wheel_name}..")
--- a/.ci/aarch64_linux/build_aarch64_wheel.py
+++ b/.ci/aarch64_linux/build_aarch64_wheel.py
@@ -19,11 +19,13 @@ import boto3

 # AMI images for us-east-1, change the following based on your ~/.aws/config
 os_amis = {
+    "ubuntu18_04": "ami-078eece1d8119409f",  # login_name: ubuntu
    "ubuntu20_04": "ami-052eac90edaa9d08f",  # login_name: ubuntu
    "ubuntu22_04": "ami-0c6c29c5125214c77",  # login_name: ubuntu
    "redhat8": "ami-0698b90665a2ddcf1",  # login_name: ec2-user
 }

+ubuntu18_04_ami = os_amis["ubuntu18_04"]
 ubuntu20_04_ami = os_amis["ubuntu20_04"]


@@ -657,6 +659,18 @@ def configure_system(
            "sudo apt-get install -y python3-dev python3-yaml python3-setuptools python3-wheel python3-pip"
        )
    host.run_cmd("pip3 install dataclasses typing-extensions")
+    # Install and switch to gcc-8 on Ubuntu-18.04
+    if not host.using_docker() and host.ami == ubuntu18_04_ami and compiler == "gcc-8":
+        host.run_cmd("sudo apt-get install -y g++-8 gfortran-8")
+        host.run_cmd(
+            "sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 100"
+        )
+        host.run_cmd(
+            "sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-8 100"
+        )
+        host.run_cmd(
+            "sudo update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-8 100"
+        )
    if not use_conda:
        print("Installing Cython + numpy from PyPy")
        host.run_cmd("sudo pip3 install Cython")
@@ -1012,7 +1026,7 @@ if __name__ == "__main__":
        install_condaforge_python(host, args.python_version)
        sys.exit(0)

-    python_version = args.python_version if args.python_version is not None else "3.9"
+    python_version = args.python_version if args.python_version is not None else "3.8"

    if args.use_torch_from_pypi:
        configure_system(host, compiler=args.compiler, python_version=python_version)
--- a/.ci/docker/almalinux/Dockerfile
+++ b/.ci/docker/almalinux/Dockerfile
@@ -44,8 +44,6 @@ FROM base as cuda
 ARG CUDA_VERSION=12.4
 RUN rm -rf /usr/local/cuda-*
 ADD ./common/install_cuda.sh install_cuda.sh
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
 ENV CUDA_HOME=/usr/local/cuda-${CUDA_VERSION}
 # Preserve CUDA_VERSION for the builds
 ENV CUDA_VERSION=${CUDA_VERSION}
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@@ -1,60 +1,82 @@
 #!/usr/bin/env bash
 # Script used only in CD pipeline

-set -exou pipefail
+set -eou pipefail

 image="$1"
 shift

 if [ -z "${image}" ]; then
-  echo "Usage: $0 IMAGENAME:ARCHTAG"
+  echo "Usage: $0 IMAGE"
  exit 1
 fi

-# Go from imagename:tag to tag
-DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
+DOCKER_IMAGE_NAME="pytorch/${image}"

-CUDA_VERSION=""
-if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
-    # extract cuda version from image name and tag.  e.g. manylinux2_28-builder:cuda12.8 returns 12.8
-    CUDA_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
-fi
-
-case ${DOCKER_TAG_PREFIX} in
-  cpu)
-    BASE_TARGET=base
-    ;;
-  cuda*)
-    BASE_TARGET=cuda${CUDA_VERSION}
-    ;;
-  *)
-    echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"
-    exit 1
-    ;;
-esac
-
-# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
-# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
-sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
-sudo systemctl daemon-reload
-sudo systemctl restart docker

 export DOCKER_BUILDKIT=1
 TOPDIR=$(git rev-parse --show-toplevel)
-tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')

-docker build \
-  --target final \
-  --progress plain \
-  --build-arg "BASE_TARGET=${BASE_TARGET}" \
-  --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
-  --build-arg "DEVTOOLSET_VERSION=11" \
-  -t ${tmp_tag} \
-  $@ \
-  -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
-  ${TOPDIR}/.ci/docker/
+CUDA_VERSION=${CUDA_VERSION:-12.1}

-if [ -n "${CUDA_VERSION}" ]; then
+case ${CUDA_VERSION} in
+  cpu)
+    BASE_TARGET=base
+    DOCKER_TAG=cpu
+    ;;
+  all)
+    BASE_TARGET=all_cuda
+    DOCKER_TAG=latest
+    ;;
+  *)
+    BASE_TARGET=cuda${CUDA_VERSION}
+    DOCKER_TAG=cuda${CUDA_VERSION}
+    ;;
+esac
+
+
+(
+  set -x
+  # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
+  # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
+  sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
+  sudo systemctl daemon-reload
+  sudo systemctl restart docker
+
+  docker build \
+    --target final \
+    --progress plain \
+    --build-arg "BASE_TARGET=${BASE_TARGET}" \
+    --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
+    --build-arg "DEVTOOLSET_VERSION=11" \
+    -t ${DOCKER_IMAGE_NAME} \
+    $@ \
+    -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
+    ${TOPDIR}/.ci/docker/
+)
+
+if [[ "${DOCKER_TAG}" =~ ^cuda* ]]; then
  # Test that we're using the right CUDA compiler
-  docker run --rm "${tmp_tag}" nvcc --version | grep "cuda_${CUDA_VERSION}"
+  (
+    set -x
+    docker run --rm "${DOCKER_IMAGE_NAME}" nvcc --version | grep "cuda_${CUDA_VERSION}"
+  )
+fi
+
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE_NAME}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE_NAME}-${GIT_COMMIT_SHA}
+if [[ "${WITH_PUSH:-}" == true ]]; then
+  (
+    set -x
+    docker push "${DOCKER_IMAGE_NAME}"
+    if [[ -n ${GITHUB_REF} ]]; then
+        docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_BRANCH_TAG}
+        docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_SHA_TAG}
+        docker push "${DOCKER_IMAGE_BRANCH_TAG}"
+        docker push "${DOCKER_IMAGE_SHA_TAG}"
+    fi
+  )
 fi
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@@ -105,6 +105,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -118,6 +119,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -132,6 +134,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -146,6 +149,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.13
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -160,6 +164,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -173,6 +178,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -187,6 +193,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.12
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -201,6 +208,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.13
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -215,6 +223,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    UCX_COMMIT=${_UCX_COMMIT}
@@ -226,6 +235,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=10
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    ONNX=yes
@@ -234,7 +244,10 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=10
    PROTOBUF=yes
+    DB=yes
    VISION=yes
+    VULKAN_SDK_VERSION=1.2.162.1
+    SWIFTSHADER=yes
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
@@ -242,7 +255,10 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.11
    CLANG_VERSION=10
    PROTOBUF=yes
+    DB=yes
    VISION=yes
+    VULKAN_SDK_VERSION=1.2.162.1
+    SWIFTSHADER=yes
    CONDA_CMAKE=yes
    TRITON=yes
    ;;
@@ -250,6 +266,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=9
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    TRITON=yes
@@ -258,6 +275,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    ROCM_VERSION=6.2.4
    NINJA_VERSION=1.9.0
@@ -272,6 +290,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    ROCM_VERSION=6.3
    NINJA_VERSION=1.9.0
@@ -286,6 +305,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    XPU_VERSION=0.5
    NINJA_VERSION=1.9.0
@@ -296,6 +316,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    XPU_VERSION=2025.0
    NINJA_VERSION=1.9.0
@@ -306,6 +327,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    CONDA_CMAKE=yes
@@ -319,6 +341,7 @@ case "$image" in
    CUDNN_VERSION=9
    CLANG_VERSION=12
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    TRITON=yes
    ;;
@@ -326,6 +349,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=12
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    TRITON=yes
@@ -346,6 +370,7 @@ case "$image" in
    ANACONDA_PYTHON_VERSION=3.9
    GCC_VERSION=11
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    KATEX=yes
    CONDA_CMAKE=yes
@@ -378,19 +403,20 @@ case "$image" in
    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
    # We will need to update mypy version eventually, but that's for another day. The task
    # would be to upgrade mypy to 1.0.0 with Python 3.11
-    PYTHON_VERSION=3.9
-    PIP_CMAKE=yes
+    ANACONDA_PYTHON_VERSION=3.9
+    CONDA_CMAKE=yes
    ;;
  pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
-    PYTHON_VERSION=3.9
+    ANACONDA_PYTHON_VERSION=3.9
    CUDA_VERSION=11.8
-    PIP_CMAKE=yes
+    CONDA_CMAKE=yes
    ;;
  pytorch-linux-jammy-aarch64-py3.10-gcc11)
    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    ACL=yes
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    # snadampal: skipping llvm src build install because the current version
@@ -402,6 +428,7 @@ case "$image" in
    GCC_VERSION=11
    ACL=yes
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    CONDA_CMAKE=yes
    # snadampal: skipping llvm src build install because the current version
@@ -412,6 +439,7 @@ case "$image" in
  *)
    # Catch-all for builds that are not hardcoded.
    PROTOBUF=yes
+    DB=yes
    VISION=yes
    echo "image '$image' did not match an existing build configuration"
    if [[ "$image" == *py* ]]; then
@@ -460,21 +488,14 @@ if [[ "$image" == *cuda*  && ${OS} == "ubuntu" ]]; then
  fi
 fi

-no_cache_flag=""
-progress_flag=""
-# Do not use cache and progress=plain when in CI
-if [[ -n "${CI:-}" ]]; then
-  no_cache_flag="--no-cache"
-  progress_flag="--progress=plain"
-fi
-
 # Build image
 docker build \
-       ${no_cache_flag} \
-       ${progress_flag} \
+       --no-cache \
+       --progress=plain \
       --build-arg "BUILD_ENVIRONMENT=${image}" \
       --build-arg "PROTOBUF=${PROTOBUF:-}" \
       --build-arg "LLVMDEV=${LLVMDEV:-}" \
+       --build-arg "DB=${DB:-}" \
       --build-arg "VISION=${VISION:-}" \
       --build-arg "UBUNTU_VERSION=${UBUNTU_VERSION}" \
       --build-arg "CENTOS_VERSION=${CENTOS_VERSION}" \
@@ -482,12 +503,13 @@ docker build \
       --build-arg "GLIBC_VERSION=${GLIBC_VERSION}" \
       --build-arg "CLANG_VERSION=${CLANG_VERSION}" \
       --build-arg "ANACONDA_PYTHON_VERSION=${ANACONDA_PYTHON_VERSION}" \
-       --build-arg "PYTHON_VERSION=${PYTHON_VERSION}" \
       --build-arg "GCC_VERSION=${GCC_VERSION}" \
       --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
       --build-arg "CUDNN_VERSION=${CUDNN_VERSION}" \
       --build-arg "TENSORRT_VERSION=${TENSORRT_VERSION}" \
       --build-arg "GRADLE_VERSION=${GRADLE_VERSION}" \
+       --build-arg "VULKAN_SDK_VERSION=${VULKAN_SDK_VERSION}" \
+       --build-arg "SWIFTSHADER=${SWIFTSHADER}" \
       --build-arg "CMAKE_VERSION=${CMAKE_VERSION:-}" \
       --build-arg "NINJA_VERSION=${NINJA_VERSION:-}" \
       --build-arg "KATEX=${KATEX:-}" \
@@ -497,7 +519,6 @@ docker build \
       --build-arg "UCX_COMMIT=${UCX_COMMIT}" \
       --build-arg "UCC_COMMIT=${UCC_COMMIT}" \
       --build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \
-       --build-arg "PIP_CMAKE=${PIP_CMAKE}" \
       --build-arg "TRITON=${TRITON}" \
       --build-arg "TRITON_CPU=${TRITON_CPU}" \
       --build-arg "ONNX=${ONNX}" \
@@ -523,7 +544,7 @@ docker build \
 UBUNTU_VERSION=$(echo ${UBUNTU_VERSION} | sed 's/-rc$//')

 function drun() {
-  docker run --rm "$tmp_tag" "$@"
+  docker run --rm "$tmp_tag" $*
 }

 if [[ "$OS" == "ubuntu" ]]; then
@@ -571,14 +592,3 @@ if [ -n "$KATEX" ]; then
    exit 1
  fi
 fi
-
-HAS_TRITON=$(drun python -c "import triton" > /dev/null 2>&1 && echo "yes" || echo "no")
-if [[ -n "$TRITON" || -n "$TRITON_CPU" ]]; then
-  if [ "$HAS_TRITON" = "no" ]; then
-    echo "expecting triton to be installed, but it is not"
-    exit 1
-  fi
-elif [ "$HAS_TRITON" = "yes" ]; then
-  echo "expecting triton to not be installed, but it is"
-  exit 1
-fi
--- a/.ci/docker/centos-rocm/Dockerfile
+++ b/.ci/docker/centos-rocm/Dockerfile
@@ -55,6 +55,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
 RUN rm install_protobuf.sh
 ENV INSTALLED_PROTOBUF ${PROTOBUF}

+# (optional) Install database packages like LMDB and LevelDB
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@@ -68,7 +75,7 @@ COPY ./common/install_rocm.sh install_rocm.sh
 RUN bash ./install_rocm.sh
 RUN rm install_rocm.sh
 COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
-RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
+RUN bash ./install_rocm_magma.sh
 RUN rm install_rocm_magma.sh
 COPY ./common/install_amdsmi.sh install_amdsmi.sh
 RUN bash ./install_amdsmi.sh
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@@ -1 +1 @@
-381ae5d57d35c165d98df728380b20fbde350392
+ebe8522378c3f9944aaaef44868f5ececdd845fc
--- a/.ci/docker/common/install_clang.sh
+++ b/.ci/docker/common/install_clang.sh
@@ -4,10 +4,16 @@ set -ex

 if [ -n "$CLANG_VERSION" ]; then

-  if [[ $UBUNTU_VERSION == 22.04 ]]; then
+  if [[ $CLANG_VERSION == 9 && $UBUNTU_VERSION == 18.04 ]]; then
+    sudo apt-get update
+    # gpg-agent is not available by default on 18.04
+    sudo apt-get install  -y --no-install-recommends gpg-agent
+    wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add  -
+    apt-add-repository "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-${CLANG_VERSION} main"
+  elif [[ $UBUNTU_VERSION == 22.04 ]]; then
    # work around ubuntu apt-get conflicts
    sudo apt-get -y -f install
-    wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
+    wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add  -
    if [[ $CLANG_VERSION == 18 ]]; then
      apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main"
    fi
@@ -35,7 +41,7 @@ if [ -n "$CLANG_VERSION" ]; then
  # clang's packaging is a little messed up (the runtime libs aren't
  # added into the linker path), so give it a little help
  clang_lib=("/usr/lib/llvm-$CLANG_VERSION/lib/clang/"*"/lib/linux")
-  echo "$clang_lib" >/etc/ld.so.conf.d/clang.conf
+  echo "$clang_lib" > /etc/ld.so.conf.d/clang.conf
  ldconfig

  # Cleanup package manager
--- a/.ci/docker/common/install_conda.sh
+++ b/.ci/docker/common/install_conda.sh
@@ -62,7 +62,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then

  # libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30
  # which is provided in libstdcxx 12 and up.
-  conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge
+  conda_install libstdcxx-ng=12.3.0 -c conda-forge

  # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
  if [[ $(uname -m) == "aarch64" ]]; then
--- a/.ci/docker/common/install_cpython.sh
+++ b/.ci/docker/common/install_cpython.sh
@@ -7,7 +7,7 @@ PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/hea
 GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py

 # Python versions to be installed in /opt/$VERSION_NO
-CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"}
+CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"}

 function check_var {
    if [ -z "$1" ]; then
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@@ -2,6 +2,7 @@

 set -ex

+NCCL_VERSION=v2.26.2-1
 CUDNN_VERSION=9.5.1.17

 function install_cusparselt_040 {
@@ -39,7 +40,8 @@ function install_cusparselt_063 {

 function install_118 {
    CUDNN_VERSION=9.1.0.70
-    echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.4.0"
+    NCCL_VERSION=v2.21.5-1
+    echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0"
    rm -rf /usr/local/cuda-11.8 /usr/local/cuda
    # install CUDA 11.8.0 in the same container
    wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
@@ -57,7 +59,14 @@ function install_118 {
    cd ..
    rm -rf tmp_cudnn

-    CUDA_VERSION=11.8 bash install_nccl.sh
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf nccl

    install_cusparselt_040

@@ -66,7 +75,7 @@ function install_118 {

 function install_124 {
  CUDNN_VERSION=9.1.0.70
-  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.2"
+  echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
  rm -rf /usr/local/cuda-12.4 /usr/local/cuda
  # install CUDA 12.4.1 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run
@@ -84,7 +93,14 @@ function install_124 {
  cd ..
  rm -rf tmp_cudnn

-  CUDA_VERSION=12.4 bash install_nccl.sh
+  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl && make -j src.build
+  cp -a build/include/* /usr/local/cuda/include/
+  cp -a build/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf nccl

  install_cusparselt_062

@@ -92,7 +108,7 @@ function install_124 {
 }

 function install_126 {
-  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
+  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
  rm -rf /usr/local/cuda-12.6 /usr/local/cuda
  # install CUDA 12.6.3 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.6.3/local_installers/cuda_12.6.3_560.35.05_linux.run
@@ -110,7 +126,14 @@ function install_126 {
  cd ..
  rm -rf tmp_cudnn

-  CUDA_VERSION=12.6 bash install_nccl.sh
+  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl && make -j src.build
+  cp -a build/include/* /usr/local/cuda/include/
+  cp -a build/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf nccl

  install_cusparselt_063

@@ -217,8 +240,8 @@ function prune_126 {
 }

 function install_128 {
-  CUDNN_VERSION=9.8.0.87
-  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
+  CUDNN_VERSION=9.7.1.26
+  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
  rm -rf /usr/local/cuda-12.8 /usr/local/cuda
  # install CUDA 12.8.0 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux.run
@@ -236,7 +259,14 @@ function install_128 {
  cd ..
  rm -rf tmp_cudnn

-  CUDA_VERSION=12.8 bash install_nccl.sh
+  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl && make -j src.build
+  cp -a build/include/* /usr/local/cuda/include/
+  cp -a build/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf nccl

  install_cusparselt_063

--- a/.ci/docker/common/install_cuda_aarch64.sh
+++ b/.ci/docker/common/install_cuda_aarch64.sh
@@ -3,6 +3,7 @@

 set -ex

+NCCL_VERSION=v2.26.2-1
 CUDNN_VERSION=9.8.0.87

 function install_cusparselt_063 {
@@ -17,7 +18,7 @@ function install_cusparselt_063 {
 }

 function install_128 {
-  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.6.3"
+  echo "Installing CUDA 12.8.0 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.3"
  rm -rf /usr/local/cuda-12.8 /usr/local/cuda
  # install CUDA 12.8.0 in the same container
  wget -q https://developer.download.nvidia.com/compute/cuda/12.8.0/local_installers/cuda_12.8.0_570.86.10_linux_sbsa.run
@@ -35,7 +36,14 @@ function install_128 {
  cd ..
  rm -rf tmp_cudnn

-  CUDA_VERSION=12.8 bash install_nccl.sh
+  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+  git clone -b ${NCCL_VERSION} --depth 1 https://github.com/NVIDIA/nccl.git
+  cd nccl && make -j src.build
+  cp -a build/include/* /usr/local/cuda/include/
+  cp -a build/lib/* /usr/local/cuda/lib64/
+  cd ..
+  rm -rf nccl

  install_cusparselt_063

--- a/.ci/docker/common/install_cudnn.sh
+++ b/.ci/docker/common/install_cudnn.sh
@@ -5,7 +5,7 @@ if [[ -n "${CUDNN_VERSION}" ]]; then
    mkdir tmp_cudnn
    pushd tmp_cudnn
    if [[ ${CUDA_VERSION:0:4} == "12.8" ]]; then
-        CUDNN_NAME="cudnn-linux-x86_64-9.8.0.87_cuda12-archive"
+        CUDNN_NAME="cudnn-linux-x86_64-9.7.1.26_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:4} == "12.6" ]]; then
        CUDNN_NAME="cudnn-linux-x86_64-9.5.1.17_cuda12-archive"
    elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
--- a/.ci/docker/common/install_db.sh
+++ b/.ci/docker/common/install_db.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+set -ex
+
+install_ubuntu() {
+  apt-get update
+
+  # Cleanup
+  apt-get autoclean && apt-get clean
+  rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+}
+
+install_centos() {
+  # Need EPEL for many packages we depend on.
+  # See http://fedoraproject.org/wiki/EPEL
+  yum --enablerepo=extras install -y epel-release
+
+  # Cleanup
+  yum clean all
+  rm -rf /var/cache/yum
+  rm -rf /var/lib/yum/yumdb
+  rm -rf /var/lib/yum/history
+}
+
+# Install base packages depending on the base OS
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  ubuntu)
+    install_ubuntu
+    ;;
+  centos)
+    install_centos
+    ;;
+  *)
+    echo "Unable to determine OS..."
+    exit 1
+    ;;
+esac
--- a/.ci/docker/common/install_inductor_benchmark_deps.sh
+++ b/.ci/docker/common/install_inductor_benchmark_deps.sh
@@ -14,13 +14,6 @@ function install_timm() {
  local commit
  commit=$(get_pinned_commit timm)

-  # TODO (huydhn): There is no torchvision release on 3.13 when I write this, so
-  # I'm using nightly here instead. We just need to package to be able to install
-  # TIMM. Removing this once vision has a release on 3.13
-  if [[ "${ANACONDA_PYTHON_VERSION}" == "3.13" ]]; then
-    pip_install --pre torch torchvision --index-url https://download.pytorch.org/whl/nightly/cu124
-  fi
-
  pip_install "git+https://github.com/huggingface/pytorch-image-models@${commit}"
  # Clean up
  conda_run pip uninstall -y cmake torch torchvision triton
--- a/.ci/docker/common/install_linter.sh
+++ b/.ci/docker/common/install_linter.sh
@@ -2,6 +2,8 @@

 set -ex

+source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
+
 if [ -n "${UBUNTU_VERSION}" ]; then
  apt update
  apt-get install -y clang doxygen git graphviz nodejs npm libtinfo5
@@ -13,8 +15,8 @@ chown -R jenkins pytorch

 pushd pytorch
 # Install all linter dependencies
-pip install -r requirements.txt
-lintrunner init
+pip_install -r requirements.txt
+conda_run lintrunner init

 # Cache .lintbin directory as part of the Docker image
 cp -r .lintbin /tmp
--- a/.ci/docker/common/install_nccl.sh
+++ b/.ci/docker/common/install_nccl.sh
@@ -1,26 +0,0 @@
-#!/bin/bash
-
-set -ex
-
-NCCL_VERSION=""
-if [[ ${CUDA_VERSION:0:2} == "11" ]]; then
-  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt)
-elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
-  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt)
-else
-  echo "Unexpected CUDA_VERSION ${CUDA_VERSION}"
-  exit 1
-fi
-
-if [[ -n "${NCCL_VERSION}" ]]; then
-  # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
-  # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
-  git clone -b $NCCL_VERSION --depth 1 https://github.com/NVIDIA/nccl.git
-  pushd nccl
-  make -j src.build
-  cp -a build/include/* /usr/local/cuda/include/
-  cp -a build/lib/* /usr/local/cuda/lib64/
-  popd
-  rm -rf nccl
-  ldconfig
-fi
--- a/.ci/docker/common/install_python.sh
+++ b/.ci/docker/common/install_python.sh
@@ -1,18 +0,0 @@
-#!/bin/bash
-set -ex
-
-apt-get update
-# Use deadsnakes in case we need an older python version
-sudo add-apt-repository ppa:deadsnakes/ppa
-apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python3-pip python${PYTHON_VERSION}-venv
-
-# Use a venv because uv and some other package managers don't support --user install
-ln -s /usr/bin/python${PYTHON_VERSION} /usr/bin/python
-python -m venv /var/lib/jenkins/ci_env
-source /var/lib/jenkins/ci_env/bin/activate
-
-python -mpip install --upgrade pip
-python -mpip install -r /opt/requirements-ci.txt
-if [ -n "${PIP_CMAKE}" ]; then
-  python -mpip install cmake==3.31.6
-fi
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@@ -8,6 +8,10 @@ ver() {

 install_ubuntu() {
    apt-get update
+    if [[ $UBUNTU_VERSION == 18.04 ]]; then
+      # gpg-agent is not available by default on 18.04
+      apt-get install -y --no-install-recommends gpg-agent
+    fi
    if [[ $UBUNTU_VERSION == 20.04 ]]; then
      # gpg-agent is not available by default on 20.04
      apt-get install -y --no-install-recommends gpg-agent
@ -19,13 +23,6 @@ install_ubuntu() {
    apt-get install -y libc++1
    apt-get install -y libc++abi1

-    # Make sure rocm packages from repo.radeon.com have highest priority
-    cat << EOF > /etc/apt/preferences.d/rocm-pin-600
-Package: *
-Pin: release o=repo.radeon.com
-Pin-Priority: 600
-EOF
-
    # Add amdgpu repository
    UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
    echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -1,32 +1,50 @@
-#!/usr/bin/env bash
-# Script used only in CD pipeline
+#!/bin/bash
+# Script used in CI and CD pipeline

-set -eou pipefail
+set -ex

-function do_install() {
-    rocm_version=$1
-    rocm_version_nodot=${1//./}
+# Magma build scripts need `python`
+ln -sf /usr/bin/python3 /usr/bin/python

-    # Version 2.7.2 + ROCm related updates
-    MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
-    magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
+ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
+case "$ID" in
+  almalinux)
+    yum install -y gcc-gfortran
+    ;;
+  *)
+    echo "No preinstalls to build magma..."
+    ;;
+esac

-    rocm_dir="/opt/rocm"
-    (
-        set -x
-        tmp_dir=$(mktemp -d)
-        pushd ${tmp_dir}
-        curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
-        if tar -xvf "${magma_archive}"
-        then
-            mkdir -p "${rocm_dir}/magma"
-            mv include "${rocm_dir}/magma/include"
-            mv lib "${rocm_dir}/magma/lib"
-        else
-            echo "${magma_archive} not found, skipping magma install"
-        fi
-        popd
-    )
-}
+MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}

-do_install $1
+# "install" hipMAGMA into /opt/rocm/magma by copying after build
+git clone https://bitbucket.org/icl/magma.git
+pushd magma
+
+# Version 2.7.2 + ROCm related updates
+git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6
+
+cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
+echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
+if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then
+    echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc
+fi
+echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc
+echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
+export PATH="${PATH}:/opt/rocm/bin"
+if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
+  amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'`
+else
+  amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs`
+fi
+for arch in $amdgpu_targets; do
+  echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc
+done
+# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
+sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
+make -f make.gen.hipMAGMA -j $(nproc)
+LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}"
+make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}"
+popd
+mv magma /opt/rocm
--- a/.ci/docker/common/install_swiftshader.sh
+++ b/.ci/docker/common/install_swiftshader.sh
@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -ex
+
+[ -n "${SWIFTSHADER}" ]
+
+retry () {  # run the given command; on failure retry after 1, 2, 4 and 8 s
+    "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") || (sleep 4 && "$@") || (sleep 8 && "$@")
+}
+
+_https_amazon_aws=https://ossci-android.s3.amazonaws.com
+
+# SwiftShader
+_swiftshader_dir=/var/lib/jenkins/swiftshader
+_swiftshader_file_targz=swiftshader-abe07b943-prebuilt.tar.gz
+mkdir -p $_swiftshader_dir
+_tmp_swiftshader_targz="/tmp/${_swiftshader_file_targz}"
+
+curl --silent --show-error --location --fail --retry 3 \
+  --output "${_tmp_swiftshader_targz}" "$_https_amazon_aws/${_swiftshader_file_targz}"
+
+tar -C "${_swiftshader_dir}" -xzf "${_tmp_swiftshader_targz}"
+
+export VK_ICD_FILENAMES="${_swiftshader_dir}/build/Linux/vk_swiftshader_icd.json"
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@ -2,12 +2,6 @@

 set -ex

-mkdir -p /opt/triton
-if [ -z "${TRITON}" ] && [ -z "${TRITON_CPU}" ]; then
-  echo "TRITON and TRITON_CPU are not set. Exiting..."
-  exit 0
-fi
-
 source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"

 get_conda_version() {
@ -58,7 +52,6 @@ cd triton
 as_jenkins git checkout ${TRITON_PINNED_COMMIT}
 as_jenkins git submodule update --init --recursive
 cd python
-pip_install pybind11==2.13.6

 # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527
 as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py
@ -67,22 +60,17 @@ if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}"
  # Triton needs at least gcc-9 to build
  apt-get install -y g++-9

-  CXX=g++-9 conda_run python setup.py bdist_wheel
+  CXX=g++-9 pip_install .
 elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then
  # Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain
  add-apt-repository -y ppa:ubuntu-toolchain-r/test
  apt-get install -y g++-9

-  CXX=g++-9 conda_run python setup.py bdist_wheel
+  CXX=g++-9 pip_install .
 else
-  conda_run python setup.py bdist_wheel
+  pip_install .
 fi

-# Copy the wheel to /opt for multi stage docker builds
-cp dist/*.whl /opt/triton
-# Install the wheel for docker builds that don't use multi stage
-pip_install dist/*.whl
-
 if [ -n "${CONDA_CMAKE}" ]; then
  # TODO: This is to make sure that the same cmake and numpy version from install conda
  # script is used. Without this step, the newer cmake version (3.25.2) downloaded by
--- a/.ci/docker/common/install_vulkan_sdk.sh
+++ b/.ci/docker/common/install_vulkan_sdk.sh
@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -ex
+
+[ -n "${VULKAN_SDK_VERSION}" ]
+
+retry () {  # run the given command; on failure retry after 1, 2, 4 and 8 s
+    "$@" || (sleep 1 && "$@") || (sleep 2 && "$@") || (sleep 4 && "$@") || (sleep 8 && "$@")
+}
+
+_vulkansdk_dir=/var/lib/jenkins/vulkansdk
+_tmp_vulkansdk_targz=/tmp/vulkansdk.tar.gz
+
+curl \
+  --silent \
+  --show-error \
+  --location \
+  --fail \
+  --retry 3 \
+  --output "${_tmp_vulkansdk_targz}" "https://ossci-android.s3.amazonaws.com/vulkansdk-linux-x86_64-${VULKAN_SDK_VERSION}.tar.gz"
+
+mkdir -p "${_vulkansdk_dir}"
+tar -C "${_vulkansdk_dir}" -xzf "${_tmp_vulkansdk_targz}" --strip-components 1
+rm -rf "${_tmp_vulkansdk_targz}"
--- a/.ci/docker/common/install_xpu.sh
+++ b/.ci/docker/common/install_xpu.sh
@ -47,6 +47,9 @@ function install_ubuntu() {
    # Development Packages
    apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
    # Install Intel Support Packages
+    if [[ "$XPU_VERSION" == "2025.0" ]]; then
+        XPU_PACKAGES="${XPU_PACKAGES} intel-oneapi-dnnl=2025.0.1-6"
+    fi
    apt-get install -y ${XPU_PACKAGES}

    # Cleanup
@ -82,6 +85,9 @@ gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.
 EOF

    # Install Intel Support Packages
+    if [[ "$XPU_VERSION" == "2025.0" ]]; then
+        XPU_PACKAGES="${XPU_PACKAGES} intel-oneapi-dnnl-2025.0.1-6"
+    fi
    yum install -y ${XPU_PACKAGES}
    # The xpu-smi packages
    dnf install -y xpu-smi
--- a/.ci/docker/libtorch/Dockerfile
+++ b/.ci/docker/libtorch/Dockerfile
@ -49,8 +49,6 @@ RUN bash ./install_mkl.sh && rm install_mkl.sh
 FROM cpu as cuda
 ADD ./common/install_cuda.sh install_cuda.sh
 ADD ./common/install_magma.sh install_magma.sh
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
 ENV CUDA_HOME /usr/local/cuda

 FROM cuda as cuda11.8
@ -74,7 +72,6 @@ RUN bash ./install_magma.sh 12.8
 RUN ln -sf /usr/local/cuda-12.8 /usr/local/cuda

 FROM cpu as rocm
-ARG ROCM_VERSION
 ARG PYTORCH_ROCM_ARCH
 ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
 ENV MKLROOT /opt/intel
@ -89,11 +86,11 @@ ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
 # gfortran and python needed for building magma from source for ROCm
 RUN apt-get update -y && \
    apt-get install gfortran -y && \
-    apt-get install python3 python-is-python3 -y && \
+    apt-get install python -y && \
    apt-get clean

 RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
-RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh
+RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh

 FROM ${BASE_TARGET} as final
 COPY --from=openssl            /opt/openssl           /opt/openssl
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -1,63 +1,83 @@
 #!/usr/bin/env bash
 # Script used only in CD pipeline

-set -eoux pipefail
+set -eou pipefail

 image="$1"
 shift

 if [ -z "${image}" ]; then
-  echo "Usage: $0 IMAGENAME:ARCHTAG"
+  echo "Usage: $0 IMAGE"
  exit 1
 fi

+DOCKER_IMAGE="pytorch/${image}"
+
 TOPDIR=$(git rev-parse --show-toplevel)

+GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
+GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
+
+WITH_PUSH=${WITH_PUSH:-}
+
 DOCKER=${DOCKER:-docker}

-# Go from imagename:tag to tag
-DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
-
-GPU_ARCH_VERSION=""
-if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
-    # extract cuda version from image name.  e.g. manylinux2_28-builder:cuda12.8 returns 12.8
-    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
-elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
-    # extract rocm version from image name.  e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
-    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
-fi
-
-case ${DOCKER_TAG_PREFIX} in
+case ${GPU_ARCH_TYPE} in
    cpu)
        BASE_TARGET=cpu
+        DOCKER_TAG=cpu
        GPU_IMAGE=ubuntu:20.04
        DOCKER_GPU_BUILD_ARG=""
        ;;
-    cuda*)
+    cuda)
        BASE_TARGET=cuda${GPU_ARCH_VERSION}
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
        GPU_IMAGE=ubuntu:20.04
        DOCKER_GPU_BUILD_ARG=""
        ;;
-    rocm*)
+    rocm)
        BASE_TARGET=rocm
-        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
+        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
+        GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
-        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
+        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}"
        ;;
    *)
-        echo "ERROR: Unrecognized DOCKER_TAG_PREFIX: ${DOCKER_TAG_PREFIX}"
+        echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
        exit 1
        ;;
 esac

-tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')

-DOCKER_BUILDKIT=1 ${DOCKER} build \
-    --target final \
-    ${DOCKER_GPU_BUILD_ARG} \
-    --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
-    --build-arg "BASE_TARGET=${BASE_TARGET}" \
-    -t "${tmp_tag}" \
-    $@ \
-    -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
-    "${TOPDIR}/.ci/docker/"
+(
+    set -x
+    DOCKER_BUILDKIT=1 ${DOCKER} build \
+         --target final \
+        ${DOCKER_GPU_BUILD_ARG} \
+        --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
+        --build-arg "BASE_TARGET=${BASE_TARGET}" \
+        -t "${DOCKER_IMAGE}" \
+        $@ \
+        -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
+        "${TOPDIR}/.ci/docker/"
+
+)
+
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
+
+if [[ "${WITH_PUSH}" == true ]]; then
+  (
+    set -x
+    ${DOCKER} push "${DOCKER_IMAGE}"
+    if [[ -n ${GITHUB_REF} ]]; then
+        ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
+        ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
+        ${DOCKER} push "${DOCKER_IMAGE_BRANCH_TAG}"
+        ${DOCKER} push "${DOCKER_IMAGE_SHA_TAG}"
+    fi
+  )
+fi
--- a/.ci/docker/linter-cuda/Dockerfile
+++ b/.ci/docker/linter-cuda/Dockerfile
@ -18,30 +18,28 @@ COPY ./common/install_user.sh install_user.sh
 RUN bash ./install_user.sh && rm install_user.sh

 # Install conda and other packages (e.g., numpy, pytest)
-ARG PYTHON_VERSION
-ARG PIP_CMAKE
-# Put venv into the env vars so users don't need to activate it
-ENV PATH /var/lib/jenkins/ci_env/bin:$PATH
-ENV VIRTUAL_ENV /var/lib/jenkins/ci_env
-COPY requirements-ci.txt /opt/requirements-ci.txt
-COPY ./common/install_python.sh install_python.sh
-RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt
+ARG ANACONDA_PYTHON_VERSION
+ARG CONDA_CMAKE
+ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
+ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+COPY requirements-ci.txt /opt/conda/requirements-ci.txt
+COPY ./common/install_conda.sh install_conda.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ./common/install_magma_conda.sh install_magma_conda.sh
+RUN bash ./install_conda.sh && rm install_conda.sh install_magma_conda.sh common_utils.sh /opt/conda/requirements-ci.txt

 # Install cuda and cudnn
 ARG CUDA_VERSION
 COPY ./common/install_cuda.sh install_cuda.sh
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
+RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
 ENV DESIRED_CUDA ${CUDA_VERSION}
 ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH

 # Note that Docker build forbids copying file outside the build context
 COPY ./common/install_linter.sh install_linter.sh
+COPY ./common/common_utils.sh common_utils.sh
 RUN bash ./install_linter.sh
-RUN rm install_linter.sh
-
-RUN chown -R jenkins:jenkins /var/lib/jenkins/ci_env
+RUN rm install_linter.sh common_utils.sh

 USER jenkins
 CMD ["bash"]
--- a/.ci/docker/linter/Dockerfile
+++ b/.ci/docker/linter/Dockerfile
@ -15,18 +15,20 @@ COPY ./common/install_user.sh install_user.sh
 RUN bash ./install_user.sh && rm install_user.sh

 # Install conda and other packages (e.g., numpy, pytest)
-ARG PYTHON_VERSION
-ARG PIP_CMAKE
-ENV PATH /var/lib/jenkins/ci_env/bin:$PATH
-ENV VIRTUAL_ENV /var/lib/jenkins/ci_env
-COPY requirements-ci.txt /opt/requirements-ci.txt
-COPY ./common/install_python.sh install_python.sh
-RUN bash ./install_python.sh && rm install_python.sh /opt/requirements-ci.txt
+ARG ANACONDA_PYTHON_VERSION
+ARG CONDA_CMAKE
+ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
+ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH
+COPY requirements-ci.txt /opt/conda/requirements-ci.txt
+COPY ./common/install_conda.sh install_conda.sh
+COPY ./common/common_utils.sh common_utils.sh
+RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt

 # Note that Docker build forbids copying file outside the build context
 COPY ./common/install_linter.sh install_linter.sh
+COPY ./common/common_utils.sh common_utils.sh
 RUN bash ./install_linter.sh
-RUN rm install_linter.sh
+RUN rm install_linter.sh common_utils.sh

 USER jenkins
 CMD ["bash"]
--- a/.ci/docker/manywheel/Dockerfile
+++ b/.ci/docker/manywheel/Dockerfile
@ -64,9 +64,7 @@ FROM base as cuda
 ARG BASE_CUDA_VERSION=10.2
 # Install CUDA
 ADD ./common/install_cuda.sh install_cuda.sh
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
+RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh

 FROM base as intel
 # MKL
@ -197,6 +195,6 @@ RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
 # cmake3 is needed for the MIOpen build
 RUN ln -sf /usr/local/bin/cmake /usr/bin/cmake3
 ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
-RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh
+RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
 ADD ./common/install_miopen.sh install_miopen.sh
 RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
--- a/.ci/docker/manywheel/Dockerfile_2_28
+++ b/.ci/docker/manywheel/Dockerfile_2_28
@ -36,9 +36,7 @@ FROM base as cuda
 ARG BASE_CUDA_VERSION=11.8
 # Install CUDA
 ADD ./common/install_cuda.sh install_cuda.sh
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh install_nccl.sh ci_commit_pins/nccl-cu*
+RUN bash ./install_cuda.sh ${BASE_CUDA_VERSION} && rm install_cuda.sh

 FROM base as intel
 # MKL
@ -160,7 +158,7 @@ ADD ./common/install_rocm_drm.sh install_rocm_drm.sh
 RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
 ENV MKLROOT /opt/intel
 ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
-RUN bash ./install_rocm_magma.sh ${ROCM_VERSION} && rm install_rocm_magma.sh
+RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh
 ADD ./common/install_miopen.sh install_miopen.sh
 RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh

--- a/.ci/docker/manywheel/Dockerfile_cuda_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64
@ -67,9 +67,7 @@ FROM base as cuda
 ARG BASE_CUDA_VERSION
 # Install CUDA
 ADD ./common/install_cuda_aarch64.sh install_cuda_aarch64.sh
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh install_nccl.sh ci_commit_pins/nccl-cu*
+RUN bash ./install_cuda_aarch64.sh ${BASE_CUDA_VERSION} && rm install_cuda_aarch64.sh

 FROM base as magma
 ARG BASE_CUDA_VERSION
--- a/.ci/docker/manywheel/Dockerfile_s390x
+++ b/.ci/docker/manywheel/Dockerfile_s390x
@ -42,7 +42,6 @@ RUN yum install -y \
  llvm-devel \
  libzstd-devel \
  python3.12-devel \
-  python3.12-test \
  python3.12-setuptools \
  python3.12-pip \
  python3-virtualenv \
@ -102,33 +101,24 @@ CMD ["/bin/bash"]

 # install test dependencies:
 # - grpcio requires system openssl, bundled crypto fails to build
+# - ml_dtypes 0.4.0 requires some fixes provided in later commits to build
 RUN dnf install -y \
  protobuf-devel \
  protobuf-c-devel \
  protobuf-lite-devel \
-  hdf5-devel \
-  python3-h5py \
-  git
+  wget \
+  patch

-RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio
-
-# cmake-3.28.0 from pip for onnxruntime
-RUN python3 -mpip install cmake==3.28.0
-
-# build onnxruntime 1.21.0 from sources.
-# it is not possible to build it from sources using pip,
-# so just build it from upstream repository.
-# h5py is dependency of onnxruntime_training.
-# h5py==3.11.0 builds with hdf5-devel 1.10.5 from repository.
-# install newest flatbuffers version first:
-# for some reason old version is getting pulled in otherwise.
-# packaging package is required for onnxruntime wheel build.
-RUN pip3 install flatbuffers && \
-  pip3 install h5py==3.11.0 && \
-  pip3 install packaging && \
-  git clone https://github.com/microsoft/onnxruntime && \
-  cd onnxruntime && git checkout v1.21.0 && \
+RUN env GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=True pip3 install grpcio==1.65.4
+RUN cd ~ && \
+  git clone https://github.com/jax-ml/ml_dtypes && \
+  cd ml_dtypes && \
+  git checkout v0.4.0 && \
  git submodule update --init --recursive && \
-  ./build.sh --config Release --parallel 0 --enable_pybind --build_wheel --enable_training --enable_training_apis --enable_training_ops --skip_tests --allow_running_as_root && \
-  pip3 install ./build/Linux/Release/dist/onnxruntime_training-*.whl && \
-  cd .. && /bin/rm -rf ./onnxruntime
+  wget https://github.com/jax-ml/ml_dtypes/commit/b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
+  wget https://github.com/jax-ml/ml_dtypes/commit/d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
+  patch -p1 < b969f76914d6b30676721bc92bf0f6021a0d1321.patch && \
+  patch -p1 < d4e6d035ecda073eab8bcf60f4eef572ee7087e6.patch && \
+  python3 setup.py bdist_wheel && \
+  pip3 install dist/*.whl && \
+  cd .. && rm -rf ml_dtypes
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 # Script used only in CD pipeline

-set -exou pipefail
+set -eou pipefail

 TOPDIR=$(git rev-parse --show-toplevel)

@ -9,110 +9,152 @@ image="$1"
 shift

 if [ -z "${image}" ]; then
-  echo "Usage: $0 IMAGE:ARCHTAG"
+  echo "Usage: $0 IMAGE"
  exit 1
 fi

-# Go from imagename:tag to tag
-DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
+DOCKER_IMAGE="pytorch/${image}"

-GPU_ARCH_VERSION=""
-if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
-    # extract cuda version from image name.  e.g. manylinux2_28-builder:cuda12.8 returns 12.8
-    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
-elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
-    # extract rocm version from image name.  e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
-    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
-fi
+DOCKER_REGISTRY="${DOCKER_REGISTRY:-docker.io}"

+GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
+GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
 MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}
 DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}
+WITH_PUSH=${WITH_PUSH:-}

-case ${image} in
-    manylinux2_28-builder:cpu)
+case ${GPU_ARCH_TYPE} in
+    cpu)
        TARGET=cpu_final
+        DOCKER_TAG=cpu
+        GPU_IMAGE=centos:7
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
+        ;;
+    cpu-manylinux_2_28)
+        TARGET=cpu_final
+        DOCKER_TAG=cpu
        GPU_IMAGE=amd64/almalinux:8
        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="2_28"
        ;;
-    manylinuxaarch64-builder:cpu-aarch64)
+    cpu-aarch64)
        TARGET=final
+        DOCKER_TAG=cpu-aarch64
        GPU_IMAGE=arm64v8/centos:7
        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10"
        MANY_LINUX_VERSION="aarch64"
        ;;
-    manylinux2_28_aarch64-builder:cpu-aarch64)
+    cpu-aarch64-2_28)
        TARGET=final
+        DOCKER_TAG=cpu-aarch64
        GPU_IMAGE=arm64v8/almalinux:8
        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11 --build-arg NINJA_VERSION=1.12.1"
        MANY_LINUX_VERSION="2_28_aarch64"
        ;;
-    manylinuxcxx11-abi-builder:cpu-cxx11-abi)
+    cpu-cxx11-abi)
        TARGET=final
+        DOCKER_TAG=cpu-cxx11-abi
        GPU_IMAGE=""
        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
        MANY_LINUX_VERSION="cxx11-abi"
        ;;
-    manylinuxs390x-builder:cpu-s390x)
+    cpu-s390x)
        TARGET=final
+        DOCKER_TAG=cpu-s390x
        GPU_IMAGE=s390x/almalinux:8
        DOCKER_GPU_BUILD_ARG=""
        MANY_LINUX_VERSION="s390x"
        ;;
-    manylinux2_28-builder:cuda*)
+    cuda)
        TARGET=cuda_final
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
+        # Keep this up to date with the minimum version of CUDA we currently support
+        GPU_IMAGE=centos:7
+        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=9"
+        ;;
+    cuda-manylinux_2_28)
+        TARGET=cuda_final
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
        GPU_IMAGE=amd64/almalinux:8
        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="2_28"
        ;;
-    manylinuxaarch64-builder:cuda*)
+    cuda-aarch64)
        TARGET=cuda_final
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
        GPU_IMAGE=arm64v8/centos:7
        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="aarch64"
        DOCKERFILE_SUFFIX="_cuda_aarch64"
        ;;
-    manylinux2_28-builder:rocm*)
+    rocm|rocm-manylinux_2_28)
        TARGET=rocm_final
+        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
        GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete
        DEVTOOLSET_VERSION="9"
-        MANY_LINUX_VERSION="2_28"
-        DEVTOOLSET_VERSION="11"
-        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
+        if [ ${GPU_ARCH_TYPE} == "rocm-manylinux_2_28" ]; then
+            MANY_LINUX_VERSION="2_28"
+            DEVTOOLSET_VERSION="11"
+            GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
+        fi
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
        ;;
-    manylinux2_28-builder:xpu)
+    xpu)
        TARGET=xpu_final
+        DOCKER_TAG=xpu
        GPU_IMAGE=amd64/almalinux:8
        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="2_28"
        ;;
    *)
-        echo "ERROR: Unrecognized image name: ${image}"
+        echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
        exit 1
        ;;
 esac

+IMAGES=''
+
 if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then
    DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION}
 fi
-# Only activate this if in CI
-if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
-    # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
-    # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
-    sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
-    sudo systemctl daemon-reload
-    sudo systemctl restart docker
+(
+    set -x
+
+    # Only activate this if in CI
+    if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
+        # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
+        # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
+        sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
+        sudo systemctl daemon-reload
+        sudo systemctl restart docker
+    fi
+
+    DOCKER_BUILDKIT=1 docker build  \
+        ${DOCKER_GPU_BUILD_ARG} \
+        --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
+        --target "${TARGET}" \
+        -t "${DOCKER_IMAGE}" \
+        $@ \
+        -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
+        "${TOPDIR}/.ci/docker/"
+)
+
+GITHUB_REF=${GITHUB_REF:-"dev"}  # default to "dev" when GITHUB_REF is unset or empty
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
+
+if [[ "${WITH_PUSH}" == true ]]; then
+    (
+        set -x
+        docker push "${DOCKER_IMAGE}"
+        if [[ -n ${GITHUB_REF} ]]; then
+            docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
+            docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
+            docker push "${DOCKER_IMAGE_BRANCH_TAG}"
+            docker push "${DOCKER_IMAGE_SHA_TAG}"
+        fi
+    )
 fi
-
-tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
-
-DOCKER_BUILDKIT=1 docker build  \
-    ${DOCKER_GPU_BUILD_ARG} \
-    --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
-    --target "${TARGET}" \
-    -t "${tmp_tag}" \
-    $@ \
-    -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
-    "${TOPDIR}/.ci/docker/"
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -41,14 +41,11 @@ fbscribelogger==0.1.7
 #Pinned versions: 0.1.6
 #test that import:

-flatbuffers==2.0 ; platform_machine != "s390x"
+flatbuffers==2.0
 #Description: cross platform serialization library
 #Pinned versions: 2.0
 #test that import:

-flatbuffers ; platform_machine == "s390x"
-#Description: cross platform serialization library; Newer version is required on s390x for new python version
-
 hypothesis==5.35.1
 # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136
 #Description: advanced library for generating parametrized tests
@ -105,10 +102,10 @@ networkx==2.8.8
 #Pinned versions: 2.8.8
 #test that import: functorch

-ninja==1.11.1.3
-#Description: build system. Used in some tests. Used in build to generate build
-#time tracing information
-#Pinned versions: 1.11.1.3
+#ninja
+#Description: build system.  Note that installing it from
+#here breaks things, so it is commented out
+#Pinned versions: 1.10.0.post1
 #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py

 numba==0.49.0 ; python_version < "3.9"
@ -356,7 +353,7 @@ parameterized==0.8.1
 #Pinned versions: 1.24.0
 #test that import: test_sac_estimator.py

-pwlf==2.2.1
+pwlf==2.2.1 ; python_version >= "3.8"
 #Description: required for testing torch/distributed/_tools/sac_estimator.py
 #Pinned versions: 2.2.1
 #test that import: test_sac_estimator.py
@ -368,9 +365,10 @@ PyYAML
 pyzstd
 setuptools

+ninja==1.11.1 ; platform_machine == "aarch64"
 scons==4.5.2 ; platform_machine == "aarch64"

-pulp==2.9.0
+pulp==2.9.0 ; python_version >= "3.8"
 #Description: required for testing ilp formulaiton under torch/distributed/_tools
 #Pinned versions: 2.9.0
 #test that import: test_sac_ilp.py
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -1,20 +1,15 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@a98ffecb792d50df495be401becbf5c414421423#egg=pytorch_sphinx_theme2
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme

 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought is probably
 # something related to Docker setup. We can investigate this later
-
 sphinxcontrib.katex==0.8.6
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 0.8.6

-sphinxext-opengraph==0.9.1
-#Description: This is used to generate PyTorch docs
-#Pinned versions: 0.9.1
-
 matplotlib==3.5.3
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 3.5.3
@ -51,6 +46,5 @@ myst-nb==0.17.2
 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
 python-etcd==0.4.5
 sphinx-copybutton==0.5.0
-sphinx-design==0.4.0
-sphinxcontrib-mermaid==1.0.0
+sphinx-panels==0.4.1
 myst-parser==0.18.1
--- a/.ci/docker/triton_version.txt
+++ b/.ci/docker/triton_version.txt
@ -1 +1 @@
-3.3.0
+3.3.1
--- a/.ci/docker/ubuntu-cuda/Dockerfile
+++ b/.ci/docker/ubuntu-cuda/Dockerfile
@ -2,7 +2,7 @@ ARG UBUNTU_VERSION
 ARG CUDA_VERSION
 ARG IMAGE_NAME

-FROM ${IMAGE_NAME} as base
+FROM ${IMAGE_NAME}

 ARG UBUNTU_VERSION
 ARG CUDA_VERSION
@ -50,6 +50,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
 RUN rm install_protobuf.sh
 ENV INSTALLED_PROTOBUF ${PROTOBUF}

+# (optional) Install database packages like LMDB and LevelDB
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -90,20 +97,14 @@ RUN if [ -n "${CMAKE_VERSION}" ]; then bash ./install_cmake.sh; fi
 RUN rm install_cmake.sh

 ARG TRITON
-
-FROM base as triton-builder
 # Install triton, this needs to be done before sccache because the latter will
 # try to reach out to S3, which docker build runners don't have access
 COPY ./common/install_triton.sh install_triton.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/triton.txt triton.txt
 COPY triton_version.txt triton_version.txt
-RUN bash ./install_triton.sh
-
-FROM base as final
-COPY --from=triton-builder /opt/triton /opt/triton
-RUN if [ -n "${TRITON}" ]; then pip install /opt/triton/*.whl; chown -R jenkins:jenkins /opt/conda; fi
-RUN rm -rf /opt/triton
+RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
+RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt

 ARG HALIDE
 # Build and install halide
@ -158,16 +159,6 @@ COPY ./common/install_cusparselt.sh install_cusparselt.sh
 RUN bash install_cusparselt.sh
 RUN rm install_cusparselt.sh

-# Install NCCL
-ARG CUDA_VERSION
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-RUN bash install_nccl.sh
-RUN rm install_nccl.sh /ci_commit_pins/nccl-cu*
-ENV USE_SYSTEM_NCCL=1
-ENV NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
-ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
-
 # Install CUDSS
 ARG CUDA_VERSION
 COPY ./common/install_cudss.sh install_cudss.sh
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@ -50,6 +50,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
 RUN rm install_protobuf.sh
 ENV INSTALLED_PROTOBUF ${PROTOBUF}

+# (optional) Install database packages like LMDB and LevelDB
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -63,7 +70,7 @@ COPY ./common/install_rocm.sh install_rocm.sh
 RUN bash ./install_rocm.sh
 RUN rm install_rocm.sh
 COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
-RUN bash ./install_rocm_magma.sh ${ROCM_VERSION}
+RUN bash ./install_rocm_magma.sh
 RUN rm install_rocm_magma.sh
 ADD ./common/install_miopen.sh install_miopen.sh
 RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
--- a/.ci/docker/ubuntu-xpu/Dockerfile
+++ b/.ci/docker/ubuntu-xpu/Dockerfile
@ -77,6 +77,13 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt

+# (optional) Install database packages like LMDB and LevelDB
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -1,6 +1,6 @@
 ARG UBUNTU_VERSION

-FROM ubuntu:${UBUNTU_VERSION} as base
+FROM ubuntu:${UBUNTU_VERSION}

 ARG UBUNTU_VERSION

@ -52,16 +52,9 @@ RUN  bash ./install_lcov.sh && rm install_lcov.sh
 # Install cuda and cudnn
 ARG CUDA_VERSION
 COPY ./common/install_cuda.sh install_cuda.sh
-COPY ./common/install_nccl.sh install_nccl.sh
-COPY ./ci_commit_pins/nccl-cu* /ci_commit_pins/
-RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh install_nccl.sh /ci_commit_pins/nccl-cu*
+RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh
 ENV DESIRED_CUDA ${CUDA_VERSION}
 ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH
-# No effect if cuda not installed
-ENV USE_SYSTEM_NCCL=1
-ENV NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
-ENV NCCL_LIB_DIR="/usr/local/cuda/lib64/"
-

 # (optional) Install UCC
 ARG UCX_COMMIT
@ -81,6 +74,13 @@ RUN if [ -n "${PROTOBUF}" ]; then bash ./install_protobuf.sh; fi
 RUN rm install_protobuf.sh
 ENV INSTALLED_PROTOBUF ${PROTOBUF}

+# (optional) Install database packages like LMDB and LevelDB
+ARG DB
+COPY ./common/install_db.sh install_db.sh
+RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
+RUN rm install_db.sh
+ENV INSTALLED_DB ${DB}
+
 # (optional) Install vision packages like OpenCV
 ARG VISION
 COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
@ -88,6 +88,18 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
 RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
 ENV INSTALLED_VISION ${VISION}

+# (optional) Install Vulkan SDK
+ARG VULKAN_SDK_VERSION
+COPY ./common/install_vulkan_sdk.sh install_vulkan_sdk.sh
+RUN if [ -n "${VULKAN_SDK_VERSION}" ]; then bash ./install_vulkan_sdk.sh; fi
+RUN rm install_vulkan_sdk.sh
+
+# (optional) Install swiftshader
+ARG SWIFTSHADER
+COPY ./common/install_swiftshader.sh install_swiftshader.sh
+RUN if [ -n "${SWIFTSHADER}" ]; then bash ./install_swiftshader.sh; fi
+RUN rm install_swiftshader.sh
+
 # (optional) Install non-default CMake version
 ARG CMAKE_VERSION
 COPY ./common/install_cmake.sh install_cmake.sh
@ -115,21 +127,20 @@ RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_d
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt

 ARG TRITON
-ARG TRITON_CPU
-
-# Create a separate stage for building Triton and Triton-CPU.  install_triton
-# will check for the presence of env vars
-FROM base as triton-builder
+# Install triton, this needs to be done before sccache because the latter will
+# try to reach out to S3, which docker build runners don't have access to
 COPY ./common/install_triton.sh install_triton.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/triton.txt triton.txt
-COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt
-RUN bash ./install_triton.sh
+RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
+RUN rm install_triton.sh common_utils.sh triton.txt

-FROM base as final
-COPY --from=triton-builder /opt/triton /opt/triton
-RUN if [ -n "${TRITON}" ] || [ -n "${TRITON_CPU}" ]; then pip install /opt/triton/*.whl; chown -R jenkins:jenkins /opt/conda; fi
-RUN rm -rf /opt/triton
+ARG TRITON_CPU
+COPY ./common/install_triton.sh install_triton.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt
+RUN if [ -n "${TRITON_CPU}" ]; then bash ./install_triton.sh; fi
+RUN rm install_triton.sh common_utils.sh triton-cpu.txt

 ARG EXECUTORCH
 # Build and install executorch
--- a/.ci/magma-rocm/.gitignore
+++ b/.ci/magma-rocm/.gitignore
@ -1,2 +0,0 @@
-output/
-magma-rocm*/
--- a/.ci/magma-rocm/Makefile
+++ b/.ci/magma-rocm/Makefile
@ -1,41 +0,0 @@
-SHELL=/usr/bin/env bash
-
-DOCKER_CMD ?= docker
-DESIRED_ROCM ?= 6.4
-DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
-PACKAGE_NAME = magma-rocm
-# inherit this from underlying docker image, do not pass this env var to docker
-#PYTORCH_ROCM_ARCH ?= gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201
-
-DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
-	-v $(shell git rev-parse --show-toplevel)/.ci:/builder \
-	-w /builder \
-	-e PACKAGE_NAME=${PACKAGE_NAME}${DESIRED_ROCM_SHORT} \
-	-e DESIRED_ROCM=${DESIRED_ROCM} \
-	"pytorch/manylinux2_28-builder:rocm${DESIRED_ROCM}-main" \
-	magma-rocm/build_magma.sh
-
-.PHONY: all
-all: magma-rocm64
-all: magma-rocm63
-all: magma-rocm624
-
-.PHONY:
-clean:
-	$(RM) -r magma-*
-	$(RM) -r output
-
-.PHONY: magma-rocm64
-magma-rocm64: DESIRED_ROCM := 6.4
-magma-rocm64:
-	$(DOCKER_RUN)
-
-.PHONY: magma-rocm63
-magma-rocm63: DESIRED_ROCM := 6.3
-magma-rocm63:
-	$(DOCKER_RUN)
-
-.PHONY: magma-rocm624
-magma-rocm624: DESIRED_ROCM := 6.2.4
-magma-rocm624:
-	$(DOCKER_RUN)
--- a/.ci/magma-rocm/README.md
+++ b/.ci/magma-rocm/README.md
@ -1,48 +0,0 @@
-# Magma ROCm
-
-This folder contains the scripts and configurations to build libmagma.so, linked for various versions of ROCm.
-
-## Building
-
-Look in the `Makefile` for available targets to build. To build any target, for example `magma-rocm63`, run
-
-```
-# Using `docker`
-make magma-rocm63
-
-# Using `podman`
-DOCKER_CMD=podman make magma-rocm63
-```
-
-This spawns a `pytorch/manylinux-rocm<version>` docker image, which has the required `devtoolset` and ROCm versions installed.
-Within the docker image, it runs `build_magma.sh` with the correct environment variables set, which package the necessary files
-into a tarball, with the following structure:
-
-```
-.
-├── include       # header files
-├── lib           # libmagma.so
-├── info
-│   ├── licenses  # license file
-│   └── recipe    # build script
-```
-
-More specifically, `build_magma.sh` copies over the relevant files from the `package_files` directory depending on the ROCm version.
-Outputted binaries should be in the `output` folder.
-
-
-## Pushing
-
-Packages can be uploaded to an S3 bucket using:
-
-```
-aws s3 cp output/*/magma-rocm*.bz2 <bucket-with-path>
-```
-
-If you do not have upload permissions, please ping @seemethere or @soumith to gain access
-
-## New versions
-
-New ROCm versions can be added by creating a new make target with the next desired version. For ROCm version N.n, the target should be named `magma-rocmNn`.
-
-Make sure to edit the appropriate environment variables (e.g., DESIRED_ROCM) in the `Makefile` accordingly. Remember also to check `build_magma.sh` to ensure the logic for copying over the files remains correct.
--- a/.ci/magma-rocm/build_magma.sh
+++ b/.ci/magma-rocm/build_magma.sh
@ -1,42 +0,0 @@
-#!/usr/bin/env bash
-
-set -eou pipefail
-
-# Environment variables
-# The script expects DESIRED_ROCM and PACKAGE_NAME to be set
-ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-
-# Version 2.7.2 + ROCm related updates
-MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
-
-# Folders for the build
-PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
-PACKAGE_DIR=${ROOT_DIR}/magma-rocm/${PACKAGE_NAME} # build workspace
-PACKAGE_OUTPUT=${ROOT_DIR}/magma-rocm/output # where tarballs are stored
-PACKAGE_BUILD=${PACKAGE_DIR} # where the content of the tarball is prepared
-PACKAGE_RECIPE=${PACKAGE_BUILD}/info/recipe
-PACKAGE_LICENSE=${PACKAGE_BUILD}/info/licenses
-mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RECIPE} ${PACKAGE_LICENSE}
-
-# Fetch magma sources and verify checksum
-pushd ${PACKAGE_DIR}
-git clone https://bitbucket.org/icl/magma.git
-pushd magma
-git checkout ${MAGMA_VERSION}
-popd
-popd
-
-# build
-pushd ${PACKAGE_DIR}/magma
-# The build.sh script expects to be executed from the sources root folder
-INSTALL_DIR=${PACKAGE_BUILD} ${PACKAGE_FILES}/build.sh
-popd
-
-# Package recipe, license and tarball
-# Folder and package name are backward compatible for the build workflow
-cp ${PACKAGE_FILES}/build.sh ${PACKAGE_RECIPE}/build.sh
-cp ${PACKAGE_DIR}/magma/COPYRIGHT ${PACKAGE_LICENSE}/COPYRIGHT
-pushd ${PACKAGE_BUILD}
-tar cjf ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2 include lib info
-echo Built in ${PACKAGE_OUTPUT}/linux-64/${PACKAGE_NAME}-${MAGMA_VERSION}-1.tar.bz2
-popd
--- a/.ci/magma-rocm/package_files/build.sh
+++ b/.ci/magma-rocm/package_files/build.sh
@ -1,38 +0,0 @@
-# Magma build scripts need `python`
-ln -sf /usr/bin/python3 /usr/bin/python
-
-ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
-case "$ID" in
-  almalinux)
-    yum install -y gcc-gfortran
-    ;;
-  *)
-    echo "No preinstalls to build magma..."
-    ;;
-esac
-
-MKLROOT=${MKLROOT:-/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION}
-
-cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
-echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc
-if [[ -f "${MKLROOT}/lib/libmkl_core.a" ]]; then
-    echo 'LIB = -Wl,--start-group -lmkl_gf_lp64 -lmkl_gnu_thread -lmkl_core -Wl,--end-group -lpthread -lstdc++ -lm -lgomp -lhipblas -lhipsparse' >> make.inc
-fi
-echo 'LIB += -Wl,--enable-new-dtags -Wl,--rpath,/opt/rocm/lib -Wl,--rpath,$(MKLROOT)/lib -Wl,--rpath,/opt/rocm/magma/lib -ldl' >> make.inc
-echo 'DEVCCFLAGS += --gpu-max-threads-per-block=256' >> make.inc
-export PATH="${PATH}:/opt/rocm/bin"
-if [[ -n "$PYTORCH_ROCM_ARCH" ]]; then
-  amdgpu_targets=`echo $PYTORCH_ROCM_ARCH | sed 's/;/ /g'`
-else
-  amdgpu_targets=`rocm_agent_enumerator | grep -v gfx000 | sort -u | xargs`
-fi
-for arch in $amdgpu_targets; do
-  echo "DEVCCFLAGS += --offload-arch=$arch" >> make.inc
-done
-# hipcc with openmp flag may cause isnan() on __device__ not to be found; depending on context, compiler may attempt to match with host definition
-sed -i 's/^FOPENMP/#FOPENMP/g' make.inc
-make -f make.gen.hipMAGMA -j $(nproc)
-LANG=C.UTF-8 make lib/libmagma.so -j $(nproc) MKLROOT="${MKLROOT}"
-make testing/testing_dgemm -j $(nproc) MKLROOT="${MKLROOT}"
-cp -R lib ${INSTALL_DIR}
-cp -R include ${INSTALL_DIR}
--- a/.ci/manywheel/build_common.sh
+++ b/.ci/manywheel/build_common.sh
@ -111,6 +111,12 @@ case ${DESIRED_PYTHON} in
    ;;
 esac

+if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    export _GLIBCXX_USE_CXX11_ABI=1
+else
+    export _GLIBCXX_USE_CXX11_ABI=0
+fi
+
 if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
    echo "Calling build_amd.py at $(date)"
    python tools/amd_build/build_amd.py
@ -203,6 +209,12 @@ if [[ -n "$BUILD_PYTHONLESS" ]]; then

    mkdir -p /tmp/$LIBTORCH_HOUSE_DIR

+    if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+        LIBTORCH_ABI="cxx11-abi-"
+    else
+        LIBTORCH_ABI=
+    fi
+
    zip -rq /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip libtorch
    cp /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-$PYTORCH_BUILD_VERSION.zip \
       /tmp/$LIBTORCH_HOUSE_DIR/libtorch-$LIBTORCH_ABI$LIBTORCH_VARIANT-latest.zip
@ -321,8 +333,8 @@ for pkg in /$WHEELHOUSE_DIR/torch_no_python*.whl /$WHEELHOUSE_DIR/torch*linux*.w
            # ROCm workaround for roctracer dlopens
            if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
                patchedpath=$(fname_without_so_number $destpath)
-            # Keep the so number for XPU dependencies
-            elif [[ "$DESIRED_CUDA" == *"xpu"* ]]; then
+            # Keep the so number for XPU dependencies and libgomp.so.1 to avoid loading it twice
+            elif [[ "$DESIRED_CUDA" == *"xpu"* || "$filename" == "libgomp.so.1" ]]; then
                patchedpath=$destpath
            else
                patchedpath=$(fname_with_sha256 $destpath)
--- a/.ci/manywheel/build_libtorch.sh
+++ b/.ci/manywheel/build_libtorch.sh
@ -95,6 +95,12 @@ python setup.py clean
 retry pip install -qr requirements.txt
 retry pip install -q numpy==2.0.1

+if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    export _GLIBCXX_USE_CXX11_ABI=1
+else
+    export _GLIBCXX_USE_CXX11_ABI=0
+fi
+
 if [[ "$DESIRED_CUDA" == *"rocm"* ]]; then
    echo "Calling build_amd.py at $(date)"
    python tools/amd_build/build_amd.py
@ -163,6 +169,12 @@ fi

 )

+if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    LIBTORCH_ABI="cxx11-abi-"
+else
+    LIBTORCH_ABI=
+fi
+
 (
    set -x

--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -35,7 +35,7 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi

 if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
-  if [[ "$BUILD_ENVIRONMENT" != *clang* ]]; then
+  if [[ "$BUILD_ENVIRONMENT" != *cuda11.3* && "$BUILD_ENVIRONMENT" != *clang* ]]; then
    # TODO: there is a linking issue when building with UCC using clang,
    # disable it for now and to be fix later.
    # TODO: disable UCC temporarily to enable CUDA 12.1 in CI
@ -277,8 +277,10 @@ else
    # or building non-XLA tests.
    if [[ "$BUILD_ENVIRONMENT" != *rocm*  &&
          "$BUILD_ENVIRONMENT" != *xla* ]]; then
-      # Install numpy-2.0.2 for builds which are backward compatible with 1.X
-      python -mpip install numpy==2.0.2
+      if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
+        # Install numpy-2.0.2 for builds which are backward compatible with 1.X
+        python -mpip install numpy==2.0.2
+      fi

      WERROR=1 python setup.py clean

@ -301,18 +303,6 @@ else
    fi
    pip_install_whl "$(echo dist/*.whl)"

-    if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
-      echo "Checking that xpu is compiled"
-      pushd dist/
-      if python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)'; then
-        echo "XPU support is compiled in."
-      else
-        echo "XPU support is NOT compiled in."
-        exit 1
-      fi
-      popd
-    fi
-
    # TODO: I'm not sure why, but somehow we lose verbose commands
    set -x

--- a/.ci/pytorch/check_binary.sh
+++ b/.ci/pytorch/check_binary.sh
@ -63,12 +63,64 @@ fi
 # Check GCC ABI
 ###############################################################################

-# NOTE: As of https://github.com/pytorch/pytorch/issues/126551 we only produce
-#       wheels with cxx11-abi
+# NOTE [ Building libtorch with old vs. new gcc ABI ]
+#
+# Packages built with one version of ABI could not be linked against by client
+# C++ libraries that were compiled using the other version of ABI. Since both
+# gcc ABIs are still common in the wild, we need to support both ABIs. Currently:
+#
+# - All the nightlies built on CentOS 7 + devtoolset7 use the old gcc ABI.
+# - All the nightlies built on Ubuntu 16.04 + gcc 5.4 use the new gcc ABI.

 echo "Checking that the gcc ABI is what we expect"
 if [[ "$(uname)" != 'Darwin' ]]; then
-  # We also check that there are cxx11 symbols in libtorch
+  function is_expected() {
+    if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* || "$DESIRED_CUDA" == *"rocm"* ]]; then
+      if [[ "$1" -gt 0 || "$1" == "ON " ]]; then
+        echo 1
+      fi
+    else
+      if [[ -z "$1" || "$1" == 0 || "$1" == "OFF" ]]; then
+        echo 1
+      fi
+    fi
+  }
+
+  # First we check that the env var in TorchConfig.cmake is correct
+
+  # We search for D_GLIBCXX_USE_CXX11_ABI=1 in torch/TorchConfig.cmake
+  torch_config="${install_root}/share/cmake/Torch/TorchConfig.cmake"
+  if [[ ! -f "$torch_config" ]]; then
+    echo "No TorchConfig.cmake found!"
+    ls -lah "$install_root/share/cmake/Torch"
+    exit 1
+  fi
+  echo "Checking the TorchConfig.cmake"
+  cat "$torch_config"
+
+  # The sed call below is
+  #   don't print lines by default (only print the line we want)
+  # -n
+  #   execute the following expression
+  # e
+  #   replace lines that match with the first capture group and print
+  # s/.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/\1/p
+  #   any characters, D_GLIBCXX_USE_CXX11_ABI=, exactly one any character, a
+  #   quote, any characters
+  #   Note the exactly one single character after the '='. In the case that the
+  #     variable is not set the '=' will be followed by a '"' immediately and the
+  #     line will fail the match and nothing will be printed; this is what we
+  #     want.  Otherwise it will capture the 0 or 1 after the '='.
+  # /.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/
+  #   replace the matched line with the capture group and print
+  # /\1/p
+  actual_gcc_abi="$(sed -ne 's/.*D_GLIBCXX_USE_CXX11_ABI=\(.\)".*/\1/p' < "$torch_config")"
+  if [[ "$(is_expected "$actual_gcc_abi")" != 1 ]]; then
+    echo "gcc ABI $actual_gcc_abi not as expected."
+    exit 1
+  fi
+
+  # We also check that there are [not] cxx11 symbols in libtorch
  #
  echo "Checking that symbols in libtorch.so have the right gcc abi"
  python3 "$(dirname ${BASH_SOURCE[0]})/smoke_test/check_binary_symbols.py"
@ -146,11 +198,35 @@ setup_link_flags () {

 TEST_CODE_DIR="$(dirname $(realpath ${BASH_SOURCE[0]}))/test_example_code"
 build_and_run_example_cpp () {
+  if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    GLIBCXX_USE_CXX11_ABI=1
+  else
+    GLIBCXX_USE_CXX11_ABI=0
+  fi
  setup_link_flags
-  g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1
+  g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1
  ./$1
 }

+build_example_cpp_with_incorrect_abi () {
+  if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    GLIBCXX_USE_CXX11_ABI=0
+  else
+    GLIBCXX_USE_CXX11_ABI=1
+  fi
+  set +e
+  setup_link_flags
+  g++ ${TEST_CODE_DIR}/$1.cpp -I${install_root}/include -I${install_root}/include/torch/csrc/api/include -D_GLIBCXX_USE_CXX11_ABI=$GLIBCXX_USE_CXX11_ABI -std=gnu++17 -L${install_root}/lib ${REF_LIB} ${ADDITIONAL_LINKER_FLAGS} -ltorch $TORCH_CPU_LINK_FLAGS $TORCH_CUDA_LINK_FLAGS $C10_LINK_FLAGS -o $1
+  ERRCODE=$?
+  set -e
+  if [ "$ERRCODE" -eq "0" ]; then
+    echo "Building example with incorrect ABI didn't throw error. Aborting."
+    exit 1
+  else
+    echo "Building example with incorrect ABI throws expected error. Proceeding."
+  fi
+}
+
 ###############################################################################
 # Check simple Python/C++ calls
 ###############################################################################
@ -160,6 +236,11 @@ if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
    export LD_LIBRARY_PATH=/usr/local/cuda/lib64
  fi
  build_and_run_example_cpp simple-torch-test
+  # `_GLIBCXX_USE_CXX11_ABI` is always ignored by gcc in devtoolset7, so we test
+  # the expected failure case for Ubuntu 16.04 + gcc 5.4 only.
+  if [[ "$DESIRED_DEVTOOLSET" == *"cxx11-abi"* ]]; then
+    build_example_cpp_with_incorrect_abi simple-torch-test
+  fi
 else
  pushd /tmp
  python -c 'import torch'
@ -216,14 +297,6 @@ else
  fi
 fi

-###############################################################################
-# Check XPU configured correctly
-###############################################################################
-if [[ "$DESIRED_CUDA" == 'xpu' && "$PACKAGE_TYPE" != 'libtorch' ]]; then
-  echo "Checking that xpu is compiled"
-  python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)'
-fi
-
 ###############################################################################
 # Check CUDA configured correctly
 ###############################################################################
@ -302,19 +375,10 @@ except RuntimeError as e:
 fi

 ###############################################################################
-# Check for C++ ABI compatibility to GCC-11
+# Check for C++ ABI compatibility between gcc7 and gcc9 compiled binaries
 ###############################################################################
 if [[ "$(uname)" == 'Linux' &&  "$PACKAGE_TYPE" == 'manywheel' ]]; then
  pushd /tmp
-  # Per https://gcc.gnu.org/onlinedocs/gcc/C_002b_002b-Dialect-Options.html gcc-11 is ABI16
-  # Though manylinux_2.28 should have been build with gcc-14, per
-  # https://github.com/pypa/manylinux?tab=readme-ov-file#manylinux_2_28-almalinux-8-based
-  # On s390x gcc 14 is used because it contains fix for interaction
-  # between precompiled headers and vectorization builtins.
-  # This fix is not available in earlier gcc versions.
-  # gcc-14 uses ABI19.
-  if [[ "$(uname -m)" != "s390x" ]]; then
-    python -c "import torch; exit(0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi1016' else 1)"
-  fi
+  python -c "import torch; exit(0 if torch.compiled_with_cxx11_abi() else (0 if torch._C._PYBIND11_BUILD_ABI == '_cxxabi1011' else 1))"
  popd
 fi
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -202,7 +202,7 @@ function install_torchrec_and_fbgemm() {

 function clone_pytorch_xla() {
  if [[ ! -d ./xla ]]; then
-    git clone --recursive --quiet https://github.com/pytorch/xla.git
+    git clone --recursive -b r2.7 https://github.com/pytorch/xla.git
    pushd xla
    # pin the xla hash so that we don't get broken by changes to xla
    git checkout "$(cat ../.github/ci_commit_pins/xla.txt)"
--- a/.ci/pytorch/install_cache_xla.sh
+++ b/.ci/pytorch/install_cache_xla.sh
@ -1,31 +1,50 @@
 #!/bin/bash

 # Script for installing sccache on the xla build job, which uses xla's docker
-# image and doesn't have sccache installed on it.  This is mostly copied from
-# .ci/docker/install_cache.sh.  Changes are: removing checks that will always
-# return the same thing, ex checks for for rocm, CUDA, and changing the path
-# where sccache is installed, and not changing /etc/environment.
+# image, which has sccache installed but doesn't write the stubs.  This is
+# mostly copied from .ci/docker/install_cache.sh.  Changes are: removing checks
+# that will always return the same thing, e.g. checks for rocm, CUDA, changing
+# the path where sccache is installed, not changing /etc/environment, and not
+# installing/downloading sccache as it is already in the docker image.

 set -ex -o pipefail

-install_binary() {
-  echo "Downloading sccache binary from S3 repo"
-  curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /tmp/cache/bin/sccache
-}
-
 mkdir -p /tmp/cache/bin
-mkdir -p /tmp/cache/lib
 export PATH="/tmp/cache/bin:$PATH"

-install_binary
-chmod a+x /tmp/cache/bin/sccache
-
 function write_sccache_stub() {
  # Unset LD_PRELOAD for ps because of asan + ps issues
  # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
-  # shellcheck disable=SC2086
-  # shellcheck disable=SC2059
-  printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n  exec sccache $(which $1) \"\$@\"\nelse\n  exec $(which $1) \"\$@\"\nfi" > "/tmp/cache/bin/$1"
+  if [ "$1" == "gcc" ]; then
+    # Do not call sccache recursively when dumping preprocessor argument
+    # For some reason it's very important for the first cached nvcc invocation
+    cat >"/tmp/cache/bin/$1" <<EOF
+#!/bin/sh
+
+# sccache does not support -E flag, so we need to call the original compiler directly in order to avoid calling this wrapper recursively
+for arg in "\$@"; do
+  if [ "\$arg" = "-E" ]; then
+    exec $(which "$1") "\$@"
+  fi
+done
+
+if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
+  exec sccache $(which "$1") "\$@"
+else
+  exec $(which "$1") "\$@"
+fi
+EOF
+  else
+    cat >"/tmp/cache/bin/$1" <<EOF
+#!/bin/sh
+
+if [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then
+  exec sccache $(which "$1") "\$@"
+else
+  exec $(which "$1") "\$@"
+fi
+EOF
+  fi
  chmod a+x "/tmp/cache/bin/$1"
 }

--- a/.ci/pytorch/macos-build.sh
+++ b/.ci/pytorch/macos-build.sh
@ -33,15 +33,56 @@ if which sccache > /dev/null; then
  export PATH="${tmp_dir}:$PATH"
 fi

-print_cmake_info
-if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
-  # Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
-  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
-else
+cross_compile_arm64() {
+  # Cross compilation for arm64
  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
+  USE_DISTRIBUTED=0 CMAKE_OSX_ARCHITECTURES=arm64 MACOSX_DEPLOYMENT_TARGET=11.0 USE_MKLDNN=OFF USE_QNNPACK=OFF WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
+}
+
+compile_arm64() {
+  # Compilation for arm64
+  # TODO: Compile with OpenMP support (but this causes CI regressions as cross-compilation was done with OpenMP disabled)
  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
+}
+
+compile_x86_64() {
+  USE_DISTRIBUTED=0 WERROR=1 python setup.py bdist_wheel --plat-name=macosx_10_9_x86_64
+}
+
+build_lite_interpreter() {
+    echo "Testing libtorch (lite interpreter)."
+
+    CPP_BUILD="$(pwd)/../cpp_build"
+    # Ensure the removal of the tmp directory
+    trap 'rm -rfv ${CPP_BUILD}' EXIT
+    rm -rf "${CPP_BUILD}"
+    mkdir -p "${CPP_BUILD}/caffe2"
+
+    # It looks like libtorch needs to be built in the "${CPP_BUILD}/caffe2" folder.
+    BUILD_LIBTORCH_PY=$PWD/tools/build_libtorch.py
+    pushd "${CPP_BUILD}/caffe2" || exit
+    VERBOSE=1 DEBUG=1 python "${BUILD_LIBTORCH_PY}"
+    popd || exit
+
+    "${CPP_BUILD}/caffe2/build/bin/test_lite_interpreter_runtime"
+}
+
+print_cmake_info
+
+if [[ ${BUILD_ENVIRONMENT} = *arm64* ]]; then
+  if [[ $(uname -m) == "arm64" ]]; then
+    compile_arm64
+  else
+    cross_compile_arm64
+  fi
+elif [[ ${BUILD_ENVIRONMENT} = *lite-interpreter* ]]; then
+  export BUILD_LITE_INTERPRETER=1
+  build_lite_interpreter
+else
+  compile_x86_64
 fi
+
 if which sccache > /dev/null; then
  print_sccache_stats
 fi
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -221,39 +221,25 @@ test_torchbench_smoketest() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

+  local backend=eager
+  local dtype=notset
  local device=mps
-  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam pytorch_unet stable_diffusion_text_encoder moco speech_transformer)

-  for backend in eager inductor; do
+  touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
+  touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"

-    for dtype in notset float16 bfloat16; do
-      echo "Launching torchbench inference performance run for backend ${backend} and dtype ${dtype}"
-      local dtype_arg="--${dtype}"
-      if [ "$dtype" == notset ]; then
-          dtype_arg="--float32"
-      fi
-      touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
-      for model in "${models[@]}"; do
-        PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-          --performance --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
-          --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" || true
-      done
-    done
-
-    for dtype in notset amp; do
-      echo "Launching torchbench training performance run for backend ${backend} and dtype ${dtype}"
-      touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
-      local dtype_arg="--${dtype}"
-      if [ "$dtype" == notset ]; then
-          dtype_arg="--float32"
-      fi
-      for model in "${models[@]}"; do
-        PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-          --performance --only "$model" --backend "$backend" --training --devices "$device" "$dtype_arg" \
-          --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv" || true
-      done
-    done
+  echo "Setup complete, launching torchbench training performance run"
+  for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
+    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
+      --performance --only "$model" --backend "$backend" --training --devices "$device" \
+      --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
+  done

+  echo "Launching torchbench inference performance run"
+  for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
+    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
+      --performance --only "$model" --backend "$backend" --inference --devices "$device" \
+      --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
  done

  echo "Pytorch benchmark on mps device completed"
--- a/.ci/pytorch/python_doc_push_script.sh
+++ b/.ci/pytorch/python_doc_push_script.sh
@ -119,6 +119,12 @@ popd
 git rm -rf "$install_path" || true
 mv "$pt_checkout/docs/build/html" "$install_path"

+# Prevent Google from indexing $install_path/_modules. This folder contains
+# generated source files.
+# NB: the following only works on gnu sed. The sed shipped with mac os is different.
+# One can `brew install gnu-sed` on a mac and then use "gsed" instead of "sed".
+find "$install_path/_modules" -name "*.html" -print0 | xargs -0 sed -i '/<head>/a \ \ <meta name="robots" content="noindex">'
+
 git add "$install_path" || true
 git status
 git config user.email "soumith+bot@pytorch.org"
--- a/.ci/pytorch/smoke_test/check_binary_symbols.py
+++ b/.ci/pytorch/smoke_test/check_binary_symbols.py
@ -80,7 +80,7 @@ def grep_symbols(lib: str, patterns: list[Any]) -> list[str]:
        return functools.reduce(list.__add__, (x.result() for x in tasks), [])


-def check_lib_symbols_for_abi_correctness(lib: str) -> None:
+def check_lib_symbols_for_abi_correctness(lib: str, pre_cxx11_abi: bool = True) -> None:
    print(f"lib: {lib}")
    cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS)
    pre_cxx11_symbols = grep_symbols(lib, LIBTORCH_PRE_CXX11_PATTERNS)
@ -88,12 +88,28 @@ def check_lib_symbols_for_abi_correctness(lib: str) -> None:
    num_pre_cxx11_symbols = len(pre_cxx11_symbols)
    print(f"num_cxx11_symbols: {num_cxx11_symbols}")
    print(f"num_pre_cxx11_symbols: {num_pre_cxx11_symbols}")
-    if num_pre_cxx11_symbols > 0:
-        raise RuntimeError(
-            f"Found pre-cxx11 symbols, but there shouldn't be any, see: {pre_cxx11_symbols[:100]}"
+    if pre_cxx11_abi:
+        if num_cxx11_symbols > 0:
+            raise RuntimeError(
+                f"Found cxx11 symbols, but there shouldn't be any, see: {cxx11_symbols[:100]}"
+            )
+        if num_pre_cxx11_symbols < 1000:
+            raise RuntimeError("Didn't find enough pre-cxx11 symbols.")
+        # Check for no recursive iterators, regression test for https://github.com/pytorch/pytorch/issues/133437
+        rec_iter_symbols = grep_symbols(
+            lib, [re.compile("std::filesystem::recursive_directory_iterator.*")]
        )
-    if num_cxx11_symbols < 100:
-        raise RuntimeError("Didn't find enought cxx11 symbols")
+        if len(rec_iter_symbols) > 0:
+            raise RuntimeError(
+                f"recursive_directory_iterator in used pre-CXX11 binaries, see; {rec_iter_symbols}"
+            )
+    else:
+        if num_pre_cxx11_symbols > 0:
+            raise RuntimeError(
+                f"Found pre-cxx11 symbols, but there shouldn't be any, see: {pre_cxx11_symbols[:100]}"
+            )
+        if num_cxx11_symbols < 100:
+            raise RuntimeError("Didn't find enought cxx11 symbols")


 def main() -> None:
@ -105,8 +121,9 @@ def main() -> None:
        else:
            install_root = Path(distutils.sysconfig.get_python_lib()) / "torch"

-    libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so")
-    check_lib_symbols_for_abi_correctness(libtorch_cpu_path)
+    libtorch_cpu_path = install_root / "lib" / "libtorch_cpu.so"
+    pre_cxx11_abi = "cxx11-abi" not in os.getenv("DESIRED_DEVTOOLSET", "")
+    check_lib_symbols_for_abi_correctness(libtorch_cpu_path, pre_cxx11_abi)


 if __name__ == "__main__":
--- a/.ci/pytorch/smoke_test/check_gomp.py
+++ b/.ci/pytorch/smoke_test/check_gomp.py
@ -0,0 +1,74 @@
+import ctypes
+import os
+import sys
+from pathlib import Path
+
+
+def get_gomp_thread():
+    """
+    Retrieves the maximum number of OpenMP threads after loading the `libgomp.so.1` library
+    and the `libtorch_cpu.so` library. It then queries the
+    maximum number of threads available for OpenMP parallel regions using the
+    `omp_get_max_threads` function.
+
+    Returns:
+        int: The maximum number of OpenMP threads available.
+
+    Notes:
+        - The function assumes the default path for `libgomp.so.1` on AlmaLinux OS.
+        - The path to `libtorch_cpu.so` is constructed based on the Python executable's
+          installation directory.
+        - This function is specific to environments where PyTorch and OpenMP are used
+          together and may require adjustments for other setups.
+        - As a side effect, `GOMP_CPU_AFFINITY` is set to "0-3" in this process's
+          environment before either library is loaded.
+        - `omp_get_max_threads` is looked up on the handle returned for
+          `libtorch_cpu.so`, not on the handle returned for `libgomp.so.1`.
+    """
+    python_path = Path(sys.executable).resolve()
+    python_prefix = (
+        python_path.parent.parent
+    )  # Typically goes to the Python installation root
+
+    # Get the additional ABI flags (if any); it may be an empty string.
+    abiflags = getattr(sys, "abiflags", "")
+
+    # Construct the Python directory name correctly (e.g., "python3.13t").
+    python_version = (
+        f"python{sys.version_info.major}.{sys.version_info.minor}{abiflags}"
+    )
+
+    # Expected layout: <prefix>/lib/<python_version>/site-packages/torch/lib/libtorch_cpu.so
+    libtorch_cpu_path = (
+        python_prefix
+        / "lib"
+        / python_version
+        / "site-packages"
+        / "torch"
+        / "lib"
+        / "libtorch_cpu.so"
+    )
+
+    # use the default gomp path of AlmaLinux OS
+    libgomp_path = "/usr/lib64/libgomp.so.1"
+
+    # Written to os.environ before the CDLL calls below.
+    # NOTE(review): presumably chosen so a healthy libgomp reports more than
+    # one thread; confirm against the linked issue in the calling script.
+    os.environ["GOMP_CPU_AFFINITY"] = "0-3"
+
+    # The first call loads the system libgomp; its handle is then discarded
+    # because the same name is rebound to the libtorch_cpu.so handle. From
+    # here on `libgomp` refers to libtorch_cpu.so despite its name.
+    libgomp = ctypes.CDLL(libgomp_path)
+    libgomp = ctypes.CDLL(libtorch_cpu_path)
+
+    # Declare the C signature: int omp_get_max_threads(void)
+    libgomp.omp_get_max_threads.restype = ctypes.c_int
+    libgomp.omp_get_max_threads.argtypes = []
+
+    omp_max_threads = libgomp.omp_get_max_threads()
+    return omp_max_threads
+
+
+def main():
+    """
+    Print the value returned by `get_gomp_thread` and fail when it equals 1.
+
+    Raises:
+        RuntimeError: If the reported maximum number of OpenMP threads is exactly 1.
+    """
+    omp_max_threads = get_gomp_thread()
+    print(
+        f"omp_max_threads after loading libgomp.so and libtorch_cpu.so: {omp_max_threads}"
+    )
+    # Only the value 1 is treated as a failure; any other value passes.
+    if omp_max_threads == 1:
+        raise RuntimeError(
+            "omp_max_threads is 1. Check whether libgomp.so is loaded twice."
+        )
+
+
+if __name__ == "__main__":
+    main()
--- a/.ci/pytorch/smoke_test/smoke_test.py
+++ b/.ci/pytorch/smoke_test/smoke_test.py
@ -7,7 +7,6 @@ import subprocess
 import sys
 from pathlib import Path
 from tempfile import NamedTemporaryFile
-from typing import Optional

 import torch
 import torch._dynamo
@ -196,41 +195,8 @@ def test_cuda_gds_errors_captured() -> None:
        )


-def find_pypi_package_version(package: str) -> Optional[str]:
-    from importlib import metadata
-
-    dists = metadata.distributions()
-    for dist in dists:
-        if dist.metadata["Name"].startswith(package):
-            return dist.version
-    return None
-
-
-def cudnn_to_version_str(cudnn_version: int) -> str:
-    patch = int(cudnn_version % 10)
-    minor = int((cudnn_version / 100) % 100)
-    major = int((cudnn_version / 10000) % 10000)
-    return f"{major}.{minor}.{patch}"
-
-
-def compare_pypi_to_torch_versions(
-    package: str, pypi_version: str, torch_version: str
-) -> None:
-    if pypi_version is None:
-        raise RuntimeError(f"Can't find {package} in PyPI for Torch: {torch_version}")
-    if pypi_version.startswith(torch_version):
-        print(f"Found matching {package}. Torch: {torch_version} PyPI {pypi_version}")
-    else:
-        raise RuntimeError(
-            f"Wrong {package} version. Torch: {torch_version} PyPI: {pypi_version}"
-        )
-
-
 def smoke_test_cuda(
-    package: str,
-    runtime_error_check: str,
-    torch_compile_check: str,
-    pypi_pkg_check: str,
+    package: str, runtime_error_check: str, torch_compile_check: str
 ) -> None:
    if not torch.cuda.is_available() and is_cuda_system:
        raise RuntimeError(f"Expected CUDA {gpu_arch_ver}. However CUDA is not loaded.")
@ -260,30 +226,20 @@ def smoke_test_cuda(
            raise RuntimeError(
                f"Wrong CUDA version. Loaded: {torch.version.cuda} Expected: {gpu_arch_ver}"
            )
-
        print(f"torch cuda: {torch.version.cuda}")
+        # todo add cudnn version validation
+        print(f"torch cudnn: {torch.backends.cudnn.version()}")
+        print(f"cuDNN enabled? {torch.backends.cudnn.enabled}")
+
        torch.cuda.init()
        print("CUDA initialized successfully")
        print(f"Number of CUDA devices: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"Device {i}: {torch.cuda.get_device_name(i)}")

-        print(f"cuDNN enabled? {torch.backends.cudnn.enabled}")
-        torch_cudnn_version = cudnn_to_version_str(torch.backends.cudnn.version())
-        print(f"Torch cuDNN version: {torch_cudnn_version}")
-
+        # nccl is available only on Linux
        if sys.platform in ["linux", "linux2"]:
-            torch_nccl_version = ".".join(str(v) for v in torch.cuda.nccl.version())
-            print(f"Torch nccl; version: {torch_nccl_version}")
-
-        # Pypi dependencies are installed on linux ony and nccl is availbale only on Linux.
-        if pypi_pkg_check == "enabled" and sys.platform in ["linux", "linux2"]:
-            compare_pypi_to_torch_versions(
-                "cudnn", find_pypi_package_version("nvidia-cudnn"), torch_cudnn_version
-            )
-            compare_pypi_to_torch_versions(
-                "nccl", find_pypi_package_version("nvidia-nccl"), torch_nccl_version
-            )
+            print(f"torch nccl version: {torch.cuda.nccl.version()}")

        if runtime_error_check == "enabled":
            test_cuda_runtime_errors_captured()
@ -442,13 +398,6 @@ def parse_args():
        choices=["enabled", "disabled"],
        default="enabled",
    )
-    parser.add_argument(
-        "--pypi-pkg-check",
-        help="Check pypi package versions cudnn and nccl",
-        type=str,
-        choices=["enabled", "disabled"],
-        default="enabled",
-    )
    return parser.parse_args()


@ -473,10 +422,7 @@ def main() -> None:
        smoke_test_modules()

    smoke_test_cuda(
-        options.package,
-        options.runtime_error_check,
-        options.torch_compile_check,
-        options.pypi_pkg_check,
+        options.package, options.runtime_error_check, options.torch_compile_check
    )


--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -1175,6 +1175,7 @@ build_xla() {
  # These functions are defined in .circleci/common.sh in pytorch/xla repo
  retry install_pre_deps_pytorch_xla $XLA_DIR $USE_CACHE
  CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch:${CMAKE_PREFIX_PATH}" XLA_SANDBOX_BUILD=1 build_torch_xla $XLA_DIR
+  retry install_post_deps_pytorch_xla
  assert_git_not_dirty
 }

@ -1526,27 +1527,6 @@ test_linux_aarch64() {
       --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
 }

-test_operator_benchmark() {
-  TEST_REPORTS_DIR=$(pwd)/test/test-reports
-  mkdir -p "$TEST_REPORTS_DIR"
-  TEST_DIR=$(pwd)
-
-  test_inductor_set_cpu_affinity
-
-  cd benchmarks/operator_benchmark/pt_extension
-  python setup.py install
-
-  cd "${TEST_DIR}"/benchmarks/operator_benchmark
-  $TASKSET python -m benchmark_all_test --device "$1" --tag-filter "$2" \
-      --output-dir "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv"
-
-  pip_install pandas
-  python check_perf_csv.py \
-      --actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \
-      --expected "expected_ci_operator_benchmark_eager_float32_cpu.csv"
-}
-
-
 if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
  (cd test && python -c "import torch; print(torch.__config__.show())")
  (cd test && python -c "import torch; print(torch.__config__.parallel_info())")
@ -1577,19 +1557,6 @@ elif [[ "$TEST_CONFIG" == distributed ]]; then
  if [[ "${SHARD_NUMBER}" == 1 ]]; then
    test_rpc
  fi
-elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
-  TEST_MODE="short"
-
-  if [[ "${TEST_CONFIG}" == *cpu* ]]; then
-    if [[ "${TEST_CONFIG}" == *long* ]]; then
-      TEST_MODE="long"
-    elif [[ "${TEST_CONFIG}" == *all* ]]; then
-      TEST_MODE="all"
-    fi
-
-    test_operator_benchmark cpu ${TEST_MODE}
-
-  fi
 elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
@ -1652,7 +1619,6 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
  install_torchvision
  checkout_install_torchbench hf_T5 llama moco
  PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
-  test_inductor_aoti
 elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
  install_torchvision
  test_inductor_shard "${SHARD_NUMBER}"
--- a/.ci/pytorch/windows/internal/smoke_test.bat
+++ b/.ci/pytorch/windows/internal/smoke_test.bat
@ -42,6 +42,7 @@ if "%DESIRED_PYTHON%" == "3.12" set "PYTHON_INSTALLER_URL=https://www.python.org
 if "%DESIRED_PYTHON%" == "3.11" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.11.0/python-3.11.0-amd64.exe"
 if "%DESIRED_PYTHON%" == "3.10" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.10.0/python-3.10.0-amd64.exe"
 if "%DESIRED_PYTHON%" == "3.9" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.9.0/python-3.9.0-amd64.exe"
+if "%DESIRED_PYTHON%" == "3.8" set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.8.2/python-3.8.2-amd64.exe"
 if "%PYTHON_INSTALLER_URL%" == "" (
    echo Python %DESIRED_PYTHON% not supported yet
 )
@ -127,6 +128,7 @@ goto end
 :libtorch
 echo "install and test libtorch"

+if "%VC_YEAR%" == "2019" powershell internal\vs2019_install.ps1
 if "%VC_YEAR%" == "2022" powershell internal\vs2022_install.ps1

 if ERRORLEVEL 1 exit /b 1
@ -138,6 +140,10 @@ pushd tmp\libtorch

 set VC_VERSION_LOWER=17
 set VC_VERSION_UPPER=18
+IF "%VC_YEAR%" == "2019" (
+    set VC_VERSION_LOWER=16
+    set VC_VERSION_UPPER=17
+)

 for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do (
    if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
--- a/.ci/pytorch/windows/internal/static_lib_test.bat
+++ b/.ci/pytorch/windows/internal/static_lib_test.bat
@ -70,6 +70,7 @@ echo "install and test libtorch"
 pip install cmake
 echo "installing cmake"

+if "%VC_YEAR%" == "2019" powershell internal\vs2019_install.ps1
 if "%VC_YEAR%" == "2022" powershell internal\vs2022_install.ps1

 if ERRORLEVEL 1 exit /b 1
@ -82,6 +83,10 @@ pushd tmp\libtorch

 set VC_VERSION_LOWER=17
 set VC_VERSION_UPPER=18
+IF "%VC_YEAR%" == "2019" (
+    set VC_VERSION_LOWER=16
+    set VC_VERSION_UPPER=17
+)

 for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do (
    if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
--- a/.ci/pytorch/windows/internal/vc_install_helper.bat
+++ b/.ci/pytorch/windows/internal/vc_install_helper.bat
@ -1,8 +1,12 @@
+if "%VC_YEAR%" == "2019" powershell windows/internal/vs2019_install.ps1
 if "%VC_YEAR%" == "2022" powershell windows/internal/vs2022_install.ps1

 set VC_VERSION_LOWER=17
 set VC_VERSION_UPPER=18
-
+if "%VC_YEAR%" == "2019" (
+    set VC_VERSION_LOWER=16
+    set VC_VERSION_UPPER=17
+)

 for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe"  -products Microsoft.VisualStudio.Product.BuildTools -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do (
    if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" (
--- a/.ci/pytorch/windows/internal/vs2019_install.ps1
+++ b/.ci/pytorch/windows/internal/vs2019_install.ps1
@ -0,0 +1,48 @@
+# Downloads and silently installs the Visual Studio 2019 (16.8.6) Build Tools.
+# Exits 0 on success (or when a suitable installation already exists outside
+# CircleCI) and 1 when a download or the installation fails.
+#
+# https://developercommunity.visualstudio.com/t/install-specific-version-of-vs-component/1142479
+# https://docs.microsoft.com/en-us/visualstudio/releases/2019/history#release-dates-and-build-numbers
+
+# 16.8.6 BuildTools
+$VS_DOWNLOAD_LINK = "https://ossci-windows.s3.us-east-1.amazonaws.com/vs16.8.6_BuildTools.exe"
+$COLLECT_DOWNLOAD_LINK = "https://aka.ms/vscollect.exe"
+# Each "--add <component>" entry is passed to the installer as part of its argument list.
+$VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools",
+                                                     "--add Microsoft.Component.MSBuild",
+                                                     "--add Microsoft.VisualStudio.Component.Roslyn.Compiler",
+                                                     "--add Microsoft.VisualStudio.Component.TextTemplating",
+                                                     "--add Microsoft.VisualStudio.Component.VC.CoreIde",
+                                                     "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest",
+                                                     "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core",
+                                                     "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64",
+                                                     "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Win81")
+
+# NOTE(review): -k makes curl skip TLS certificate verification for this download.
+curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe
+if ($LASTEXITCODE -ne 0) {
+    echo "Download of the VS 2019 Version 16.8.6 installer failed"
+    exit 1
+}
+
+# If vswhere is present, look for an existing Build Tools install in the [16, 17) version range.
+if (Test-Path "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe") {
+    $existingPath = & "${env:ProgramFiles(x86)}\Microsoft Visual Studio\Installer\vswhere.exe" -products "Microsoft.VisualStudio.Product.BuildTools" -version "[16, 17)" -property installationPath
+    # $null goes on the left so the comparison stays a scalar test even if vswhere returns several paths.
+    if ($null -ne $existingPath) {
+        # When CIRCLECI is unset or empty the existing install is reused and the script stops here;
+        # otherwise the installer below is still run.
+        if (!${env:CIRCLECI}) {
+            echo "Found correctly versioned existing BuildTools installation in $existingPath"
+            exit 0
+        }
+        echo "Found existing BuildTools installation in $existingPath, keeping it"
+    }
+}
+
+$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru
+Remove-Item -Path vs_installer.exe -Force
+$exitCode = $process.ExitCode
+# Exit codes 0 and 3010 are both accepted; anything else is treated as a failed install.
+if (($exitCode -ne 0) -and ($exitCode -ne 3010)) {
+    echo "VS 2019 installer exited with code $exitCode, which should be one of [0, 3010]."
+    # On failure, fetch and run the VS log collection tool, then copy its archive to the build results folder.
+    curl.exe --retry 3 -kL $COLLECT_DOWNLOAD_LINK --output Collect.exe
+    if ($LASTEXITCODE -ne 0) {
+        echo "Download of the VS Collect tool failed."
+        exit 1
+    }
+    Start-Process "${PWD}\Collect.exe" -NoNewWindow -Wait -PassThru
+    New-Item -Path "C:\w\build-results" -ItemType "directory" -Force
+    Copy-Item -Path "C:\Users\${env:USERNAME}\AppData\Local\Temp\vslogs.zip" -Destination "C:\w\build-results\"
+    exit 1
+}
--- a/.circleci/scripts/binary_linux_test.sh
+++ b/.circleci/scripts/binary_linux_test.sh
@ -90,16 +90,12 @@ fi
 /pytorch/.ci/pytorch/check_binary.sh

 if [[ "\$GPU_ARCH_TYPE" != *s390x* && "\$GPU_ARCH_TYPE" != *xpu* && "\$GPU_ARCH_TYPE" != *rocm*  && "$PACKAGE_TYPE" != libtorch ]]; then
+  # Exclude s390, xpu, rocm and libtorch builds from smoke testing
+  python /pytorch/.ci/pytorch/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled

-  torch_pkg_size="$(ls -1 /final_pkgs/torch-* | sort |tail -1 |xargs wc -c |cut -d ' ' -f1)"
-  # todo: implement check for large binaries
-  # if the package is larger than 1.5GB, we disable the pypi check.
-  # this package contains all libraries packaged in torch libs folder
-  # example of such package is https://download.pytorch.org/whl/cu126_full/torch
-  if [[ "\$torch_pkg_size" -gt  1500000000 ]]; then
-    python /pytorch/.ci/pytorch/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled --pypi-pkg-check disabled
-  else
-    python /pytorch/.ci/pytorch/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled $extra_parameters
+  if [[ "\$GPU_ARCH_TYPE" != *cpu-aarch64* ]]; then
+    # test for issue https://github.com/pytorch/pytorch/issues/149422
+    python /pytorch/.ci/pytorch/smoke_test/check_gomp.py
  fi
 fi

--- a/.circleci/scripts/binary_upload.sh
+++ b/.circleci/scripts/binary_upload.sh
@ -55,16 +55,12 @@ s3_upload() {
    s3_upload_dir="${s3_root_dir}/${UPLOAD_SUBFOLDER}/"
  fi
  (
-    cache_control_flag=""
-    if [[ "${UPLOAD_CHANNEL}" = "test" ]]; then
-      cache_control_flag="--cache-control='no-cache,no-store,must-revalidate'"
-    fi
    for pkg in ${PKG_DIR}/*.${extension}; do
      (
        set -x
        shm_id=$(sha256sum "${pkg}" | awk '{print $1}')
        ${AWS_S3_CP} --no-progress --acl public-read "${pkg}" "${s3_upload_dir}" \
-          --metadata "checksum-sha256=${shm_id}" ${cache_control_flag}
+          --metadata "checksum-sha256=${shm_id}"
      )
    done
  )
--- a/.circleci/scripts/binary_windows_arm64_build.sh
+++ b/.circleci/scripts/binary_windows_arm64_build.sh
@ -0,0 +1,22 @@
+#!/bin/bash
+# Builds the Windows ARM64 PyTorch binary (libtorch or wheel) selected by PACKAGE_TYPE.
+# -e: exit on error, -u: error on unset variables, -x: trace commands,
+# pipefail: a pipeline fails if any stage fails.
+set -eux -o pipefail
+
+# Load the build environment; falls back to /c/w/env when BINARY_ENV_FILE is unset or empty.
+source "${BINARY_ENV_FILE:-/c/w/env}"
+mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
+
+export USE_SCCACHE=1
+export SCCACHE_IGNORE_SERVER_IO_ERROR=1
+
+echo "Free space on filesystem before build:"
+df -h
+
+export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT"
+
+# Dispatch on the package type. NOTE(review): any other PACKAGE_TYPE value
+# matches neither branch, so no build script is run and the script still succeeds.
+if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
+    pytorch/.ci/pytorch/windows/arm64/build_libtorch.bat
+elif [[ "$PACKAGE_TYPE" == 'wheel' ]]; then
+    pytorch/.ci/pytorch/windows/arm64/build_pytorch.bat
+fi
+
+echo "Free space on filesystem after build:"
+df -h
--- a/.circleci/scripts/binary_windows_arm64_test.sh
+++ b/.circleci/scripts/binary_windows_arm64_test.sh
@ -0,0 +1,6 @@
+#!/bin/bash
+# Runs the Windows ARM64 smoke test batch script after loading the binary build environment.
+set -eux -o pipefail
+
+# Load the build environment; falls back to /c/w/env when BINARY_ENV_FILE is unset or empty.
+source "${BINARY_ENV_FILE:-/c/w/env}"
+
+pytorch/.ci/pytorch/windows/arm64/smoke_test.bat
--- a/.circleci/scripts/binary_windows_build.sh
+++ b/.circleci/scripts/binary_windows_build.sh
@ -4,15 +4,14 @@ set -eux -o pipefail
 source "${BINARY_ENV_FILE:-/c/w/env}"
 mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"

-if [[ "$OS" != "windows-arm64" ]]; then
-    export CUDA_VERSION="${DESIRED_CUDA/cu/}"
-    export USE_SCCACHE=1
-    export SCCACHE_BUCKET=ossci-compiler-cache
-    export SCCACHE_IGNORE_SERVER_IO_ERROR=1
-    export VC_YEAR=2022
-fi
+export CUDA_VERSION="${DESIRED_CUDA/cu/}"
+export USE_SCCACHE=1
+export SCCACHE_BUCKET=ossci-compiler-cache
+export SCCACHE_IGNORE_SERVER_IO_ERROR=1
+export VC_YEAR=2019

 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
+    export VC_YEAR=2022
    export USE_SCCACHE=0
    export XPU_VERSION=2025.0
    export XPU_ENABLE_KINETO=1
@ -23,16 +22,7 @@ df -h

 pushd "$PYTORCH_ROOT/.ci/pytorch/"
 export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT"
-
-if [[ "$OS" == "windows-arm64" ]]; then
-    if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
-        ./windows/arm64/build_libtorch.bat
-    elif [[ "$PACKAGE_TYPE" == 'wheel' ]]; then
-        ./windows/arm64/build_pytorch.bat
-    fi
-else
-    ./windows/internal/build_wheels.bat
-fi
+./windows/internal/build_wheels.bat

 echo "Free space on filesystem after build:"
 df -h
--- a/.circleci/scripts/binary_windows_test.sh
+++ b/.circleci/scripts/binary_windows_test.sh
@ -4,18 +4,14 @@ set -eux -o pipefail
 source "${BINARY_ENV_FILE:-/c/w/env}"

 export CUDA_VERSION="${DESIRED_CUDA/cu/}"
-export VC_YEAR=2022
+export VC_YEAR=2019

 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
+    export VC_YEAR=2022
    export XPU_VERSION=2025.0
 fi

 pushd "$PYTORCH_ROOT/.ci/pytorch/"
-
-if [[ "$OS" == "windows-arm64" ]]; then
-    ./windows/arm64/smoke_test.bat
-else
-    ./windows/internal/smoke_test.bat
-fi
+./windows/internal/smoke_test.bat

 popd
--- a/.clang-tidy
+++ b/.clang-tidy
@ -48,10 +48,12 @@ misc-*,
 -misc-no-recursion,
 -misc-non-private-member-variables-in-classes,
 -misc-unused-using-decls,
+-misc-use-internal-linkage,
 modernize-*,
 -modernize-macro-to-enum,
 -modernize-return-braced-init-list,
 -modernize-use-auto,
+-modernize-use-default-member-init,
 -modernize-use-using,
 -modernize-use-trailing-return-type,
 -modernize-use-nodiscard,
--- a/.editorconfig
+++ b/.editorconfig
@ -1,14 +0,0 @@
-root = true
-
-[*]
-end_of_line = lf
-insert_final_newline = true
-
-# Python
-[*.py]
-indent_style = space
-indent_size = 4
-
-# Make
-[Makefile]
-indent_style = tab
--- a/.github/ISSUE_TEMPLATE/disable-ci-jobs.md
+++ b/.github/ISSUE_TEMPLATE/disable-ci-jobs.md
@ -5,7 +5,7 @@ title: "DISABLED [WORKFLOW_NAME] / [PLATFORM_NAME] / [JOB_NAME]"
 labels: "module: ci"
 ---

-> For example, DISABLED pull / win-vs2022-cpu-py3 / test (default). Once
+> For example, DISABLED pull / win-vs2019-cpu-py3 / test (default). Once
 > created, the job will be disabled within 15 minutes. You can check the
 > list of disabled jobs at https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json

--- a/.github/ISSUE_TEMPLATE/pt2-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/pt2-bug-report.yml
@ -20,7 +20,7 @@ body:

        - Don't compare indices of max/min etc, because that avoids the above requirement

-        - When comparing eager and torch.compile, use a higher precision result as a baseline. `torch._dynamo.utils.same` with fp64_ref will handle this comparison.
+        - If comparing eager and torch.compile at fp16/bf16, you should use fp32 as baseline

        - Ensure rng state used to compare results is equivalent. Use `torch._inductor.config.fallback_random=True` and reset the torch rng seed between comparisons

--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -45,15 +45,10 @@ self-hosted-runner:
    - windows.g5.4xlarge.nvidia.gpu
    # Windows ARM64 runners
    - windows-11-arm64
-    # Organization-wide AMD-hosted runners
-    # MI2xx runners
+    # Organization-wide AMD hosted runners
    - linux.rocm.gpu
    - linux.rocm.gpu.2
    - linux.rocm.gpu.4
-    # MI300 runners
-    - linux.rocm.gpu.mi300.2
-    - linux.rocm.gpu.mi300.4
-    - rocm-docker
    # Repo-specific Apple hosted  runners
    - macos-m1-ultra
    - macos-m2-14
--- a/.github/actions/binary-docker-build/action.yml
+++ b/.github/actions/binary-docker-build/action.yml
@ -1,70 +0,0 @@
-name: Binary docker build
-
-description: Build docker image for binary builds
-
-inputs:
-  docker-image-name:
-    description: Docker image name for PR builds
-    required: true
-  docker-build-dir:
-    description: Location of the build.sh relative to .ci/docker
-    required: true
-  custom-tag-prefix:
-    description: Custom tag prefix for the docker image
-    required: false
-  DOCKER_TOKEN:
-    description: Docker token for authentication
-    required: true
-  DOCKER_ID:
-    description: Docker ID for authentication
-    required: true
-
-runs:
-  using: composite
-  steps:
-    - name: Checkout PyTorch
-      uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-
-    - name: Calculate docker image
-      id: calculate-docker-image
-      uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-      with:
-        docker-image-name: ${{ inputs.docker-image-name }}
-        docker-build-dir: .ci/docker
-        custom-tag-prefix: ${{ inputs.custom-tag-prefix }}
-        docker-build-script: ${{ inputs.docker-build-dir }}/build.sh
-        always-rebuild: true
-        push: true
-
-    - name: Tag and (if WITH_PUSH) push docker image to docker.io
-      env:
-        DOCKER_TOKEN: ${{ inputs.DOCKER_TOKEN }}
-        DOCKER_ID: ${{ inputs.DOCKER_ID }}
-        DOCKER_IMAGE_NAME: ${{ inputs.docker-image-name }}
-        DOCKER_IMAGE_PREFIX: ${{ inputs.custom-tag-prefix }}
-        CREATED_FULL_DOCKER_IMAGE_NAME: ${{ steps.calculate-docker-image.outputs.docker-image }}
-      shell: bash
-      run: |
-        set -euox pipefail
-        GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
-        GIT_BRANCH_NAME=${GITHUB_REF##*/}
-        GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
-        CI_FOLDER_SHA=$(git rev-parse HEAD:.ci/docker)
-
-        DOCKER_IMAGE_NAME_PREFIX=docker.io/pytorch/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_PREFIX}
-
-        docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}
-        docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}
-        docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}
-        docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}
-
-        # Pretty sure Github will mask tokens and I'm not sure if it will even be
-        # printed due to pipe, but just in case
-        set +x
-        if [[ ${WITH_PUSH:-false} == "true" ]]; then
-          echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          docker push ${DOCKER_IMAGE_NAME_PREFIX}
-          docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}
-          docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}
-          docker push ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}
-        fi
--- a/.github/actions/checkout-pytorch/action.yml
+++ b/.github/actions/checkout-pytorch/action.yml
@ -23,44 +23,9 @@ runs:
      id: check_container_runner
      run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"

-    - name: Set up parallel fetch and clean workspace
-      id: first-clean
-      continue-on-error: true
+    - name: Clean workspace
      shell: bash
      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
-      env:
-        NO_SUDO: ${{ inputs.no-sudo }}
-      run: |
-        # Use all available CPUs for fetching
-        cd "${GITHUB_WORKSPACE}"
-        git config --global fetch.parallel 0
-        git config --global submodule.fetchJobs 0
-
-        # Clean workspace. The default checkout action should also do this, but
-        # do it here as well just in case
-        if [[ -d .git ]]; then
-          if [ -z "${NO_SUDO}" ]; then
-            sudo git clean -ffdx
-          else
-            git clean -ffdx
-          fi
-        fi
-
-    - name: Checkout PyTorch
-      id: first-checkout-attempt
-      continue-on-error: true
-      uses: actions/checkout@v4
-      with:
-        ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-        # --depth=1 for speed, manually fetch history and other refs as necessary
-        fetch-depth: ${{ inputs.fetch-depth }}
-        submodules: ${{ inputs.submodules }}
-        show-progress: false
-
-    - name: Clean workspace (try again)
-      if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' &&
-        (steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success') }}
-      shell: bash
      env:
        NO_SUDO: ${{ inputs.no-sudo }}
      run: |
@ -75,11 +40,16 @@ runs:
        fi
        mkdir "${GITHUB_WORKSPACE}"

-    - name: Checkout PyTorch (try again)
+        # Use all available CPUs for fetching
+        cd "${GITHUB_WORKSPACE}"
+        git config --global fetch.parallel 0
+        git config --global submodule.fetchJobs 0
+
+    - name: Checkout PyTorch
      uses: actions/checkout@v4
-      if: ${{ steps.first-clean.outcome != 'success' || steps.first-checkout-attempt.outcome != 'success' }}
      with:
        ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+        # --depth=1 for speed, manually fetch history and other refs as necessary
        fetch-depth: ${{ inputs.fetch-depth }}
        submodules: ${{ inputs.submodules }}
        show-progress: false
--- a/.github/actions/linux-test/action.yml
+++ b/.github/actions/linux-test/action.yml
@ -66,7 +66,7 @@ runs:

    - name: configure aws credentials
      if : ${{ inputs.aws-role-to-assume != '' }}
-      uses: aws-actions/configure-aws-credentials@v4
+      uses: aws-actions/configure-aws-credentials@v3
      with:
        role-to-assume: ${{ inputs.aws-role-to-assume }}
        role-session-name: gha-linux-test
--- a/.github/actions/test-pytorch-binary/action.yml
+++ b/.github/actions/test-pytorch-binary/action.yml
@ -15,6 +15,7 @@ runs:
          -e BINARY_ENV_FILE \
          -e BUILD_ENVIRONMENT \
          -e DESIRED_CUDA \
+          -e DESIRED_DEVTOOLSET \
          -e DESIRED_PYTHON \
          -e GITHUB_ACTIONS \
          -e GPU_ARCH_TYPE \
--- a/.github/actions/upload-test-artifacts/action.yml
+++ b/.github/actions/upload-test-artifacts/action.yml
@ -48,8 +48,14 @@ runs:
      run: |
        # Remove any previous usage logs if they exist
        rm -f logs-*.zip
-        zip "logs-${FILE_SUFFIX}.zip" 'usage_log.txt' || true
-        zip -r "logs-${FILE_SUFFIX}.zip" test/test-reports -i '*.log' || true
+        # this workflow is also run in bazel build test, but we dont generate usage reports for it
+        # so check to see if the file exists first
+        if [ -f 'usage_log.txt' ]; then
+            zip "logs-${FILE_SUFFIX}.zip" 'usage_log.txt'
+        fi
+        if find "test/test-reports" -name "*.log" 2>/dev/null | grep -q .; then
+            zip -r "logs-${FILE_SUFFIX}.zip" test/test-reports -i '*.log'
+        fi

    - name: Zip debugging artifacts for upload
      if: runner.os != 'Windows' && !inputs.use-gha
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-bccaa454a54c3c648697cc2f46a4fb0500b1f01b
+c670ad81fda266b6598aeeef434583eb98197ae8
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-ac9a39f4b768cef09b9d2be8e074be496d7783b6
+r2.7
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -112,22 +112,3 @@
 - torch/csrc/inductor/aoti_include/xpu.h
 - torch/csrc/inductor/cpp_wrapper/device_internal/xpu.h
 - torch/csrc/inductor/cpp_wrapper/xpu.h
-
-"release notes: inductor (aoti)":
- torch/_C/_aoti.pyi
- torch/_dynamo/repro/aoti.py
- torch/_export/serde/aoti_schema.py
- torch/_higher_order_ops/aoti_call_delegate.py
- torch/_inductor/codegen/aoti_runtime/**
- torch/_inductor/codegen/aoti_hipify_utils.py
- torch/_inductor/codegen/cpp_wrapper_cpu.py
- torch/_inductor/codegen/cpp_wrapper_gpu.py
- torch/_inductor/aoti_eager.py
- torch/csrc/inductor/aoti_runtime/**
- torch/csrc/inductor/aoti_torch/**
- torch/csrc/inductor/aoti_runner/**
- torch/csrc/inductor/aoti_eager/**
- torch/csrc/inductor/aoti_package/**
- torch/csrc/inductor/aoti_include/**
- torchgen/aoti/**
- torchgen/gen_aoti_c_shim.py
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@ -501,9 +501,7 @@
 - name: XPU
  patterns:
  - '**xpu**'
-  - '**XPU**'
  - '**sycl**'
-  - '**SYCL**'
  approved_by:
  - EikanWang
  - jgong5
@ -540,7 +538,6 @@
  - bdhirsh
  - zou3519
  - isuruf
-  - Chillee
  mandatory_checks_name:
  - EasyCLA
  - Lint
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -16,7 +16,6 @@ ciflow_push_tags:
 - ciflow/mps
 - ciflow/nightly
 - ciflow/periodic
- ciflow/periodic-rocm-mi300
 - ciflow/rocm
 - ciflow/rocm-mi300
 - ciflow/s390
@ -26,7 +25,6 @@ ciflow_push_tags:
 - ciflow/xpu
 - ciflow/torchbench
 - ciflow/autoformat
- ciflow/op-benchmark
 retryable_workflows:
 - pull
 - trunk
--- a/.github/scripts/amd/package_triton_wheel.sh
+++ b/.github/scripts/amd/package_triton_wheel.sh
@ -61,14 +61,10 @@ fi
 ROCM_SO=(
    "${libamdhip}"
    "libhsa-runtime64.so.1"
+    "libamd_comgr.so.2"
    "libdrm.so.2"
    "libdrm_amdgpu.so.1"
 )
-if [[ $ROCM_INT -ge 60400 ]]; then
-    ROCM_SO+=("libamd_comgr.so.3")
-else
-    ROCM_SO+=("libamd_comgr.so.2")
-fi

 if [[ $ROCM_INT -ge 60100 ]]; then
    ROCM_SO+=("librocprofiler-register.so.0")
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@ -39,9 +39,9 @@ SUPPORTED_PERIODICAL_MODES: dict[str, Callable[[Optional[str]], bool]] = {
 }

 # The link to the published list of disabled jobs
-DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json"
+DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=n.FT07XR3dLMwOLBwmRNquyYSeGk8Het"
 # and unstable jobs
-UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json"
+UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=.Ox7WAXa21I1PVqadHyPfhMRPhl0aCnD"

 # Some constants used to handle disabled and unstable jobs
 JOB_NAME_SEP = "/"
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -30,10 +30,12 @@ CUDA_ARCHES_CUDNN_VERSION = {
 }

 # NOTE: Also update the ROCm sources in tools/nightly.py when changing this list
-ROCM_ARCHES = ["6.3", "6.4"]
+ROCM_ARCHES = ["6.2.4", "6.3"]

 XPU_ARCHES = ["xpu"]

+CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
+
 CPU_AARCH64_ARCH = ["cpu-aarch64"]

 CPU_S390X_ARCH = ["cpu-s390x"]
@ -75,7 +77,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cuda-nvrtc-cu12==12.8.61; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-runtime-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cuda-cupti-cu12==12.8.57; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu12==9.8.0.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-cudnn-cu12==9.7.1.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cublas-cu12==12.8.3.14; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufft-cu12==11.3.3.41; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-curand-cu12==10.3.9.55; platform_system == 'Linux' and platform_machine == 'x86_64' | "
@ -144,6 +146,8 @@ def arch_type(arch_version: str) -> str:
        return "rocm"
    elif arch_version in XPU_ARCHES:
        return "xpu"
+    elif arch_version in CPU_CXX11_ABI_ARCH:
+        return "cpu-cxx11-abi"
    elif arch_version in CPU_AARCH64_ARCH:
        return "cpu-aarch64"
    elif arch_version in CPU_S390X_ARCH:
@ -172,23 +176,31 @@ WHEEL_CONTAINER_IMAGES = {
    },
    "xpu": f"pytorch/manylinux2_28-builder:xpu-{DEFAULT_TAG}",
    "cpu": f"pytorch/manylinux2_28-builder:cpu-{DEFAULT_TAG}",
+    "cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}",
    "cpu-aarch64": f"pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
-    "cpu-s390x": "pytorch/manylinuxs390x-builder:cpu-s390x",
+    "cpu-s390x": f"pytorch/manylinuxs390x-builder:cpu-s390x-{DEFAULT_TAG}",
 }

+CXX11_ABI = "cxx11-abi"
 RELEASE = "release"
 DEBUG = "debug"

-LIBTORCH_CONTAINER_IMAGES: dict[str, str] = {
+LIBTORCH_CONTAINER_IMAGES: dict[tuple[str, str], str] = {
    **{
-        gpu_arch: f"pytorch/libtorch-cxx11-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
+        (
+            gpu_arch,
+            CXX11_ABI,
+        ): f"pytorch/libtorch-cxx11-builder:cuda{gpu_arch}-{DEFAULT_TAG}"
        for gpu_arch in CUDA_ARCHES
    },
    **{
-        gpu_arch: f"pytorch/libtorch-cxx11-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
+        (
+            gpu_arch,
+            CXX11_ABI,
+        ): f"pytorch/libtorch-cxx11-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
        for gpu_arch in ROCM_ARCHES
    },
-    "cpu": f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}",
+    ("cpu", CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}",
 }

 FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12", "3.13", "3.13t"]
@ -198,6 +210,7 @@ def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
    return {
        "cpu": "cpu",
        "cpu-aarch64": "cpu",
+        "cpu-cxx11-abi": "cpu-cxx11-abi",
        "cpu-s390x": "cpu",
        "cuda": f"cu{gpu_arch_version.replace('.', '')}",
        "cuda-aarch64": f"cu{gpu_arch_version.replace('-aarch64', '').replace('.', '')}",
@ -212,7 +225,7 @@ def list_without(in_list: list[str], without: list[str]) -> list[str]:

 def generate_libtorch_matrix(
    os: str,
-    release_type: str,
+    abi_version: str,
    arches: Optional[list[str]] = None,
    libtorch_variants: Optional[list[str]] = None,
 ) -> list[dict[str, str]]:
@ -234,6 +247,9 @@ def generate_libtorch_matrix(
    ret: list[dict[str, str]] = []
    for arch_version in arches:
        for libtorch_variant in libtorch_variants:
+            # one of the values in the following list must be exactly
+            # CXX11_ABI, but the precise value of the other one doesn't
+            # matter
            gpu_arch_type = arch_type(arch_version)
            gpu_arch_version = "" if arch_version == "cpu" else arch_version
            # ROCm builds without-deps failed even in ROCm runners; skip for now
@ -246,15 +262,20 @@ def generate_libtorch_matrix(
                    "desired_cuda": translate_desired_cuda(
                        gpu_arch_type, gpu_arch_version
                    ),
-                    "libtorch_config": release_type,
                    "libtorch_variant": libtorch_variant,
+                    "libtorch_config": abi_version
+                    if os in ("windows", "windows-arm64")
+                    else "",
+                    "devtoolset": abi_version
+                    if os not in ("windows", "windows-arm64")
+                    else "",
                    "container_image": (
-                        LIBTORCH_CONTAINER_IMAGES[arch_version]
+                        LIBTORCH_CONTAINER_IMAGES[(arch_version, abi_version)]
                        if os not in ("windows", "windows-arm64")
                        else ""
                    ),
                    "package_type": "libtorch",
-                    "build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{release_type}".replace(
+                    "build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{abi_version}".replace(
                        ".", "_"
                    ),
                }
@ -280,7 +301,7 @@ def generate_wheels_matrix(
        # Define default compute archivectures
        arches = ["cpu"]
        if os == "linux":
-            arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
+            arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
        elif os == "windows":
            arches += CUDA_ARCHES + XPU_ARCHES
        elif os == "linux-aarch64":
@ -299,6 +320,7 @@ def generate_wheels_matrix(
            gpu_arch_version = (
                ""
                if arch_version == "cpu"
+                or arch_version == "cpu-cxx11-abi"
                or arch_version == "cpu-aarch64"
                or arch_version == "cpu-s390x"
                or arch_version == "xpu"
@ -333,6 +355,7 @@ def generate_wheels_matrix(
                        "gpu_arch_version": gpu_arch_version,
                        "desired_cuda": desired_cuda,
                        "use_split_build": "True" if use_split_build else "False",
+                        "devtoolset": "cxx11-abi",
                        "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
                        "package_type": package_type,
                        "pytorch_extra_install_requirements": (
@ -361,6 +384,7 @@ def generate_wheels_matrix(
                                gpu_arch_type, gpu_arch_version
                            ),
                            "use_split_build": "True" if use_split_build else "False",
+                            "devtoolset": "",
                            "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
                            "package_type": package_type,
                            "pytorch_extra_install_requirements": "",
@ -379,6 +403,12 @@ def generate_wheels_matrix(
                            gpu_arch_type, gpu_arch_version
                        ),
                        "use_split_build": "True" if use_split_build else "False",
+                        "devtoolset": (
+                            "cxx11-abi"
+                            if (arch_version in ["cpu-cxx11-abi", "cpu-aarch64"])
+                            or os == "linux"
+                            else ""
+                        ),
                        "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
                        "package_type": package_type,
                        "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
@ -387,8 +417,6 @@ def generate_wheels_matrix(
                        "pytorch_extra_install_requirements": (
                            PYTORCH_EXTRA_INSTALL_REQUIREMENTS["xpu"]
                            if gpu_arch_type == "xpu"
-                            else PYTORCH_EXTRA_INSTALL_REQUIREMENTS[CUDA_STABLE]
-                            if os != "linux"
                            else ""
                        ),
                    }
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@ -54,6 +54,7 @@ class BinaryBuildWorkflow:

    # Optional fields
    build_environment: str = ""
+    abi_version: str = ""
    ciflow_config: CIFlowConfig = field(default_factory=CIFlowConfig)
    is_scheduled: str = ""
    branches: str = "nightly"
@ -61,16 +62,14 @@ class BinaryBuildWorkflow:
    cross_compile_arm64: bool = False
    macos_runner: str = "macos-14-xlarge"
    use_split_build: bool = False
-    # Mainly used for libtorch builds
-    build_variant: str = ""

    def __post_init__(self) -> None:
-        if self.build_environment == "":
-            self.build_environment = "-".join(
-                item
-                for item in [self.os, "binary", self.package_type, self.build_variant]
-                if item != ""
+        if self.abi_version:
+            self.build_environment = (
+                f"{self.os}-binary-{self.package_type}-{self.abi_version}"
            )
+        else:
+            self.build_environment = f"{self.os}-binary-{self.package_type}"
        if self.use_split_build:
            # added to distinguish concurrency groups
            self.build_environment += "-split"
@ -134,9 +133,10 @@ LINUX_BINARY_BUILD_WORFKLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.LINUX,
        package_type="libtorch",
+        abi_version=generate_binary_build_matrix.CXX11_ABI,
        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
            OperatingSystem.LINUX,
-            generate_binary_build_matrix.RELEASE,
+            generate_binary_build_matrix.CXX11_ABI,
            libtorch_variants=["shared-with-deps"],
        ),
        ciflow_config=CIFlowConfig(
@ -176,10 +176,10 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.LINUX,
        package_type="libtorch",
-        build_variant=generate_binary_build_matrix.RELEASE,
+        abi_version=generate_binary_build_matrix.CXX11_ABI,
        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
            OperatingSystem.LINUX,
-            generate_binary_build_matrix.RELEASE,
+            generate_binary_build_matrix.CXX11_ABI,
            arches=["cpu"],
            libtorch_variants=["shared-with-deps"],
        ),
@ -202,7 +202,7 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.WINDOWS,
        package_type="libtorch",
-        build_variant=generate_binary_build_matrix.RELEASE,
+        abi_version=generate_binary_build_matrix.RELEASE,
        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
            OperatingSystem.WINDOWS,
            generate_binary_build_matrix.RELEASE,
@ -216,7 +216,7 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.WINDOWS,
        package_type="libtorch",
-        build_variant=generate_binary_build_matrix.DEBUG,
+        abi_version=generate_binary_build_matrix.DEBUG,
        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
            OperatingSystem.WINDOWS,
            generate_binary_build_matrix.DEBUG,
@ -227,6 +227,42 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
            isolated_workflow=True,
        ),
    ),
+]
+
+WINDOWS_BINARY_SMOKE_WORKFLOWS = [
+    BinaryBuildWorkflow(
+        os=OperatingSystem.WINDOWS,
+        package_type="libtorch",
+        abi_version=generate_binary_build_matrix.RELEASE,
+        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
+            OperatingSystem.WINDOWS,
+            generate_binary_build_matrix.RELEASE,
+            arches=["cpu"],
+            libtorch_variants=["shared-with-deps"],
+        ),
+        branches="main",
+        ciflow_config=CIFlowConfig(
+            isolated_workflow=True,
+        ),
+    ),
+    BinaryBuildWorkflow(
+        os=OperatingSystem.WINDOWS,
+        package_type="libtorch",
+        abi_version=generate_binary_build_matrix.DEBUG,
+        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
+            OperatingSystem.WINDOWS,
+            generate_binary_build_matrix.DEBUG,
+            arches=["cpu"],
+            libtorch_variants=["shared-with-deps"],
+        ),
+        branches="main",
+        ciflow_config=CIFlowConfig(
+            isolated_workflow=True,
+        ),
+    ),
+]
+
+WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.WINDOWS_ARM64,
        package_type="wheel",
@ -243,7 +279,7 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.WINDOWS_ARM64,
        package_type="libtorch",
-        build_variant=generate_binary_build_matrix.RELEASE,
+        abi_version=generate_binary_build_matrix.RELEASE,
        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
            OperatingSystem.WINDOWS_ARM64,
            generate_binary_build_matrix.RELEASE,
@ -258,7 +294,7 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.WINDOWS_ARM64,
        package_type="libtorch",
-        build_variant=generate_binary_build_matrix.DEBUG,
+        abi_version=generate_binary_build_matrix.DEBUG,
        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
            OperatingSystem.WINDOWS_ARM64,
            generate_binary_build_matrix.DEBUG,
@ -272,47 +308,14 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
    ),
 ]

-WINDOWS_BINARY_SMOKE_WORKFLOWS = [
-    BinaryBuildWorkflow(
-        os=OperatingSystem.WINDOWS,
-        package_type="libtorch",
-        build_variant=generate_binary_build_matrix.RELEASE,
-        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
-            OperatingSystem.WINDOWS,
-            generate_binary_build_matrix.RELEASE,
-            arches=["cpu"],
-            libtorch_variants=["shared-with-deps"],
-        ),
-        branches="main",
-        ciflow_config=CIFlowConfig(
-            isolated_workflow=True,
-        ),
-    ),
-    BinaryBuildWorkflow(
-        os=OperatingSystem.WINDOWS,
-        package_type="libtorch",
-        build_variant=generate_binary_build_matrix.DEBUG,
-        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
-            OperatingSystem.WINDOWS,
-            generate_binary_build_matrix.DEBUG,
-            arches=["cpu"],
-            libtorch_variants=["shared-with-deps"],
-        ),
-        branches="main",
-        ciflow_config=CIFlowConfig(
-            isolated_workflow=True,
-        ),
-    ),
-]
-
 MACOS_BINARY_BUILD_WORKFLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.MACOS_ARM64,
        package_type="libtorch",
-        build_variant=generate_binary_build_matrix.RELEASE,
+        abi_version=generate_binary_build_matrix.CXX11_ABI,
        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
            OperatingSystem.MACOS,
-            generate_binary_build_matrix.RELEASE,
+            generate_binary_build_matrix.CXX11_ABI,
            libtorch_variants=["shared-with-deps"],
        ),
        cross_compile_arm64=False,
@ -399,6 +402,10 @@ def main() -> None:
            jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
            WINDOWS_BINARY_SMOKE_WORKFLOWS,
        ),
+        (
+            jinja_env.get_template("windows_arm64_binary_build_workflow.yml.j2"),
+            WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS,
+        ),
        (
            jinja_env.get_template("macos_binary_build_workflow.yml.j2"),
            MACOS_BINARY_BUILD_WORKFLOWS,
--- a/.github/scripts/lintrunner.sh
+++ b/.github/scripts/lintrunner.sh
@ -1,6 +1,11 @@
 #!/usr/bin/env bash
 set -ex

+# The generic Linux job chooses to use base env, not the one setup by the image
+CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)"
+conda activate "${CONDA_ENV}"
+
 # Use uv to speed up lintrunner init
 python3 -m pip install uv==0.1.45

--- a/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile
+++ b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile
@ -5,50 +5,6 @@ FROM --platform=linux/amd64 docker.io/ubuntu:24.04 as ld-prefix
 ENV DEBIAN_FRONTEND=noninteractive
 RUN apt-get update && apt-get -y install ca-certificates libicu74 libssl3

-# Patched podman
-FROM --platform=linux/s390x docker.io/ubuntu:24.04 as podman
-ENV DEBIAN_FRONTEND=noninteractive
-RUN sed -i 's/^Types: deb$/Types: deb deb-src/' /etc/apt/sources.list.d/ubuntu.sources
-RUN apt-get update && \
-    apt-get install -y \
-        cmake \
-        curl \
-        devscripts \
-        dpkg-dev \
-        gdb \
-        less \
-        make \
-        python3 \
-        python3-pip \
-        quilt \
-        rsync \
-        software-properties-common \
-        stress-ng \
-        vim \
-        nano \
-        wget && \
-    apt-get build-dep -y podman && \
-    apt-get source podman
-
-COPY podman-patches/podman-25245.patch /tmp/podman-25245.patch
-COPY podman-patches/podman-25102-backport.patch /tmp/podman-25102-backport.patch
-
-# import and apply patches
-# patches:
-# https://github.com/containers/podman/pull/25102
-# https://github.com/containers/podman/pull/25245
-RUN cd /libpod-* && \
-    quilt import /tmp/podman-25245.patch && quilt push && \
-    quilt import /tmp/podman-25102-backport.patch && quilt push && \
-    dch -i "Fix podman deadlock and add option to clean up build leftovers" && \
-    /bin/rm /tmp/podman-25245.patch /tmp/podman-25102-backport.patch
-
-# build patched podman
-RUN cd /libpod-* && \
-    debuild -i -us -uc -b && \
-    /bin/rm /podman-remote_*.deb && \
-    mkdir /tmp/podman && cp -v /podman*.deb /tmp/podman
-
 # Main image.
 FROM --platform=linux/s390x docker.io/ubuntu:24.04

@ -89,11 +45,7 @@ COPY fs/ /
 RUN chmod +x /usr/bin/actions-runner /usr/bin/entrypoint

 # install podman
-# RUN apt-get update && apt -y install podman podman-docker
-
-# install patched podman
-COPY --from=podman /tmp/podman /tmp/podman
-RUN apt-get update && apt -y install /tmp/podman/*.deb && /bin/rm -rfv /tmp/podman
+RUN apt -y install podman podman-docker

 # amd64 Github Actions Runner.
 RUN useradd -m actions-runner
@ -113,7 +65,7 @@ RUN virtualenv --system-site-packages venv
 #
 COPY --chown=actions-runner:actions-runner manywheel-s390x.tar /home/actions-runner/manywheel-s390x.tar

-RUN curl -L https://github.com/actions/runner/releases/download/v2.322.0/actions-runner-linux-x64-2.322.0.tar.gz | tar -xz
+RUN curl -L https://github.com/actions/runner/releases/download/v2.317.0/actions-runner-linux-x64-2.317.0.tar.gz | tar -xz

 ENTRYPOINT ["/usr/bin/entrypoint"]
 CMD ["/usr/bin/actions-runner"]
--- a/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25102-backport.patch
+++ b/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25102-backport.patch
@ -1,358 +0,0 @@
-diff --git a/cmd/podman/system/prune.go b/cmd/podman/system/prune.go
-index f7cf7b551..739f87cde 100644
--- a/cmd/podman/system/prune.go
-+++ b/cmd/podman/system/prune.go
-@@ -48,6 +48,7 @@ func init() {
- 	flags.BoolVarP(&force, "force", "f", false, "Do not prompt for confirmation.  The default is false")
- 	flags.BoolVarP(&pruneOptions.All, "all", "a", false, "Remove all unused data")
- 	flags.BoolVar(&pruneOptions.External, "external", false, "Remove container data in storage not controlled by podman")
-+	flags.BoolVar(&pruneOptions.Build, "build", false, "Remove build containers")
- 	flags.BoolVar(&pruneOptions.Volume, "volumes", false, "Prune volumes")
- 	filterFlagName := "filter"
- 	flags.StringArrayVar(&filters, filterFlagName, []string{}, "Provide filter values (e.g. 'label=<key>=<value>')")
-@@ -64,8 +65,12 @@ func prune(cmd *cobra.Command, args []string) error {
- 			volumeString = `
- 	- all volumes not used by at least one container`
- 		}
-
-		fmt.Printf(createPruneWarningMessage(pruneOptions), volumeString, "Are you sure you want to continue? [y/N] ")
-+		buildString := ""
-+		if pruneOptions.Build {
-+			buildString = `
-+	- all build containers`
-+		}
-+		fmt.Printf(createPruneWarningMessage(pruneOptions), volumeString, buildString, "Are you sure you want to continue? [y/N] ")
- 
- 		answer, err := reader.ReadString('\n')
- 		if err != nil {
-@@ -124,7 +129,7 @@ func createPruneWarningMessage(pruneOpts entities.SystemPruneOptions) string {
- 	if pruneOpts.All {
- 		return `WARNING! This command removes:
- 	- all stopped containers
-	- all networks not used by at least one container%s
-+	- all networks not used by at least one container%s%s
- 	- all images without at least one container associated with them
- 	- all build cache
- 
-@@ -132,7 +137,7 @@ func createPruneWarningMessage(pruneOpts entities.SystemPruneOptions) string {
- 	}
- 	return `WARNING! This command removes:
- 	- all stopped containers
-	- all networks not used by at least one container%s
-+	- all networks not used by at least one container%s%s
- 	- all dangling images
- 	- all dangling build cache
- 
-diff --git a/docs/source/markdown/podman-system-prune.1.md b/docs/source/markdown/podman-system-prune.1.md
-index 52f9ec1c7..95099d018 100644
--- a/docs/source/markdown/podman-system-prune.1.md
-+++ b/docs/source/markdown/podman-system-prune.1.md
-@@ -7,20 +7,28 @@ podman\-system\-prune - Remove all unused pods, containers, images, networks, an
- **podman system prune** [*options*]
- 
- ## DESCRIPTION
-**podman system prune** removes all unused containers (both dangling and unreferenced), pods, networks, and optionally, volumes from local storage.
-+**podman system prune** removes all unused containers (both dangling and unreferenced), build containers, pods, networks, and optionally, volumes from local storage.
- 
- Use the **--all** option to delete all unused images.  Unused images are dangling images as well as any image that does not have any containers based on it.
- 
- By default, volumes are not removed to prevent important data from being deleted if there is currently no container using the volume. Use the **--volumes** flag when running the command to prune volumes as well.
- 
-+By default, build containers are not removed to prevent interference with builds in progress. Use the **--build** flag when running the command to remove build containers as well.
-+
- ## OPTIONS
- #### **--all**, **-a**
- 
- Recursively remove all unused pods, containers, images, networks, and volume data. (Maximum 50 iterations.)
- 
-+#### **--build**
-+
-+Removes any build containers that were created during the build, but were not removed because the build was unexpectedly terminated.
-+
-+Note: **This is not safe operation and should be executed only when no builds are in progress. It can interfere with builds in progress.**
-+
- #### **--external**
- 
-Removes all leftover container storage files from local storage not managed by Podman. In normal circumstances, no such data exists, but in case of an unclean shutdown, the Podman database may be corrupted and cause this.
-+Tries to clean up remainders of previous containers or layers that are not references in the storage json files. These can happen in the case of unclean shutdowns or regular restarts in transient storage mode.
- 
- However, when using transient storage mode, the Podman database does not persist. This means containers leave the writable layers on disk after a reboot. When using a transient store, it is recommended that the **podman system prune --external** command is run during boot.
- 
-diff --git a/libpod/runtime.go b/libpod/runtime.go
-index 986e40f60..609fbba57 100644
--- a/libpod/runtime.go
-+++ b/libpod/runtime.go
-@@ -33,6 +33,7 @@ import (
- 	"github.com/containers/podman/v4/libpod/lock"
- 	"github.com/containers/podman/v4/libpod/plugin"
- 	"github.com/containers/podman/v4/libpod/shutdown"
-+	"github.com/containers/podman/v4/pkg/domain/entities/reports"
- 	"github.com/containers/podman/v4/pkg/rootless"
- 	"github.com/containers/podman/v4/pkg/systemd"
- 	"github.com/containers/podman/v4/pkg/util"
-@@ -1250,3 +1251,52 @@ func (r *Runtime) LockConflicts() (map[uint32][]string, []uint32, error) {
- 
- 	return toReturn, locksHeld, nil
- }
-+
-+// Exists checks whether a file or directory exists at the given path.
-+// If the path is a symlink, the symlink is followed.
-+func Exists(path string) error {
-+	// It uses unix.Faccessat which is a faster operation compared to os.Stat for
-+	// simply checking the existence of a file.
-+	err := unix.Faccessat(unix.AT_FDCWD, path, unix.F_OK, 0)
-+	if err != nil {
-+		return &os.PathError{Op: "faccessat", Path: path, Err: err}
-+	}
-+	return nil
-+}
-+
-+// PruneBuildContainers removes any build containers that were created during the build,
-+// but were not removed because the build was unexpectedly terminated.
-+//
-+// Note: This is not safe operation and should be executed only when no builds are in progress. It can interfere with builds in progress.
-+func (r *Runtime) PruneBuildContainers() ([]*reports.PruneReport, error) {
-+	stageContainersPruneReports := []*reports.PruneReport{}
-+
-+	containers, err := r.store.Containers()
-+	if err != nil {
-+		return stageContainersPruneReports, err
-+	}
-+	for _, container := range containers {
-+		path, err := r.store.ContainerDirectory(container.ID)
-+		if err != nil {
-+			return stageContainersPruneReports, err
-+		}
-+		if err := Exists(filepath.Join(path, "buildah.json")); err != nil {
-+			continue
-+		}
-+
-+		report := &reports.PruneReport{
-+			Id: container.ID,
-+		}
-+		size, err := r.store.ContainerSize(container.ID)
-+		if err != nil {
-+			report.Err = err
-+		}
-+		report.Size = uint64(size)
-+
-+		if err := r.store.DeleteContainer(container.ID); err != nil {
-+			report.Err = errors.Join(report.Err, err)
-+		}
-+		stageContainersPruneReports = append(stageContainersPruneReports, report)
-+	}
-+	return stageContainersPruneReports, nil
-+}
-diff --git a/pkg/api/handlers/libpod/system.go b/pkg/api/handlers/libpod/system.go
-index 70d4493f8..7c129b1ba 100644
--- a/pkg/api/handlers/libpod/system.go
-+++ b/pkg/api/handlers/libpod/system.go
-@@ -22,6 +22,7 @@ func SystemPrune(w http.ResponseWriter, r *http.Request) {
- 		All      bool `schema:"all"`
- 		Volumes  bool `schema:"volumes"`
- 		External bool `schema:"external"`
-+		Build    bool `schema:"build"`
- 	}{}
- 
- 	if err := decoder.Decode(&query, r.URL.Query()); err != nil {
-@@ -43,6 +44,7 @@ func SystemPrune(w http.ResponseWriter, r *http.Request) {
- 		Volume:   query.Volumes,
- 		Filters:  *filterMap,
- 		External: query.External,
-+		Build:    query.Build,
- 	}
- 	report, err := containerEngine.SystemPrune(r.Context(), pruneOptions)
- 	if err != nil {
-diff --git a/pkg/bindings/system/types.go b/pkg/bindings/system/types.go
-index 89e093f68..b4a4ff064 100644
--- a/pkg/bindings/system/types.go
-+++ b/pkg/bindings/system/types.go
-@@ -18,6 +18,7 @@ type PruneOptions struct {
- 	Filters  map[string][]string
- 	Volumes  *bool
- 	External *bool
-+	Build    *bool
- }
- 
- // VersionOptions are optional options for getting version info
-diff --git a/pkg/bindings/system/types_prune_options.go b/pkg/bindings/system/types_prune_options.go
-index d00498520..5f3bd652c 100644
--- a/pkg/bindings/system/types_prune_options.go
-+++ b/pkg/bindings/system/types_prune_options.go
-@@ -76,3 +76,18 @@ func (o *PruneOptions) GetExternal() bool {
- 	}
- 	return *o.External
- }
-+
-+// WithBuild set field Build to given value
-+func (o *PruneOptions) WithBuild(value bool) *PruneOptions {
-+	o.Build = &value
-+	return o
-+}
-+
-+// GetBuild returns value of field Build
-+func (o *PruneOptions) GetBuild() bool {
-+	if o.Build == nil {
-+		var z bool
-+		return z
-+	}
-+	return *o.Build
-+}
-diff --git a/pkg/domain/entities/system.go b/pkg/domain/entities/system.go
-index 473db3530..f6938652a 100644
--- a/pkg/domain/entities/system.go
-+++ b/pkg/domain/entities/system.go
-@@ -22,6 +22,7 @@ type SystemPruneOptions struct {
- 	Volume   bool
- 	Filters  map[string][]string `json:"filters" schema:"filters"`
- 	External bool
-+	Build    bool
- }
- 
- // SystemPruneReport provides report after system prune is executed.
-diff --git a/pkg/domain/infra/abi/system.go b/pkg/domain/infra/abi/system.go
-index 24ee64d29..ea3e5f203 100644
--- a/pkg/domain/infra/abi/system.go
-+++ b/pkg/domain/infra/abi/system.go
-@@ -150,16 +150,16 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool)
- 	return nil
- }
- 
-// SystemPrune removes unused data from the system. Pruning pods, containers, networks, volumes and images.
-+// SystemPrune removes unused data from the system. Pruning pods, containers, build container, networks, volumes and images.
- func (ic *ContainerEngine) SystemPrune(ctx context.Context, options entities.SystemPruneOptions) (*entities.SystemPruneReport, error) {
- 	var systemPruneReport = new(entities.SystemPruneReport)
- 
- 	if options.External {
-		if options.All || options.Volume || len(options.Filters) > 0 {
-+		if options.All || options.Volume || len(options.Filters) > 0 || options.Build {
- 			return nil, fmt.Errorf("system prune --external cannot be combined with other options")
- 		}
-		err := ic.Libpod.GarbageCollect()
-		if err != nil {
-+
-+		if err := ic.Libpod.GarbageCollect(); err != nil {
- 			return nil, err
- 		}
- 		return systemPruneReport, nil
-@@ -170,6 +170,17 @@ func (ic *ContainerEngine) SystemPrune(ctx context.Context, options entities.Sys
- 		filters = append(filters, fmt.Sprintf("%s=%s", k, v[0]))
- 	}
- 	reclaimedSpace := (uint64)(0)
-+
-+	// Prune Build Containers
-+	if options.Build {
-+		stageContainersPruneReports, err := ic.Libpod.PruneBuildContainers()
-+		if err != nil {
-+			return nil, err
-+		}
-+		reclaimedSpace += reports.PruneReportsSize(stageContainersPruneReports)
-+		systemPruneReport.ContainerPruneReports = append(systemPruneReport.ContainerPruneReports, stageContainersPruneReports...)
-+	}
-+
- 	found := true
- 	for found {
- 		found = false
-diff --git a/pkg/domain/infra/tunnel/system.go b/pkg/domain/infra/tunnel/system.go
-index fc82e7b2b..142a9fa5c 100644
--- a/pkg/domain/infra/tunnel/system.go
-+++ b/pkg/domain/infra/tunnel/system.go
-@@ -19,7 +19,7 @@ func (ic *ContainerEngine) SetupRootless(_ context.Context, noMoveProcess bool)
- 
- // SystemPrune prunes unused data from the system.
- func (ic *ContainerEngine) SystemPrune(ctx context.Context, opts entities.SystemPruneOptions) (*entities.SystemPruneReport, error) {
-	options := new(system.PruneOptions).WithAll(opts.All).WithVolumes(opts.Volume).WithFilters(opts.Filters).WithExternal(opts.External)
-+	options := new(system.PruneOptions).WithAll(opts.All).WithVolumes(opts.Volume).WithFilters(opts.Filters).WithExternal(opts.External).WithBuild(opts.Build)
- 	return system.Prune(ic.ClientCtx, options)
- }
- 
-diff --git a/test/e2e/prune_test.go b/test/e2e/prune_test.go
-index 01e848478..57bd5582d 100644
--- a/test/e2e/prune_test.go
-+++ b/test/e2e/prune_test.go
-@@ -4,6 +4,8 @@ import (
- 	"fmt"
- 	"os"
- 	"path/filepath"
-+	"syscall"
-+	"time"
- 
- 	. "github.com/containers/podman/v4/test/utils"
- 	. "github.com/onsi/ginkgo/v2"
-@@ -22,6 +24,11 @@ FROM scratch
- ENV test1=test1
- ENV test2=test2`
- 
-+var longBuildImage = fmt.Sprintf(`
-+FROM %s
-+RUN echo "Hello, World!"
-+RUN RUN echo "Please use signal 9 this will never ends" && sleep 10000s`, ALPINE)
-+
- var _ = Describe("Podman prune", func() {
- 
- 	It("podman container prune containers", func() {
-@@ -593,4 +600,63 @@ var _ = Describe("Podman prune", func() {
- 		Expect(err).ToNot(HaveOccurred())
- 		Expect(dirents).To(HaveLen(3))
- 	})
-+
-+	It("podman system prune --build clean up after terminated build", func() {
-+		useCustomNetworkDir(podmanTest, tempdir)
-+
-+		podmanTest.BuildImage(pruneImage, "alpine_notleaker:latest", "false")
-+
-+		create := podmanTest.Podman([]string{"create", "--name", "test", BB, "sleep", "10000"})
-+		create.WaitWithDefaultTimeout()
-+		Expect(create).Should(ExitCleanly())
-+
-+		containerFilePath := filepath.Join(podmanTest.TempDir, "ContainerFile-podman-leaker")
-+		err := os.WriteFile(containerFilePath, []byte(longBuildImage), 0755)
-+		Expect(err).ToNot(HaveOccurred())
-+
-+		build := podmanTest.Podman([]string{"build", "-f", containerFilePath, "-t", "podmanleaker"})
-+		// Build will never finish so let's wait for build to ask for SIGKILL to simulate a failed build that leaves stage containers.
-+		matchedOutput := false
-+		for range 900 {
-+			if build.LineInOutputContains("Please use signal 9") {
-+				matchedOutput = true
-+				build.Signal(syscall.SIGKILL)
-+				break
-+			}
-+			time.Sleep(100 * time.Millisecond)
-+		}
-+		if !matchedOutput {
-+			Fail("Did not match special string in podman build")
-+		}
-+
-+		// Check Intermediate image of stage container
-+		none := podmanTest.Podman([]string{"images", "-a"})
-+		none.WaitWithDefaultTimeout()
-+		Expect(none).Should(ExitCleanly())
-+		Expect(none.OutputToString()).Should(ContainSubstring("none"))
-+
-+		// Check if Container and Stage Container exist
-+		count := podmanTest.Podman([]string{"ps", "-aq", "--external"})
-+		count.WaitWithDefaultTimeout()
-+		Expect(count).Should(ExitCleanly())
-+		Expect(count.OutputToStringArray()).To(HaveLen(3))
-+
-+		prune := podmanTest.Podman([]string{"system", "prune", "--build", "-f"})
-+		prune.WaitWithDefaultTimeout()
-+		Expect(prune).Should(ExitCleanly())
-+
-+		// Container should still exist, but no stage containers
-+		count = podmanTest.Podman([]string{"ps", "-aq", "--external"})
-+		count.WaitWithDefaultTimeout()
-+		Expect(count).Should(ExitCleanly())
-+		Expect(count.OutputToString()).To(BeEmpty())
-+
-+		Expect(podmanTest.NumberOfContainers()).To(Equal(0))
-+
-+		after := podmanTest.Podman([]string{"images", "-a"})
-+		after.WaitWithDefaultTimeout()
-+		Expect(after).Should(ExitCleanly())
-+		Expect(after.OutputToString()).ShouldNot(ContainSubstring("none"))
-+		Expect(after.OutputToString()).Should(ContainSubstring("notleaker"))
-+	})
- })
-
--- a/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25245.patch
+++ b/.github/scripts/s390x-ci/self-hosted-builder/podman-patches/podman-25245.patch
@ -1,21 +0,0 @@
-diff --git a/pkg/rootless/rootless_linux.c b/pkg/rootless/rootless_linux.c
-index 4f71d49e5c..3d74af6a6c 100644
--- a/pkg/rootless/rootless_linux.c
-+++ b/pkg/rootless/rootless_linux.c
-@@ -658,7 +658,7 @@ create_pause_process (const char *pause_pid_file_path, char **argv)
-   if (pipe (p) < 0)
-     return -1;
-
-  pid = fork ();
-+  pid = syscall_clone (SIGCHLD, NULL);
-   if (pid < 0)
-     {
-       close (p[0]);
-@@ -689,7 +689,7 @@ create_pause_process (const char *pause_pid_file_path, char **argv)
-       close (p[0]);
-
-       setsid ();
-      pid = fork ();
-+      pid = syscall_clone (SIGCHLD, NULL);
-       if (pid < 0)
-         _exit (EXIT_FAILURE);
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -434,7 +434,7 @@ query ($owner: String!, $name: String!) {
 RE_GHSTACK_HEAD_REF = re.compile(r"^(gh/[^/]+/[0-9]+/)head$")
 RE_GHSTACK_DESC = re.compile(r"Stack.*:\r?\n(\* [^\r\n]+\r?\n)+", re.MULTILINE)
 RE_PULL_REQUEST_RESOLVED = re.compile(
-    r"(Pull Request resolved|Pull-Request-resolved): "
+    r"Pull Request resolved: "
    r"https://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)/pull/(?P<number>[0-9]+)",
    re.MULTILINE,
 )
@ -819,9 +819,10 @@ class GitHubPR:
                    cursor=info["reviews"]["pageInfo"]["startCursor"],
                )
                info = rc["data"]["repository"]["pullRequest"]
-        reviews = {
-            author: state for author, state in self._reviews if state != "COMMENTED"
-        }
+        reviews = {}
+        for author, state in self._reviews:
+            if state != "COMMENTED":
+                reviews[author] = state
        return list(reviews.items())

    def get_approved_by(self) -> list[str]:
@ -2281,8 +2282,7 @@ def merge(
        except MandatoryChecksMissingError as ex:
            last_exception = str(ex)
            print(
-                f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min",
-                flush=True,
+                f"Merge of https://github.com/{pr.org}/{pr.project}/pull/{pr.pr_num} failed due to: {ex}. Retrying in 5 min"
            )
            time.sleep(5 * 60)
    # Finally report timeout back
--- a/.github/scripts/windows/build_magma.bat
+++ b/.github/scripts/windows/build_magma.bat
@ -54,8 +54,7 @@ cmake .. -DGPU_TARGET="%GPU_TARGET%" ^
            -DCMAKE_BUILD_TYPE=%CONFIG% ^
            -DCMAKE_GENERATOR=Ninja ^
            -DCMAKE_INSTALL_PREFIX=..\install\ ^
-            -DCUDA_ARCH_LIST="%CUDA_ARCH_LIST%" ^
-            -DCMAKE_POLICY_VERSION_MINIMUM=3.5
+            -DCUDA_ARCH_LIST="%CUDA_ARCH_LIST%"
 if errorlevel 1 exit /b 1

 cmake --build . --target install --config %CONFIG% -- -j%NUMBER_OF_PROCESSORS%
--- a/.github/templates/common.yml.j2
+++ b/.github/templates/common.yml.j2
@ -32,7 +32,7 @@ concurrency:
 {%- macro setup_ec2_windows() -%}
      !{{ display_ec2_information() }}
      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7
        continue-on-error: true
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@ -53,7 +53,7 @@ jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -135,7 +135,7 @@ jobs:
        uses: ./.github/actions/setup-xpu
      - name: configure aws credentials
        id: aws_creds
-        uses: aws-actions/configure-aws-credentials@v4
+        uses: aws-actions/configure-aws-credentials@v1.7.0
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
          aws-region: us-east-1
@ -147,9 +147,9 @@ jobs:
        with:
          name: !{{ config["build_name"] }}
          path: "${{ runner.temp }}/artifacts/"
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
          docker-image: !{{ config["container_image"] }}
      - name: Test Pytorch binary
@ -168,12 +168,12 @@ jobs:
        with:
          name: !{{ config["build_name"] }}
          path: "${{ runner.temp }}/artifacts/"
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
      - name: ROCm set GPU_FLAG
        run: |
          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
          docker-image: !{{ config["container_image"] }}
      - name: Test Pytorch binary
--- a/.github/templates/macos_binary_build_workflow.yml.j2
+++ b/.github/templates/macos_binary_build_workflow.yml.j2
@ -76,7 +76,7 @@ jobs:
          elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
            echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
          fi
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
      - name: Populate binary env
        run: |
          # shellcheck disable=SC1091
--- a/.github/templates/upload.yml.j2
+++ b/.github/templates/upload.yml.j2
@ -25,6 +25,9 @@
      DOCKER_IMAGE: !{{ config["container_image"] }}
 {%- endif %}
 {%- if config["package_type"] == "manywheel" %}
+  {%- if config["devtoolset"] %}
+      DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }}
+  {%- endif %}
  {%- if config.use_split_build is defined %}
      use_split_build: !{{ config["use_split_build"] }}
  {%- endif %}
@ -34,6 +37,9 @@
      LIBTORCH_CONFIG: !{{ config["libtorch_config"] }}
  {%- endif %}
      LIBTORCH_VARIANT: !{{ config["libtorch_variant"] }}
+  {%- if config["devtoolset"] %}
+      DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }}
+  {%- endif %}
  {%- if is_windows %}
      # This is a dummy value for libtorch to work correctly with our batch scripts
      # without this value pip does not get installed for some reason
--- a/.github/templates/windows_arm64_binary_build_workflow.yml.j2
+++ b/.github/templates/windows_arm64_binary_build_workflow.yml.j2
@ -0,0 +1,197 @@
+{% import 'common.yml.j2' as common %}
+{% import 'upload.yml.j2' as upload %}
+
+{%- block name -%}
+# Template is at:    .github/templates/windows_arm64_binary_build_workflow.yml.j2
+# Generation script: .github/scripts/generate_ci_workflows.py
+name: !{{ build_environment }}
+{%- endblock %}
+
+{%- macro set_runner_specific_vars() -%}
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: cmd
+        run: |
+          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
+          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
+          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
+{%- endmacro %}
+
+on:
+  push:
+    branches:
+      - !{{ branches }}
+    {%- if branches == "nightly" %}
+    tags:
+      # NOTE: Binary build pipelines should only get triggered on release candidate builds
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+    {%- endif %}
+{%- for label in ciflow_config.labels | sort %}
+    {%- if loop.first and branches != "nightly" %}
+    tags:
+    {%- endif %}
+      - '!{{ label }}/*'
+{%- endfor %}
+  workflow_dispatch:
+
+env:
+  BUILD_ENVIRONMENT: !{{ build_environment }}
+  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+  PR_NUMBER: ${{ github.event.pull_request.number }}
+  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+  SKIP_ALL_TESTS: 1
+  PYTORCH_ROOT: /pytorch
+  DOWNLOADS_DIR: c:\temp\downloads
+  DEPENDENCIES_DIR: c:\temp\dependencies
+  ENABLE_APL: 1
+  ENABLE_OPENBLAS: 0
+  MSVC_VERSION : 14.42
+  AWS_DEFAULT_REGION: us-east-1
+
+jobs:
+  get-label-type:
+    if: github.repository_owner == 'pytorch'
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+
+{%- for config in build_configs %}
+  !{{ config["build_name"] }}-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: get-label-type
+    runs-on: "windows-11-arm64"
+    timeout-minutes: !{{ common.timeout_minutes }}
+    !{{ upload.binary_env(config, True) }}
+    {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0  %}
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
+    {%- endif %}
+    steps:
+      !{{ set_runner_specific_vars() }}
+      - name: Bootstrap folders
+        shell: cmd
+        run: |
+          mkdir "%NIGHTLIES_PYTORCH_ROOT%"
+          mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
+      - name: Git checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          path: "pytorch"
+      - name: Bootstrap Build Tools
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
+      - name: Bootstrap Git
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
+      - name: Remove Pytorch folder
+        shell: cmd
+        run: |
+          rmdir /s /q "pytorch"
+      - name: Git checkout PyTorch - recursive
+        uses: actions/checkout@v4
+        with:
+          path: "pytorch"
+          submodules: recursive
+      - name: Bootstrap Python
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
+      - name: Bootstrap APL
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
+      - name: Bootstrap Rust
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
+      - name: Bootstrap sccache
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"
+      - name: Bootstrap Libuv
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"
+      - name: Populate binary env
+        shell: bash
+        run: |
+          "pytorch/.circleci/scripts/binary_populate_env.sh"
+      - name: Build PyTorch binary
+        shell: bash
+        run: |
+          "pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
+      - uses: !{{ common.upload_artifact_action }}
+        if: always()
+        with:
+          name: !{{ config["build_name"] }}
+          retention-days: 14
+          if-no-files-found: error
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+  !{{ config["build_name"] }}-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - !{{ config["build_name"] }}-build
+      - get-label-type
+    runs-on: "windows-11-arm64"
+    timeout-minutes: !{{ common.timeout_minutes }}
+    !{{ upload.binary_env(config, True) }}
+    steps:
+      !{{ set_runner_specific_vars() }}
+      - uses: !{{ common.download_artifact_action }}
+        name: Download Build Artifacts
+        with:
+          name: !{{ config["build_name"] }}
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+      - name: Git checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          path: "pytorch"
+      - name: Bootstrap Git
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
+      - name: Remove Pytorch folder
+        shell: cmd
+        run: |
+          rmdir /s /q "pytorch"
+      - name: Git checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          path: "pytorch"
+          submodules: recursive
+      - name: Bootstrap APL
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
+      - name: Bootstrap Python
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
+      - name: Bootstrap Build Tools
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
+      - name: Bootstrap Rust
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
+      - name: Populate binary env
+        shell: bash
+        run: |
+          "pytorch/.circleci/scripts/binary_populate_env.sh"
+      - name: Test PyTorch binary
+        shell: bash
+        run: |
+          "pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
+  {%- if branches == "nightly" %}
+  !{{ upload.upload_binaries(config, True) }}
+  {%- endif %}
+{%- endfor %}
--- a/.github/templates/windows_binary_build_workflow.yml.j2
+++ b/.github/templates/windows_binary_build_workflow.yml.j2
@ -49,22 +49,13 @@ env:
  PR_NUMBER: ${{ github.event.pull_request.number }}
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  SKIP_ALL_TESTS: 1
-  OS: !{{ os }}
-{%- if os == "windows-arm64" %}
-  PYTORCH_ROOT: /pytorch
-  DOWNLOADS_DIR: c:\temp\downloads
-  DEPENDENCIES_DIR: c:\temp\dependencies
-  ENABLE_APL: 1
-  ENABLE_OPENBLAS: 0
-  MSVC_VERSION : 14.42
-{%- endif %}
 !{{ common.concurrency(build_environment) }}

 jobs:
  get-label-type:
    if: github.repository_owner == 'pytorch'
    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@release/2.7
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
@ -75,79 +66,20 @@ jobs:
  !{{ config["build_name"] }}-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: get-label-type
-    {%- if os == "windows-arm64" %}
-    runs-on: "windows-11-arm64"
-    {%- else %}
    {%- if branches == "nightly" %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
    {%- else %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
    {%- endif %}
-    {%- endif %}
    timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
    !{{ upload.binary_env(config, True) }}
    {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0  %}
      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
    {%- endif %}
    steps:
-{%- if os == "windows-arm64" %}
-      - name: Populate binary env
-        shell: cmd
-        run: |
-          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
-          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
-          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
-      - name: Bootstrap folders
-        shell: cmd
-        run: |
-          mkdir "%NIGHTLIES_PYTORCH_ROOT%"
-          mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
-      - name: Bootstrap Git
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
-      - name: Git checkout PyTorch - recursive
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-          submodules: recursive
-      - name: Bootstrap Python
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
-      - name: Bootstrap APL
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
-      - name: Bootstrap Rust
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
-      - name: Bootstrap sccache
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"
-      - name: Bootstrap Libuv
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"
-{%- else %}
-      !{{ set_runner_specific_vars() }}
      !{{ common.setup_ec2_windows() }}
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
-{%- endif %}
+      !{{ set_runner_specific_vars() }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
      - name: Populate binary env
        shell: bash
        run: |
@ -163,17 +95,12 @@ jobs:
          retention-days: 14
          if-no-files-found: error
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-{%- if os != "windows-arm64" %}
      !{{ common.wait_and_kill_ssh_windows('pytorch') }}
-{% endif %}
  !{{ config["build_name"] }}-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
      - !{{ config["build_name"] }}-build
      - get-label-type
-{%- if os == "windows-arm64" %}
-    runs-on: "windows-11-arm64"
-{%- else %}
 {%- if config["gpu_arch_type"] == "cuda" %}
 {%- if branches == "nightly" %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
@ -186,61 +113,18 @@ jobs:
 {%- else %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
 {%- endif %}
-{%- endif %}
 {%- endif %}
    timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
    !{{ upload.binary_env(config, True) }}
    steps:
-{%- if os == "windows-arm64" %}
-      - name: Populate binary env
-        shell: cmd
-        run: |
-          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
-          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
-          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Populate binary env
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-          submodules: recursive
-      - name: Bootstrap APL
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
-      - name: Bootstrap Python
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
-      - name: Bootstrap Rust
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
-{%- else %}
      !{{ common.setup_ec2_windows() }}
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
      !{{ set_runner_specific_vars() }}
-{%- endif %}
      - uses: !{{ common.download_artifact_action }}
        name: Download Build Artifacts
        with:
          name: !{{ config["build_name"] }}
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+      !{{ common.checkout(deep_clone=False, directory="pytorch", checkout_pr_head=False) }}
      - name: Populate binary env
        shell: bash
        run: |
@ -249,10 +133,8 @@ jobs:
        shell: bash
        run: |
          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-{%- if os != "windows-arm64" %}
      !{{ common.wait_and_kill_ssh_windows('pytorch') }}
-{%- endif %}
  {%- if branches == "nightly" %}
  !{{ upload.upload_binaries(config, True) }}
  {%- endif %}
-{%- endfor %}
+{%- endfor %}
--- a/.github/workflows/_bazel-build-test.yml
+++ b/.github/workflows/_bazel-build-test.yml
@ -33,10 +33,6 @@ on:
        default: "linux.large"
        description: Runner type

-permissions:
-  id-token: write
-  contents: read
-
 env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

@ -51,7 +47,7 @@ jobs:
      reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }}
    steps:
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7
        with:
          fetch-depth: 1
          submodules: false
@ -73,32 +69,25 @@ jobs:
    runs-on: ${{ matrix.runner }}
    steps:
      - name: Setup SSH (Click me for login details)
-        uses: pytorch/test-infra/.github/actions/setup-ssh@main
+        uses: pytorch/test-infra/.github/actions/setup-ssh@release/2.7
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}

      # [see note: pytorch repo ref]
      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@release/2.7

      - name: Setup Linux
        uses: ./.github/actions/setup-linux

-      - name: Configure AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-          role-session-name: gha-bazel-build
-          aws-region: us-east-1
-
      - name: Calculate docker image
        id: calculate-docker-image
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@release/2.7
        with:
          docker-image-name: ${{ inputs.docker-image-name }}

      - name: Pull docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@release/2.7
        with:
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

@ -108,7 +97,7 @@ jobs:
        run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
-        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
+        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.7
        if: ${{ inputs.cuda-version != 'cpu' && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}

      - name: Output disk space left
@ -213,13 +202,6 @@ jobs:
        uses: ./.github/actions/chown-workspace
        if: always()

-      - name: Configure AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_pytorch_artifacts
-          role-session-name: gha-bazel-build-upload-artifacts
-          aws-region: us-east-1
-
      - name: Upload test artifacts
        uses: ./.github/actions/upload-test-artifacts
        if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
@ -227,5 +209,5 @@ jobs:
          file-suffix: bazel-${{ github.job }}_${{ steps.get-job-id.outputs.job-id }}

      - name: Teardown Linux
-        uses: pytorch/test-infra/.github/actions/teardown-linux@main
+        uses: pytorch/test-infra/.github/actions/teardown-linux@release/2.7
        if: always()
--- a/Show More
+++ b/Show More
 @ -1 +1 @@
 .3.0
 .3.1