Halves time spent in generating the key strings

Adding a direct MPS kernel path to linear op and MPS kernel caching mechanism for improved perf.
[EZ/Profiler] Update Submodule (#151843)
2025-10-24 07:27:32 +08:00 · 2025-04-22 11:32:36 -07:00 · 2025-04-22 11:32:34 -07:00 · 2025-04-22 18:19:43 +00:00 · 2025-04-22 17:57:31 +00:00 · 2025-04-22 17:20:09 +00:00
843 changed files with 33763 additions and 12243 deletions
--- a/.ci/docker/almalinux/build.sh
+++ b/.ci/docker/almalinux/build.sh
@ -1,82 +1,60 @@
 #!/usr/bin/env bash
 # Script used only in CD pipeline

-set -eou pipefail
+set -exou pipefail

 image="$1"
 shift

 if [ -z "${image}" ]; then
-  echo "Usage: $0 IMAGE"
+  echo "Usage: $0 IMAGENAME:ARCHTAG"
  exit 1
 fi

-DOCKER_IMAGE_NAME="pytorch/${image}"
+# Go from imagename:tag to tag
+DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')

+CUDA_VERSION=""
+if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
+    # extract cuda version from image name and tag.  e.g. manylinux2_28-builder:cuda12.8 returns 12.8
+    CUDA_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
+fi

-export DOCKER_BUILDKIT=1
-TOPDIR=$(git rev-parse --show-toplevel)
-
-CUDA_VERSION=${CUDA_VERSION:-12.1}
-
-case ${CUDA_VERSION} in
+case ${DOCKER_TAG_PREFIX} in
  cpu)
    BASE_TARGET=base
-    DOCKER_TAG=cpu
    ;;
-  all)
-    BASE_TARGET=all_cuda
-    DOCKER_TAG=latest
+  cuda*)
+    BASE_TARGET=cuda${CUDA_VERSION}
    ;;
  *)
-    BASE_TARGET=cuda${CUDA_VERSION}
-    DOCKER_TAG=cuda${CUDA_VERSION}
+    echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"
+    exit 1
    ;;
 esac

+# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
+# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
+sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
+sudo systemctl daemon-reload
+sudo systemctl restart docker

-(
-  set -x
-  # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
-  # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
-  sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
-  sudo systemctl daemon-reload
-  sudo systemctl restart docker
+export DOCKER_BUILDKIT=1
+TOPDIR=$(git rev-parse --show-toplevel)
+tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')

-  docker build \
-    --target final \
-    --progress plain \
-    --build-arg "BASE_TARGET=${BASE_TARGET}" \
-    --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
-    --build-arg "DEVTOOLSET_VERSION=11" \
-    -t ${DOCKER_IMAGE_NAME} \
-    $@ \
-    -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
-    ${TOPDIR}/.ci/docker/
-)
+docker build \
+  --target final \
+  --progress plain \
+  --build-arg "BASE_TARGET=${BASE_TARGET}" \
+  --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
+  --build-arg "DEVTOOLSET_VERSION=11" \
+  -t ${tmp_tag} \
+  $@ \
+  -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
+  ${TOPDIR}/.ci/docker/

-if [[ "${DOCKER_TAG}" =~ ^cuda* ]]; then
+if [ -n "${CUDA_VERSION}" ]; then
  # Test that we're using the right CUDA compiler
-  (
-    set -x
-    docker run --rm "${DOCKER_IMAGE_NAME}" nvcc --version | grep "cuda_${CUDA_VERSION}"
-  )
-fi
-
-GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
-GIT_BRANCH_NAME=${GITHUB_REF##*/}
-GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
-DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE_NAME}-${GIT_BRANCH_NAME}
-DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE_NAME}-${GIT_COMMIT_SHA}
-if [[ "${WITH_PUSH:-}" == true ]]; then
-  (
-    set -x
-    docker push "${DOCKER_IMAGE_NAME}"
-    if [[ -n ${GITHUB_REF} ]]; then
-        docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_BRANCH_TAG}
-        docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_SHA_TAG}
-        docker push "${DOCKER_IMAGE_BRANCH_TAG}"
-        docker push "${DOCKER_IMAGE_SHA_TAG}"
-    fi
-  )
+  docker run --rm "${tmp_tag}" nvcc --version | grep "cuda_${CUDA_VERSION}"
 fi
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-7e487c24e1c20c3f4606c2d8aca2778873b00b4c
+381ae5d57d35c165d98df728380b20fbde350392
--- a/.ci/docker/common/install_rocm.sh
+++ b/.ci/docker/common/install_rocm.sh
@ -19,6 +19,13 @@ install_ubuntu() {
    apt-get install -y libc++1
    apt-get install -y libc++abi1

+    # Make sure rocm packages from repo.radeon.com have highest priority
+    cat << EOF > /etc/apt/preferences.d/rocm-pin-600
+Package: *
+Pin: release o=repo.radeon.com
+Pin-Priority: 600
+EOF
+
    # Add amdgpu repository
    UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
    echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
--- a/.ci/docker/common/install_rocm_drm.sh
+++ b/.ci/docker/common/install_rocm_drm.sh
@ -25,9 +25,7 @@ python3 -m pip install meson ninja
 ###########################
 ### clone repo
 ###########################
-# TEMPORARY FIX: https://gitlab.freedesktop.org/mesa/drm.git is down until 2025/03/22
-# GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
-GIT_SSL_NO_VERIFY=true git clone git://anongit.freedesktop.org/mesa/drm
+GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
 pushd drm

 ###########################
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -17,10 +17,14 @@ function do_install() {
        tmp_dir=$(mktemp -d)
        pushd ${tmp_dir}
        curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
-        tar -xvf "${magma_archive}"
-        mkdir -p "${rocm_dir}/magma"
-        mv include "${rocm_dir}/magma/include"
-        mv lib "${rocm_dir}/magma/lib"
+        if tar -xvf "${magma_archive}"
+        then
+            mkdir -p "${rocm_dir}/magma"
+            mv include "${rocm_dir}/magma/include"
+            mv lib "${rocm_dir}/magma/lib"
+        else
+            echo "${magma_archive} not found, skipping magma install"
+        fi
        popd
    )
 }
--- a/.ci/docker/libtorch/Dockerfile
+++ b/.ci/docker/libtorch/Dockerfile
@ -89,7 +89,7 @@ ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
 # gfortran and python needed for building magma from source for ROCm
 RUN apt-get update -y && \
    apt-get install gfortran -y && \
-    apt-get install python -y && \
+    apt-get install python3 python-is-python3 -y && \
    apt-get clean

 RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -1,83 +1,63 @@
 #!/usr/bin/env bash
 # Script used only in CD pipeline

-set -eou pipefail
+set -eoux pipefail

 image="$1"
 shift

 if [ -z "${image}" ]; then
-  echo "Usage: $0 IMAGE"
+  echo "Usage: $0 IMAGENAME:ARCHTAG"
  exit 1
 fi

-DOCKER_IMAGE="pytorch/${image}"
-
 TOPDIR=$(git rev-parse --show-toplevel)

-GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
-GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
-
-WITH_PUSH=${WITH_PUSH:-}
-
 DOCKER=${DOCKER:-docker}

-case ${GPU_ARCH_TYPE} in
+# Go from imagename:tag to tag
+DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
+
+GPU_ARCH_VERSION=""
+if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
+    # extract cuda version from image name.  e.g. manylinux2_28-builder:cuda12.8 returns 12.8
+    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
+elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
+    # extract rocm version from image name.  e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
+    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
+fi
+
+case ${DOCKER_TAG_PREFIX} in
    cpu)
        BASE_TARGET=cpu
-        DOCKER_TAG=cpu
        GPU_IMAGE=ubuntu:20.04
        DOCKER_GPU_BUILD_ARG=""
        ;;
-    cuda)
+    cuda*)
        BASE_TARGET=cuda${GPU_ARCH_VERSION}
-        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
        GPU_IMAGE=ubuntu:20.04
        DOCKER_GPU_BUILD_ARG=""
        ;;
-    rocm)
+    rocm*)
        BASE_TARGET=rocm
-        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
-        GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
+        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
        DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
        ;;
    *)
-        echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
+        echo "ERROR: Unrecognized DOCKER_TAG_PREFIX: ${DOCKER_TAG_PREFIX}"
        exit 1
        ;;
 esac

+tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')

-(
-    set -x
-    DOCKER_BUILDKIT=1 ${DOCKER} build \
-         --target final \
-        ${DOCKER_GPU_BUILD_ARG} \
-        --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
-        --build-arg "BASE_TARGET=${BASE_TARGET}" \
-        -t "${DOCKER_IMAGE}" \
-        $@ \
-        -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
-        "${TOPDIR}/.ci/docker/"
-
-)
-
-GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
-GIT_BRANCH_NAME=${GITHUB_REF##*/}
-GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
-DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
-DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
-
-if [[ "${WITH_PUSH}" == true ]]; then
-  (
-    set -x
-    ${DOCKER} push "${DOCKER_IMAGE}"
-    if [[ -n ${GITHUB_REF} ]]; then
-        ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
-        ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
-        ${DOCKER} push "${DOCKER_IMAGE_BRANCH_TAG}"
-        ${DOCKER} push "${DOCKER_IMAGE_SHA_TAG}"
-    fi
-  )
-fi
+DOCKER_BUILDKIT=1 ${DOCKER} build \
+    --target final \
+    ${DOCKER_GPU_BUILD_ARG} \
+    --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
+    --build-arg "BASE_TARGET=${BASE_TARGET}" \
+    -t "${tmp_tag}" \
+    $@ \
+    -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
+    "${TOPDIR}/.ci/docker/"
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 # Script used only in CD pipeline

-set -eou pipefail
+set -exou pipefail

 TOPDIR=$(git rev-parse --show-toplevel)

@ -9,152 +9,110 @@ image="$1"
 shift

 if [ -z "${image}" ]; then
-  echo "Usage: $0 IMAGE"
+  echo "Usage: $0 IMAGE:ARCHTAG"
  exit 1
 fi

-DOCKER_IMAGE="pytorch/${image}"
+# Go from imagename:tag to tag
+DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')

-DOCKER_REGISTRY="${DOCKER_REGISTRY:-docker.io}"
+GPU_ARCH_VERSION=""
+if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
+    # extract cuda version from image name.  e.g. manylinux2_28-builder:cuda12.8 returns 12.8
+    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
+elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
+    # extract rocm version from image name.  e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
+    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
+fi

-GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
-GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
 MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}
 DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}
-WITH_PUSH=${WITH_PUSH:-}

-case ${GPU_ARCH_TYPE} in
-    cpu)
+case ${image} in
+    manylinux2_28-builder:cpu)
        TARGET=cpu_final
-        DOCKER_TAG=cpu
-        GPU_IMAGE=centos:7
-        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
-        ;;
-    cpu-manylinux_2_28)
-        TARGET=cpu_final
-        DOCKER_TAG=cpu
        GPU_IMAGE=amd64/almalinux:8
        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="2_28"
        ;;
-    cpu-aarch64)
+    manylinuxaarch64-builder:cpu-aarch64)
        TARGET=final
-        DOCKER_TAG=cpu-aarch64
        GPU_IMAGE=arm64v8/centos:7
        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10"
        MANY_LINUX_VERSION="aarch64"
        ;;
-    cpu-aarch64-2_28)
+    manylinux2_28_aarch64-builder:cpu-aarch64)
        TARGET=final
-        DOCKER_TAG=cpu-aarch64
        GPU_IMAGE=arm64v8/almalinux:8
        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11 --build-arg NINJA_VERSION=1.12.1"
        MANY_LINUX_VERSION="2_28_aarch64"
        ;;
-    cpu-cxx11-abi)
+    manylinuxcxx11-abi-builder:cpu-cxx11-abi)
        TARGET=final
-        DOCKER_TAG=cpu-cxx11-abi
        GPU_IMAGE=""
        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
        MANY_LINUX_VERSION="cxx11-abi"
        ;;
-    cpu-s390x)
+    manylinuxs390x-builder:cpu-s390x)
        TARGET=final
-        DOCKER_TAG=cpu-s390x
        GPU_IMAGE=s390x/almalinux:8
        DOCKER_GPU_BUILD_ARG=""
        MANY_LINUX_VERSION="s390x"
        ;;
-    cuda)
+    manylinux2_28-builder:cuda*)
        TARGET=cuda_final
-        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
-        # Keep this up to date with the minimum version of CUDA we currently support
-        GPU_IMAGE=centos:7
-        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=9"
-        ;;
-    cuda-manylinux_2_28)
-        TARGET=cuda_final
-        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
        GPU_IMAGE=amd64/almalinux:8
        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="2_28"
        ;;
-    cuda-aarch64)
+    manylinuxaarch64-builder:cuda*)
        TARGET=cuda_final
-        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
        GPU_IMAGE=arm64v8/centos:7
        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="aarch64"
        DOCKERFILE_SUFFIX="_cuda_aarch64"
        ;;
-    rocm|rocm-manylinux_2_28)
+    manylinux2_28-builder:rocm*)
        TARGET=rocm_final
-        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
        GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete
        DEVTOOLSET_VERSION="9"
-        if [ ${GPU_ARCH_TYPE} == "rocm-manylinux_2_28" ]; then
-            MANY_LINUX_VERSION="2_28"
-            DEVTOOLSET_VERSION="11"
-            GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
-        fi
+        MANY_LINUX_VERSION="2_28"
+        DEVTOOLSET_VERSION="11"
+        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
        DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
        ;;
-    xpu)
+    manylinux2_28-builder:xpu)
        TARGET=xpu_final
-        DOCKER_TAG=xpu
        GPU_IMAGE=amd64/almalinux:8
        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
        MANY_LINUX_VERSION="2_28"
        ;;
    *)
-        echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
+        echo "ERROR: Unrecognized image name: ${image}"
        exit 1
        ;;
 esac

-IMAGES=''
-
 if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then
    DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION}
 fi
-(
-    set -x
-
-    # Only activate this if in CI
-    if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
-        # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
-        # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
-        sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
-        sudo systemctl daemon-reload
-        sudo systemctl restart docker
-    fi
-
-    DOCKER_BUILDKIT=1 docker build  \
-        ${DOCKER_GPU_BUILD_ARG} \
-        --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
-        --target "${TARGET}" \
-        -t "${DOCKER_IMAGE}" \
-        $@ \
-        -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
-        "${TOPDIR}/.ci/docker/"
-)
-
-GITHUB_REF=${GITHUB_REF:-"dev")}
-GIT_BRANCH_NAME=${GITHUB_REF##*/}
-GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
-DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
-DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
-
-if [[ "${WITH_PUSH}" == true ]]; then
-    (
-        set -x
-        docker push "${DOCKER_IMAGE}"
-        if [[ -n ${GITHUB_REF} ]]; then
-            docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
-            docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
-            docker push "${DOCKER_IMAGE_BRANCH_TAG}"
-            docker push "${DOCKER_IMAGE_SHA_TAG}"
-        fi
-    )
+# Only activate this if in CI
+if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
+    # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
+    # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
+    sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
+    sudo systemctl daemon-reload
+    sudo systemctl restart docker
 fi
+
+tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
+
+DOCKER_BUILDKIT=1 docker build  \
+    ${DOCKER_GPU_BUILD_ARG} \
+    --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
+    --target "${TARGET}" \
+    -t "${tmp_tag}" \
+    $@ \
+    -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
+    "${TOPDIR}/.ci/docker/"
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -1,15 +1,20 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@a98ffecb792d50df495be401becbf5c414421423#egg=pytorch_sphinx_theme2

 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought is probably
 # something related to Docker setup. We can investigate this later
+
 sphinxcontrib.katex==0.8.6
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 0.8.6

+sphinxext-opengraph==0.9.1
+#Description: This is used to generate PyTorch docs
+#Pinned versions: 0.9.1
+
 matplotlib==3.5.3
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 3.5.3
@ -46,5 +51,6 @@ myst-nb==0.17.2
 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
 python-etcd==0.4.5
 sphinx-copybutton==0.5.0
-sphinx-panels==0.4.1
+sphinx-design==0.4.0
+sphinxcontrib-mermaid==1.0.0
 myst-parser==0.18.1
--- a/.ci/magma-rocm/Makefile
+++ b/.ci/magma-rocm/Makefile
@ -1,7 +1,7 @@
 SHELL=/usr/bin/env bash

 DOCKER_CMD ?= docker
-DESIRED_ROCM ?= 6.3
+DESIRED_ROCM ?= 6.4
 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
 PACKAGE_NAME = magma-rocm
 # inherit this from underlying docker image, do not pass this env var to docker
@ -16,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	magma-rocm/build_magma.sh

 .PHONY: all
+all: magma-rocm64
 all: magma-rocm63
 all: magma-rocm624

@ -24,6 +25,11 @@ clean:
 	$(RM) -r magma-*
 	$(RM) -r output

+.PHONY: magma-rocm64
+magma-rocm64: DESIRED_ROCM := 6.4
+magma-rocm64:
+	$(DOCKER_RUN)
+
 .PHONY: magma-rocm63
 magma-rocm63: DESIRED_ROCM := 6.3
 magma-rocm63:
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -301,6 +301,18 @@ else
    fi
    pip_install_whl "$(echo dist/*.whl)"

+    if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
+      echo "Checking that xpu is compiled"
+      pushd dist/
+      if python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)'; then
+        echo "XPU support is compiled in."
+      else
+        echo "XPU support is NOT compiled in."
+        exit 1
+      fi
+      popd
+    fi
+
    # TODO: I'm not sure why, but somehow we lose verbose commands
    set -x

--- a/.ci/pytorch/check_binary.sh
+++ b/.ci/pytorch/check_binary.sh
@ -216,6 +216,14 @@ else
  fi
 fi

+###############################################################################
+# Check XPU configured correctly
+###############################################################################
+if [[ "$DESIRED_CUDA" == 'xpu' && "$PACKAGE_TYPE" != 'libtorch' ]]; then
+  echo "Checking that xpu is compiled"
+  python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)'
+fi
+
 ###############################################################################
 # Check CUDA configured correctly
 ###############################################################################
--- a/.ci/pytorch/macos-build.sh
+++ b/.ci/pytorch/macos-build.sh
@ -34,11 +34,14 @@ if which sccache > /dev/null; then
 fi

 print_cmake_info
-
-# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
-# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
-USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
-
+if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
+  # Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
+  USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
+else
+  # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
+  # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
+  USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
+fi
 if which sccache > /dev/null; then
  print_sccache_stats
 fi
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -221,25 +221,39 @@ test_torchbench_smoketest() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

-  local backend=eager
-  local dtype=notset
  local device=mps
+  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam pytorch_unet stable_diffusion_text_encoder moco speech_transformer)

-  touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
-  touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
+  for backend in eager inductor; do

-  echo "Setup complete, launching torchbench training performance run"
-  for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
-    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-      --performance --only "$model" --backend "$backend" --training --devices "$device" \
-      --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
-  done
+    for dtype in notset float16 bfloat16; do
+      echo "Launching torchbench inference performance run for backend ${backend} and dtype ${dtype}"
+      local dtype_arg="--${dtype}"
+      if [ "$dtype" == notset ]; then
+          dtype_arg="--float32"
+      fi
+      touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
+      for model in "${models[@]}"; do
+        PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
+          --performance --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
+          --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" || true
+      done
+    done
+
+    for dtype in notset amp; do
+      echo "Launching torchbench training performance run for backend ${backend} and dtype ${dtype}"
+      touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
+      local dtype_arg="--${dtype}"
+      if [ "$dtype" == notset ]; then
+          dtype_arg="--float32"
+      fi
+      for model in "${models[@]}"; do
+        PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
+          --performance --only "$model" --backend "$backend" --training --devices "$device" "$dtype_arg" \
+          --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv" || true
+      done
+    done

-  echo "Launching torchbench inference performance run"
-  for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
-    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-      --performance --only "$model" --backend "$backend" --inference --devices "$device" \
-      --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
  done

  echo "Pytorch benchmark on mps device completed"
--- a/.ci/pytorch/python_doc_push_script.sh
+++ b/.ci/pytorch/python_doc_push_script.sh
@ -119,12 +119,6 @@ popd
 git rm -rf "$install_path" || true
 mv "$pt_checkout/docs/build/html" "$install_path"

-# Prevent Google from indexing $install_path/_modules. This folder contains
-# generated source files.
-# NB: the following only works on gnu sed. The sed shipped with mac os is different.
-# One can `brew install gnu-sed` on a mac and then use "gsed" instead of "sed".
-find "$install_path/_modules" -name "*.html" -print0 | xargs -0 sed -i '/<head>/a \ \ <meta name="robots" content="noindex">'
-
 git add "$install_path" || true
 git status
 git config user.email "soumith+bot@pytorch.org"
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -1175,7 +1175,6 @@ build_xla() {
  # These functions are defined in .circleci/common.sh in pytorch/xla repo
  retry install_pre_deps_pytorch_xla $XLA_DIR $USE_CACHE
  CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch:${CMAKE_PREFIX_PATH}" XLA_SANDBOX_BUILD=1 build_torch_xla $XLA_DIR
-  retry install_post_deps_pytorch_xla
  assert_git_not_dirty
 }

--- a/.circleci/scripts/binary_windows_arm64_build.sh
+++ b/.circleci/scripts/binary_windows_arm64_build.sh
@ -1,22 +0,0 @@
-#!/bin/bash
-set -eux -o pipefail
-
-source "${BINARY_ENV_FILE:-/c/w/env}"
-mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
-
-export USE_SCCACHE=1
-export SCCACHE_IGNORE_SERVER_IO_ERROR=1
-
-echo "Free space on filesystem before build:"
-df -h
-
-export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT"
-
-if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
-    pytorch/.ci/pytorch/windows/arm64/build_libtorch.bat
-elif [[ "$PACKAGE_TYPE" == 'wheel' ]]; then
-    pytorch/.ci/pytorch/windows/arm64/build_pytorch.bat
-fi
-
-echo "Free space on filesystem after build:"
-df -h
--- a/.circleci/scripts/binary_windows_arm64_test.sh
+++ b/.circleci/scripts/binary_windows_arm64_test.sh
@ -1,6 +0,0 @@
-#!/bin/bash
-set -eux -o pipefail
-
-source "${BINARY_ENV_FILE:-/c/w/env}"
-
-pytorch/.ci/pytorch/windows/arm64/smoke_test.bat
--- a/.circleci/scripts/binary_windows_build.sh
+++ b/.circleci/scripts/binary_windows_build.sh
@ -4,11 +4,13 @@ set -eux -o pipefail
 source "${BINARY_ENV_FILE:-/c/w/env}"
 mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"

-export CUDA_VERSION="${DESIRED_CUDA/cu/}"
-export USE_SCCACHE=1
-export SCCACHE_BUCKET=ossci-compiler-cache
-export SCCACHE_IGNORE_SERVER_IO_ERROR=1
-export VC_YEAR=2022
+if [[ "$OS" != "windows-arm64" ]]; then
+    export CUDA_VERSION="${DESIRED_CUDA/cu/}"
+    export USE_SCCACHE=1
+    export SCCACHE_BUCKET=ossci-compiler-cache
+    export SCCACHE_IGNORE_SERVER_IO_ERROR=1
+    export VC_YEAR=2022
+fi

 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
    export USE_SCCACHE=0
@ -21,7 +23,16 @@ df -h

 pushd "$PYTORCH_ROOT/.ci/pytorch/"
 export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT"
-./windows/internal/build_wheels.bat
+
+if [[ "$OS" == "windows-arm64" ]]; then
+    if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
+        ./windows/arm64/build_libtorch.bat
+    elif [[ "$PACKAGE_TYPE" == 'wheel' ]]; then
+        ./windows/arm64/build_pytorch.bat
+    fi
+else
+    ./windows/internal/build_wheels.bat
+fi

 echo "Free space on filesystem after build:"
 df -h
--- a/.circleci/scripts/binary_windows_test.sh
+++ b/.circleci/scripts/binary_windows_test.sh
@ -11,6 +11,11 @@ if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
 fi

 pushd "$PYTORCH_ROOT/.ci/pytorch/"
-./windows/internal/smoke_test.bat
+
+if [[ "$OS" == "windows-arm64" ]]; then
+    ./windows/arm64/smoke_test.bat
+else
+    ./windows/internal/smoke_test.bat
+fi

 popd
--- a/.clang-tidy
+++ b/.clang-tidy
@ -52,7 +52,6 @@ modernize-*,
 -modernize-macro-to-enum,
 -modernize-return-braced-init-list,
 -modernize-use-auto,
-modernize-use-default-member-init,
 -modernize-use-using,
 -modernize-use-trailing-return-type,
 -modernize-use-nodiscard,
--- a/.github/actionlint.yaml
+++ b/.github/actionlint.yaml
@ -45,10 +45,14 @@ self-hosted-runner:
    - windows.g5.4xlarge.nvidia.gpu
    # Windows ARM64 runners
    - windows-11-arm64
-    # Organization-wide AMD hosted runners
+    # Organization-wide AMD-hosted runners
+    # MI2xx runners
    - linux.rocm.gpu
    - linux.rocm.gpu.2
    - linux.rocm.gpu.4
+    # MI300 runners
+    - linux.rocm.gpu.mi300.2
+    - linux.rocm.gpu.mi300.4
    - rocm-docker
    # Repo-specific Apple hosted  runners
    - macos-m1-ultra
--- a/.github/actions/binary-docker-build/action.yml
+++ b/.github/actions/binary-docker-build/action.yml
@ -0,0 +1,70 @@
+name: Binary docker build
+
+description: Build docker image for binary builds
+
+inputs:
+  docker-image-name:
+    description: Docker image name for PR builds
+    required: true
+  docker-build-dir:
+    description: Location of the build.sh relative to .ci/docker
+    required: true
+  custom-tag-prefix:
+    description: Custom tag prefix for the docker image
+    required: false
+  DOCKER_TOKEN:
+    description: Docker token for authentication
+    required: true
+  DOCKER_ID:
+    description: Docker ID for authentication
+    required: true
+
+runs:
+  using: composite
+  steps:
+    - name: Checkout PyTorch
+      uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+
+    - name: Calculate docker image
+      id: calculate-docker-image
+      uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+      with:
+        docker-image-name: ${{ inputs.docker-image-name }}
+        docker-build-dir: .ci/docker
+        custom-tag-prefix: ${{ inputs.custom-tag-prefix }}
+        docker-build-script: ${{ inputs.docker-build-dir }}/build.sh
+        always-rebuild: true
+        push: true
+
+    - name: Tag and (if WITH_PUSH) push docker image to docker.io
+      env:
+        DOCKER_TOKEN: ${{ inputs.DOCKER_TOKEN }}
+        DOCKER_ID: ${{ inputs.DOCKER_ID }}
+        DOCKER_IMAGE_NAME: ${{ inputs.docker-image-name }}
+        DOCKER_IMAGE_PREFIX: ${{ inputs.custom-tag-prefix }}
+        CREATED_FULL_DOCKER_IMAGE_NAME: ${{ steps.calculate-docker-image.outputs.docker-image }}
+      shell: bash
+      run: |
+        set -euox pipefail
+        GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+        GIT_BRANCH_NAME=${GITHUB_REF##*/}
+        GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+        CI_FOLDER_SHA=$(git rev-parse HEAD:.ci/docker)
+
+        DOCKER_IMAGE_NAME_PREFIX=docker.io/pytorch/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_PREFIX}
+
+        docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}
+        docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}
+        docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}
+        docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}
+
+        # GitHub should mask the token, and the pipe probably keeps it from being
+        # printed at all, but turn off command tracing before logging in just in case
+        set +x
+        if [[ ${WITH_PUSH:-false} == "true" ]]; then
+          echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          docker push ${DOCKER_IMAGE_NAME_PREFIX}
+          docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}
+          docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}
+          docker push ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}
+        fi
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@ -112,3 +112,22 @@
 - torch/csrc/inductor/aoti_include/xpu.h
 - torch/csrc/inductor/cpp_wrapper/device_internal/xpu.h
 - torch/csrc/inductor/cpp_wrapper/xpu.h
+
+# Path patterns grouped under the "release notes: inductor (aoti)" label:
+# the aoti-named Python modules, the cpp_wrapper codegen files, the
+# torch/csrc/inductor/aoti_* directories and the torchgen aoti pieces.
+"release notes: inductor (aoti)":
+- torch/_C/_aoti.pyi
+- torch/_dynamo/repro/aoti.py
+- torch/_export/serde/aoti_schema.py
+- torch/_higher_order_ops/aoti_call_delegate.py
+- torch/_inductor/codegen/aoti_runtime/**
+- torch/_inductor/codegen/aoti_hipify_utils.py
+- torch/_inductor/codegen/cpp_wrapper_cpu.py
+- torch/_inductor/codegen/cpp_wrapper_gpu.py
+- torch/_inductor/aoti_eager.py
+- torch/csrc/inductor/aoti_runtime/**
+- torch/csrc/inductor/aoti_torch/**
+- torch/csrc/inductor/aoti_runner/**
+- torch/csrc/inductor/aoti_eager/**
+- torch/csrc/inductor/aoti_package/**
+- torch/csrc/inductor/aoti_include/**
+- torchgen/aoti/**
+- torchgen/gen_aoti_c_shim.py
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -16,6 +16,7 @@ ciflow_push_tags:
 - ciflow/mps
 - ciflow/nightly
 - ciflow/periodic
+- ciflow/periodic-rocm-mi300
 - ciflow/rocm
 - ciflow/rocm-mi300
 - ciflow/s390
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -30,7 +30,7 @@ CUDA_ARCHES_CUDNN_VERSION = {
 }

 # NOTE: Also update the ROCm sources in tools/nightly.py when changing this list
-ROCM_ARCHES = ["6.2.4", "6.3"]
+ROCM_ARCHES = ["6.3", "6.4"]

 XPU_ARCHES = ["xpu"]

@ -173,7 +173,7 @@ WHEEL_CONTAINER_IMAGES = {
    "xpu": f"pytorch/manylinux2_28-builder:xpu-{DEFAULT_TAG}",
    "cpu": f"pytorch/manylinux2_28-builder:cpu-{DEFAULT_TAG}",
    "cpu-aarch64": f"pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
-    "cpu-s390x": f"pytorch/manylinuxs390x-builder:cpu-s390x-{DEFAULT_TAG}",
+    "cpu-s390x": "pytorch/manylinuxs390x-builder:cpu-s390x",
 }

 RELEASE = "release"
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@ -227,42 +227,6 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
            isolated_workflow=True,
        ),
    ),
-]
-
-WINDOWS_BINARY_SMOKE_WORKFLOWS = [
-    BinaryBuildWorkflow(
-        os=OperatingSystem.WINDOWS,
-        package_type="libtorch",
-        build_variant=generate_binary_build_matrix.RELEASE,
-        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
-            OperatingSystem.WINDOWS,
-            generate_binary_build_matrix.RELEASE,
-            arches=["cpu"],
-            libtorch_variants=["shared-with-deps"],
-        ),
-        branches="main",
-        ciflow_config=CIFlowConfig(
-            isolated_workflow=True,
-        ),
-    ),
-    BinaryBuildWorkflow(
-        os=OperatingSystem.WINDOWS,
-        package_type="libtorch",
-        build_variant=generate_binary_build_matrix.DEBUG,
-        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
-            OperatingSystem.WINDOWS,
-            generate_binary_build_matrix.DEBUG,
-            arches=["cpu"],
-            libtorch_variants=["shared-with-deps"],
-        ),
-        branches="main",
-        ciflow_config=CIFlowConfig(
-            isolated_workflow=True,
-        ),
-    ),
-]
-
-WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.WINDOWS_ARM64,
        package_type="wheel",
@ -308,6 +272,39 @@ WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS = [
    ),
 ]

+# Windows libtorch workflows for the "main" branch: one RELEASE and one DEBUG
+# build, each restricted to the cpu arch and the shared-with-deps variant.
+WINDOWS_BINARY_SMOKE_WORKFLOWS = [
+    BinaryBuildWorkflow(
+        os=OperatingSystem.WINDOWS,
+        package_type="libtorch",
+        build_variant=generate_binary_build_matrix.RELEASE,
+        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
+            OperatingSystem.WINDOWS,
+            generate_binary_build_matrix.RELEASE,
+            arches=["cpu"],
+            libtorch_variants=["shared-with-deps"],
+        ),
+        branches="main",
+        ciflow_config=CIFlowConfig(
+            isolated_workflow=True,
+        ),
+    ),
+    BinaryBuildWorkflow(
+        os=OperatingSystem.WINDOWS,
+        package_type="libtorch",
+        build_variant=generate_binary_build_matrix.DEBUG,
+        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
+            OperatingSystem.WINDOWS,
+            generate_binary_build_matrix.DEBUG,
+            arches=["cpu"],
+            libtorch_variants=["shared-with-deps"],
+        ),
+        branches="main",
+        ciflow_config=CIFlowConfig(
+            isolated_workflow=True,
+        ),
+    ),
+]
+
 MACOS_BINARY_BUILD_WORKFLOWS = [
    BinaryBuildWorkflow(
        os=OperatingSystem.MACOS_ARM64,
@ -402,10 +399,6 @@ def main() -> None:
            jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
            WINDOWS_BINARY_SMOKE_WORKFLOWS,
        ),
-        (
-            jinja_env.get_template("windows_arm64_binary_build_workflow.yml.j2"),
-            WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS,
-        ),
        (
            jinja_env.get_template("macos_binary_build_workflow.yml.j2"),
            MACOS_BINARY_BUILD_WORKFLOWS,
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -434,7 +434,7 @@ query ($owner: String!, $name: String!) {
 RE_GHSTACK_HEAD_REF = re.compile(r"^(gh/[^/]+/[0-9]+/)head$")
 RE_GHSTACK_DESC = re.compile(r"Stack.*:\r?\n(\* [^\r\n]+\r?\n)+", re.MULTILINE)
 RE_PULL_REQUEST_RESOLVED = re.compile(
-    r"Pull Request resolved: "
+    # Accept both the spaced and the hyphenated spelling of the marker.
+    # The alternation is non-capturing so the pattern's only groups remain the
+    # named owner / repo / number groups, exactly as before this change.
+    r"(?:Pull Request resolved|Pull-Request-resolved): "
     r"https://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)/pull/(?P<number>[0-9]+)",
     re.MULTILINE,
 )
--- a/.github/templates/windows_arm64_binary_build_workflow.yml.j2
+++ b/.github/templates/windows_arm64_binary_build_workflow.yml.j2
@ -1,197 +0,0 @@
-{% import 'common.yml.j2' as common %}
-{% import 'upload.yml.j2' as upload %}
-
-{%- block name -%}
-# Template is at:    .github/templates/windows_arm64_binary_build_workflow.yml.j2
-# Generation script: .github/scripts/generate_ci_workflows.py
-name: !{{ build_environment }}
-{%- endblock %}
-
-{%- macro set_runner_specific_vars() -%}
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: cmd
-        run: |
-          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
-          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
-          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
-{%- endmacro %}
-
-on:
-  push:
-    branches:
-      - !{{ branches }}
-    {%- if branches == "nightly" %}
-    tags:
-      # NOTE: Binary build pipelines should only get triggered on release candidate builds
-      # Release candidate tags look like: v1.11.0-rc1
-      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
-    {%- endif %}
-{%- for label in ciflow_config.labels | sort %}
-    {%- if loop.first and branches != "nightly" %}
-    tags:
-    {%- endif %}
-      - '!{{ label }}/*'
-{%- endfor %}
-  workflow_dispatch:
-
-env:
-  BUILD_ENVIRONMENT: !{{ build_environment }}
-  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-  PR_NUMBER: ${{ github.event.pull_request.number }}
-  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
-  SKIP_ALL_TESTS: 1
-  PYTORCH_ROOT: /pytorch
-  DOWNLOADS_DIR: c:\temp\downloads
-  DEPENDENCIES_DIR: c:\temp\dependencies
-  ENABLE_APL: 1
-  ENABLE_OPENBLAS: 0
-  MSVC_VERSION : 14.42
-  AWS_DEFAULT_REGION: us-east-1
-
-jobs:
-  get-label-type:
-    if: github.repository_owner == 'pytorch'
-    name: get-label-type
-    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-    with:
-      triggering_actor: ${{ github.triggering_actor }}
-      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-      curr_branch: ${{ github.head_ref || github.ref_name }}
-      curr_ref_type: ${{ github.ref_type }}
-
-{%- for config in build_configs %}
-  !{{ config["build_name"] }}-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs: get-label-type
-    runs-on: "windows-11-arm64"
-    timeout-minutes: !{{ common.timeout_minutes }}
-    !{{ upload.binary_env(config, True) }}
-    {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0  %}
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
-    {%- endif %}
-    steps:
-      !{{ set_runner_specific_vars() }}
-      - name: Bootstrap folders
-        shell: cmd
-        run: |
-          mkdir "%NIGHTLIES_PYTORCH_ROOT%"
-          mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
-      - name: Bootstrap Git
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
-      - name: Git checkout PyTorch - recursive
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-          submodules: recursive
-      - name: Bootstrap Python
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
-      - name: Bootstrap APL
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
-      - name: Bootstrap Rust
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
-      - name: Bootstrap sccache
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"
-      - name: Bootstrap Libuv
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "pytorch/.circleci/scripts/binary_populate_env.sh"
-      - name: Build PyTorch binary
-        shell: bash
-        run: |
-          "pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
-      - uses: !{{ common.upload_artifact_action }}
-        if: always()
-        with:
-          name: !{{ config["build_name"] }}
-          retention-days: 14
-          if-no-files-found: error
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-  !{{ config["build_name"] }}-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - !{{ config["build_name"] }}-build
-      - get-label-type
-    runs-on: "windows-11-arm64"
-    timeout-minutes: !{{ common.timeout_minutes }}
-    !{{ upload.binary_env(config, True) }}
-    steps:
-      !{{ set_runner_specific_vars() }}
-      - uses: !{{ common.download_artifact_action }}
-        name: Download Build Artifacts
-        with:
-          name: !{{ config["build_name"] }}
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Bootstrap Git
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-          submodules: recursive
-      - name: Bootstrap APL
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
-      - name: Bootstrap Python
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
-      - name: Bootstrap Rust
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
-      - name: Populate binary env
-        shell: bash
-        run: |
-          "pytorch/.circleci/scripts/binary_populate_env.sh"
-      - name: Test PyTorch binary
-        shell: bash
-        run: |
-          "pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
-  {%- if branches == "nightly" %}
-  !{{ upload.upload_binaries(config, True) }}
-  {%- endif %}
-{%- endfor %}
--- a/.github/templates/windows_binary_build_workflow.yml.j2
+++ b/.github/templates/windows_binary_build_workflow.yml.j2
@ -49,6 +49,15 @@ env:
  PR_NUMBER: ${{ github.event.pull_request.number }}
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  SKIP_ALL_TESTS: 1
+  OS: !{{ os }}
+{%- if os == "windows-arm64" %}
+  # Settings emitted only for the windows-arm64 OS.
+  PYTORCH_ROOT: /pytorch
+  DOWNLOADS_DIR: c:\temp\downloads
+  DEPENDENCIES_DIR: c:\temp\dependencies
+  ENABLE_APL: 1
+  ENABLE_OPENBLAS: 0
+  # Quoted so the version stays a string instead of being parsed as a float.
+  MSVC_VERSION: "14.42"
+{%- endif %}
 !{{ common.concurrency(build_environment) }}

 jobs:
@ -66,20 +75,79 @@ jobs:
  !{{ config["build_name"] }}-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: get-label-type
+    {%- if os == "windows-arm64" %}
+    runs-on: "windows-11-arm64"
+    {%- else %}
    {%- if branches == "nightly" %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
    {%- else %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
    {%- endif %}
+    {%- endif %}
    timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
    !{{ upload.binary_env(config, True) }}
    {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0  %}
      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
    {%- endif %}
    steps:
-      !{{ common.setup_ec2_windows() }}
+{%- if os == "windows-arm64" %}
+      - name: Populate binary env
+        shell: cmd
+        run: |
+          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
+          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
+          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
+      - name: Bootstrap folders
+        shell: cmd
+        run: |
+          mkdir "%NIGHTLIES_PYTORCH_ROOT%"
+          mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
+      - name: Git checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          path: "pytorch"
+      - name: Bootstrap Build Tools
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
+      - name: Bootstrap Git
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
+      - name: Remove Pytorch folder
+        shell: cmd
+        run: |
+          rmdir /s /q "pytorch"
+      - name: Git checkout PyTorch - recursive
+        uses: actions/checkout@v4
+        with:
+          path: "pytorch"
+          submodules: recursive
+      - name: Bootstrap Python
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
+      - name: Bootstrap APL
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
+      - name: Bootstrap Rust
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
+      - name: Bootstrap sccache
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"
+      - name: Bootstrap Libuv
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"
+{%- else %}
      !{{ set_runner_specific_vars() }}
+      !{{ common.setup_ec2_windows() }}
      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
+{%- endif %}
      - name: Populate binary env
        shell: bash
        run: |
@ -95,12 +163,17 @@ jobs:
          retention-days: 14
          if-no-files-found: error
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+{%- if os != "windows-arm64" %}
      !{{ common.wait_and_kill_ssh_windows('pytorch') }}
+{% endif %}
  !{{ config["build_name"] }}-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
      - !{{ config["build_name"] }}-build
      - get-label-type
+{%- if os == "windows-arm64" %}
+    runs-on: "windows-11-arm64"
+{%- else %}
 {%- if config["gpu_arch_type"] == "cuda" %}
 {%- if branches == "nightly" %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
@ -113,18 +186,61 @@ jobs:
 {%- else %}
    runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
 {%- endif %}
+{%- endif %}
 {%- endif %}
    timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
    !{{ upload.binary_env(config, True) }}
    steps:
+{%- if os == "windows-arm64" %}
+      - name: Populate binary env
+        shell: cmd
+        run: |
+          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
+          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
+          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
+      # First checkout (no submodules) provides the bootstrap scripts; the folder
+      # is removed and checked out again with submodules once Git is bootstrapped.
+      # NOTE(review): presumably the runner's Git must be set up before the
+      # recursive checkout can work - confirm.
+      - name: Git checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          path: "pytorch"
+      - name: Bootstrap Git
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
+      - name: Remove Pytorch folder
+        shell: cmd
+        run: |
+          rmdir /s /q "pytorch"
+      - name: Git checkout PyTorch - recursive
+        uses: actions/checkout@v4
+        with:
+          path: "pytorch"
+          submodules: recursive
+      - name: Bootstrap APL
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
+      - name: Bootstrap Python
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
+      - name: Bootstrap Build Tools
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
+      - name: Bootstrap Rust
+        shell: cmd
+        run: |
+          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
+{%- else %}
       !{{ common.setup_ec2_windows() }}
+      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
       !{{ set_runner_specific_vars() }}
+{%- endif %}
      - uses: !{{ common.download_artifact_action }}
        name: Download Build Artifacts
        with:
          name: !{{ config["build_name"] }}
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
      - name: Populate binary env
        shell: bash
        run: |
@ -133,8 +249,10 @@ jobs:
        shell: bash
        run: |
          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
+{%- if os != "windows-arm64" %}
      !{{ common.wait_and_kill_ssh_windows('pytorch') }}
+{%- endif %}
  {%- if branches == "nightly" %}
  !{{ upload.upload_binaries(config, True) }}
  {%- endif %}
-{%- endfor %}
+{%- endfor %}
--- a/.github/workflows/_bazel-build-test.yml
+++ b/.github/workflows/_bazel-build-test.yml
@ -33,6 +33,10 @@ on:
        default: "linux.large"
        description: Runner type

+permissions:
+  id-token: write
+  contents: read
+
 env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

@ -80,6 +84,13 @@ jobs:
      - name: Setup Linux
        uses: ./.github/actions/setup-linux

+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
+          role-session-name: gha-bazel-build
+          aws-region: us-east-1
+
      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
@ -202,6 +213,13 @@ jobs:
        uses: ./.github/actions/chown-workspace
        if: always()

+      # Switch to the artifacts role before uploading. This must run even when an
+      # earlier step failed, because the upload step below runs under always().
+      - name: Configure AWS Credentials
+        uses: aws-actions/configure-aws-credentials@v4
+        if: always()
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_pytorch_artifacts
+          role-session-name: gha-bazel-build-upload-artifacts
+          aws-region: us-east-1
+
      - name: Upload test artifacts
        uses: ./.github/actions/upload-test-artifacts
        if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@ -38,6 +38,11 @@ on:
        required: false
        type: boolean
        default: true
+    secrets:
+      HUGGING_FACE_HUB_TOKEN:
+        required: false
+        description: |
+          HF Auth token to avoid rate limits when downloading models or datasets from hub

 jobs:
  test:
@ -166,6 +171,7 @@ jobs:
          JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
          REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
+          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        run: |
          # shellcheck disable=SC1090
          set -ex
--- a/.github/workflows/_xpu-test.yml
+++ b/.github/workflows/_xpu-test.yml
@ -47,6 +47,10 @@ on:
        type: boolean
        default: true

+permissions:
+  id-token: write
+  contents: read
+
 env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

--- a/.github/workflows/build-almalinux-images.yml
+++ b/.github/workflows/build-almalinux-images.yml
@ -11,14 +11,14 @@ on:
      # Release candidate tags look like: v1.11.0-rc1
      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
    paths:
-      - '.ci/docker/almalinux/*'
-      - '.ci/docker/common/*'
+      - .ci/docker/**
      - .github/workflows/build-almalinux-images.yml
+      - .github/actions/binary-docker-build/**
  pull_request:
    paths:
-      - '.ci/docker/almalinux/*'
-      - '.ci/docker/common/*'
+      - .ci/docker/**
      - .github/workflows/build-almalinux-images.yml
+      - .github/actions/binary-docker-build/**

 env:
  DOCKER_REGISTRY: "docker.io"
@ -37,37 +37,12 @@ jobs:
    strategy:
      matrix:
        cuda_version: ["11.8", "12.4", "12.6", "cpu"]
-    env:
-      CUDA_VERSION: ${{ matrix.cuda_version }}
    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+      - name: Build docker image
+        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
        with:
-          submodules: false
-      - name: Calculate docker image
-        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-            docker-image-name: almalinux-builder${{ matrix.cuda_version == 'cpu' && '-' || '-cuda' }}${{matrix.cuda_version}}
-            docker-build-dir:  .ci/docker/almalinux
-            always-rebuild: true
-            push: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
-        env:
+          docker-image-name: almalinux-builder
+          custom-tag-prefix: ${{ matrix.cuda_version != 'cpu' && 'cuda' || '' }}${{matrix.cuda_version}}
+          docker-build-dir: almalinux
          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
-            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          fi
-      - name: Build Docker Image
-        if: env.WITH_PUSH == 'true'
-        uses: nick-fields/retry@v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 90
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            .ci/docker/almalinux/build.sh almalinux-builder${{ matrix.cuda_version == 'cpu' && ':' || ':cuda' }}${{matrix.cuda_version}}
--- a/.github/workflows/build-libtorch-images.yml
+++ b/.github/workflows/build-libtorch-images.yml
@ -10,14 +10,14 @@ on:
      # Release candidate tags look like: v1.11.0-rc1
      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
    paths:
-      - '.ci/docker/libtorch/*'
-      - '.ci/docker/common/*'
+      - .ci/docker/**
      - .github/workflows/build-libtorch-images.yml
+      - .github/actions/binary-docker-build/**
  pull_request:
    paths:
-      - '.ci/docker/libtorch/*'
-      - '.ci/docker/common/*'
+      - .ci/docker/**
      - .github/workflows/build-libtorch-images.yml
+      - .github/actions/binary-docker-build/**

 env:
  DOCKER_REGISTRY: "docker.io"
@ -39,123 +39,29 @@ jobs:
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}

-  build-docker-cuda:
+  build:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
+    runs-on: ${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral
+    name: libtorch-cxx11-builder:${{ matrix.tag }}
    strategy:
+      fail-fast: false
      matrix:
-        cuda_version: ["12.8", "12.6", "12.4", "11.8"]
-    env:
-      GPU_ARCH_TYPE: cuda
-      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
+        include: [
+          { tag: "cuda12.8" },
+          { tag: "cuda12.6" },
+          { tag: "cuda12.4" },
+          { tag: "cuda11.8" },
+          { tag: "rocm6.3"  },
+          { tag: "rocm6.4"  },
+          { tag: "cpu"      },
+        ]
    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+      - name: Build docker image
+        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
        with:
-          submodules: false
-      - name: Calculate docker image
-        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-            docker-image-name: libtorch-cxx11-builder-cuda${{matrix.cuda_version}}
-            docker-build-dir:  .ci/docker/libtorch
-            always-rebuild: true
-            push: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
-        env:
+          docker-image-name: libtorch-cxx11-builder
+          custom-tag-prefix: ${{ matrix.tag }}
+          docker-build-dir: libtorch
          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
-            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          fi
-      - name: Build Docker Image
-        if: env.WITH_PUSH == 'true'
-        uses: nick-fields/retry@v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 90
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cuda${{matrix.cuda_version}}
-  build-docker-rocm:
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
-    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
-    strategy:
-      matrix:
-        rocm_version: ["6.2.4", "6.3"]
-    env:
-      GPU_ARCH_TYPE: rocm
-      GPU_ARCH_VERSION: ${{ matrix.rocm_version }}
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          submodules: false
-      - name: Calculate docker image
-        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-            docker-image-name: libtorch-cxx11-builder-rocm${{matrix.rocm_version}}
-            docker-build-dir:  .ci/docker/libtorch
-            always-rebuild: true
-            push: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
-        env:
-          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
-          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
-            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          fi
-      - name: Build Docker Image
-        if: env.WITH_PUSH == 'true'
-        uses: nick-fields/retry@v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 90
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            .ci/docker/libtorch/build.sh libtorch-cxx11-builder:rocm${{matrix.rocm_version}}
-  build-docker-cpu:
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
-    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          submodules: false
-      - name: Calculate docker image
-        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-            docker-image-name: libtorch-cxx11-builder-cpu
-            docker-build-dir:  .ci/docker/libtorch
-            always-rebuild: true
-            push: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
-        env:
-          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
-          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
-            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          fi
-      - name: Build Docker Image
-        if: env.WITH_PUSH == 'true'
-        uses: nick-fields/retry@v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 90
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cpu
--- a/.github/workflows/build-magma-rocm-linux.yml
+++ b/.github/workflows/build-magma-rocm-linux.yml
@ -34,7 +34,7 @@ jobs:
      id-token: write
    strategy:
      matrix:
-        rocm_version: ["63", "624"]
+        rocm_version: ["64", "63"]
    steps:
      - name: Checkout PyTorch
        uses: actions/checkout@v4
--- a/.github/workflows/build-manywheel-images-s390x.yml
+++ b/.github/workflows/build-manywheel-images-s390x.yml
@ -11,15 +11,11 @@ on:
      # Release candidate tags look like: v1.11.0-rc1
      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
    paths:
-      - '.ci/docker/manywheel/*'
-      - '.ci/docker/manywheel/build_scripts/*'
-      - '.ci/docker/common/*'
+      - .ci/docker/**
      - .github/workflows/build-manywheel-images-s390x.yml
  pull_request:
    paths:
-      - '.ci/docker/manywheel/*'
-      - '.ci/docker/manywheel/build_scripts/*'
-      - '.ci/docker/common/*'
+      - .ci/docker/**
      - .github/workflows/build-manywheel-images-s390x.yml


@ -37,26 +33,45 @@ jobs:
    if: github.repository_owner == 'pytorch'
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    runs-on: linux.s390x
-    env:
-      GPU_ARCH_TYPE: cpu-s390x
    steps:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
        with:
          submodules: false
          no-sudo: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
+
+      - name: Build Docker Image
+        run: |
+          .ci/docker/manywheel/build.sh manylinuxs390x-builder:cpu-s390x -t manylinuxs390x-builder:cpu-s390x
+
+      - name: Tag and (if WITH_PUSH) push docker image to docker.io
        env:
          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
          DOCKER_ID: ${{ secrets.DOCKER_ID }}
+          CREATED_FULL_DOCKER_IMAGE_NAME: manylinuxs390x-builder:cpu-s390x
+        shell: bash
        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
+          set -euox pipefail
+          GITHUB_REF="${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}"
+          GIT_BRANCH_NAME="${GITHUB_REF##*/}"
+          GIT_COMMIT_SHA="${GITHUB_SHA:-$(git rev-parse HEAD)}"
+          CI_FOLDER_SHA="$(git rev-parse HEAD:.ci/docker)"
+
+          DOCKER_IMAGE_NAME_PREFIX="docker.io/pytorch/${CREATED_FULL_DOCKER_IMAGE_NAME}"
+
+          docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}"
+          docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}"
+          docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}"
+
+          # Pretty sure GitHub will mask tokens, and I'm not sure the token would even be
+          # printed since it is piped, but turn off command tracing just in case
+          set +x
+          if [[ "${WITH_PUSH:-false}" == "true" ]]; then
            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+            docker push "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}"
+            docker push "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}"
+            docker push "${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}"
          fi
-      - name: Build Docker Image
-        run: |
-          .ci/docker/manywheel/build.sh manylinuxs390x-builder:cpu-s390x

      - name: Cleanup docker
        if: cancelled()
--- a/.github/workflows/build-manywheel-images.yml
+++ b/.github/workflows/build-manywheel-images.yml
@ -11,17 +11,14 @@ on:
      # Release candidate tags look like: v1.11.0-rc1
      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
    paths:
-      - '.ci/docker/common/*'
-      - '.ci/docker/manywheel/*'
-      - '.ci/docker/manywheel/build_scripts/*'
+      - .ci/docker/**
      - .github/workflows/build-manywheel-images.yml
+      - .github/actions/binary-docker-build/**
  pull_request:
    paths:
-      - '.ci/docker/common/*'
-      - '.ci/docker/manywheel/*'
-      - '.ci/docker/manywheel/build_scripts/*'
+      - .ci/docker/**
      - .github/workflows/build-manywheel-images.yml
-
+      - .github/actions/binary-docker-build/**

 env:
  DOCKER_REGISTRY: "docker.io"
@ -43,322 +40,34 @@ jobs:
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}

-  build-docker-cuda-manylinux_2_28:
+  build:
    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
    strategy:
+      fail-fast: false
      matrix:
-        cuda_version: ["12.8", "12.6", "12.4", "11.8"]
-    env:
-      GPU_ARCH_TYPE: cuda-manylinux_2_28
-      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
+        include: [
+          { name: "manylinux2_28-builder",          tag: "cuda12.8",          runner: "linux.9xlarge.ephemeral" },
+          { name: "manylinux2_28-builder",          tag: "cuda12.6",          runner: "linux.9xlarge.ephemeral" },
+          { name: "manylinux2_28-builder",          tag: "cuda12.4",          runner: "linux.9xlarge.ephemeral" },
+          { name: "manylinux2_28-builder",          tag: "cuda11.8",          runner: "linux.9xlarge.ephemeral" },
+          { name: "manylinuxaarch64-builder",       tag: "cuda12.8",          runner: "linux.arm64.2xlarge.ephemeral" },
+          { name: "manylinux2_28-builder",          tag: "rocm6.3",           runner: "linux.9xlarge.ephemeral" },
+          { name: "manylinux2_28-builder",          tag: "rocm6.4",           runner: "linux.9xlarge.ephemeral" },
+          { name: "manylinux2_28-builder",          tag: "cpu",               runner: "linux.9xlarge.ephemeral" },
+          { name: "manylinuxaarch64-builder",       tag: "cpu-aarch64",       runner: "linux.arm64.2xlarge.ephemeral" },
+          { name: "manylinux2_28_aarch64-builder",  tag: "cpu-aarch64",       runner: "linux.arm64.2xlarge.ephemeral" },
+          { name: "manylinuxcxx11-abi-builder",     tag: "cpu-cxx11-abi",     runner: "linux.9xlarge.ephemeral" },
+          { name: "manylinux2_28-builder",          tag: "xpu",               runner: "linux.9xlarge.ephemeral" },
+        ]
+    runs-on: ${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}
+    name: ${{ matrix.name }}:${{ matrix.tag }}
    steps:
-      - name: Purge tools folder (free space for build)
-        run: rm -rf /opt/hostedtoolcache
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+      - name: Build docker image
+        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
        with:
-          submodules: false
-      - name: Calculate docker image
-        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-            docker-image-name: manylinux2_28-builder-cuda${{matrix.cuda_version}}
-            docker-build-dir:  .ci/docker/manywheel
-            always-rebuild: true
-            push: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
-        env:
+          docker-image-name: ${{ matrix.name }}
+          custom-tag-prefix: ${{ matrix.tag }}
+          docker-build-dir: manywheel
          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
-            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          fi
-      - name: Build Docker Image
-        if: env.WITH_PUSH == 'true'
-        uses: nick-fields/retry@v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 90
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            .ci/docker/manywheel/build.sh manylinux2_28-builder:cuda${{matrix.cuda_version}}
-  build-docker-cuda-aarch64:
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
-    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
-    strategy:
-      matrix:
-        cuda_version: ["12.8"]
-    env:
-      GPU_ARCH_TYPE: cuda-aarch64
-      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
-    steps:
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-      - name: Calculate docker image
-        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-            docker-image-name: manylinuxaarch64-builder-cuda${{matrix.cuda_version}}
-            docker-build-dir:  .ci/docker/manywheel
-            always-rebuild: true
-            push: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
-        env:
-          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
-          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
-            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          fi
-      - name: Build Docker Image
-        if: env.WITH_PUSH == 'true'
-        uses: nick-fields/retry@v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 90
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}}
-  build-docker-rocm-manylinux_2_28:
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
-    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
-    strategy:
-      matrix:
-        rocm_version: ["6.2.4", "6.3"]
-    env:
-      GPU_ARCH_TYPE: rocm-manylinux_2_28
-      GPU_ARCH_VERSION: ${{ matrix.rocm_version }}
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          submodules: false
-      - name: Calculate docker image
-        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-            docker-image-name: manylinux2_28-builder-rocm${{matrix.rocm_version}}
-            docker-build-dir:  .ci/docker/manywheel
-            always-rebuild: true
-            push: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
-        env:
-          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
-          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
-            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          fi
-      - name: Build Docker Image
-        if: env.WITH_PUSH == 'true'
-        uses: nick-fields/retry@v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 90
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            .ci/docker/manywheel/build.sh manylinux2_28-builder:rocm${{matrix.rocm_version}}
-  build-docker-cpu-manylinux_2_28:
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
-    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
-    env:
-      GPU_ARCH_TYPE: cpu-manylinux_2_28
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          submodules: false
-      - name: Calculate docker image
-        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-            docker-image-name: manylinux2_28-builder-cpu
-            docker-build-dir:  .ci/docker/manywheel
-            always-rebuild: true
-            push: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
-        env:
-          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
-          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
-            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          fi
-      - name: Build Docker Image
-        if: env.WITH_PUSH == 'true'
-        uses: nick-fields/retry@v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 90
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            .ci/docker/manywheel/build.sh manylinux2_28-builder:cpu
-  build-docker-cpu-aarch64:
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
-    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
-    env:
-      GPU_ARCH_TYPE: cpu-aarch64
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          submodules: false
-      - name: Calculate docker image
-        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-            docker-image-name: manylinuxaarch64-builder-cpu-aarch64
-            docker-build-dir:  .ci/docker/manywheel
-            always-rebuild: true
-            push: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
-        env:
-          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
-          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
-            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          fi
-      - name: Build Docker Image
-        if: env.WITH_PUSH == 'true'
-        uses: nick-fields/retry@v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 90
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cpu-aarch64
-  build-docker-cpu-aarch64-2_28:
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
-    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
-    env:
-      GPU_ARCH_TYPE: cpu-aarch64-2_28
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          submodules: false
-      - name: Calculate docker image
-        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-            docker-image-name: manylinux2_28_aarch64-builder-cpu-aarch64
-            docker-build-dir:  .ci/docker/manywheel
-            always-rebuild: true
-            push: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
-        env:
-          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
-          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
-            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          fi
-      - name: Build Docker Image
-        if: env.WITH_PUSH == 'true'
-        env:
-          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
-          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        uses: nick-fields/retry@v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 90
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            .ci/docker/manywheel/build.sh manylinux2_28_aarch64-builder:cpu-aarch64
-  build-docker-cpu-cxx11-abi:
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
-    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
-    env:
-      GPU_ARCH_TYPE: cpu-cxx11-abi
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          submodules: false
-      - name: Calculate docker image
-        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-            docker-image-name: manylinuxcxx11-abi-builder-cpu-cxx11-abi
-            docker-build-dir:  .ci/docker/manywheel
-            always-rebuild: true
-            push: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
-        env:
-          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
-          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
-            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          fi
-      - name: Build Docker Image
-        if: env.WITH_PUSH == 'true'
-        uses: nick-fields/retry@v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 90
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            .ci/docker/manywheel/build.sh manylinuxcxx11-abi-builder:cpu-cxx11-abi
-  build-docker-xpu:
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
-    needs: get-label-type
-    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
-    env:
-      GPU_ARCH_TYPE: xpu
-    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          submodules: false
-      - name: Calculate docker image
-        if: env.WITH_PUSH == 'false'
-        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
-        with:
-            docker-image-name: manylinux2_28-builder-xpu
-            docker-build-dir:  .ci/docker/manywheel
-            always-rebuild: true
-            push: true
-      - name: Authenticate if WITH_PUSH
-        if: env.WITH_PUSH == 'true'
-        env:
-          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
-          DOCKER_ID: ${{ secrets.DOCKER_ID }}
-        run: |
-          if [[ "${WITH_PUSH}" == true ]]; then
-            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-          fi
-      - name: Build Docker Image
-        if: env.WITH_PUSH == 'true'
-        uses: nick-fields/retry@v3.0.0
-        with:
-          shell: bash
-          timeout_minutes: 90
-          max_attempts: 3
-          retry_wait_seconds: 90
-          command: |
-            .ci/docker/manywheel/build.sh manylinux2_28-builder:xpu
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@ -54,7 +54,7 @@ jobs:
        docker-image: ["pytorch/manylinux2_28-builder:cpu"]
        include:
          - device: "rocm"
-            rocm_version: "6.3"
+            rocm_version: "6.4"
            runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
          - device: "cuda"
            rocm_version: ""
@ -138,7 +138,7 @@ jobs:
          fi

          docker exec -t "${container_name}" yum install -y zlib-devel zip
-          docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}"  -m pip install -U setuptools==67.4.0 pybind11==2.13.1 auditwheel wheel
+          docker exec -t "${container_name}" "${PYTHON_EXECUTABLE}"  -m pip install -U setuptools==78.1.0 pybind11==2.13.1 auditwheel wheel

          if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "rocm" || "${{ matrix.device }}" == "aarch64" ) ]]; then
            # With this install, it gets clang 16.0.6.
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -79,7 +79,7 @@ jobs:
        ]
        include:
          - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11
-            runner: linux.arm64.2xlarge
+            runner: linux.arm64.m7g.4xlarge
          - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
            runner: linux.arm64.m7g.4xlarge
            timeout-minutes: 600
--- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
@ -301,98 +301,6 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

-  libtorch-rocm6_2_4-shared-with-deps-release-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.2.4
-      GPU_ARCH_VERSION: 6.2.4
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: libtorch-rocm6_2_4-shared-with-deps-release
-      build_environment: linux-binary-libtorch
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  libtorch-rocm6_2_4-shared-with-deps-release-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - libtorch-rocm6_2_4-shared-with-deps-release-build
-      - get-label-type
-    runs-on: linux.rocm.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.2.4
-      GPU_ARCH_VERSION: 6.2.4
-      GPU_ARCH_TYPE: rocm
-      SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-    steps:
-      - name: Setup ROCm
-        uses: ./.github/actions/setup-rocm
-      - uses: actions/download-artifact@v4.1.7
-        name: Download Build Artifacts
-        with:
-          name: libtorch-rocm6_2_4-shared-with-deps-release
-          path: "${{ runner.temp }}/artifacts/"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: ROCm set GPU_FLAG
-        run: |
-          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
-      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
-      - name: Test Pytorch binary
-        uses: ./pytorch/.github/actions/test-pytorch-binary
-      - name: Teardown ROCm
-        uses: ./.github/actions/teardown-rocm
-  libtorch-rocm6_2_4-shared-with-deps-release-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: libtorch-rocm6_2_4-shared-with-deps-release-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.2.4
-      GPU_ARCH_VERSION: 6.2.4
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-      build_name: libtorch-rocm6_2_4-shared-with-deps-release
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
  libtorch-rocm6_3-shared-with-deps-release-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -484,3 +392,95 @@ jobs:
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml
+
+  libtorch-rocm6_4-shared-with-deps-release-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: rocm6.4
+      GPU_ARCH_VERSION: 6.4
+      GPU_ARCH_TYPE: rocm
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.4-main
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: libtorch-rocm6_4-shared-with-deps-release
+      build_environment: linux-binary-libtorch
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  libtorch-rocm6_4-shared-with-deps-release-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - libtorch-rocm6_4-shared-with-deps-release-build
+      - get-label-type
+    runs-on: linux.rocm.gpu
+    timeout-minutes: 240
+    env:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: rocm6.4
+      GPU_ARCH_VERSION: 6.4
+      GPU_ARCH_TYPE: rocm
+      SKIP_ALL_TESTS: 1
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.4-main
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+    steps:
+      - name: Setup ROCm
+        uses: ./.github/actions/setup-rocm
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: libtorch-rocm6_4-shared-with-deps-release
+          path: "${{ runner.temp }}/artifacts/"
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: ROCm set GPU_FLAG
+        run: |
+          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
+      - name: Pull Docker image
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        with:
+          docker-image: pytorch/libtorch-cxx11-builder:rocm6.4-main
+      - name: Test Pytorch binary
+        uses: ./pytorch/.github/actions/test-pytorch-binary
+      - name: Teardown ROCm
+        uses: ./.github/actions/teardown-rocm
+  libtorch-rocm6_4-shared-with-deps-release-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: libtorch-rocm6_4-shared-with-deps-release-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: rocm6.4
+      GPU_ARCH_VERSION: 6.4
+      GPU_ARCH_TYPE: rocm
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.4-main
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      build_name: libtorch-rocm6_4-shared-with-deps-release
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
--- a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml
@ -55,7 +55,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.9"
      runs_on: linux.s390x
@ -79,7 +79,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.9"
      build_name: manywheel-py3_9-cpu-s390x
@ -101,7 +101,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.9"
      build_name: manywheel-py3_9-cpu-s390x
@ -120,7 +120,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.10"
      runs_on: linux.s390x
@ -144,7 +144,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.10"
      build_name: manywheel-py3_10-cpu-s390x
@ -166,7 +166,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.10"
      build_name: manywheel-py3_10-cpu-s390x
@ -185,7 +185,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.11"
      runs_on: linux.s390x
@ -209,7 +209,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.11"
      build_name: manywheel-py3_11-cpu-s390x
@ -231,7 +231,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.11"
      build_name: manywheel-py3_11-cpu-s390x
@ -250,7 +250,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.12"
      runs_on: linux.s390x
@ -274,7 +274,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.12"
      build_name: manywheel-py3_12-cpu-s390x
@ -296,7 +296,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.12"
      build_name: manywheel-py3_12-cpu-s390x
@ -315,7 +315,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.13"
      runs_on: linux.s390x
@ -339,7 +339,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.13"
      build_name: manywheel-py3_13-cpu-s390x
@ -361,7 +361,7 @@ jobs:
      #       favor of GPU_ARCH_VERSION
      DESIRED_CUDA: cpu
      GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
      use_split_build: False
      DESIRED_PYTHON: "3.13"
      build_name: manywheel-py3_13-cpu-s390x
--- a/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml
+++ b/.github/workflows/generated-windows-arm64-binary-libtorch-debug-nightly.yml
@ -1,11 +1,12 @@
 # @generated DO NOT EDIT MANUALLY

-# Template is at:    .github/templates/windows_arm64_binary_build_workflow.yml.j2
+# Template is at:    .github/templates/windows_binary_build_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: windows-arm64-binary-libtorch-debug

 on:
  push:
+    # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build
    branches:
      - nightly
    tags:
@ -17,18 +18,24 @@ on:
  workflow_dispatch:

 env:
+  # Needed for conda builds
+  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  AWS_DEFAULT_REGION: us-east-1
  BUILD_ENVIRONMENT: windows-arm64-binary-libtorch-debug
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  PR_NUMBER: ${{ github.event.pull_request.number }}
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  SKIP_ALL_TESTS: 1
+  OS: windows-arm64
  PYTORCH_ROOT: /pytorch
  DOWNLOADS_DIR: c:\temp\downloads
  DEPENDENCIES_DIR: c:\temp\dependencies
  ENABLE_APL: 1
  ENABLE_OPENBLAS: 0
  MSVC_VERSION : 14.42
-  AWS_DEFAULT_REGION: us-east-1
+concurrency:
+  group: windows-arm64-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true

 jobs:
  get-label-type:
@ -44,7 +51,7 @@ jobs:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: get-label-type
    runs-on: "windows-11-arm64"
-    timeout-minutes: 240
+    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: libtorch
@ -59,9 +66,6 @@ jobs:
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
      - name: Populate binary env
        shell: cmd
        run: |
@ -117,11 +121,11 @@ jobs:
      - name: Populate binary env
        shell: bash
        run: |
-          "pytorch/.circleci/scripts/binary_populate_env.sh"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
      - name: Build PyTorch binary
        shell: bash
        run: |
-          "pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
      - uses: actions/upload-artifact@v4.4.0
        if: always()
        with:
@ -135,7 +139,7 @@ jobs:
      - libtorch-cpu-shared-with-deps-debug-build
      - get-label-type
    runs-on: "windows-11-arm64"
-    timeout-minutes: 240
+    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: libtorch
@ -150,25 +154,17 @@ jobs:
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
      - name: Populate binary env
        shell: cmd
        run: |
          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
-      - uses: actions/download-artifact@v4.1.7
-        name: Download Build Artifacts
-        with:
-          name: libtorch-cpu-shared-with-deps-debug
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
      - name: Git checkout PyTorch
        uses: actions/checkout@v4
        with:
          path: "pytorch"
-      - name: Bootstrap Git
+      - name: Populate binary env
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
@ -197,14 +193,19 @@ jobs:
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: libtorch-cpu-shared-with-deps-debug
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
      - name: Populate binary env
        shell: bash
        run: |
-          "pytorch/.circleci/scripts/binary_populate_env.sh"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
      - name: Test PyTorch binary
        shell: bash
        run: |
-          "pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
  libtorch-cpu-shared-with-deps-debug-upload:  # Uploading
    if: ${{ github.repository_owner == 'pytorch' }}
    permissions:
--- a/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml
+++ b/.github/workflows/generated-windows-arm64-binary-libtorch-release-nightly.yml
@ -1,11 +1,12 @@
 # @generated DO NOT EDIT MANUALLY

-# Template is at:    .github/templates/windows_arm64_binary_build_workflow.yml.j2
+# Template is at:    .github/templates/windows_binary_build_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: windows-arm64-binary-libtorch-release

 on:
  push:
+    # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build
    branches:
      - nightly
    tags:
@ -17,18 +18,24 @@ on:
  workflow_dispatch:

 env:
+  # Needed for conda builds
+  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  AWS_DEFAULT_REGION: us-east-1
  BUILD_ENVIRONMENT: windows-arm64-binary-libtorch-release
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  PR_NUMBER: ${{ github.event.pull_request.number }}
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  SKIP_ALL_TESTS: 1
+  OS: windows-arm64
  PYTORCH_ROOT: /pytorch
  DOWNLOADS_DIR: c:\temp\downloads
  DEPENDENCIES_DIR: c:\temp\dependencies
  ENABLE_APL: 1
  ENABLE_OPENBLAS: 0
  MSVC_VERSION : 14.42
-  AWS_DEFAULT_REGION: us-east-1
+concurrency:
+  group: windows-arm64-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true

 jobs:
  get-label-type:
@ -44,7 +51,7 @@ jobs:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: get-label-type
    runs-on: "windows-11-arm64"
-    timeout-minutes: 240
+    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: libtorch
@ -59,9 +66,6 @@ jobs:
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
      - name: Populate binary env
        shell: cmd
        run: |
@ -117,11 +121,11 @@ jobs:
      - name: Populate binary env
        shell: bash
        run: |
-          "pytorch/.circleci/scripts/binary_populate_env.sh"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
      - name: Build PyTorch binary
        shell: bash
        run: |
-          "pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
      - uses: actions/upload-artifact@v4.4.0
        if: always()
        with:
@ -135,7 +139,7 @@ jobs:
      - libtorch-cpu-shared-with-deps-release-build
      - get-label-type
    runs-on: "windows-11-arm64"
-    timeout-minutes: 240
+    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: libtorch
@ -150,25 +154,17 @@ jobs:
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
      - name: Populate binary env
        shell: cmd
        run: |
          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
-      - uses: actions/download-artifact@v4.1.7
-        name: Download Build Artifacts
-        with:
-          name: libtorch-cpu-shared-with-deps-release
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
      - name: Git checkout PyTorch
        uses: actions/checkout@v4
        with:
          path: "pytorch"
-      - name: Bootstrap Git
+      - name: Populate binary env
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
@ -197,14 +193,19 @@ jobs:
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: libtorch-cpu-shared-with-deps-release
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
      - name: Populate binary env
        shell: bash
        run: |
-          "pytorch/.circleci/scripts/binary_populate_env.sh"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
      - name: Test PyTorch binary
        shell: bash
        run: |
-          "pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
  libtorch-cpu-shared-with-deps-release-upload:  # Uploading
    if: ${{ github.repository_owner == 'pytorch' }}
    permissions:
--- a/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-windows-arm64-binary-wheel-nightly.yml
@ -1,11 +1,12 @@
 # @generated DO NOT EDIT MANUALLY

-# Template is at:    .github/templates/windows_arm64_binary_build_workflow.yml.j2
+# Template is at:    .github/templates/windows_binary_build_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: windows-arm64-binary-wheel

 on:
  push:
+    # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build
    branches:
      - nightly
    tags:
@ -17,18 +18,24 @@ on:
  workflow_dispatch:

 env:
+  # Needed for conda builds
+  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
+  AWS_DEFAULT_REGION: us-east-1
  BUILD_ENVIRONMENT: windows-arm64-binary-wheel
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  PR_NUMBER: ${{ github.event.pull_request.number }}
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  SKIP_ALL_TESTS: 1
+  OS: windows-arm64
  PYTORCH_ROOT: /pytorch
  DOWNLOADS_DIR: c:\temp\downloads
  DEPENDENCIES_DIR: c:\temp\dependencies
  ENABLE_APL: 1
  ENABLE_OPENBLAS: 0
  MSVC_VERSION : 14.42
-  AWS_DEFAULT_REGION: us-east-1
+concurrency:
+  group: windows-arm64-binary-wheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
+  cancel-in-progress: true

 jobs:
  get-label-type:
@ -44,7 +51,7 @@ jobs:
    if: ${{ github.repository_owner == 'pytorch' }}
    needs: get-label-type
    runs-on: "windows-11-arm64"
-    timeout-minutes: 240
+    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: wheel
@ -56,9 +63,6 @@ jobs:
      DESIRED_PYTHON: "3.12"
      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
      - name: Populate binary env
        shell: cmd
        run: |
@ -114,11 +118,11 @@ jobs:
      - name: Populate binary env
        shell: bash
        run: |
-          "pytorch/.circleci/scripts/binary_populate_env.sh"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
      - name: Build PyTorch binary
        shell: bash
        run: |
-          "pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
      - uses: actions/upload-artifact@v4.4.0
        if: always()
        with:
@ -132,7 +136,7 @@ jobs:
      - wheel-py3_12-cpu-build
      - get-label-type
    runs-on: "windows-11-arm64"
-    timeout-minutes: 240
+    timeout-minutes: 300
    env:
      PYTORCH_ROOT: ${{ github.workspace }}/pytorch
      PACKAGE_TYPE: wheel
@ -143,25 +147,17 @@ jobs:
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
    steps:
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
      - name: Populate binary env
        shell: cmd
        run: |
          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
-      - uses: actions/download-artifact@v4.1.7
-        name: Download Build Artifacts
-        with:
-          name: wheel-py3_12-cpu
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
      - name: Git checkout PyTorch
        uses: actions/checkout@v4
        with:
          path: "pytorch"
-      - name: Bootstrap Git
+      - name: Populate binary env
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
@ -190,14 +186,19 @@ jobs:
        shell: cmd
        run: |
          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: wheel-py3_12-cpu
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
      - name: Populate binary env
        shell: bash
        run: |
-          "pytorch/.circleci/scripts/binary_populate_env.sh"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
      - name: Test PyTorch binary
        shell: bash
        run: |
-          "pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
+          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
  wheel-py3_12-cpu-upload:  # Uploading
    if: ${{ github.repository_owner == 'pytorch' }}
    permissions:
--- a/.github/workflows/generated-windows-binary-libtorch-debug-main.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-debug-main.yml
@ -19,6 +19,7 @@ env:
  PR_NUMBER: ${{ github.event.pull_request.number }}
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  SKIP_ALL_TESTS: 1
+  OS: windows
 concurrency:
  group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
@ -52,6 +53,15 @@ jobs:
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Display EC2 information
        shell: bash
        run: |
@ -96,15 +106,6 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -145,6 +146,7 @@ jobs:
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1
+
  libtorch-cpu-shared-with-deps-debug-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
@ -210,6 +212,18 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
      #       runner.temp variable, which we need.
@ -224,18 +238,6 @@ jobs:
        with:
          name: libtorch-cpu-shared-with-deps-debug
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
      - name: Populate binary env
        shell: bash
        run: |
--- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml
@ -26,6 +26,7 @@ env:
  PR_NUMBER: ${{ github.event.pull_request.number }}
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  SKIP_ALL_TESTS: 1
+  OS: windows
 concurrency:
  group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
@ -59,6 +60,15 @@ jobs:
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Display EC2 information
        shell: bash
        run: |
@ -103,15 +113,6 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -152,6 +153,7 @@ jobs:
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1
+
  libtorch-cpu-shared-with-deps-debug-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
@ -217,6 +219,18 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
      #       runner.temp variable, which we need.
@ -231,18 +245,6 @@ jobs:
        with:
          name: libtorch-cpu-shared-with-deps-debug
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
      - name: Populate binary env
        shell: bash
        run: |
@ -306,6 +308,15 @@ jobs:
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Display EC2 information
        shell: bash
        run: |
@ -350,15 +361,6 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -399,6 +401,7 @@ jobs:
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1
+
  libtorch-cuda11_8-shared-with-deps-debug-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
@ -465,6 +468,18 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
      #       runner.temp variable, which we need.
@ -479,18 +494,6 @@ jobs:
        with:
          name: libtorch-cuda11_8-shared-with-deps-debug
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
      - name: Populate binary env
        shell: bash
        run: |
@ -555,6 +558,15 @@ jobs:
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Display EC2 information
        shell: bash
        run: |
@ -599,15 +611,6 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -648,6 +651,7 @@ jobs:
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1
+
  libtorch-cuda12_6-shared-with-deps-debug-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
@ -714,6 +718,18 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
      #       runner.temp variable, which we need.
@ -728,18 +744,6 @@ jobs:
        with:
          name: libtorch-cuda12_6-shared-with-deps-debug
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
      - name: Populate binary env
        shell: bash
        run: |
@ -804,6 +808,15 @@ jobs:
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Display EC2 information
        shell: bash
        run: |
@ -848,15 +861,6 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -897,6 +901,7 @@ jobs:
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1
+
  libtorch-cuda12_8-shared-with-deps-debug-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
@ -963,6 +968,18 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
      #       runner.temp variable, which we need.
@ -977,18 +994,6 @@ jobs:
        with:
          name: libtorch-cuda12_8-shared-with-deps-debug
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
      - name: Populate binary env
        shell: bash
        run: |
--- a/.github/workflows/generated-windows-binary-libtorch-release-main.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-release-main.yml
@ -19,6 +19,7 @@ env:
  PR_NUMBER: ${{ github.event.pull_request.number }}
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  SKIP_ALL_TESTS: 1
+  OS: windows
 concurrency:
  group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
@ -52,6 +53,15 @@ jobs:
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Display EC2 information
        shell: bash
        run: |
@ -96,15 +106,6 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -145,6 +146,7 @@ jobs:
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1
+
  libtorch-cpu-shared-with-deps-release-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
@ -210,6 +212,18 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
      #       runner.temp variable, which we need.
@ -224,18 +238,6 @@ jobs:
        with:
          name: libtorch-cpu-shared-with-deps-release
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
      - name: Populate binary env
        shell: bash
        run: |
--- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml
+++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml
@ -26,6 +26,7 @@ env:
  PR_NUMBER: ${{ github.event.pull_request.number }}
  SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
  SKIP_ALL_TESTS: 1
+  OS: windows
 concurrency:
  group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
@ -59,6 +60,15 @@ jobs:
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Display EC2 information
        shell: bash
        run: |
@ -103,15 +113,6 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -152,6 +153,7 @@ jobs:
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1
+
  libtorch-cpu-shared-with-deps-release-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
@ -217,6 +219,18 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
      #       runner.temp variable, which we need.
@ -231,18 +245,6 @@ jobs:
        with:
          name: libtorch-cpu-shared-with-deps-release
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
      - name: Populate binary env
        shell: bash
        run: |
@ -306,6 +308,15 @@ jobs:
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Display EC2 information
        shell: bash
        run: |
@ -350,15 +361,6 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -399,6 +401,7 @@ jobs:
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1
+
  libtorch-cuda11_8-shared-with-deps-release-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
@ -465,6 +468,18 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
      #       runner.temp variable, which we need.
@ -479,18 +494,6 @@ jobs:
        with:
          name: libtorch-cuda11_8-shared-with-deps-release
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
      - name: Populate binary env
        shell: bash
        run: |
@ -555,6 +558,15 @@ jobs:
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Display EC2 information
        shell: bash
        run: |
@ -599,15 +611,6 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -648,6 +651,7 @@ jobs:
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1
+
  libtorch-cuda12_6-shared-with-deps-release-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
@ -714,6 +718,18 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
      #       runner.temp variable, which we need.
@ -728,18 +744,6 @@ jobs:
        with:
          name: libtorch-cuda12_6-shared-with-deps-release
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
      - name: Populate binary env
        shell: bash
        run: |
@ -804,6 +808,15 @@ jobs:
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.9"
    steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      #       They are also here because setting them at a workflow level doesn't give us access to the
+      #       runner.temp variable, which we need.
+      - name: Populate binary env
+        shell: bash
+        run: |
+          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Display EC2 information
        shell: bash
        run: |
@ -848,15 +861,6 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-      # NOTE: These environment variables are put here so that they can be applied on every job equally
-      #       They are also here because setting them at a workflow level doesn't give us access to the
-      #       runner.temp variable, which we need.
-      - name: Populate binary env
-        shell: bash
-        run: |
-          echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-          echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-          echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
      - name: Checkout PyTorch
        uses: actions/checkout@v4
        with:
@ -897,6 +901,7 @@ jobs:
        if: always()
        run: |
          .github\scripts\kill_active_ssh_sessions.ps1
+
  libtorch-cuda12_8-shared-with-deps-release-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
@ -963,6 +968,18 @@ jobs:
          # Let's both exclude the path and disable Windows Defender completely just to be sure
          # that it doesn't interfere
          Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
      # NOTE: These environment variables are put here so that they can be applied on every job equally
      #       They are also here because setting them at a workflow level doesn't give us access to the
      #       runner.temp variable, which we need.
@ -977,18 +994,6 @@ jobs:
        with:
          name: libtorch-cuda12_8-shared-with-deps-release
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
      - name: Populate binary env
        shell: bash
        run: |
--- a/.github/workflows/generated-windows-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml
--- a/.github/workflows/inductor-perf-test-nightly-macos.yml
+++ b/.github/workflows/inductor-perf-test-nightly-macos.yml
@ -1,5 +1,4 @@
-name: perf-nightly-macos
-# Technically not an inductor test, but uses it as a template for tracking macos performance
+name: inductor-perf-nightly-macos

 on:
  schedule:
@ -24,6 +23,7 @@ on:
  pull_request:
    paths:
      - .github/workflows/inductor-perf-test-nightly-macos.yml
+      - .ci/pytorch/macos-test.sh

 concurrency:
  group:  ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@ -38,7 +38,7 @@ jobs:
    uses: ./.github/workflows/_mac-build.yml
    with:
      sync-tag: macos-perf-py3-arm64-build
-      build-environment: macos-py3-arm64
+      build-environment: macos-py3-arm64-distributed
      runner-type: macos-m1-stable
      build-generates-artifacts: true
      # To match the one pre-installed in the m1 runners
@ -54,7 +54,7 @@ jobs:
    uses: ./.github/workflows/_mac-test.yml
    needs: macos-perf-py3-arm64-build
    with:
-      build-environment: macos-py3-arm64
+      build-environment: macos-py3-arm64-distributed
      # Same as the build job
      python-version: 3.9.12
      test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}
--- a/.github/workflows/inductor-unittest.yml
+++ b/.github/workflows/inductor-unittest.yml
@ -36,11 +36,11 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      test-matrix: |
        { include: [
-          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
-          { config: "inductor_cpp_wrapper", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor_cpp_wrapper", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_distributed", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" },
+          { config: "inductor_cpp_wrapper", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_cpp_wrapper", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
        ]}
    secrets: inherit

@ -65,8 +65,8 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      test-matrix: |
        { include: [
-          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
        ]}
    secrets: inherit

@ -90,7 +90,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      test-matrix: |
        { include: [
-          { config: "inductor-halide", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
+          { config: "inductor-halide", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
        ]}
    secrets: inherit

@ -114,7 +114,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      test-matrix: |
        { include: [
-          { config: "inductor-triton-cpu", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
+          { config: "inductor-triton-cpu", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
        ]}
    secrets: inherit

@ -138,10 +138,10 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      test-matrix: |
        { include: [
-          { config: "inductor_amx", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
-          { config: "inductor_amx", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
-          { config: "inductor_avx2", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" },
-          { config: "inductor_avx2", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
+          { config: "inductor_amx", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+          { config: "inductor_amx", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+          { config: "inductor_avx2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" },
+          { config: "inductor_avx2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" },
        ]}
    secrets: inherit

@ -165,8 +165,8 @@ jobs:
      cuda-arch-list: '8.6'
      test-matrix: |
        { include: [
-          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
        ]}
    secrets: inherit

--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@ -53,11 +53,11 @@ jobs:
      sync-tag: linux-focal-cuda12_6-py3_10-gcc9-inductor-build
      test-matrix: |
        { include: [
-          { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
        ]}
    secrets: inherit

@ -82,14 +82,14 @@ jobs:
      sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
      test-matrix: |
        { include: [
-          { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
-          { config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
-          { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" },
-          { config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
-          { config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
-          { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
-          { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
-          { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
+          { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+          { config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+          { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+          { config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+          { config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+          { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+          { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+          { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.24xl.spr-metal" },
        ]}
    secrets: inherit

--- a/.github/workflows/linux-aarch64.yml
+++ b/.github/workflows/linux-aarch64.yml
@ -37,13 +37,13 @@ jobs:
      runner: linux.arm64.2xlarge
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 4, runner: "linux.arm64.2xlarge" },
-          { config: "default", shard: 2, num_shards: 4, runner: "linux.arm64.2xlarge" },
-          { config: "default", shard: 3, num_shards: 4, runner: "linux.arm64.2xlarge" },
-          { config: "default", shard: 4, num_shards: 4, runner: "linux.arm64.2xlarge" },
-          { config: "default", shard: 1, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
-          { config: "default", shard: 2, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
-          { config: "default", shard: 3, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
+          { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge" },
+          { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge" },
+          { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge" },
+          { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge" },
+          { config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m7g.4xlarge" },
+          { config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m7g.4xlarge" },
+          { config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m7g.4xlarge" },
        ]}
    secrets: inherit

--- a/.github/workflows/periodic-rocm-mi300.yml
+++ b/.github/workflows/periodic-rocm-mi300.yml
@ -0,0 +1,81 @@
+name: periodic-rocm-mi300
+
+on:
+  schedule:
+    # We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
+    # Also run less frequently on weekends.
+    - cron: 45 0,8,16 * * 1-5
+    - cron: 45 4 * * 0,6
+    - cron: 45 4,12,20 * * 1-5
+    - cron: 45 12 * * 0,6
+    - cron: 29 8 * * *  # about 1:29am PDT, for mem leak check and rerun disabled tests
+  push:
+    tags:
+      - ciflow/periodic-rocm-mi300/*
+    branches:
+      - release/*
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
+  cancel-in-progress: true
+
+permissions: read-all
+
+jobs:
+  llm-td:
+    if: github.repository_owner == 'pytorch'
+    name: before-test
+    uses: ./.github/workflows/llm_td_retrieval.yml
+    permissions:
+      id-token: write
+      contents: read
+
+  target-determination:
+    name: before-test
+    uses: ./.github/workflows/target_determination.yml
+    needs: llm-td
+    permissions:
+      id-token: write
+      contents: read
+
+  get-label-type:
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+
+  linux-focal-rocm-py3_10-build:
+    name: linux-focal-rocm-py3.10
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-focal-rocm-py3.10
+      docker-image-name: pytorch-linux-focal-rocm-n-py3
+      test-matrix: |
+        { include: [
+          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
+          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
+        ]}
+    secrets: inherit
+
+  linux-focal-rocm-py3_10-test:
+    permissions:
+      id-token: write
+      contents: read
+    name: linux-focal-rocm-py3.10
+    uses: ./.github/workflows/_rocm-test.yml
+    needs:
+      - linux-focal-rocm-py3_10-build
+      - target-determination
+    with:
+      build-environment: linux-focal-rocm-py3.10
+      docker-image: ${{ needs.linux-focal-rocm-py3_10-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-rocm-py3_10-build.outputs.test-matrix }}
+    secrets: inherit
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@ -182,14 +182,14 @@ jobs:
      cuda-arch-list: 8.6
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
-          { config: "default", shard: 2, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
-          { config: "default", shard: 3, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
-          { config: "default", shard: 4, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
-          { config: "default", shard: 5, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
-          { config: "default", shard: 6, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
-          { config: "default", shard: 7, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
-          { config: "default", shard: 8, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+          { config: "default", shard: 1, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+          { config: "default", shard: 2, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+          { config: "default", shard: 3, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+          { config: "default", shard: 4, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+          { config: "default", shard: 5, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+          { config: "default", shard: 6, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+          { config: "default", shard: 7, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+          { config: "default", shard: 8, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
        ]}
    secrets: inherit

--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -184,7 +184,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-focal-py3.9-clang10
      docker-image-name: pytorch-linux-focal-py3.9-clang10
      test-matrix: |
@ -385,6 +385,9 @@ jobs:
    name: linux-focal-cpu-py3.10-gcc11-bazel-test
    uses: ./.github/workflows/_bazel-build-test.yml
    needs: get-label-type
+    permissions:
+      id-token: write
+      contents: read
    with:
      runner: "${{ needs.get-label-type.outputs.label-type }}linux.large"
      build-environment: linux-focal-cuda12.6-py3.10-gcc11-bazel-test
--- a/.github/workflows/s390.yml
+++ b/.github/workflows/s390.yml
@ -21,6 +21,6 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-s390x-binary-manywheel
-      docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x
      runner: linux.s390x
    secrets: inherit
--- a/.github/workflows/s390x-periodic.yml
+++ b/.github/workflows/s390x-periodic.yml
@ -42,7 +42,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    with:
      build-environment: linux-s390x-binary-manywheel
-      docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x
      runner: linux.s390x
      test-matrix: |
        { include: [
@ -70,7 +70,7 @@ jobs:
      - target-determination
    with:
      build-environment: linux-s390x-binary-manywheel
-      docker-image: pytorch/manylinuxs390x-builder:cpu-s390x-main
+      docker-image: pytorch/manylinuxs390x-builder:cpu-s390x
      test-matrix: ${{ needs.linux-manylinux-2_28-py3-cpu-s390x-build.outputs.test-matrix }}
      timeout-minutes: 600
      use-gha: "yes"
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@ -143,9 +143,9 @@ jobs:
      docker-image-name: pytorch-linux-jammy-py3-clang15-asan
      test-matrix: |
        { include: [
-          { config: "slow", shard: 1, num_shards: 3, runner: "linux.4xlarge" },
-          { config: "slow", shard: 2, num_shards: 3, runner: "linux.4xlarge" },
-          { config: "slow", shard: 3, num_shards: 3, runner: "linux.4xlarge" },
+          { config: "slow", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "slow", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "slow", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
        ]}
      sync-tag: asan-build
    secrets: inherit
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@ -2,7 +2,7 @@ name: Upload test stats

 on:
  workflow_run:
-    workflows: [pull, trunk, periodic, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, rocm-mi300, inductor-micro-benchmark, inductor-micro-benchmark-x86, inductor-cu124, inductor-rocm, inductor-rocm-mi300, mac-mps]
+    workflows: [pull, trunk, periodic, periodic-rocm-mi300, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, rocm-mi300, inductor-micro-benchmark, inductor-micro-benchmark-x86, inductor-cu124, inductor-rocm, inductor-rocm-mi300, mac-mps]
    types:
      - completed

--- a/.github/workflows/upload-torch-dynamo-perf-stats.yml
+++ b/.github/workflows/upload-torch-dynamo-perf-stats.yml
@ -2,7 +2,7 @@ name: Upload torch dynamo performance stats

 on:
  workflow_run:
-    workflows: [inductor-A100-perf-nightly, inductor-perf-nightly-A10g, inductor-perf-nightly-aarch64, inductor-perf-nightly-x86, perf-nightly-macos, inductor-perf-nightly-rocm, inductor-perf-nightly-h100]
+    workflows: [inductor-A100-perf-nightly, inductor-perf-nightly-A10g, inductor-perf-nightly-aarch64, inductor-perf-nightly-x86, inductor-perf-nightly-macos, inductor-perf-nightly-rocm, inductor-perf-nightly-h100]
    types:
      - completed

--- a/.gitignore
+++ b/.gitignore
@ -178,6 +178,7 @@ compile_commands.json
 *.egg-info/
 docs/source/scripts/activation_images/
 docs/source/scripts/quantization_backend_configs/
+docs/source/scripts/lr_scheduler_images/

 ## General

--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -1165,14 +1165,6 @@ exclude_patterns = [
    'test/quantization/core/test_utils.py',
    'test/quantization/core/test_workflow_module.py',
    'test/quantization/core/test_workflow_ops.py',
-    'test/quantization/eager/__init__.py',
-    'test/quantization/eager/test_bias_correction_eager.py',
-    'test/quantization/eager/test_equalize_eager.py',
-    'test/quantization/eager/test_fuse_eager.py',
-    'test/quantization/eager/test_model_numerics.py',
-    'test/quantization/eager/test_numeric_suite_eager.py',
-    'test/quantization/eager/test_quantize_eager_ptq.py',
-    'test/quantization/eager/test_quantize_eager_qat.py',
    'test/quantization/fx/__init__.py',
    'test/quantization/fx/test_equalize_fx.py',
    'test/quantization/fx/test_model_report_fx.py',
@ -1723,7 +1715,7 @@ command = [
    '@{{PATHSFILE}}'
 ]
 include_patterns = [
-   'torch/**/not-exist.py'
+   'torch/_inductor/**/*.py'
 ]
 is_formatter = false

--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -1,4 +1,5 @@
 load("@bazel_skylib//lib:paths.bzl", "paths")
+load("@com_github_google_flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
 load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
 load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test")
 load("@rules_python//python:defs.bzl", "py_library", "py_test")
@ -659,6 +660,15 @@ cc_library(
 # torch
 torch_cuda_headers = glob(["torch/csrc/cuda/*.h"])

+flatbuffer_cc_library(
+    name = "torch_flatbuffers",
+    srcs = [
+        "torch/csrc/jit/serialization/mobile_bytecode.fbs",
+    ],
+    flatc_args = ["--cpp", "--gen-mutable", "--scoped-enums"],
+    out_prefix = "torch/csrc/jit/serialization/",
+)
+
 cc_library(
    name = "torch_headers",
    hdrs = if_cuda(
@ -672,6 +682,7 @@ cc_library(
        ],
        exclude = [
            "torch/csrc/*/generated/*.h",
+            "torch/csrc/jit/serialization/mobile_bytecode_generated.h",
        ] + torch_cuda_headers,
    ) + GENERATED_AUTOGRAD_CPP + [":version_h"],
    includes = [
@ -686,6 +697,7 @@ cc_library(
    deps = [
        ":aten_headers",
        ":caffe2_headers",
+        ":torch_flatbuffers",
        "//c10",
        "@com_github_google_flatbuffers//:flatbuffers",
        "@local_config_python//:python_headers",
--- a/CODEOWNERS
+++ b/CODEOWNERS
@ -165,9 +165,9 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd
 /torch/_export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi

 # Dynamic Shapes
-/torch/fx/experimental/symbolic_shapes.py @bobren @laithsakka
-/torch/fx/experimental/sym_node.py @bobren @laithsakka
-/torch/fx/experimental/recording.py @bobren @laithsakka
+/torch/fx/experimental/symbolic_shapes.py @bobrenjc93 @laithsakka
+/torch/fx/experimental/sym_node.py @bobrenjc93 @laithsakka
+/torch/fx/experimental/recording.py @bobrenjc93 @laithsakka

 # serialization-related files
 /aten/src/ATen/MapAllocator* @mikaylagawarecki
--- a/README.md
+++ b/README.md
@ -221,7 +221,7 @@ Other potentially useful environment variables may be found in `setup.py`.

 #### Get the PyTorch Source
 ```bash
-git clone --recursive https://github.com/pytorch/pytorch
+git clone https://github.com/pytorch/pytorch
 cd pytorch
 # if you are updating an existing checkout
 git submodule sync
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@ -384,12 +384,11 @@ endif()
    ${native_quantized_hip_hip}
    ${native_transformers_hip_hip} ${native_transformers_src_hip_hip}
  )
-  if(WIN32) # Windows doesn't support Composable Kernels and Triton
+  if(WIN32) # Windows doesn't support Composable Kernels
    file(GLOB native_hip_bgemm "native/hip/bgemm_kernels/*.hip")
    file(GLOB native_hip_ck "native/hip/ck*.hip")
    exclude(ATen_HIP_SRCS "${ATen_HIP_SRCS}"
-      ${native_hip_bgemm} ${native_hip_ck}
-      ${native_transformers_hip_hip} ${native_transformers_hip_cpp})
+      ${native_hip_bgemm} ${native_hip_ck})
  endif()
  # TODO: Codegen separate files for HIP and use those (s/cuda_generated_sources/hip_generated_sources)
  list(APPEND all_hip_cpp
@ -408,9 +407,6 @@ endif()
    ${miopen_cpp}
    ${all_hip_cpp}
  )
-  if(WIN32) # Windows doesn't support Triton
-    exclude(all_hip_cpp "${all_hip_cpp}" ${native_transformers_hip_cpp})
-  endif()
 endif()

 if(USE_XPU)
--- a/aten/src/ATen/NestedTensorImpl.cpp
+++ b/aten/src/ATen/NestedTensorImpl.cpp
@ -182,7 +182,7 @@ NestedTensorImpl::NestedTensorImpl(
      "coverage, and works with torch.compile.");
  auto storage_device = storage_.device();
  TORCH_INTERNAL_ASSERT(
-      storage_device.is_cpu() || storage_device.is_cuda() || storage_device.is_xpu() || storage_device.is_privateuseone(),
-      "NestedTensorImpl storage must be either CUDA, CPU, XPU or ", get_privateuse1_backend(), " but got ",
+      storage_device.is_cpu() || storage_device.is_cuda() || storage_device.is_xpu() || storage_device.is_hpu() || storage_device.is_privateuseone(),
+      "NestedTensorImpl storage must be either CUDA, CPU, XPU, HPU or ", get_privateuse1_backend(), " but got ",
      storage_device);
  validate_nested_tensor_metadata(nested_sizes_, nested_strides_, storage_offsets_);
--- a/aten/src/ATen/OpaqueTensorImpl.h
+++ b/aten/src/ATen/OpaqueTensorImpl.h
@ -29,12 +29,20 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl {
      bool is_non_overlapping_and_dense = true)
      : TensorImpl(key_set, data_type, device),
        opaque_handle_(std::move(opaque_handle)) {
-    set_storage_access_should_throw();
-    set_custom_sizes_strides(SizesStridesPolicy::CustomStrides);
-    sizes_and_strides_.set_sizes(sizes);
-    refresh_numel();
-    // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer)
-    is_non_overlapping_and_dense_ = is_non_overlapping_and_dense;
+    constructor_impl(sizes, is_non_overlapping_and_dense);
+  }
+
+  OpaqueTensorImpl(
+      TensorImpl::ImplType impl_type,
+      c10::Storage&& storage,
+      at::DispatchKeySet key_set,
+      const caffe2::TypeMeta data_type,
+      OpaqueHandle opaque_handle,
+      c10::IntArrayRef sizes,
+      bool is_non_overlapping_and_dense = true)
+      : TensorImpl(impl_type, std::move(storage), key_set, data_type),
+        opaque_handle_(std::move(opaque_handle)) {
+    constructor_impl(sizes, is_non_overlapping_and_dense);
  }

  // Destructor doesn't call release_resources because it's
@ -181,6 +189,17 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl {
    return "OpaqueTensorImpl";
  }

+  void constructor_impl(
+      c10::IntArrayRef sizes,
+      bool is_non_overlapping_and_dense) {
+    set_storage_access_should_throw();
+    set_custom_sizes_strides(SizesStridesPolicy::CustomStrides);
+    sizes_and_strides_.set_sizes(sizes);
+    refresh_numel();
+    // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer)
+    is_non_overlapping_and_dense_ = is_non_overlapping_and_dense;
+  }
+
  OpaqueHandle opaque_handle_;
 };

--- a/aten/src/ATen/ParallelOpenMP.cpp
+++ b/aten/src/ATen/ParallelOpenMP.cpp
@ -10,15 +10,13 @@
 #include <mkl.h>
 #endif

+#if AT_MKLDNN_ENABLED()
+#include <ATen/native/mkldnn/IDeepRegistration.h>
+#endif
+
 #include <caffe2/utils/threadpool/pthreadpool-cpp.h>

 namespace at {
-#if AT_MKLDNN_ENABLED()
-namespace native::mkldnn {
-// NOLINTNEXTLINE(misc-use-internal-linkage)
-void clear_computation_cache();
-} // namespace native::mkldnn
-#endif

 namespace {
 // Number of threads set by the user
--- a/aten/src/ATen/TensorIndexing.h
+++ b/aten/src/ATen/TensorIndexing.h
@ -222,8 +222,8 @@ inline Tensor applySlice(
        ? (*self_sizes)[dim]
        : self.sym_size(dim);
    if (!disable_slice_optimization &&
-        TORCH_GUARD_SIZE_OBLIVIOUS(start.sym_eq(0)) &&
-        TORCH_GUARD_SIZE_OBLIVIOUS(length.sym_eq(stop)) && step == 1) {
+        TORCH_STATICALLY_KNOWN_TRUE(start.sym_eq(0)) &&
+        TORCH_STATICALLY_KNOWN_TRUE(length.sym_eq(stop)) && step == 1) {
      return self;
    }
  }
--- a/aten/src/ATen/core/CachingHostAllocator.cpp
+++ b/aten/src/ATen/core/CachingHostAllocator.cpp
@ -0,0 +1,33 @@
+#include <ATen/core/CachingHostAllocator.h>
+
+#include <array>
+
+namespace at {
+
+namespace {
+
+static std::array<HostAllocator*, at::COMPILE_TIME_MAX_DEVICE_TYPES>
+    allocator_array{};
+static std::array<uint8_t, at::COMPILE_TIME_MAX_DEVICE_TYPES>
+    allocator_priority{};
+
+} // anonymous namespace
+
+void setHostAllocator(
+    at::DeviceType device_type,
+    at::HostAllocator* allocator,
+    uint8_t priority) {
+  if (priority >= allocator_priority[static_cast<int>(device_type)]) {
+    allocator_array[static_cast<int>(device_type)] = allocator;
+    allocator_priority[static_cast<int>(device_type)] = priority;
+  }
+}
+
+at::HostAllocator* getHostAllocator(at::DeviceType device_type) {
+  auto* allocator = allocator_array[static_cast<int>(device_type)];
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      allocator, "Host Allocator for ", device_type, " is not set.");
+  return allocator;
+}
+
+} // namespace at
--- a/aten/src/ATen/core/CachingHostAllocator.h
+++ b/aten/src/ATen/core/CachingHostAllocator.h
@ -1,4 +1,5 @@
 #include <c10/core/Allocator.h>
+#include <c10/core/Stream.h>
 #include <c10/core/thread_pool.h>
 #include <c10/util/flat_hash_map.h>
 #include <c10/util/llvmMathExtras.h>
@ -46,7 +47,7 @@ namespace {
 }

 // Struct containing memory allocator summary statistics for host.
-struct HostStats {
+struct TORCH_API HostStats {
  // COUNT: allocations requested by client code. Note that active
  // count can be extracted by looking at current allocations
  Stat allocation;
@ -274,7 +275,8 @@ struct CachingHostAllocatorImpl {
    }
  }

-  virtual bool record_event(void* ptr, void* ctx, S stream) {
+  virtual bool record_event(void* ptr, void* ctx, c10::Stream s) {
+    S stream = S(s);
    auto* block = reinterpret_cast<B*>(ctx);

    // Note: we need to check if the passed-in `ctx` is valid. This is because
@ -620,24 +622,49 @@ protected:
  alignas(64) HostStatsStaged stats_;
 };

-template <typename T>
-struct CachingHostAllocatorInterface : public at::Allocator {
+struct TORCH_API HostAllocator : public at::Allocator {
+  // Associates the pinned memory allocation with a stream to track
+  // dependencies. This ensures the memory won't be reused until the stream's
+  // operations complete
+  virtual bool record_event(void* ptr, void* ctx, c10::Stream stream) = 0;
+
+  // Frees all cached pinned memory and returns it to the system, clearing the
+  // allocator's internal cache
+  virtual void empty_cache() = 0;
+
+  // Returns comprehensive statistics about the allocator's memory usage,
+  // allocation patterns, and timing metrics
+  virtual HostStats get_stats() = 0;
+
+  // Resets the cumulative allocation statistics
+  virtual void reset_accumulated_stats() = 0;
+
+  // Resets the peak memory usage metrics
+  virtual void reset_peak_stats() = 0;
+};
+
+template <typename T, c10::DeleterFnPtr deleteFunc>
+struct CachingHostAllocatorInterface : public HostAllocator {
  CachingHostAllocatorInterface() : impl_(std::make_unique<T>()) {}

  at::DataPtr allocate(size_t size) override {
-    TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for allocate");
+    auto ptr_and_ctx = impl_->allocate(size);
+    return {
+        ptr_and_ctx.first,
+        ptr_and_ctx.second,
+        deleteFunc, // Use the template parameter deleter function
+        at::DeviceType::CPU};
  }

  void free(void* ctx) {
    impl_->free(ctx);
  }

-  template <typename S>
-  bool record_event(void* ptr, void* ctx, S stream) {
+  bool record_event(void* ptr, void* ctx, c10::Stream stream) override {
    return impl_->record_event(ptr, ctx, stream);
  }

-  void empty_cache() {
+  void empty_cache() override {
    impl_->empty_cache();
  }

@ -646,20 +673,54 @@ struct CachingHostAllocatorInterface : public at::Allocator {
    impl_->copy_data(dest, src, count);
  }

-  HostStats getStats() {
+  HostStats get_stats() override {
    return impl_->getStats();
  }

-  void resetAccumulatedStats() {
+  void reset_accumulated_stats() override {
    impl_->resetAccumulatedStats();
  }

-  void resetPeakStats() {
+  void reset_peak_stats() override {
    impl_->resetPeakStats();
  }

  std::unique_ptr<T> impl_;
 };

+#define DECLARE_HOST_ALLOCATOR(name, impl, deleter, instance)       \
+  void deleter(void* ptr);                                          \
+  struct name final                                                 \
+      : public at::CachingHostAllocatorInterface<impl, deleter> {}; \
+  static name instance;                                                    \
+  void deleter(void* ptr) {                                         \
+    instance.free(ptr);                                             \
+  }
+
+/**
+ * Set the host allocator for DeviceType `device_type`. This allocator manages
+ * pinned memory on the host that can be accessed efficiently by the specified
+ * device type. Note that this function is not thread-safe.
+ */
+TORCH_API void setHostAllocator(
+    at::DeviceType device_type,
+    at::HostAllocator* allocator,
+    uint8_t priority = 0);
+
+TORCH_API at::HostAllocator* getHostAllocator(at::DeviceType device_type);
+
+template <DeviceType device_type>
+struct HostAllocatorRegistry {
+  explicit HostAllocatorRegistry(HostAllocator* allocator) {
+    at::setHostAllocator(device_type, allocator);
+  }
+};
+
+#define REGISTER_HOST_ALLOCATOR(device_type, allocator) \
+  namespace {                                           \
+  static at::HostAllocatorRegistry<device_type>         \
+      g_host_allocator_registry_instance(allocator);    \
+  }
+
 } // namespace at
 C10_DIAGNOSTIC_POP()
--- a/aten/src/ATen/core/Dict.h
+++ b/aten/src/ATen/core/Dict.h
@ -116,10 +116,7 @@ public:

  DictIterator(const DictIterator& rhs): entryRef_(rhs.entryRef_) {}
  DictIterator(DictIterator&& rhs) noexcept: entryRef_(std::move(rhs.entryRef_)) {}
-  DictIterator& operator=(const DictIterator& rhs) {
-    entryRef_ = rhs.entryRef_;
-    return *this;
-  }
+  DictIterator& operator=(const DictIterator& rhs) = default;
  DictIterator& operator=(DictIterator&& rhs) noexcept {
    entryRef_ = std::move(rhs.entryRef_);
    return *this;
--- a/aten/src/ATen/core/alias_info.h
+++ b/aten/src/ATen/core/alias_info.h
@ -1,4 +1,6 @@
 #pragma once
+#include <set>
+#include <string>
 #include <unordered_set>
 #include <vector>
 #include <ATen/core/symbol.h>
@ -18,6 +20,15 @@ namespace c10 {
 */
 class AliasInfo {
 public:
+  AliasInfo() = default;
+  AliasInfo(bool is_write, const std::set<std::string>& before_qual_strings, const std::set<std::string>& after_qual_strings) : isWrite_(is_write) {
+    for (const auto& s: before_qual_strings) {
+      beforeSets_.insert(Symbol::fromQualString(s));
+    }
+    for (const auto& s : after_qual_strings) {
+      afterSets_.insert(Symbol::fromQualString(s));
+    }
+  }
  // Symbol for the set that can alias anything
  static Symbol wildcardSet() {
    static const Symbol wc = Symbol::fromQualString("alias::*");
--- a/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h
+++ b/aten/src/ATen/core/dispatch/DispatchKeyExtractor.h
@ -225,8 +225,7 @@ struct TORCH_API DispatchKeyExtractor final {

  explicit DispatchKeyExtractor(c10::utils::bitset dispatch_arg_indices_reverse)
      : dispatch_arg_indices_reverse_(dispatch_arg_indices_reverse),
-        nonFallthroughKeys_(DispatchKeySet::FULL),
-        requiresBitsetPerBackend_(false) {
+        nonFallthroughKeys_(DispatchKeySet::FULL) {
    for (const auto i : c10::irange(nonFallthroughKeysPerBackend_.size())) {
      nonFallthroughKeysPerBackend_[i] = DispatchKeySet::FULL;
    }
@ -252,7 +251,7 @@ struct TORCH_API DispatchKeyExtractor final {
  // Flag to tell us if we can use the single set of nonFallthroughKeys_ (fast
  // path), or if we need to fall back to the slower path and check
  // nonFallthroughKeysPerBackend_
-  bool requiresBitsetPerBackend_;
+  bool requiresBitsetPerBackend_{false};
 };

 } // namespace c10
--- a/aten/src/ATen/core/function_schema.cpp
+++ b/aten/src/ATen/core/function_schema.cpp
@ -41,9 +41,15 @@ FunctionSchema FunctionSchema::cloneWithRealTypes(bool with_symint) const {
    }
  };
  std::vector<Argument> new_arguments, new_returns;
-  std::transform(arguments().begin(), arguments().end(), std::back_inserter(new_arguments), cloneWithRealTypes);
+  new_arguments.reserve(arguments().size());
+  for (const auto& arg: arguments()) {
+    new_arguments.push_back(cloneWithRealTypes(arg));
+  }
  // NB: SymInt returns are always SymInt
-  std::transform(returns().begin(), returns().end(), std::back_inserter(new_returns), alwaysCloneWithRealTypes);
+  new_returns.reserve(returns().size());
+  for (const auto& ret: returns()) {
+    new_returns.push_back(alwaysCloneWithRealTypes(ret));
+  }
  return FunctionSchema(
    name(),
    overload_name(),
--- a/aten/src/ATen/core/library.cpp
+++ b/aten/src/ATen/core/library.cpp
@ -1,6 +1,7 @@
 #include <torch/library.h>

 #include <ATen/core/dispatch/Dispatcher.h>
+#include <fmt/format.h>

 namespace torch {

@ -11,7 +12,7 @@ namespace {
 #ifdef STRIP_ERROR_MESSAGES
    return std::string();
 #else
-    return c10::str("registered at ", file, ":", line);
+    return fmt::format("registered at {}:{}", file, line);
 #endif
  }

@ -58,7 +59,7 @@ void Library::reset() {

 #define ERROR_CONTEXT "(Error occurred while processing ", toString(kind_), " block at ", file_, ":", line_, ")"

-#ifdef TORCH_LIBRARY_THREAD_UNSAFE_LAZY_INIT
+#if defined(TORCH_LIBRARY_THREAD_UNSAFE_LAZY_INIT) && defined(C10_MOBILE)
 namespace detail {
  std::vector<TorchLibraryInit*> torch_library_initializers;
 } // namespace detail
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@ -248,7 +248,6 @@ namespace at::cuda::blas {
    CUDABLAS_NONNEGINT_CHECK(bgemm<Dtype>, num_batches);  \
  } while (0)

-
 namespace {
 // Following the pattern of CuSparseDescriptor
 // Defined here for now because this is the only place cublas_lt interface is
@ -334,9 +333,10 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
 } // namespace


-template <typename Dtype>
-static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
-  cudaDataType_t abcType = CUDA_R_32F;
+template <typename Dtype, typename C_Dtype = Dtype>
+static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
+  cudaDataType_t abType = CUDA_R_32F;
+  cudaDataType_t cType = CUDA_R_32F;
  cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
  cudaDataType_t scaleType = CUDA_R_32F;
 #ifndef USE_ROCM
@ -346,7 +346,8 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
  void * alpha_ptr = &alpha;
  void * beta_ptr = &beta;
  if constexpr (std::is_same_v<Dtype, double>) {
-    abcType = CUDA_R_64F;
+    abType = CUDA_R_64F;
+    cType = CUDA_R_64F;
    computeType = CUBLAS_COMPUTE_64F;
    scaleType = CUDA_R_64F;
  } else if constexpr (std::is_same_v<Dtype, float>) {
@ -354,11 +355,13 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
      computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
    }
  } else if constexpr (std::is_same_v<Dtype, c10::complex<double>>) {
-    abcType = CUDA_C_64F;
+    abType = CUDA_C_64F;
+    cType = CUDA_C_64F;
    computeType = CUBLAS_COMPUTE_64F;
    scaleType = CUDA_C_64F;
  } else if constexpr (std::is_same_v<Dtype, c10::complex<float>>) {
-    abcType = CUDA_C_32F;
+    abType = CUDA_C_32F;
+    cType = CUDA_C_32F;
    scaleType = CUDA_C_32F;
  } else if constexpr (std::is_same_v<Dtype, at::Half>) {
 #ifndef USE_ROCM
@ -371,9 +374,11 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
      beta_ptr = &hbeta;
    }
 #endif
-    abcType = CUDA_R_16F;
+    abType = CUDA_R_16F;
+    cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
  } else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
-    abcType = CUDA_R_16BF;
+    abType = CUDA_R_16BF;
+    cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
  } else {
    static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented");
  }
@ -395,9 +400,9 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
            at::globalContext()._SMCarveout_EXPERIMENTAL().value());
  }
 #endif
-  CuBlasLtMatrixLayout Adesc(abcType, m, k, lda, opa == CUBLAS_OP_T);
-  CuBlasLtMatrixLayout Bdesc(abcType, k, n, ldb, opb == CUBLAS_OP_T);
-  CuBlasLtMatrixLayout Cdesc(abcType, m, n, ldc);
+  CuBlasLtMatrixLayout Adesc(abType, m, k, lda, opa == CUBLAS_OP_T);
+  CuBlasLtMatrixLayout Bdesc(abType, k, n, ldb, opb == CUBLAS_OP_T);
+  CuBlasLtMatrixLayout Cdesc(cType, m, n, ldc);

  if (num_batches > 1) {
    int num_batches_as_int = static_cast<int>(num_batches);
@ -482,8 +487,10 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
      ldb,
      " ldc ",
      ldc,
-      " abcType ",
-      abcType,
+      " abType ",
+      abType,
+      " cType ",
+      cType,
      " computeType ",
      computeType,
      " scaleType ",
@ -495,9 +502,9 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
 }


-template <typename Dtype>
-inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
-  static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublas: not implemented");
+template <typename Dtype, typename C_Dtype = Dtype>
+inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
+  TORCH_CHECK(false, "at::cuda::blas::bgemm: not implemented for input type ", typeid(Dtype).name(), " and output type ", typeid(C_Dtype).name());
 }

 template <>
@ -556,8 +563,8 @@ void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::com
      reinterpret_cast<cuComplex*>(c), ldc, stridec, num_batches));
 }

-template <>
-void bgemm_internal_cublas<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)) {
+template <typename C_Dtype>
+inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
  // See Note [Writing Nondeterministic Operations]
  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
@ -602,23 +609,33 @@ void bgemm_internal_cublas<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)) {
      handle, opa, opb, m, n, k,
      alpha_ptr, a, CUDA_R_16F, lda, stridea,
      b, CUDA_R_16F, ldb, strideb, beta_ptr,
-      c, CUDA_R_16F, ldc, stridec,
+      c, std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16F, ldc, stridec,
      num_batches, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
  } else {
    for (const auto i : c10::irange(num_batches)) {
-      at::cuda::blas::gemm<at::Half>(
-        transa, transb,
-        m, n, k,
-        alpha, (a + i * stridea), lda,
-        (b + i * strideb), ldb, beta,
-        (c + i * stridec), ldc);
+      if (std::is_same_v<C_Dtype, float>) {
+        float* c_ptr = (float*)(c + i * stridec);
+        at::cuda::blas::gemm<at::Half, float>(
+            transa, transb,
+            m, n, k,
+            alpha, (a + i * stridea), lda,
+            (b + i * strideb), ldb, beta,
+            c_ptr, ldc);
+      } else {
+        at::cuda::blas::gemm<at::Half>(
+            transa, transb,
+            m, n, k,
+            alpha, (a + i * stridea), lda,
+            (b + i * strideb), ldb, beta,
+            (c + i * stridec), ldc);
+      }
    }
  }
 #endif // USE_ROCM
 }

-template <>
-void bgemm_internal_cublas<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
+template <typename C_Dtype>
+inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
  // See Note [Writing Nondeterministic Operations]
  globalContext().alertCuBLASConfigNotDeterministic();
  BGEMM_CHECK_ARGVALUES(at::BFloat16);
@ -635,15 +652,37 @@ void bgemm_internal_cublas<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16))
  auto compute_type = CUDA_R_32F;
 #endif
  TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedEx(handle,
-                                  opa, opb, (int)m, (int)n, (int)k,
-                                  (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea,
-                                  b, CUDA_R_16BF, (int)ldb, strideb,
-                                  (void*)&fbeta, c, CUDA_R_16BF, (int)ldc, stridec,
-                                  (int)num_batches,
-                                  compute_type,
-                                  CUBLAS_GEMM_DEFAULT_TENSOR_OP));
+                              opa, opb, (int)m, (int)n, (int)k,
+                              (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea,
+                              b, CUDA_R_16BF, (int)ldb, strideb,
+                              (void*)&fbeta, c, std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16BF,
+                              (int)ldc, stridec, (int)num_batches,
+                              compute_type,
+                              CUBLAS_GEMM_DEFAULT_TENSOR_OP));
 }

+template <>
+void bgemm_internal_cublas<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)) {  // at::Half inputs, at::Half output
+  bgemm_internal_cublas_half_helper<at::Half>(CUDABLAS_BGEMM_ARGS(at::Half));  // helper's template argument is the output (C) dtype
+}
+
+template <>
+void bgemm_internal_cublas<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) {  // at::Half inputs, float output
+  bgemm_internal_cublas_half_helper<float>(CUDABLAS_BGEMM_ARGS(at::Half));  // same helper, instantiated for a float C buffer
+}
+
+template <>
+void bgemm_internal_cublas<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {  // at::BFloat16 inputs, at::BFloat16 output
+  bgemm_internal_cublas_bfloat16_helper<at::BFloat16>(CUDABLAS_BGEMM_ARGS(at::BFloat16));  // helper's template argument is the output (C) dtype
+}
+
+
+template <>
+void bgemm_internal_cublas<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) {  // at::BFloat16 inputs, float output
+  bgemm_internal_cublas_bfloat16_helper<float>(CUDABLAS_BGEMM_ARGS(at::BFloat16));  // same helper, instantiated for a float C buffer
+}
+
+
 template <>
 void bgemm_internal<double>(CUDABLAS_BGEMM_ARGTYPES(double))
 {
@ -742,9 +781,50 @@ void bgemm_internal<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16))
  }
 }

-template <typename DType>
-inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES(DType)) {
-  tunable::GemmStridedBatchedParams<DType> params;
+template<>  // batched gemm with at::Half inputs (a, b) written to a float output (c)
+void bgemm_internal<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float))
+{
+  if (at::globalContext().allowFP16AccumulationCuBLAS()) {
+    // Do not allow fp16 reductions with fp32 output
+    TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS");
+  }
+
+  if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
+    if (!bgemm_internal_cublaslt<at::Half, float>(CUDABLAS_BGEMM_ARGS(at::Half))) {  // Lt path returned false
+      bgemm_internal_cublas<at::Half, float>(CUDABLAS_BGEMM_ARGS(at::Half));  // fall back to the plain cuBLAS path
+    }
+  }
+#if defined(USE_ROCM) && !defined(_MSC_VER)
+  else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
+    TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported for ROCm");  // message names bgemm, matching the check above
+  }
+#endif
+  else {
+    bgemm_internal_cublas<at::Half, float>(CUDABLAS_BGEMM_ARGS(at::Half));  // any other preferred backend uses plain cuBLAS
+  }
+}
+
+template<>  // batched gemm with at::BFloat16 inputs (a, b) written to a float output (c)
+void bgemm_internal<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float))
+{
+  if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
+    if (!bgemm_internal_cublaslt<at::BFloat16, float>(CUDABLAS_BGEMM_ARGS(at::BFloat16))) {  // Lt path returned false
+      bgemm_internal_cublas<at::BFloat16, float>(CUDABLAS_BGEMM_ARGS(at::BFloat16));  // fall back to the plain cuBLAS path
+    }
+  }
+#if defined(USE_ROCM) && !defined(_MSC_VER)
+  else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
+    TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is not supported for ROCm");  // message names bgemm, the operation actually requested
+  }
+#endif
+  else {
+    bgemm_internal_cublas<at::BFloat16, float>(CUDABLAS_BGEMM_ARGS(at::BFloat16));  // any other preferred backend uses plain cuBLAS
+  }
+}
+
+template <typename Dtype, typename C_Dtype = Dtype>
+inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
+  tunable::GemmStridedBatchedParams<Dtype> params;
  params.transa = transa;
  params.transb = transb;
  params.m = m;
@ -767,19 +847,19 @@ inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES(DType)) {
  bool transb_ = ((transb != 'n') && (transb != 'N'));

  if (transa_ && transb_) {
-    static tunable::GemmStridedBatchedTunableOp<DType, tunable::BlasOp::T, tunable::BlasOp::T> bgemm{};
+    static tunable::GemmStridedBatchedTunableOp<Dtype, tunable::BlasOp::T, tunable::BlasOp::T> bgemm{};
    bgemm(&params);
  }
  else if (transa_ && !transb_) {
-    static tunable::GemmStridedBatchedTunableOp<DType, tunable::BlasOp::T, tunable::BlasOp::N> bgemm{};
+    static tunable::GemmStridedBatchedTunableOp<Dtype, tunable::BlasOp::T, tunable::BlasOp::N> bgemm{};
    bgemm(&params);
  }
  else if (!transa_ && transb_) {
-    static tunable::GemmStridedBatchedTunableOp<DType, tunable::BlasOp::N, tunable::BlasOp::T> bgemm{};
+    static tunable::GemmStridedBatchedTunableOp<Dtype, tunable::BlasOp::N, tunable::BlasOp::T> bgemm{};
    bgemm(&params);
  }
  else if (!transa_ && !transb_) {
-    static tunable::GemmStridedBatchedTunableOp<DType, tunable::BlasOp::N, tunable::BlasOp::N> bgemm{};
+    static tunable::GemmStridedBatchedTunableOp<Dtype, tunable::BlasOp::N, tunable::BlasOp::N> bgemm{};
    bgemm(&params);
  }
  else {
@ -853,9 +933,35 @@ void bgemm<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
  }
 }

-template <typename Dtype>
-inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
-  static_assert(false && sizeof(Dtype), "at::cuda::blas::gemm_internal_cublas: not implemented");
+template <>
+void bgemm<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) {  // at::Half inputs, float output
+  #ifdef USE_ROCM
+  TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported for ROCm");  // unconditional failure when built with USE_ROCM
+  #endif
+  // TODO: Support tuning for Half inputs and FP32 output
+  bgemm_internal<at::Half, float>(CUDABLAS_BGEMM_ARGS(at::Half));  // forwarded directly; no bgemm_tunable call here
+}
+
+
+template <>
+void bgemm<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) {  // at::BFloat16 inputs, float output
+  #ifdef USE_ROCM
+  TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is not supported for ROCm");  // unconditional failure when built with USE_ROCM
+  #else
+    cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();  // properties of the current device
+
+    if (prop->major < 8)  // reject devices whose major compute capability is below 8
+      TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is only supported for CUDA devices with compute capability 8.0 or higher");
+  #endif
+  // TODO: Support tuning for BFloat16 inputs and FP32 output
+  bgemm_internal<at::BFloat16, float>(CUDABLAS_BGEMM_ARGS(at::BFloat16));  // forwarded directly; no bgemm_tunable call here
+}
+
+
+
+template <typename Dtype, typename C_Dtype = Dtype>
+inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
+  TORCH_CHECK(false, "at::cuda::blas::gemm: not implemented for input type ", typeid(Dtype).name(), " and output type ", typeid(C_Dtype).name());
 }

 template <>
@ -914,8 +1020,8 @@ void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::compl
      reinterpret_cast<cuComplex*>(c), ldc));
 }

-template <>
-void gemm_internal_cublas<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
+template <typename C_Dtype>
+inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
  // See Note [Writing Nondeterministic Operations]
  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
@ -994,7 +1100,7 @@ void gemm_internal_cublas<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
        ldb,
        beta_ptr,
        c,
-        CUDA_R_16F,
+        std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16F,
        ldc,
        compute_type,
        CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@ -1016,14 +1122,14 @@ void gemm_internal_cublas<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
        ldb,
        &fbeta,
        c,
-        CUDA_R_16F,
+        std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16F,
        ldc));
  }
 #endif
 }

-template <>
-void gemm_internal_cublas<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
+template <typename C_Dtype>
+inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
  globalContext().alertCuBLASConfigNotDeterministic();
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  cublasOperation_t opa = _cublasOpFromChar(transa);
@ -1060,15 +1166,35 @@ void gemm_internal_cublas<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
      ldb,
      &fbeta,
      c,
-      CUDA_R_16BF,
+      std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16BF,
      ldc,
      compute_type,
      CUBLAS_GEMM_DEFAULT_TENSOR_OP));
  TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
 }

-template <typename Dtype>
-inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
+template <>
+void gemm_internal_cublas<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {  // at::Half inputs, at::Half output
+  gemm_internal_cublas_half_helper<at::Half>(CUDABLAS_GEMM_ARGS(at::Half));  // helper's template argument is the output (C) dtype
+}
+
+template <>
+void gemm_internal_cublas<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) {  // at::Half inputs, float output
+  gemm_internal_cublas_half_helper<float>(CUDABLAS_GEMM_ARGS(at::Half));  // same helper, instantiated for a float C buffer
+}
+
+template <>
+void gemm_internal_cublas<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {  // at::BFloat16 inputs, at::BFloat16 output
+  gemm_internal_cublas_bfloat16_helper<at::BFloat16>(CUDABLAS_GEMM_ARGS(at::BFloat16));  // helper's template argument is the output (C) dtype
+}
+
+template <>
+void gemm_internal_cublas<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) {  // at::BFloat16 inputs, float output
+  gemm_internal_cublas_bfloat16_helper<float>(CUDABLAS_GEMM_ARGS(at::BFloat16));  // same helper, instantiated for a float C buffer
+}
+
+template <typename Dtype, typename C_Dtype = Dtype>
+inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
  // forward to bgemm implementation but set strides and batches to 0
  if (!bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0)) {
    gemm_internal_cublas(CUDABLAS_GEMM_ARGS(Dtype));
@ -1180,8 +1306,45 @@ void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16))
  }
 }

-template <typename DType>
-inline void gemm_tunable(CUDABLAS_GEMM_ARGTYPES(DType)) {
+template<>  // gemm with at::Half inputs (a, b) written to a float output (c)
+void gemm_internal<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float))
+{
+  if (at::globalContext().allowFP16AccumulationCuBLAS()) {
+    // Do not allow fp16 reductions with fp32 output
+    TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS");
+  }
+
+  if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
+    gemm_internal_cublaslt<at::Half, float>(CUDABLAS_GEMM_ARGS(at::Half));  // Lt wrapper; it calls gemm_internal_cublas itself when the Lt call returns false
+  }
+#if defined(USE_ROCM) && !defined(_MSC_VER)
+  else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
+    TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");  // Ck backend always raises for this dtype pair
+  }
+#endif
+  else {
+    gemm_internal_cublas<at::Half, float>(CUDABLAS_GEMM_ARGS(at::Half));  // any other preferred backend uses plain cuBLAS
+  }
+}
+
+template<>  // gemm with at::BFloat16 inputs (a, b) written to a float output (c)
+void gemm_internal<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float))
+{
+  if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
+    gemm_internal_cublaslt<at::BFloat16, float>(CUDABLAS_GEMM_ARGS(at::BFloat16));  // Lt wrapper; it calls gemm_internal_cublas itself when the Lt call returns false
+  }
+#if defined(USE_ROCM) && !defined(_MSC_VER)
+  else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
+    TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm");  // message names this specialization's input dtype
+  }
+#endif
+  else {
+    gemm_internal_cublas<at::BFloat16, float>(CUDABLAS_GEMM_ARGS(at::BFloat16));  // any other preferred backend uses plain cuBLAS
+  }
+}
+
+template <typename DType, typename C_Dtype>
+inline void gemm_tunable(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(DType, C_Dtype)) {
  tunable::GemmParams<DType> params;
  params.transa = transa;
  params.transb = transb;
@ -1287,8 +1450,32 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
  }
 }

+template <>
+void gemm<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) {  // at::Half inputs, float output
+  #ifdef USE_ROCM
+  TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");  // unconditional failure when built with USE_ROCM
+  #endif
+  // TODO: Support Tuning for fp16-fp32 gemm
+  gemm_internal<at::Half, float>(CUDABLAS_GEMM_ARGS(at::Half));  // forwarded directly; no gemm_tunable call here
+}

-template <typename Dtype>
+
+template <>
+void gemm<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) {  // at::BFloat16 inputs, float output
+  #ifdef USE_ROCM
+  TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm");  // unconditional failure when built with USE_ROCM
+  #else
+    cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();  // properties of the current device
+
+    if (prop->major < 8)  // reject devices whose major compute capability is below 8
+      TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is only supported for CUDA devices with compute capability 8.0 or higher");
+  #endif
+  // TODO: Support Tuning for bf16-fp32 gemm
+  gemm_internal<at::BFloat16, float>(CUDABLAS_GEMM_ARGS(at::BFloat16));  // forwarded directly; no gemm_tunable call here
+}
+
+
+template <typename Dtype, typename C_Dtype>
 bool gemm_and_bias(
    bool transpose_mat1,
    bool transpose_mat2,
@ -1301,13 +1488,27 @@ bool gemm_and_bias(
    const Dtype* mat2_ptr,
    int64_t mat2_ld,
    const Dtype* bias,
-    Dtype* result_ptr,
+    C_Dtype* result_ptr,
    int64_t result_ld,
    GEMMAndBiasActivationEpilogue activation) {
+
+  if (std::is_same_v<C_Dtype, float> && std::is_same_v<Dtype, at::BFloat16>) {
+    #ifdef USE_ROCM
+    TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm");
+    #endif
+  } else if (std::is_same_v<C_Dtype, float> && std::is_same_v<Dtype, at::Half>) {
+    #ifdef USE_ROCM
+    TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");
+    #endif
+    if (at::globalContext().allowFP16AccumulationCuBLAS())
+      TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS");
+  }
+
  using opmath_t = at::opmath_type<Dtype>;
  opmath_t beta_val = 0; // bias is added in epilogue

-  cudaDataType_t abcType = CUDA_R_32F;
+  cudaDataType_t abType = CUDA_R_32F;
+  cudaDataType_t cType = CUDA_R_32F;
  cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
  cudaDataType_t scaleType = CUDA_R_32F;
  void * alpha_ptr = &alpha_val;
@ -1317,14 +1518,14 @@ bool gemm_and_bias(
  at::Half hbeta_val;
 #endif
  if constexpr (std::is_same_v<Dtype, double>) {
-    abcType = CUDA_R_64F;
+    abType = CUDA_R_64F;
+    cType = CUDA_R_64F;
    computeType = CUBLAS_COMPUTE_64F;
    scaleType = CUDA_R_64F;
  } else if constexpr (std::is_same_v<Dtype, float>) {
    if (at::globalContext().allowTF32CuBLAS()) {
      computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
    }
-    abcType = CUDA_R_32F;
  } else if constexpr (std::is_same_v<Dtype, at::Half>) {
 #ifndef USE_ROCM
    cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
@ -1337,9 +1538,11 @@ bool gemm_and_bias(
      beta_ptr = &hbeta_val;
    }
 #endif
-    abcType = CUDA_R_16F;
+    abType = CUDA_R_16F;
+    cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
  } else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
-    abcType = CUDA_R_16BF;
+    abType = CUDA_R_16BF;
+    cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
  }

  CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType);
@ -1369,9 +1572,9 @@ bool gemm_and_bias(
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias);
  }

-  CuBlasLtMatrixLayout Adesc(abcType, m, k, mat1_ld, transpose_mat1);
-  CuBlasLtMatrixLayout Bdesc(abcType, k, n, mat2_ld, transpose_mat2);
-  CuBlasLtMatrixLayout Cdesc(abcType, m, n, result_ld);
+  CuBlasLtMatrixLayout Adesc(abType, m, k, mat1_ld, transpose_mat1);
+  CuBlasLtMatrixLayout Bdesc(abType, k, n, mat2_ld, transpose_mat2);
+  CuBlasLtMatrixLayout Cdesc(cType, m, n, result_ld);

  CuBlasLtMatmulPreference preference;
  // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind
@ -1449,8 +1652,10 @@ bool gemm_and_bias(
      mat2_ld,
      " result_ld ",
      result_ld,
-      " abcType ",
-      abcType,
+      " abType ",
+      abType,
+      " cType ",
+      cType,
      " computeType ",
      computeType,
      " scaleType ",
@ -1509,6 +1714,22 @@ template bool gemm_and_bias(
    int64_t result_ld,
    GEMMAndBiasActivationEpilogue activation);

+template bool gemm_and_bias(  // explicit instantiation: at::Half inputs and bias, float result
+    bool transpose_mat1,
+    bool transpose_mat2,
+    int64_t m,
+    int64_t n,
+    int64_t k,
+    at::opmath_type<at::Half> alpha_val,
+    const at::Half* mat1_ptr,
+    int64_t mat1_ld,
+    const at::Half* mat2_ptr,
+    int64_t mat2_ld,
+    const at::Half* bias,
+    float* result_ptr,  // result pointer type differs from the input type (C_Dtype = float)
+    int64_t result_ld,
+    GEMMAndBiasActivationEpilogue activation);
+
 template bool gemm_and_bias(
    bool transpose_mat1,
    bool transpose_mat2,
@ -1525,6 +1746,22 @@ template bool gemm_and_bias(
    int64_t result_ld,
    GEMMAndBiasActivationEpilogue activation);

+template bool gemm_and_bias(  // explicit instantiation: at::BFloat16 inputs and bias, float result
+    bool transpose_mat1,
+    bool transpose_mat2,
+    int64_t m,
+    int64_t n,
+    int64_t k,
+    at::opmath_type<at::BFloat16> alpha_val,
+    const at::BFloat16* mat1_ptr,
+    int64_t mat1_ld,
+    const at::BFloat16* mat2_ptr,
+    int64_t mat2_ld,
+    const at::BFloat16* bias,
+    float* result_ptr,  // result pointer type differs from the input type (C_Dtype = float)
+    int64_t result_ld,
+    GEMMAndBiasActivationEpilogue activation);
+
 void scaled_gemm(
    char transa,
    char transb,
--- a/aten/src/ATen/cuda/CUDABlas.h
+++ b/aten/src/ATen/cuda/CUDABlas.h
@ -39,18 +39,26 @@ private:

 /* LEVEL 3 BLAS FUNCTIONS */

-#define CUDABLAS_GEMM_ARGTYPES(Dtype)                                                       \
+#define CUDABLAS_GEMM_ARGTYPES(Dtype) CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype)
+
+#define CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)                                  \
  char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type<Dtype> alpha,  \
      const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, at::opmath_type<Dtype> beta,\
-      Dtype *c, int64_t ldc
+      C_Dtype *c, int64_t ldc

 #define CUDABLAS_GEMM_ARGS(Dtype) transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc

-template <typename Dtype>
-inline void gemm(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
+#define CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT \
+    ((std::is_same<Dtype, at::Half>::value || std::is_same<Dtype, at::BFloat16>::value) && std::is_same<C_Dtype, float>::value)
+
+template <typename Dtype, typename C_Dtype = Dtype, typename std::enable_if<!CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT, Dtype>::type* = nullptr>
+inline void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
  static_assert(false&&sizeof(Dtype),"at::cuda::blas::gemm: not implemented");
 }

+template <typename Dtype, typename C_Dtype, typename std::enable_if<CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT, Dtype>::type* = nullptr>
+void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype));
+
 template <>
 void gemm<double>(CUDABLAS_GEMM_ARGTYPES(double));
 template <>
@ -63,9 +71,13 @@ template <>
 void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
 template <>
 void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
+template<>
+void gemm<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float));
+template<>
+void gemm<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float));

-template <typename Dtype>
-inline void gemm_internal(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
+template <typename Dtype, typename C_Dtype = Dtype>
+inline void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
  static_assert(false&&sizeof(Dtype),"at::cuda::blas::gemm_internal: not implemented");
 }

@ -81,6 +93,10 @@ template <>
 void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
 template <>
 void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
+template<>
+void gemm_internal<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float));
+template<>
+void gemm_internal<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float));

 enum GEMMAndBiasActivationEpilogue {
  None,
@ -90,7 +106,7 @@ enum GEMMAndBiasActivationEpilogue {

 // NOTE: GELU activation is not supported prior to CUDA 11.4 and will
 // do nothing if passed in that case.
-template <typename Dtype>
+template <typename Dtype, typename C_Dtype = Dtype>
 bool gemm_and_bias(
    bool transpose_mat1,
    bool transpose_mat2,
@ -103,7 +119,7 @@ bool gemm_and_bias(
    const Dtype* mat2_ptr,
    int64_t mat2_ld,
    const Dtype* bias,
-    Dtype* result_ptr,
+    C_Dtype* result_ptr,
    int64_t result_ld,
    GEMMAndBiasActivationEpilogue activation = GEMMAndBiasActivationEpilogue::None);

@ -145,20 +161,25 @@ void scaled_gemm(
    bool use_fast_accum,
    bool use_rowwise);

-#define CUDABLAS_BGEMM_ARGTYPES(Dtype)                                                        \
+#define CUDABLAS_BGEMM_ARGTYPES(Dtype)  CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype)
+
+#define CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)                                   \
  char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type<Dtype> alpha,    \
      const Dtype *a, int64_t lda, int64_t stridea,                                           \
      const Dtype *b, int64_t ldb, int64_t strideb,                                           \
-      at::opmath_type<Dtype> beta, Dtype *c, int64_t ldc, int64_t stridec, int64_t num_batches
+      at::opmath_type<Dtype> beta, C_Dtype *c, int64_t ldc, int64_t stridec, int64_t num_batches

 #define CUDABLAS_BGEMM_ARGS(Dtype) \
  transa, transb, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, beta, c, ldc, stridec, num_batches

-template <typename Dtype>
-inline void bgemm(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
+template <typename Dtype, typename C_Dtype = Dtype, typename std::enable_if<!CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT, Dtype>::type* = nullptr>
+inline void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
  static_assert(false&&sizeof(Dtype),"at::cuda::blas::bgemm: not implemented");
 }

+template <typename Dtype, typename C_Dtype, typename std::enable_if<CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT, Dtype>::type* = nullptr>
+void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype));
+
 template <>
 void bgemm<double>(CUDABLAS_BGEMM_ARGTYPES(double));
 template <>
@ -171,9 +192,13 @@ template <>
 void bgemm<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half));
 template <>
 void bgemm<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16));
+template<>
+void bgemm<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float));
+template<>
+void bgemm<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float));

-template <typename Dtype>
-inline void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
+template <typename Dtype, typename C_Dtype = Dtype>
+inline void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
  static_assert(false&&sizeof(Dtype),"at::cuda::blas::bgemm_internal: not implemented");
 }

@ -189,6 +214,10 @@ template <>
 void bgemm_internal<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half));
 template <>
 void bgemm_internal<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16));
+template<>
+void bgemm_internal<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float));
+template<>
+void bgemm_internal<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float));

 #define CUDABLAS_TRSM_ARGTYPES(Dtype)                                  \
  cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, \
--- a/aten/src/ATen/cuda/CachingHostAllocator.cpp
+++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp
@ -249,58 +249,13 @@ struct CUDACachingHostAllocatorImpl
  }
 };

-void raw_local_deleter(void* ptr);
+DECLARE_HOST_ALLOCATOR(
+    CUDACachingHostAllocator,
+    CUDACachingHostAllocatorImpl,
+    raw_local_deleter,
+    caching_host_allocator);

-struct CUDACachingHostAllocator final
-    : public CachingHostAllocatorInterface<CUDACachingHostAllocatorImpl> {
-  at::DataPtr allocate(size_t size) override {
-    auto ptr_and_ctx = impl_->allocate(size);
-    return {
-        ptr_and_ctx.first,
-        ptr_and_ctx.second,
-        &raw_local_deleter,
-        at::DeviceType::CPU};
-  }
-};
-
-CUDACachingHostAllocator caching_host_allocator;
-
-static inline CUDACachingHostAllocator& getCUDACachingHostAllocator() {
-  return caching_host_allocator;
-}
-
-void raw_local_deleter(void* ptr) {
-  getCUDACachingHostAllocator().free(ptr);
-}
+REGISTER_HOST_ALLOCATOR(at::kCUDA, &caching_host_allocator)

 } // anonymous namespace
-
-bool CachingHostAllocator_recordEvent(
-    void* ptr,
-    void* ctx,
-    at::cuda::CUDAStream stream) {
-  return getCUDACachingHostAllocator().record_event(ptr, ctx, stream);
-}
-
-// Releases cached pinned memory allocations via cudaHostFree
-void CachingHostAllocator_emptyCache() {
-  getCUDACachingHostAllocator().empty_cache();
-}
-
-at::Allocator* getCachingHostAllocator() {
-  return &getCUDACachingHostAllocator();
-}
-
-at::HostStats CachingHostAllocator_getStats() {
-  return getCUDACachingHostAllocator().getStats();
-}
-
-void CachingHostAllocator_resetAccumulatedStats() {
-  return getCUDACachingHostAllocator().resetAccumulatedStats();
-}
-
-void CachingHostAllocator_resetPeakStats() {
-  return getCUDACachingHostAllocator().resetPeakStats();
-}
-
 } // namespace at::cuda
--- a/aten/src/ATen/cuda/CachingHostAllocator.h
+++ b/aten/src/ATen/cuda/CachingHostAllocator.h
@ -18,25 +18,52 @@ namespace at::cuda {
 // call between host and device, and passed the corresponding context from the
 // allocation. This is currently invoked by at::native::copy_kernel_cuda.
 //
-TORCH_CUDA_CPP_API c10::Allocator* getCachingHostAllocator();
+C10_DEPRECATED_MESSAGE(
+  "at::cuda::getCachingHostAllocator() is deprecated. Please use at::getHostAllocator(at::kCUDA) instead.")
+inline TORCH_CUDA_CPP_API at::HostAllocator* getCachingHostAllocator() {  // deprecated shim defined inline in the header
+  return at::getHostAllocator(at::kCUDA);  // same call the deprecation message recommends
+}

 // Records an event in the specified stream. The allocation corresponding to the
 // input `ptr`/`ctx` will not be re-used until the event has occurred.
-TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent(
+C10_DEPRECATED_MESSAGE(
+  "at::cuda::CachingHostAllocator_recordEvent(...) is deprecated. Please use at::getHostAllocator(at::kCUDA)->record_event(...) instead.")
+inline TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent(
    void* ptr,
    void* ctx,
-    c10::cuda::CUDAStream stream);
-
-// Releases cached pinned memory allocations via cudaHostFree
-TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache();
-
-inline TORCH_CUDA_CPP_API at::DataPtr HostAlloc(size_t size) {
-  return getCachingHostAllocator()->allocate(size);
+    c10::cuda::CUDAStream stream) {
+  return getHostAllocator(at::kCUDA)->record_event(ptr, ctx, stream.unwrap());
 }

-TORCH_CUDA_CPP_API at::HostStats CachingHostAllocator_getStats();
+// Releases cached pinned memory allocations via cudaHostFree
+C10_DEPRECATED_MESSAGE(
+  "at::cuda::CachingHostAllocator_emptyCache() is deprecated. Please use at::getHostAllocator(at::kCUDA)->empty_cache() instead.")
+inline TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache() {  // deprecated shim defined inline in the header
+  getHostAllocator(at::kCUDA)->empty_cache();  // delegates to the allocator looked up for at::kCUDA
+}

-TORCH_CUDA_CPP_API void CachingHostAllocator_resetAccumulatedStats();
-TORCH_CUDA_CPP_API void CachingHostAllocator_resetPeakStats();
+C10_DEPRECATED_MESSAGE(
+  "at::cuda::HostAlloc(...) is deprecated. Please use at::getHostAllocator(at::kCUDA)->allocate(...) instead.")
+inline TORCH_CUDA_CPP_API at::DataPtr HostAlloc(size_t size) {  // deprecated shim defined inline in the header
+  return getHostAllocator(at::kCUDA)->allocate(size);  // delegates to the allocator looked up for at::kCUDA
+}
+
+C10_DEPRECATED_MESSAGE(
+  "at::cuda::CachingHostAllocator_getStats() is deprecated. Please use at::getHostAllocator(at::kCUDA)->get_stats() instead.")
+inline TORCH_CUDA_CPP_API at::HostStats CachingHostAllocator_getStats() {  // deprecated shim defined inline in the header
+  return getHostAllocator(at::kCUDA)->get_stats();  // delegates to the allocator looked up for at::kCUDA
+}
+
+C10_DEPRECATED_MESSAGE(
+  "at::cuda::CachingHostAllocator_resetAccumulatedStats() is deprecated. Please use at::getHostAllocator(at::kCUDA)->reset_accumulated_stats() instead.")
+inline TORCH_CUDA_CPP_API void CachingHostAllocator_resetAccumulatedStats() {  // deprecated shim defined inline in the header
+  getHostAllocator(at::kCUDA)->reset_accumulated_stats();  // delegates to the allocator looked up for at::kCUDA
+}
+
+C10_DEPRECATED_MESSAGE(
+  "at::cuda::CachingHostAllocator_resetPeakStats() is deprecated. Please use at::getHostAllocator(at::kCUDA)->reset_peak_stats() instead.")
+inline TORCH_CUDA_CPP_API void CachingHostAllocator_resetPeakStats() {  // deprecated shim defined inline in the header
+  getHostAllocator(at::kCUDA)->reset_peak_stats();  // delegates to the allocator looked up for at::kCUDA
+}

 } // namespace at::cuda
--- a/aten/src/ATen/cuda/PinnedMemoryAllocator.h
+++ b/aten/src/ATen/cuda/PinnedMemoryAllocator.h
@ -1,11 +1,10 @@
 #pragma once

-#include <c10/core/Allocator.h>
 #include <ATen/cuda/CachingHostAllocator.h>

 namespace at::cuda {

-inline TORCH_CUDA_CPP_API at::Allocator* getPinnedMemoryAllocator() {
-  return getCachingHostAllocator();
+inline TORCH_CUDA_CPP_API at::HostAllocator* getPinnedMemoryAllocator() {
+  return at::getHostAllocator(at::kCUDA);
 }
 } // namespace at::cuda
--- a/aten/src/ATen/cuda/tunable/GemmCommon.h
+++ b/aten/src/ATen/cuda/tunable/GemmCommon.h
@ -469,7 +469,7 @@ private:
  bool duplicate_inputs_{false};
 };

-template <typename T>
+template <typename T, typename C_Dtype = T>
 struct GemmStridedBatchedParams : OpParams {
  std::string BLASSignature() const override {
    std::string alpha_str = to_string_opmath<T>(alpha);
@ -477,7 +477,7 @@ struct GemmStridedBatchedParams : OpParams {
    return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: %ld, stride_b: %ld, stride_c: %ld, stride_d: %ld, "
      "alpha: %s, beta: %s, transA: %c, transB: %c, batch_count: %ld, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }",
      m, n, k, lda, ldb, ldc, ldc, stride_a, stride_b, stride_c, stride_c, alpha_str, beta_str, transa, transb, batch,
-      BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), ComputeTypeFor<T>(), ComputeTypeFor<T>());
+      BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<C_Dtype>(C_Dtype{}), BLASTypeName<T>(T{}), ComputeTypeFor<T>(), ComputeTypeFor<T>());
  }

  std::string Signature() const override {
@ -517,7 +517,7 @@ struct GemmStridedBatchedParams : OpParams {
    c10::DeviceIndex device = 0;
    AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
    size_t c_size = GetSizeC();
-    copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
+    copy->c = static_cast<C_Dtype*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
    AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
        copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
    if (duplicate_inputs) {
@ -544,7 +544,7 @@ struct GemmStridedBatchedParams : OpParams {
  }

  TuningStatus NumericalCheck(GemmStridedBatchedParams<T> *other) {
-    auto c_dtype = c10::CppTypeToScalarType<T>::value;
+    auto c_dtype = c10::CppTypeToScalarType<C_Dtype>::value;
    return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL;
  }

@ -561,7 +561,7 @@ struct GemmStridedBatchedParams : OpParams {
  int64_t ldb{};
  int64_t stride_b{};
  at::opmath_type<T> beta;
-  T* c{};
+  C_Dtype* c{};
  int64_t ldc{};
  int64_t stride_c{};
  int64_t batch{};
--- a/aten/src/ATen/cuda/tunable/Tunable.h
+++ b/aten/src/ATen/cuda/tunable/Tunable.h
@ -40,7 +40,7 @@ enum TORCH_CUDA_CPP_API TuningStatus {
 class TORCH_CUDA_CPP_API ResultEntry {
  public:
    explicit ResultEntry(std::string  key, double time) : key_(std::move(key)), time_(time) {}
-    explicit ResultEntry(std::string  key, double time, const std::string& blas_sig ) : key_(std::move(key)), time_(time), blas_sig_(blas_sig) {}
+    explicit ResultEntry(std::string  key, double time, std::string blas_sig ) : key_(std::move(key)), time_(time), blas_sig_(std::move(blas_sig)) {}
    bool operator==(const ResultEntry& other) const { return key_ == other.key_; }
    bool operator!=(const ResultEntry& other) const { return key_ != other.key_; }
    operator std::string () { return key_; }
--- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
+++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp
@ -773,6 +773,15 @@ std::tuple<Tensor, std::optional<int64_t>> scatter_add_batch_rule(
                            self, self_bdim, dim, index, index_bdim, src, src_bdim);
 }

+// Batch rule for the in-place scatter_add_: hands the ATen scatter_add_ entry
+// point plus every tensor/batch-dim pair straight to scatter_batch_rule and
+// returns its (tensor, batch dim) result.
+std::tuple<Tensor, std::optional<int64_t>> scatter_add__batch_rule(
+    const Tensor& self, std::optional<int64_t> self_bdim,
+    int64_t dim,
+    const Tensor& index, std::optional<int64_t> index_bdim,
+    const Tensor& src, std::optional<int64_t> src_bdim) {
+  return scatter_batch_rule(
+      ATEN_FN(scatter_add_),
+      self, self_bdim,
+      dim,
+      index, index_bdim,
+      src, src_bdim);
+}
+
 std::tuple<Tensor, std::optional<int64_t>> scatter_reduce_batch_rule(
    const Tensor& self, std::optional<int64_t> self_bdim,
    int64_t dim,
@ -1278,6 +1287,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) {
  VMAP_SUPPORT2(scatter, value, scatter_value_batch_rule);
  VMAP_SUPPORT2(scatter, src, scatter_src_batch_rule);
  VMAP_SUPPORT(scatter_add, scatter_add_batch_rule);
+  VMAP_SUPPORT(scatter_add_, scatter_add__batch_rule);
  VMAP_SUPPORT2(scatter, reduce, scatter_reduce_batch_rule);
  VMAP_SUPPORT2(scatter, value_reduce, scatter_value_reduce_batch_rule);
  VMAP_SUPPORT2(scatter_reduce, two, scatter_reduce_two_batch_rule);
--- a/aten/src/ATen/native/BatchLinearAlgebra.cpp
+++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp
@ -849,10 +849,7 @@ namespace at::native {
 // linear algebra operations

 template<class scalar_t>
-void lapackCholeskySolve(char uplo, int n, int nrhs, scalar_t *a, int lda, scalar_t *b, int ldb, int *info);
-
-template<class scalar_t, class value_t=scalar_t>
-void lapackSymeig(char jobz, char uplo, int n, scalar_t *a, int lda, value_t *w, scalar_t *work, int lwork, value_t *rwork, int *info);
+static void lapackCholeskySolve(char uplo, int n, int nrhs, scalar_t *a, int lda, scalar_t *b, int ldb, int *info);

 template<> void lapackLu<c10::complex<double>>(int m, int n, c10::complex<double> *a, int lda, int *ipiv, int *info) {
  zgetrf_(&m, &n, reinterpret_cast<std::complex<double>*>(a), &lda, ipiv, info);
@ -2693,12 +2690,6 @@ Tensor& ormqr_out(const Tensor& input, const Tensor& tau, const Tensor& other, b
  TORCH_CHECK(other.dim() >= 2, "torch.ormqr: other must have at least 2 dimensions.");

  int64_t left_size_condition = left ? -2 : -1;
-  TORCH_CHECK(
-      other.size(left_size_condition) >= tau.size(-1),
-      "torch.ormqr: other.shape[",
-      left_size_condition,
-      "] must be greater than or equal to tau.shape[-1]");
-
  TORCH_CHECK(
      other.size(left_size_condition) == input.size(-2),
      "torch.ormqr: other.shape[",
@ -2706,8 +2697,10 @@ Tensor& ormqr_out(const Tensor& input, const Tensor& tau, const Tensor& other, b
      "] must be equal to input.shape[-2]");

  TORCH_CHECK(
-      tau.size(-1) <= input.size(-1),
-      "torch.ormqr: tau.shape[-1] must be less than or equal to input.shape[-1]");
+      std::min(other.size(left_size_condition), input.size(-1)) == tau.size(-1),
+      "torch.ormqr: tau.shape[-1] must be equal to min(other.shape[",
+      left_size_condition,
+      "], input.shape[-1])");

  TORCH_CHECK(
      input.dim() - tau.dim() == 1,
@ -2716,6 +2709,7 @@ Tensor& ormqr_out(const Tensor& input, const Tensor& tau, const Tensor& other, b
      tau.dim(),
      " and input.ndim is equal to ",
      input.dim());
+
  TORCH_CHECK(
      input.dim() == other.dim(),
      "torch.ormqr: ",
--- a/aten/src/ATen/native/BinaryOps.cpp
+++ b/aten/src/ATen/native/BinaryOps.cpp
@ -1383,35 +1383,35 @@ Tensor bitwise_right_shift(const Scalar& self, const Tensor& other) {
 }

 template <typename Stub>
-Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Tensor& other, Stub& stub) {
+static Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Tensor& other, Stub& stub) {
  auto iter = TensorIterator::comparison_op(result, self, other);
  stub(iter.device_type(), iter);
  return result;
 }

 template <typename OutImpl>
-Tensor comparison_op(const Tensor& self, const Tensor& other, OutImpl& out_impl) {
+static Tensor comparison_op(const Tensor& self, const Tensor& other, OutImpl& out_impl) {
  Tensor result = at::empty({0}, self.options().dtype(kBool));
  return out_impl(result, self, other);
 }

 template <typename OutImpl>
-Tensor& comparison_op_(Tensor& self, const Tensor& other, OutImpl& out_impl) {
+static Tensor& comparison_op_(Tensor& self, const Tensor& other, OutImpl& out_impl) {
  return out_impl(self, self, other);
 }

 template <typename OutImpl>
-Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Scalar& other, OutImpl& out_impl) {
+static Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Scalar& other, OutImpl& out_impl) {
  return out_impl(result, self, wrapped_scalar_tensor(other));
 }

 template <typename OutImpl>
-Tensor comparison_op(const Tensor& self, const Scalar& other, OutImpl& out_impl) {
+static Tensor comparison_op(const Tensor& self, const Scalar& other, OutImpl& out_impl) {
  return comparison_op(self, wrapped_scalar_tensor(other), out_impl);
 }

 template <typename OutImpl>
-Tensor& comparison_op_(Tensor& self, const Scalar& other, OutImpl& out_impl) {
+static Tensor& comparison_op_(Tensor& self, const Scalar& other, OutImpl& out_impl) {
  return out_impl(self, self, wrapped_scalar_tensor(other));
 }

--- a/aten/src/ATen/native/Blas.cpp
+++ b/aten/src/ATen/native/Blas.cpp
@ -7,6 +7,11 @@
 #include <ATen/Config.h>

 #include <ATen/native/mkldnn/Matmul.h>
+#include <ATen/native/mkldnn/Linear.h>
+#include <ATen/native/Resize.h>
+#if !defined(__s390x__) && !defined(__powerpc__)
+#include <cpuinfo.h>
+#endif

 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/CPUFunctions.h>
@ -24,6 +29,9 @@
 #include <ATen/ops/mv_native.h>
 #include <ATen/ops/scalar_tensor_native.h>
 #include <ATen/ops/vdot_native.h>
+#include <ATen/ops/_scaled_mm_native.h>
+#include <ATen/ops/mul.h>
+#include <ATen/ops/matmul.h>
 #endif

 namespace at::meta {
@ -222,4 +230,92 @@ Tensor vdot(const Tensor &self, const Tensor &other){

 }

+// Reference implementation of _scaled_mm for CPU.
+//
+// Upcasts both Float8 operands to float32, multiplies each by its per-tensor
+// scale, runs a regular matmul, optionally adds `bias`, and copies the result
+// into `out` (resized to {mat1.size(0), mat2.size(1)}) in out's dtype.
+//
+// `scale_result` and `use_fast_accum` are accepted so the signature matches
+// the public out-variant, but this implementation does not read them.
+//
+// Raises through TORCH_CHECK when: an operand is not 2-D, the inner
+// dimensions disagree, either scale is not a single-element tensor, the bias
+// element count differs from mat2.size(1), `out_dtype` disagrees with out's
+// dtype, or an operand is not a Float8 tensor.
+//
+// Returns `out`.
+static Tensor&
+_scaled_mm_out_cpu_emulated(const Tensor& mat1, const Tensor& mat2,
+          const Tensor& scale_a,
+          const Tensor& scale_b,
+          const std::optional<at::Tensor>& bias,
+          const std::optional<at::Tensor>& scale_result,
+          std::optional<c10::ScalarType> out_dtype,
+          bool use_fast_accum,
+          Tensor& out) {
+  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix");
+  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix");
+  TORCH_CHECK(
+      mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
+      mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
+
+  // The scale shapes come from the caller, so a violation is a usage error
+  // (TORCH_CHECK), not an internal invariant failure.
+  TORCH_CHECK((scale_a.numel() == 1 && scale_b.numel() == 1), "Now _scaled_mm only supports per-tensor scaling for CPU backend.");
+  TORCH_CHECK(!bias || bias->numel() == mat2.sizes()[1], "Bias must be size ", mat2.sizes()[1],
+       " but got ", bias->numel());
+
+  // Check types
+  TORCH_CHECK(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type");
+  TORCH_CHECK(isFloat8Type(mat1.scalar_type()), "Expected mat1 to be Float8 matrix got ", mat1.scalar_type());
+  TORCH_CHECK(isFloat8Type(mat2.scalar_type()), "Expected mat2 to be Float8 matrix got ", mat2.scalar_type());
+
+  auto mat1_c = mat1.contiguous();
+  auto mat2_c = mat2.contiguous();
+  IntArrayRef mat1_sizes = mat1_c.sizes();
+  IntArrayRef mat2_sizes = mat2_c.sizes();
+  at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
+
+  float input_scale = scale_a.item<float>();
+  float weight_scale = scale_b.item<float>();
+  // Both operands are upcast from their contiguous copies so the two sides
+  // are treated the same way.
+  auto fp32_mat1 = at::mul(mat1_c.to(kFloat), input_scale);
+  auto fp32_mat2 = at::mul(mat2_c.to(kFloat), weight_scale);
+  auto out_tmp = at::matmul(fp32_mat1, fp32_mat2);
+  if (bias) {
+    out_tmp.add_(bias.value());
+  }
+  out_tmp = out_tmp.to(out.scalar_type());
+  out.copy_(out_tmp);
+  return out;
+}
+
+// Out-variant entry point of _scaled_mm for CPU.
+//
+// When oneDNN is compiled in, enabled by the user, and the CPU reports the
+// required AMX feature (amx_int8 when both operands share a dtype, amx_fp16
+// when they differ), the call is forwarded to mkldnn_scaled_mm. In every
+// other case the float32 emulation is used. Returns `out`.
+//
+// The fast path is compiled only where <cpuinfo.h> is included: this file
+// includes it only when neither __s390x__ nor __powerpc__ is defined, so the
+// cpuinfo_has_x86_* queries must sit behind the same condition.
+// NOTE(review): cpuinfo expects cpuinfo_initialize() to have run before
+// feature queries; presumably that happens elsewhere in ATen - confirm.
+Tensor&
+_scaled_mm_out_cpu(const Tensor& mat1, const Tensor& mat2,
+          const Tensor& scale_a,
+          const Tensor& scale_b,
+          const std::optional<at::Tensor>& bias,
+          const std::optional<at::Tensor>& scale_result,
+          std::optional<c10::ScalarType> out_dtype,
+          bool use_fast_accum,
+          Tensor& out) {
+#if AT_MKLDNN_ENABLED() && !defined(__s390x__) && !defined(__powerpc__)
+  if (at::globalContext().userEnabledMkldnn()) {
+    bool mixed_dtype = mat1.scalar_type() != mat2.scalar_type();
+    if ((!mixed_dtype && cpuinfo_has_x86_amx_int8()) ||
+        (mixed_dtype && cpuinfo_has_x86_amx_fp16())) {
+      return mkldnn_scaled_mm(
+          mat1,
+          mat2,
+          scale_a,
+          scale_b,
+          bias,
+          scale_result,
+          out_dtype,
+          use_fast_accum,
+          out);
+    }
+  }
+#endif
+  return _scaled_mm_out_cpu_emulated(mat1, mat2, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
+}
+
+// Functional variant of _scaled_mm for CPU: allocates an empty result tensor
+// with mat_a's options (dtype overridden by `out_dtype` when provided) and
+// delegates all work to _scaled_mm_out_cpu.
+Tensor
+_scaled_mm_cpu(const Tensor& mat_a, const Tensor& mat_b,
+          const Tensor& scale_a,
+          const Tensor& scale_b,
+          const std::optional<at::Tensor>& bias,
+          const std::optional<at::Tensor>& scale_result,
+          std::optional<c10::ScalarType> out_dtype,
+          bool use_fast_accum) {
+  // With no explicit request the result keeps mat_a's dtype.
+  const auto result_dtype = out_dtype.has_value() ? *out_dtype : mat_a.scalar_type();
+  const auto result_options = mat_a.options().dtype(result_dtype);
+  Tensor out = at::empty({0}, result_options);
+  return _scaled_mm_out_cpu(
+      mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
+}
+
 }  // namespace at::native
--- a/aten/src/ATen/native/BlasKernel.cpp
+++ b/aten/src/ATen/native/BlasKernel.cpp
@ -116,21 +116,44 @@ void fp16_gemv_trans(
  fp16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }

-void bf16_gemv_trans(
-    const int m,
-    const int n,
-    const at::BFloat16 alpha,
-    const at::BFloat16* a,
-    const int lda,
-    const at::BFloat16* x,
-    const int incx,
-    const at::BFloat16 beta,
-    at::BFloat16* y,
-    const int incy);
-
 #endif // !defined(C10_MOBILE)

 #if defined(__aarch64__) && !defined(C10_MOBILE)
+#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
+// Non-transposed half-precision GEMV, y = A * x, accumulated directly in
+// fp16 inside `y`.
+//
+//   m, n : loop bounds for rows (i) and columns (j) of `a`
+//   a    : matrix data; column j starts at a + lda * j
+//   x    : input vector, read at unit stride (x[j])
+//   y    : output vector, written at unit stride
+//
+// There is no alpha/beta handling here. Rows are consumed four at a time
+// (i += 4) with no scalar tail, so m is expected to be a multiple of 4; the
+// caller in this file only takes this path when m % 4 == 0.
+// NOTE(review): when n == 0 the loops never run and `y` is left unwritten.
+static void fp16_gemv_notrans_fp16_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) {
+  for (auto j = 0; j < n; j++) {
+    // Broadcast x[j] so it can be used as the lane operand of the FMA below.
+    auto vecCol = vdup_n_f16(x[j]);
+    const auto* column = a + lda * j;
+    for (auto i = 0; i < m; i += 4) {
+      auto yf16 = y + i;
+      auto matRow = vld1_f16(column + i);
+      // For the first column start from zero instead of loading y, so the
+      // previous contents of y are never read.
+      auto resVec = j != 0 ? vld1_f16(yf16) : vdup_n_f16(0);
+      resVec = vfma_lane_f16(resVec, matRow, vecCol, 0);
+      vst1_f16(yf16, resVec);
+    }
+  }
+}
+#endif
+
+// Non-transposed half-precision GEMV, y = A * x, accumulated in a float32
+// scratch buffer and converted to fp16 once at the end.
+//
+//   m, n : loop bounds for rows (i) and columns (j) of `a`
+//   a    : matrix data; column j starts at a + lda * j
+//   x    : input vector, read at unit stride (x[j])
+//   y    : output vector, written at unit stride
+//
+// There is no alpha/beta handling here. Rows are consumed four at a time
+// (i += 4) with no scalar tail, so m is expected to be a multiple of 4; the
+// caller in this file only takes this path when m % 4 == 0.
+static void fp16_gemv_notrans_fp32_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) {
+  // float32 accumulator, one entry per output row (value-initialized to 0).
+  std::vector<float> sum(m);
+  for (auto j = 0; j < n; j++) {
+    auto vecCol = vdup_n_f32(x[j]);
+    const auto* column = a + lda * j;
+    for (auto i = 0; i < m; i += 4) {
+      auto sf32 = sum.data() + i;
+      // Widen four fp16 matrix entries to float32 before the FMA.
+      auto matRow = vcvt_f32_f16(vld1_f16(column + i));
+      // First column starts from zero rather than loading the accumulator.
+      auto resVec = j != 0 ? vld1q_f32(sf32) : vdupq_n_f32(0);
+      resVec = vfmaq_lane_f32(resVec, matRow, vecCol, 0);
+      vst1q_f32(sf32, resVec);
+    }
+  }
+
+  // Narrow the float32 sums back to fp16 and store them into y.
+  for (auto i = 0; i < m; i+= 4) {
+    vst1_f16(y + i, vcvt_f16_f32(vld1q_f32(sum.data() + i)));
+  }
+}
+
 void fp16_gemv_notrans(
    const int m,
    const int n,
@ -143,17 +166,55 @@ void fp16_gemv_notrans(
    Half* y,
    const int incy);

+// Non-transposed half-precision GEMV: y = alpha * A * x + beta * y.
+//
+//   m, n       : rows / columns of A
+//   alpha, beta: scalars of the GEMV update
+//   a, lda     : matrix data; column j starts at a + lda * j
+//   x, incx    : input vector and its element stride
+//   y, incy    : output vector and its element stride
+//
+// Fast path (unit strides, alpha == 1, beta == 0, m % 4 == 0): dispatches to
+// the NEON kernels, using fp16 accumulation only when the build supports it
+// and the global context allows FP16 reduction on CPU, and float32
+// accumulation otherwise.
+//
+// Generic path: accumulates alpha * A * x in float32, then either overwrites
+// y (beta == 0, so y is never read) or combines it with beta * y.
+void fp16_gemv_notrans(
+    const int m,
+    const int n,
+    const float alpha,
+    const Half* a,
+    const int lda,
+    const Half* x,
+    const int incx,
+    const float beta,
+    Half* y,
+    const int incy) {
+  if (incx == 1 && alpha == 1.0 && beta == 0.0 && m % 4 == 0 && incy == 1) {
+#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
+    if (at::globalContext().allowFP16ReductionCPU())  {
+      return fp16_gemv_notrans_fp16_arith(m, n, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(x), reinterpret_cast<float16_t*>(y));
+    }
+#endif
+    return fp16_gemv_notrans_fp32_arith(m, n, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(x), reinterpret_cast<float16_t*>(y));
+  }
+  std::vector<float> sum(m);
+  for (const auto j : c10::irange(n)) {
+    const auto* column_ = a + lda * j;
+    auto z = alpha * x[j * incx];
+    for (const auto i : c10::irange(m)) {
+      sum[i] += z * column_[i];
+    }
+  }
+  if (beta == 0.0) {
+    for (const auto i : c10::irange(m)) {
+      y[i * incy] = sum[i];
+    }
+  } else {
+    // Scale the existing y by beta before adding the product; previously the
+    // value of beta was ignored here and y was treated as if beta == 1.
+    for (const auto i : c10::irange(m)) {
+      y[i * incy] = beta * static_cast<float>(y[i * incy]) + sum[i];
+    }
+  }
+}
+
 #endif // defined(__aarch64__) && !defined(C10_MOBILE)

 template <typename scalar_t>
-bool scal_use_fast_path(
+static bool scal_use_fast_path(
    [[maybe_unused]] int64_t n,
    [[maybe_unused]] int64_t incx) {
  return false;
 }

 template <typename scalar_t>
-bool gemv_use_fast_path(
+static bool gemv_use_fast_path(
    [[maybe_unused]] char trans,
    [[maybe_unused]] int64_t m,
    [[maybe_unused]] int64_t n,
@ -166,7 +227,7 @@ bool gemv_use_fast_path(
 }

 template <typename scalar_t>
-void scal_fast_path(
+static void scal_fast_path(
    [[maybe_unused]] int* n,
    [[maybe_unused]] scalar_t* a,
    [[maybe_unused]] scalar_t* x,
@ -176,7 +237,7 @@ void scal_fast_path(
 }

 template <typename scalar_t>
-void gemv_fast_path(
+static void gemv_fast_path(
    [[maybe_unused]] const char* trans,
    [[maybe_unused]] const int* m,
    [[maybe_unused]] const int* n,
@ -258,10 +319,6 @@ template <>
 void gemv_fast_path<float>(const char *trans, const int *m, const int *n, const float *alpha, const float *a, const int *lda, const float *x, const int *incx, const float *beta, float *y, const int *incy) {
  sgemv_(remove_const(trans), remove_const(m), remove_const(n), remove_const(alpha), remove_const(a), remove_const(lda), remove_const(x), remove_const(incx), remove_const(beta), y, remove_const(incy));
 }
-#else
-INSTANTIATE(float)
-INSTANTIATE(double)
-#endif // AT_BUILD_WITH_BLAS

 INSTANTIATE(uint8_t)
 INSTANTIATE(int8_t)
@ -283,7 +340,7 @@ bool gemv_use_fast_path<at::BFloat16>(
      beta == 0.0;
 }

-void bf16_gemv_trans(
+static void bf16_gemv_trans(
  const int m,
  const int n,
  const at::BFloat16 alpha,
@ -368,14 +425,7 @@ void gemv_fast_path<at::Half>(
      y,
      *incy);
 }
-#else
-template <>
-bool scal_use_fast_path<at::Half>(
-    [[maybe_unused]] int64_t n,
-    [[maybe_unused]] int64_t incx) {
-  return false;
-}
-
+#else // !defined(__aarch64__))
 template <>
 bool gemv_use_fast_path<at::Half>(
    char trans,
@ -391,79 +441,6 @@ bool gemv_use_fast_path<at::Half>(
      (c10::detail::fp16_from_bits(beta.x) == 0.0f || trans == 't' || trans == 'T');
 }

-#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
-static void fp16_gemv_notrans_fp16_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) {
-  for (auto j = 0; j < n; j++) {
-    auto vecCol = vdup_n_f16(x[j]);
-    const auto* column = a + lda * j;
-    for (auto i = 0; i < m; i += 4) {
-      auto yf16 = y + i;
-      auto matRow = vld1_f16(column + i);
-      auto resVec = j != 0 ? vld1_f16(yf16) : vdup_n_f16(0);
-      resVec = vfma_lane_f16(resVec, matRow, vecCol, 0);
-      vst1_f16(yf16, resVec);
-    }
-  }
-}
-#endif
-
-static void fp16_gemv_notrans_fp32_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) {
-  std::vector<float> sum(m);
-  for (auto j = 0; j < n; j++) {
-    auto vecCol = vdup_n_f32(x[j]);
-    const auto* column = a + lda * j;
-    for (auto i = 0; i < m; i += 4) {
-      auto sf32 = sum.data() + i;
-      auto matRow = vcvt_f32_f16(vld1_f16(column + i));
-      auto resVec = j != 0 ? vld1q_f32(sf32) : vdupq_n_f32(0);
-      resVec = vfmaq_lane_f32(resVec, matRow, vecCol, 0);
-      vst1q_f32(sf32, resVec);
-    }
-  }
-
-  for (auto i = 0; i < m; i+= 4) {
-    vst1_f16(y + i, vcvt_f16_f32(vld1q_f32(sum.data() + i)));
-  }
-}
-
-void fp16_gemv_notrans(
-    const int m,
-    const int n,
-    const float alpha,
-    const Half* a,
-    const int lda,
-    const Half* x,
-    const int incx,
-    const float beta,
-    Half* y,
-    const int incy) {
-  if (incx == 1 && alpha == 1.0 && beta == 0.0 && m % 4 == 0 && incy == 1) {
-#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
-    if (at::globalContext().allowFP16ReductionCPU())  {
-      return fp16_gemv_notrans_fp16_arith(m, n, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(x), reinterpret_cast<float16_t*>(y));
-    }
-#endif
-    return fp16_gemv_notrans_fp32_arith(m, n, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(x), reinterpret_cast<float16_t*>(y));
-  }
-  std::vector<float> sum(m);
-  for (const auto j : c10::irange(n)) {
-    const auto* column_ = a + lda * j;
-    auto z = alpha * x[j * incx];
-    for (const auto i : c10::irange(m)) {
-      sum[i] += z * column_[i];
-    }
-  }
-  if (beta == 0.0) {
-    for (const auto i : c10::irange(m)) {
-      y[i * incy] = sum[i];
-    }
-  } else {
-    for (const auto i : c10::irange(m)) {
-      y[i * incy] += sum[i];
-    }
-  }
-}
-
 template <>
 void gemv_fast_path<at::Half>(
    const char* trans,
@ -511,6 +488,7 @@ void gemv_fast_path<at::Half>(
 INSTANTIATE(c10::Half)
 INSTANTIATE(c10::BFloat16)
 #endif // !defined(C10_MOBILE)
+#endif // AT_BUILD_WITH_BLAS
 #undef INSTANTIATE

 } // namespace blas_impl
--- a/aten/src/ATen/native/CPUBlas.cpp
+++ b/aten/src/ATen/native/CPUBlas.cpp
@ -554,7 +554,7 @@ using is_blas_library_type = std::integral_constant<bool,
    std::is_same_v<scalar_t, c10::complex<float>>>;

 template <typename scalar_t>
-void gemm_batched_generic(
+static void gemm_batched_generic(
    TransposeType transa, TransposeType transb,
    int64_t batch_size, int64_t m, int64_t n, int64_t k,
    scalar_t alpha,
@ -568,7 +568,7 @@ void gemm_batched_generic(
 }

 template <typename scalar_t>
-void gemm_batched(
+static void gemm_batched(
    TransposeType transa, TransposeType transb,
    int64_t batch_size, int64_t m, int64_t n, int64_t k,
    scalar_t alpha,
@ -596,7 +596,7 @@ void gemm_batched(
 }

 template <typename scalar_t>
-void gemm_batched_with_stride_generic(
+static void gemm_batched_with_stride_generic(
    TransposeType transa, TransposeType transb,
    int64_t batch_size, int64_t m, int64_t n, int64_t k,
    scalar_t alpha,
@ -945,7 +945,7 @@ struct PackKey {
  }
 };

-inline dnnl::memory::data_type get_dnnl_dtype(ScalarType dtype) {
+static inline dnnl::memory::data_type get_dnnl_dtype(ScalarType dtype) {
  if (dtype == ScalarType::Float) {
    return dnnl::memory::data_type::f32;
  } else if (dtype == ScalarType::BFloat16) {
--- a/aten/src/ATen/native/ComparisonUtils.cpp
+++ b/aten/src/ATen/native/ComparisonUtils.cpp
@ -13,15 +13,13 @@ class Tensor;
 namespace native {

 template<typename O, typename C>
-void _assert_match(const O& original, const C& compared, const std::string& name) {
+static void _assert_match(const O& original, const C& compared, const std::string& name) {
  if (compared) {
    bool equal = (original == compared.value());
    if (!equal) {
      std::stringstream msg;
-      msg << "Tensor " << name << " mismatch!";
-      if (!equal) {
-        throw std::runtime_error(msg.str());
-      }
+      msg << "Tensor " << name << " mismatch! Expected: " << compared.value() << ", Got: " << original;
+      throw std::runtime_error(msg.str());
    }
  }
 }
--- a/aten/src/ATen/native/ConvUtils.h
+++ b/aten/src/ATen/native/ConvUtils.h
@ -437,4 +437,19 @@ inline bool xpu_conv_use_channels_last(const at::Tensor& input, const at::Tensor
  return is_channel_last(input) || is_channel_last(weight);
 }

+// Decides whether an MPS convolution should use a channels-last layout.
+// Returns true only when both tensors are MPS tensors, `input` is defined and
+// not sparse, and input's suggested memory format is ChannelsLast or
+// ChannelsLast3d; returns false (channels-first) in every other case.
+inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
+  // Layout is only inspected for MPS tensors.
+  const bool both_on_mps = input.is_mps() && weight.is_mps();
+  if (!both_on_mps) {
+    return false;
+  }
+
+  // Undefined or sparse inputs fall back to channels-first.
+  const bool input_usable = input.defined() && !input.is_sparse();
+  if (!input_usable) {
+    return false;
+  }
+
+  switch (input.suggest_memory_format()) {
+    case at::MemoryFormat::ChannelsLast:
+    case at::MemoryFormat::ChannelsLast3d:
+      return true;
+    default:
+      return false;
+  }
+}
+
 } // namespace at::native
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@ -30,6 +30,10 @@
 #include <ATen/native/mkldnn/Utils.h>
 #endif

+#ifdef USE_MPS
+#include <ATen/mps/MPSDevice.h>
+#endif
+
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
@ -93,7 +97,7 @@ static bool conv_benchmark_empty_cache = true;

 // Check workload to activate fast depthwise FP16 cudnn conv kernels
 template <typename T>
-bool check_cudnn_depthwise_workload(const at::Tensor& input, T stride) {
+static bool check_cudnn_depthwise_workload(const at::Tensor& input, T stride) {
  auto w = at::symint::size<T>(input, 3);  // same as h
  auto ch = at::symint::size<T>(input, 1);
  auto bs = at::symint::size<T>(input, 0);
@ -216,7 +220,7 @@ bool check_cudnn_depthwise_workload(const at::Tensor& input, T stride) {

 // simplified version for cudnn 8.2 and above
 template <typename T>
-bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, T stride, const at::Tensor& weight) {
+static bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, T stride, const at::Tensor& weight) {
  // 1D conv
  if(at::symint::size<T>(input, 2) == 1 && stride == 1){
    return true;
@ -636,7 +640,7 @@ REGISTER_NO_CPU_DISPATCH(miopen_convolution_transpose_backward_stub)
 REGISTER_NO_CPU_DISPATCH(miopen_depthwise_convolution_backward_stub)

 template <typename T>
-std::ostream& operator<<(std::ostream & out, const ConvParams<T>& params) {
+static std::ostream& operator<<(std::ostream & out, const ConvParams<T>& params) {
  out << "ConvParams {"
      << "  stride = " << IntArrayRef{params.stride}
      << "  padding = " << ArrayRef<T>{params.padding}
@ -1199,7 +1203,7 @@ at::Tensor convolution_overrideable(
 // a bool indicating whether the bias is defined. This is done to save memory by
 // avoiding saving the full bias tensor for backward.
 template <typename T>
-ConvBackend _select_conv_backend(
+static ConvBackend _select_conv_backend(
    const Tensor& input,
    const Tensor& weight,
    const std::optional<Tensor>& bias,
@ -1413,7 +1417,7 @@ static inline at::MemoryFormat determine_backend_memory_format(
    const Tensor& input,
    const Tensor& weight,
    const ConvBackend backend) {
-  at::MemoryFormat backend_memory_format = at::MemoryFormat::Contiguous;
+  auto backend_memory_format = at::MemoryFormat::Contiguous;
 #if !defined(C10_MOBILE)
  auto k = weight.ndimension();
  // See Note [Mobile check segfaults]
@ -1451,6 +1455,17 @@ static inline at::MemoryFormat determine_backend_memory_format(
        backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
      }
      break;
+    case ConvBackend::Mps:
+    case ConvBackend::MpsTranspose:
+      if (mps_conv_use_channels_last(input, weight)) {
+#ifdef USE_MPS
+        if (!mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS)) {
+          break;
+        }
+#endif
+        backend_memory_format = (k == 5) ? MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast;
+      }
+      break;
    default:
      backend_memory_format = at::MemoryFormat::Contiguous;
  }
--- a/aten/src/ATen/native/DispatchStub.cpp
+++ b/aten/src/ATen/native/DispatchStub.cpp
@ -147,6 +147,7 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
        c10::DeviceType::MPS,
        c10::DeviceType::MTIA,
        c10::DeviceType::XPU,
+        c10::DeviceType::HPU,
        c10::DeviceType::PrivateUse1
    );
    // Check if the device type is supported.
@ -203,6 +204,9 @@ DispatchResult DispatchStubImpl::try_get_call_ptr(
      return xpu_dispatch_ptr != nullptr ? DispatchResult(xpu_dispatch_ptr) : ErrorType::MissingDeviceKernel;
 #endif

+    case DeviceType::HPU:
+      return hpu_dispatch_ptr != nullptr ? DispatchResult(hpu_dispatch_ptr) : ErrorType::MissingDeviceKernel;
+
    case DeviceType::PrivateUse1:
      return privateuse1_dispatch_ptr != nullptr ? DispatchResult(privateuse1_dispatch_ptr) : ErrorType::MissingDeviceKernel;

--- a/aten/src/ATen/native/DispatchStub.h
+++ b/aten/src/ATen/native/DispatchStub.h
@ -44,6 +44,7 @@
 //   - MPS: Apple Silicon GPUs (Metal Performance Shaders)
 //   - MTIA: Meta Training and Inference Devices
 //   - XPU: Intel GPUs
+//   - HPU: Reserved for HPU (Intel Gaudi) device types
 //   - PrivateUse1: Reserved for private/custom device types
 //
 // If you want to update the list of supported devices, add a new dispatch_ptr
@ -196,6 +197,7 @@ struct TORCH_API DispatchStubImpl {
  #if defined(USE_XPU)
    void* xpu_dispatch_ptr;
  #endif
+    void* hpu_dispatch_ptr;
    void* privateuse1_dispatch_ptr;
  #else
    std::atomic<void*> cpu_dispatch_ptr{nullptr};
@ -206,6 +208,7 @@ struct TORCH_API DispatchStubImpl {
  #if defined(USE_XPU)
    void* xpu_dispatch_ptr = nullptr;
  #endif
+    void* hpu_dispatch_ptr = nullptr;
    void* privateuse1_dispatch_ptr = nullptr;
  #endif
 };
@ -259,6 +262,10 @@ public:
  }
  #endif

+  // Stores `fn_ptr`, type-erased to void*, as this stub's HPU kernel.
+  void set_hpu_dispatch_ptr(FnPtr fn_ptr) {
+    void* const erased_fn = reinterpret_cast<void*>(fn_ptr);
+    impl.hpu_dispatch_ptr = erased_fn;
+  }
+
  void set_hip_dispatch_ptr(FnPtr fn_ptr) {
    impl.hip_dispatch_ptr = reinterpret_cast<void*>(fn_ptr);
  }
@ -337,6 +344,13 @@ struct RegisterXPUDispatch {
  }
 };

+// Registration helper for HPU kernels: constructing an instance installs
+// `kernel` on `target` through DispatchStub::set_hpu_dispatch_ptr.
+template <typename DispatchStub>
+struct RegisterHPUDispatch {
+  RegisterHPUDispatch(DispatchStub& target, typename DispatchStub::FnPtr kernel) {
+    target.set_hpu_dispatch_ptr(kernel);
+  }
+};
+
 template <typename DispatchStub>
 struct RegisterMPSDispatch {
  RegisterMPSDispatch(DispatchStub &stub, typename DispatchStub::FnPtr value) {
@ -437,6 +451,9 @@ struct RegisterPRIVATEUSE1Dispatch {
 #define REGISTER_XPU_DISPATCH(name, fn) \
  static RegisterXPUDispatch<struct name##_DECLARE_DISPATCH_type> name ## __register(name, fn);

+// Declares a static RegisterHPUDispatch object named `<name>__register`,
+// constructed with the stub `name` and the kernel `fn`.
+#define REGISTER_HPU_DISPATCH(name, fn) \
+  static RegisterHPUDispatch<struct name##_DECLARE_DISPATCH_type> name ## __register(name, fn);
+
 #define REGISTER_HIP_DISPATCH(name, fn) \
  static RegisterHIPDispatch<struct name##_DECLARE_DISPATCH_type> name ## __register(name, fn);

--- a/aten/src/ATen/native/EmbeddingBag.cpp
+++ b/aten/src/ATen/native/EmbeddingBag.cpp
@ -1059,7 +1059,7 @@ static Tensor apply_bag_size_backward(
 }

 template <typename scalar_t>
-void embedding_bag_cpu_max_out(
+static void embedding_bag_cpu_max_out(
    Tensor* max_indices,
    const Tensor& weight,
    const Tensor& indices,
@ -1505,7 +1505,7 @@ static std::vector<index_t> compute_counts_uniq(
 }

 template <typename scalar_t>
-void _embedding_bag_dense_backward_cpu_sum_mean(
+static void _embedding_bag_dense_backward_cpu_sum_mean(
    const Tensor& grad,
    const Tensor& indices_,
    const Tensor& offset2bag_,
@ -1641,7 +1641,7 @@ Tensor _embedding_bag_dense_backward_cpu(const Tensor &grad_, const Tensor &indi
 }

 template<typename scalar_t>
-Tensor _embedding_bag_per_sample_weights_backward_cpu_template(
+static Tensor _embedding_bag_per_sample_weights_backward_cpu_template(
    const Tensor& grad,
    const Tensor& weight,  // NB: embedding table, not per_sample_weights
    const Tensor& indices_,
--- a/aten/src/ATen/native/Linear.cpp
+++ b/aten/src/ATen/native/Linear.cpp
@ -5,6 +5,7 @@
 #include <ATen/WrapDimUtilsMulti.h>
 #include <ATen/TensorOperators.h>
 #include <c10/util/irange.h>
+#include <c10/core/GradMode.h>
 #include <c10/core/SymInt.h>
 #include <c10/util/MaybeOwned.h>
 #include <ATen/TensorSubclassLikeUtils.h>
@ -158,11 +159,11 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
  Tensor left = left_;
  Tensor right = right_;
  for (const auto i : c10::irange(dim)) {
-    auto sl = left.sym_size(i)!=1;
-    auto sr = right.sym_size(i)!=1;
+    auto sl = TORCH_GUARD_SIZE_OBLIVIOUS(left.sym_size(i).sym_ne(1));
+    auto sr = TORCH_GUARD_SIZE_OBLIVIOUS(right.sym_size(i).sym_ne(1));
    if (sum_dims[i]) { // first dimensions that will be summed over after multiplication
      if (sl && sr) {  // dimensions nontrivially in both left and right must be of the same size
-        TORCH_CHECK(left.sym_size(i)==right.sym_size(i), "non-broadcast dimensions must match");
+        TORCH_SYM_CHECK(left.sym_size(i).sym_eq(right.sym_size(i)), "non-broadcast dimensions must match");
        sum_size *= left.sym_size(i);
      } else if (sl) { // if it is only in one of left and right, we can sum right away
        left = left.sum(i, true);
@ -171,7 +172,7 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
      }
    } else if (sl && sr) { // now deal with dimensions that will be in the output
      // dimensions nontrivially in both left and right must be of the same size
-      TORCH_CHECK(left.sym_size(i)==right.sym_size(i), "non-broadcast dimensions must match");
+      TORCH_SYM_CHECK(left.sym_size(i).sym_eq(right.sym_size(i)), "non-broadcast dimensions must match");
      lro.push_back(i);
      lro_size *= left.sym_size(i);
    } else if (sl) { // keep track of dimensions appearing only once
@ -481,10 +482,10 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr
        // Iterate over each dimension covered by ellipsis
        const auto ndim = operands[i].ndimension() - (static_cast<int64_t>(op_labels[i].size()) - 1);
        for (auto j = ell_num_dim - ndim; j < ell_num_dim; ++j) {
-          if (op.sym_size(dim) != 1) {
+          if (TORCH_GUARD_SIZE_OBLIVIOUS(op.sym_size(dim).sym_ne(1))) {
            // Update ellipsis size
-            TORCH_CHECK(
-                ell_sizes[j] == 1 || ell_sizes[j] == op.sym_size(dim),
+            TORCH_SYM_CHECK(
+                ell_sizes[j].sym_eq(1).sym_or(ell_sizes[j].sym_eq(op.sym_size(dim))),
                "einsum(): dimension ",
                dim,
                " covered by ellipsis in operand ",
@ -500,10 +501,10 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr
          permutation[ell_index + j] = dim++;
        }
      } else if (permutation[label_perm_index[s]] == -1) {
-        if (op.sym_size(dim) != 1) {
+        if (TORCH_GUARD_SIZE_OBLIVIOUS(op.sym_size(dim).sym_ne(1))) {
          // Update subscript
-          TORCH_CHECK(
-              label_size[s] == 1 || label_size[s] == op.sym_size(dim),
+          TORCH_SYM_CHECK(
+              label_size[s].sym_eq(1).sym_or(label_size[s].sym_eq(op.sym_size(dim))),
              "einsum(): subscript ",
              subscript_to_label(s),
              " has size ",
@ -578,16 +579,17 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr
    SmallVector<int64_t, 5> a_dims_to_sum;
    SmallVector<int64_t, 5> b_dims_to_sum;
    for (auto dim = out_num_dim; dim < perm_index; ++dim) {
-      if (a.sym_size(dim) != 1 && b.sym_size(dim) != 1) {
+      if (TORCH_GUARD_SIZE_OBLIVIOUS(a.sym_size(dim).sym_ne(1))
+        && TORCH_GUARD_SIZE_OBLIVIOUS(b.sym_size(dim).sym_ne(1))) {
        if (--dim_counts[dim] == 1) {
          sum_dims.push_back(dim);
          dim_counts[dim] = 0;
        }
      } else if (dim_counts[dim] == 1) {
-        if (a.sym_size(dim) != 1) {
+        if (TORCH_GUARD_SIZE_OBLIVIOUS(a.sym_size(dim).sym_ne(1))) {
          a_dims_to_sum.push_back(dim);
          dim_counts[dim] = 0;
-        } else if (b.sym_size(dim) != 1) {
+        } else if (TORCH_GUARD_SIZE_OBLIVIOUS(b.sym_size(dim).sym_ne(1))) {
          b_dims_to_sum.push_back(dim);
          dim_counts[dim] = 0;
        }
@ -831,6 +833,14 @@ Tensor &tensordot_out(const Tensor& input1, const Tensor& input2, IntArrayRef di
  auto output_device = result.device();
  auto input1_device = input1.device();
  auto input2_device = input2.device();
+
+  if(result.defined()) {
+    TORCH_CHECK(
+      !(result.requires_grad() && at::GradMode::is_enabled() && result.sizes() != result_tmp.sizes()),
+      "tensordot(): the 'out' tensor was specified and requires gradients, and its shape does not match the expected result. "
+      "Either remove the 'out' argument, ensure it does not require gradients, or make sure its shape matches the expected output."
+    );
+  }
  // check if the input & output tensors are on the same device.
  TORCH_CHECK(
    (output_device == input1_device) && (input1_device == input2_device),
--- a/Show More
+++ b/Show More