Compare commits


305 Commits

Author SHA1 Message Date
918fe1d358 Halves time spent in generating the key strings 2025-04-22 11:32:36 -07:00
2f6940cc55 Adds a direct MPS kernel path for the linear op and an MPS kernel caching mechanism for improved perf. 2025-04-22 11:32:34 -07:00
4bf09562e4 [EZ/Profiler] Update Submodule (#151843)
Summary: Update to d82680bbd4

Test Plan: CI

Differential Revision: D73397323

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151843
Approved by: https://github.com/Skylion007, https://github.com/aaronenyeshi
2025-04-22 18:19:43 +00:00
834a017fe3 Optimize register_full_backward_hook description when no inputs require grad (#151785)
Fixes #100528

## Test Result

### Before

![image](https://github.com/user-attachments/assets/5dd2e1d3-3bb1-49d0-84bf-8a7a6b18fa4b)

### After

![image](https://github.com/user-attachments/assets/2e16d17b-1586-40d8-b0ef-35559fc064f4)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151785
Approved by: https://github.com/soulitzer
2025-04-22 17:57:31 +00:00
2c27597d6a Infra for handling builtin ops (min, max, math.pow) (#151348)
Reapply of https://github.com/pytorch/pytorch/pull/150003

Differential Revision: [D73050801](https://our.internmc.facebook.com/intern/diff/D73050801/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151348
Approved by: https://github.com/zhxchen17
ghstack dependencies: #151347
2025-04-22 17:20:09 +00:00
264e8fb151 More fix for aot_export_module name collision during unlifting (#151684)
Summary: Also check the module's named buffers and parameters when resolving name collision

Test Plan:
```
buck2 run mode/dev-nosan caffe2/test/inductor:test_aot_inductor -- -r aoti_constant_tensor_name_collision
```

Differential Revision: D73264885

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151684
Approved by: https://github.com/angelayi
2025-04-22 16:59:33 +00:00
06a3c3c8cd [Optimus][Observability] Improve tlparse logging (#151635)
Summary: We improve tlparse logging for Optimus graph transformation to enable easier debugging

Test Plan:
```
TORCH_TRACE=~/my_trace_log_dir CUDA_VISIBLE_DEVICES=5 buck2 run mode/opt //aps_models/ads/ecosystem/tooling/tools/efficient_module_suite/pyper_models:pyper_model_perf_benchmark -- --flow_id 720055919 --shrink_model --mfu_profile_module "impl.shared_arch.dense_sparse_interaction" --use_synthetic_data
```

Differential Revision: D73229681

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151635
Approved by: https://github.com/Yuzhen11
2025-04-22 16:56:08 +00:00
5fc1eb85fc Add OIDC permissions to bazel workflow (#151456)
Update workflow to use OIDC authentication to access AWS resources rather than assuming the runner's default role. This is part of the multicloud effort to prepare jobs to support being run in non-AWS clouds.

The JWT ID token requires `id-token: write` in order to create the token for the job. See: https://docs.github.com/en/actions/security-for-github-actions/security-hardening-your-deployments/configuring-openid-connect-in-cloud-providers#adding-permissions-settings

Ref: pytorch-fdn/multicloud-ci-infra#3

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151456
Approved by: https://github.com/malfet
2025-04-22 16:54:14 +00:00
5d316ce0d0 Add device check for inputs (#151828)
Summary: Generate device checks for inputs in AOTI. Enable with AOTI_RUNTIME_CHECK_INPUTS=1
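
A minimal hedged sketch of how this might be exercised; the export/compile entry points and the timing of setting the env var are assumptions about a typical AOTInductor flow, only the `AOTI_RUNTIME_CHECK_INPUTS=1` switch comes from this commit:
```python
# Hedged sketch: opt into AOTI runtime input checks via the env var named above,
# then compile a toy module. The export/compile calls below are assumptions, not
# code from this commit.
import os
import torch

os.environ["AOTI_RUNTIME_CHECK_INPUTS"] = "1"  # request generated device checks

class Toy(torch.nn.Module):
    def forward(self, x):
        return x.sin() + 1

ep = torch.export.export(Toy(), (torch.randn(4, device="cuda"),))
pkg_path = torch._inductor.aoti_compile_and_package(ep)
# Running the packaged model with an input on the wrong device should now fail
# the generated check instead of misbehaving silently.
```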

Test Plan:
```
buck run fbcode//mode/dev-nosan //caffe2/test/inductor:test_aot_inductor -- -r test_runtime_checks_device_type_failed
```

Differential Revision: D73382824

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151828
Approved by: https://github.com/angelayi
2025-04-22 16:36:27 +00:00
3804aed32e Revert "[Inductor] Add Additional Configs for persistent+TMA version of Triton mm and addmm (#150587)"
This reverts commit 99aeee2c5f07f7fe6ec3f34aacb7db71569a60c5.

Reverted https://github.com/pytorch/pytorch/pull/150587 on behalf of https://github.com/ZainRizvi due to Sorry but this is breaking internally (see D73410693). To validate your fixes internally, you can follow the instructions here: https://fburl.com/fixing-ghfirst-reverts ([comment](https://github.com/pytorch/pytorch/pull/150587#issuecomment-2821828926))
2025-04-22 16:15:55 +00:00
4504910843 Revert "[ez] Make relaxed constraint error message more user friendly (#151407)"
This reverts commit e0f05229e9ff84aa6138df2bd51f5044bc743afb.

Reverted https://github.com/pytorch/pytorch/pull/151407 on behalf of https://github.com/ZainRizvi due to Sorry but this is breaking internally (see D73198095). To validate your fixes internally, you can follow the instructions here: https://fburl.com/fixing-ghfirst-reverts. ([comment](https://github.com/pytorch/pytorch/pull/151407#issuecomment-2821819654))
2025-04-22 16:12:42 +00:00
f072bf27a7 Revert "faster gather implementation (#151490)"
This reverts commit 541f8cd34cbccfcaf04a377f747390f83658d6ec.

Reverted https://github.com/pytorch/pytorch/pull/151490 on behalf of https://github.com/malfet due to Looks like it breaks demucs accuracy, though may be bogus, but let's try to revert, see c729f7dbee/3 ([comment](https://github.com/pytorch/pytorch/pull/151490#issuecomment-2821803788))
2025-04-22 16:09:14 +00:00
ed0d2ebaa0 Revert "Non-deterministic alert in histc_cuda for floating types only (#151701)"
This reverts commit b7a7741411585817daa81780b078fd15816f2d2d.

Reverted https://github.com/pytorch/pytorch/pull/151701 on behalf of https://github.com/ZainRizvi due to Sorry but this is causing inductor tests to fail. See here for more info: test_torch.py::TestTorchDeviceTypeCUDA::test_nondeterministic_alert_histc_cuda_float32 [GH job link](https://github.com/pytorch/pytorch/actions/runs/14586002763/job/40913547718) [HUD commit link](b7a7741411) ([comment](https://github.com/pytorch/pytorch/pull/151701#issuecomment-2821800837))
2025-04-22 16:07:25 +00:00
c729f7dbee [provenance_tracking][reland] Fix UT error and re-land ExternKernel support (#151709)
Summary:
ATT.

reverted previous diff :  D72572050

Test Plan:
```
 TORCH_LOGS="+inductor, output_code" buck2 run -c fbcode.enable_gpu_sections=true -c fbcode.nvcc_arch=h100 @//mode/opt fbcode//caffe2/test/inductor:provenance_tracing -- -r test_triton_kernel_to_post_grad_tracing_extern_kernel
```

Differential Revision: D73281217

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151709
Approved by: https://github.com/jingsh
2025-04-22 15:44:56 +00:00
d778c92e16 [Metal][BE] Move atomic ops to c10/metal/atomic.h (#151868)
To be reused from indexing and the MPSInductor implementation of atomic_add stores.
Added a wrapper for `metal::atomic<int>` (to be used by a followup PR).
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151868
Approved by: https://github.com/Skylion007
2025-04-22 14:11:29 +00:00
159e2f96e3 [dynamo][ci] Fix recently broken test (#151877)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151877
Approved by: https://github.com/masnesral, https://github.com/jansel
2025-04-22 06:42:03 +00:00
3aeeb77a3a [Dynamo][Easy] Remove unreachable code (#151739)
This line is unreachable:

f6c1cf04b5/torch/_dynamo/output_graph.py (L275)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151739
Approved by: https://github.com/Skylion007
2025-04-22 06:27:00 +00:00
ccd00359da Log information about suppressed data dependent errors (#151041)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151041
Approved by: https://github.com/bobrenjc93
ghstack dependencies: #151023, #151038
2025-04-22 06:07:57 +00:00
73d95893a2 Do not log exception when recording is disabled or already recording (#151038)
I am not sure why we log all exceptions here and re-raise them, but at least when recording is disabled this should be
transparent; in particular, logging DDEs could be spamming.

before:
<img width="995" alt="Screenshot 2025-04-10 at 12 47 31 PM" src="https://github.com/user-attachments/assets/f90d4557-d958-4558-a917-0d687366cad1" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151038
Approved by: https://github.com/bobrenjc93
ghstack dependencies: #151023
2025-04-22 06:07:57 +00:00
dfdf731579 Do not generate long log messages for suppressed data dependent errors. (#151023)
TORCH_LOGS="all" python test/test_dynamic_shapes.py -k test_guard_or_true

 before:
<img width="1065" alt="Screenshot 2025-04-10 at 9 55 27 AM" src="https://github.com/user-attachments/assets/3ee20de0-2902-4eb1-8ab0-80f1b974fb78" />

after:
<img width="1124" alt="Screenshot 2025-04-10 at 9 54 35 AM" src="https://github.com/user-attachments/assets/4e7e1f0c-856c-417f-8763-bfe183e2450d" />

Note: we actually do not expect to see a log at all; this is an orthogonal issue in recording, where it logs each error seen
even when recording is not enabled. I will follow up with a PR for that.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151023
Approved by: https://github.com/bobrenjc93
2025-04-22 06:07:57 +00:00
a09a3f4c30 [Hierarchical compile] Ensure output nodes are sorted last (#151295)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151295
Approved by: https://github.com/anijain2305
ghstack dependencies: #151293, #151294
2025-04-22 05:13:07 +00:00
283884b224 [Hierarchical Compile] Handle autocast ctx manager (#151294)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151294
Approved by: https://github.com/anijain2305
ghstack dependencies: #151293
2025-04-22 05:13:07 +00:00
4a643af992 [Hierarchical Compile] Fix small bug (#151293)
This technically would never be exposed because we never check that a node is an ancestor of itself, but it is good for it to be correct.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151293
Approved by: https://github.com/anijain2305
2025-04-22 05:13:07 +00:00
e76c0b159a Revert "[dynamic shapes] guard_or_false for _reshape_view_helper, utils._infer_size for wildcard dims (#150127)"
This reverts commit a02eae8142ddd8fbf068a3e17fc0dd276d92fc78.

Reverted https://github.com/pytorch/pytorch/pull/150127 on behalf of https://github.com/malfet due to Caused TestDynamoTimed.test_dynamo_timed to fail on macOS, see https://github.com/pytorch/pytorch/actions/runs/14584536979/job/40908019050 ([comment](https://github.com/pytorch/pytorch/pull/150127#issuecomment-2820081721))
2025-04-22 05:05:50 +00:00
0ff302e8e0 Revert "reroute index to fast implementation for indexing on 0th dimension (#151753)"
This reverts commit 4d78e19365c4e2189693c7a81b665d4ec2d2cf53.

Reverted https://github.com/pytorch/pytorch/pull/151753 on behalf of https://github.com/malfet due to Looks like it breaks bunch of distributed tests with DSA, see 4d78e19365 ([comment](https://github.com/pytorch/pytorch/pull/151753#issuecomment-2820078298))
2025-04-22 05:03:03 +00:00
95abc0f515 [c10d][fr] Fix another bug when we should continue when the op list is empty (#151798)
Differential Revision: D73375318

We shouldn't check the op list when it is empty; later, when it is empty, we pop it out of the queue and then check for collective matching. Added a unit test for this case, which also covers the case fixed in https://github.com/pytorch/pytorch/pull/151683.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151798
Approved by: https://github.com/d4l3k, https://github.com/wconstab, https://github.com/fegin
2025-04-22 04:43:31 +00:00
6f327128a9 [MKLDNN] Check that strides are positive (#151848)
For pooling ops. Prevents a division-by-zero when the argument is invalid.
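
A hedged sketch of the failure mode being guarded against; whether the MKLDNN path is selected and the exact error raised are assumptions here:
```python
# Hedged sketch: a non-positive stride passed to an MKLDNN-backed pooling op should
# now be rejected with an error rather than hitting a division-by-zero. Requires a
# build with MKLDNN; the exact error type/message is an assumption.
import torch

x = torch.randn(1, 3, 8, 8).to_mkldnn()  # route to the MKLDNN pooling kernels
try:
    torch.nn.functional.max_pool2d(x, kernel_size=2, stride=0)
except RuntimeError as e:
    print("rejected non-positive stride:", e)
```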

Fixes https://github.com/pytorch/pytorch/issues/149274

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151848
Approved by: https://github.com/atalman
2025-04-22 04:25:47 +00:00
29811f68d2 [Inductor][FlexAttention] fix vars_and_sizes divisor error (#151634)
Triton codegen currently [sorts vars by divisor](ae6f6b8efb/torch/_inductor/codegen/simd.py (L233-L237)). When there are two vars with the same divisor, the order is undecided.

```python
nodes.sort(
   key=lambda x: V.graph.sizevars.size_hint(
       x.divisor, fallback=config.unbacked_symint_fallback
   )
)
```

The test case leads to the following nodes:
```
(Pdb) nodes[0]
IterationRangesEntry(x1, ((s37 + 127)//128), 2, (xindex//ps0), {x0: ((s37 + 127)//128), x1: 2, x2: ((s12 + 127)//128), x4: 2*(((s12 + 127)//128))*(((s37 + 127)//128)), x5: 0, x6: 2, x7: (((s12 + 127)//128))*(((s37 + 127)//128))})

(Pdb) nodes[1]
IterationRangesEntry(x0, 1, ((s37 + 127)//128), ModularIndexing(xindex, 1, ps0), {x0: ((s37 + 127)//128), x1: 2, x2: ((s12 + 127)//128), x4: 2*(((s12 + 127)//128))*(((s37 + 127)//128)), x5: 0, x6: 2, x7: (((s12 + 127)//128))*(((s37 + 127)//128))})

(Pdb) nodes[2]
IterationRangesEntry(x2, 2*(((s37 + 127)//128)), ((s12 + 127)//128), (xindex//(2*(((s37 + 127)//128)))), {x0: ((s37 + 127)//128), x1: 2, x2: ((s12 + 127)//128), x4: 2*(((s12 + 127)//128))*(((s37 + 127)//128)), x5: 0, x6: 2, x7: (((s12 + 127)//128))*(((s37 + 127)//128))})

(Pdb) V.graph.sizevars.statically_known_equals(nodes[0].length, 2)
True
(Pdb) V.graph.sizevars.statically_known_equals(nodes[1].length, 1)
True
(Pdb) V.graph.sizevars.statically_known_equals(nodes[2].length, 1)
True

(Pdb) V.graph.sizevars.statically_known_equals(nodes[0].divisor, 1)
True
(Pdb) V.graph.sizevars.statically_known_equals(nodes[1].divisor, 1)
True
(Pdb) V.graph.sizevars.statically_known_equals(nodes[2].divisor, 2)
True
```

Since x1 and x0 both have divisor 1, the relative order is random across runs.
In some runs, we have order [x1, x0, x2] with divisors as [1,1,2] and lengths as [2,1,1]. After x1, we have [divisor = divisor * node.length](ae6f6b8efb/torch/_inductor/codegen/simd.py (L246)) = 1 * 2 = 2. Then, when processing x0, we have node.divisor=1, divisor=2, and [FloorDiv(node.divisor, divisor)](ae6f6b8efb/torch/_inductor/codegen/simd.py (L251)) = 0, which indicates an iteration length of 0 and leads errors later.

The fix is to sort by both divisor and length_is_one. So for two nodes with the same divisor, we process the node with length=1 first.
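
A hedged sketch of the described fix, mirroring the snippet above (illustrative only, not the exact upstream diff):
```python
# Sort by (divisor, not length-is-one) so that, for equal divisors, nodes with
# length == 1 are processed first. Mirrors the snippet shown earlier.
nodes.sort(
    key=lambda x: (
        V.graph.sizevars.size_hint(
            x.divisor, fallback=config.unbacked_symint_fallback
        ),
        not V.graph.sizevars.statically_known_equals(x.length, 1),
    )
)
```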

Fixes #149789

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151634
Approved by: https://github.com/Skylion007, https://github.com/drisspg
2025-04-22 04:24:56 +00:00
529f698ad4 [logging] Put "everything" WaitCounters in dynamo_timed (#151757)
Summary: The main motivation is to capture the cudagraphs overhead in a WaitCounter. We'll combine that with Triton autotuning, and therefore rename to "compile_runtime_overheads". Since we have a couple WaitCounters where we want to capture all runtime and compile overheads, let's put the accounting in dynamo_timed so we'll automatically capture any toplevel timed regions that get added in the future. Also, dynamo_timed already has to figure out if we're timing a runtime vs. compile-time event, so we can reuse some of that logic.

Test Plan:
Ran an internal model with `TORCHINDUCTOR_BENCHMARK_FUSION=1` (to get benchmarking at compile time in addition to runtime).

Overall compile time from various sources matches up:
* tlparse: https://fburl.com/9fgsstkr. Eyeballing, total time should be 32 ranks x 2175 = ~69.6k s
* ods: https://fburl.com/canvas/r4clhnb7. Right on.
* dynamo_compile: https://fburl.com/scuba/dynamo_compile/ax71aqox. Right on.
* pt2_compile_events: https://fburl.com/scuba/pt2_compile_events/shcjd9ql. Right on.

And the runtime overhead:
* ods: https://fburl.com/canvas/nvgjb282
* dynamo_compile: https://fburl.com/scuba/dynamo_compile/f2dtv0qh

If we compare that to a run of the same model without the changes in this stack, results can mismatch by a lot:
* tlparse: https://fburl.com/cchxwd1s. Eyeballing, total time should be 32 ranks x 2300s = ~73.5k s
* ods: https://fburl.com/canvas/x1i3wvf4. It's kinda close
* dynamo_compile: https://fburl.com/scuba/dynamo_compile/l7sgxdxd. Waaay too high.
* pt2_compile_events: https://fburl.com/scuba/pt2_compile_events/jb4s9z1u. This is the only one that's actually correct.

The discrepancy is even worse if we focus on the runtime events:
* ods: https://fburl.com/canvas/a4o9f7ou
* dynamo_compile: https://fburl.com/scuba/dynamo_compile/95izaes1

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151757
Approved by: https://github.com/ppanchalia
ghstack dependencies: #151749
2025-04-22 03:29:13 +00:00
edba20b853 [logging] Fix duration logging for dynamo_compile (#151749)
Summary: There are a few issues I'm solving:
1. It's too hard to measure total pt2 overhead using the dynamo_compile table because users need to know the columns representing all the top-level events (dynamo_cumulative_compile_time_us, etc.). Instead, let's populate the existing duration_us field for all top-level events. The complication is that runtime events in particular (Triton autotuning, cudagraphify) can be collapsed into a single row, with gaps in between, so we can't simply use `end_time - start_time` in all cases. Instead, we'll sum durations for all outer events when updating the compile-time or runtime metrics context. Introduce a 'depth' counter in TLS to track the nesting of CompilationMetrics events.
2. The existing implementation relies on callers of dynamo_timed to specify whether the event is a runtime or compile-time event. That doesn't work because some methods can be called in both situations, e.g., `CachingAutotuner.benchmark_all_configs`. For example, `TORCHINDUCTOR_BENCHMARK_FUSION=1` enables benchmarking at compile time. Instead, we can figure out automatically whether we're measuring a compile-time or runtime event and log accordingly.
3. If `log_compilation_events` were to throw an exception, we'd fail to clear the aggregated counters for runtime logs and they could be attributed to the wrong compile ID. I didn't actually find evidence of this in practice, but I added exception handling for extra safety.

Test Plan:
Ran internal models and compared dynamo_compile to pt2_compile_events:
`TORCHINDUCTOR_BENCHMARK_FUSION=0`
* tlparse: https://fburl.com/itciwnxc
* dynamo_compile: https://fburl.com/scuba/dynamo_compile/yvkif5vb
* pt2_compile_events: https://fburl.com/scuba/pt2_compile_events/segijet7

`TORCHINDUCTOR_BENCHMARK_FUSION=1`
* tlparse: https://fburl.com/jgurcvkw
* dynamo_compile: https://fburl.com/scuba/dynamo_compile/uum91ceb
* pt2_compile_events: https://fburl.com/scuba/pt2_compile_events/x4xnisez

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151749
Approved by: https://github.com/Skylion007
2025-04-22 03:29:13 +00:00
b7a7741411 Non-deterministic alert in histc_cuda for floating types only (#151701)
The note about atomic add only applies for floating point. The
implementation is deterministic for integer data types.
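
A hedged sketch of the resulting behavior (assumes a CUDA build and that integer inputs are accepted by `histc` on that build):
```python
# With deterministic algorithms enabled, histc on CUDA should alert only for
# floating-point inputs; the integer path is deterministic and should pass.
import torch

torch.use_deterministic_algorithms(True)

try:
    torch.histc(torch.rand(100, device="cuda"), bins=10)   # floating point -> alert
except RuntimeError as e:
    print("nondeterministic alert:", e)

torch.histc(torch.randint(0, 10, (100,), device="cuda"), bins=10)  # integer -> no alert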

fixes: #151610

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151701
Approved by: https://github.com/ngimel, https://github.com/Skylion007
2025-04-22 03:24:36 +00:00
14e3ffb1ff Deprecate host allocator legacy APIs (#151437)
# Motivation
This PR aims to deprecate the legacy host allocator APIs and recommend that users use the unified `getHostAllocator(device_type)` API, such as:
```cpp
at::getHostAllocator(device_type)->allocate(...);
at::getHostAllocator(device_type)->empty_cache();
at::getHostAllocator(device_type)->record_event(...);
at::getHostAllocator(device_type)->get_stats();
at::getHostAllocator(device_type)->reset_accumulated_stats();
at::getHostAllocator(device_type)->reset_peak_stats();
```

# Additional Context
TODO:
- [ ] Move is_pinned from `AcceleratorHookInterface` to `HostAllocator`
- [ ] Deprecate `getPinnedMemoryAllocator` inside `AcceleratorHookInterface` and recommend using `getHostAllocator` instead.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151437
Approved by: https://github.com/EikanWang, https://github.com/albanD
ghstack dependencies: #151403, #151431
2025-04-22 03:13:24 +00:00
a4fdae5c84 Lift guard checking logic to AOTAutogradCache (#151563)
This somewhat complicated PR does a few things:
- It separates out a lot of the guard checking logic into its own class, GuardedCache[T]
- It adds a new `check_guard_hit` lambda to FXGraphCache._lookup_graph, which allows callers to define their own guard checking logic
- It then uses these two combined parts to lift guard checking to AOTAutogradCache. This means that AOTAutogradCache stores its own guard expressions and evaluates them.
- FXGraphCache's guard checking logic is completely unchanged, just refactored. As part of the work, I'm able to extend a bit of the logging functionality of AOTAutogradCache into FXGraphCache, so that you can know if FXGraphCache missed due to a guard failure or a full cache miss.

# Why do this?
Lifting guards to AOTAutogradCache has a few benefits:
- First, it fixes a long standing bug in guard checking logic. Backward passes can have different symint inputs than forward passes depending on forward output, if AOTAutograd chooses to store symints for the backward. These symint inputs have the same underlying symbols as the forward, but on AOTAutogradCache hit, we don't have access to the hints backing these exact symints (we only have hints for the symints on the forward function). By lifting guard checking logic to AOTAutogradCache, we no longer need to check the backward guards, as they'll be included in the AOTAutogradCache guard expression. **I've added a unit test that failed before my diff, and now passes, as an example of this**
- Secondly, this is the first step necessary to bundle CompiledFxGraph into AOTAutogradCache. Doing so will simplify our cache logic significantly, and also make precompile logic simpler, as precompiles will only need to store AOTAutogradCacheEntrys, without needing to match them up with inductor FXGraphCache entries.
- Finally, adding guard checking logic to AOTAutogradCache may allow us in the future to handle more complicated cases like a single forward with multiple backwards, as guard checks are now storable on the cache entry itself.

# Guard checking logic of AOTAutogradCache
When AOTAutogradCache evaluates guard expressions, it no longer needs to evaluate the forward/backward guards in the FXGraphCacheEntry (since the AOTAutogradCache guard expressions will encompass them). Because of this, we still need a way for AOTAutogradCache to distinguish between multiple FXGraphCache local entries. To do so, AOTAutogradCache stores the guard string from FXGraphCache, which it uses as a second "cache key". It doesn't need to **evaluate** these guards, it just needs to find the cache entry from FXGraphCache that had the same guards as when it was stored.

After this, I will work on putting the FXGraphCache entries directly into AOTAutogradCache. If I can put CompiledFxGraphs in the cache directly, I no longer need this complicated `check_guard_hit` overriding logic.
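
For intuition, here is a generic, heavily simplified sketch of the guard-checked lookup pattern described above; the names (`GuardedEntry`, `check_guard_hit`, `lookup`) are illustrative and are not the actual PyTorch internals:
```python
# Generic sketch of a guard-checked cache lookup: each entry carries the guard
# expression saved at store time, and the caller supplies the logic that decides
# whether those guards hold for the current inputs.
from dataclasses import dataclass
from typing import Callable, Optional

@dataclass
class GuardedEntry:
    guard_expr: str   # symbolic guard expression recorded when the entry was stored
    payload: object   # the cached compiled artifact

def lookup(
    entries: list[GuardedEntry],
    check_guard_hit: Callable[[str], bool],
) -> Optional[object]:
    for entry in entries:
        if check_guard_hit(entry.guard_expr):  # guard failure vs. full miss can be logged separately
            return entry.payload
    return None
```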

## Test Plan
Added a new unit test. There are comprehensive guard checking unit tests in `test_aot_autograd_cache` already, and those pass.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151563
Approved by: https://github.com/oulgen
2025-04-22 03:01:08 +00:00
40cf49d460 Revert "[Intel GPU] Allow XPU backend in Depthwise_conv2d&3d operators (#149114)"
This reverts commit 08831f30bbe745cd9f0c07d1868583a68f613514.

Reverted https://github.com/pytorch/pytorch/pull/149114 on behalf of https://github.com/guangyey due to CI is broken ([comment](https://github.com/pytorch/pytorch/pull/149114#issuecomment-2819890341))
2025-04-22 02:22:42 +00:00
a02eae8142 [dynamic shapes] guard_or_false for _reshape_view_helper, utils._infer_size for wildcard dims (#150127)
For reshape/view: removes fast paths for 0 elements, checking dimensions to skip. Modifies the loop accumulating input elements, to raise a UserError if we run out of dimensions, graph breaking for compile and erroring out for export.
For infer_size: assumes that if the user passes us an unbacked symint, it's probably not -1

Will think about changes in https://docs.google.com/document/d/1WYx6EZwVDXtBnWyrzoecgGWdiK0V3XZKftfpWwQ5i3E/edit?tab=t.0#heading=h.22k54zym11qp in a later PR

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150127
Approved by: https://github.com/laithsakka
2025-04-22 01:14:15 +00:00
80a3877b3d [easy] Fix test_dynamo_timed (#151816)
Summary: The structured logging counter is a global that might have been affected by earlier tests. Clear it explicitly.
Fixes #148093

Test Plan: `pytest test/dynamo/test_utils.py`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151816
Approved by: https://github.com/ppanchalia
2025-04-22 00:12:31 +00:00
b3b1616560 Add explict type info in the try-catch for dynamo logging (#151733)
Differential Revision: D73295871

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151733
Approved by: https://github.com/hl475
2025-04-21 23:29:10 +00:00
a35e73b91f [c10] add #pragma once to leftright (#151710)
Summary: I am getting duplicate definitions when including this in a binary that already includes the dispatcher.

Test Plan: CI

Differential Revision: D73237748

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151710
Approved by: https://github.com/georgiaphillips
2025-04-21 23:18:49 +00:00
99aeee2c5f [Inductor] Add Additional Configs for persistent+TMA version of Triton mm and addmm (#150587)
Summary:
This PR introduces additional autotuning configurations for the persistent+TMA version of Triton `mm` and `addmm` operations. The new configurations are as follows:
* `(128, 128, 64, 5, 8)`
* `(256, 128, 64, 4, 8)`
* `(128, 128, 64, 5, 4)`

These configurations were selected based on exhaustive autotuning performed on commonly used shapes from an internal foundational model.

While these new configs are generally more performant across the board, we see notable gains in a few specific cases:
* In scenarios where `n >> m, k`, the configurations `(128, 128, 64, 5, 8)` and `(256, 128, 64, 4, 8)` tend to produce an additional 5-10% speedup over the aten baseline compared to the original configurations.
* Similarly, the configuration `(128, 128, 64, 5, 4)` yields approximately an 8% improvement in scenarios where `k >> m, n`.

These enhancements are expected to provide performance benefits across diverse use cases, particularly when compared to the original set of configurations.
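
For orientation, a hedged reading of these tuples, assuming the usual inductor mm-template field order of `(BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps)` (the field order is an assumption, not stated in this commit):
```python
# Hedged sketch: the new config tuples expressed with named fields, assuming the
# field order (BLOCK_M, BLOCK_N, BLOCK_K, num_stages, num_warps).
from collections import namedtuple

MMConfig = namedtuple("MMConfig", "block_m block_n block_k num_stages num_warps")

new_persistent_tma_configs = [
    MMConfig(128, 128, 64, 5, 8),
    MMConfig(256, 128, 64, 4, 8),
    MMConfig(128, 128, 64, 5, 4),
]
```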

Test Plan:
contbuild & OSS CI

Reviewers: paulzhan

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150587
Approved by: https://github.com/PaulZhang12, https://github.com/drisspg, https://github.com/eellison
2025-04-21 23:18:33 +00:00
4d78e19365 reroute index to fast implementation for indexing on 0th dimension (#151753)
Per title, improve `x[index]` CUDA perf for the common case of indexing along the first dim, using a vectorized gather kernel.
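
A minimal sketch of the access pattern this targets (assumes a CUDA device):
```python
# Advanced indexing along dim 0 on CUDA -- the common case this commit routes to a
# vectorized gather kernel.
import torch

x = torch.randn(100_000, 256, device="cuda")
idx = torch.randint(0, x.shape[0], (4096,), device="cuda")
y = x[idx]                      # indexing on the 0th dimension
assert y.shape == (4096, 256)
```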

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151753
Approved by: https://github.com/eqy
2025-04-21 23:15:30 +00:00
01f1cc44cb Rename register_fake_profile to unsafe_generate_fake_kernels (#151797)
Fixes https://docs.google.com/document/d/1BZsuUR1zJ-52Y7wP4yWX8beB4dwYbgdu5o1qKam_iWg/edit?disco=AAABiJdX1XU
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151797
Approved by: https://github.com/zou3519
2025-04-21 23:08:15 +00:00
efdcc981d0 Back out "Do not propagate real tensor in extern kernel" (#151813)
Summary:
D73002775 breaks aot_compile for many draft exported models on PT2I dashboard. Revert.

Example error msg:

```
OrderedSet([]) >= OrderedSet([u1185, u1186, u1187]) (inductor >= fx)
fx node is: %embedding_bag_byte_prepack : [num_users=4] = call_function[target=torch.ops.quantized.embedding_bag_byte_prepack.default](args = (%view_10,), kwargs = {})
new operations are:
```

Differential Revision: D73381032

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151813
Approved by: https://github.com/angelayi, https://github.com/zou3519
2025-04-21 22:54:03 +00:00
79a9447f0e FlexAttention add decorator for large test cases (#151459)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151459
Approved by: https://github.com/Skylion007
2025-04-21 22:53:13 +00:00
6ea2e6a2d2 Do not do proper const fold during tensorify_python_scalars (#151494)
Chatting with Bob, the goal of this is to const-fold the floats that were tensorified by calling
guard_scalar(val) on them and then replacing their usages with their values.
Hence we do not need to do this for nodes with no float symbols.

We do not want to do proper const folding because we need to preserve statements that deferred
runtime asserts depend on (see the added test).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151494
Approved by: https://github.com/bobrenjc93
2025-04-21 22:39:50 +00:00
cd1317f92f [export] suggest dynamic re-export in input constraints hook (#151624)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151624
Approved by: https://github.com/angelayi
2025-04-21 22:29:46 +00:00
c312d8c501 [Dynamo] Clean up old torch function flag (#149711)
This is tracked via `SymbolicTorchFunctionState` now.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149711
Approved by: https://github.com/StrongerXi, https://github.com/anijain2305
2025-04-21 21:33:58 +00:00
25a11850e9 [symmem] Add some code comments to rendezvous code (#151716)
While reading and learning the rendezvous code, I just want to add some comments to explain the code.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151716
Approved by: https://github.com/kwen2501
2025-04-21 20:45:39 +00:00
352019bf9e [BE]: Better cleanup optimized code from #151474 (#151794)
This change addresses the first/second time/mem "spike" observed in https://github.com/pytorch/pytorch/issues/151351. Improves on #151474 by removing unnecessary stride calculations and unused arguments to the helper function.

Fixes https://github.com/pytorch/pytorch/issues/151351
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151794
Approved by: https://github.com/albanD, https://github.com/eqy
2025-04-21 20:32:11 +00:00
1f0d764b65 stage 2 of deprecating silent fallback of tuning gemm (#148622)
context: https://github.com/pytorch/pytorch/issues/147479

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148622
Approved by: https://github.com/eellison
ghstack dependencies: #151506
2025-04-21 20:14:34 +00:00
02cecd1018 [inductor][test] Skip triton tests for MPS as well, also change reason for skipping SM89 to not IS_BIG_GPU (#151506)
Differential Revision:
[D73162091](https://our.internmc.facebook.com/intern/diff/D73162091/)

Combining / improving https://github.com/pytorch/pytorch/pull/150485 and https://github.com/pytorch/pytorch/pull/150343

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151506
Approved by: https://github.com/ColinPeppler
2025-04-21 20:14:34 +00:00
191b0237a6 Added to docs for out_dtype arg in torch gemms (#151704)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151704
Approved by: https://github.com/bdhirsh
2025-04-21 20:09:17 +00:00
1a6effc5d8 [torch] Expose PCI info from CUDA device (#151672)
Summary:
PR #125083 added CUDA device UUID info, but due to the Meta-internal [version of ROCM the code was excluded](https://github.com/pytorch/pytorch/pull/125083?fbclid=IwY2xjawJvLnNleHRuA2FlbQIxMQABHlY55crrkTqWBWTsr2HVfuqnZ3R1GHR3o9Kf1o3h3uvyawEmCEdhdT48iY1P_aem_8tfrGrWE9SxFYasGfH8kCQ#issuecomment-2103315320).

This change will ensure the Meta-internal code is built and PCI info is available.

Test Plan: pass CI

Differential Revision: D73253426

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151672
Approved by: https://github.com/Skylion007
2025-04-21 19:55:19 +00:00
2fb1326483 Add dates to pages (#151602)
re: #150873
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151602
Approved by: https://github.com/albanD
2025-04-21 19:53:55 +00:00
b7c7000728 Ensure runners have the required prefix (#151815)
Clone changes from https://github.com/pytorch/pytorch/pull/151696/ since that PR wouldn't merge
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151815
Approved by: https://github.com/seemethere
2025-04-21 19:09:17 +00:00
9680016bcf [MergeBot] Update PullRequestResolved Regex (#151814)
By copying an updated one from cff091f3f3

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151814
Approved by: https://github.com/izaitsevfb, https://github.com/albanD
2025-04-21 19:02:05 +00:00
d79144da52 [BE] Move aarch64 docker build to larger node (#151808)
They happen once a week or so, not sure why it needs to be on the slowest machine possible

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151808
Approved by: https://github.com/huydhn, https://github.com/ZainRizvi
2025-04-21 18:54:31 +00:00
fd04c79878 Revert "[aot autograd][logging] Profile large missing gaps in compile time tracing (#151256)"
This reverts commit 8e373592c8be3e28a5f5a774fc1d517aa3dbe8b4.

Reverted https://github.com/pytorch/pytorch/pull/151256 on behalf of https://github.com/Camyll due to breaking internal tests, cannot import ([comment](https://github.com/pytorch/pytorch/pull/151256#issuecomment-2819244186))
2025-04-21 18:49:23 +00:00
f37e138bc4 [MPS] Enable log1p and sigmoid for int64 (#151791)
It works on MacOS-15, but likely will need a skip for MacOS-13
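
A small hedged example of the newly enabled dtype (assumes an MPS device on macOS 15; behavior on older macOS is not assumed here):
```python
# int64 inputs to log1p and sigmoid on MPS, newly supported by this change.
import torch

x = torch.arange(1, 5, device="mps", dtype=torch.int64)
print(torch.log1p(x))
print(torch.sigmoid(x))
```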

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151791
Approved by: https://github.com/Skylion007
ghstack dependencies: #151790
2025-04-21 18:30:04 +00:00
e2b1c06319 [cutlass] Define GELU_taylor<float> only if CUTLASS version is <= 380 (#151702)
Summary:
#buildmore

df8a550d39/include/cutlass/epilogue/thread/activation.h (L610)
was added in v3.9 (not tagged yet)

Test Plan:
mostly ci.

Logic seems same.

Reviewed By: drisspg

Differential Revision: D72615240

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151702
Approved by: https://github.com/Skylion007, https://github.com/eqy
2025-04-21 18:23:46 +00:00
0f8613bf5c Introduce unsafe way to mark functions as cacheable (#151603)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151603
Approved by: https://github.com/jamesjwu
ghstack dependencies: #151768, #151609
2025-04-21 17:37:38 +00:00
67c2869a38 Unpack the output code in the standalone_compile (#151609)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151609
Approved by: https://github.com/zou3519
ghstack dependencies: #151768
2025-04-21 17:37:38 +00:00
287998b87f Run standalone compile tests on cpu/gpu (#151768)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151768
Approved by: https://github.com/zou3519
2025-04-21 17:37:29 +00:00
cea43f721a [Testing] Unskip expm1 log1p for MPS (#151790)
But don't test them for unsupported dtypes (which is float64 for MPS)
- Skip int64 for log1p for now (next PR will fix that)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151790
Approved by: https://github.com/Skylion007
2025-04-21 17:18:47 +00:00
9374064483 Revert "[Easy] Add more check for elapsedTime of torch.xxx.Event and torch.Event (#151404)"
This reverts commit 783be8f93248ca3af24b968bdf84188f5a3257d1.

Reverted https://github.com/pytorch/pytorch/pull/151404 on behalf of https://github.com/malfet due to suspected of breaking linux builds and breaks internal tests as well ([comment](https://github.com/pytorch/pytorch/pull/151404#issuecomment-2819041756))
2025-04-21 17:11:53 +00:00
33808f0ebd Revert "[Easy] The event_id of torch.cuda.Event and torch.xpu.Event always is 0 (#151226)"
This reverts commit 8e5fefedf4af3f31ccd05290c1b21eedf6a4ad1b.

Reverted https://github.com/pytorch/pytorch/pull/151226 on behalf of https://github.com/malfet due to Reverting to unblock revert of https://github.com/pytorch/pytorch/pull/151404 ([comment](https://github.com/pytorch/pytorch/pull/151226#issuecomment-2819030735))
2025-04-21 17:07:49 +00:00
515a0f606b [ez] fix typo in comment (#151755)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151755
Approved by: https://github.com/Skylion007
2025-04-21 14:52:39 +00:00
2eacdb91c3 Add OIDC permissions to xpu workflow (#151455)
The reusable workflow requires OIDC authentication to work and is already configured via its only caller, xpu.yml; however, it is also set here to clarify that it is required. This setting also flags jobs that call this workflow without the required permissions, to remind them that it needs to be set.

JWT ID token requires `id-token: write` permissions as documented here https://docs.github.com/en/actions/security-for-github-actions/security-hardening-your-deployments/configuring-openid-connect-in-cloud-providers#adding-permissions-settings

Ref: pytorch-fdn/multicloud-ci-infra#3

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151455
Approved by: https://github.com/chuanqi129, https://github.com/atalman
2025-04-21 14:39:40 +00:00
bf28d1cafc Expose bicubic mode for torch::nn::functional::grid_sample in LibTorch (#150817)
When bicubic interpolation was added to grid_sampler in #44780, `GridSampleFuncOptions` was not updated to allow a user to use bicubic mode in LibTorch, even though the function could handle it. This PR fixes the parity such that LibTorch's  `torch::nn::functional::grid_sample` behaves the same as PyTorch's `torch.nn.functional.grid_sample`.

Existing users can directly use `torch::grid_sampler` but must know what int to pass for the interpolation (2 for bicubic) and padding mode parameters, which is not ideal.
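
For reference, the Python-side call this change brings LibTorch to parity with (a minimal sketch; shapes chosen for illustration):
```python
# torch.nn.functional.grid_sample already exposes bicubic mode; the LibTorch options
# struct now allows the same selection without passing raw integers.
import torch
import torch.nn.functional as F

inp = torch.randn(1, 1, 8, 8)
grid = torch.rand(1, 4, 4, 2) * 2 - 1   # normalized sampling locations in [-1, 1]
out = F.grid_sample(inp, grid, mode="bicubic", padding_mode="zeros", align_corners=False)
print(out.shape)                         # torch.Size([1, 1, 4, 4])
```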

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150817
Approved by: https://github.com/Skylion007
2025-04-21 08:55:27 +00:00
2a9afdae81 [Benchmarking] Add sam and stable_diffusion to MPS benchmarked models (#151748)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151748
Approved by: https://github.com/Skylion007, https://github.com/dcci
ghstack dependencies: #151747
2025-04-21 05:51:46 +00:00
f7ddc5125e [Easy] Fix the compilation warning of BlasKernel. (#151736)
As the title stated.

Change Before:
```C++
[2/21] Building CXX object caffe2/CMakeFiles/torch_cpu.dir/__/aten/src/ATen/native/BlasKernel.cpp.o
/root/Git.d/pytorch/pytorch/aten/src/ATen/native/BlasKernel.cpp:346:6: warning: ‘void at::native::blas_impl::gemv_fast_path(const char*, const int*, const int*, const scalar_t*, const scalar_t*, const int*, const scalar_t*, const int*, const scalar_t*, scalar_t*, const int*) [with scalar_t = c10::Half]’ defined but not used [-Wunused-function]
  346 | void gemv_fast_path<at::Half>(
      |      ^~~~~~~~~~~~~~~~~~~~~~~~
/root/Git.d/pytorch/pytorch/aten/src/ATen/native/BlasKernel.cpp:329:6: warning: ‘bool at::native::blas_impl::gemv_use_fast_path(char, int64_t, int64_t, scalar_t, int64_t, int64_t, scalar_t, int64_t) [with scalar_t = c10::Half]’ defined but not used [-Wunused-function]
  329 | bool gemv_use_fast_path<at::Half>(
      |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
/root/Git.d/pytorch/pytorch/aten/src/ATen/native/BlasKernel.cpp:301:6: warning: ‘void at::native::blas_impl::gemv_fast_path(const char*, const int*, const int*, const scalar_t*, const scalar_t*, const int*, const scalar_t*, const int*, const scalar_t*, scalar_t*, const int*) [with scalar_t = c10::BFloat16]’ defined but not used [-Wunused-function]
  301 | void gemv_fast_path<at::BFloat16>(
      |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
/root/Git.d/pytorch/pytorch/aten/src/ATen/native/BlasKernel.cpp:273:6: warning: ‘bool at::native::blas_impl::gemv_use_fast_path(char, int64_t, int64_t, scalar_t, int64_t, int64_t, scalar_t, int64_t) [with scalar_t = c10::BFloat16]’ defined but not used [-Wunused-function]
  273 | bool gemv_use_fast_path<at::BFloat16>(
      |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151736
Approved by: https://github.com/shink, https://github.com/Skylion007
2025-04-21 03:31:46 +00:00
8eb21dffa9 consolidate ATen/test/dispatch_key_set_test.cpp with rest of DispatchKeySet tests (#151697)
Doesn't seem to be a reason to have two test files for this.

Differential Revision: [D73274020](https://our.internmc.facebook.com/intern/diff/D73274020/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151697
Approved by: https://github.com/Skylion007
ghstack dependencies: #151626, #151627, #151628, #151629, #151630
2025-04-21 02:58:12 +00:00
9c2ac2b876 [pytorch][triton] Enable warp spec for FlexAttention kernel (#150470)
Summary:
Given inductor support for warp-specialization for `TritonTemplateKernel`, this change adds:
- num_consumer_groups
- num_buffers_warp_spec

to the flexattention template generated by inductor in `torch.compile`.

NOTE: Currently default config doesn't enable warp-spec and needs explicit args for num_consumer_groups, num_buffers_warp_spec in the kernel options to enable.

Test Plan:
### Functional Testing
```Py
import torch
from torch.nn.attention.flex_attention import flex_attention
from triton.testing import do_bench
make_tensor = lambda: torch.rand(8, 16, 8192, 128, device="cuda", dtype=torch.bfloat16)
q, k, v = make_tensor(), make_tensor(), make_tensor()
flex_compiled = torch.compile(flex_attention, fullgraph=True)
print(do_bench(lambda: flex_compiled(q, k, v, kernel_options={"num_warps": 4, "num_consumer_groups": 2,
                "num_buffers_warp_spec": 3,})))
```
- (best config) without WS: 11.06
- with WS: 9.35

Differential Revision: D70501880

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150470
Approved by: https://github.com/drisspg
2025-04-21 02:00:55 +00:00
fc2dd6d408 [Inductor] Update should_decompose_mm condition for CPU (#151730)
Summary:
Similar to what we did previously in D70033166

Previously, for cpu we decompose addmm if
```
check_device(mat1, mat2, device="cpu")
        and statically_known_true(mat1.shape[0] == 1)
        and statically_known_true(mat2.shape[0] <= 64)
        and statically_known_true(mat2.shape[1] <= 512)
```
We have a new case where `mat1.shape[0] = 80`, and benchmarks show that it will be beneficial if we decompose, so update the condition to
```
check_device(mat1, mat2, device="cpu")
        and statically_known_true(mat1.shape[0] == 1)
        and statically_known_true(mat2.shape[0] <= 128)
        and statically_known_true(mat2.shape[1] <= 512)
```

Differential Revision: D73292985

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151730
Approved by: https://github.com/kflu, https://github.com/houseroad
2025-04-21 01:56:47 +00:00
470132c6a1 [MPS] Add support for hermite_polynomial_he (inductor/eager). (#151754)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151754
Approved by: https://github.com/malfet, https://github.com/jansel
2025-04-20 17:44:40 +00:00
c3a7278278 Use more efficient row/col computation (#151474)
This change addresses the first/second time/mem "spike" observed in

https://github.com/pytorch/pytorch/issues/151351

Fixes #151351

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151474
Approved by: https://github.com/eqy, https://github.com/amjames, https://github.com/Skylion007
2025-04-20 16:02:19 +00:00
6b45b6e6c9 run lintrunner for Export d68846308 (#151725)
fixes broken lint tests in https://github.com/pytorch/pytorch/pull/151481

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151725
Approved by: https://github.com/exclamaforte, https://github.com/Skylion007

Co-authored-by: Gabriel Ferns <gabeferns@meta.com>
Co-authored-by: Aaron Gokaslan <aaronGokaslan@gmail.com>
2025-04-20 14:58:17 +00:00
a40e876b08 Support fp8 dtypes in assert_close (#150002)
Fixes #135998

Adds support for fp8. These are compared bitwise, without atol and rtol. The implementation uses the same comparison functions, just with atol and rtol forced to zero. The error message is different from the default case; it only tells the user the first mismatch. This is to avoid triggering the error from #135998.
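
A hedged sketch of the new behavior (dtype availability depends on the build; `float8_e4m3fn` is used here only for illustration):
```python
# fp8 tensors are compared bitwise: identical bits pass, any mismatch fails and the
# error reports the first mismatching element.
import torch

a = torch.tensor([0.5, 1.0, 2.0]).to(torch.float8_e4m3fn)
torch.testing.assert_close(a, a.clone())            # passes

b = torch.tensor([0.5, 1.0, 2.5]).to(torch.float8_e4m3fn)
try:
    torch.testing.assert_close(a, b)                # bit mismatch -> fails
except AssertionError as e:
    print(e)
```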

Test Plan:
New unit test covers new code paths.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150002
Approved by: https://github.com/cyyever, https://github.com/zou3519
2025-04-20 01:24:21 +00:00
48761e9737 Revert "[Easy] Fix the function signature of torch.Event (#151221)"
This reverts commit 92baeecbdd3fb717880485e529df4efb02627c9d.

Reverted https://github.com/pytorch/pytorch/pull/151221 on behalf of https://github.com/malfet due to This broke rocm tests, see 92baeecbdd (40818271233-box) ([comment](https://github.com/pytorch/pytorch/pull/151221#issuecomment-2816883409))
2025-04-19 22:06:24 +00:00
c4482565cc Revert "[Easy][torch.Event] Fix and improve the docs of torch.Event (#151411)"
This reverts commit 1e1d0a4be63b354f762ee21bdccec03c1e5b371c.

Reverted https://github.com/pytorch/pytorch/pull/151411 on behalf of https://github.com/malfet due to This broke rocm tests, see 92baeecbdd (40818271233-box) ([comment](https://github.com/pytorch/pytorch/pull/151221#issuecomment-2816883409))
2025-04-19 22:06:24 +00:00
9b74ea2490 [Benchmarking] Run MPS benchmarks for [b]float16 (#151747)
And implicitly pass `--float32` when collecting results for "notset" option. Speedups for some models are much higher for float16 dtype, but it's important to track accuracy
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151747
Approved by: https://github.com/Skylion007
2025-04-19 16:40:08 +00:00
ed511cd537 [Testing] Make test_add_complex3 run on different devices (#151732)
By constructing tensor on that device, because it does not call `self.common` but rather executes test directly.

Otherwise `test_add_complex3_mps` will test CPU inductor, rather than MPS one

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151732
Approved by: https://github.com/dcci
2025-04-19 14:29:13 +00:00
483e61bfec [BE][Easy]: Simplify reversed call in graph matcher (#151674)
Another list() call around reversed() that is no longer necessary, since ItemsView supports reversed().
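
A minimal illustration of the simplification (since Python 3.8, dict views support `reversed()` directly):
```python
d = {"a": 1, "b": 2, "c": 3}

# before: for k, v in reversed(list(d.items())): ...
for k, v in reversed(d.items()):   # no intermediate list needed
    print(k, v)                    # c 3, b 2, a 1
```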

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151674
Approved by: https://github.com/albanD
2025-04-19 14:14:31 +00:00
68f748a992 Revert "[Testing] Make test_add_complex3 run on different devices (#151732)"
This reverts commit 414ce713fb329b20f93002fa4ffd6bb23bc3b93b.

Reverted https://github.com/pytorch/pytorch/pull/151732 on behalf of https://github.com/malfet due to It breaks MacOS-13 ([comment](https://github.com/pytorch/pytorch/pull/151732#issuecomment-2816690571))
2025-04-19 12:35:41 +00:00
1e1d0a4be6 [Easy][torch.Event] Fix and improve the docs of torch.Event (#151411)
**Changes:**
- add detailed function or class signature
- fix the wrong display of torch.Event.wait and torch.Event.record
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151411
Approved by: https://github.com/albanD
ghstack dependencies: #151226, #151221
2025-04-19 12:21:02 +00:00
92baeecbdd [Easy] Fix the function signature of torch.Event (#151221)
As the title stated.

The difference between the declaration and the implementation.
declaration:
d5a19e4525/torch/_C/__init__.pyi.in (L157-L162)

Implementation:
d5a19e4525/torch/csrc/Event.cpp (L30-L32)

**Question**: Which one should we choose?
- Change enable_timing to False to be consistent with torch.cuda.Event
- Change enable_timing to True to avoid BC-break
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151221
Approved by: https://github.com/albanD
ghstack dependencies: #151226
2025-04-19 11:56:37 +00:00
8e5fefedf4 [Easy] The event_id of torch.cuda.Event and torch.xpu.Event always is 0 (#151226)
Although torch.cuda.Event and torch.xpu.Event have cuda_event and sycl_event fields respectively, the event_id exposed from the base class torch.Event is always 0, which can confuse users.

The memory of torch.Event is not useful to torch.cuda.Event and torch.xpu.Event, but we still need to inherit from torch.Event because CPython will check it.

Repro with cuda:
```
>>> import torch
>>> event = torch.cuda.Event()
>>> event.cuda_event
0
>>> event.event_id
0
>>> event.record()
>>> event.cuda_event
127982096
>>> event.event_id
0
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151226
Approved by: https://github.com/albanD
2025-04-19 10:42:00 +00:00
92d0c40c49 Revert "Cache the value of torch_key in subproc (#151057)"
This reverts commit 5f5805a6ac44179520291b2aa6e18d286dc93669.

Reverted https://github.com/pytorch/pytorch/pull/151057 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/151057#issuecomment-2816614510))
2025-04-19 08:48:12 +00:00
f6c1cf04b5 [ROCm][TunableOp] Support submatrices in offline tuning (#151138)
This PR adds support for submatrices in offline tuning for:
- GEMM
- GEMM and bias
- ScaledGEMM
- Batch Strided GEMM

New UTs to cover submatrices. Submatrices for the strided batch API are not part of this PR and will be done separately.

There is also a bug fix for offline tuning for full matrix for GEMM and bias in the `NT` case. Offline and online UTs were updated to cover this corner case.

To improve code readability, swapped definition of transA and transB.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151138
Approved by: https://github.com/jeffdaily
2025-04-19 04:14:27 +00:00
2673ea4131 Add api to enable/disable NaN detector per-PG (#151723)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151723
Approved by: https://github.com/kwen2501, https://github.com/fduwjj
2025-04-19 03:55:25 +00:00
414ce713fb [Testing] Make test_add_complex3 run on different devices (#151732)
By constructing tensor on that device, because it does not call `self.common` but rather executes test directly.

Otherwise `test_add_complex3_mps` will test CPU inductor, rather than MPS one

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151732
Approved by: https://github.com/dcci
2025-04-19 03:14:46 +00:00
6261db7719 Revert "inductor.config.descriptive_names = False is not actually supported (#145523) (#146051) (#151481)"
This reverts commit cfc4d74b0c9a0d21debbebb41e1dfa4dd2acf2a0.

Reverted https://github.com/pytorch/pytorch/pull/151481 on behalf of https://github.com/malfet due to It indeed breaks lint, it followup PR contains it's own issues ([comment](https://github.com/pytorch/pytorch/pull/151481#issuecomment-2816490764))
2025-04-19 03:12:56 +00:00
843e4d11ba [Benchmarking] Enable HF_GPT2 benchmarking on Metal (#151721)
By building wheel with USE_DISTRIBUTED=1

Otherwise, an attempt to run
```
python3 benchmarks/dynamo/torchbench.py --performance --only hf_T5 --backend inductor --inference --devices mps
```
will fail with
```
  File "/Users/nshulga/Library/Python/3.10/lib/python/site-packages/transformers/modeling_utils.py", line 40, in <module>
    import torch.distributed.tensor
  File "/Users/nshulga/git/pytorch/pytorch/torch/distributed/tensor/__init__.py", line 4, in <module>
    import torch.distributed.tensor._ops  # force import all built-in dtensor ops
  File "/Users/nshulga/git/pytorch/pytorch/torch/distributed/tensor/_ops/__init__.py", line 2, in <module>
    from ._conv_ops import *  # noqa: F403
  File "/Users/nshulga/git/pytorch/pytorch/torch/distributed/tensor/_ops/_conv_ops.py", line 5, in <module>
    from torch.distributed.tensor._dtensor_spec import DTensorSpec, TensorMeta
  File "/Users/nshulga/git/pytorch/pytorch/torch/distributed/tensor/_dtensor_spec.py", line 6, in <module>
    from torch.distributed.tensor.placement_types import (
  File "/Users/nshulga/git/pytorch/pytorch/torch/distributed/tensor/placement_types.py", line 8, in <module>
    import torch.distributed._functional_collectives as funcol
  File "/Users/nshulga/git/pytorch/pytorch/torch/distributed/_functional_collectives.py", line 9, in <module>
    import torch.distributed.distributed_c10d as c10d
  File "/Users/nshulga/git/pytorch/pytorch/torch/distributed/distributed_c10d.py", line 23, in <module>
    from torch._C._distributed_c10d import (
ModuleNotFoundError: No module named 'torch._C._distributed_c10d'; 'torch._C' is not a package
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151721
Approved by: https://github.com/wdvr, https://github.com/dcci, https://github.com/huydhn
2025-04-19 02:57:03 +00:00
cfc4d74b0c inductor.config.descriptive_names = False is not actually supported (#145523) (#146051) (#151481)
Summary:

This config is not supported (it throws an error when set), and doesn't really make sense imo.

Approved by: https://github.com/eellison

Test Plan: contbuild & OSS CI, see edf266e9bb

Reviewed By: masnesral

Differential Revision: D68846308

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151481
Approved by: https://github.com/masnesral
2025-04-19 01:13:35 +00:00
adf5f38eae Don't specialize min/max (#151347)
address https://github.com/pytorch/pytorch/issues/149635
Differential Revision: [D73041489](https://our.internmc.facebook.com/intern/diff/D73041489/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151347
Approved by: https://github.com/bobrenjc93
2025-04-19 00:11:15 +00:00
359e1d517c [Profiler] Remove Decref From Python Context (#151625)
Summary: When running the on-demand profiler with stacks, the decref causes a segfault. I tried checking the refcount and the object itself and they both look fine, but it still segfaults every time. Let's remove it for now and revisit.

This will induce a small memory leak, but it should be small enough that it does not create any significant impact on the jobs run.

Test Plan:
Removed decref and got clean traces
https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree/traces/dynocli/0/1744933624/localhost/libkineto_activities_2936811.json.gz&bucket=gpu_traces

Differential Revision: D73225468

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151625
Approved by: https://github.com/davidberard98
2025-04-18 23:55:19 +00:00
e48189cf03 Don't eagerly create AliasInfo in parseAliasDeclaration (#151630)
No need to create an AliasInfo...unless we need it.

Differential Revision: [D73129452](https://our.internmc.facebook.com/intern/diff/D73129452/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151630
Approved by: https://github.com/Skylion007, https://github.com/malfet
ghstack dependencies: #151626, #151627, #151628, #151629
2025-04-18 22:51:37 +00:00
cac8d35503 Use fmt::format for debug strings in Library init (#151629)
Observed several ms taken during `import torch` by c10::str here.

Differential Revision: [D73129453](https://our.internmc.facebook.com/intern/diff/D73129453/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151629
Approved by: https://github.com/cyyever, https://github.com/Skylion007, https://github.com/albanD, https://github.com/malfet
ghstack dependencies: #151626, #151627, #151628
2025-04-18 22:51:37 +00:00
313ceb4da3 Reserve vector in StringCordView ctor (#151628)
Clearly a missing reserve (we should expect that pieces are not empty).

Differential Revision: [D73129445](https://our.internmc.facebook.com/intern/diff/D73129445/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151628
Approved by: https://github.com/Skylion007, https://github.com/malfet
ghstack dependencies: #151626, #151627
2025-04-18 22:51:29 +00:00
704a504e8a Reserve vectors in FunctionSchema::cloneWithRealTypes (#151627)
1) reserving is much better than not reserving
2) std::transform for a 1-line-body loop is generally not considered to be an improvement (and doesn't seem to get boiled away by clang under -Oz)

Differential Revision: [D73013363](https://our.internmc.facebook.com/intern/diff/D73013363/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151627
Approved by: https://github.com/Skylion007, https://github.com/malfet
ghstack dependencies: #151626
2025-04-18 22:51:23 +00:00
fc7d493908 Overload Library::def rather than templating it (#151626)
It ends up being templated over a bunch of reference-to-array-of-characters types with different lengths, such as `char const (&) [88]`, which is an annoyance when profiling and possibly a source of code bloat.

Differential Revision: [D73129450](https://our.internmc.facebook.com/intern/diff/D73129450/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151626
Approved by: https://github.com/Skylion007, https://github.com/malfet
2025-04-18 22:51:16 +00:00
97d97aef24 Revert "[dynamic shapes] guard_or_false for _reshape_view_helper, utils._infer_size for wildcard dims (#150127)"
This reverts commit 1dd2033c0a1de460ee2bad8d64c36a0344886071.

Reverted https://github.com/pytorch/pytorch/pull/150127 on behalf of https://github.com/clee2000 due to maybe caused export test to fail? export/test_draft_export.py::TestDraftExport::test_masked_linear [GH job link](https://github.com/pytorch/pytorch/actions/runs/14538768138/job/40794985504) [HUD commit link](1dd2033c0a), bad TD ([comment](https://github.com/pytorch/pytorch/pull/150127#issuecomment-2816232086))
2025-04-18 21:38:47 +00:00
bd77c3e054 [easy] Update test/dynamo/test_structured_trace.py (#151606)
Summary: test/dynamo/test_structured_trace.py is out of date because of some new fields. (I guess the test is disabled?). Bring it up to date.

Test Plan: `python test/dynamo/test_structured_trace.py`

Fixes #149671

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151606
Approved by: https://github.com/Skylion007
ghstack dependencies: #151599
2025-04-18 21:33:13 +00:00
56d318bfac [ONNX][Eazy] Update onnx program doc formatting and improve robustness (#151623)
- Update docstring list formatting
- Use a try finally block to keep the model unmodified if save() fails.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151623
Approved by: https://github.com/titaiwangms
2025-04-18 21:31:31 +00:00
02dd096e51 [invoke_subgraph][fake tensor] Add finalizer on subgraph instead of the functionalize ctx wrapper (#151633)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151633
Approved by: https://github.com/zou3519
ghstack dependencies: #151330, #151256, #151357, #151477
2025-04-18 21:23:21 +00:00
b74be52454 [CUDA][NVTX] Move nvtx3 code from cmake/public/cuda.cmake to cmake/Dependencies.cmake (#151583)
Fixes [#147220]

Context: In the CUDA NVTX world, there are NVTX v2 and NVTX v3. As announced in the CUDA release notes, e.g. [CUDA 12.8 Update 1]( https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#deprecated-or-dropped-operating-systems): "NVTX v2 is deprecated. To migrate to NVTX v3, change your code from `#include <nvtoolsext.h>` to `#include "nvtx3/nvtoolsext.h"`. This header is included in the toolkit."
On the PyTorch side, the TORCH_CUDA_USE_NVTX3 compile-time macro is used, and it is set to true when (most of the time) nvtx3 is found. nvtx3 is found in two cases: 1) USE_SYSTEM_NVTX=0 (the default), where the torch build process automatically looks for nvtx3 in pytorch/third_party/nvtx; this is the most common and default case. 2) USE_SYSTEM_NVTX=1, where nvtx3 is found in the installed CUDA toolkit (e.g. CUDA 12.8 and even some earlier CUDA versions).
As described in #147220, the reason it can find pytorch/third_party/nvtx is because it used
6f035d8462/cmake/public/cuda.cmake (L176)
note the "PROJECT_SOURCE_DIR" usage in [pytorch/cmake/public/cuda.cmake](6f035d8462/cmake/public/cuda.cmake (L176))

Before this PR:
The PyTorch build would succeed in finding nvtx3 due to the process described above, so everything is good. But downstream projects like torchvision *can* fail, and would fail by default, because the following happens:
1) USE_SYSTEM_NVTX=0 is used (and most likely it is, because it is the default)
2) NVTX v2 can no longer be found (e.g. in future CUDA versions, because deprecation would eventually become removal)
3) TorchVision cannot find NVTX3 either, because torchvision invokes [pytorch/cmake/public/cuda.cmake] but PROJECT_SOURCE_DIR is no longer the pytorch source but the torchvision source!
4) One workaround is to set USE_SYSTEM_NVTX=1, but users have to set this explicitly and do the plumbing work

After this PR:
PyTorch can still find nvtx3 because the part of the code that finds nvtx3 is just moved to a new place. The CI logs are showing it being able to find nvtx3. e.g. [this job](https://productionresultssa14.blob.core.windows.net/actions-results/47f8efaa-0afe-4e1f-bc94-0a82629941cb/workflow-job-run-dc8201b1-845b-5da1-a6ea-d3360ce1b508/logs/job/job-logs.txt?rsct=text%2Fplain&se=2025-04-18T20%3A38%3A05Z&sig=yMd6egC%2Banl3lR%2BudXFX18bfUH189z0DTGLtscHQJwY%3D&ske=2025-04-19T06%3A21%3A45Z&skoid=ca7593d4-ee42-46cd-af88-8b886a2f84eb&sks=b&skt=2025-04-18T18%3A21%3A45Z&sktid=398a6654-997b-47e9-b12b-9515b896b4de&skv=2025-01-05&sp=r&spr=https&sr=b&st=2025-04-18T20%3A28%3A00Z&sv=2025-01-05), which reads "`Found nvtx3: C:/actions-runner/_work/pytorch/pytorch/pytorch/third_party/NVTX/c/include`"
Torchvision still invokes [pytorch/cmake/public/cuda.cmake], but that file no longer tries to find nvtx3, since torchvision does not use nvtx3 (if it does in the future, it can set USE_SYSTEM_NVTX=1 by default). So it avoids the error reported in [#147220]

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151583
Approved by: https://github.com/eqy, https://github.com/atalman, https://github.com/malfet
2025-04-18 21:18:09 +00:00
6e7b6e8d57 [c10d][fr] Fix a bug when first rank is not zero in the script (#151683)
Summary: Further testing the script, we found that we shouldn't always assume rank 0 is the first rank, so we need to check all entries and see whether it is a P2P op for this coalesced group.

Test Plan: Directly test with corner case.

Differential Revision: D73266257

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151683
Approved by: https://github.com/fegin
2025-04-18 20:55:06 +00:00
a6e46faff4 Use reusable binary docker build action for manywheel (#151489)
This is part of splitting up https://github.com/pytorch/pytorch/pull/150558 into smaller chunks, please see that for more context

Similar to https://github.com/pytorch/pytorch/pull/151483 but for manywheel

Changed the job name

s390x doesn't have access to AWS ECR, so it doesn't use the action. The manylinuxs390x-builder ECR repo doesn't exist in Docker Hub, so it is unclear why the image name is what it is.

Testing:
Can't really test, since PRs don't have the credentials to push to docker.io, which is the registry used for everything, including PRs, right now.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151489
Approved by: https://github.com/seemethere
2025-04-18 20:38:33 +00:00
b0f26e81a5 Use reusable binary docker build action for libtorch (#151488)
This is part of splitting up https://github.com/pytorch/pytorch/pull/150558 into smaller chunks, please see that for more context

Similar to https://github.com/pytorch/pytorch/pull/151483 but for libtorch

Changed the job name

Testing:
Can't really test, since PRs don't have the credentials to push to docker.io, which is the registry used for everything, including PRs, right now.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151488
Approved by: https://github.com/atalman
2025-04-18 20:37:38 +00:00
88b0553c58 [AMD] Remove fbcode limit for uuid (#151652)
Summary: We're now on a later ROCm version, so it's OK to add uuid back.

Test Plan: sandcastle

Differential Revision: D73240086

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151652
Approved by: https://github.com/Skylion007, https://github.com/ngimel, https://github.com/houseroad
2025-04-18 20:37:09 +00:00
7ffa9000ed Replace perf-nightly-macos with inductor-perf-nightly-macos (#151698)
The name was updated by https://github.com/pytorch/pytorch/pull/151155; without this change, the benchmark results weren't being updated on the dashboard.

For PT2 compiler perf benchmark, we are still relying on this old workflow.  To get rid of this, we need to update PT2 benchmark dashboard to use the new benchmark database (cc @yangw-dev)

The results are there on the new database:

```
SELECT
    *
FROM
    oss_ci_benchmark_v3
WHERE
    workflow_id = 14510035576
```

but not on the old database:

```
SELECT
    *
FROM
    inductor_torch_dynamo_perf_stats
WHERE
    workflow_id = 14510035576
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151698
Approved by: https://github.com/seemethere, https://github.com/atalman
2025-04-18 20:31:36 +00:00
1b267a58a1 Revert "[export] allow partially specifying keys for dynamic shapes dict spec (#151597)"
This reverts commit c8240e3492e4813e822d7265eb3afb7f1168db39.

Reverted https://github.com/pytorch/pytorch/pull/151597 on behalf of https://github.com/clee2000 due to broke some export test export/test_converter.py::TestConverter::test_aten_len [GH job link](https://github.com/pytorch/pytorch/actions/runs/14538615968/job/40792673415) [HUD commit link](c8240e3492), bad TD ([comment](https://github.com/pytorch/pytorch/pull/151597#issuecomment-2816127271))
2025-04-18 20:17:44 +00:00
f20a266512 [easy] Update test/dynamo/test_utils.py (#151599)
Summary: test/dynamo/test_utils.py is out of date because of some new dynamo_timed fields. (I guess the test is disabled?). Bring it up to date

Test Plan: `python test/dynamo/test_utils.py`

Fixes #148093

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151599
Approved by: https://github.com/Skylion007
2025-04-18 18:49:24 +00:00
e434a9152e Revert "[inductor][test] Skip triton tests for MPS as well, also change reason for skipping SM89 to not IS_BIG_GPU (#151506)"
This reverts commit 6246c7d62ca2f091838d5c707e3d932994c5e35a.

Reverted https://github.com/pytorch/pytorch/pull/151506 on behalf of https://github.com/henrylhtsang due to seems to be breaking some rocm mi300 run ([comment](https://github.com/pytorch/pytorch/pull/151506#issuecomment-2815999009))
2025-04-18 18:40:17 +00:00
cccfc146fe [BE][Easy]: Simplify ModuleList reversed method (#151673)
Removes unnecessary list() calls now that we are on Python 3.9 and dict key views implement reversed() directly.
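A small demonstration of the Python behavior this cleanup relies on (illustrative only, not the ModuleList source):
```python
# dict views support reversed() directly since Python 3.8, so wrapping them
# in list() before reversing is redundant.
d = {"0": "a", "1": "b", "2": "c"}
assert list(reversed(d.keys())) == ["2", "1", "0"]
assert list(reversed(d.values())) == ["c", "b", "a"]
```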
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151673
Approved by: https://github.com/albanD
2025-04-18 18:39:32 +00:00
b7807759de Revert "stage 2 of depreate silent fallback of tuning gemm (#148622)"
This reverts commit 181b3883e71b9771e8a3cdaf43d627f68e9f0fa6.

Reverted https://github.com/pytorch/pytorch/pull/148622 on behalf of https://github.com/henrylhtsang due to seems to be breaking some rocm mi300 run ([comment](https://github.com/pytorch/pytorch/pull/148622#issuecomment-2815995105))
2025-04-18 18:37:09 +00:00
b73606dcc5 Add jk for force_disable_caches (#151621)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151621
Approved by: https://github.com/jamesjwu
2025-04-18 18:19:40 +00:00
9ccdeae7db Fix uint view copy (#151598)
Fix for https://github.com/pytorch/pytorch/issues/151156. We have some logic to undo our upcast prior to the dtype bitcast. This PR cleans up that logic using dtypes in codegen.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151598
Approved by: https://github.com/zou3519
ghstack dependencies: #151562
2025-04-18 18:13:39 +00:00
28974a1ec3 Revert "[Easy] Fix the compilation warning of BlasKernel. (#151302)"
This reverts commit 32c79da789af84312a0db2de19211a7c57196ba7.

Reverted https://github.com/pytorch/pytorch/pull/151302 on behalf of https://github.com/malfet due to Breaks builds without OpenMP, see https://github.com/pytorch/pytorch/issues/151680 ([comment](https://github.com/pytorch/pytorch/pull/151302#issuecomment-2815954855))
2025-04-18 18:10:45 +00:00
115a0c6413 add privateuse1 device type to pre forward hook of fsdp (#149487)
add privateuse1 device type to pre forward hook of fsdp

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149487
Approved by: https://github.com/FFFrog, https://github.com/cyyever, https://github.com/shink, https://github.com/albanD
2025-04-18 17:50:23 +00:00
1a48382a4c [Easy] Optimize container.py typing (#151653)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151653
Approved by: https://github.com/albanD
2025-04-18 17:33:43 +00:00
931bd05560 Do not propagate real tensor in extern kernel (#151377)
Summary: See internal Diff for more details.

In ExternKernel, the FakeTensors do not have associated real tensors, because they are just created from ir.Node's shape and stride.

Test Plan:
```
buck run fbcode//mode/dev-nosan //caffe2/test/inductor:test_aot_inductor -- -r aoti_data_dependent_ex

buck2 run mode/dev-nosan  fbcode//caffe2/test/inductor:aot_inductor_arrayref_cpu -- -r data_dependent_extern_kernel_op
```

Differential Revision: D73002775

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151377
Approved by: https://github.com/angelayi
2025-04-18 17:28:13 +00:00
181b3883e7 stage 2 of depreate silent fallback of tuning gemm (#148622)
context: https://github.com/pytorch/pytorch/issues/147479

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148622
Approved by: https://github.com/eellison
ghstack dependencies: #151506
2025-04-18 17:26:16 +00:00
6246c7d62c [inductor][test] Skip triton tests for MPS as well, also change reason for skipping SM89 to not IS_BIG_GPU (#151506)
Differential Revision:
[D73162091](https://our.internmc.facebook.com/intern/diff/D73162091/)

Combining / improving https://github.com/pytorch/pytorch/pull/150485 and https://github.com/pytorch/pytorch/pull/150343

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151506
Approved by: https://github.com/ColinPeppler
2025-04-18 17:26:16 +00:00
1dd2033c0a [dynamic shapes] guard_or_false for _reshape_view_helper, utils._infer_size for wildcard dims (#150127)
For reshape/view: removes fast paths for 0 elements, checking dimensions to skip. Modifies the loop accumulating input elements, to raise a UserError if we run out of dimensions, graph breaking for compile and erroring out for export.
For infer_size: assumes if user passes us an unbacked, it's probably not -1

Will think about changes in https://docs.google.com/document/d/1WYx6EZwVDXtBnWyrzoecgGWdiK0V3XZKftfpWwQ5i3E/edit?tab=t.0#heading=h.22k54zym11qp in a later PR

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150127
Approved by: https://github.com/laithsakka
2025-04-18 17:05:11 +00:00
c8240e3492 [export] allow partially specifying keys for dynamic shapes dict spec (#151597)
Fixes #148564

Should help with exporting HF-style models, so users don't have to specify 100 Nones
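A hedged sketch of the intended usage; the exact semantics are an assumption drawn from the PR title, with the idea that only inputs that actually have dynamic dims need an entry:
```python
import torch
from torch.export import Dim, export

class M(torch.nn.Module):
    def forward(self, x, y):
        return x + y.sum()

batch = Dim("batch")
ep = export(
    M(),
    (torch.randn(4, 8), torch.randn(4, 8)),
    dynamic_shapes={"x": {0: batch}},  # "y" omitted instead of passing None
)
```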

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151597
Approved by: https://github.com/angelayi
2025-04-18 16:53:01 +00:00
9eaaca2ece Turn off symm_mem when cuda version is <12.3 (#151203)
Summary: It looks like symmetric memory only supports CUDA 12.3+. We do have the definition for versions below 12.3, but we don't have an implementation, so it may be a good idea to even disable the definition.

Test Plan: CI

Reviewed By: jianyuh, houseroad, ngimel, jiawenliu64

Differential Revision: D72936993

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151203
Approved by: https://github.com/ngimel, https://github.com/houseroad
2025-04-18 16:37:12 +00:00
783be8f932 [Easy] Add more check for elapsedTime of torch.xxx.Event and torch.Event (#151404)
As the title stated

**Changes:**
- Add **record**, **query** and **enable_timing** check
- Add related tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151404
Approved by: https://github.com/albanD
2025-04-18 15:26:13 +00:00
29317f8585 [standalone_compile] Some misc fixes (#151502)
This PR fixes two things.

The first problem is that, in the vLLM style, standalone_compile is
called from within a custom torch.compile backend. If there already is a
FakeTensorMode (which there is), we shouldn't create a new
FakeTensorMode with the same shape_env, instead we should just reuse the
same FakeTensorMode.

The second thing is that compile_fx can mutate the passed in gm, so we
deepcopy (since standalone_compile should be standalone)

Test Plan:
- new test
- updated old tests

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151502
Approved by: https://github.com/oulgen
ghstack dependencies: #151501, #151551
2025-04-18 12:34:13 +00:00
58310a0043 [standalone_compile] support multiple returns (#151551)
We were only returning the first one. There's an edge case on what to do
if the original function returns a single Tensor: capture(f) returns a
function that returns a tuple of one Tensor in this case, and we were
originally converting this back to a single Tensor. I think it's fine
to return a tuple of one Tensor (that is what the graph passed to
standalone_compile asked for!), but we can revisit.

Test Plan:
- modified one test to used multiple outputs

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151551
Approved by: https://github.com/Skylion007, https://github.com/oulgen
ghstack dependencies: #151501
2025-04-18 12:34:13 +00:00
ac715e96b4 [standalone_compile] Don't check if path is directory if it doesn't exist (#151501)
os.path.isdir(path) will return False if the path doesn't exist.
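Quick demonstration of the standard-library behavior relied on here:
```python
import os

# isdir() simply returns False for a nonexistent path; it does not raise.
assert not os.path.isdir("/path/that/does/not/exist")
```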

Test Plan:
- new test

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151501
Approved by: https://github.com/Skylion007, https://github.com/oulgen
2025-04-18 12:34:13 +00:00
14293c2377 [MPS] Allow isin for mixed types (#151600)
To follow the pattern set by the CPU and CUDA impls: define common_dtype and optionally cast `elements` and `test_elements` to the common dtype if needed

- Add regression test, though skip it on MacOS-13, as `isin` seems to produce garbage there even for same dtypes
```
>>> import torch
>>> x=torch.arange(4.0, device='mps')
>>> y=torch.arange(1.0, 3.0, device='mps')
>>> x, y, torch.isin(x, y), torch.isin(y, x)
(tensor([0., 1., 2., 3.], device='mps:0'), tensor([1., 2.], device='mps:0'), tensor([False,  True, False, False], device='mps:0'), tensor([False, False], device='mps:0'))
>>> torch.__version__
'2.6.0'
```
- Cleanup code a bit

Fixes https://github.com/pytorch/pytorch/issues/151443
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151600
Approved by: https://github.com/Skylion007, https://github.com/dcci, https://github.com/kulinseth
2025-04-18 12:30:32 +00:00
675f69f40f collect_env: gracefully handle no pip (#151607)
If pip is not installed:

### Before

```console
> python3 torch/utils/collect_env.py
Collecting environment information...
Traceback (most recent call last):
  File "/Users/Adam/pytorch/torch/utils/collect_env.py", line 694, in <module>
    main()
    ~~~~^^
  File "/Users/Adam/pytorch/torch/utils/collect_env.py", line 677, in main
    output = get_pretty_env_info()
  File "/Users/Adam/pytorch/torch/utils/collect_env.py", line 672, in get_pretty_env_info
    return pretty_str(get_env_info())
                      ~~~~~~~~~~~~^^
  File "/Users/Adam/pytorch/torch/utils/collect_env.py", line 497, in get_env_info
    pip_version, pip_list_output = get_pip_packages(run_lambda)
                                   ~~~~~~~~~~~~~~~~^^^^^^^^^^^^
  File "/Users/Adam/pytorch/torch/utils/collect_env.py", line 450, in get_pip_packages
    for line in out.splitlines()
                ^^^^^^^^^^^^^^
AttributeError: 'NoneType' object has no attribute 'splitlines'
```

### After

```console
> python3 torch/utils/collect_env.py
Collecting environment information...
PyTorch version: N/A
Is debug build: N/A
CUDA used to build PyTorch: N/A
ROCM used to build PyTorch: N/A

OS: macOS 15.4 (arm64)
GCC version: Could not collect
Clang version: 20.1.0
CMake version: version 3.31.6
Libc version: N/A

Python version: 3.13.2 (main, Apr  8 2025, 15:27:33) [Clang 17.0.0 (clang-1700.0.13.3)] (64-bit runtime)
Python platform: macOS-15.4-arm64-arm-64bit-Mach-O
Is CUDA available: N/A
CUDA runtime version: Could not collect
CUDA_MODULE_LOADING set to: N/A
GPU models and configuration: Could not collect
Nvidia driver version: Could not collect
cuDNN version: Could not collect
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: N/A

CPU:
Apple M2 Pro

Versions of relevant libraries:
[pip3] Could not collect
[conda] Could not collect
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151607
Approved by: https://github.com/malfet
2025-04-18 12:28:58 +00:00
776aa68221 Update torch-xpu-ops commit pin (#150827)
Update the torch-xpu-ops commit to [b51dd3ef4f4d0f6b44c59e61431c5d29354dcaf6](b51dd3ef4f), including:
- Update commit pin to xpu-ops main branch
- Fixes batch_norm numeric error by adding additional boundary check
- Enable two operators: fft & jagged_to_padded_dense
- XCCL relevant changes:
1. Cache `cclStream` to improve performance.
2. Add support for complex datatypes in `allgather` and `broadcast`.
3. Support `coalescing` operations and `batch_isend_irecv`.
4. Introduce additional logging; use `export TORCH_CPP_LOG_LEVEL=INFO`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150827
Approved by: https://github.com/EikanWang

Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>
2025-04-18 10:12:59 +00:00
0376bbf5b3 [XPU] skip a subprocess UT for Windows (#150999)
This case creates a subprocess inside a subprocess. On Windows it can't load the function in this scenario, hence I have to skip it:
```
File "C:\ProgramData\miniforge3\envs\lfq\lib\multiprocessing\spawn.py", line 116, in spawn_main
    exitcode = _main(fd, parent_sentinel)
  File "C:\ProgramData\miniforge3\envs\lfq\lib\multiprocessing\spawn.py", line 126, in _main
    self = reduction.pickle.load(from_parent)
AttributeError: Can't get attribute 'run_model' on <module '__main__' (built-in)>
Traceback (most recent call last):
  File "<string>", line 25, in <module>
  File "<string>", line 16, in test_multi_process
AssertionError
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150999
Approved by: https://github.com/guangyey, https://github.com/EikanWang

Co-authored-by: Yu, Guangye <106960996+guangyey@users.noreply.github.com>
2025-04-18 08:55:47 +00:00
541f8cd34c faster gather implementation (#151490)
So far it's only for `gather`, but we'll move index_select and index to this implementation too. Torchtitan and fbgemm have noticed that gather/index_select perf is bad; this PR brings the core implementation on par with those customized implementations. Added benefits: all dtypes are supported, and it is a bit less strict on tensor dimensions/contiguity because we pick the fast path after TensorIterator has collapsed the dimensions.

The biggest part of this PR is not even the kernel (it's dumb; just vectorized loads are enough), but moving the utilities for vectorized loads and stores from SymmetricMemory so they are generally accessible in MemoryAccess.cuh.
Additional tests are coming to make sure this implementation doesn't break anything.

`gather` is equivalent to x[indices] for 1d indices via
```
def fn_gather(x, indices):
    return torch.gather(x, dim=0, index=indices.unsqueeze(1).expand(-1, x.shape[1]))

def fn_index(x, indices):
    return x[indices]
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151490
Approved by: https://github.com/Skylion007, https://github.com/eqy
2025-04-18 07:48:31 +00:00
eb1f85a2a0 Support C++ statically_known_true (#151346)
Differential Revision: [D73040543](https://our.internmc.facebook.com/intern/diff/D73040543/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151346
Approved by: https://github.com/laithsakka
2025-04-18 06:42:12 +00:00
8895c290f4 [Easy] enable PYFMT for torch/quantization/eager (#150761)
All modifications are done through tools, the detailed commands are as follows:

```bash
lintrunner -a --take "PYFMT" --all-files
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/150761
Approved by: https://github.com/jerryzh168
2025-04-18 05:53:33 +00:00
91b090c912 [executorch hash update] update the pinned executorch hash (#151632)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned executorch hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151632
Approved by: https://github.com/pytorchbot
2025-04-18 05:07:28 +00:00
6649ed9deb [ez] fix code owners typo (#151499)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151499
Approved by: https://github.com/laithsakka
2025-04-18 04:24:16 +00:00
bedefa46a9 Document non-pytorch CUDA memory allocation and how to query it (#150880)
This PR documents the fact that PyTorch does not have visibility into how every CUDA memory allocation happened - it only knows about allocations that went through the PyTorch CUDA allocator.

It also adds a code snippet showing how to use pynvml to query current GPU memory usage.
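A minimal sketch, assuming pynvml is installed, of the kind of query the snippet describes (device index 0 is an assumption):
```python
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)   # GPU 0
info = pynvml.nvmlDeviceGetMemoryInfo(handle)   # counts all allocations, not just PyTorch's
print(f"used={info.used / 1e9:.2f} GB, total={info.total / 1e9:.2f} GB")
pynvml.nvmlShutdown()
```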

## Preview
Added a note at the top of "Understanding CUDA Memory Usage" doc:
<img width="732" alt="image" src="https://github.com/user-attachments/assets/69e28d2a-841a-4b1b-b886-e96fb5d76582" />

which links to a section below:
<img width="733" alt="image" src="https://github.com/user-attachments/assets/cab4f252-9ac2-4fc6-a45d-fdb958fc7dbc" />

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150880
Approved by: https://github.com/kwen2501, https://github.com/ngimel
2025-04-18 03:48:54 +00:00
7d282da449 Add automatic categorization for release notes: inductor (aoti) (#151569)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151569
Approved by: https://github.com/desertfire
ghstack dependencies: #151453
2025-04-18 03:39:06 +00:00
2426258789 [doc fix] fix torch export docs for preserve_module_call_signature (#151140)
The preserve_module_call_signature explanation is missing in the __init__.py. Copying that from _trace.py

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151140
Approved by: https://github.com/angelayi
2025-04-18 02:55:35 +00:00
33cfe30ee1 Add HostAllocator as the unified parent class (#151431)
# Motivation
This PR introduces a unified parent class `HostAllocator` with the following goals:
1. Enable backend-specific host allocator registration, including support for out-of-tree backends.
2. Provide a unified and extensible API surface for host memory management across all backends, especially accelerators.

The new interface includes:
- `at::getHostAllocator()->allocate`
- `at::getHostAllocator()->empty_cache`
- `at::getHostAllocator()->record_event`
- `at::getHostAllocator()->get_stats`
- `at::getHostAllocator()->reset_accumulated_stats`
- `at::getHostAllocator()->reset_peak_stats`

# Additional Context
We plan to deprecate legacy APIs such as `at::cuda::CachingHostAllocator_emptyCache` and recommend users migrate to the new backend-specific API, for example:
```cpp
at::getHostAllocator(at::kCUDA)->empty_cache();
```
This refactor will help standardize host memory management across devices and simplify backend integration in the future.
Another key improvement I plan to make is moving the `is_pinned` functionality into the `HostAllocator` class, which will enable centralized pinned memory verification through calls like `at::getHostAllocator(at::kCUDA)->is_pinned(ptr)`.
Benefits include:
 - Consistent host memory handling across all device backends
 - Decouple pinned memory functionality from `AcceleratorHooksInterface` in a more modular way
 - Clearer separation between device memory allocation and pinned host memory management

This architecture makes the system more maintainable and extensible for future device support.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151431
Approved by: https://github.com/albanD
ghstack dependencies: #151403
2025-04-18 02:44:17 +00:00
1cc5a8452b [Openreg][PrivateUse1] Fix releasing tensor issue when using pin_memory (#151091)
As the title stated.

Related PR: https://github.com/pytorch/pytorch/pull/147066

Co-authored-by: Zhenbin Lin <lin-zhenbin@qq.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151091
Approved by: https://github.com/albanD
ghstack dependencies: #151007
2025-04-18 02:40:07 +00:00
3528488061 [Openreg][PrivateUse1] Enable CI for openreg (#151007)
Changes:
- move test_openreg.py from test/cpp_extensions/open_registration_extension/ to test/
- update README.md for openreg
- enable CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151007
Approved by: https://github.com/albanD
2025-04-18 02:40:07 +00:00
09e8ff92cc refresh benchmark results (#151622)
Updating due to <1.5% increases in https://github.com/pytorch/pytorch/pull/151469; not all benchmarks were updated.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151622
Approved by: https://github.com/oulgen
2025-04-18 02:39:13 +00:00
98c892749b c10d/Store: add nonblocking mode to queue_pop (#151485)
This adds a non-blocking mode to queue_pop. This allows workers to poll whether work is ready without blocking the main loop. This is useful when you want a GPU to stay at maximum utilization while items are only sent on the queue periodically.

We also expose a `torch.distributed.QueueEmptyError` so users can catch the error and handle it accordingly.
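A hedged sketch of the polling pattern this enables; the exact queue_pop signature (the block keyword below) is an assumption, and only the nonblocking mode and QueueEmptyError come from this description:
```python
import torch.distributed as dist

def poll_for_work(store: dist.Store, key: str):
    try:
        return store.queue_pop(key, block=False)  # assumed keyword for the nonblocking mode
    except dist.QueueEmptyError:
        return None  # nothing queued yet; keep the GPU busy and poll again later
```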

Test plan:

```
pytest test/distributed/test_store.py -k queue -v -s -x
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151485
Approved by: https://github.com/fduwjj, https://github.com/tianfengfrank
2025-04-18 02:14:50 +00:00
3ed5f1fb77 [CUDA][cuBLAS] Aten GEMM overload for FP32 output from FP16/BF16 inputs (#150812)
Enable FP32 output from FP16/BF16 GEMMs in aten with cuBLAS. Accumulation for these GEMMs is generally already done in FP32. Adds the functionality to the following aten operators:
* mm
* bmm
* addmm
* baddmm

Follow up of customer issue: https://github.com/pytorch/pytorch/issues/146241#issuecomment-2781889390

Differential Revision: [D73126191](https://our.internmc.facebook.com/intern/diff/D73126191)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/150812
Approved by: https://github.com/ngimel, https://github.com/eqy
2025-04-18 01:53:26 +00:00
a6182903cd Update PyTorchStreamReader API to take cpu allocator override (#150439)
Summary: Add allocator param in getRecord

Test Plan:
newly added UT
```
buck test caffe2/caffe2/serialize:inline_container_test
```

Differential Revision: D72252585

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150439
Approved by: https://github.com/albanD
2025-04-18 01:53:14 +00:00
b434322075 Fix has_free_symbols (#151492)
Used to fail for `self.assertFalse(has_free_symbols(sympy.S.true))`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151492
Approved by: https://github.com/bobrenjc93
ghstack dependencies: #151170, #151171
2025-04-18 01:19:01 +00:00
c2a202169d Fix implicit state dict modification (#151436)
Summary: Previously we were modifying ep.state_dict while running decomp, which we shouldn't.

Test Plan: CI

Fixes: https://github.com/pytorch/pytorch/issues/151366

Differential Revision: D73102315

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151436
Approved by: https://github.com/angelayi
2025-04-18 00:58:55 +00:00
34266836d5 [Inductor] Suppress cuda init error for CPU only Inductor (#151528)
**Summary**
After https://github.com/pytorch/pytorch/pull/151255, invoking `torch.compile` on a non-CUDA device prints the following error:
`E0416 23:39:55.953000 418833 torch/_inductor/codegen/cuda/cuda_env.py:22] Error getting cuda arch: Torch not compiled with CUDA enabled.`
This PR updates the code to initialize `PRESETS` only when CUDA is available, preventing this error message from being printed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151528
Approved by: https://github.com/jansel, https://github.com/henrylhtsang
2025-04-18 00:55:01 +00:00
9e235c549c [C10D] avoid computing global_rank when group_rank is used (#151373)
collective APIs accept either group or global rank for src/dst rank.

We provide a helper `_canonicalize_group_rank` which converts from maybe
group or maybe global to one particular format (defined by the kwarg
return_global: bool=False).

In this PR we stop performing the mapping lookup that converts group to
global or global to group in the case that the caller wants us to return
the same value that was passed in.  The PR should be functionally
equivalent, except in cases where the mapping itself would raise an
exception but the mapping was not necessary in the first place.

This has come up in cases where people create new process groups outside
of 'init_process_group' APIs and group-specific ranks may not have a
valid mapping to the 'global' rank.
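A hypothetical sketch (not the actual torch.distributed helper) of the short-circuit described above:
```python
import torch.distributed as dist

def canonicalize_rank(group, rank, rank_is_global, return_global=False):
    # If the requested format matches the given one, skip the mapping lookup
    # entirely, so groups without a valid global-rank mapping no longer raise.
    if rank_is_global == return_global:
        return rank
    if return_global:
        return dist.get_global_rank(group, rank)
    return dist.get_group_rank(group, rank)
```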

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151373
Approved by: https://github.com/xunnanxu, https://github.com/d4l3k
2025-04-17 23:53:50 +00:00
d8bafd23ab [DDP] add one option to allow skipping all reduce unused parameters (#151503)
Summary: Add one option to allow skipping all-reduce for unused parameters; this could help improve training throughput significantly when the number of unused parameters in the model is large.

Test Plan: unit tests, CI

Differential Revision: D72282069

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151503
Approved by: https://github.com/mrshenli
2025-04-17 23:30:19 +00:00
6d46b530fc Remove libdevice ops in inductor (#151562)
Now that we track dtypes during codegen, we can delete all these extra ops that worked around the problem by doing dispatch at lowering time.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151562
Approved by: https://github.com/isuruf, https://github.com/jansel
2025-04-17 22:18:00 +00:00
bdb34f55a0 [fake tensor cache] Support index with non bool/int8 indices (#151477)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151477
Approved by: https://github.com/zou3519, https://github.com/bdhirsh
ghstack dependencies: #151330, #151256, #151357
2025-04-17 21:51:08 +00:00
0129c3a4e1 Use reusable binary docker build action for almalinux, clean up script (#151483)
This is part of splitting up https://github.com/pytorch/pytorch/pull/150558 into smaller chunks, please see that for more context

Use the binary docker build action from https://github.com/pytorch/pytorch/pull/151471

Change the workflow trigger to be all of .ci/docker so it will make a new image + tag whenever it changes.

build script:
* change to be independent of the CUDA_VERSION env var, since all the info should be in the imagename:tag
* remove docker push parts since that will happen during the workflow
* clean up a bit
* make the build script more like the CI build script (use a temp image name)

I don't think this image is actually used anywhere

Also push the docker image to imagename:tag. I removed this in the PR that created the reusable workflow because I thought it was not in the original scripts, but it actually is there.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151483
Approved by: https://github.com/ZainRizvi
2025-04-17 21:32:56 +00:00
652fa451a4 [dynamo] support fb internal bytecode EAGER_IMPORT_NAME (#151362)
Differential Revision: [D73127097](https://our.internmc.facebook.com/intern/diff/D73127097)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151362
Approved by: https://github.com/oulgen
2025-04-17 21:19:45 +00:00
d5dda82586 [export] Integrate meta kernel generation with draft-export (#150809)
If a custom operator does not contain a fake impl, currently draft-export will use the real-tensor propagation to get an output for the operator and continue tracing. However if we retrace the exported model using `ep.run_decompositions`, or `export`, or run the exported program with fake tensors, we'll still fail because there's no fake impl.

With this PR, after draft-export we will generate an operator profile for each operator call that we encounter, and store this on the report attached to the exported program `ep._report.op_profiles`. Users can then use `torch._library.fake_profile.register_fake_profile` to temporarily generate and register a fake impl based on these operator profiles. This way future fake tensor retracing will work.

The workflow would look something like:
```python
class M(torch.nn.Module):
    def forward(self, a, b):
        res = torch.ops.mylib.foo8(a, b)  # no fake impl
        return res

ep = export(M(), (torch.ones(3, 4), torch.ones(3, 4)))  # this fails bc no fake impl
ep = draft_export(M(), (torch.ones(3, 4), torch.ones(3, 4)))

ep.run_decompositions()  # this fails bc no fake impl
# this registers fake impls based on the profiles
with torch._library.fake_profile.register_fake_profile(ep._report.op_profiles):
    decomp = ep.run_decompositions()  # this works

new_inp = (
    torch.ones(2, 3, 4),
    torch.ones(2, 3, 4),
)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150809
Approved by: https://github.com/zou3519
2025-04-17 20:52:31 +00:00
4f62dccbda [Cutlass] Implement Epilogue Argument emitter (#150903)
This implements epilogue visitor tree argument generation (example type [here](3fe62887d8/include/cutlass/epilogue/fusion/sm90_callbacks_tma_warpspecialized.hpp (L332))).

Details:
The codegen task here is to implement a function which can generate a tree of C++ structs and properly extract the correct properties from Inductor buffers and write them to the correct locations in the generated struct. To implement this with the minimum amount of code, I generate the cutlass DAGIR (the EVT internal representation), which specifically has a pass, [pass_argument_type.py](5e497243f7/python/cutlass/backend/evt/passes/pass_argument_type.py (L4)), which generates a nested tree of custom argument types for each node in the DAGIR. This nested tree of constructors is then passed kwargs to fill in the proper values, where the node's name is used to differentiate between different values in the kwarg dictionary. This, however, is non-customizable; the nested tree of EVT args is a nested tree of ctypes which looks for *actual values* so that this object can be passed directly to the cutlass-python C++ runner. Inductor, on the other hand, needs to fill this struct with string C++ expressions representing the values (or extracting the values from kernel launcher args). So `_render_argument_type` implements this: it iterates over the tree of types created by pass_argument_type.py and generates a string representing the nested structs, filling in C++ expressions representing the different fields.

Long term plan:
Long term, I will ask NVIDIA to provide an overridable [visitor_factory](5e497243f7/python/cutlass/backend/evt/passes/pass_argument_type.py (L82)) which could allow us to override the behavior of pass_argument_type.py to generate the string we would like during DAGIR generation.

Previously merged:
* #150346
* #150345
* #150344

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150903
Approved by: https://github.com/henrylhtsang, https://github.com/eellison
2025-04-17 20:30:21 +00:00
8e0f9fbccf [c10] helpers for runtime c10::alias re-use (#151361)
Summary: we need these to check whether the input tensor was re-sized/strided between executions when choosing to alias

Test Plan: CI

Reviewed By: henryoier

Differential Revision: D73061676

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151361
Approved by: https://github.com/SherlockNoMad
2025-04-17 20:27:17 +00:00
da580123a0 [BE][Easy]: Dedupe a TypeAlias in PrimsCommon (#151565)
Replaces a duplicate TypeAlias with a reference to the global constant for them
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151565
Approved by: https://github.com/albanD
2025-04-17 19:59:41 +00:00
c4688af254 Fix lint
Introduced by fb6ac2f16132f7953711ce6924bc2ee4a033228c
2025-04-17 12:48:52 -07:00
473a38b562 [DCP] Add logging for _stateful_to_state_dict(), stage_state_dict(), and synchronize_staging() (#151320)
Summary: As titled.

Test Plan: CI

Differential Revision: D73040700

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151320
Approved by: https://github.com/saumishr
2025-04-17 12:48:39 -07:00
c5b10ff119 [BE][Easy]: Normalize Dim typing in torch distributed (#151566)
Improve typing using prims_common dtypes

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151566
Approved by: https://github.com/albanD
2025-04-17 19:30:09 +00:00
2ed2cb5805 add generalized pareto distribution (GPD) (#135968)
Add the GPD as a distribution class

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135968
Approved by: https://github.com/albanD

Co-authored-by: Alexander März <statmixedmlgit@gmail.com>
2025-04-17 18:51:02 +00:00
7e2081fa93 Optimize interpolate saturate description (#151304)
Fixes #108225

## Test Result

### Before

![image](https://github.com/user-attachments/assets/bdbf8a5c-d5a4-44a5-b81e-2cbb5b8bfd02)

### After

![image](https://github.com/user-attachments/assets/1c21a27d-1700-4661-9988-dbb1cdc81fa2)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151304
Approved by: https://github.com/albanD

Co-authored-by: albanD <desmaison.alban@gmail.com>
2025-04-17 18:34:29 +00:00
055e59e709 [bazel] Build flatbuffers within bazel (#151364)
This is similar to how we handle protobufs and it makes it more convenient for bazel users to handle their version of flatbuffers. (Flatbuffers is very picky about the generated code matching the runtime). Instead of using the checked in generated code, we generate it on the fly.

This is relevant to https://github.com/pytorch/pytorch/issues/112903, because having the version of flatbuffers tied to pytorch will make pytorch difficult to use as an external workspace.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151364
Approved by: https://github.com/malfet
2025-04-17 18:33:51 +00:00
3a6b3c8e0e Combine windows x64 and arm64 yaml template files (#149850)
While introducing Windows-Arm64 nightly workflows, we created a separate template file for win-arm64. This PR combines x64&arm64 and deletes the win-arm64 one.
Fixes #148776

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149850
Approved by: https://github.com/ozanMSFT, https://github.com/malfet
2025-04-17 17:58:55 +00:00
1ce7969e81 Revert "[Easy] Add more check for elapsedTime of torch.xxx.Event and torch.Event (#151404)"
This reverts commit 90c5b86cd8fcbbe6ee7c46ad17a05767f884794b.

Reverted https://github.com/pytorch/pytorch/pull/151404 on behalf of https://github.com/clee2000 due to broke a cpp extension test? test_cpp_extensions_stream_and_event.py::TestCppExtensionStreamAndEvent::test_stream_event [GH job link](https://github.com/pytorch/pytorch/actions/runs/14519277500/job/40736981315) [HUD commit link](90c5b86cd8), bad TD ([comment](https://github.com/pytorch/pytorch/pull/151404#issuecomment-2813649667))
2025-04-17 17:45:41 +00:00
ae6f6b8efb [Inductor] Remove singleton tiling splits when prefer_nd_tiling=True (#151508)
# Issue
Users who want block pointers are likely to use the config settings `{"triton.use_block_ptr": True, "triton.prefer_nd_tiling": True, "triton.max_tiles": 3}`. Among other things, these settings allow us to generate 3D block pointers for broadcasts. However, broadcasts which don't truly require 3D often end up introducing a superfluous tiling dimension of size 1.

For example, given this function with elementwise multiplication:
```
def foo(x, y, z):
    a = x * y
    b = 128.0
    c = a * b
    d = a * z
    e = x * z
    return a, c, d, e

inps = [
    torch.randn((8, 11, 128), device=self.device),
    torch.randn((128,), device=self.device),
    torch.randn((8, 11, 128), device=self.device),
]

torch.compile(foo)(*inps)
```

We get the following Triton kernels:
```
@triton.jit
def triton_poi_fused_mul_0(in_ptr0, in_ptr1, out_ptr0, znumel, ynumel, xnumel, ZBLOCK : tl.constexpr, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
    znumel = 88
    ynumel = 1
    xnumel = 128
    zoffset = tl.program_id(2) * ZBLOCK
    zindex = zoffset + tl.arange(0, ZBLOCK)[:, None, None]
    zmask = zindex < znumel
    yoffset = tl.program_id(1) * YBLOCK
    yindex = yoffset + tl.arange(0, YBLOCK)[None, :, None]
    ymask = tl.full([ZBLOCK, YBLOCK, XBLOCK], True, tl.int1)
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[None, None, :]
    xmask = xindex < xnumel
    x1 = xindex
    z0 = zindex
    tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[88, 128], strides=[128, 1], block_shape=[ZBLOCK, XBLOCK], order=[1, 0], offsets=[zoffset, xoffset]), boundary_check=[0, 1], eviction_policy='evict_last')[:, None, :]
    tmp1 = tl.load(tl.make_block_ptr(in_ptr1, shape=[128], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), boundary_check=[0], eviction_policy='evict_last')[None, None, :]
    tmp2 = tmp0 * tmp1
    tl.store(tl.make_block_ptr(out_ptr0, shape=[88, 128], strides=[128, 1], block_shape=[ZBLOCK, XBLOCK], order=[1, 0], offsets=[zoffset, xoffset]), tl.reshape(tl.broadcast_to(tmp2, [ZBLOCK, YBLOCK, XBLOCK]), [ZBLOCK, XBLOCK]).to(tl.float32), boundary_check=[0, 1])
''', device_str='cuda')

@triton.jit
def triton_poi_fused_mul_1(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, xnumel, XBLOCK : tl.constexpr):
    xnumel = 11264
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[11264], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), boundary_check=[0])
    tmp3 = tl.load(tl.make_block_ptr(in_ptr1, shape=[11264], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), boundary_check=[0])
    tmp5 = tl.load(tl.make_block_ptr(in_ptr2, shape=[11264], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), boundary_check=[0])
    tmp1 = 128.0
    tmp2 = tmp0 * tmp1
    tmp4 = tmp0 * tmp3
    tmp6 = tmp5 * tmp3
    tl.store(tl.make_block_ptr(out_ptr0, shape=[11264], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), tl.broadcast_to(tmp2, [XBLOCK]).to(tl.float32), boundary_check=[0])
    tl.store(tl.make_block_ptr(out_ptr1, shape=[11264], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), tl.broadcast_to(tmp4, [XBLOCK]).to(tl.float32), boundary_check=[0])
    tl.store(tl.make_block_ptr(out_ptr2, shape=[11264], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), tl.broadcast_to(tmp6, [XBLOCK]).to(tl.float32), boundary_check=[0])
''', device_str='cuda')
```

Note that one kernel has `ynumel=1`. The extra dimension results in more expensive address calculations, and also seems to prevent fusion.

# Fix

To fix this, this PR filters out any splits of size 1 from the `prefer_nd_tiling` algorithm. This results in the following fused kernel, with 2D tiling:

```
@triton.jit
def triton_poi_fused_mul_0(in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, out_ptr2, out_ptr3, ynumel, xnumel, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
    ynumel = 88
    xnumel = 128
    yoffset = tl.program_id(1) * YBLOCK
    yindex = yoffset + tl.arange(0, YBLOCK)[:, None]
    ymask = yindex < ynumel
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[None, :]
    xmask = xindex < xnumel
    x1 = xindex
    y0 = yindex
    tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[88, 128], strides=[128, 1], block_shape=[YBLOCK, XBLOCK], order=[1, 0], offsets=[yoffset, xoffset]), boundary_check=[0, 1], eviction_policy='evict_last')
    tmp1 = tl.load(tl.make_block_ptr(in_ptr1, shape=[128], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), boundary_check=[0], eviction_policy='evict_last')[None, :]
    tmp5 = tl.load(tl.make_block_ptr(in_ptr2, shape=[88, 128], strides=[128, 1], block_shape=[YBLOCK, XBLOCK], order=[1, 0], offsets=[yoffset, xoffset]), boundary_check=[0, 1], eviction_policy='evict_last')
    tmp2 = tmp0 * tmp1
    tmp3 = 128.0
    tmp4 = tmp2 * tmp3
    tmp6 = tmp2 * tmp5
    tmp7 = tmp0 * tmp5
    tl.store(tl.make_block_ptr(out_ptr0, shape=[88, 128], strides=[128, 1], block_shape=[YBLOCK, XBLOCK], order=[1, 0], offsets=[yoffset, xoffset]), tl.broadcast_to(tmp2, [YBLOCK, XBLOCK]).to(tl.float32), boundary_check=[0, 1])
    tl.store(tl.make_block_ptr(out_ptr1, shape=[88, 128], strides=[128, 1], block_shape=[YBLOCK, XBLOCK], order=[1, 0], offsets=[yoffset, xoffset]), tl.broadcast_to(tmp4, [YBLOCK, XBLOCK]).to(tl.float32), boundary_check=[0, 1])
    tl.store(tl.make_block_ptr(out_ptr2, shape=[88, 128], strides=[128, 1], block_shape=[YBLOCK, XBLOCK], order=[1, 0], offsets=[yoffset, xoffset]), tl.broadcast_to(tmp6, [YBLOCK, XBLOCK]).to(tl.float32), boundary_check=[0, 1])
    tl.store(tl.make_block_ptr(out_ptr3, shape=[88, 128], strides=[128, 1], block_shape=[YBLOCK, XBLOCK], order=[1, 0], offsets=[yoffset, xoffset]), tl.broadcast_to(tmp7, [YBLOCK, XBLOCK]).to(tl.float32), boundary_check=[0, 1])
''', device_str='cuda')
```

# Test plan
Added the test case above to CI. Checked that a single kernel is generated with 2D tiling.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151508
Approved by: https://github.com/jansel
2025-04-17 17:37:45 +00:00
b4550541ea [ROCm] upgrade nightly wheels to rocm6.4 (#151355)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151355
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-04-17 17:29:07 +00:00
ef64beb232 Include post grad gm and fx runnable in cache artifacts for tlparse (#151469)
Fixed #151462

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151469
Approved by: https://github.com/bdhirsh
2025-04-17 17:14:13 +00:00
ee3366dbb2 [MegaCache] Encode key in base64 (#151472)
I have noticed that there are some errors like
```
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x95 in position 169302: invalid start byte
```

I haven't been able to repro this locally yet; this change should fix the encoding issues.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151472
Approved by: https://github.com/masnesral
2025-04-17 17:12:22 +00:00
8404c09b15 [MegaCache] Rename the PGO artifact when used between different jobs (#151482)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151482
Approved by: https://github.com/bobrenjc93, https://github.com/jamesjwu
2025-04-17 17:09:29 +00:00
fe90a5c140 [Easy] Optimize clip_grad param description (#151532)
Fix missing optional description in `clip_grad_norm_` and `clip_grad_value_`
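Usage sketch for the two functions whose parameter docs are fixed here (standard torch.nn.utils APIs):
```python
import torch
from torch.nn.utils import clip_grad_norm_, clip_grad_value_

model = torch.nn.Linear(10, 10)
model(torch.randn(2, 10)).sum().backward()

total_norm = clip_grad_norm_(model.parameters(), max_norm=1.0)  # returns the pre-clip total norm
clip_grad_value_(model.parameters(), clip_value=0.5)            # clips each gradient element in place
```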

## Test Result

### Before

![image](https://github.com/user-attachments/assets/3393dd4b-a730-4dd4-8304-9b895ac669d4)

![image](https://github.com/user-attachments/assets/220c4738-a728-474b-b06d-b5be7660d150)

### After

![image](https://github.com/user-attachments/assets/5637bb68-3b6d-49a3-8ee1-3af636950aa0)

![image](https://github.com/user-attachments/assets/c0f1d966-a9ba-4fac-a874-9d4955f6e0d6)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151532
Approved by: https://github.com/Skylion007, https://github.com/albanD
2025-04-17 16:47:38 +00:00
c3a18f6126 [AOTInductor] Add states for constant folding process (#151273)
Summary:
We add states in the constant folding process for AOTInductor.
Basically, there's 3 states, which is
(1) None: The state when no constants are loaded and uninitialized.
(2) Initialized: The state when constants are loaded, but not yet
folded.
(3) Folded: The state where the model is fully ready with folded
constants.

Note that even if constant folding is not enabled, we still only run
when state is FOLDED, this is okay because without constant folding, the
transition from INITIALIZED to FOLDED is just a pass-throught.
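An illustrative Python sketch of the three states; the real state machine lives in AOTInductor's runtime and these names are assumptions:
```python
from enum import Enum, auto

class ConstantFoldState(Enum):
    NONE = auto()         # no constants loaded, uninitialized
    INITIALIZED = auto()  # constants loaded, but not yet folded
    FOLDED = auto()       # constants folded; the model is ready to run
```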

Test Plan:
python test/inductor/test_aot_inductor.py -k test_constant_folding_with_update

Differential Revision: [D73002538](https://our.internmc.facebook.com/intern/diff/D73002538)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151273
Approved by: https://github.com/jingsh, https://github.com/desertfire
2025-04-17 16:41:38 +00:00
4843ce7611 [BE] Remove outdated script to check namespace BC (#151453)
Now that we have bc_lint in CI, this script is no longer needed (nor has it ever been conclusive). I've already updated the Runbook to not need this script.

Suppressing bc_lint, as this script is not shipped as part of torch and is not user facing! For context, this script is (rarely) used by the release notes manager to ensure BC across releases. It had been broken since at least 2.6.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151453
Approved by: https://github.com/albanD, https://github.com/jbschlosser
2025-04-17 15:43:53 +00:00
90c5b86cd8 [Easy] Add more check for elapsedTime of torch.xxx.Event and torch.Event (#151404)
As the title stated

**Changes:**
- Add **record**, **query** and **enable_timing** check
- Add related tests
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151404
Approved by: https://github.com/albanD
2025-04-17 15:30:12 +00:00
7f528751cc [Inductor] fix torch._inductor.exc.InductorError: KeyError (#151424)
Fixes #151423, which is a regression after #150845

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151424
Approved by: https://github.com/eellison
2025-04-17 15:07:43 +00:00
bb11122e12 Update docker image names for s390x (#151426)
Disable switching tag for s390x docker images

Keep it that way unless they are published. Otherwise, there's no way to determine in advance which docker image names are needed for building s390x binaries.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151426
Approved by: https://github.com/malfet, https://github.com/seemethere
2025-04-17 12:47:23 +00:00
fa6e842527 [MPS] Make fused rms_norm traceable (#150661)
Which is a regression, introduced by https://github.com/pytorch/pytorch/issues/150629#issue-2970312779 which I should have reviewed more thoroughly.

- Defined `_fused_rms_norm`, added an MPS-only implementation for it, and dispatched to it from `rms_norm_symint`, which is registered as `CompositeImplicitAutograd`, i.e. it is not supposed to do any computations over Tensor, only dispatch to other ops
- Register `_fused_rms_norm` as a fallback in `torch/_inductor/lowering.py`
- Added unit test to avoid those regressions in the future

TODO:
- Get rid of this op, change `rms_norm_symint` definition to `CompositeExplicitAutograd` and implement backward function in `tools/autograd/derivatives.yaml`
- Benchmark compiler and re-enable decomp as follows when compiled code is faster
```python
@register_decomposition(aten._rms_norm_fused)
def rms_norm_fused(
    self: torch.Tensor, ndim: int, weight: torch.Tensor, eps: float
) -> torch.Tensor:
    dtr = [self.dim() - i - 1 for i in range(ndim)]
    return self * weight * (self.pow(2).mean(dtr, keepdim=True).add(eps).rsqrt())
```

Fixes https://github.com/pytorch/pytorch/issues/150629

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150661
Approved by: https://github.com/manuelcandales, https://github.com/jansel
2025-04-17 11:32:00 +00:00
41b82611ee Revert "[Reopen] [Intel GPU] Set higher tolerance for some models only on XPU Device (#144756)"
This reverts commit 300e0ee13c08ef77e88f32204a2e0925c17ce216.

Reverted https://github.com/pytorch/pytorch/pull/144756 on behalf of https://github.com/malfet due to Broke rocm torch bench runs with  TypeError: unsupported operand type(s) for |: 'set' and 'list' ([comment](https://github.com/pytorch/pytorch/pull/144756#issuecomment-2812525970))
2025-04-17 11:09:01 +00:00
e4fe67f623 Revert "[MPS] Make fused rms_norm traceable (#150661)"
This reverts commit 682f09ec51526aefe6b504ac8081944baa866556.

Reverted https://github.com/pytorch/pytorch/pull/150661 on behalf of https://github.com/malfet due to Has decomp started to fail again ([comment](https://github.com/pytorch/pytorch/pull/150661#issuecomment-2812520408))
2025-04-17 11:06:05 +00:00
32c79da789 [Easy] Fix the compilation warning of BlasKernel. (#151302)
As the title stated.

Change Before:
```C++
[2/21] Building CXX object caffe2/CMakeFiles/torch_cpu.dir/__/aten/src/ATen/native/BlasKernel.cpp.o
/root/Git.d/pytorch/pytorch/aten/src/ATen/native/BlasKernel.cpp:346:6: warning: ‘void at::native::blas_impl::gemv_fast_path(const char*, const int*, const int*, const scalar_t*, const scalar_t*, const int*, const scalar_t*, const int*, const scalar_t*, scalar_t*, const int*) [with scalar_t = c10::Half]’ defined but not used [-Wunused-function]
  346 | void gemv_fast_path<at::Half>(
      |      ^~~~~~~~~~~~~~~~~~~~~~~~
/root/Git.d/pytorch/pytorch/aten/src/ATen/native/BlasKernel.cpp:329:6: warning: ‘bool at::native::blas_impl::gemv_use_fast_path(char, int64_t, int64_t, scalar_t, int64_t, int64_t, scalar_t, int64_t) [with scalar_t = c10::Half]’ defined but not used [-Wunused-function]
  329 | bool gemv_use_fast_path<at::Half>(
      |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
/root/Git.d/pytorch/pytorch/aten/src/ATen/native/BlasKernel.cpp:301:6: warning: ‘void at::native::blas_impl::gemv_fast_path(const char*, const int*, const int*, const scalar_t*, const scalar_t*, const int*, const scalar_t*, const int*, const scalar_t*, scalar_t*, const int*) [with scalar_t = c10::BFloat16]’ defined but not used [-Wunused-function]
  301 | void gemv_fast_path<at::BFloat16>(
      |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
/root/Git.d/pytorch/pytorch/aten/src/ATen/native/BlasKernel.cpp:273:6: warning: ‘bool at::native::blas_impl::gemv_use_fast_path(char, int64_t, int64_t, scalar_t, int64_t, int64_t, scalar_t, int64_t) [with scalar_t = c10::BFloat16]’ defined but not used [-Wunused-function]
  273 | bool gemv_use_fast_path<at::BFloat16>(
      |      ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151302
Approved by: https://github.com/malfet, https://github.com/aditew01
ghstack dependencies: #151427
2025-04-17 10:50:22 +00:00
f29fe78cf2 [Dynamo] Implement sourceless named tuple support (#151266)
Fixes https://github.com/pytorch/pytorch/issues/140903

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151266
Approved by: https://github.com/williamwen42, https://github.com/StrongerXi, https://github.com/anijain2305
2025-04-17 08:43:03 +00:00
49c91b4be9 [Easy][Building] Fix the warning of int4mm.cu when building (#151427)
As the title stated.

**Changes Before:**

```C++
[999/1526] Building CUDA object caffe2/CMakeFiles/torch_cuda.dir/__/aten/src/ATen/native/cuda/int4mm.cu.o
/root/Git.d/pytorch/pytorch/aten/src/ATen/native/cuda/int4mm.cu(142): warning #177-D: variable "at::native::kWarpSize" was declared but never referenced
  constexpr int32_t kWarpSize = 32;
                    ^

Remark: The warnings can be suppressed with "-diag-suppress <warning-number>"
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151427
Approved by: https://github.com/Skylion007, https://github.com/malfet
2025-04-17 08:21:32 +00:00
a05cc9f494 Remove Clear Cache Time from do_bench_using_profiling (#150696)
Summary: In most instances, this action would take ~33% of the total run time, which means that our benchmark would previously differ from the end results by a lot.

Test Plan:
We can compare the benchmark results for
```
CUDA_VISIBLE_DEVICES=4,5 buck run mode/opt -c python.package_style=inplace -c fbcode.enable_gpu_sections=true -c fbcode.nvcc_arch=h100a //caffe2/torch/fb/model_transform/experimental/benchmark:mts_gpu_benchmark -- --model-snapshot-id=672308665_0 --lower-backend=AOT_INDUCTOR --node-replacement-dict="{'torch.nn.Linear':{'(autotune)': 'fp8_float_model_dynamic_quantization_rowwise'}}" --trace-aot-inductor-module=True --disable-acc-tracer=False --batch-size=1024
```
before and after the diff, and notice that on average, the benchmark results decrease by ~0.1ms per iteration, which is more closely aligned with the lowered modules.

Differential Revision: D72469845

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150696
Approved by: https://github.com/frank-wei
2025-04-17 07:25:41 +00:00
e0f05229e9 [ez] Make relaxed constraint error message more user friendly (#151407)
Fixes #151356

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151407
Approved by: https://github.com/Skylion007
2025-04-17 06:43:10 +00:00
10a54ffe5a [inductor] Reduce runtime of CPU OpInfo tests (#151435)
`has_triton()` returns True if Triton is present on the system and supports _any_ backend we care about. In this case, that means we _always_ check gradients, even though the intended behavior is to skip gradients when testing on CPU.

Fixes a bug from #146911.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151435
Approved by: https://github.com/masnesral
2025-04-17 05:25:14 +00:00
b7d9f44602 [executorch hash update] update the pinned executorch hash (#151493)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned executorch hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151493
Approved by: https://github.com/pytorchbot
2025-04-17 05:14:12 +00:00
682f09ec51 [MPS] Make fused rms_norm traceable (#150661)
Which is a regression, introduced by https://github.com/pytorch/pytorch/issues/150629#issue-2970312779 which I should have reviewed more thoroughly.

- Defined `_fused_rms_norm`, added an MPS-only implementation for it, and dispatched to it from `rms_norm_symint`, which is registered as `CompositeImplicitAutograd`, i.e. it is not supposed to do any computations over Tensors, only dispatch to other ops
- Register `_fused_rms_norm` as a fallback in `torch/_inductor/lowering.py`
- Added unit test to avoid those regressions in the future

TODO:
- Get rid of this op, change `rms_norm_symint` definition to `CompositeExplicitAutograd` and implement backward function in `tools/autograd/derivatives.yaml`
- Benchmark compiler and re-enable decomp as follows when compiled code is faster
```python
@register_decomposition(aten._rms_norm_fused)
def rms_norm_fused(
    self: torch.Tensor, ndim: int, weight: torch.Tensor, eps: float
) -> torch.Tensor:
    dtr = [self.dim() - i - 1 for i in range(ndim)]
    return self * weight * (self.pow(2).mean(dtr, keepdim=True).add(eps).rsqrt())
```

Fixes https://github.com/pytorch/pytorch/issues/150629

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150661
Approved by: https://github.com/manuelcandales, https://github.com/jansel
2025-04-17 04:15:24 +00:00
17ea9d1478 Revert "[DCP] Add logging for _stateful_to_state_dict(), stage_state_dict(), and synchronize_staging() (#151320)"
This reverts commit fb6ac2f16132f7953711ce6924bc2ee4a033228c.

Reverted https://github.com/pytorch/pytorch/pull/151320 on behalf of https://github.com/malfet due to Broke lint ([comment](https://github.com/pytorch/pytorch/pull/151320#issuecomment-2811669325))
2025-04-17 03:57:03 +00:00
a94483329c [MPS] Start benchmarking compile results (#151155)
To know passrate and speedup
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151155
Approved by: https://github.com/dcci
2025-04-17 02:45:39 +00:00
f5851efed9 Fix torch.autograd.backward inputs validation (#150975)
- Fixes #150883
- Fixes #70504

This is my first PR to pytorch, so please tell me if I'm forgetting anything.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150975
Approved by: https://github.com/soulitzer
2025-04-17 02:11:13 +00:00
6f9ffaa991 [c10d][fr] Fix script for uneven reduce scatter and update test cases (#151475)
Somehow the type string for reduce scatter is "REDUCE_SCATTER" not "REDUCESCATTER". This PR fixed it and added more test cases.

Differential Revision: [D73141245](https://our.internmc.facebook.com/intern/diff/D73141245)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151475
Approved by: https://github.com/fegin
2025-04-17 02:11:08 +00:00
cd1db55817 Fix tensor_constant name collision in aot_export_module (#151123)
Summary:
When we have an exported program that looks like this:

```
ExportedProgram:
    class GraphModule(torch.nn.Module):
        def forward(self, b__tensor_constant0: "f32[1]", ... c_lifted_tensor_0: "i64[925]", …. , tupleized_input_0_0: "f32[10, 2139]",

            clone: "i64[925]" = torch.ops.aten.clone.default(c_lifted_tensor_0);  c_lifted_tensor_0 = None

            index_select: "f32[10, 925]" = torch.ops.aten.index_select.default(tupleized_input_0_0, 1, clone);  clone = None
```

The graph after `aot_export_module` could have a name collision; notice that the `_tensor_constant0` arg of `clone` is different from the `_tensor_constant0` in the input module.

```
def forward(self):
        arg9_1: "f32[10, 2139]"

        _tensor_constant0: "f32[1]" = self._tensor_constant0 # this should be int64, conflicted with the original _tensor_constant0, had a clone on this constant before lifting

        index: "f32[10, 925]" = torch.ops.aten.index.Tensor(arg9_1, [None, _tensor_constant0]);  _tensor_constant0 = None
```

This caused the `tensors used as indices must binary, int...` aoti error on PT2I dashboard because later we used `clone` as index.

We had this error because we created a new `_tensor_constant0` [here](https://github.com/pytorch/pytorch/blob/main/torch/fx/_symbolic_trace.py#L403-L412), and the new `_tensor_constant0` overrides the original `_tensor_constant0` on the input Module in `_unlift_graph`. The `arg` for `clone` is created in `create_proxy` in `proxy.py`.

To fix this, we do a graph pass before we unlift the graph inputs to avoid name collision

Test Plan:
```
buck run fbcode//mode/dev-nosan //caffe2/test/inductor:torchbind -- -r aot_compile_constant_folding

buck2 run mode/dev-nosan caffe2/test/inductor:test_aot_inductor -- -r aoti_constant_tensor_name_collision
```

Differential Revision: D72761937

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151123
Approved by: https://github.com/tugsbayasgalan, https://github.com/jingsh
2025-04-17 01:52:21 +00:00
bf92c9883b Refine host caching allocator (#151403)
# Motivation
This stack of PRs aims to generalize and improve PyTorch host allocator code.

This PR introduces a `DeleterFnPtr` template parameter to `CachingHostAllocatorInterface` to resolve circular dependency issues. This change allows for better code reuse and simplifies the implementation of host allocators.

# Additional Context
TODO:
- [ ] Unify host allocator related API
- [ ] Deprecate those device-specific legacy API
- [ ] Move `is_pinned` to host allocator

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151403
Approved by: https://github.com/gujinghui, https://github.com/albanD
2025-04-17 01:50:47 +00:00
fb6ac2f161 [DCP] Add logging for _stateful_to_state_dict(), stage_state_dict(), and synchronize_staging() (#151320)
Summary: As titled.

Test Plan: CI

Differential Revision: D73040700

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151320
Approved by: https://github.com/saumishr
2025-04-17 01:08:32 +00:00
300e0ee13c [Reopen] [Intel GPU] Set higher tolerance for some models only on XPU Device (#144756)
Reopen the previous stale closed PR https://github.com/pytorch/pytorch/pull/134192

We need to increase the tolerance slightly to ensure that certain models pass accuracy check on the XPU device.
This pull request preserves the original tolerance threshold for the CUDA device and introduces a new key higher_fp16_bf16_xpu, which only impacts the XPU device.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/144756
Approved by: https://github.com/chuanqi129, https://github.com/EikanWang, https://github.com/desertfire
2025-04-17 00:26:55 +00:00
2fd26925c4 improve noop elimination for view (#151095)
This PR improves noop elimination.

### View Noop

```python
>>> torch.Size([1,2,3]) == [1,2,3]
False
>>> torch.Size([1,2,3]) == (1,2,3)
True
```
So we add `tuple(size)` in `view_noop`.

Example:
```python
import torch

@torch.compile()
def f(x):
    batch_size = x.shape[0]
    x = x.transpose(1, 2) # (batch_size, 2, 3)
    x = x.reshape(batch_size, 2, 3) # noop
    return x

x = torch.randn((2,3,2))
f(x)

x = torch.randn((4,3,2))
f(x)
```

Before:
![image](https://github.com/user-attachments/assets/be488881-6c99-43a9-b088-fa481f675775)

After:
![image](https://github.com/user-attachments/assets/6d93be3d-128b-44d4-ad6a-d3d18e272329)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151095
Approved by: https://github.com/eellison
2025-04-16 23:55:32 +00:00
9a2624c712 Fix keepdim param optional description (#151197)
Fixes #151104

Fix optional description of `dim`  and `keepdim`, except `torch.quantile` which already fixed in #146485

## Test Result

### Before

![image](https://github.com/user-attachments/assets/69f1824d-3d15-407e-8c92-f25a22e16914)

### After

![image](https://github.com/user-attachments/assets/e5aac674-ab8f-4988-a5f1-7400c36bdc99)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151197
Approved by: https://github.com/mikaylagawarecki
2025-04-16 23:15:30 +00:00
9e6ad274dc Action for building docker binary builds (#151471)
This is part of splitting up https://github.com/pytorch/pytorch/pull/150558 into smaller chunks, please see that for more context

Uses calculate docker image with the new custom tag prefix, so the naming convention of the docker images is slightly different for images built on PR

based off of a582f04608/.github/workflows/build-manywheel-images.yml (L101)

Also moves the push of the docker images from inside the build scripts to inside the workflow

Currently not used anywhere, but the binary docker builds are very similar so I'm going to change them to use this instead

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151471
Approved by: https://github.com/malfet, https://github.com/seemethere, https://github.com/ZainRizvi
2025-04-16 23:01:35 +00:00
cd7bc60e11 Migrate to new theme (#149331)
- Migrate pytorch docs, cpp docs and functorch docs to the pytorch_sphinx_theme2
- Migrate index.rst to markdown and restructure to use high-level horizontal bar sections Python API, Developer Notes
- Added python-api.md which becomes the main container for the API docs. This file will be used to add all api references in the toctree. It would be great to have lint for this file: https://github.com/pytorch/pytorch/issues/150718
- Enabled mermaid sphinx extension and opengraph sphinx extension

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149331
Approved by: https://github.com/malfet, https://github.com/atalman, https://github.com/albanD
2025-04-16 21:35:19 +00:00
1ffaa00ad7 [MPS] Migrate bitwise_not to unary operator (#151460)
That kills two birds with one stone:
 - Makes implementations more standardized (and faster for strided inputs/outputs)
 - Fixes a bug in strided in-place bitwise_not

I.e. before this change
```python
import torch
x=torch.arange(32, device="mps")
x[::2].bitwise_not_()
print(x)
```
produced
```
tensor([ -1,  -2,  -3,  -4,  -5,  -6,  -7,  -8,  -9, -10, -11, -12, -13, -14,
        -15, -16,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
         28,  29,  30,  31], device='mps:0')
```
after, it generates reasonable output
```
tensor([ -1,   1,  -3,   3,  -5,   5,  -7,   7,  -9,   9, -11,  11, -13,  13,
        -15,  15, -17,  17, -19,  19, -21,  21, -23,  23, -25,  25, -27,  27,
        -29,  29, -31,  31], device='mps:0')
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151460
Approved by: https://github.com/dcci, https://github.com/qqaatw, https://github.com/Skylion007
2025-04-16 21:34:45 +00:00
f252f9df5e Revert "[Openreg][PrivateUse1] Enable CI for openreg (#151007)"
This reverts commit abbca37fe882541e0259b43dd314a324180550ed.

Reverted https://github.com/pytorch/pytorch/pull/151007 on behalf of https://github.com/clee2000 due to At least test_record_event needs to also be skipped on dynamo too, its failing and then somehow causing a hang? https://github.com/pytorch/pytorch/actions/runs/14487625709/job/40637535027#step:25:73 ([comment](https://github.com/pytorch/pytorch/pull/151007#issuecomment-2810789483))
2025-04-16 21:05:17 +00:00
e0535e823f Revert "[Openreg][PrivateUse1] Fix releasing tensor issue when using pin_memory (#151091)"
This reverts commit e229ce34c4ab8cd4e2800227615be32fb362b1e6.

Reverted https://github.com/pytorch/pytorch/pull/151091 on behalf of https://github.com/clee2000 due to At least test_record_event needs to also be skipped on dynamo too, its failing and then somehow causing a hang? https://github.com/pytorch/pytorch/actions/runs/14487625709/job/40637535027#step:25:73 ([comment](https://github.com/pytorch/pytorch/pull/151007#issuecomment-2810789483))
2025-04-16 21:05:17 +00:00
5b5399bfcd [graph partition] reorder to reduce #partitions for simple dependencies (#150814)
This PR reduces #graph partitions by reordering nodes when the `should_partition` nodes have simple dependencies. Specifically, for `should_partition` nodes:
    a. If a node has no dependency or only depends on graph inputs: move to the front. Use case is when we move symints to cuda tensor for PaddedTensorSubclass
    b. If the only user of a node is OutputNode: move it to the end.

#### Example

The following example shows a padded tensor subclass use case where we copy symint to a cuda tensor (aka mask) in the middle of function. Reordering still generates 1 cudagraph by moving the mask to the front.

```python
import torch

torch._inductor.config.graph_partition = True

# Two reasons for this:
# 1. We want to reuse the same mask for many masked_fill calls
# 2. Prevent inductor from fusing this op into other ops (e.g. masked_fill)
#    so we can still reorder in scheduler
@torch.library.custom_op("mylib::create_mask", mutates_args=(), tags=(torch._C.Tag.cudagraph_unsafe,))
def create_mask(padded_size: int, original_size: int, device: torch.device) -> torch.Tensor:
    mask = torch.zeros((padded_size,), dtype=torch.bool, device=device)
    mask[original_size:] = True
    return mask

@create_mask.register_fake
def _(padded_size, original_size, device):
    return torch.empty((padded_size,), dtype=torch.bool, device=device)

def f(padded_tensor, original_tensor, weight):
    original_size = original_tensor.size()[0]
    padded_size = padded_tensor.size()[0]

    # element wise op so we don't care padding value
    padded_tensor = padded_tensor + 1
    padded_tensor = torch.nn.functional.relu(padded_tensor)

    # dot product requires padding with 0
    dot_res = padded_tensor.dot(weight)
    padded_tensor += dot_res

    # min requires padding with inf, so we create mask now
    mask = create_mask(padded_size, original_size, padded_tensor.device)
    min_res = torch.min(
        torch.ops.aten.masked_fill(padded_tensor, mask, float("inf"))
    )

    # max requires padding with inf. we can reuse previous mask
    max_res = torch.max(
        torch.ops.aten.masked_fill(padded_tensor, mask, -float("inf"))
    )

    return min_res+max_res+padded_tensor

compiled_f = torch.compile(f, mode="reduce-overhead")

def run(padded_size, original_size):
    padded_tensor = torch.randn(padded_size, device="cuda")
    padded_tensor[original_size:] = 0
    original_tensor = torch.randn(original_size, device="meta")

    weight = torch.randn(padded_size, device="cuda")
    eager_out = f(padded_tensor, original_tensor, weight)
    compiled_out = compiled_f(padded_tensor, original_tensor, weight)
    assert torch.allclose(eager_out[0], compiled_out[0])
    assert torch.allclose(eager_out[1], compiled_out[1])

# new cudagraph
run(8, 4)

# new cudagraph due to recompile
run(8, 6)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150814
Approved by: https://github.com/eellison
2025-04-16 20:49:20 +00:00
a582f04608 Revert "[ez] Make relaxed constraint error message more user friendly (#151407)"
This reverts commit bc934f57d7c14b07e7497eb72a90d893270bc662.

Reverted https://github.com/pytorch/pytorch/pull/151407 on behalf of https://github.com/izaitsevfb due to breaks export tests ([comment](https://github.com/pytorch/pytorch/pull/151407#issuecomment-2810716135))
2025-04-16 20:40:22 +00:00
607443b16b [compile][compile time traces] Add more dynamo traces (#151357)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151357
Approved by: https://github.com/williamwen42
ghstack dependencies: #151330, #151256
2025-04-16 20:37:08 +00:00
8e373592c8 [aot autograd][logging] Profile large missing gaps in compile time tracing (#151256)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151256
Approved by: https://github.com/bdhirsh, https://github.com/masnesral
ghstack dependencies: #151330
2025-04-16 20:37:08 +00:00
c58b3f6be3 [invoke_subgraph][inductor] Run pre and post grad passes on invoke_subgraph (#151330)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151330
Approved by: https://github.com/eellison, https://github.com/zou3519
2025-04-16 20:37:01 +00:00
4c4a5df73b Allow to run flex_attention on HPU (#148656)
HPU specific implementation details are to be located in out-of-tree HPU library.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148656
Approved by: https://github.com/drisspg
2025-04-16 19:49:15 +00:00
9400f53903 [Inductor] Broadcast to range tree shape before block pointer store (#151399)
# Feature

This fixes a bug related to block pointer stores. Since Triton's block pointer stores don't support implicit broadcasting, in certain cases we need to generate a `reshape->broadcast->reshape` pattern to ensure that the tensor being stored has the same shape as the block pointer. This happens when the block indexing expression involves strides of 0 or dimensions of 1, both of which we eliminate from the block pointer.

The existing logic missed an important edge case.  We may need a broadcast prior to the first `reshape` of this pattern, in case the tensor comes from a load with implicit broadcasting. For example, if the range trees have shape `[YBLOCK, XBLOCK]`, but the load has a shape `[1, XBLOCK]`, we need to broadcast this to `[YBLOCK, XBLOCK]` prior to storing. See the example kernel below, which comes from `expand` -> `clone` with 3D tiling. The load has an implicit broadcast, and the store has a reshape. Thus, we need to insert an explicit broadcast between them.

```
@triton.jit
def triton_poi_fused_clone_0(in_ptr0, out_ptr0, znumel, ynumel, xnumel, ZBLOCK : tl.constexpr, YBLOCK : tl.constexpr, XBLOCK : tl.constexpr):
    znumel = 32
    ynumel = 1
    xnumel = 32
    zoffset = tl.program_id(2) * ZBLOCK
    zindex = zoffset + tl.arange(0, ZBLOCK)[:, None, None]
    zmask = zindex < znumel
    yoffset = tl.program_id(1) * YBLOCK
    yindex = yoffset + tl.arange(0, YBLOCK)[None, :, None]
    ymask = tl.full([ZBLOCK, YBLOCK, XBLOCK], True, tl.int1)
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[None, None, :]
    xmask = xindex < xnumel
    x1 = xindex
    z0 = zindex
    tmp0 = tl.load(tl.make_block_ptr(in_ptr0, shape=[32], strides=[1], block_shape=[XBLOCK], order=[0], offsets=[xoffset]), boundary_check=[0], eviction_policy='evict_last')[None, None, :]
    tl.store(tl.make_block_ptr(out_ptr0, shape=[32, 32], strides=[32, 1], block_shape=[ZBLOCK, XBLOCK], order=[1, 0], offsets=[zoffset, xoffset]), tl.reshape(tl.broadcast_to(tmp0, [ZBLOCK, YBLOCK, XBLOCK]), [ZBLOCK, XBLOCK]).to(tl.float32), boundary_check=[0, 1])
''', device_str='cuda')
```

The tricky part is that we don't want to emit redundant broadcasts in the store. This PR reworks the logic a bit to make sure we don't emit a second broadcast unless it actually changes the shape.

# Test plan

Added a CI test for this case, which would fail on trunk. Checked that only one broadcast was emitted.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151399
Approved by: https://github.com/jansel, https://github.com/eellison
2025-04-16 19:03:40 +00:00
eqy
17bf59340c [cuSPARSE][B200] Bump tolerances for test_sparse_csr matvec (#148721)
Small tolerance bump for blackwell (appears to use same kernel as prev. arches)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148721
Approved by: https://github.com/nWEIdia, https://github.com/ngimel
2025-04-16 18:44:18 +00:00
1f29190b59 [dynamo] unimplemented -> unimplemented_v2 in variables/builtin.py (#151145)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151145
Approved by: https://github.com/Skylion007, https://github.com/StrongerXi, https://github.com/jansel, https://github.com/zou3519
2025-04-16 17:16:05 +00:00
bc934f57d7 [ez] Make relaxed constraint error message more user friendly (#151407)
Fixes #151356

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151407
Approved by: https://github.com/Skylion007
2025-04-16 17:00:06 +00:00
cedcdda0ed Add ccode for CeilToInt and IntTrueDiv (#151375)
Summary: As titled

Test Plan: Test in D73052653 -- shape calculator generates successfully

Differential Revision: D73073845

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151375
Approved by: https://github.com/kalpit-meta-1, https://github.com/Skylion007
2025-04-16 16:47:55 +00:00
6a3a6d22dc Revert "[dynamo] context manager/decorator for dynamo config patching during tracing (#150586)"
This reverts commit 40ce4fb24a536d175348df876f61956d4945778e.

Reverted https://github.com/pytorch/pytorch/pull/150586 on behalf of https://github.com/clee2000 due to broke some inductor tests? inductor/test_fuzzer.py::TestConfigFuzzer::test_config_fuzzer_dynamo_bisect [GH job link](https://github.com/pytorch/pytorch/actions/runs/14486513628/job/40635178179) [HUD commit link](40ce4fb24a), bad TD ([comment](https://github.com/pytorch/pytorch/pull/150586#issuecomment-2810064322))
2025-04-16 16:13:47 +00:00
0c77af3576 [MPSInductor] Add pow, log2 and FloorToInt ops (#151449)
That enables `test_pow_by_natural_log2_dynamic_shapes_mps`

Not sure why log2 printer function suffix is `OpaqueUnaryFn_log2`, rather than just `log2`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151449
Approved by: https://github.com/jansel
2025-04-16 15:56:21 +00:00
e229ce34c4 [Openreg][PrivateUse1] Fix releasing tensor issue when using pin_memory (#151091)
As the title stated.

Related PR: https://github.com/pytorch/pytorch/pull/147066

Co-authored-by: Zhenbin Lin <lin-zhenbin@qq.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151091
Approved by: https://github.com/albanD
ghstack dependencies: #151005, #151007
2025-04-16 13:12:17 +00:00
c7400d0026 [inductor][comms] skip reorder_for_locality for wait nodes (#150074)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/150074
Approved by: https://github.com/eellison, https://github.com/bdhirsh
ghstack dependencies: #150258
2025-04-16 10:18:33 +00:00
159d8a14a6 [inductor][comms] fix node_summary for composite scheduler nodes (#150258)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/150258
Approved by: https://github.com/yf225
2025-04-16 10:18:33 +00:00
41c97a72a1 [export] Add draft-export to error msg (#151065)
Given an exception in torch.export, I want to try/catch it to add the message "hey try out draft-export!". Currently I only add this message for errors that draft-export is known to fix, like DataDependentErrors, ConstraintViolationErrors, and no fake impl.

Originally the error message looks like:
```
  File "/data/users/angelayi/pytorch/torch/_library/custom_ops.py", line 626, in fake_impl
    raise RuntimeError(
RuntimeError: There was no fake impl registered for <CustomOpDef(mylib::foo2)>. This is necessary for torch.compile/export/fx tracing to work. Please use `foo2_impl.register_fake` to add an fake impl.
```

Now, the error msg now looks something like:
```
  File "/data/users/angelayi/pytorch/torch/_library/custom_ops.py", line 626, in fake_impl
    raise RuntimeError(
RuntimeError: There was no fake impl registered for <CustomOpDef(mylib::foo2)>. This is necessary for torch.compile/export/fx tracing to work. Please use `foo2_impl.register_fake` to add an fake impl.

The error above occurred when calling torch.export.export. If you would like to view some more information about this error, and get a list of all other errors that may occur in your export call, you can rerun your program with the `DRAFT_EXPORT=1` envvar, or replace your `export()` call with `draft_export()`.
```

In Python versions >= 3.11, we can use `exception.add_note` to add to the error message. However, for earlier versions I do a hack that modifies `e.args`.
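
For illustration, here is a minimal sketch (not the exact code from this PR) of attaching an extra hint to an exception on both newer and older Python versions; `add_export_hint` and the message text are hypothetical:

```python
import sys

def add_export_hint(e: Exception, msg: str) -> Exception:
    # Hypothetical helper: attach an extra hint to an existing exception.
    if sys.version_info >= (3, 11):
        e.add_note(msg)  # first-class API on Python >= 3.11
    else:
        # Older Pythons: fold the hint into the exception's args so it is
        # shown when the exception is printed.
        if e.args:
            e.args = (f"{e.args[0]}\n\n{msg}",) + e.args[1:]
        else:
            e.args = (msg,)
    return e

try:
    raise RuntimeError("There was no fake impl registered for mylib::foo2.")
except RuntimeError as err:
    raise add_export_hint(err, "Consider rerunning with draft_export().")
```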

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151065
Approved by: https://github.com/pianpwk
ghstack dependencies: #151051
2025-04-16 08:56:02 +00:00
84e633e09d [export] Make draft-export predispatch=True by default (#151051)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151051
Approved by: https://github.com/pianpwk
2025-04-16 08:56:02 +00:00
a5c61668d7 fix ambiguous error message (#150086)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150086
Approved by: https://github.com/anijain2305
2025-04-16 08:48:05 +00:00
0a489f924d Fix: missing () in generated runtime assert c++ code (#151171)
Addresses one of the issues raised in the comments of https://github.com/pytorch/pytorch/issues/151127.

The generated code used to be `not a==5 or b==5`, but it should be `not (a==5 or b==5)`.
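
To make the precedence issue concrete, here is a small sketch (in Python for brevity; C++ `!`/`||` precedence behaves the same way) with hypothetical values of `a` and `b`:

```python
# Hypothetical values: 'a' differs from 5, 'b' equals 5.
a, b = 1, 5

# What the old generated code expressed: `not` binds tighter than `or`.
buggy = not a == 5 or b == 5      # parsed as (not (a == 5)) or (b == 5) -> True
# What the assert actually intends: negate the whole disjunction.
fixed = not (a == 5 or b == 5)    # -> False

print(buggy, fixed)  # True False -- the missing parentheses flip the result
```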

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151171
Approved by: https://github.com/aorenste, https://github.com/eellison
ghstack dependencies: #151170
2025-04-16 08:10:17 +00:00
55595e0c85 Fix Issues in deferring runtime assertions. (#151170)
This PR fixes two bugs:
1) Update `self.bound_unbacked_symbols` before emitting runtime asserts: set `self.bound_unbacked_symbols` before emitting runtime asserts so that runtime asserts depending on the current node are included.

2) In the pass that removes unused graph inputs, we should not remove symbols that are used by runtime assertions.

Address some of the issues in https://github.com/pytorch/pytorch/issues/151127

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151170
Approved by: https://github.com/bobrenjc93, https://github.com/eellison
2025-04-16 08:10:17 +00:00
abbca37fe8 [Openreg][PrivateUse1] Enable CI for openreg (#151007)
Changes:
- move test_openreg.py from test/cpp_extensions/open_registration_extension/ to test/
- update README.md for openreg
- enable CI
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151007
Approved by: https://github.com/albanD
ghstack dependencies: #151005
2025-04-16 07:55:51 +00:00
a9dbbe1aee [OpenReg][PrivateUse1] Refactoring the csrc files of pytorch_openreg (#151005)
As the title stated.

**Changes:**
- Remove unnecessary header file
- Remove unnecessary registry logic around PrivateUse1HooksRegistry, such as TORCH_DECLARE_REGISTRY, C10_DEFINE_REGISTRY, etc.
- Use a static global variable for initialization instead of call_once

**Next Step:**
Enable test_openreg.py in CI/CD to guard the quality of PrivateUse1
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151005
Approved by: https://github.com/albanD
2025-04-16 07:55:50 +00:00
40ce4fb24a [dynamo] context manager/decorator for dynamo config patching during tracing (#150586)
Implement traceable config patching for Dynamo: enables restricted patching of Dynamo config where user can use a context manager/decorator to change tracing behavior for parts of the code.

The new `dont_skip_tracing` decorator/context manager for ignoring most trace rules is easily implemented with this more generic traceable config patching feature.

Implementation:
- Create a new specialized context manager class representing a wrapper around torch._dynamo.config.patch
- Dynamo doesn't trace into the context manager but updates config at compile time
- Correctness is based on our correctness for handling supported context managers
- Implementation is inspired by how `GradModeVariable` is implemented.

Previous attempts: https://github.com/pytorch/pytorch/pull/148736 (decorator-only global approach) and https://github.com/pytorch/pytorch/pull/149439 (decorator-only traceback approach)

See https://docs.google.com/document/d/1vWNwKL_jpg-PLopifcaSa338wks3GqSVF4GHRguybGg/edit?tab=t.0 for more details on implementation - including previous approaches.

NOTE: this PR fixes a bug where skipped code objects were not tracked by convert_frame.py, leading to cases where code objects would be automatically skipped even after `torch._dynamo.reset()`. This exposed some latent dynamo-wrapped test failures in CI that previously passed in CI but not locally.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150586
Approved by: https://github.com/jansel, https://github.com/zou3519, https://github.com/anijain2305
2025-04-16 06:49:58 +00:00
daf2ccf023 [custom ops] Fix destroy function (#151299)
Summary:
D72906445 seemed to cause a SIGABRT when running the test in the test plan. I narrowed it down to the change in fake_impls where [`deregister_fake_kernel` no longer calls `lib.destroy`](https://github.com/pytorch/pytorch/pull/150806/files#diff-7fd3f4222276c63b91f3a895530bb5efe137fd23165b48f25afcf3c06a5d2a8fL65-L69).

Calling `lib.destroy` in that handle results in a maximum recursion error: library.destroy calls the handle, which calls back into library.destroy.

So I compared the implementations of `_del_library` and `lib.destroy`, and the main difference seemed to be deleting `self.m`. Adding that fixed my issue!

Side note, I feel like we can combine `_del_library` and `library._destroy`? But I won't do it in this diff to make sure we don't break too many things 😅

Test Plan:
`buck test 'fbcode//mode/opt' fbcode//aiplatform/gmpp/bulk_eval/reader/service/tests:reader_service_handler_tests -- --exact 'aiplatform/gmpp/bulk_eval/reader/service/tests:reader_service_handler_tests - aiplatform.gmpp.bulk_eval.reader.service.tests.reader_service_handler_tests.ReaderServiceHandlerTests: test_add_preproc_output_into_queue'`
https://www.internalfb.com/intern/testinfra/testrun/10977524170296078

Differential Revision: D73017613

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151299
Approved by: https://github.com/zou3519
2025-04-16 06:18:09 +00:00
585d03fa39 Record how many parameters we're parsing within dynamo (#148508)
This allows us to track how many parameters we have in compilations.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148508
Approved by: https://github.com/jansel, https://github.com/anijain2305

Co-authored-by: Sam Larsen <slarsen@meta.com>
2025-04-16 06:15:11 +00:00
b4cee2bf57 [executorch hash update] update the pinned executorch hash (#151280)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned executorch hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151280
Approved by: https://github.com/pytorchbot
2025-04-16 05:39:06 +00:00
107121dfad [AOTInductor] Add interface for user managed buffer in package api. (#151325)
Summary:
https://github.com/pytorch/pytorch/pull/151141
We add an interface for user-managed buffers in the package API.

Test Plan:
Included in commit.

Reviewed By: henrylhtsang

Differential Revision: D72985440

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151325
Approved by: https://github.com/angelayi
2025-04-16 04:25:40 +00:00
82200e33b5 Make torch._chunk_cat support non-contiguous inputs (#151263)
Currently, `torch._chunk_cat` only supports contiguous inputs (due to `.view()` usage in `_pad_chunk()` supporting only contiguous tensor). This doesn't work for internal models where there can be non-contiguous input tensors:

- size=[8192, 16416], stride=[16448, 1]  # stride[0] is larger than size[1]
- size=[1152, 384], stride=[1, 1152]  # column-major tensor

In this PR, we relax the assumption on contiguous input tensors by switching from `.view()` to `.reshape()`. Note that since `.reshape()` will try to use `.view()` under the hood whenever possible, this should not cause a regression for existing use cases.
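
As a minimal sketch (not from the PR) of why this matters, `.view()` rejects a column-major layout like the second example above, while `.reshape()` handles it by copying when needed:

```python
import torch

t = torch.randn(384, 1152).t()   # size=[1152, 384], stride=[1, 1152]: column-major
assert not t.is_contiguous()

try:
    t.view(-1)                   # .view() needs compatible strides -> raises
except RuntimeError as e:
    print("view failed:", e)

flat = t.reshape(-1)             # .reshape() falls back to a contiguous copy
print(flat.shape)                # torch.Size([442368])
```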

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151263
Approved by: https://github.com/BoyuanFeng
2025-04-16 04:18:46 +00:00
30101aa450 [c10d][fr] Add counters for FR dump and reduce its timeout to finish dump before watchdog timeout (#151329)
After https://github.com/pytorch/pytorch/pull/150652, we still see some ranks missing dumps. Upon looking further, the FR dump timed out on its first attempt:
watchdog thread: notify FR dump -> wait for 1 min -> throw watchdog timeout -> notify elastic to kill process
FR dump thread: receive FR dump signal -> time out after 1 min on first attempt -> start 2nd attempt -> get killed.

So we want to make the FR dump timeout shorter. In reality, the log shows that the dump finishes within one second; even at a very slow speed like 200K/s, the usual FR size (1MB at most) takes around 5 seconds, so 15 seconds gives roughly a 3x buffer.

We still let the watchdog sleep for 1 minute so that there is enough time for two dumps to time out and for the following checks, like the GIL checker, to execute.

Also, if we get stuck acquiring the GIL or hit a CUDA hang, 15 seconds should be enough to detect the hang.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151329
Approved by: https://github.com/fegin
2025-04-16 03:48:03 +00:00
3a90fd481e fix test_einsum: use initialized values (#151363)
Summary: `empty` uses uninitialized values, which could be NaNs; thus, the assert_close kept failing in FBCode.
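
A small illustrative sketch (not the test itself) of why comparing outputs built from `torch.empty` is flaky while initialized tensors compare deterministically:

```python
import torch

x = torch.empty(3)                        # uninitialized: arbitrary values, possibly NaN
y = torch.randn(3)                        # initialized: safe to compare

torch.testing.assert_close(y, y.clone())  # deterministic, always passes
# torch.testing.assert_close(x, x.clone()) could fail if x happens to contain NaN,
# since NaN != NaN under the default equal_nan=False.
```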

Test Plan:
```
buck test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:unbacked_symints_cpu -- --exact 'caffe2/test/inductor:unbacked_symints_cpu - test_einsum_cpu (caffe2.test.inductor.test_unbacked_symints.TestUnbackedSymintsCPU)' --env TORCH_LOGS="+output_code" --print-passing-details --env TORCH_LOGS_FORMAT="%(filename)s:%(lineno)s: %(message)s"
```

Differential Revision: D73067722

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151363
Approved by: https://github.com/Camyll

Co-authored-by: Camyll Harajli <camyllh@meta.com>
2025-04-16 03:10:29 +00:00
6124dabd30 [CI][NoOp] Update skip reason for argmin_with_nan (#151374)
Which is https://github.com/pytorch/pytorch/issues/130295 (i.e. torch.compile produces correct results, but eager does not)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151374
Approved by: https://github.com/dcci
2025-04-16 02:33:20 +00:00
ae53510b9e Fix setUpClass() / tearDownClass() for device-specific tests (#151129)
Finishes up the work started in #121686 + adds test

Update: this was not as straightforward as I originally imagined. Context below.

**TL;DR:** `TestFoo{CPU, CUDA}` now actually derive from `TestFoo`! Also, `{CPU, CUDA}TestBase` setup / teardown logic is now always called (it is required to set the primary device), regardless of whether `super().setUpClass()` / `super().tearDownClass()` are called or not.

**Background:** The typical way to get device-specific tests is to write a generic `TestFoo` and call `instantiate_device_type_tests(TestFoo, locals())` to get `TestFooCPU`, `TestFooCUDA`, etc. After this, generic tests (e.g. `TestFoo.test_bar()`) become `TestFooCPU.test_bar_cpu()` / `TestFooCUDA.test_bar_cuda()`.

Behind the scenes, this was historically accomplished by creating a `TestFooCUDA` that derives from both a `CUDATestBase` and an *empty class* called `TestFoo_base`. This `TestFoo_base` has the same bases as `TestFoo`, but none of the test functions (e.g. `test_bar()`). The documented reason for this is to avoid things like a derived `TestFooCUDA.test_bar()` being discovered in addition to the real device-specific test `TestFooCUDA.test_bar_cuda()`.

(1) A reason this matters is because it should be possible to call e.g. `super().setUpClass()` from a custom setup / teardown classmethod. If the generated TestFooCUDA does not derive from TestFoo, but instead derives from the empty class described above, this syntax does not work; in fact there is no way to form a proper `super()` call that works across the device-specific test variants. Here's an example that breaks in the OpInfo tests:

070f389745/test/test_ops.py (L218-L221)

(2) Further, there is some precedent within a custom `setUpClass()` impl for storing things on the `cls` object to be accessed at test time. This must be the device-specific test class (`TestFooCUDA`) and not `TestFoo` for this to work. As an example, the open device registration tests load a module during setup and use it in the test logic:

070f389745/test/test_cpp_extensions_open_device_registration.py (L63-L77)

070f389745/test/test_cpp_extensions_open_device_registration.py (L79-L80)

To accomplish both (1) and (2) at the same time, I decided to revisit the idea of utilizing a proper inheritance hierarchy for `TestFoo` -> `{TestFooCPU, TestFooCUDA}`. That is: have TestFooCPU / TestFooCUDA **actually** derive from `TestFoo`. This achieves both (1) and (2). The only thing left is to make sure the generic tests (e.g. `TestFoo.test_bar()`) are not discoverable, as was the stated reason for diverging from this in the first place. It turns out we can simply `delattr()` these generic tests from `TestFoo` once `TestFooCPU` / `TestFooCUDA` have been setup with the device-specific variants, and all works well. The `instantiate_device_type_tests(...)` logic already deletes `TestFoo` from scope, so I don't see a problem with deleting generic tests from this base class as well (CI will prove me right or wrong ofc).

**Side note:** I was encountering a weird race condition where sometimes the custom `setUpClass()` / `tearDownClass()` defined & swapped in [here](4a47dd9b3f/torch/testing/_internal/common_device_type.py (L940-L955)) would be used, and sometimes it wouldn't. This non-deterministic behavior was called out previously by @ngimel here:
4a47dd9b3f/test/inductor/test_torchinductor_dynamic_shapes.py (L128-L130)

To address this, I moved this block of logic to before the first call to `instantiate_test()`, as that method queries for the primary device, and the primary device identification logic may manually invoke `setUpClass()` (see [here](4a47dd9b3f/torch/testing/_internal/common_device_type.py (L381-L384))). Goal: define the `setUpClass()` / `tearDownClass()` we want for correctness before they're ever called. This seems to work and the behavior is deterministic now AFAICT.
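
A minimal sketch of the pattern this change makes work (names like `TestFoo` and `shared_resource` are illustrative, not from the PR):

```python
# Generic test class; instantiate_device_type_tests() turns it into
# TestFooCPU / TestFooCUDA, which now genuinely derive from TestFoo.
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import TestCase, run_tests


class TestFoo(TestCase):
    @classmethod
    def setUpClass(cls):
        super().setUpClass()      # now resolves through the device-specific base
        cls.shared_resource = 42  # stored on the device-specific class, visible at test time

    def test_bar(self, device):
        self.assertEqual(self.shared_resource, 42)


instantiate_device_type_tests(TestFoo, locals())

if __name__ == "__main__":
    run_tests()
```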

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151129
Approved by: https://github.com/janeyx99, https://github.com/masnesral, https://github.com/malfet
2025-04-16 02:18:42 +00:00
067a7b1d4a Disable -Werror for s390x test module compilation (#150413)
This change should make nightly testsuite green again for s390x.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/150413
Approved by: https://github.com/seemethere
2025-04-16 02:15:17 +00:00
aacac88bee [ROCM] Fix in-place aten sum with specialized templated kernels. (#151230)
We noticed a regression when doing aten.sum in-place (a+=b) and the type of the output is not the same as the functor's.

Co-authored by: Jerry Mannil <jerry.mannil@amd.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151230
Approved by: https://github.com/jeffdaily
2025-04-16 02:07:46 +00:00
cyy
cadd832c19 [1/N] Use std::string_view in torchgen (#146403)
Moves remaining c10::sv to std::sv

Pull Request resolved: https://github.com/pytorch/pytorch/pull/146403
Approved by: https://github.com/albanD
2025-04-16 01:50:22 +00:00
dd11613f94 [cutlass backend][experimental] Try out presets for cutlass instead of searching all configs (#151255)
Differential Revision: [D72668861](https://our.internmc.facebook.com/intern/diff/D72668861/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151255
Approved by: https://github.com/mlazos
2025-04-16 01:48:06 +00:00
532025fbd0 [cutlass backend][ez] Ban FP32 output dtype from using CUTLASS GEMM backend (#151279)
FP32 not supported: https://github.com/pytorch/pytorch/issues/145952

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151279
Approved by: https://github.com/ColinPeppler
2025-04-16 01:12:18 +00:00
8780d18f64 [ONNX] Add a comment for handling bf16/fp8 tensor to numpy conversion (#151371)
Follow up of https://github.com/pytorch/pytorch/pull/151259
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151371
Approved by: https://github.com/titaiwangms
2025-04-16 00:49:38 +00:00
4bbb61812c [BE][1/2] Move original_weights_lookup attribute to constant (#151241)
Summary: As title. Cleaning usages by using global constant.

Test Plan: `buck test 'fbcode//mode/opt' fbcode//caffe2/test:quantization_fx -- --exact 'caffe2/test:quantization_fx - test_keep_original_weights (quantization.fx.test_quantize_fx.TestQuantizeFx)'`

Differential Revision: D72892815

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151241
Approved by: https://github.com/Skylion007, https://github.com/hl475
2025-04-16 00:41:25 +00:00
44a522dd78 [BE] Fix extra-semi warning in attention.cpp (#151367)
Introduced by https://github.com/pytorch/pytorch/pull/149512

Before this change, following warning was generated
```
/Users/nshulga/git/pytorch/pytorch/aten/src/ATen/native/transformers/attention.cpp:452:71: warning: extra ';' outside of a function is incompatible with C++98 [-Wc++98-compat-extra-semi]
  452 | REGISTER_HPU_DISPATCH(_fused_sdp_choice_stub, &_fused_sdp_choice_meta);
      |                                                                       ^
1 warning generated.
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151367
Approved by: https://github.com/drisspg
2025-04-16 00:31:45 +00:00
8e6415fd32 [cutlass backend] "Fix" FlexibleLayout (#151284)
So Horace was right, Triton does fix the layout when rendering the template (i.e. roughly at the same time).

You can double-check by running the unit test with the GEMM backend set to "TRITON,CUTLASS". You will notice that the layout is fixed if Triton is among the GEMM backends, but flexible if Triton is not there.

code pointer: https://github.com/pytorch/pytorch/blob/main/torch/_inductor/select_algorithm.py#L927

In the future, we should remove `fix_op_layout` from class CUTLASSGemmTemplate. But maybe we can monitor it for a bit first.

Differential Revision: [D72996143](https://our.internmc.facebook.com/intern/diff/D72996143/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151284
Approved by: https://github.com/ColinPeppler
2025-04-16 00:10:52 +00:00
e55eb5c870 [Cutlass] Integrate EVT codegen into 3x gemm template (#150346)
Previously merged:
* #150345
* #150344

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150346
Approved by: https://github.com/henrylhtsang
ghstack dependencies: #150344, #150345
2025-04-16 00:08:22 +00:00
3cf0e2d8ec Add inductor standalone_compile API (#150670)
This PR adds a standalone_compile API that does precompilation via caching to support the vLLM use case in the short term while we work on the longer-term precompilation solution.

```
standalone_compile(gm, example_inputs, options) -> CompiledArtifact
CompiledArtifact.save(path, format: binary|unpacked = binary)
CompiledArtifact.load(path, format: binary|unpacked = binary)
```
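
A hypothetical usage sketch following the signature above; the exact import locations (`torch._inductor.standalone_compile`, `torch._inductor.CompiledArtifact`) and the FX-graph construction are assumptions, not taken from this PR:

```python
# Hedged sketch: exact module paths are assumptions.
import torch
from torch._inductor import standalone_compile, CompiledArtifact  # assumed exports

def f(x):
    return torch.sin(x) + 1

gm = torch.fx.symbolic_trace(f)          # a GraphModule to precompile
example_inputs = [torch.randn(8)]

artifact = standalone_compile(gm, example_inputs)
artifact.save(path="/tmp/f_artifact", format="binary")

loaded = CompiledArtifact.load(path="/tmp/f_artifact", format="binary")
```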

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150670
Approved by: https://github.com/jamesjwu, https://github.com/zou3519
2025-04-15 23:38:15 +00:00
9917feff50 [ONNX] Produce correct dtypes for bf16/f8 in IR TorchTensor (#151259)
Split the changes from https://github.com/pytorch/pytorch/pull/151069 to address https://github.com/microsoft/onnxscript/issues/2187, where the output np arrays do not have the correct ml_dtypes types as expected.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151259
Approved by: https://github.com/titaiwangms
2025-04-15 23:21:04 +00:00
331423e5c2 Fix tensorpipe compilation with clang-17 (#151344)
By suppressing `missing-template-arg-list-after-template-kw` warning, which seems to be required to compile Google's libnop, which is in a semi-abandoned state now
```
In file included from /Users/malfet/git/pytorch/pytorch/third_party/tensorpipe/third_party/libnop/include/nop/base/variant.h:21:
/Users/malfet/git/pytorch/pytorch/third_party/tensorpipe/third_party/libnop/include/nop/types/variant.h:241:30: error: a template argument list is expected after a name prefixed by the template keyword [-Wmissing-template-arg-list-after-template-kw]
  241 |     index_ = value_.template Construct(std::forward<Args>(args)...);
      |                              ^
/Users/malfet/git/pytorch/pytorch/third_party/tensorpipe/third_party/libnop/include/nop/types/variant.h:258:26: error: a template argument list is expected after a name prefixed by the template keyword [-Wmissing-template-arg-list-after-template-kw]
  258 |     if (!value_.template Assign(TypeTag<T>{}, index_, std::forward<U>(value))) {
      |                          ^
/Users/malfet/git/pytorch/pytorch/third_party/tensorpipe/third_party/libnop/include/nop/types/variant.h:265:26: error: a template argument list is expected after a name prefixed by the template keyword [-Wmissing-template-arg-list-after-template-kw]
  265 |     if (!value_.template Assign(index_, std::forward<T>(value))) {
      |                          ^
3 errors generated.
```

Fixes https://github.com/pytorch/pytorch/issues/151316

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151344
Approved by: https://github.com/ZainRizvi, https://github.com/seemethere
2025-04-15 22:18:06 +00:00
98b1e82ba8 Revert "Fix setUpClass() / tearDownClass() for device-specific tests (#151129)"
This reverts commit bd4cf30e31a2a0b0a57f54c7eedd3a39d5778cbe.

Reverted https://github.com/pytorch/pytorch/pull/151129 on behalf of https://github.com/jbschlosser due to flex attention tests failing ([comment](https://github.com/pytorch/pytorch/pull/151129#issuecomment-2807632119))
2025-04-15 22:07:25 +00:00
e1d8b3f838 [inductor] Check NoneLayout in update_zero_dim_cpu_tensor (#151321)
Summary:
This fixes the error in https://fb.workplace.com/groups/1075192433118967/permalink/1640802133224658/
I tried really hard but I couldn't come up with a test case to repro the issue, but I confirmed with the OP that this issue has been fixed.
```
Traceback (most recent call last):
  File "/dev/shm/uid-99/d2b830f6-seed-nspid4026547915_cgpid362302-ns-4026547912/torch/_inductor/compile_fx.py", line 746, in _compile_fx_inner
    mb_compiled_graph = fx_codegen_and_compile(
  File "/dev/shm/uid-99/d2b830f6-seed-nspid4026547915_cgpid362302-ns-4026547912/torch/_inductor/compile_fx.py", line 1343, in fx_codegen_and_compile
    return scheme.codegen_and_compile(gm, example_inputs, inputs_to_check, graph_kwargs)
  File "/dev/shm/uid-99/d2b830f6-seed-nspid4026547915_cgpid362302-ns-4026547912/torch/_inductor/compile_fx.py", line 1232, in codegen_and_compile
    compiled_module = graph.compile_to_module()
  File "/dev/shm/uid-99/d2b830f6-seed-nspid4026547915_cgpid362302-ns-4026547912/torch/_inductor/graph.py", line 2087, in compile_to_module
    return self._compile_to_module()
  File "/dev/shm/uid-99/d2b830f6-seed-nspid4026547915_cgpid362302-ns-4026547912/torch/_inductor/graph.py", line 2095, in _compile_to_module
    self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
  File "/dev/shm/uid-99/d2b830f6-seed-nspid4026547915_cgpid362302-ns-4026547912/torch/_inductor/graph.py", line 2002, in codegen
    self._update_scheduler()
  File "/dev/shm/uid-99/d2b830f6-seed-nspid4026547915_cgpid362302-ns-4026547912/torch/_inductor/graph.py", line 1996, in _update_scheduler
    self.scheduler = Scheduler(self.operations)
  File "/dev/shm/uid-99/d2b830f6-seed-nspid4026547915_cgpid362302-ns-4026547912/torch/_inductor/scheduler.py", line 1954, in __init__
    self._init(nodes)
  File "/dev/shm/uid-99/d2b830f6-seed-nspid4026547915_cgpid362302-ns-4026547912/torch/_inductor/scheduler.py", line 1974, in _init
    self.update_zero_dim_cpu_tensor()
  File "/dev/shm/uid-99/d2b830f6-seed-nspid4026547915_cgpid362302-ns-4026547912/torch/_inductor/scheduler.py", line 4433, in update_zero_dim_cpu_tensor
    and buffer.get_size() == []
  File "/dev/shm/uid-99/d2b830f6-seed-nspid4026547915_cgpid362302-ns-4026547912/torch/_inductor/ir.py", line 3903, in get_size
    return [*self.get_layout().size]
  File "/dev/shm/uid-99/d2b830f6-seed-nspid4026547915_cgpid362302-ns-4026547912/torch/_inductor/ir.py", line 3914, in get_layout
    raise NotImplementedError(type(self.layout).__name__)
torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised:
NotImplementedError: NoneLayout
```

Test Plan: OP said the issue is fixed

Differential Revision: D72575808

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151321
Approved by: https://github.com/BoyuanFeng
2025-04-15 21:58:09 +00:00
4518b30680 Clarify that x and dx are mutually exclusive in torch.trapezoid doc (#151190)
This PR addresses [#151105](https://github.com/pytorch/pytorch/issues/151105) by stating that x and dx are mutually exclusive parameters in torch.trapezoid()
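
A short usage sketch of the two mutually exclusive ways to supply spacing (example values are arbitrary):

```python
import torch

y = torch.tensor([1.0, 2.0, 3.0])

# Either pass explicit sample points x ...
x = torch.tensor([0.0, 1.0, 3.0])
print(torch.trapezoid(y, x))       # tensor(6.5000)

# ... or a uniform spacing dx, but not both.
print(torch.trapezoid(y, dx=2.0))  # tensor(8.)
```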

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151190
Approved by: https://github.com/soulitzer
2025-04-15 21:42:05 +00:00
630cf46039 [Cutlass] Codegen for EVT Epilogue (#150345)
Previously merged:
* #150344

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150345
Approved by: https://github.com/henrylhtsang, https://github.com/eellison
ghstack dependencies: #150344
2025-04-15 21:31:21 +00:00
27ef3f6cdc [ROCm][CI/CD] Create ROCm6.4 magma tarball (#151345)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151345
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-04-15 21:12:48 +00:00
71e7dcda87 [c10d][fr] Record each individual collective being coalesced (#151238)
When recording FR for coalesced collectives we are not consistent: for P2P ops we log individual collectives into FR, but for non-P2P ops we don't. This PR makes non-P2P ops also log individual collectives into FR so that we can use the script to check the correctness of each one of the coalesced collectives.

The added unit test also addresses the ask in the comment in https://github.com/pytorch/pytorch/pull/150863?fbclid=IwZXh0bgNhZW0CMTEAAR4a5Rd_JyJlrbKZcacbIv5WX5b4MqBRNn0hpgl-VTSD0eeXRlPZ9Ty_CPOYhQ_aem_ALEG1ibRajwie-rn1B4n5w#pullrequestreview-2751254224.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151238
Approved by: https://github.com/d4l3k, https://github.com/wconstab
ghstack dependencies: #151247
2025-04-15 20:56:37 +00:00
ae648f047c [c10d][fr] Enable FR analysis script for rest of all coalesce op (#151247)
We revisited how coalesced collectives work in https://github.com/pytorch/pytorch/pull/151243 and now want to enable the script to work for the slow path. The change is indeed BC-breaking, but it is needed to make this work, and the API is internal-use only, not user facing. For the slow path, each individual collective has its input and output sizes recorded but no state; the final one has the state ready. We check the correctness of each individual collective one by one, but we don't check the state match for these collectives; we can only check the state match for the last one, which is the work item with the coalesced label.

Added more unit tests for the slow path.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151247
Approved by: https://github.com/d4l3k, https://github.com/XilunWu
2025-04-15 20:53:03 +00:00
f98150fc8e Warn user of existing lock file to avoid infinite waiting (#149382)
Sometimes the Python script doesn't exit normally and the lock file remains in place. In this case, `file_baton.py` may sleep forever waiting for the lock file to be released. This PR adds a warning that shows the existing lock file path, so the user can better understand which file to delete when the waiting time is too long.
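
A minimal sketch (not the actual `FileBaton` code) of the warning behavior described here; the function name and thresholds are illustrative:

```python
import os
import time
import warnings

def wait_for_lock(lock_path: str, interval: float = 1.0, warn_after: float = 10.0) -> None:
    # Spin until the lock file disappears, warning once with its path so the
    # user knows which stale file to delete if the wait drags on.
    waited = 0.0
    while os.path.exists(lock_path):
        if waited >= warn_after:
            warnings.warn(
                f"Still waiting on lock file '{lock_path}'; "
                "delete it if no other build is running."
            )
            warn_after = float("inf")  # only warn once
        time.sleep(interval)
        waited += interval
```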

Pull Request resolved: https://github.com/pytorch/pytorch/pull/149382
Approved by: https://github.com/soulitzer
2025-04-15 20:25:29 +00:00
bd4cf30e31 Fix setUpClass() / tearDownClass() for device-specific tests (#151129)
Finishes up the work started in #121686 + adds test

Update: this was not as straightforward as I originally imagined. Context below.

**TL;DR:** `TestFoo{CPU, CUDA}` now actually derive from `TestFoo`! Also, `{CPU, CUDA}TestBase` setup / teardown logic is now always called (it is required to set the primary device), regardless of whether `super().setUpClass()` / `super().tearDownClass()` are called or not.

**Background:** The typical way to get device-specific tests is to write a generic `TestFoo` and call `instantiate_device_type_tests(TestFoo, locals())` to get `TestFooCPU`, `TestFooCUDA`, etc. After this, generic tests (e.g. `TestFoo.test_bar()`) become `TestFooCPU.test_bar_cpu()` / `TestFooCUDA.test_bar_cuda()`.

Behind the scenes, this was historically accomplished by creating a `TestFooCUDA` that derives from both a `CUDATestBase` and an *empty class* called `TestFoo_base`. This `TestFoo_base` has the same bases as `TestFoo`, but none of the test functions (e.g. `test_bar()`). The documented reason for this is to avoid things like a derived `TestFooCUDA.test_bar()` being discovered in addition to the real device-specific test `TestFooCUDA.test_bar_cuda()`.

(1) A reason this matters is because it should be possible to call e.g. `super().setUpClass()` from a custom setup / teardown classmethod. If the generated TestFooCUDA does not derive from TestFoo, but instead derives from the empty class described above, this syntax does not work; in fact there is no way to form a proper `super()` call that works across the device-specific test variants. Here's an example that breaks in the OpInfo tests:

070f389745/test/test_ops.py (L218-L221)

(2) Further, there is some precedent within a custom `setUpClass()` impl for storing things on the `cls` object to be accessed at test time. This must be the device-specific test class (`TestFooCUDA`) and not `TestFoo` for this to work. As an example, the open device registration tests load a module during setup and use it in the test logic:

070f389745/test/test_cpp_extensions_open_device_registration.py (L63-L77)

070f389745/test/test_cpp_extensions_open_device_registration.py (L79-L80)

To accomplish both (1) and (2) at the same time, I decided to revisit the idea of utilizing a proper inheritance hierarchy for `TestFoo` -> `{TestFooCPU, TestFooCUDA}`. That is: have TestFooCPU / TestFooCUDA **actually** derive from `TestFoo`. This achieves both (1) and (2). The only thing left is to make sure the generic tests (e.g. `TestFoo.test_bar()`) are not discoverable, as was the stated reason for diverging from this in the first place. It turns out we can simply `delattr()` these generic tests from `TestFoo` once `TestFooCPU` / `TestFooCUDA` have been setup with the device-specific variants, and all works well. The `instantiate_device_type_tests(...)` logic already deletes `TestFoo` from scope, so I don't see a problem with deleting generic tests from this base class as well (CI will prove me right or wrong ofc).

**Side note:** I was encountering a weird race condition where sometimes the custom `setUpClass()` / `tearDownClass()` defined & swapped in [here](4a47dd9b3f/torch/testing/_internal/common_device_type.py (L940-L955)) would be used, and sometimes it wouldn't. This non-deterministic behavior was called out previously by @ngimel here:
4a47dd9b3f/test/inductor/test_torchinductor_dynamic_shapes.py (L128-L130)

To address this, I moved this block of logic to before the first call to `instantiate_test()`, as that method queries for the primary device, and the primary device identification logic may manually invoke `setUpClass()` (see [here](4a47dd9b3f/torch/testing/_internal/common_device_type.py (L381-L384))). Goal: define the `setUpClass()` / `tearDownClass()` we want for correctness before they're ever called. This seems to work and the behavior is deterministic now AFAICT.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151129
Approved by: https://github.com/janeyx99, https://github.com/masnesral, https://github.com/malfet
2025-04-15 20:13:26 +00:00
d77e0cddfe [Cutlass] Import cutlass python API for EVT (#150344)
This imports the pieces of the cutlass python API that are needed for python EVT tracing. It builds on the existing importing of cutlass_library. Once EVT tracing has been added to cutlass_library (which should happen later this year), this can be removed.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150344
Approved by: https://github.com/henrylhtsang, https://github.com/eellison
2025-04-15 20:11:40 +00:00
91923f0ee1 [inductor] disable alignment asserts in fbcode (#151274)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151274
Approved by: https://github.com/Mingming-Ding, https://github.com/Microve, https://github.com/eellison
2025-04-15 19:59:54 +00:00
a2632d5241 [HOP] Reworked DispatchKey.Autograd (#151107)
This PR intends to rework the dispatching of the autograd key.
I.e., currently the DispatchKey.Autograd of the HOPs was triggered even if none of the operands of the HOP have `requires_grad=True`. With this rework, autograd is bypassed if none of the operands require gradients and is only invoked if at least one operand requires gradients.
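The dispatch decision boils down to a check like the following (a hedged sketch, not the literal HOP code):

```
import torch
from torch.utils import _pytree as pytree

def any_requires_grad(*operands) -> bool:
    # Only route the HOP through the Autograd dispatch key if at least one
    # tensor operand actually needs gradients; otherwise bypass it entirely.
    flat_args, _ = pytree.tree_flatten(operands)
    return any(isinstance(a, torch.Tensor) and a.requires_grad for a in flat_args)
```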

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151107
Approved by: https://github.com/ydwu4
2025-04-15 19:55:46 +00:00
19a33b20c2 [ROCm][CI/CD] create ROCm 6.4 images, part 1, skip magma tarball (#151236)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151236
Approved by: https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-04-15 19:45:15 +00:00
8d5f7ab06c Replace all random is_fbcode imports to environment (#151283)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151283
Approved by: https://github.com/masnesral, https://github.com/Skylion007
2025-04-15 19:42:58 +00:00
eea4a7b424 update expected results for comptime benchmark (#151319)
This PR https://github.com/pytorch/pytorch/pull/150594 bumped the benchmark up by ~1%, a bit under our 1.5% "regression" mark.

Modeled this PR after https://github.com/pytorch/pytorch/pull/144274

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151319
Approved by: https://github.com/jamesjwu, https://github.com/laithsakka
2025-04-15 19:40:13 +00:00
e45a6a9300 [inductor][test] Disable Triton GEMM backend tests for SM89 (#150485)
Motivation: To deprecate a silent fallback behavior https://github.com/pytorch/pytorch/issues/150390

Problem: On SM89, the Triton GEMM backend isn't working. This seems to be a pre-existing issue. I don't have access to SM89 to debug further.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150485
Approved by: https://github.com/xmfan, https://github.com/eellison
2025-04-15 19:03:52 +00:00
f1adf22b5f improve noop elimination for slice and slice_scatter (#151175)
Improves noop elimination for `slice` and `slice_scatter`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151175
Approved by: https://github.com/zou3519
2025-04-15 18:56:50 +00:00
d7050ef48b [CI] Run test_torchinductor for MPS device (#150821)
There are only 118 failures atm, mark them all with xfail to avoid new regressions

Add `xfail_if_mps_unimplemented` decorator to distinguish between tests that call unimplemented eager op vs ones that fail for some other reason.

Added `aten._scaled_dot_product_attention_math_for_mps` fallback to make test behavior consistent between MacOS-15 (where the fallback is in place) and MacOS-14

Weird MacOS-14 specific skips:
- test_torchinductor.py::GPUTests::test_cat_extern_kernel_mps
- test_torchinductor.py::GPUTests::test_sort_transpose_mps (likely an eager bug)
- test_torchinductor.py::GPUTests::test_unaligned_input_mps

Numerous MacOS-13 skips, including a few eager hard crashes; for example, running `test_torchinductor.py::GPUTests::test_scatter5_mps` causes
```
/AppleInternal/Library/BuildRoots/c651a45f-806e-11ed-a221-7ef33c48bc85/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShaders/MPSNDArray/Kernels/MPSNDArrayScatter.mm:309: failed assertion `Rank of destination array (1) must be greater than or equal to inner-most dimension of indices array (3)'
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150821
Approved by: https://github.com/ZainRizvi, https://github.com/dcci
ghstack dependencies: #151224, #151246, #151272, #151282, #151288
2025-04-15 18:42:39 +00:00
7e5f6dcf7f Add @requires_multicast_support to test_multimem_all_gather (#151227)
Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151227
Approved by: https://github.com/jeffdaily
2025-04-15 18:41:12 +00:00
83d88d128d [reland] Make export._trace._WrapperModule work in strict mode (#146919) (#151264)
Summary:

as title

`export._trace._WrapperModule` is used to wrap functions into a Module so we can export the function.

We add `export._wrapper_utils` to `dynamo`'s `MOD_INLINELIST` so dynamo traces into `_WrapperModule`

Fixes https://github.com/pytorch/pytorch/issues/146867

Test Plan:
```
buck run fbcode//mode/dev-nosan //caffe2/test:test_export -- -r wrapper_module
```

Differential Revision: D72986826

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151264
Approved by: https://github.com/angelayi
2025-04-15 18:35:34 +00:00
61f127aac5 [Export] fix automatically convert instances of _check(u>=0) to check_is_size() (#148844)
Fixes #148826

Understanding:

1. PyTorch should automatically convert instances of _check(u>=0) to check_is_size()
2. The export mechanism should suggest using check_is_size() instead of _check(u>=0) when applicable
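To make the two spellings concrete, here is a small sketch using the public helpers named in the issue (not code from this PR):

```
import torch

def take_prefix(x: torch.Tensor, n):
    # The generic runtime assert the issue talks about ...
    torch._check(n >= 0)
    # ... and the size-specialized form that export should suggest instead,
    # which additionally tells the compiler that `n` is usable as a size.
    torch._check_is_size(n)
    return x[:n]
```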

Changes made:
1. Added a helper function to detect non-negative checks: is_non_negative_check
2. Modified the suggestion logic in _suggest_torch_checks to detect and handle non-negative checks
3. Added unit tests: test_is_non_negative_check_function, test_suggest_torch_checks_with_non_negative_check, and test_suggest_torch_checks_with_regular_check

unit tests:

(base) sany@sandishs-Laptop pytorch % pytest test/export/test_export.py::TestExport::test_suggest_torch_checks_with_non_negative_check
=================================== test session starts ==================
platform darwin -- Python 3.9.19, pytest-7.3.2, pluggy-1.5.0
rootdir: /Users/sany/git/pytorch
configfile: pytest.ini
plugins: xdoctest-1.1.0, cpp-2.3.0, flakefinder-1.1.0, anyio-4.6.0, rerunfailures-14.0, hypothesis-5.35.1, xdist-3.3.1, subtests-0.13.1, typeguard-4.3.0
collected 1 item
Running 1 items in this shard

test/export/test_export.py .                                                                                           [100%]

======================== 1 passed in 1.67s =======================
(base) sany@sandishs-Laptop pytorch % pytest test/export/test_export.py::TestExport::test_suggest_torch_checks_with_regular_check
======================= test session starts =================
platform darwin -- Python 3.9.19, pytest-7.3.2, pluggy-1.5.0
rootdir: /Users/sany/git/pytorch
configfile: pytest.ini
plugins: xdoctest-1.1.0, cpp-2.3.0, flakefinder-1.1.0, anyio-4.6.0, rerunfailures-14.0, hypothesis-5.35.1, xdist-3.3.1, subtests-0.13.1, typeguard-4.3.0
collected 1 item
Running 1 items in this shard

test/export/test_export.py .                                                                                           [100%]

================================= 1 passed in 1.61s ================
(base) sany@sandishs-Laptop pytorch % pytest test/export/test_export.py::TestExport::test_is_non_negative_check_function
================================ test session starts =============
platform darwin -- Python 3.9.19, pytest-7.3.2, pluggy-1.5.0
rootdir: /Users/sany/git/pytorch
configfile: pytest.ini
plugins: xdoctest-1.1.0, cpp-2.3.0, flakefinder-1.1.0, anyio-4.6.0, rerunfailures-14.0, hypothesis-5.35.1, xdist-3.3.1, subtests-0.13.1, typeguard-4.3.0
collected 1 item
Running 1 items in this shard

test/export/test_export.py .                                                                                           [100%]

======================= 1 passed in 1.62s =========================
(base) sany@sandishs-Laptop pytorch %

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148844
Approved by: https://github.com/laithsakka
2025-04-15 17:41:11 +00:00
74f6bc28a7 Revert "Add inductor standalone_compile API (#150670)"
This reverts commit c9aef508984a31f03821eaad381468673ef29c0a.

Reverted https://github.com/pytorch/pytorch/pull/150670 on behalf of https://github.com/Camyll due to breaking internal builds with torch module not found error ([comment](https://github.com/pytorch/pytorch/pull/150670#issuecomment-2806975267))
2025-04-15 17:35:59 +00:00
c0a0761871 [Inductor] Refactor wrapper codegen to use Wrapper IR. (#150458)
Preparatory refactor for https://github.com/pytorch/pytorch/pull/146942.

# Feature

This PR refactors the existing wrapper codegen into `WrapperLine` subclasses, extending the existing Memory Planning IR into a fully-fledged Wrapper IR. See the diagram below.

![wrapper_ir](https://github.com/user-attachments/assets/a61db21b-caf3-45d2-bfdb-91066ae4ba6b)

The IR currently supports the following ops:
- All existing memory planning IR ops (`AllocateLine`, `FreeIfNotReusedLine`, etc.)
- Reinterpret views (`ReinterpretLine`)
- Kernel definitions (`KernelDefinitionLine`)
- Calls to defined kernels (`KernelCallLine`)
- Calls to extern kernels (`ExternKernelLine`, `ExternKernelAllocLine`)
- Ops with multiple outputs (`MultiOutputLine`)
- Tensor cleanup at the end of a graph (`FreeLine`)
- Leaving comments in code (`CommentLine`)

There are two main motivations for this refactor:
1. Unlike free-form C++ and Python code, Wrapper IR lines provide structured information about what the wrapper code does. This serves as a natural extension point for other types of wrapper codegen. For example, the parent PR generates FX IR from Wrapper IR. Wrapper IR aims to give new backends enough information to generate wrapper code without needing to modify core Inductor files such as `ir.py`.
2. This design will hopefully promote stronger modularity and encapsulation.
   a. Inductor's core compilation passes don't need to worry about whether they're targeting Python, C++, FX or anything else. They can simply focus on generating Wrapper IR, and target-specific code can be refactored into the various backends.
   b. Backends do not need to know about all the details and internal state of `V.graph` IR. For example, they don't need to consider whether a buffer has been removed from the graph when generating code. Wrapper IR will hopefully provide a simpler interface for generating wrapper code, which abstracts away the details of device code.

# Implementation details

The implementation mainly consists of separating direct C++/Python codegen into two phases:
 1. Emit Wrapper IR lines describing what the wrapper code is supposed to do.
 2. Inside the `codegen()` method of each `WrapperLine`, call backend methods which generate pure Python/C++ code using the information stored in the Wrapper IR line. For example, `KernelCallLine` calls `wrapper._generate_kernel_call_helper`, which is overridden by the various Python and C++ backends to generate the final wrapper code.

The main difficulty in implementing this is that we need to be careful that code is generated in the correct order. Wrapper codegen happens in two passes: first we write code into `self.lines` which mainly contains wrapper IR, but can also contain raw Python or C++ lines in some situations. Then, we convert the wrapper IR into the final Python/C++ code in `self.wrapper_call`. Since the same macros may be used in both passes, it's difficult to ensure that code is written to the correct buffer. The easiest solution for this was to implement a context manager overriding the `writeline` method to write to  `self.wrapper_call` after memory planning is finished. This way, `writeline` writes to `self.lines` in the first pass, and `self.wrapper_call` in the second. This obviated the need to pass `code` or `writeline` variables all the way through the call stack, which would have touched most of the existing macros.
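A hedged sketch of that two-buffer scheme (attribute and method names are illustrative, not the real `PythonWrapperCodegen` API):

```
import contextlib

class WrapperCodegenSketch:
    def __init__(self):
        self.lines = []          # first pass: Wrapper IR lines
        self.wrapper_call = []   # second pass: final Python/C++ text
        self._target = self.lines

    def writeline(self, line):
        self._target.append(line)

    @contextlib.contextmanager
    def emit_wrapper_call(self):
        # After memory planning, redirect writeline() to the final buffer so
        # the same codegen helpers work unchanged in both passes.
        prev, self._target = self._target, self.wrapper_call
        try:
            yield
        finally:
            self._target = prev
```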

# Test plan

Since this refactor touches all the existing wrapper codegen classes, the existing CI provides good coverage.

The parent PR introduces new tests for the FX IR backend. Among other things, these tests assert that `self.lines` only contains Wrapper IR lines, and no free-form code. While this would not be true of all programs today, the tests suggest that the IR implemented in this PR is sufficient to cover basic PyTorch usage.

# Future directions

These two goals are only partially realized by this PR. These are several important steps which still undergo direct Python/C++ codegen in core files:
 - User-defined Triton kernels.
 - Reinterpret views on outputs, from `gen_output_refs()`. (In the parent PR, the FX converter has a custom way of handling this. This can eventually be ported into Wrapper IR.)
 -  Fallback ops with custom `codegen()` methods, e.g. `ScatterFallback`.
 -  Misc. C++ lines emitted by the various cpp backends, e.g. declaring constants.

These cases will gradually be handled in subsequent PRs, as the Inductor->FX converter expands its coverage. Given that these refactors are pretty tricky to do, it seems wiser to execute them in stages, as opposed to porting everything to Wrapper IR at once. Some Python and C++ codegen still lives in core files such as `ir.py`, as described in previous sections. Hopefully, this PR will serve as a starting point which moves the codebase towards a more modular design. Over time, we can gradually refactor the remaining codegen (mainly in `ir.py`) into backend classes.

One limitation of this PR is that codegen still happens in two phases during `PythonWrapperCodegen`. First, we generate Wrapper IR into `self.lines`, and from there we generate Python or C++ code into `self.wrapper_call`, `self.header`, etc. In the long term, it would be cleaner to split wrapper IR into its own class which doesn't deal with Python/C++ codegen at all. (See the diagram at the top.) That would strictly enforce the boundary between Wrapper IR and Python/C++ wrapper code. However, this would probably be a much larger refactor.

Another limitation of the current code is that the helper functions have a lot of call args. It's also possible to clean this up by passing Wrapper IR ops e.g. `KernelCallLine` into helper functions like `_generate_kernel_call_helper`, since they store all the arguments. However, that change would likely be prone to merge conflicts, so I would like to save it for follow-up PRs if possible.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150458
Approved by: https://github.com/eellison
2025-04-15 17:28:36 +00:00
8f440a8e70 don't return logits for benchmark script (#151075)
PT2 benchmark scripts has a pattern like:
```
    def forward_and_backward_pass(self, mod, inputs, collect_outputs=True):
        cloned_inputs = clone_inputs(inputs)
        self.optimizer_zero_grad(mod)
        with self.autocast(**self.autocast_arg):
            pred = mod(**cloned_inputs)
            loss = self.compute_loss(pred)
        self.grad_scaler.scale(loss).backward()
        self.optimizer_step()
        if collect_outputs:
            return collect_results(mod, pred, loss, cloned_inputs)
        return None
```
for training.

The collect_outputs argument is True only for accuracy testing and it's false for performance testing.

For the HF benchmark suite, a model usually returns a tuple (loss, logits). For performance testing, even though the logits are never used anywhere, dynamo has to keep them due to the control flow.

A few bad things happen if we keep the logits here:
1. the peak memory will be higher since the logits are large and we cannot release their memory earlier.
2. we cannot do optimizations like chunking for the logits because the tensor needs to be returned from the pre-grad graph.

Actually I think it's fine to not return logits at all.
- For training cases, checking loss and gradients for accuracy is good enough. It's hard to imagine two runs having mismatched logits but matching loss/gradients.
- Also, discarding the logits as soon as possible makes the perf benchmarking comparison fairer.
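A hedged sketch of the direction argued for here, reusing the helper names from the snippet above (this is not the exact benchmark change):

```
def training_step_for_perf(self, mod, inputs):
    cloned_inputs = clone_inputs(inputs)
    self.optimizer_zero_grad(mod)
    with self.autocast(**self.autocast_arg):
        # The logits never escape this expression, so nothing forces the
        # compiled graph to keep them alive or return them.
        loss = self.compute_loss(mod(**cloned_inputs))
    self.grad_scaler.scale(loss).backward()
    self.optimizer_step()
    return None
```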

On the other hand, it may be interesting to let dynamo support something like dynamo.constexpr (similar to tl.constexpr). A variable annotated as dynamo.constexpr will be specialized at compile time and we can do more optimization (DCE e.g.) at compile time. (A small [repro](https://gist.github.com/shunting314/0912a8947028a904c34f361021b8024d))

Benchmark results here [link](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Fri%2C%2004%20Apr%202025%2018%3A03%3A26%20GMT&stopTime=Fri%2C%2011%20Apr%202025%2018%3A03%3A26%20GMT&granularity=hour&mode=training&dtype=amp&deviceName=cuda%20(h100)&lBranch=gh/shunting314/204/head&lCommit=fe25dab3f65e1b0e9db0af03f7664af70fcc9c66&rBranch=main&rCommit=55e62ff74ad5614faf80b060c7bfc551e3b7af5a)
- HF 15% (1.51 -> 1.66 compression ratio) peak memory improvement
- I also see a 5% (2.74 -> 2.79x) perf win for HF. It could be real: we may generate more efficient kernels since we don't need to keep the logits and return them from the pre-grad graph. But I'll double check.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151075
Approved by: https://github.com/eellison, https://github.com/jansel
2025-04-15 17:13:00 +00:00
7d205b22b5 [profiler][retry] don't disable CUPTI_LAZY_REINIT for cuda >= 12.6 (#151124)
Retry of https://github.com/pytorch/pytorch/pull/150957, which was reverted due to internal meta failures

Credit to @mgmtea who wrote the initial version of this PR: https://github.com/pytorch/pytorch/pull/146604

Context: CUPTI is the NVIDIA library that Kineto uses for collecting GPU-side info during profiling. The intended usage is to register a callback while you want profiling to occur, and then unregister the callback when you want profiling to stop. But a bug would cause crashes if CUPTI callbacks were de-registered when used with cudagraphs. The workaround was to disable "CUPTI_LAZY_REINIT" and "CUPTI_TEARDOWN" in Kineto - which prevents crashes, but can result in slower execution after profiling has occurred and completed.

This bug is believed to be fixed in CUDA >= 12.6, so this PR qualifies that DISABLE_CUPTI_LAZY_REINIT=1 and CUPTI_TEARDOWN=0 should only be applied if CUDA < 12.6. Additionally, `profiler_allow_cudagraph_cupti_lazy_reinit_cuda12()` is added as an escape hatch so that we can add a killswitch in case we see more crashes related to this.
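The version gate amounts to something like the following (a sketch; the environment variable names are the ones quoted above and the helper is illustrative):

```
import torch

def needs_cupti_workaround() -> bool:
    # Keep the crash workaround only on CUDA toolkits older than 12.6,
    # where the CUPTI de-registration bug is believed to exist.
    ver = torch.version.cuda  # e.g. "12.4"; None on non-CUDA builds
    if ver is None:
        return False
    major, minor = (int(p) for p in ver.split(".")[:2])
    return (major, minor) < (12, 6)

env_overrides = (
    {"DISABLE_CUPTI_LAZY_REINIT": "1", "CUPTI_TEARDOWN": "0"}
    if needs_cupti_workaround()
    else {}
)
```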

Differential Revision: [D72842114](https://our.internmc.facebook.com/intern/diff/D72842114/)

**NOTE FOR REVIEWERS**: This PR has internal Meta-specific changes or comments, please review them on [Phabricator](https://our.internmc.facebook.com/intern/diff/D72842114/)!

Differential Revision: [D72842114](https://our.internmc.facebook.com/intern/diff/D72842114)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151124
Approved by: https://github.com/sraikund16
2025-04-15 16:11:49 +00:00
c5de6ff079 Remove ls from filesystem base (#151117)
Summary: A user reported an issue where they inherit from filesystembase but don't have the ls method, which was added in the PR https://github.com/pytorch/pytorch/pull/150701#discussion_r2039840129. This removes the method from the base class but keeps it in the derived class.

Test Plan: buck test 'fbcode//mode/opt' fbcode//caffe2/test/distributed/checkpoint:test_hf_storage

Differential Revision: D72867722

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151117
Approved by: https://github.com/Skylion007, https://github.com/lw
2025-04-15 14:45:20 +00:00
f1f18c75c9 Gracefully handle optree less than minimum version, part 2 (#151257)
If optree is less than the minimum version, we should pretend it doesn't
exist.

The problem right now is:
- Install optree==0.12.1
- `import torch._dynamo`
- This raises an error "min optree version is 0.13.0"

The fix is to pretend optree doesn't exist if it is less than the min
version.
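A minimal sketch of that gating, with illustrative constant and flag names (the real module keeps its own single source of truth and error messages):

```
MIN_OPTREE_VERSION = (0, 13, 0)

try:
    import optree
    _optree_version = tuple(int(p) for p in optree.__version__.split(".")[:3])
    # Treat a too-old optree exactly like a missing one.
    HAS_OPTREE = _optree_version >= MIN_OPTREE_VERSION
except ImportError:
    HAS_OPTREE = False

if not HAS_OPTREE:
    optree = None  # downstream code falls back to the pure-Python pytree
```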

There are ways to clean up this PR more (e.g. have a single source of
truth for the version, some of the variables are redundant), but I am
trying to reduce the risk as much as possible for this to go into 2.7.

Test Plan:

I verified the above problem was fixed. Also tried some other things,
like the following, which now gives the expected behavior.
```py
>>> import torch
>>> import optree
>>> optree.__version__
'0.12.1'
>>> import torch._dynamo
>>> import torch._dynamo.polyfills.pytree
>>> import torch.utils._pytree
>>> import torch.utils._cxx_pytree
ImportError: torch.utils._cxx_pytree depends on optree, which is an optional dependency of PyTorch. To use it, please upgrade your optree package to >= 0.13.0
```

I also audited all non-test callsites of optree and torch.utils._cxx_pytree.
Follow along with me:

optree imports
- torch.utils._cxx_pytree. This is fine.
- [guarded by check] f76b7ef33c/torch/_dynamo/polyfills/pytree.py (L29-L31)

_cxx_pytree imports
- [guarded by check] torch.utils._pytree (changed in this PR)
- [guarded by check] torch/_dynamo/polyfills/pytree.py (changed in this PR)
- [guarded by try-catch] f76b7ef33c/torch/distributed/_functional_collectives.py (L17)
- [guarded by try-catch] f76b7ef33c/torch/distributed/tensor/_op_schema.py (L15)
- [guarded by try-catch] f76b7ef33c/torch/distributed/tensor/_dispatch.py (L35)
- [guarded by try-catch] f76b7ef33c/torch/_dynamo/variables/user_defined.py (L94)
- [guarded by try-catch] f76b7ef33c/torch/distributed/tensor/experimental/_func_map.py (L14)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151257
Approved by: https://github.com/malfet, https://github.com/XuehaiPan
2025-04-15 13:08:26 +00:00
12cb11a268 [Inductor UT] Refactor FlexAttention UT and add CPU tests (#144953)
This PR extends and refines all remaining UTs for CPU and more devices in `test/inductor/test_flex_attention.py` and `test/inductor/test_flex_decoding.py`, as a follow-up to https://github.com/pytorch/pytorch/pull/141453

Pull Request resolved: https://github.com/pytorch/pytorch/pull/144953
Approved by: https://github.com/drisspg
2025-04-15 12:44:49 +00:00
2180e87d7c [fbgemm_gpu] Incorporate Torch DSA (#151148)
Summary:
X-link: https://github.com/facebookresearch/FBGEMM/pull/1035

X-link: https://github.com/pytorch/FBGEMM/pull/3950

- Incorporte the PyTorch DSA infrastructure into the FBGEMM kernel launcher
  utility

Test Plan:
```
# Nvidia
buck2 test 'fbcode//mode/opt' fbcode//deeplearning/fbgemm/fbgemm_gpu/test/utils:tensor_accessor_builder
buck2 test 'fbcode//mode/opt' fbcode//deeplearning/fbgemm/fbgemm_gpu/test/utils:tensor_accessor_builder_with_memcheck
buck2 run 'fbcode//mode/opt'  -c fbcode.enable_gpu_sections=true -c fbcode.nvcc_arch=a100  -c fbcode.platform=platform010 fbcode//deeplearning/fbgemm/fbgemm_gpu/test/utils:kernel_launcher

# AMD
buck2 run mode/opt-amd-gpu -c fbcode.platform=platform010 fbcode//deeplearning/fbgemm/fbgemm_gpu/test/utils:tensor_accessor_builder_with_memcheck
buck2 run mode/opt-amd-gpu -c fbcode.platform=platform010 fbcode//deeplearning/fbgemm/fbgemm_gpu/test/utils:kernel_launcher
buck2 run mode/opt-amd-gpu -c fbcode.platform=platform010 fbcode//deeplearning/fbgemm/fbgemm_gpu/test/tbe:split_embeddings_utils
```

Differential Revision: D72759030

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151148
Approved by: https://github.com/huydhn
2025-04-15 11:34:04 +00:00
70e7b76707 [AOTInductor] Add Python interface for user managed buffer. (#151141)
Summary: Add pybind for user managed buffer in update_constants_buffer.

Test Plan:
Included in commit.
```
python test/inductor/test_aot_inductor.py -k user_managed
```

Differential Revision: D72892310

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151141
Approved by: https://github.com/henrylhtsang, https://github.com/desertfire
2025-04-15 09:36:30 +00:00
bd9c436c99 [Intel GPU][PT2E] Register qconv impls to general qconv_pointwise schema (#151092)
# Motivation
Refer to https://github.com/pytorch/pytorch/pull/150751, where a general schema for `qconv_pointwise` is added and `qconv2d_pointwise` is removed from callers. This PR registers the XPU backend implementations to this operator.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151092
Approved by: https://github.com/EikanWang, https://github.com/guangyey
2025-04-15 08:42:14 +00:00
a756c50315 [Intel GPU] Avoid using fp32 in sdp math path when benchmark performance. (#150996)
sdp on xpu will fall back to the math path in some cases (e.g. training). In the dynamo benchmarks, we prefer to use fp16 for better performance. Although `allow_fp16_bf16_reduction_math_sdp` is under backends.cuda, its implementation applies to all devices.

I didn't add an `if device == xpu` check here; I assume cuda devices will not hit the math path anyway.
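Illustrative usage of the flag discussed above (only the setter named in this description is shown; treat the exact call as an assumption based on that description):

```
import torch
import torch.nn.functional as F

# Lives under backends.cuda, but per the description above it also affects
# the math SDP path taken on other devices (e.g. xpu during training).
torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)

q = k = v = torch.randn(2, 4, 8, 16, dtype=torch.float16)
out = F.scaled_dot_product_attention(q, k, v)
```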

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150996
Approved by: https://github.com/drisspg, https://github.com/EikanWang
2025-04-15 08:08:01 +00:00
ccfce9ae86 Fix score_mod.py dynamic max autotune for backward (#151270)
Same as https://github.com/pytorch/pytorch/pull/148991 but this PR fixes the backward path.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151270
Approved by: https://github.com/drisspg, https://github.com/bobrenjc93
2025-04-15 06:33:37 +00:00
afaadce083 [MPSInductor] Adjust memory format detection (#151288)
The MPS conv implementation will only yield a channels-last output if the input is in channels_last format
Fixes `TestGPUTests.test_conv2d_backward_channels_last` on MacOS-15

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151288
Approved by: https://github.com/jansel, https://github.com/dcci
ghstack dependencies: #151224, #151246, #151272, #151282
2025-04-15 06:25:00 +00:00
b8a2824755 [MPS] Fix logit output for half/bfloat (#151282)
Which also fixes MPSInductor pointwise test
TODO: (as followup PRs): get rid of special native_function.yaml dispatches and use stub
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151282
Approved by: https://github.com/dcci
ghstack dependencies: #151224, #151246, #151272
2025-04-15 06:25:00 +00:00
a2f7764507 [Dynamo] Fix the unimplemented_v2 of EventVariable.call_method in ctx_manager.py (#151208)
Changes:
- The `explanations` field should be `str` instead of `tuple`
- Not only `torch.cuda.Event`, but also `torch.xpu.Event` can trigger this message.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151208
Approved by: https://github.com/Skylion007
2025-04-15 05:26:39 +00:00
9e20a8411b make einsum unbacked friendly (#151032)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/151032
Approved by: https://github.com/pianpwk
2025-04-15 04:35:17 +00:00
5a51de5ab1 [cutlass backend] Add more logs for cutlass backend benchmark (#150639)
The goal is to have a way to compare whether a change makes things better or worse.

```
Average edge over aten (max(-edge, 0), higher is better):
triton: 8.596507086950552 (from 6 valid values)
triton_persistent_tma: 9.517193693923307 (from 6 valid values)
cutlass_lvl_default: 3.3234737908691785 (from 6 valid values)
cutlass_lvl_1111: 7.088173348313991 (from 6 valid values)
cutlass_lvl_2222: 7.291869722320318 (from 6 valid values)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/150639
Approved by: https://github.com/ColinPeppler
2025-04-15 04:19:51 +00:00
48b4bc1640 [c10d][fr] Enable FR analysis script for all fast-path coalesce op (#151243)
This PR enables FR analysis for all coalesced ops on the fast path. (Batch p2p is already enabled in the current script, so we will mainly focus on non-P2P ops.) To explain what the fast path is, let's revisit how coalesced collectives work today:

For non-P2P coalesced ops, there are several ways to call them (for legacy reasons):

- Way one: Directly call a python api like all_reduce_coalesced in python; this will be deprecated soon.
- Way two: Directly call the api inside PGNCCL like allreduce_coalesced. Way one will eventually call into this. This is not deprecated and will not be deprecated, IIUC.
- Way three: Using _coalescing_manager in python, like:
```
with _coalescing_manager():
    for i in range(num_colls):
           dist.all_reduce(tensors[i])
```
This way has two path:
   - Fast path: when users call all-reduce, all-gather-into-tensor or reduce-scatter, we will only launch one big collective by calling the api from way one.
   - Slow path: we call startCoalescing() in the beginning and then a bunch of collectives (each one will generate a FR entry) and then endCoalescing(). Inside startCoalescing(), groupStart() is called and inside endCoalescing(), groupEnd() is then called. So although this is going to be one collective, we call into PGNCCL for each collective coalesced in the slow path case.
   - For uneven all-gather (allgather_v) and reduce-scatter, they follow the pattern mentioned in the slow path. They directly call the cpp api inside PGNCCL.

This PR addresses the fast path because it is the easy case: we store the collective info on the python side, and we only call into PGNCCL once, so there will be only one work and one FR entry. We can just treat them as regular coalesced collectives.

We add some e2e unit tests for the build_db function so that the change to FR is more thoroughly tested.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151243
Approved by: https://github.com/d4l3k, https://github.com/wz337
2025-04-15 04:08:28 +00:00
f66229de2b [dynamo] Remove traceable_tensor_subclasses-related code (#151062)
Since #149792 deprecates `traceable_tensor_subclasses` and it's been
landed for over a week, we can safely remove all the old code that uses
`traceable_tensor_subclasses` (they were primarily for testing purposes
and are equivalent to no-ops now).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151062
Approved by: https://github.com/mlazos, https://github.com/anijain2305
ghstack dependencies: #151060, #151061
2025-04-15 03:55:35 +00:00
6a1499d209 [dynamo] handle tensor subclass with non-classmethod __torch_function__ (#151061)
As title, this patch fixes bugs in
1. emulating `has_torch_function`
2. emulating calling `__torch_function__`
3. building a callable VT for non-classmethod `__torch_function__`
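A minimal example of the third case, i.e. a subclass whose `__torch_function__` is a plain instance method rather than the usual classmethod (illustrative, not a test from this patch):

```
import torch

class Sub(torch.Tensor):
    # Deliberately *not* a @classmethod: this is the shape dynamo mishandled.
    def __torch_function__(self, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        return super().__torch_function__(func, types, args, kwargs)

@torch.compile(backend="eager")
def f(x):
    return torch.add(x, 1)

f(torch.randn(3).as_subclass(Sub))
```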

Fixes #120799, #150265, #150848.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151061
Approved by: https://github.com/anijain2305, https://github.com/mlazos
ghstack dependencies: #151060
2025-04-15 03:55:34 +00:00
73129b8974 [dynamo] Properly handle super().some_classmethod(...) (#151060)
Previously we were passing in the instance as first argument to a
`super().some_classmethod(...)` call, but we should've passed in the
type object instead, per semantics of `@classmethod`.
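A plain-Python illustration of the semantics being fixed, i.e. that `super()` classmethod calls receive the type object (`cls`), not the instance:

```
class Base:
    @classmethod
    def make(cls, n):
        return [cls.__name__] * n

class Child(Base):
    @classmethod
    def make(cls, n):
        # Correct semantics: cls is Child here, and Base.make also sees Child.
        return super().make(n) + ["extra"]

assert Child.make(2) == ["Child", "Child", "extra"]
```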

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151060
Approved by: https://github.com/Skylion007, https://github.com/mlazos, https://github.com/anijain2305
2025-04-15 03:55:34 +00:00
e178a3aa94 clang-format CUDASymmetricMemory.cu (#151260)
Ported from #146592

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151260
Approved by: https://github.com/Skylion007
2025-04-15 02:00:34 +00:00
25803d3a22 Optimize typing in lr_scheduler.py (#151219)
## Changes

- Add typing annotation in `lr_scheduler.py`

## Test Result

```bash
pytest test/optim/test_lrscheduler.py -vv
```

![image](https://github.com/user-attachments/assets/34a91965-ff3a-462a-9ab0-b46ad4b290e9)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151219
Approved by: https://github.com/janeyx99
2025-04-15 01:00:13 +00:00
4ede6705b5 test_store: fix timeout for test_queues (#151252)
Fixes #151216, #151215

Previously I forgot to revert the timeout after setting it for the timeout test.

To prevent this in the future I split the test into 3 different tests so timeout testing is isolated.

Test plan:

Stress tested

```
pytest test/distributed/test_store.py -k queue -v -s --minutes 10
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151252
Approved by: https://github.com/XilunWu
2025-04-15 00:44:19 +00:00
263f08e119 [PP] Add schedule visualizer (#150347)
Added a new private file (`_schedule_visualizer.py`) with some helper methods that can be used to visualize the operations of a schedule and plot with matplotlib.

InterleavedZeroBubble(pp_group=4, microbatches=8):
![image](https://github.com/user-attachments/assets/610ba9a8-7d18-4a99-bcad-6f43e5b23c8c)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150347
Approved by: https://github.com/kwen2501
2025-04-15 00:38:18 +00:00
070357b61a [MPSInductor] Fix silent correctness in bitcast (#151272)
By using Metal `as_type` which according to documentation does exactly
that:
> Metal adds an as_type<type-id> operator to allow any scalar or vector data type (that is not
a pointer) to be reinterpreted as another scalar or vector data type of the same size. The bits in
the operand are returned directly without modification as the new type. The usual type
promotion for function arguments is not performed.

Using `reinterpret_cast` created a potential silent correctness error when dtypes of different sizes were bitcast to each other.
Add an explicit cast to src_type to avoid errors due to type promotion (i.e. something like `(x+1).view(dtype=torch.float16)` would work correctly in eager mode for an int16 dtype, but would fail in compile, as arithmetic operations will promote int16 to int32).
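For reference, the eager-mode behavior the compiled code should now match (a small illustration, not a test from this PR):

```
import torch

x = torch.arange(4, dtype=torch.int16)
# Arithmetic stays in int16, and view(dtype=...) reinterprets the 16-bit
# payload as float16 without any type promotion.
eager = (x + 1).view(dtype=torch.float16)

fn = torch.compile(lambda t: (t + 1).view(dtype=torch.float16))
# Before this fix, the MPS backend could promote the int16 addition to int32
# and then bitcast a wrong-sized value; now it should agree with eager.
```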

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151272
Approved by: https://github.com/dcci
ghstack dependencies: #151224, #151246
2025-04-14 23:39:42 +00:00
508b882513 [dynamo][invoke_subgraph] Use FxGraphModule comparison instead of hashing (#150911)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/150911
Approved by: https://github.com/zou3519
2025-04-14 23:34:26 +00:00
a24a9c42fb [ROCm] Improve behavior of get_torch_rocm_version helper function on non-ROCm systems. (#151040)
Fixes #150041

Return a zero tuple when ROCm is _not_ supported, similar to what is done for the CUDA version of this function.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151040
Approved by: https://github.com/jeffdaily
2025-04-14 22:50:07 +00:00
c9aef50898 Add inductor standalone_compile API (#150670)
This PR adds a standalone_compile API that does precompilation via caching, to support the vLLM use case in the short term while we work on the longer-term precompilation solution.

```
standalone_compile(gm, example_inputs, options) -> CompiledArtifact
CompiledArtifact.save(path, format: binary|unpacked = binary)
CompiledArtifact.load(path, format: binary|unpacked = binary)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150670
Approved by: https://github.com/jamesjwu, https://github.com/zou3519
2025-04-14 22:00:09 +00:00
4a47dd9b3f Revert "[map] always turn on dynamo for map (#150962)"
This reverts commit a72d56cb6be8c6ded5678b0b98003c90fd1b5a71.

Reverted https://github.com/pytorch/pytorch/pull/150962 on behalf of https://github.com/Camyll due to breaking internal builds {SHORT_REASON} ([comment](https://github.com/pytorch/pytorch/pull/150962#issuecomment-2803006282))
2025-04-14 21:09:22 +00:00
6a77a0a50c Revert "[map] make proxy mode re-dispatch to fake key (#151034)"
This reverts commit ca2e8cd3528635526a3fe09444139ffa748e97be.

Reverted https://github.com/pytorch/pytorch/pull/151034 on behalf of https://github.com/Camyll due to breaking internal builds {SHORT_REASON} ([comment](https://github.com/pytorch/pytorch/pull/150962#issuecomment-2803006282))
2025-04-14 21:09:21 +00:00
452 changed files with 17593 additions and 8443 deletions

View File

@ -1,82 +1,60 @@
#!/usr/bin/env bash
# Script used only in CD pipeline
set -eou pipefail
set -exou pipefail
image="$1"
shift
if [ -z "${image}" ]; then
echo "Usage: $0 IMAGE"
echo "Usage: $0 IMAGENAME:ARCHTAG"
exit 1
fi
DOCKER_IMAGE_NAME="pytorch/${image}"
# Go from imagename:tag to tag
DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
CUDA_VERSION=""
if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
# extract cuda version from image name and tag. e.g. manylinux2_28-builder:cuda12.8 returns 12.8
CUDA_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
fi
export DOCKER_BUILDKIT=1
TOPDIR=$(git rev-parse --show-toplevel)
CUDA_VERSION=${CUDA_VERSION:-12.1}
case ${CUDA_VERSION} in
case ${DOCKER_TAG_PREFIX} in
cpu)
BASE_TARGET=base
DOCKER_TAG=cpu
;;
all)
BASE_TARGET=all_cuda
DOCKER_TAG=latest
cuda*)
BASE_TARGET=cuda${CUDA_VERSION}
;;
*)
BASE_TARGET=cuda${CUDA_VERSION}
DOCKER_TAG=cuda${CUDA_VERSION}
echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"
exit 1
;;
esac
# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
sudo systemctl daemon-reload
sudo systemctl restart docker
(
set -x
# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
sudo systemctl daemon-reload
sudo systemctl restart docker
export DOCKER_BUILDKIT=1
TOPDIR=$(git rev-parse --show-toplevel)
tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
docker build \
--target final \
--progress plain \
--build-arg "BASE_TARGET=${BASE_TARGET}" \
--build-arg "CUDA_VERSION=${CUDA_VERSION}" \
--build-arg "DEVTOOLSET_VERSION=11" \
-t ${DOCKER_IMAGE_NAME} \
$@ \
-f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
${TOPDIR}/.ci/docker/
)
docker build \
--target final \
--progress plain \
--build-arg "BASE_TARGET=${BASE_TARGET}" \
--build-arg "CUDA_VERSION=${CUDA_VERSION}" \
--build-arg "DEVTOOLSET_VERSION=11" \
-t ${tmp_tag} \
$@ \
-f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
${TOPDIR}/.ci/docker/
if [[ "${DOCKER_TAG}" =~ ^cuda* ]]; then
if [ -n "${CUDA_VERSION}" ]; then
# Test that we're using the right CUDA compiler
(
set -x
docker run --rm "${DOCKER_IMAGE_NAME}" nvcc --version | grep "cuda_${CUDA_VERSION}"
)
fi
GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
GIT_BRANCH_NAME=${GITHUB_REF##*/}
GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE_NAME}-${GIT_BRANCH_NAME}
DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE_NAME}-${GIT_COMMIT_SHA}
if [[ "${WITH_PUSH:-}" == true ]]; then
(
set -x
docker push "${DOCKER_IMAGE_NAME}"
if [[ -n ${GITHUB_REF} ]]; then
docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_BRANCH_TAG}
docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_SHA_TAG}
docker push "${DOCKER_IMAGE_BRANCH_TAG}"
docker push "${DOCKER_IMAGE_SHA_TAG}"
fi
)
docker run --rm "${tmp_tag}" nvcc --version | grep "cuda_${CUDA_VERSION}"
fi

View File

@ -1 +1 @@
4022ff142a5392aa5197e05f4dfe85d356f742bf
381ae5d57d35c165d98df728380b20fbde350392

View File

@ -19,6 +19,13 @@ install_ubuntu() {
apt-get install -y libc++1
apt-get install -y libc++abi1
# Make sure rocm packages from repo.radeon.com have highest priority
cat << EOF > /etc/apt/preferences.d/rocm-pin-600
Package: *
Pin: release o=repo.radeon.com
Pin-Priority: 600
EOF
# Add amdgpu repository
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list

View File

@ -25,9 +25,7 @@ python3 -m pip install meson ninja
###########################
### clone repo
###########################
# TEMPORARY FIX: https://gitlab.freedesktop.org/mesa/drm.git is down until 2025/03/22
# GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
GIT_SSL_NO_VERIFY=true git clone git://anongit.freedesktop.org/mesa/drm
GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
pushd drm
###########################

View File

@ -17,10 +17,14 @@ function do_install() {
tmp_dir=$(mktemp -d)
pushd ${tmp_dir}
curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
tar -xvf "${magma_archive}"
mkdir -p "${rocm_dir}/magma"
mv include "${rocm_dir}/magma/include"
mv lib "${rocm_dir}/magma/lib"
if tar -xvf "${magma_archive}"
then
mkdir -p "${rocm_dir}/magma"
mv include "${rocm_dir}/magma/include"
mv lib "${rocm_dir}/magma/lib"
else
echo "${magma_archive} not found, skipping magma install"
fi
popd
)
}

View File

@ -89,7 +89,7 @@ ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
# gfortran and python needed for building magma from source for ROCm
RUN apt-get update -y && \
apt-get install gfortran -y && \
apt-get install python -y && \
apt-get install python3 python-is-python3 -y && \
apt-get clean
RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh

View File

@ -1,83 +1,63 @@
#!/usr/bin/env bash
# Script used only in CD pipeline
set -eou pipefail
set -eoux pipefail
image="$1"
shift
if [ -z "${image}" ]; then
echo "Usage: $0 IMAGE"
echo "Usage: $0 IMAGENAME:ARCHTAG"
exit 1
fi
DOCKER_IMAGE="pytorch/${image}"
TOPDIR=$(git rev-parse --show-toplevel)
GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
WITH_PUSH=${WITH_PUSH:-}
DOCKER=${DOCKER:-docker}
case ${GPU_ARCH_TYPE} in
# Go from imagename:tag to tag
DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
GPU_ARCH_VERSION=""
if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
# extract cuda version from image name. e.g. manylinux2_28-builder:cuda12.8 returns 12.8
GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
# extract rocm version from image name. e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
fi
case ${DOCKER_TAG_PREFIX} in
cpu)
BASE_TARGET=cpu
DOCKER_TAG=cpu
GPU_IMAGE=ubuntu:20.04
DOCKER_GPU_BUILD_ARG=""
;;
cuda)
cuda*)
BASE_TARGET=cuda${GPU_ARCH_VERSION}
DOCKER_TAG=cuda${GPU_ARCH_VERSION}
GPU_IMAGE=ubuntu:20.04
DOCKER_GPU_BUILD_ARG=""
;;
rocm)
rocm*)
BASE_TARGET=rocm
DOCKER_TAG=rocm${GPU_ARCH_VERSION}
GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
;;
*)
echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
echo "ERROR: Unrecognized DOCKER_TAG_PREFIX: ${DOCKER_TAG_PREFIX}"
exit 1
;;
esac
tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
(
set -x
DOCKER_BUILDKIT=1 ${DOCKER} build \
--target final \
${DOCKER_GPU_BUILD_ARG} \
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \
--build-arg "BASE_TARGET=${BASE_TARGET}" \
-t "${DOCKER_IMAGE}" \
$@ \
-f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
"${TOPDIR}/.ci/docker/"
)
GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
GIT_BRANCH_NAME=${GITHUB_REF##*/}
GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
if [[ "${WITH_PUSH}" == true ]]; then
(
set -x
${DOCKER} push "${DOCKER_IMAGE}"
if [[ -n ${GITHUB_REF} ]]; then
${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
${DOCKER} push "${DOCKER_IMAGE_BRANCH_TAG}"
${DOCKER} push "${DOCKER_IMAGE_SHA_TAG}"
fi
)
fi
DOCKER_BUILDKIT=1 ${DOCKER} build \
--target final \
${DOCKER_GPU_BUILD_ARG} \
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \
--build-arg "BASE_TARGET=${BASE_TARGET}" \
-t "${tmp_tag}" \
$@ \
-f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
"${TOPDIR}/.ci/docker/"

View File

@ -1,7 +1,7 @@
#!/usr/bin/env bash
# Script used only in CD pipeline
set -eou pipefail
set -exou pipefail
TOPDIR=$(git rev-parse --show-toplevel)
@ -9,152 +9,110 @@ image="$1"
shift
if [ -z "${image}" ]; then
echo "Usage: $0 IMAGE"
echo "Usage: $0 IMAGE:ARCHTAG"
exit 1
fi
DOCKER_IMAGE="pytorch/${image}"
# Go from imagename:tag to tag
DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
DOCKER_REGISTRY="${DOCKER_REGISTRY:-docker.io}"
GPU_ARCH_VERSION=""
if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
# extract cuda version from image name. e.g. manylinux2_28-builder:cuda12.8 returns 12.8
GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
# extract rocm version from image name. e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
fi
GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}
DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}
WITH_PUSH=${WITH_PUSH:-}
case ${GPU_ARCH_TYPE} in
cpu)
case ${image} in
manylinux2_28-builder:cpu)
TARGET=cpu_final
DOCKER_TAG=cpu
GPU_IMAGE=centos:7
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
;;
cpu-manylinux_2_28)
TARGET=cpu_final
DOCKER_TAG=cpu
GPU_IMAGE=amd64/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
MANY_LINUX_VERSION="2_28"
;;
cpu-aarch64)
manylinuxaarch64-builder:cpu-aarch64)
TARGET=final
DOCKER_TAG=cpu-aarch64
GPU_IMAGE=arm64v8/centos:7
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10"
MANY_LINUX_VERSION="aarch64"
;;
cpu-aarch64-2_28)
manylinux2_28_aarch64-builder:cpu-aarch64)
TARGET=final
DOCKER_TAG=cpu-aarch64
GPU_IMAGE=arm64v8/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11 --build-arg NINJA_VERSION=1.12.1"
MANY_LINUX_VERSION="2_28_aarch64"
;;
cpu-cxx11-abi)
manylinuxcxx11-abi-builder:cpu-cxx11-abi)
TARGET=final
DOCKER_TAG=cpu-cxx11-abi
GPU_IMAGE=""
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
MANY_LINUX_VERSION="cxx11-abi"
;;
cpu-s390x)
manylinuxs390x-builder:cpu-s390x)
TARGET=final
DOCKER_TAG=cpu-s390x
GPU_IMAGE=s390x/almalinux:8
DOCKER_GPU_BUILD_ARG=""
MANY_LINUX_VERSION="s390x"
;;
cuda)
manylinux2_28-builder:cuda*)
TARGET=cuda_final
DOCKER_TAG=cuda${GPU_ARCH_VERSION}
# Keep this up to date with the minimum version of CUDA we currently support
GPU_IMAGE=centos:7
DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=9"
;;
cuda-manylinux_2_28)
TARGET=cuda_final
DOCKER_TAG=cuda${GPU_ARCH_VERSION}
GPU_IMAGE=amd64/almalinux:8
DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
MANY_LINUX_VERSION="2_28"
;;
cuda-aarch64)
manylinuxaarch64-builder:cuda*)
TARGET=cuda_final
DOCKER_TAG=cuda${GPU_ARCH_VERSION}
GPU_IMAGE=arm64v8/centos:7
DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
MANY_LINUX_VERSION="aarch64"
DOCKERFILE_SUFFIX="_cuda_aarch64"
;;
rocm|rocm-manylinux_2_28)
manylinux2_28-builder:rocm*)
TARGET=rocm_final
DOCKER_TAG=rocm${GPU_ARCH_VERSION}
GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete
DEVTOOLSET_VERSION="9"
if [ ${GPU_ARCH_TYPE} == "rocm-manylinux_2_28" ]; then
MANY_LINUX_VERSION="2_28"
DEVTOOLSET_VERSION="11"
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
fi
MANY_LINUX_VERSION="2_28"
DEVTOOLSET_VERSION="11"
GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
;;
xpu)
manylinux2_28-builder:xpu)
TARGET=xpu_final
DOCKER_TAG=xpu
GPU_IMAGE=amd64/almalinux:8
DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
MANY_LINUX_VERSION="2_28"
;;
*)
echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
echo "ERROR: Unrecognized image name: ${image}"
exit 1
;;
esac
IMAGES=''
if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then
DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION}
fi
(
set -x
# Only activate this if in CI
if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
sudo systemctl daemon-reload
sudo systemctl restart docker
fi
DOCKER_BUILDKIT=1 docker build \
${DOCKER_GPU_BUILD_ARG} \
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \
--target "${TARGET}" \
-t "${DOCKER_IMAGE}" \
$@ \
-f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
"${TOPDIR}/.ci/docker/"
)
GITHUB_REF=${GITHUB_REF:-"dev")}
GIT_BRANCH_NAME=${GITHUB_REF##*/}
GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
if [[ "${WITH_PUSH}" == true ]]; then
(
set -x
docker push "${DOCKER_IMAGE}"
if [[ -n ${GITHUB_REF} ]]; then
docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
docker push "${DOCKER_IMAGE_BRANCH_TAG}"
docker push "${DOCKER_IMAGE_SHA_TAG}"
fi
)
# Only activate this if in CI
if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
sudo systemctl daemon-reload
sudo systemctl restart docker
fi
tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
DOCKER_BUILDKIT=1 docker build \
${DOCKER_GPU_BUILD_ARG} \
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \
--target "${TARGET}" \
-t "${tmp_tag}" \
$@ \
-f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
"${TOPDIR}/.ci/docker/"

View File

@ -1,15 +1,20 @@
sphinx==5.3.0
#Description: This is used to generate PyTorch docs
#Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@a98ffecb792d50df495be401becbf5c414421423#egg=pytorch_sphinx_theme2
# TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
# but it doesn't seem to work and hangs around idly. The initial thought is probably
# something related to Docker setup. We can investigate this later
sphinxcontrib.katex==0.8.6
#Description: This is used to generate PyTorch docs
#Pinned versions: 0.8.6
sphinxext-opengraph==0.9.1
#Description: This is used to generate PyTorch docs
#Pinned versions: 0.9.1
matplotlib==3.5.3
#Description: This is used to generate PyTorch docs
#Pinned versions: 3.5.3
@ -46,5 +51,6 @@ myst-nb==0.17.2
# The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
python-etcd==0.4.5
sphinx-copybutton==0.5.0
sphinx-panels==0.4.1
sphinx-design==0.4.0
sphinxcontrib-mermaid==1.0.0
myst-parser==0.18.1

View File

@ -1,7 +1,7 @@
SHELL=/usr/bin/env bash
DOCKER_CMD ?= docker
DESIRED_ROCM ?= 6.3
DESIRED_ROCM ?= 6.4
DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
PACKAGE_NAME = magma-rocm
# inherit this from underlying docker image, do not pass this env var to docker
@ -16,6 +16,7 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
magma-rocm/build_magma.sh
.PHONY: all
all: magma-rocm64
all: magma-rocm63
all: magma-rocm624
@ -24,6 +25,11 @@ clean:
$(RM) -r magma-*
$(RM) -r output
.PHONY: magma-rocm64
magma-rocm64: DESIRED_ROCM := 6.4
magma-rocm64:
$(DOCKER_RUN)
.PHONY: magma-rocm63
magma-rocm63: DESIRED_ROCM := 6.3
magma-rocm63:

View File

@ -34,11 +34,14 @@ if which sccache > /dev/null; then
fi
print_cmake_info
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
# Needed for inductor benchmarks, as lots of HF networks make `torch.distribtued` calls
USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
else
# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
fi
if which sccache > /dev/null; then
print_sccache_stats
fi

View File

@ -221,25 +221,39 @@ test_torchbench_smoketest() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
local backend=eager
local dtype=notset
local device=mps
local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam pytorch_unet stable_diffusion_text_encoder moco speech_transformer)
touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
for backend in eager inductor; do
echo "Setup complete, launching torchbench training performance run"
for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
--performance --only "$model" --backend "$backend" --training --devices "$device" \
--output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
done
for dtype in notset float16 bfloat16; do
echo "Launching torchbench inference performance run for backend ${backend} and dtype ${dtype}"
local dtype_arg="--${dtype}"
if [ "$dtype" == notset ]; then
dtype_arg="--float32"
fi
touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
for model in "${models[@]}"; do
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
--performance --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" || true
done
done
for dtype in notset amp; do
echo "Launching torchbench training performance run for backend ${backend} and dtype ${dtype}"
touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
local dtype_arg="--${dtype}"
if [ "$dtype" == notset ]; then
dtype_arg="--float32"
fi
for model in "${models[@]}"; do
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
--performance --only "$model" --backend "$backend" --training --devices "$device" "$dtype_arg" \
--output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv" || true
done
done
echo "Launching torchbench inference performance run"
for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
--performance --only "$model" --backend "$backend" --inference --devices "$device" \
--output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
done
echo "Pytorch benchmark on mps device completed"


@ -119,12 +119,6 @@ popd
git rm -rf "$install_path" || true
mv "$pt_checkout/docs/build/html" "$install_path"
# Prevent Google from indexing $install_path/_modules. This folder contains
# generated source files.
# NB: the following only works on gnu sed. The sed shipped with mac os is different.
# One can `brew install gnu-sed` on a mac and then use "gsed" instead of "sed".
find "$install_path/_modules" -name "*.html" -print0 | xargs -0 sed -i '/<head>/a \ \ <meta name="robots" content="noindex">'
git add "$install_path" || true
git status
git config user.email "soumith+bot@pytorch.org"
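
As the removed note above says, the in-place sed used there is GNU-specific; the BSD sed shipped with macOS needs a different -i form. A minimal sketch of a portable variant, assuming a hypothetical docs/_modules directory and that GNU sed may be installed as gsed (e.g. via `brew install gnu-sed`):

SED=sed
if command -v gsed >/dev/null 2>&1; then
  SED=gsed   # prefer GNU sed when available (macOS)
fi
find "docs/_modules" -name "*.html" -print0 \
  | xargs -0 "$SED" -i '/<head>/a \ \ <meta name="robots" content="noindex">'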


@ -1,22 +0,0 @@
#!/bin/bash
set -eux -o pipefail
source "${BINARY_ENV_FILE:-/c/w/env}"
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
export USE_SCCACHE=1
export SCCACHE_IGNORE_SERVER_IO_ERROR=1
echo "Free space on filesystem before build:"
df -h
export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT"
if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
pytorch/.ci/pytorch/windows/arm64/build_libtorch.bat
elif [[ "$PACKAGE_TYPE" == 'wheel' ]]; then
pytorch/.ci/pytorch/windows/arm64/build_pytorch.bat
fi
echo "Free space on filesystem after build:"
df -h


@ -1,6 +0,0 @@
#!/bin/bash
set -eux -o pipefail
source "${BINARY_ENV_FILE:-/c/w/env}"
pytorch/.ci/pytorch/windows/arm64/smoke_test.bat


@ -4,11 +4,13 @@ set -eux -o pipefail
source "${BINARY_ENV_FILE:-/c/w/env}"
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
export CUDA_VERSION="${DESIRED_CUDA/cu/}"
export USE_SCCACHE=1
export SCCACHE_BUCKET=ossci-compiler-cache
export SCCACHE_IGNORE_SERVER_IO_ERROR=1
export VC_YEAR=2022
if [[ "$OS" != "windows-arm64" ]]; then
export CUDA_VERSION="${DESIRED_CUDA/cu/}"
export USE_SCCACHE=1
export SCCACHE_BUCKET=ossci-compiler-cache
export SCCACHE_IGNORE_SERVER_IO_ERROR=1
export VC_YEAR=2022
fi
if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
export USE_SCCACHE=0
@ -21,7 +23,16 @@ df -h
pushd "$PYTORCH_ROOT/.ci/pytorch/"
export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT"
./windows/internal/build_wheels.bat
if [[ "$OS" == "windows-arm64" ]]; then
if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
./windows/arm64/build_libtorch.bat
elif [[ "$PACKAGE_TYPE" == 'wheel' ]]; then
./windows/arm64/build_pytorch.bat
fi
else
./windows/internal/build_wheels.bat
fi
echo "Free space on filesystem after build:"
df -h
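
The CUDA_VERSION export above relies on bash parameter substitution: "${DESIRED_CUDA/cu/}" deletes the first "cu" from the value, turning a tag like cu126 into a bare version number. A minimal sketch with a hypothetical value:

DESIRED_CUDA=cu126                     # hypothetical value
CUDA_VERSION="${DESIRED_CUDA/cu/}"
echo "$CUDA_VERSION"                   # prints: 126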


@ -11,6 +11,11 @@ if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
fi
pushd "$PYTORCH_ROOT/.ci/pytorch/"
./windows/internal/smoke_test.bat
if [[ "$OS" == "windows-arm64" ]]; then
./windows/arm64/smoke_test.bat
else
./windows/internal/smoke_test.bat
fi
popd


@ -0,0 +1,70 @@
name: Binary docker build
description: Build docker image for binary builds
inputs:
docker-image-name:
description: Docker image name for PR builds
required: true
docker-build-dir:
description: Location of the build.sh relative to .ci/docker
required: true
custom-tag-prefix:
description: Custom tag prefix for the docker image
required: false
DOCKER_TOKEN:
description: Docker token for authentication
required: true
DOCKER_ID:
description: Docker ID for authentication
required: true
runs:
using: composite
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: ${{ inputs.docker-image-name }}
docker-build-dir: .ci/docker
custom-tag-prefix: ${{ inputs.custom-tag-prefix }}
docker-build-script: ${{ inputs.docker-build-dir }}/build.sh
always-rebuild: true
push: true
- name: Tag and (if WITH_PUSH) push docker image to docker.io
env:
DOCKER_TOKEN: ${{ inputs.DOCKER_TOKEN }}
DOCKER_ID: ${{ inputs.DOCKER_ID }}
DOCKER_IMAGE_NAME: ${{ inputs.docker-image-name }}
DOCKER_IMAGE_PREFIX: ${{ inputs.custom-tag-prefix }}
CREATED_FULL_DOCKER_IMAGE_NAME: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
run: |
set -euox pipefail
GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
GIT_BRANCH_NAME=${GITHUB_REF##*/}
GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
CI_FOLDER_SHA=$(git rev-parse HEAD:.ci/docker)
DOCKER_IMAGE_NAME_PREFIX=docker.io/pytorch/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_PREFIX}
docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}
docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}
docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}
docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}
# Pretty sure Github will mask tokens and I'm not sure if it will even be
# printed due to pipe, but just in case
set +x
if [[ ${WITH_PUSH:-false} == "true" ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
docker push ${DOCKER_IMAGE_NAME_PREFIX}
docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}
docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}
docker push ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}
fi
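
For illustration, the tagging step above produces one plain docker.io tag plus branch-, commit-SHA- and .ci/docker-folder-SHA-suffixed variants. A minimal sketch with hypothetical values that only echoes the resulting names (no registry access):

DOCKER_IMAGE_NAME=example-builder      # hypothetical inputs
DOCKER_IMAGE_PREFIX=cuda12.8
GIT_BRANCH_NAME=main
GIT_COMMIT_SHA=0123abcd
CI_FOLDER_SHA=4567ef01
DOCKER_IMAGE_NAME_PREFIX=docker.io/pytorch/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_PREFIX}
for suffix in "" "-${GIT_BRANCH_NAME}" "-${GIT_COMMIT_SHA}" "-${CI_FOLDER_SHA}"; do
  echo "would tag: ${DOCKER_IMAGE_NAME_PREFIX}${suffix}"
done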

.github/labeler.yml

@ -112,3 +112,22 @@
- torch/csrc/inductor/aoti_include/xpu.h
- torch/csrc/inductor/cpp_wrapper/device_internal/xpu.h
- torch/csrc/inductor/cpp_wrapper/xpu.h
"release notes: inductor (aoti)":
- torch/_C/_aoti.pyi
- torch/_dynamo/repro/aoti.py
- torch/_export/serde/aoti_schema.py
- torch/_higher_order_ops/aoti_call_delegate.py
- torch/_inductor/codegen/aoti_runtime/**
- torch/_inductor/codegen/aoti_hipify_utils.py
- torch/_inductor/codegen/cpp_wrapper_cpu.py
- torch/_inductor/codegen/cpp_wrapper_gpu.py
- torch/_inductor/aoti_eager.py
- torch/csrc/inductor/aoti_runtime/**
- torch/csrc/inductor/aoti_torch/**
- torch/csrc/inductor/aoti_runner/**
- torch/csrc/inductor/aoti_eager/**
- torch/csrc/inductor/aoti_package/**
- torch/csrc/inductor/aoti_include/**
- torchgen/aoti/**
- torchgen/gen_aoti_c_shim.py


@ -30,7 +30,7 @@ CUDA_ARCHES_CUDNN_VERSION = {
}
# NOTE: Also update the ROCm sources in tools/nightly.py when changing this list
ROCM_ARCHES = ["6.2.4", "6.3"]
ROCM_ARCHES = ["6.3", "6.4"]
XPU_ARCHES = ["xpu"]
@ -173,7 +173,7 @@ WHEEL_CONTAINER_IMAGES = {
"xpu": f"pytorch/manylinux2_28-builder:xpu-{DEFAULT_TAG}",
"cpu": f"pytorch/manylinux2_28-builder:cpu-{DEFAULT_TAG}",
"cpu-aarch64": f"pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
"cpu-s390x": f"pytorch/manylinuxs390x-builder:cpu-s390x-{DEFAULT_TAG}",
"cpu-s390x": "pytorch/manylinuxs390x-builder:cpu-s390x",
}
RELEASE = "release"


@ -227,42 +227,6 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
isolated_workflow=True,
),
),
]
WINDOWS_BINARY_SMOKE_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS,
package_type="libtorch",
build_variant=generate_binary_build_matrix.RELEASE,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS,
generate_binary_build_matrix.RELEASE,
arches=["cpu"],
libtorch_variants=["shared-with-deps"],
),
branches="main",
ciflow_config=CIFlowConfig(
isolated_workflow=True,
),
),
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS,
package_type="libtorch",
build_variant=generate_binary_build_matrix.DEBUG,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS,
generate_binary_build_matrix.DEBUG,
arches=["cpu"],
libtorch_variants=["shared-with-deps"],
),
branches="main",
ciflow_config=CIFlowConfig(
isolated_workflow=True,
),
),
]
WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS_ARM64,
package_type="wheel",
@ -308,6 +272,39 @@ WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS = [
),
]
WINDOWS_BINARY_SMOKE_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS,
package_type="libtorch",
build_variant=generate_binary_build_matrix.RELEASE,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS,
generate_binary_build_matrix.RELEASE,
arches=["cpu"],
libtorch_variants=["shared-with-deps"],
),
branches="main",
ciflow_config=CIFlowConfig(
isolated_workflow=True,
),
),
BinaryBuildWorkflow(
os=OperatingSystem.WINDOWS,
package_type="libtorch",
build_variant=generate_binary_build_matrix.DEBUG,
build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
OperatingSystem.WINDOWS,
generate_binary_build_matrix.DEBUG,
arches=["cpu"],
libtorch_variants=["shared-with-deps"],
),
branches="main",
ciflow_config=CIFlowConfig(
isolated_workflow=True,
),
),
]
MACOS_BINARY_BUILD_WORKFLOWS = [
BinaryBuildWorkflow(
os=OperatingSystem.MACOS_ARM64,
@ -402,10 +399,6 @@ def main() -> None:
jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
WINDOWS_BINARY_SMOKE_WORKFLOWS,
),
(
jinja_env.get_template("windows_arm64_binary_build_workflow.yml.j2"),
WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS,
),
(
jinja_env.get_template("macos_binary_build_workflow.yml.j2"),
MACOS_BINARY_BUILD_WORKFLOWS,


@ -434,7 +434,7 @@ query ($owner: String!, $name: String!) {
RE_GHSTACK_HEAD_REF = re.compile(r"^(gh/[^/]+/[0-9]+/)head$")
RE_GHSTACK_DESC = re.compile(r"Stack.*:\r?\n(\* [^\r\n]+\r?\n)+", re.MULTILINE)
RE_PULL_REQUEST_RESOLVED = re.compile(
r"Pull Request resolved: "
r"(Pull Request resolved|Pull-Request-resolved): "
r"https://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)/pull/(?P<number>[0-9]+)",
re.MULTILINE,
)
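
The widened pattern above accepts both the spaced and the hyphenated spelling of the resolution line. A quick sketch of an equivalent check with grep -E on hypothetical commit-message lines (the Python named groups are dropped here):

pattern='(Pull Request resolved|Pull-Request-resolved): https://github\.com/[^/]+/[^/]+/pull/[0-9]+'
echo 'Pull Request resolved: https://github.com/pytorch/pytorch/pull/12345' | grep -E "$pattern"
echo 'Pull-Request-resolved: https://github.com/pytorch/pytorch/pull/12345' | grep -E "$pattern"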


@ -1,197 +0,0 @@
{% import 'common.yml.j2' as common %}
{% import 'upload.yml.j2' as upload %}
{%- block name -%}
# Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: !{{ build_environment }}
{%- endblock %}
{%- macro set_runner_specific_vars() -%}
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: cmd
run: |
echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
{%- endmacro %}
on:
push:
branches:
- !{{ branches }}
{%- if branches == "nightly" %}
tags:
# NOTE: Binary build pipelines should only get triggered on release candidate builds
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
{%- endif %}
{%- for label in ciflow_config.labels | sort %}
{%- if loop.first and branches != "nightly" %}
tags:
{%- endif %}
- '!{{ label }}/*'
{%- endfor %}
workflow_dispatch:
env:
BUILD_ENVIRONMENT: !{{ build_environment }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 1
PYTORCH_ROOT: /pytorch
DOWNLOADS_DIR: c:\temp\downloads
DEPENDENCIES_DIR: c:\temp\dependencies
ENABLE_APL: 1
ENABLE_OPENBLAS: 0
MSVC_VERSION : 14.42
AWS_DEFAULT_REGION: us-east-1
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
{%- for config in build_configs %}
!{{ config["build_name"] }}-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64"
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config, True) }}
{%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
{%- endif %}
steps:
!{{ set_runner_specific_vars() }}
- name: Bootstrap folders
shell: cmd
run: |
mkdir "%NIGHTLIES_PYTORCH_ROOT%"
mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
- name: Bootstrap Build Tools
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
- name: Bootstrap Git
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
- name: Remove Pytorch folder
shell: cmd
run: |
rmdir /s /q "pytorch"
- name: Git checkout PyTorch - recursive
uses: actions/checkout@v4
with:
path: "pytorch"
submodules: recursive
- name: Bootstrap Python
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
- name: Bootstrap APL
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
- name: Bootstrap Rust
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
- name: Bootstrap sccache
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"
- name: Bootstrap Libuv
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"
- name: Populate binary env
shell: bash
run: |
"pytorch/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
- uses: !{{ common.upload_artifact_action }}
if: always()
with:
name: !{{ config["build_name"] }}
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
!{{ config["build_name"] }}-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- !{{ config["build_name"] }}-build
- get-label-type
runs-on: "windows-11-arm64"
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config, True) }}
steps:
!{{ set_runner_specific_vars() }}
- uses: !{{ common.download_artifact_action }}
name: Download Build Artifacts
with:
name: !{{ config["build_name"] }}
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
- name: Bootstrap Git
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
- name: Remove Pytorch folder
shell: cmd
run: |
rmdir /s /q "pytorch"
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
submodules: recursive
- name: Bootstrap APL
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
- name: Bootstrap Python
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
- name: Bootstrap Build Tools
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
- name: Bootstrap Rust
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
- name: Populate binary env
shell: bash
run: |
"pytorch/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
{%- if branches == "nightly" %}
!{{ upload.upload_binaries(config, True) }}
{%- endif %}
{%- endfor %}


@ -49,6 +49,15 @@ env:
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 1
OS: !{{ os }}
{%- if os == "windows-arm64" %}
PYTORCH_ROOT: /pytorch
DOWNLOADS_DIR: c:\temp\downloads
DEPENDENCIES_DIR: c:\temp\dependencies
ENABLE_APL: 1
ENABLE_OPENBLAS: 0
MSVC_VERSION : 14.42
{%- endif %}
!{{ common.concurrency(build_environment) }}
jobs:
@ -66,20 +75,79 @@ jobs:
!{{ config["build_name"] }}-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
{%- if os == "windows-arm64" %}
runs-on: "windows-11-arm64"
{%- else %}
{%- if branches == "nightly" %}
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
{%- else %}
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
{%- endif %}
{%- endif %}
timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
!{{ upload.binary_env(config, True) }}
{%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
{%- endif %}
steps:
!{{ common.setup_ec2_windows() }}
{%- if os == "windows-arm64" %}
- name: Populate binary env
shell: cmd
run: |
echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
- name: Bootstrap folders
shell: cmd
run: |
mkdir "%NIGHTLIES_PYTORCH_ROOT%"
mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
- name: Bootstrap Build Tools
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
- name: Bootstrap Git
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
- name: Remove Pytorch folder
shell: cmd
run: |
rmdir /s /q "pytorch"
- name: Git checkout PyTorch - recursive
uses: actions/checkout@v4
with:
path: "pytorch"
submodules: recursive
- name: Bootstrap Python
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
- name: Bootstrap APL
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
- name: Bootstrap Rust
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
- name: Bootstrap sccache
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"
- name: Bootstrap Libuv
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"
{%- else %}
!{{ set_runner_specific_vars() }}
!{{ common.setup_ec2_windows() }}
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
{%- endif %}
- name: Populate binary env
shell: bash
run: |
@ -95,12 +163,17 @@ jobs:
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
{%- if os != "windows-arm64" %}
!{{ common.wait_and_kill_ssh_windows('pytorch') }}
{% endif %}
!{{ config["build_name"] }}-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- !{{ config["build_name"] }}-build
- get-label-type
{%- if os == "windows-arm64" %}
runs-on: "windows-11-arm64"
{%- else %}
{%- if config["gpu_arch_type"] == "cuda" %}
{%- if branches == "nightly" %}
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
@ -113,18 +186,61 @@ jobs:
{%- else %}
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
{%- endif %}
{%- endif %}
{%- endif %}
timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
!{{ upload.binary_env(config, True) }}
steps:
{%- if os == "windows-arm64" %}
- name: Populate binary env
shell: cmd
run: |
echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
- name: Populate binary env
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
- name: Remove Pytorch folder
shell: cmd
run: |
rmdir /s /q "pytorch"
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
submodules: recursive
- name: Bootstrap APL
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
- name: Bootstrap Python
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
- name: Bootstrap Build Tools
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
- name: Bootstrap Rust
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
{%- else %}
!{{ common.setup_ec2_windows() }}
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
!{{ set_runner_specific_vars() }}
{%- endif %}
- uses: !{{ common.download_artifact_action }}
name: Download Build Artifacts
with:
name: !{{ config["build_name"] }}
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
!{{ common.checkout(deep_clone=False, directory="pytorch") }}
- name: Populate binary env
shell: bash
run: |
@ -133,8 +249,10 @@ jobs:
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
{%- if os != "windows-arm64" %}
!{{ common.wait_and_kill_ssh_windows('pytorch') }}
{%- endif %}
{%- if branches == "nightly" %}
!{{ upload.upload_binaries(config, True) }}
{%- endif %}
{%- endfor %}
{%- endfor %}


@ -33,6 +33,10 @@ on:
default: "linux.large"
description: Runner type
permissions:
id-token: write
contents: read
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -80,6 +84,13 @@ jobs:
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
role-session-name: gha-bazel-build
aws-region: us-east-1
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
@ -202,6 +213,13 @@ jobs:
uses: ./.github/actions/chown-workspace
if: always()
- name: Configure AWS Credentials
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_pytorch_artifacts
role-session-name: gha-bazel-build-upload-artifacts
aws-region: us-east-1
- name: Upload test artifacts
uses: ./.github/actions/upload-test-artifacts
if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'


@ -38,6 +38,11 @@ on:
required: false
type: boolean
default: true
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
jobs:
test:
@ -166,6 +171,7 @@ jobs:
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
run: |
# shellcheck disable=SC1090
set -ex


@ -47,6 +47,10 @@ on:
type: boolean
default: true
permissions:
id-token: write
contents: read
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}


@ -11,14 +11,14 @@ on:
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
paths:
- '.ci/docker/almalinux/*'
- '.ci/docker/common/*'
- .ci/docker/**
- .github/workflows/build-almalinux-images.yml
- .github/actions/binary-docker-build/**
pull_request:
paths:
- '.ci/docker/almalinux/*'
- '.ci/docker/common/*'
- .ci/docker/**
- .github/workflows/build-almalinux-images.yml
- .github/actions/binary-docker-build/**
env:
DOCKER_REGISTRY: "docker.io"
@ -37,37 +37,12 @@ jobs:
strategy:
matrix:
cuda_version: ["11.8", "12.4", "12.6", "cpu"]
env:
CUDA_VERSION: ${{ matrix.cuda_version }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
- name: Build docker image
uses: pytorch/pytorch/.github/actions/binary-docker-build@main
with:
submodules: false
- name: Calculate docker image
if: env.WITH_PUSH == 'false'
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: almalinux-builder${{ matrix.cuda_version == 'cpu' && '-' || '-cuda' }}${{matrix.cuda_version}}
docker-build-dir: .ci/docker/almalinux
always-rebuild: true
push: true
- name: Authenticate if WITH_PUSH
if: env.WITH_PUSH == 'true'
env:
docker-image-name: almalinux-builder
custom-tag-prefix: ${{ matrix.cuda_version != 'cpu' && 'cuda' || '' }}${{matrix.cuda_version}}
docker-build-dir: almalinux
DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
DOCKER_ID: ${{ secrets.DOCKER_ID }}
run: |
if [[ "${WITH_PUSH}" == true ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
fi
- name: Build Docker Image
if: env.WITH_PUSH == 'true'
uses: nick-fields/retry@v3.0.0
with:
shell: bash
timeout_minutes: 90
max_attempts: 3
retry_wait_seconds: 90
command: |
.ci/docker/almalinux/build.sh almalinux-builder${{ matrix.cuda_version == 'cpu' && ':' || ':cuda' }}${{matrix.cuda_version}}


@ -10,14 +10,14 @@ on:
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
paths:
- '.ci/docker/libtorch/*'
- '.ci/docker/common/*'
- .ci/docker/**
- .github/workflows/build-libtorch-images.yml
- .github/actions/binary-docker-build/**
pull_request:
paths:
- '.ci/docker/libtorch/*'
- '.ci/docker/common/*'
- .ci/docker/**
- .github/workflows/build-libtorch-images.yml
- .github/actions/binary-docker-build/**
env:
DOCKER_REGISTRY: "docker.io"
@ -39,123 +39,29 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
build-docker-cuda:
build:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
runs-on: ${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral
name: libtorch-cxx11-builder:${{ matrix.tag }}
strategy:
fail-fast: false
matrix:
cuda_version: ["12.8", "12.6", "12.4", "11.8"]
env:
GPU_ARCH_TYPE: cuda
GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
include: [
{ tag: "cuda12.8" },
{ tag: "cuda12.6" },
{ tag: "cuda12.4" },
{ tag: "cuda11.8" },
{ tag: "rocm6.3" },
{ tag: "rocm6.4" },
{ tag: "cpu" },
]
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
- name: Build docker image
uses: pytorch/pytorch/.github/actions/binary-docker-build@main
with:
submodules: false
- name: Calculate docker image
if: env.WITH_PUSH == 'false'
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: libtorch-cxx11-builder-cuda${{matrix.cuda_version}}
docker-build-dir: .ci/docker/libtorch
always-rebuild: true
push: true
- name: Authenticate if WITH_PUSH
if: env.WITH_PUSH == 'true'
env:
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: ${{ matrix.tag }}
docker-build-dir: libtorch
DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
DOCKER_ID: ${{ secrets.DOCKER_ID }}
run: |
if [[ "${WITH_PUSH}" == true ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
fi
- name: Build Docker Image
if: env.WITH_PUSH == 'true'
uses: nick-fields/retry@v3.0.0
with:
shell: bash
timeout_minutes: 90
max_attempts: 3
retry_wait_seconds: 90
command: |
.ci/docker/libtorch/build.sh libtorch-cxx11-builder:cuda${{matrix.cuda_version}}
build-docker-rocm:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
strategy:
matrix:
rocm_version: ["6.2.4", "6.3"]
env:
GPU_ARCH_TYPE: rocm
GPU_ARCH_VERSION: ${{ matrix.rocm_version }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: false
- name: Calculate docker image
if: env.WITH_PUSH == 'false'
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: libtorch-cxx11-builder-rocm${{matrix.rocm_version}}
docker-build-dir: .ci/docker/libtorch
always-rebuild: true
push: true
- name: Authenticate if WITH_PUSH
if: env.WITH_PUSH == 'true'
env:
DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
DOCKER_ID: ${{ secrets.DOCKER_ID }}
run: |
if [[ "${WITH_PUSH}" == true ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
fi
- name: Build Docker Image
if: env.WITH_PUSH == 'true'
uses: nick-fields/retry@v3.0.0
with:
shell: bash
timeout_minutes: 90
max_attempts: 3
retry_wait_seconds: 90
command: |
.ci/docker/libtorch/build.sh libtorch-cxx11-builder:rocm${{matrix.rocm_version}}
build-docker-cpu:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: false
- name: Calculate docker image
if: env.WITH_PUSH == 'false'
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: libtorch-cxx11-builder-cpu
docker-build-dir: .ci/docker/libtorch
always-rebuild: true
push: true
- name: Authenticate if WITH_PUSH
if: env.WITH_PUSH == 'true'
env:
DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
DOCKER_ID: ${{ secrets.DOCKER_ID }}
run: |
if [[ "${WITH_PUSH}" == true ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
fi
- name: Build Docker Image
if: env.WITH_PUSH == 'true'
uses: nick-fields/retry@v3.0.0
with:
shell: bash
timeout_minutes: 90
max_attempts: 3
retry_wait_seconds: 90
command: |
.ci/docker/libtorch/build.sh libtorch-cxx11-builder:cpu


@ -34,7 +34,7 @@ jobs:
id-token: write
strategy:
matrix:
rocm_version: ["63", "624"]
rocm_version: ["64", "63"]
steps:
- name: Checkout PyTorch
uses: actions/checkout@v4


@ -11,15 +11,11 @@ on:
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
paths:
- '.ci/docker/manywheel/*'
- '.ci/docker/manywheel/build_scripts/*'
- '.ci/docker/common/*'
- .ci/docker/**
- .github/workflows/build-manywheel-images-s390x.yml
pull_request:
paths:
- '.ci/docker/manywheel/*'
- '.ci/docker/manywheel/build_scripts/*'
- '.ci/docker/common/*'
- .ci/docker/**
- .github/workflows/build-manywheel-images-s390x.yml
@ -37,26 +33,45 @@ jobs:
if: github.repository_owner == 'pytorch'
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: linux.s390x
env:
GPU_ARCH_TYPE: cpu-s390x
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: false
no-sudo: true
- name: Authenticate if WITH_PUSH
if: env.WITH_PUSH == 'true'
- name: Build Docker Image
run: |
.ci/docker/manywheel/build.sh manylinuxs390x-builder:cpu-s390x -t manylinuxs390x-builder:cpu-s390x
- name: Tag and (if WITH_PUSH) push docker image to docker.io
env:
DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
DOCKER_ID: ${{ secrets.DOCKER_ID }}
CREATED_FULL_DOCKER_IMAGE_NAME: manylinuxs390x-builder:cpu-s390x
shell: bash
run: |
if [[ "${WITH_PUSH}" == true ]]; then
set -euox pipefail
GITHUB_REF="${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}"
GIT_BRANCH_NAME="${GITHUB_REF##*/}"
GIT_COMMIT_SHA="${GITHUB_SHA:-$(git rev-parse HEAD)}"
CI_FOLDER_SHA="$(git rev-parse HEAD:.ci/docker)"
DOCKER_IMAGE_NAME_PREFIX="docker.io/pytorch/${CREATED_FULL_DOCKER_IMAGE_NAME}"
docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}"
docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}"
docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}"
# Pretty sure Github will mask tokens and I'm not sure if it will even be
# printed due to pipe, but just in case
set +x
if [[ "${WITH_PUSH:-false}" == "true" ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
docker push "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}"
docker push "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}"
docker push "${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}"
fi
- name: Build Docker Image
run: |
.ci/docker/manywheel/build.sh manylinuxs390x-builder:cpu-s390x
- name: Cleanup docker
if: cancelled()


@ -11,17 +11,14 @@ on:
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
paths:
- '.ci/docker/common/*'
- '.ci/docker/manywheel/*'
- '.ci/docker/manywheel/build_scripts/*'
- .ci/docker/**
- .github/workflows/build-manywheel-images.yml
- .github/actions/binary-docker-build/**
pull_request:
paths:
- '.ci/docker/common/*'
- '.ci/docker/manywheel/*'
- '.ci/docker/manywheel/build_scripts/*'
- .ci/docker/**
- .github/workflows/build-manywheel-images.yml
- .github/actions/binary-docker-build/**
env:
DOCKER_REGISTRY: "docker.io"
@ -43,322 +40,34 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
build-docker-cuda-manylinux_2_28:
build:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
strategy:
fail-fast: false
matrix:
cuda_version: ["12.8", "12.6", "12.4", "11.8"]
env:
GPU_ARCH_TYPE: cuda-manylinux_2_28
GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
include: [
{ name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cuda12.4", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cuda11.8", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxcxx11-abi-builder", tag: "cpu-cxx11-abi", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" },
]
runs-on: ${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}
name: ${{ matrix.name }}:${{ matrix.tag }}
steps:
- name: Purge tools folder (free space for build)
run: rm -rf /opt/hostedtoolcache
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
- name: Build docker image
uses: pytorch/pytorch/.github/actions/binary-docker-build@main
with:
submodules: false
- name: Calculate docker image
if: env.WITH_PUSH == 'false'
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: manylinux2_28-builder-cuda${{matrix.cuda_version}}
docker-build-dir: .ci/docker/manywheel
always-rebuild: true
push: true
- name: Authenticate if WITH_PUSH
if: env.WITH_PUSH == 'true'
env:
docker-image-name: ${{ matrix.name }}
custom-tag-prefix: ${{ matrix.tag }}
docker-build-dir: manywheel
DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
DOCKER_ID: ${{ secrets.DOCKER_ID }}
run: |
if [[ "${WITH_PUSH}" == true ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
fi
- name: Build Docker Image
if: env.WITH_PUSH == 'true'
uses: nick-fields/retry@v3.0.0
with:
shell: bash
timeout_minutes: 90
max_attempts: 3
retry_wait_seconds: 90
command: |
.ci/docker/manywheel/build.sh manylinux2_28-builder:cuda${{matrix.cuda_version}}
build-docker-cuda-aarch64:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
strategy:
matrix:
cuda_version: ["12.8"]
env:
GPU_ARCH_TYPE: cuda-aarch64
GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
steps:
- name: Checkout PyTorch
uses: actions/checkout@v4
- name: Calculate docker image
if: env.WITH_PUSH == 'false'
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: manylinuxaarch64-builder-cuda${{matrix.cuda_version}}
docker-build-dir: .ci/docker/manywheel
always-rebuild: true
push: true
- name: Authenticate if WITH_PUSH
if: env.WITH_PUSH == 'true'
env:
DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
DOCKER_ID: ${{ secrets.DOCKER_ID }}
run: |
if [[ "${WITH_PUSH}" == true ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
fi
- name: Build Docker Image
if: env.WITH_PUSH == 'true'
uses: nick-fields/retry@v3.0.0
with:
shell: bash
timeout_minutes: 90
max_attempts: 3
retry_wait_seconds: 90
command: |
.ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}}
build-docker-rocm-manylinux_2_28:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
strategy:
matrix:
rocm_version: ["6.2.4", "6.3"]
env:
GPU_ARCH_TYPE: rocm-manylinux_2_28
GPU_ARCH_VERSION: ${{ matrix.rocm_version }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: false
- name: Calculate docker image
if: env.WITH_PUSH == 'false'
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: manylinux2_28-builder-rocm${{matrix.rocm_version}}
docker-build-dir: .ci/docker/manywheel
always-rebuild: true
push: true
- name: Authenticate if WITH_PUSH
if: env.WITH_PUSH == 'true'
env:
DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
DOCKER_ID: ${{ secrets.DOCKER_ID }}
run: |
if [[ "${WITH_PUSH}" == true ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
fi
- name: Build Docker Image
if: env.WITH_PUSH == 'true'
uses: nick-fields/retry@v3.0.0
with:
shell: bash
timeout_minutes: 90
max_attempts: 3
retry_wait_seconds: 90
command: |
.ci/docker/manywheel/build.sh manylinux2_28-builder:rocm${{matrix.rocm_version}}
build-docker-cpu-manylinux_2_28:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-manylinux_2_28
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: false
- name: Calculate docker image
if: env.WITH_PUSH == 'false'
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: manylinux2_28-builder-cpu
docker-build-dir: .ci/docker/manywheel
always-rebuild: true
push: true
- name: Authenticate if WITH_PUSH
if: env.WITH_PUSH == 'true'
env:
DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
DOCKER_ID: ${{ secrets.DOCKER_ID }}
run: |
if [[ "${WITH_PUSH}" == true ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
fi
- name: Build Docker Image
if: env.WITH_PUSH == 'true'
uses: nick-fields/retry@v3.0.0
with:
shell: bash
timeout_minutes: 90
max_attempts: 3
retry_wait_seconds: 90
command: |
.ci/docker/manywheel/build.sh manylinux2_28-builder:cpu
build-docker-cpu-aarch64:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-aarch64
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: false
- name: Calculate docker image
if: env.WITH_PUSH == 'false'
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: manylinuxaarch64-builder-cpu-aarch64
docker-build-dir: .ci/docker/manywheel
always-rebuild: true
push: true
- name: Authenticate if WITH_PUSH
if: env.WITH_PUSH == 'true'
env:
DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
DOCKER_ID: ${{ secrets.DOCKER_ID }}
run: |
if [[ "${WITH_PUSH}" == true ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
fi
- name: Build Docker Image
if: env.WITH_PUSH == 'true'
uses: nick-fields/retry@v3.0.0
with:
shell: bash
timeout_minutes: 90
max_attempts: 3
retry_wait_seconds: 90
command: |
.ci/docker/manywheel/build.sh manylinuxaarch64-builder:cpu-aarch64
build-docker-cpu-aarch64-2_28:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-aarch64-2_28
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: false
- name: Calculate docker image
if: env.WITH_PUSH == 'false'
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: manylinux2_28_aarch64-builder-cpu-aarch64
docker-build-dir: .ci/docker/manywheel
always-rebuild: true
push: true
- name: Authenticate if WITH_PUSH
if: env.WITH_PUSH == 'true'
env:
DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
DOCKER_ID: ${{ secrets.DOCKER_ID }}
run: |
if [[ "${WITH_PUSH}" == true ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
fi
- name: Build Docker Image
if: env.WITH_PUSH == 'true'
env:
DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
DOCKER_ID: ${{ secrets.DOCKER_ID }}
uses: nick-fields/retry@v3.0.0
with:
shell: bash
timeout_minutes: 90
max_attempts: 3
retry_wait_seconds: 90
command: |
.ci/docker/manywheel/build.sh manylinux2_28_aarch64-builder:cpu-aarch64
build-docker-cpu-cxx11-abi:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
env:
GPU_ARCH_TYPE: cpu-cxx11-abi
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: false
- name: Calculate docker image
if: env.WITH_PUSH == 'false'
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: manylinuxcxx11-abi-builder-cpu-cxx11-abi
docker-build-dir: .ci/docker/manywheel
always-rebuild: true
push: true
- name: Authenticate if WITH_PUSH
if: env.WITH_PUSH == 'true'
env:
DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
DOCKER_ID: ${{ secrets.DOCKER_ID }}
run: |
if [[ "${WITH_PUSH}" == true ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
fi
- name: Build Docker Image
if: env.WITH_PUSH == 'true'
uses: nick-fields/retry@v3.0.0
with:
shell: bash
timeout_minutes: 90
max_attempts: 3
retry_wait_seconds: 90
command: |
.ci/docker/manywheel/build.sh manylinuxcxx11-abi-builder:cpu-cxx11-abi
build-docker-xpu:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
env:
GPU_ARCH_TYPE: xpu
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: false
- name: Calculate docker image
if: env.WITH_PUSH == 'false'
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: manylinux2_28-builder-xpu
docker-build-dir: .ci/docker/manywheel
always-rebuild: true
push: true
- name: Authenticate if WITH_PUSH
if: env.WITH_PUSH == 'true'
env:
DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
DOCKER_ID: ${{ secrets.DOCKER_ID }}
run: |
if [[ "${WITH_PUSH}" == true ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
fi
- name: Build Docker Image
if: env.WITH_PUSH == 'true'
uses: nick-fields/retry@v3.0.0
with:
shell: bash
timeout_minutes: 90
max_attempts: 3
retry_wait_seconds: 90
command: |
.ci/docker/manywheel/build.sh manylinux2_28-builder:xpu


@ -54,7 +54,7 @@ jobs:
docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include:
- device: "rocm"
rocm_version: "6.3"
rocm_version: "6.4"
runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
- device: "cuda"
rocm_version: ""


@ -79,7 +79,7 @@ jobs:
]
include:
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11
runner: linux.arm64.2xlarge
runner: linux.arm64.m7g.4xlarge
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
runner: linux.arm64.m7g.4xlarge
timeout-minutes: 600


@ -301,98 +301,6 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm6_2_4-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.2.4
GPU_ARCH_VERSION: 6.2.4
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: libtorch-rocm6_2_4-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm6_2_4-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm6_2_4-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.2.4
GPU_ARCH_VERSION: 6.2.4
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm6_2_4-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm6_2_4-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm6_2_4-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.2.4
GPU_ARCH_VERSION: 6.2.4
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm6_2_4-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm6_3-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -484,3 +392,95 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm6_4-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: 6.4
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.4-main
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: libtorch-rocm6_4-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm6_4-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm6_4-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: 6.4
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.4-main
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm6_4-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: pytorch/libtorch-cxx11-builder:rocm6.4-main
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm6_4-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm6_4-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: 6.4
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.4-main
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm6_4-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

File diff suppressed because it is too large


@ -55,7 +55,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
runs_on: linux.s390x
@ -79,7 +79,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
@ -101,7 +101,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-s390x
@ -120,7 +120,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
runs_on: linux.s390x
@ -144,7 +144,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
@ -166,7 +166,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-s390x
@ -185,7 +185,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
runs_on: linux.s390x
@ -209,7 +209,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
@ -231,7 +231,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-s390x
@ -250,7 +250,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
runs_on: linux.s390x
@ -274,7 +274,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
@ -296,7 +296,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-s390x
@ -315,7 +315,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
runs_on: linux.s390x
@ -339,7 +339,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x
@ -361,7 +361,7 @@ jobs:
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-s390x
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
use_split_build: False
DESIRED_PYTHON: "3.13"
build_name: manywheel-py3_13-cpu-s390x


@ -1,11 +1,12 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2
# Template is at: .github/templates/windows_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: windows-arm64-binary-libtorch-debug
on:
push:
# NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build
branches:
- nightly
tags:
@ -17,18 +18,24 @@ on:
workflow_dispatch:
env:
# Needed for conda builds
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
AWS_DEFAULT_REGION: us-east-1
BUILD_ENVIRONMENT: windows-arm64-binary-libtorch-debug
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 1
OS: windows-arm64
PYTORCH_ROOT: /pytorch
DOWNLOADS_DIR: c:\temp\downloads
DEPENDENCIES_DIR: c:\temp\dependencies
ENABLE_APL: 1
ENABLE_OPENBLAS: 0
MSVC_VERSION : 14.42
AWS_DEFAULT_REGION: us-east-1
concurrency:
group: windows-arm64-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
get-label-type:
@ -44,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -59,9 +66,6 @@ jobs:
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: cmd
run: |
@ -117,11 +121,11 @@ jobs:
- name: Populate binary env
shell: bash
run: |
"pytorch/.circleci/scripts/binary_populate_env.sh"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
@ -135,7 +139,7 @@ jobs:
- libtorch-cpu-shared-with-deps-debug-build
- get-label-type
runs-on: "windows-11-arm64"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -150,25 +154,17 @@ jobs:
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: cmd
run: |
echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cpu-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
- name: Bootstrap Git
- name: Populate binary env
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
@ -197,14 +193,19 @@ jobs:
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cpu-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"pytorch/.circleci/scripts/binary_populate_env.sh"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
libtorch-cpu-shared-with-deps-debug-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:

View File

@ -1,11 +1,12 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2
# Template is at: .github/templates/windows_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: windows-arm64-binary-libtorch-release
on:
push:
# NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build
branches:
- nightly
tags:
@ -17,18 +18,24 @@ on:
workflow_dispatch:
env:
# Needed for conda builds
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
AWS_DEFAULT_REGION: us-east-1
BUILD_ENVIRONMENT: windows-arm64-binary-libtorch-release
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 1
OS: windows-arm64
PYTORCH_ROOT: /pytorch
DOWNLOADS_DIR: c:\temp\downloads
DEPENDENCIES_DIR: c:\temp\dependencies
ENABLE_APL: 1
ENABLE_OPENBLAS: 0
MSVC_VERSION : 14.42
AWS_DEFAULT_REGION: us-east-1
concurrency:
group: windows-arm64-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
get-label-type:
@ -44,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -59,9 +66,6 @@ jobs:
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: cmd
run: |
@ -117,11 +121,11 @@ jobs:
- name: Populate binary env
shell: bash
run: |
"pytorch/.circleci/scripts/binary_populate_env.sh"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
@ -135,7 +139,7 @@ jobs:
- libtorch-cpu-shared-with-deps-release-build
- get-label-type
runs-on: "windows-11-arm64"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
@ -150,25 +154,17 @@ jobs:
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: cmd
run: |
echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cpu-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
- name: Bootstrap Git
- name: Populate binary env
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
@ -197,14 +193,19 @@ jobs:
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cpu-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"pytorch/.circleci/scripts/binary_populate_env.sh"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
libtorch-cpu-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:

View File

@ -1,11 +1,12 @@
# @generated DO NOT EDIT MANUALLY
# Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2
# Template is at: .github/templates/windows_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: windows-arm64-binary-wheel
on:
push:
# NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build
branches:
- nightly
tags:
@ -17,18 +18,24 @@ on:
workflow_dispatch:
env:
# Needed for conda builds
ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
AWS_DEFAULT_REGION: us-east-1
BUILD_ENVIRONMENT: windows-arm64-binary-wheel
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 1
OS: windows-arm64
PYTORCH_ROOT: /pytorch
DOWNLOADS_DIR: c:\temp\downloads
DEPENDENCIES_DIR: c:\temp\dependencies
ENABLE_APL: 1
ENABLE_OPENBLAS: 0
MSVC_VERSION : 14.42
AWS_DEFAULT_REGION: us-east-1
concurrency:
group: windows-arm64-binary-wheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
jobs:
get-label-type:
@ -44,7 +51,7 @@ jobs:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -56,9 +63,6 @@ jobs:
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: cmd
run: |
@ -114,11 +118,11 @@ jobs:
- name: Populate binary env
shell: bash
run: |
"pytorch/.circleci/scripts/binary_populate_env.sh"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
@ -132,7 +136,7 @@ jobs:
- wheel-py3_12-cpu-build
- get-label-type
runs-on: "windows-11-arm64"
timeout-minutes: 240
timeout-minutes: 300
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
@ -143,25 +147,17 @@ jobs:
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: cmd
run: |
echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: wheel-py3_12-cpu
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
- name: Bootstrap Git
- name: Populate binary env
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
@ -190,14 +186,19 @@ jobs:
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: wheel-py3_12-cpu
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"pytorch/.circleci/scripts/binary_populate_env.sh"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
wheel-py3_12-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:

View File

@ -19,6 +19,7 @@ env:
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 1
OS: windows
concurrency:
group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
@ -52,6 +53,15 @@ jobs:
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
@ -96,15 +106,6 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -145,6 +146,7 @@ jobs:
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cpu-shared-with-deps-debug-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
@ -210,6 +212,18 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
@ -224,18 +238,6 @@ jobs:
with:
name: libtorch-cpu-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |

View File

@ -26,6 +26,7 @@ env:
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 1
OS: windows
concurrency:
group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
@ -59,6 +60,15 @@ jobs:
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
@ -103,15 +113,6 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -152,6 +153,7 @@ jobs:
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cpu-shared-with-deps-debug-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
@ -217,6 +219,18 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
@ -231,18 +245,6 @@ jobs:
with:
name: libtorch-cpu-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
@ -306,6 +308,15 @@ jobs:
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
@ -350,15 +361,6 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -399,6 +401,7 @@ jobs:
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda11_8-shared-with-deps-debug-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
@ -465,6 +468,18 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
@ -479,18 +494,6 @@ jobs:
with:
name: libtorch-cuda11_8-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
@ -555,6 +558,15 @@ jobs:
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
@ -599,15 +611,6 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -648,6 +651,7 @@ jobs:
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_6-shared-with-deps-debug-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
@ -714,6 +718,18 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
@ -728,18 +744,6 @@ jobs:
with:
name: libtorch-cuda12_6-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
@ -804,6 +808,15 @@ jobs:
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
@ -848,15 +861,6 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -897,6 +901,7 @@ jobs:
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_8-shared-with-deps-debug-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
@ -963,6 +968,18 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
@ -977,18 +994,6 @@ jobs:
with:
name: libtorch-cuda12_8-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |

View File

@ -19,6 +19,7 @@ env:
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 1
OS: windows
concurrency:
group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
@ -52,6 +53,15 @@ jobs:
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
@ -96,15 +106,6 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -145,6 +146,7 @@ jobs:
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cpu-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
@ -210,6 +212,18 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
@ -224,18 +238,6 @@ jobs:
with:
name: libtorch-cpu-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |

View File

@ -26,6 +26,7 @@ env:
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 1
OS: windows
concurrency:
group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
@ -59,6 +60,15 @@ jobs:
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
@ -103,15 +113,6 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -152,6 +153,7 @@ jobs:
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cpu-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
@ -217,6 +219,18 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
@ -231,18 +245,6 @@ jobs:
with:
name: libtorch-cpu-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
@ -306,6 +308,15 @@ jobs:
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
@ -350,15 +361,6 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -399,6 +401,7 @@ jobs:
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda11_8-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
@ -465,6 +468,18 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
@ -479,18 +494,6 @@ jobs:
with:
name: libtorch-cuda11_8-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
@ -555,6 +558,15 @@ jobs:
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
@ -599,15 +611,6 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -648,6 +651,7 @@ jobs:
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_6-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
@ -714,6 +718,18 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
@ -728,18 +744,6 @@ jobs:
with:
name: libtorch-cuda12_6-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
@ -804,6 +808,15 @@ jobs:
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.9"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
@ -848,15 +861,6 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
@ -897,6 +901,7 @@ jobs:
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_8-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
@ -963,6 +968,18 @@ jobs:
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
@ -977,18 +994,6 @@ jobs:
with:
name: libtorch-cuda12_8-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |

File diff suppressed because it is too large

View File

@ -1,5 +1,4 @@
name: perf-nightly-macos
# Technically not an inductor test, but uses it as a template for tracking macos performance
name: inductor-perf-nightly-macos
on:
schedule:
@ -24,6 +23,7 @@ on:
pull_request:
paths:
- .github/workflows/inductor-perf-test-nightly-macos.yml
- .ci/pytorch/macos-test.sh
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@ -38,7 +38,7 @@ jobs:
uses: ./.github/workflows/_mac-build.yml
with:
sync-tag: macos-perf-py3-arm64-build
build-environment: macos-py3-arm64
build-environment: macos-py3-arm64-distributed
runner-type: macos-m1-stable
build-generates-artifacts: true
# To match the one pre-installed in the m1 runners
@ -54,7 +54,7 @@ jobs:
uses: ./.github/workflows/_mac-test.yml
needs: macos-perf-py3-arm64-build
with:
build-environment: macos-py3-arm64
build-environment: macos-py3-arm64-distributed
# Same as the build job
python-version: 3.9.12
test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}

View File

@ -36,11 +36,11 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
{ config: "inductor_cpp_wrapper", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_cpp_wrapper", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_distributed", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" },
{ config: "inductor_cpp_wrapper", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_cpp_wrapper", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit
@ -65,8 +65,8 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit
@ -90,7 +90,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor-halide", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
{ config: "inductor-halide", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
@ -114,7 +114,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor-triton-cpu", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
{ config: "inductor-triton-cpu", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
@ -138,10 +138,10 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor_amx", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "inductor_amx", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "inductor_avx2", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "inductor_avx2", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
{ config: "inductor_amx", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "inductor_amx", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "inductor_avx2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" },
{ config: "inductor_avx2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" },
]}
secrets: inherit
@ -165,8 +165,8 @@ jobs:
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit

View File

@ -53,11 +53,11 @@ jobs:
sync-tag: linux-focal-cuda12_6-py3_10-gcc9-inductor-build
test-matrix: |
{ include: [
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit
@ -82,14 +82,14 @@ jobs:
sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
test-matrix: |
{ include: [
{ config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
{ config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
{ config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.24xl.spr-metal" },
]}
secrets: inherit

View File

@ -37,13 +37,13 @@ jobs:
runner: linux.arm64.2xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "linux.arm64.2xlarge" },
{ config: "default", shard: 2, num_shards: 4, runner: "linux.arm64.2xlarge" },
{ config: "default", shard: 3, num_shards: 4, runner: "linux.arm64.2xlarge" },
{ config: "default", shard: 4, num_shards: 4, runner: "linux.arm64.2xlarge" },
{ config: "default", shard: 1, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
{ config: "default", shard: 2, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
{ config: "default", shard: 3, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge" },
{ config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m7g.4xlarge" },
{ config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m7g.4xlarge" },
{ config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m7g.4xlarge" },
]}
secrets: inherit

View File

@ -182,14 +182,14 @@ jobs:
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 2, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 3, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 4, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 5, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 6, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 7, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 8, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 1, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 2, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 3, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 4, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 5, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 6, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 7, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
{ config: "default", shard: 8, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
]}
secrets: inherit

View File

@ -184,7 +184,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-py3.9-clang10
docker-image-name: pytorch-linux-focal-py3.9-clang10
test-matrix: |
@ -385,6 +385,9 @@ jobs:
name: linux-focal-cpu-py3.10-gcc11-bazel-test
uses: ./.github/workflows/_bazel-build-test.yml
needs: get-label-type
permissions:
id-token: write
contents: read
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.large"
build-environment: linux-focal-cuda12.6-py3.10-gcc11-bazel-test

View File

@ -21,6 +21,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-s390x-binary-manywheel
docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x-main
docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x
runner: linux.s390x
secrets: inherit

View File

@ -42,7 +42,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-s390x-binary-manywheel
docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x-main
docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x
runner: linux.s390x
test-matrix: |
{ include: [
@ -70,7 +70,7 @@ jobs:
- target-determination
with:
build-environment: linux-s390x-binary-manywheel
docker-image: pytorch/manylinuxs390x-builder:cpu-s390x-main
docker-image: pytorch/manylinuxs390x-builder:cpu-s390x
test-matrix: ${{ needs.linux-manylinux-2_28-py3-cpu-s390x-build.outputs.test-matrix }}
timeout-minutes: 600
use-gha: "yes"

View File

@ -143,9 +143,9 @@ jobs:
docker-image-name: pytorch-linux-jammy-py3-clang15-asan
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 3, runner: "linux.4xlarge" },
{ config: "slow", shard: 2, num_shards: 3, runner: "linux.4xlarge" },
{ config: "slow", shard: 3, num_shards: 3, runner: "linux.4xlarge" },
{ config: "slow", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "slow", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "slow", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
]}
sync-tag: asan-build
secrets: inherit

View File

@ -2,7 +2,7 @@ name: Upload torch dynamo performance stats
on:
workflow_run:
workflows: [inductor-A100-perf-nightly, inductor-perf-nightly-A10g, inductor-perf-nightly-aarch64, inductor-perf-nightly-x86, perf-nightly-macos, inductor-perf-nightly-rocm, inductor-perf-nightly-h100]
workflows: [inductor-A100-perf-nightly, inductor-perf-nightly-A10g, inductor-perf-nightly-aarch64, inductor-perf-nightly-x86, inductor-perf-nightly-macos, inductor-perf-nightly-rocm, inductor-perf-nightly-h100]
types:
- completed

View File

@ -1165,14 +1165,6 @@ exclude_patterns = [
'test/quantization/core/test_utils.py',
'test/quantization/core/test_workflow_module.py',
'test/quantization/core/test_workflow_ops.py',
'test/quantization/eager/__init__.py',
'test/quantization/eager/test_bias_correction_eager.py',
'test/quantization/eager/test_equalize_eager.py',
'test/quantization/eager/test_fuse_eager.py',
'test/quantization/eager/test_model_numerics.py',
'test/quantization/eager/test_numeric_suite_eager.py',
'test/quantization/eager/test_quantize_eager_ptq.py',
'test/quantization/eager/test_quantize_eager_qat.py',
'test/quantization/fx/__init__.py',
'test/quantization/fx/test_equalize_fx.py',
'test/quantization/fx/test_model_report_fx.py',

View File

@ -1,4 +1,5 @@
load("@bazel_skylib//lib:paths.bzl", "paths")
load("@com_github_google_flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test")
load("@rules_python//python:defs.bzl", "py_library", "py_test")
@ -659,6 +660,15 @@ cc_library(
# torch
torch_cuda_headers = glob(["torch/csrc/cuda/*.h"])
flatbuffer_cc_library(
name = "torch_flatbuffers",
srcs = [
"torch/csrc/jit/serialization/mobile_bytecode.fbs",
],
flatc_args = ["--cpp", "--gen-mutable", "--scoped-enums"],
out_prefix = "torch/csrc/jit/serialization/",
)
cc_library(
name = "torch_headers",
hdrs = if_cuda(
@ -672,6 +682,7 @@ cc_library(
],
exclude = [
"torch/csrc/*/generated/*.h",
"torch/csrc/jit/serialization/mobile_bytecode_generated.h",
] + torch_cuda_headers,
) + GENERATED_AUTOGRAD_CPP + [":version_h"],
includes = [
@ -686,6 +697,7 @@ cc_library(
deps = [
":aten_headers",
":caffe2_headers",
":torch_flatbuffers",
"//c10",
"@com_github_google_flatbuffers//:flatbuffers",
"@local_config_python//:python_headers",

View File

@ -165,9 +165,9 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd
/torch/_export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi
# Dynamic Shapes
/torch/fx/experimental/symbolic_shapes.py @bobren @laithsakka
/torch/fx/experimental/sym_node.py @bobren @laithsakka
/torch/fx/experimental/recording.py @bobren @laithsakka
/torch/fx/experimental/symbolic_shapes.py @bobrenjc93 @laithsakka
/torch/fx/experimental/sym_node.py @bobrenjc93 @laithsakka
/torch/fx/experimental/recording.py @bobrenjc93 @laithsakka
# serialization-related files
/aten/src/ATen/MapAllocator* @mikaylagawarecki

View File

@ -222,8 +222,8 @@ inline Tensor applySlice(
? (*self_sizes)[dim]
: self.sym_size(dim);
if (!disable_slice_optimization &&
TORCH_GUARD_SIZE_OBLIVIOUS(start.sym_eq(0)) &&
TORCH_GUARD_SIZE_OBLIVIOUS(length.sym_eq(stop)) && step == 1) {
TORCH_STATICALLY_KNOWN_TRUE(start.sym_eq(0)) &&
TORCH_STATICALLY_KNOWN_TRUE(length.sym_eq(stop)) && step == 1) {
return self;
}
}

View File

@ -0,0 +1,33 @@
#include <ATen/core/CachingHostAllocator.h>
#include <array>
namespace at {
namespace {
static std::array<HostAllocator*, at::COMPILE_TIME_MAX_DEVICE_TYPES>
allocator_array{};
static std::array<uint8_t, at::COMPILE_TIME_MAX_DEVICE_TYPES>
allocator_priority{};
} // anonymous namespace
void setHostAllocator(
at::DeviceType device_type,
at::HostAllocator* allocator,
uint8_t priority) {
if (priority >= allocator_priority[static_cast<int>(device_type)]) {
allocator_array[static_cast<int>(device_type)] = allocator;
allocator_priority[static_cast<int>(device_type)] = priority;
}
}
at::HostAllocator* getHostAllocator(at::DeviceType device_type) {
auto* allocator = allocator_array[static_cast<int>(device_type)];
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
allocator, "Host Allocator for ", device_type, " is not set.");
return allocator;
}
} // namespace at
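The registry above keeps one `HostAllocator*` slot per device type, and `setHostAllocator` only overwrites a slot when the incoming priority is at least the currently registered one. A minimal usage sketch, not part of the diff, assuming a backend (such as the CUDA build later in this diff) has already registered an allocator for `at::kCUDA`; the helper name is illustrative:

```
#include <ATen/core/CachingHostAllocator.h>

// Illustrative sketch: fetch the registered host allocator for CUDA and
// exercise the new interface.
void pinned_memory_sketch() {
  at::HostAllocator* host_alloc = at::getHostAllocator(at::kCUDA);

  // Pinned (page-locked) host allocation; the DataPtr's deleter returns the
  // block to the allocator when it goes out of scope.
  at::DataPtr pinned = host_alloc->allocate(1 << 20);

  // Introspection and cache control exposed by the HostAllocator interface.
  at::HostStats stats = host_alloc->get_stats();
  (void)stats;                // inspect allocation counters here
  host_alloc->empty_cache();  // release cached pinned blocks back to the OS
}
```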

View File

@ -1,4 +1,5 @@
#include <c10/core/Allocator.h>
#include <c10/core/Stream.h>
#include <c10/core/thread_pool.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/llvmMathExtras.h>
@ -46,7 +47,7 @@ namespace {
}
// Struct containing memory allocator summary statistics for host.
struct HostStats {
struct TORCH_API HostStats {
// COUNT: allocations requested by client code. Note that active
// count can be extracted by looking at current allocations
Stat allocation;
@ -274,7 +275,8 @@ struct CachingHostAllocatorImpl {
}
}
virtual bool record_event(void* ptr, void* ctx, S stream) {
virtual bool record_event(void* ptr, void* ctx, c10::Stream s) {
S stream = S(s);
auto* block = reinterpret_cast<B*>(ctx);
// Note: we need to check if the passed-in `ctx` is valid. This is because
@ -620,24 +622,49 @@ protected:
alignas(64) HostStatsStaged stats_;
};
template <typename T>
struct CachingHostAllocatorInterface : public at::Allocator {
struct TORCH_API HostAllocator : public at::Allocator {
// Associates the pinned memory allocation with a stream to track
// dependencies. This ensures the memory won't be reused until the stream's
// operations complete
virtual bool record_event(void* ptr, void* ctx, c10::Stream stream) = 0;
// Frees all cached pinned memory and returns it to the system, clearing the
// allocator's internal cache
virtual void empty_cache() = 0;
// Returns comprehensive statistics about the allocator's memory usage,
// allocation patterns, and timing metrics
virtual HostStats get_stats() = 0;
// Resets the cumulative allocation statistics
virtual void reset_accumulated_stats() = 0;
// Resets the peak memory usage metrics
virtual void reset_peak_stats() = 0;
};
template <typename T, c10::DeleterFnPtr deleteFunc>
struct CachingHostAllocatorInterface : public HostAllocator {
CachingHostAllocatorInterface() : impl_(std::make_unique<T>()) {}
at::DataPtr allocate(size_t size) override {
TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for allocate");
auto ptr_and_ctx = impl_->allocate(size);
return {
ptr_and_ctx.first,
ptr_and_ctx.second,
deleteFunc, // Use the template parameter deleter function
at::DeviceType::CPU};
}
void free(void* ctx) {
impl_->free(ctx);
}
template <typename S>
bool record_event(void* ptr, void* ctx, S stream) {
bool record_event(void* ptr, void* ctx, c10::Stream stream) override {
return impl_->record_event(ptr, ctx, stream);
}
void empty_cache() {
void empty_cache() override {
impl_->empty_cache();
}
@ -646,20 +673,54 @@ struct CachingHostAllocatorInterface : public at::Allocator {
impl_->copy_data(dest, src, count);
}
HostStats getStats() {
HostStats get_stats() override {
return impl_->getStats();
}
void resetAccumulatedStats() {
void reset_accumulated_stats() override {
impl_->resetAccumulatedStats();
}
void resetPeakStats() {
void reset_peak_stats() override {
impl_->resetPeakStats();
}
std::unique_ptr<T> impl_;
};
#define DECLARE_HOST_ALLOCATOR(name, impl, deleter, instance) \
void deleter(void* ptr); \
struct name final \
: public at::CachingHostAllocatorInterface<impl, deleter> {}; \
static name instance; \
void deleter(void* ptr) { \
instance.free(ptr); \
}
/**
* Set the host allocator for DeviceType `device_type`. This allocator manages
* pinned memory on the host that can be accessed efficiently by the specified
* device type. Note that this function is not thread-safe.
*/
TORCH_API void setHostAllocator(
at::DeviceType device_type,
at::HostAllocator* allocator,
uint8_t priority = 0);
TORCH_API at::HostAllocator* getHostAllocator(at::DeviceType device_type);
template <DeviceType device_type>
struct HostAllocatorRegistry {
explicit HostAllocatorRegistry(HostAllocator* allocator) {
at::setHostAllocator(device_type, allocator);
}
};
#define REGISTER_HOST_ALLOCATOR(device_type, allocator) \
namespace { \
static at::HostAllocatorRegistry<device_type> \
g_host_allocator_registry_instance(allocator); \
}
} // namespace at
C10_DIAGNOSTIC_POP()

View File

@ -41,9 +41,15 @@ FunctionSchema FunctionSchema::cloneWithRealTypes(bool with_symint) const {
}
};
std::vector<Argument> new_arguments, new_returns;
std::transform(arguments().begin(), arguments().end(), std::back_inserter(new_arguments), cloneWithRealTypes);
new_arguments.reserve(arguments().size());
for (const auto& arg: arguments()) {
new_arguments.push_back(cloneWithRealTypes(arg));
}
// NB: SymInt returns are always SymInt
std::transform(returns().begin(), returns().end(), std::back_inserter(new_returns), alwaysCloneWithRealTypes);
new_returns.reserve(returns().size());
for (const auto& ret: returns()) {
new_returns.push_back(alwaysCloneWithRealTypes(ret));
}
return FunctionSchema(
name(),
overload_name(),

View File

@ -1,6 +1,7 @@
#include <torch/library.h>
#include <ATen/core/dispatch/Dispatcher.h>
#include <fmt/format.h>
namespace torch {
@ -11,7 +12,7 @@ namespace {
#ifdef STRIP_ERROR_MESSAGES
return std::string();
#else
return c10::str("registered at ", file, ":", line);
return fmt::format("registered at {}:{}", file, line);
#endif
}

View File

@ -248,7 +248,6 @@ namespace at::cuda::blas {
CUDABLAS_NONNEGINT_CHECK(bgemm<Dtype>, num_batches); \
} while (0)
namespace {
// Following the pattern of CuSparseDescriptor
// Defined here for now because this is the only place cublas_lt interface is
@ -334,9 +333,10 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
} // namespace
template <typename Dtype>
static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
cudaDataType_t abcType = CUDA_R_32F;
template <typename Dtype, typename C_Dtype = Dtype>
static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
cudaDataType_t abType = CUDA_R_32F;
cudaDataType_t cType = CUDA_R_32F;
cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
cudaDataType_t scaleType = CUDA_R_32F;
#ifndef USE_ROCM
@ -346,7 +346,8 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
void * alpha_ptr = &alpha;
void * beta_ptr = &beta;
if constexpr (std::is_same_v<Dtype, double>) {
abcType = CUDA_R_64F;
abType = CUDA_R_64F;
cType = CUDA_R_64F;
computeType = CUBLAS_COMPUTE_64F;
scaleType = CUDA_R_64F;
} else if constexpr (std::is_same_v<Dtype, float>) {
@ -354,11 +355,13 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
}
} else if constexpr (std::is_same_v<Dtype, c10::complex<double>>) {
abcType = CUDA_C_64F;
abType = CUDA_C_64F;
cType = CUDA_C_64F;
computeType = CUBLAS_COMPUTE_64F;
scaleType = CUDA_C_64F;
} else if constexpr (std::is_same_v<Dtype, c10::complex<float>>) {
abcType = CUDA_C_32F;
abType = CUDA_C_32F;
cType = CUDA_C_32F;
scaleType = CUDA_C_32F;
} else if constexpr (std::is_same_v<Dtype, at::Half>) {
#ifndef USE_ROCM
@ -371,9 +374,11 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
beta_ptr = &hbeta;
}
#endif
abcType = CUDA_R_16F;
abType = CUDA_R_16F;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
abcType = CUDA_R_16BF;
abType = CUDA_R_16BF;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
} else {
static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented");
}
@ -395,9 +400,9 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
at::globalContext()._SMCarveout_EXPERIMENTAL().value());
}
#endif
CuBlasLtMatrixLayout Adesc(abcType, m, k, lda, opa == CUBLAS_OP_T);
CuBlasLtMatrixLayout Bdesc(abcType, k, n, ldb, opb == CUBLAS_OP_T);
CuBlasLtMatrixLayout Cdesc(abcType, m, n, ldc);
CuBlasLtMatrixLayout Adesc(abType, m, k, lda, opa == CUBLAS_OP_T);
CuBlasLtMatrixLayout Bdesc(abType, k, n, ldb, opb == CUBLAS_OP_T);
CuBlasLtMatrixLayout Cdesc(cType, m, n, ldc);
if (num_batches > 1) {
int num_batches_as_int = static_cast<int>(num_batches);
@ -482,8 +487,10 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
ldb,
" ldc ",
ldc,
" abcType ",
abcType,
" abType ",
abType,
" cType ",
cType,
" computeType ",
computeType,
" scaleType ",
@ -495,9 +502,9 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
}
template <typename Dtype>
inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublas: not implemented");
template <typename Dtype, typename C_Dtype = Dtype>
inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
TORCH_CHECK(false, "at::cuda::blas::bgemm: not implemented for input type ", typeid(Dtype).name(), " and output type ", typeid(C_Dtype).name());
}
template <>
@ -556,8 +563,8 @@ void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::com
reinterpret_cast<cuComplex*>(c), ldc, stridec, num_batches));
}
template <>
void bgemm_internal_cublas<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)) {
template <typename C_Dtype>
inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
@ -602,23 +609,33 @@ void bgemm_internal_cublas<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)) {
handle, opa, opb, m, n, k,
alpha_ptr, a, CUDA_R_16F, lda, stridea,
b, CUDA_R_16F, ldb, strideb, beta_ptr,
c, CUDA_R_16F, ldc, stridec,
c, std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16F, ldc, stridec,
num_batches, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else {
for (const auto i : c10::irange(num_batches)) {
at::cuda::blas::gemm<at::Half>(
transa, transb,
m, n, k,
alpha, (a + i * stridea), lda,
(b + i * strideb), ldb, beta,
(c + i * stridec), ldc);
if (std::is_same_v<C_Dtype, float>) {
float* c_ptr = (float*)(c + i * stridec);
at::cuda::blas::gemm<at::Half, float>(
transa, transb,
m, n, k,
alpha, (a + i * stridea), lda,
(b + i * strideb), ldb, beta,
c_ptr, ldc);
} else {
at::cuda::blas::gemm<at::Half>(
transa, transb,
m, n, k,
alpha, (a + i * stridea), lda,
(b + i * strideb), ldb, beta,
(c + i * stridec), ldc);
}
}
}
#endif // USE_ROCM
}
template <>
void bgemm_internal_cublas<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
template <typename C_Dtype>
inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
BGEMM_CHECK_ARGVALUES(at::BFloat16);
@ -635,15 +652,37 @@ void bgemm_internal_cublas<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16))
auto compute_type = CUDA_R_32F;
#endif
TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedEx(handle,
opa, opb, (int)m, (int)n, (int)k,
(void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea,
b, CUDA_R_16BF, (int)ldb, strideb,
(void*)&fbeta, c, CUDA_R_16BF, (int)ldc, stridec,
(int)num_batches,
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
opa, opb, (int)m, (int)n, (int)k,
(void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea,
b, CUDA_R_16BF, (int)ldb, strideb,
(void*)&fbeta, c, std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16BF,
(int)ldc, stridec, (int)num_batches,
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
}
template <>
void bgemm_internal_cublas<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)) {
bgemm_internal_cublas_half_helper<at::Half>(CUDABLAS_BGEMM_ARGS(at::Half));
}
template <>
void bgemm_internal_cublas<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) {
bgemm_internal_cublas_half_helper<float>(CUDABLAS_BGEMM_ARGS(at::Half));
}
template <>
void bgemm_internal_cublas<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
bgemm_internal_cublas_bfloat16_helper<at::BFloat16>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
}
template <>
void bgemm_internal_cublas<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) {
bgemm_internal_cublas_bfloat16_helper<float>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
}
template <>
void bgemm_internal<double>(CUDABLAS_BGEMM_ARGTYPES(double))
{
@ -742,9 +781,50 @@ void bgemm_internal<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16))
}
}
template <typename DType>
inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES(DType)) {
tunable::GemmStridedBatchedParams<DType> params;
template<>
void bgemm_internal<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float))
{
if (at::globalContext().allowFP16AccumulationCuBLAS()) {
// Do not allow fp16 reductions with fp32 output
TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS");
}
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
if (!bgemm_internal_cublaslt<at::Half, float>(CUDABLAS_BGEMM_ARGS(at::Half))) {
bgemm_internal_cublas<at::Half, float>(CUDABLAS_BGEMM_ARGS(at::Half));
}
}
#if defined(USE_ROCM) && !defined(_MSC_VER)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");
}
#endif
else {
bgemm_internal_cublas<at::Half, float>(CUDABLAS_BGEMM_ARGS(at::Half));
}
}
template<>
void bgemm_internal<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float))
{
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
if (!bgemm_internal_cublaslt<at::BFloat16, float>(CUDABLAS_BGEMM_ARGS(at::BFloat16))) {
bgemm_internal_cublas<at::BFloat16, float>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
}
}
#if defined(USE_ROCM) && !defined(_MSC_VER)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm");
}
#endif
else {
bgemm_internal_cublas<at::BFloat16, float>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
}
}
template <typename Dtype, typename C_Dtype = Dtype>
inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
tunable::GemmStridedBatchedParams<Dtype> params;
params.transa = transa;
params.transb = transb;
params.m = m;
@ -767,19 +847,19 @@ inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES(DType)) {
bool transb_ = ((transb != 'n') && (transb != 'N'));
if (transa_ && transb_) {
static tunable::GemmStridedBatchedTunableOp<DType, tunable::BlasOp::T, tunable::BlasOp::T> bgemm{};
static tunable::GemmStridedBatchedTunableOp<Dtype, tunable::BlasOp::T, tunable::BlasOp::T> bgemm{};
bgemm(&params);
}
else if (transa_ && !transb_) {
static tunable::GemmStridedBatchedTunableOp<DType, tunable::BlasOp::T, tunable::BlasOp::N> bgemm{};
static tunable::GemmStridedBatchedTunableOp<Dtype, tunable::BlasOp::T, tunable::BlasOp::N> bgemm{};
bgemm(&params);
}
else if (!transa_ && transb_) {
static tunable::GemmStridedBatchedTunableOp<DType, tunable::BlasOp::N, tunable::BlasOp::T> bgemm{};
static tunable::GemmStridedBatchedTunableOp<Dtype, tunable::BlasOp::N, tunable::BlasOp::T> bgemm{};
bgemm(&params);
}
else if (!transa_ && !transb_) {
static tunable::GemmStridedBatchedTunableOp<DType, tunable::BlasOp::N, tunable::BlasOp::N> bgemm{};
static tunable::GemmStridedBatchedTunableOp<Dtype, tunable::BlasOp::N, tunable::BlasOp::N> bgemm{};
bgemm(&params);
}
else {
@ -853,9 +933,35 @@ void bgemm<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
}
}
template <typename Dtype>
inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
static_assert(false && sizeof(Dtype), "at::cuda::blas::gemm_internal_cublas: not implemented");
template <>
void bgemm<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) {
#ifdef USE_ROCM
TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported for ROCm");
#endif
// TODO: Support tuning for Half inputs and FP32 output
bgemm_internal<at::Half, float>(CUDABLAS_BGEMM_ARGS(at::Half));
}
template <>
void bgemm<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) {
#ifdef USE_ROCM
TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is not supported for ROCm");
#else
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
if (prop->major < 8)
TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is only supported for CUDA devices with compute capability 8.0 or higher");
#endif
// TODO: Support tuning for BFloat16 inputs and FP32 output
bgemm_internal<at::BFloat16, float>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
}
template <typename Dtype, typename C_Dtype = Dtype>
inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
TORCH_CHECK(false, "at::cuda::blas::gemm: not implemented for input type ", typeid(Dtype).name(), " and output type ", typeid(C_Dtype).name());
}
template <>
@ -914,8 +1020,8 @@ void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::compl
reinterpret_cast<cuComplex*>(c), ldc));
}
template <>
void gemm_internal_cublas<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
template <typename C_Dtype>
inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) {
// See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
@ -994,7 +1100,7 @@ void gemm_internal_cublas<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
ldb,
beta_ptr,
c,
CUDA_R_16F,
std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16F,
ldc,
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@ -1016,14 +1122,14 @@ void gemm_internal_cublas<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
ldb,
&fbeta,
c,
CUDA_R_16F,
std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16F,
ldc));
}
#endif
}
template <>
void gemm_internal_cublas<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
template <typename C_Dtype>
inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) {
globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa);
@ -1060,15 +1166,35 @@ void gemm_internal_cublas<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
ldb,
&fbeta,
c,
CUDA_R_16BF,
std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16BF,
ldc,
compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP));
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
}
template <typename Dtype>
inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
template <>
void gemm_internal_cublas<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
gemm_internal_cublas_half_helper<at::Half>(CUDABLAS_GEMM_ARGS(at::Half));
}
template <>
void gemm_internal_cublas<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) {
gemm_internal_cublas_half_helper<float>(CUDABLAS_GEMM_ARGS(at::Half));
}
template <>
void gemm_internal_cublas<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
gemm_internal_cublas_bfloat16_helper<at::BFloat16>(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
template <>
void gemm_internal_cublas<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) {
gemm_internal_cublas_bfloat16_helper<float>(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
template <typename Dtype, typename C_Dtype = Dtype>
inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
// forward to bgemm implementation but set strides and batches to 0
if (!bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0)) {
gemm_internal_cublas(CUDABLAS_GEMM_ARGS(Dtype));
@ -1180,8 +1306,45 @@ void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16))
}
}
template <typename DType>
inline void gemm_tunable(CUDABLAS_GEMM_ARGTYPES(DType)) {
template<>
void gemm_internal<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float))
{
if (at::globalContext().allowFP16AccumulationCuBLAS()) {
// Do not allow fp16 reductions with fp32 output
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS");
}
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
gemm_internal_cublaslt<at::Half, float>(CUDABLAS_GEMM_ARGS(at::Half));
}
#if defined(USE_ROCM) && !defined(_MSC_VER)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");
}
#endif
else {
gemm_internal_cublas<at::Half, float>(CUDABLAS_GEMM_ARGS(at::Half));
}
}
template<>
void gemm_internal<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float))
{
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
gemm_internal_cublaslt<at::BFloat16, float>(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
#if defined(USE_ROCM) && !defined(_MSC_VER)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");
}
#endif
else {
gemm_internal_cublas<at::BFloat16, float>(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
}
template <typename DType, typename C_Dtype>
inline void gemm_tunable(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(DType, C_Dtype)) {
tunable::GemmParams<DType> params;
params.transa = transa;
params.transb = transb;
@ -1287,8 +1450,32 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
}
}
template <>
void gemm<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) {
#ifdef USE_ROCM
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");
#endif
// TODO: Support Tuning for fp16-fp32 gemm
gemm_internal<at::Half, float>(CUDABLAS_GEMM_ARGS(at::Half));
}
template <typename Dtype>
template <>
void gemm<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) {
#ifdef USE_ROCM
TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm");
#else
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
if (prop->major < 8)
TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is only supported for CUDA devices with compute capability 8.0 or higher");
#endif
// TODO: Support Tuning for bf16-fp32 gemm
gemm_internal<at::BFloat16, float>(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
template <typename Dtype, typename C_Dtype>
bool gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
@ -1301,13 +1488,27 @@ bool gemm_and_bias(
const Dtype* mat2_ptr,
int64_t mat2_ld,
const Dtype* bias,
Dtype* result_ptr,
C_Dtype* result_ptr,
int64_t result_ld,
GEMMAndBiasActivationEpilogue activation) {
if (std::is_same_v<C_Dtype, float> && std::is_same_v<Dtype, at::BFloat16>) {
#ifdef USE_ROCM
TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm");
#endif
} else if (std::is_same_v<C_Dtype, float> && std::is_same_v<Dtype, at::Half>) {
#ifdef USE_ROCM
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");
#endif
if (at::globalContext().allowFP16AccumulationCuBLAS())
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS");
}
using opmath_t = at::opmath_type<Dtype>;
opmath_t beta_val = 0; // bias is added in epilogue
cudaDataType_t abcType = CUDA_R_32F;
cudaDataType_t abType = CUDA_R_32F;
cudaDataType_t cType = CUDA_R_32F;
cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
cudaDataType_t scaleType = CUDA_R_32F;
void * alpha_ptr = &alpha_val;
@ -1317,14 +1518,14 @@ bool gemm_and_bias(
at::Half hbeta_val;
#endif
if constexpr (std::is_same_v<Dtype, double>) {
abcType = CUDA_R_64F;
abType = CUDA_R_64F;
cType = CUDA_R_64F;
computeType = CUBLAS_COMPUTE_64F;
scaleType = CUDA_R_64F;
} else if constexpr (std::is_same_v<Dtype, float>) {
if (at::globalContext().allowTF32CuBLAS()) {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
}
abcType = CUDA_R_32F;
} else if constexpr (std::is_same_v<Dtype, at::Half>) {
#ifndef USE_ROCM
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
@ -1337,9 +1538,11 @@ bool gemm_and_bias(
beta_ptr = &hbeta_val;
}
#endif
abcType = CUDA_R_16F;
abType = CUDA_R_16F;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
abcType = CUDA_R_16BF;
abType = CUDA_R_16BF;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
}
CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType);
@ -1369,9 +1572,9 @@ bool gemm_and_bias(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias);
}
CuBlasLtMatrixLayout Adesc(abcType, m, k, mat1_ld, transpose_mat1);
CuBlasLtMatrixLayout Bdesc(abcType, k, n, mat2_ld, transpose_mat2);
CuBlasLtMatrixLayout Cdesc(abcType, m, n, result_ld);
CuBlasLtMatrixLayout Adesc(abType, m, k, mat1_ld, transpose_mat1);
CuBlasLtMatrixLayout Bdesc(abType, k, n, mat2_ld, transpose_mat2);
CuBlasLtMatrixLayout Cdesc(cType, m, n, result_ld);
CuBlasLtMatmulPreference preference;
// See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind
@ -1449,8 +1652,10 @@ bool gemm_and_bias(
mat2_ld,
" result_ld ",
result_ld,
" abcType ",
abcType,
" abType ",
abType,
" cType ",
cType,
" computeType ",
computeType,
" scaleType ",
@ -1509,6 +1714,22 @@ template bool gemm_and_bias(
int64_t result_ld,
GEMMAndBiasActivationEpilogue activation);
template bool gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<at::Half> alpha_val,
const at::Half* mat1_ptr,
int64_t mat1_ld,
const at::Half* mat2_ptr,
int64_t mat2_ld,
const at::Half* bias,
float* result_ptr,
int64_t result_ld,
GEMMAndBiasActivationEpilogue activation);
template bool gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
@ -1525,6 +1746,22 @@ template bool gemm_and_bias(
int64_t result_ld,
GEMMAndBiasActivationEpilogue activation);
template bool gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<at::BFloat16> alpha_val,
const at::BFloat16* mat1_ptr,
int64_t mat1_ld,
const at::BFloat16* mat2_ptr,
int64_t mat2_ld,
const at::BFloat16* bias,
float* result_ptr,
int64_t result_ld,
GEMMAndBiasActivationEpilogue activation);
void scaled_gemm(
char transa,
char transb,

View File
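The explicit `gemm_and_bias` instantiations added above let the fused bias epilogue take half-precision inputs with a float result buffer. A call-site sketch under the signature shown in this diff (CUDA only per the ROCm checks, and with `allowFP16AccumulationCuBLAS` off); the tensor shapes, names, and wrapper function are illustrative:

```
#include <ATen/ATen.h>
#include <ATen/cuda/CUDABlas.h>

// Illustrative: fp16 mat1 (m x k), fp16 mat2 (k x n), fp16 bias, fp32 result.
// Tensors are created in the column-major layouts the BLAS call expects.
bool half_in_float_out_gemm_and_bias(int64_t m, int64_t n, int64_t k) {
  auto opts   = at::device(at::kCUDA);
  auto mat1   = at::randn({k, m}, opts.dtype(at::kHalf));   // mat1_ld = m
  auto mat2   = at::randn({n, k}, opts.dtype(at::kHalf));   // mat2_ld = k
  auto bias   = at::randn({m},    opts.dtype(at::kHalf));
  auto result = at::empty({n, m}, opts.dtype(at::kFloat));  // result_ld = m

  // Returns false when cuBLASLt cannot serve the configuration, in which
  // case callers fall back to the non-Lt path (see addmm further below).
  return at::cuda::blas::gemm_and_bias<at::Half, float>(
      /*transpose_mat1=*/false, /*transpose_mat2=*/false,
      m, n, k,
      /*alpha_val=*/1.0f,
      mat1.data_ptr<at::Half>(), /*mat1_ld=*/m,
      mat2.data_ptr<at::Half>(), /*mat2_ld=*/k,
      bias.data_ptr<at::Half>(),
      result.data_ptr<float>(), /*result_ld=*/m,
      at::cuda::blas::GEMMAndBiasActivationEpilogue::None);
}
```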

@ -39,18 +39,26 @@ private:
/* LEVEL 3 BLAS FUNCTIONS */
#define CUDABLAS_GEMM_ARGTYPES(Dtype) \
#define CUDABLAS_GEMM_ARGTYPES(Dtype) CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype)
#define CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype) \
char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type<Dtype> alpha, \
const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, at::opmath_type<Dtype> beta,\
Dtype *c, int64_t ldc
C_Dtype *c, int64_t ldc
#define CUDABLAS_GEMM_ARGS(Dtype) transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc
template <typename Dtype>
inline void gemm(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
#define CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT \
((std::is_same<Dtype, at::Half>::value || std::is_same<Dtype, at::BFloat16>::value) && std::is_same<C_Dtype, float>::value)
template <typename Dtype, typename C_Dtype = Dtype, typename std::enable_if<!CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT, Dtype>::type* = nullptr>
inline void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
static_assert(false&&sizeof(Dtype),"at::cuda::blas::gemm: not implemented");
}
template <typename Dtype, typename C_Dtype, typename std::enable_if<CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT, Dtype>::type* = nullptr>
void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype));
template <>
void gemm<double>(CUDABLAS_GEMM_ARGTYPES(double));
template <>
@ -63,9 +71,13 @@ template <>
void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
template <>
void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
template<>
void gemm<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float));
template<>
void gemm<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float));
template <typename Dtype>
inline void gemm_internal(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
template <typename Dtype, typename C_Dtype = Dtype>
inline void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
static_assert(false&&sizeof(Dtype),"at::cuda::blas::gemm_internal: not implemented");
}
@ -81,6 +93,10 @@ template <>
void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
template <>
void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
template<>
void gemm_internal<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float));
template<>
void gemm_internal<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float));
enum GEMMAndBiasActivationEpilogue {
None,
@ -90,7 +106,7 @@ enum GEMMAndBiasActivationEpilogue {
// NOTE: GELU activation is not supported prior to CUDA 11.4 and will
// do nothing if passed in that case.
template <typename Dtype>
template <typename Dtype, typename C_Dtype = Dtype>
bool gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
@ -103,7 +119,7 @@ bool gemm_and_bias(
const Dtype* mat2_ptr,
int64_t mat2_ld,
const Dtype* bias,
Dtype* result_ptr,
C_Dtype* result_ptr,
int64_t result_ld,
GEMMAndBiasActivationEpilogue activation = GEMMAndBiasActivationEpilogue::None);
@ -145,20 +161,25 @@ void scaled_gemm(
bool use_fast_accum,
bool use_rowwise);
#define CUDABLAS_BGEMM_ARGTYPES(Dtype) \
#define CUDABLAS_BGEMM_ARGTYPES(Dtype) CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype)
#define CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype) \
char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type<Dtype> alpha, \
const Dtype *a, int64_t lda, int64_t stridea, \
const Dtype *b, int64_t ldb, int64_t strideb, \
at::opmath_type<Dtype> beta, Dtype *c, int64_t ldc, int64_t stridec, int64_t num_batches
at::opmath_type<Dtype> beta, C_Dtype *c, int64_t ldc, int64_t stridec, int64_t num_batches
#define CUDABLAS_BGEMM_ARGS(Dtype) \
transa, transb, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, beta, c, ldc, stridec, num_batches
template <typename Dtype>
inline void bgemm(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
template <typename Dtype, typename C_Dtype = Dtype, typename std::enable_if<!CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT, Dtype>::type* = nullptr>
inline void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
static_assert(false&&sizeof(Dtype),"at::cuda::blas::bgemm: not implemented");
}
template <typename Dtype, typename C_Dtype, typename std::enable_if<CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT, Dtype>::type* = nullptr>
void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype));
template <>
void bgemm<double>(CUDABLAS_BGEMM_ARGTYPES(double));
template <>
@ -171,9 +192,13 @@ template <>
void bgemm<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half));
template <>
void bgemm<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16));
template<>
void bgemm<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float));
template<>
void bgemm<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float));
template <typename Dtype>
inline void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
template <typename Dtype, typename C_Dtype = Dtype>
inline void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
static_assert(false&&sizeof(Dtype),"at::cuda::blas::bgemm_internal: not implemented");
}
@ -189,6 +214,10 @@ template <>
void bgemm_internal<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half));
template <>
void bgemm_internal<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16));
template<>
void bgemm_internal<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float));
template<>
void bgemm_internal<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float));
#define CUDABLAS_TRSM_ARGTYPES(Dtype) \
cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, \

View File

@ -249,58 +249,13 @@ struct CUDACachingHostAllocatorImpl
}
};
void raw_local_deleter(void* ptr);
DECLARE_HOST_ALLOCATOR(
CUDACachingHostAllocator,
CUDACachingHostAllocatorImpl,
raw_local_deleter,
caching_host_allocator);
struct CUDACachingHostAllocator final
: public CachingHostAllocatorInterface<CUDACachingHostAllocatorImpl> {
at::DataPtr allocate(size_t size) override {
auto ptr_and_ctx = impl_->allocate(size);
return {
ptr_and_ctx.first,
ptr_and_ctx.second,
&raw_local_deleter,
at::DeviceType::CPU};
}
};
CUDACachingHostAllocator caching_host_allocator;
static inline CUDACachingHostAllocator& getCUDACachingHostAllocator() {
return caching_host_allocator;
}
void raw_local_deleter(void* ptr) {
getCUDACachingHostAllocator().free(ptr);
}
REGISTER_HOST_ALLOCATOR(at::kCUDA, &caching_host_allocator)
} // anonymous namespace
bool CachingHostAllocator_recordEvent(
void* ptr,
void* ctx,
at::cuda::CUDAStream stream) {
return getCUDACachingHostAllocator().record_event(ptr, ctx, stream);
}
// Releases cached pinned memory allocations via cudaHostFree
void CachingHostAllocator_emptyCache() {
getCUDACachingHostAllocator().empty_cache();
}
at::Allocator* getCachingHostAllocator() {
return &getCUDACachingHostAllocator();
}
at::HostStats CachingHostAllocator_getStats() {
return getCUDACachingHostAllocator().getStats();
}
void CachingHostAllocator_resetAccumulatedStats() {
return getCUDACachingHostAllocator().resetAccumulatedStats();
}
void CachingHostAllocator_resetPeakStats() {
return getCUDACachingHostAllocator().resetPeakStats();
}
} // namespace at::cuda

View File

@ -18,25 +18,52 @@ namespace at::cuda {
// call between host and device, and passed the corresponding context from the
// allocation. This is currently invoked by at::native::copy_kernel_cuda.
//
TORCH_CUDA_CPP_API c10::Allocator* getCachingHostAllocator();
C10_DEPRECATED_MESSAGE(
"at::cuda::getCachingHostAllocator() is deprecated. Please use at::getHostAllocator(at::kCUDA) instead.")
inline TORCH_CUDA_CPP_API at::HostAllocator* getCachingHostAllocator() {
return at::getHostAllocator(at::kCUDA);
}
// Records an event in the specified stream. The allocation corresponding to the
// input `ptr`/`ctx` will not be re-used until the event has occurred.
TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent(
C10_DEPRECATED_MESSAGE(
"at::cuda::CachingHostAllocator_recordEvent(...) is deprecated. Please use at::getHostAllocator(at::kCUDA)->record_event(...) instead.")
inline TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent(
void* ptr,
void* ctx,
c10::cuda::CUDAStream stream);
// Releases cached pinned memory allocations via cudaHostFree
TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache();
inline TORCH_CUDA_CPP_API at::DataPtr HostAlloc(size_t size) {
return getCachingHostAllocator()->allocate(size);
c10::cuda::CUDAStream stream) {
return getHostAllocator(at::kCUDA)->record_event(ptr, ctx, stream.unwrap());
}
TORCH_CUDA_CPP_API at::HostStats CachingHostAllocator_getStats();
// Releases cached pinned memory allocations via cudaHostFree
C10_DEPRECATED_MESSAGE(
"at::cuda::CachingHostAllocator_emptyCache() is deprecated. Please use at::getHostAllocator(at::kCUDA)->empty_cache() instead.")
inline TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache() {
getHostAllocator(at::kCUDA)->empty_cache();
}
TORCH_CUDA_CPP_API void CachingHostAllocator_resetAccumulatedStats();
TORCH_CUDA_CPP_API void CachingHostAllocator_resetPeakStats();
C10_DEPRECATED_MESSAGE(
"at::cuda::HostAlloc(...) is deprecated. Please use at::getHostAllocator(at::kCUDA)->allocate(...) instead.")
inline TORCH_CUDA_CPP_API at::DataPtr HostAlloc(size_t size) {
return getHostAllocator(at::kCUDA)->allocate(size);
}
C10_DEPRECATED_MESSAGE(
"at::cuda::CachingHostAllocator_getStats() is deprecated. Please use at::getHostAllocator(at::kCUDA)->get_stats() instead.")
inline TORCH_CUDA_CPP_API at::HostStats CachingHostAllocator_getStats() {
return getHostAllocator(at::kCUDA)->get_stats();
}
C10_DEPRECATED_MESSAGE(
"at::cuda::CachingHostAllocator_resetAccumulatedStats() is deprecated. Please use at::getHostAllocator(at::kCUDA)->reset_accumulated_stats() instead.")
inline TORCH_CUDA_CPP_API void CachingHostAllocator_resetAccumulatedStats() {
getHostAllocator(at::kCUDA)->reset_accumulated_stats();
}
C10_DEPRECATED_MESSAGE(
"at::cuda::CachingHostAllocator_resetPeakStats() is deprecated. Please use at::getHostAllocator(at::kCUDA)->reset_peak_stats() instead.")
inline TORCH_CUDA_CPP_API void CachingHostAllocator_resetPeakStats() {
getHostAllocator(at::kCUDA)->reset_peak_stats();
}
} // namespace at::cuda
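Since the old free functions are now header-only deprecation shims, call sites migrate mechanically; a sketch of the mapping the deprecation messages spell out (the function and variable names here are illustrative):

```
#include <ATen/cuda/CachingHostAllocator.h>
#include <c10/cuda/CUDAStream.h>

void migrate_host_allocator_calls(void* ptr, void* ctx,
                                  c10::cuda::CUDAStream stream) {
  // was: at::cuda::HostAlloc(1 << 20);
  at::DataPtr pinned = at::getHostAllocator(at::kCUDA)->allocate(1 << 20);

  // was: at::cuda::CachingHostAllocator_recordEvent(ptr, ctx, stream);
  at::getHostAllocator(at::kCUDA)->record_event(ptr, ctx, stream.unwrap());

  // was: at::cuda::CachingHostAllocator_emptyCache();
  at::getHostAllocator(at::kCUDA)->empty_cache();
}
```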

View File

@ -1,11 +1,10 @@
#pragma once
#include <c10/core/Allocator.h>
#include <ATen/cuda/CachingHostAllocator.h>
namespace at::cuda {
inline TORCH_CUDA_CPP_API at::Allocator* getPinnedMemoryAllocator() {
return getCachingHostAllocator();
inline TORCH_CUDA_CPP_API at::HostAllocator* getPinnedMemoryAllocator() {
return at::getHostAllocator(at::kCUDA);
}
} // namespace at::cuda

View File

@ -469,7 +469,7 @@ private:
bool duplicate_inputs_{false};
};
template <typename T>
template <typename T, typename C_Dtype = T>
struct GemmStridedBatchedParams : OpParams {
std::string BLASSignature() const override {
std::string alpha_str = to_string_opmath<T>(alpha);
@ -477,7 +477,7 @@ struct GemmStridedBatchedParams : OpParams {
return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: %ld, stride_b: %ld, stride_c: %ld, stride_d: %ld, "
"alpha: %s, beta: %s, transA: %c, transB: %c, batch_count: %ld, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }",
m, n, k, lda, ldb, ldc, ldc, stride_a, stride_b, stride_c, stride_c, alpha_str, beta_str, transa, transb, batch,
BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), ComputeTypeFor<T>(), ComputeTypeFor<T>());
BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<C_Dtype>(C_Dtype{}), BLASTypeName<T>(T{}), ComputeTypeFor<T>(), ComputeTypeFor<T>());
}
std::string Signature() const override {
@ -517,7 +517,7 @@ struct GemmStridedBatchedParams : OpParams {
c10::DeviceIndex device = 0;
AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
size_t c_size = GetSizeC();
copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
copy->c = static_cast<C_Dtype*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
if (duplicate_inputs) {
@ -544,7 +544,7 @@ struct GemmStridedBatchedParams : OpParams {
}
TuningStatus NumericalCheck(GemmStridedBatchedParams<T> *other) {
auto c_dtype = c10::CppTypeToScalarType<T>::value;
auto c_dtype = c10::CppTypeToScalarType<C_Dtype>::value;
return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL;
}
@ -561,7 +561,7 @@ struct GemmStridedBatchedParams : OpParams {
int64_t ldb{};
int64_t stride_b{};
at::opmath_type<T> beta;
T* c{};
C_Dtype* c{};
int64_t ldc{};
int64_t stride_c{};
int64_t batch{};

View File

@ -116,21 +116,44 @@ void fp16_gemv_trans(
fp16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
}
static void bf16_gemv_trans(
const int m,
const int n,
const at::BFloat16 alpha,
const at::BFloat16* a,
const int lda,
const at::BFloat16* x,
const int incx,
const at::BFloat16 beta,
at::BFloat16* y,
const int incy);
#endif // !defined(C10_MOBILE)
#if defined(__aarch64__) && !defined(C10_MOBILE)
#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
static void fp16_gemv_notrans_fp16_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) {
for (auto j = 0; j < n; j++) {
auto vecCol = vdup_n_f16(x[j]);
const auto* column = a + lda * j;
for (auto i = 0; i < m; i += 4) {
auto yf16 = y + i;
auto matRow = vld1_f16(column + i);
auto resVec = j != 0 ? vld1_f16(yf16) : vdup_n_f16(0);
resVec = vfma_lane_f16(resVec, matRow, vecCol, 0);
vst1_f16(yf16, resVec);
}
}
}
#endif
static void fp16_gemv_notrans_fp32_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) {
std::vector<float> sum(m);
for (auto j = 0; j < n; j++) {
auto vecCol = vdup_n_f32(x[j]);
const auto* column = a + lda * j;
for (auto i = 0; i < m; i += 4) {
auto sf32 = sum.data() + i;
auto matRow = vcvt_f32_f16(vld1_f16(column + i));
auto resVec = j != 0 ? vld1q_f32(sf32) : vdupq_n_f32(0);
resVec = vfmaq_lane_f32(resVec, matRow, vecCol, 0);
vst1q_f32(sf32, resVec);
}
}
for (auto i = 0; i < m; i+= 4) {
vst1_f16(y + i, vcvt_f16_f32(vld1q_f32(sum.data() + i)));
}
}
void fp16_gemv_notrans(
const int m,
const int n,
@ -143,6 +166,44 @@ void fp16_gemv_notrans(
Half* y,
const int incy);
void fp16_gemv_notrans(
const int m,
const int n,
const float alpha,
const Half* a,
const int lda,
const Half* x,
const int incx,
const float beta,
Half* y,
const int incy) {
if (incx == 1 && alpha == 1.0 && beta == 0.0 && m % 4 == 0 && incy == 1) {
#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
if (at::globalContext().allowFP16ReductionCPU()) {
return fp16_gemv_notrans_fp16_arith(m, n, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(x), reinterpret_cast<float16_t*>(y));
}
#endif
return fp16_gemv_notrans_fp32_arith(m, n, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(x), reinterpret_cast<float16_t*>(y));
}
std::vector<float> sum(m);
for (const auto j : c10::irange(n)) {
const auto* column_ = a + lda * j;
auto z = alpha * x[j * incx];
for (const auto i : c10::irange(m)) {
sum[i] += z * column_[i];
}
}
if (beta == 0.0) {
for (const auto i : c10::irange(m)) {
y[i * incy] = sum[i];
}
} else {
for (const auto i : c10::irange(m)) {
y[i * incy] += sum[i];
}
}
}
#endif // defined(__aarch64__) && !defined(C10_MOBILE)
template <typename scalar_t>
@ -258,10 +319,6 @@ template <>
void gemv_fast_path<float>(const char *trans, const int *m, const int *n, const float *alpha, const float *a, const int *lda, const float *x, const int *incx, const float *beta, float *y, const int *incy) {
sgemv_(remove_const(trans), remove_const(m), remove_const(n), remove_const(alpha), remove_const(a), remove_const(lda), remove_const(x), remove_const(incx), remove_const(beta), y, remove_const(incy));
}
#else
INSTANTIATE(float)
INSTANTIATE(double)
#endif // AT_BUILD_WITH_BLAS
INSTANTIATE(uint8_t)
INSTANTIATE(int8_t)
@ -283,7 +340,7 @@ bool gemv_use_fast_path<at::BFloat16>(
beta == 0.0;
}
void bf16_gemv_trans(
static void bf16_gemv_trans(
const int m,
const int n,
const at::BFloat16 alpha,
@ -368,14 +425,7 @@ void gemv_fast_path<at::Half>(
y,
*incy);
}
#else
template <>
bool scal_use_fast_path<at::Half>(
[[maybe_unused]] int64_t n,
[[maybe_unused]] int64_t incx) {
return false;
}
#else // !defined(__aarch64__))
template <>
bool gemv_use_fast_path<at::Half>(
char trans,
@ -391,79 +441,6 @@ bool gemv_use_fast_path<at::Half>(
(c10::detail::fp16_from_bits(beta.x) == 0.0f || trans == 't' || trans == 'T');
}
#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
static void fp16_gemv_notrans_fp16_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) {
for (auto j = 0; j < n; j++) {
auto vecCol = vdup_n_f16(x[j]);
const auto* column = a + lda * j;
for (auto i = 0; i < m; i += 4) {
auto yf16 = y + i;
auto matRow = vld1_f16(column + i);
auto resVec = j != 0 ? vld1_f16(yf16) : vdup_n_f16(0);
resVec = vfma_lane_f16(resVec, matRow, vecCol, 0);
vst1_f16(yf16, resVec);
}
}
}
#endif
static void fp16_gemv_notrans_fp32_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) {
std::vector<float> sum(m);
for (auto j = 0; j < n; j++) {
auto vecCol = vdup_n_f32(x[j]);
const auto* column = a + lda * j;
for (auto i = 0; i < m; i += 4) {
auto sf32 = sum.data() + i;
auto matRow = vcvt_f32_f16(vld1_f16(column + i));
auto resVec = j != 0 ? vld1q_f32(sf32) : vdupq_n_f32(0);
resVec = vfmaq_lane_f32(resVec, matRow, vecCol, 0);
vst1q_f32(sf32, resVec);
}
}
for (auto i = 0; i < m; i+= 4) {
vst1_f16(y + i, vcvt_f16_f32(vld1q_f32(sum.data() + i)));
}
}
void fp16_gemv_notrans(
const int m,
const int n,
const float alpha,
const Half* a,
const int lda,
const Half* x,
const int incx,
const float beta,
Half* y,
const int incy) {
if (incx == 1 && alpha == 1.0 && beta == 0.0 && m % 4 == 0 && incy == 1) {
#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
if (at::globalContext().allowFP16ReductionCPU()) {
return fp16_gemv_notrans_fp16_arith(m, n, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(x), reinterpret_cast<float16_t*>(y));
}
#endif
return fp16_gemv_notrans_fp32_arith(m, n, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(x), reinterpret_cast<float16_t*>(y));
}
std::vector<float> sum(m);
for (const auto j : c10::irange(n)) {
const auto* column_ = a + lda * j;
auto z = alpha * x[j * incx];
for (const auto i : c10::irange(m)) {
sum[i] += z * column_[i];
}
}
if (beta == 0.0) {
for (const auto i : c10::irange(m)) {
y[i * incy] = sum[i];
}
} else {
for (const auto i : c10::irange(m)) {
y[i * incy] += sum[i];
}
}
}
template <>
void gemv_fast_path<at::Half>(
const char* trans,
@ -511,6 +488,7 @@ void gemv_fast_path<at::Half>(
INSTANTIATE(c10::Half)
INSTANTIATE(c10::BFloat16)
#endif // !defined(C10_MOBILE)
#endif // AT_BUILD_WITH_BLAS
#undef INSTANTIATE
} // namespace blas_impl

View File

@ -448,11 +448,8 @@ inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor
return false;
}
auto is_channel_last = [](const at::Tensor& t) {
auto fmt = t.suggest_memory_format();
return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d;
};
return is_channel_last(input) || is_channel_last(weight);
auto fmt = input.suggest_memory_format();
return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d;
}
} // namespace at::native

View File

@ -607,7 +607,7 @@ struct ConvParams {
// nInputPlane and nInputPlane == nOutputPlane (the latter due to the lack of
// a depthwise multiplier)
bool is_depthwise(const at::Tensor& input, const at::Tensor& weight) const {
return (input.is_cuda() || input.is_xpu()) &&
return input.is_cuda() &&
!transposed &&
(input.ndimension() == 4 || input.ndimension() == 5) &&
at::symint::size<T>(input, 1) == groups &&
@ -1223,12 +1223,6 @@ static ConvBackend _select_conv_backend(
return ConvBackend::Cudnn;
} else if (params.use_miopen(input, weight, bias_sizes_opt.has_value())) {
return ConvBackend::MiopenDepthwise;
} else if (params.use_mkldnn(input, weight)) {
if (params.transposed) {
return ConvBackend::MkldnnTranspose;
} else {
return ConvBackend::Mkldnn;
}
} else {
if (input.ndimension() == 4) {
return ConvBackend::CudaDepthwise2d;

View File

@ -159,11 +159,11 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
Tensor left = left_;
Tensor right = right_;
for (const auto i : c10::irange(dim)) {
auto sl = left.sym_size(i)!=1;
auto sr = right.sym_size(i)!=1;
auto sl = TORCH_GUARD_SIZE_OBLIVIOUS(left.sym_size(i).sym_ne(1));
auto sr = TORCH_GUARD_SIZE_OBLIVIOUS(right.sym_size(i).sym_ne(1));
if (sum_dims[i]) { // first dimensions that will be summed over after multiplication
if (sl && sr) { // dimensions nontrivially in both left and right must be of the same size
TORCH_CHECK(left.sym_size(i)==right.sym_size(i), "non-broadcast dimensions must match");
TORCH_SYM_CHECK(left.sym_size(i).sym_eq(right.sym_size(i)), "non-broadcast dimensions must match");
sum_size *= left.sym_size(i);
} else if (sl) { // if it is only in one of left and right, we can sum right away
left = left.sum(i, true);
@ -172,7 +172,7 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
}
} else if (sl && sr) { // now deal with dimensions that will be in the output
// dimensions nontrivially in both left and right must be of the same size
TORCH_CHECK(left.sym_size(i)==right.sym_size(i), "non-broadcast dimensions must match");
TORCH_SYM_CHECK(left.sym_size(i).sym_eq(right.sym_size(i)), "non-broadcast dimensions must match");
lro.push_back(i);
lro_size *= left.sym_size(i);
} else if (sl) { // keep track of dimensions appearing only once
@ -482,10 +482,10 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr
// Iterate over each dimension covered by ellipsis
const auto ndim = operands[i].ndimension() - (static_cast<int64_t>(op_labels[i].size()) - 1);
for (auto j = ell_num_dim - ndim; j < ell_num_dim; ++j) {
if (op.sym_size(dim) != 1) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(op.sym_size(dim).sym_ne(1))) {
// Update ellipsis size
TORCH_CHECK(
ell_sizes[j] == 1 || ell_sizes[j] == op.sym_size(dim),
TORCH_SYM_CHECK(
ell_sizes[j].sym_eq(1).sym_or(ell_sizes[j].sym_eq(op.sym_size(dim))),
"einsum(): dimension ",
dim,
" covered by ellipsis in operand ",
@ -501,10 +501,10 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr
permutation[ell_index + j] = dim++;
}
} else if (permutation[label_perm_index[s]] == -1) {
if (op.sym_size(dim) != 1) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(op.sym_size(dim).sym_ne(1))) {
// Update subscript
TORCH_CHECK(
label_size[s] == 1 || label_size[s] == op.sym_size(dim),
TORCH_SYM_CHECK(
label_size[s].sym_eq(1).sym_or(label_size[s].sym_eq(op.sym_size(dim))),
"einsum(): subscript ",
subscript_to_label(s),
" has size ",
@ -579,16 +579,17 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr
SmallVector<int64_t, 5> a_dims_to_sum;
SmallVector<int64_t, 5> b_dims_to_sum;
for (auto dim = out_num_dim; dim < perm_index; ++dim) {
if (a.sym_size(dim) != 1 && b.sym_size(dim) != 1) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(a.sym_size(dim).sym_ne(1))
&& TORCH_GUARD_SIZE_OBLIVIOUS(b.sym_size(dim).sym_ne(1))) {
if (--dim_counts[dim] == 1) {
sum_dims.push_back(dim);
dim_counts[dim] = 0;
}
} else if (dim_counts[dim] == 1) {
if (a.sym_size(dim) != 1) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(a.sym_size(dim).sym_ne(1))) {
a_dims_to_sum.push_back(dim);
dim_counts[dim] = 0;
} else if (b.sym_size(dim) != 1) {
} else if (TORCH_GUARD_SIZE_OBLIVIOUS(b.sym_size(dim).sym_ne(1))) {
b_dims_to_sum.push_back(dim);
dim_counts[dim] = 0;
}

View File

@ -1023,22 +1023,9 @@ static Tensor _mask_to_indices(const Tensor& mask) {
}
static std::pair<Tensor, Tensor> _not_zero_mask_to_col_row_indices(
Tensor not_zero_mask,
ScalarType index_dtype,
Device index_device) {
auto col_indices =
at::native::arange(
not_zero_mask.size(-1), index_dtype, kStrided, index_device)
.view({1, not_zero_mask.size(-1)})
.expand_as(not_zero_mask)
.masked_select(not_zero_mask);
auto row_indices =
at::native::arange(
not_zero_mask.size(-2), index_dtype, kStrided, index_device)
.view({not_zero_mask.size(-2), 1})
.expand_as(not_zero_mask)
.masked_select(not_zero_mask);
return std::pair<Tensor, Tensor>(col_indices, row_indices);
Tensor not_zero_mask) {
auto nz = not_zero_mask.nonzero();
return {nz.select(1, 1), nz.select(1, 0)};
}
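The rewritten helper leans on `nonzero()` returning an N x ndim matrix of coordinates in row-major order, so column 0 carries the row indices and column 1 the column indices of the nonzero entries. A small standalone illustration (libtorch, made-up values):

```
#include <torch/torch.h>
#include <iostream>

void not_zero_mask_indices_example() {
  // mask = [[0, 1, 0],
  //         [1, 0, 1]]
  auto mask = torch::tensor({{0, 1, 0}, {1, 0, 1}});
  auto nz = mask.nonzero();            // [[0, 1], [1, 0], [1, 2]]
  auto col_indices = nz.select(1, 1);  // [1, 0, 2]
  auto row_indices = nz.select(1, 0);  // [0, 1, 1]
  std::cout << col_indices << "\n" << row_indices << std::endl;
}
```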
// Sparse layout conversions Start
@ -1319,8 +1306,8 @@ static Tensor dense_to_sparse_compressed(
Tensor col_indices;
Tensor compressed_indices;
if (compressed_rows_layout) {
std::tie(col_indices, row_indices) = _not_zero_mask_to_col_row_indices(
not_zero_mask, at::kLong, not_zero_mask.device());
std::tie(col_indices, row_indices) =
_not_zero_mask_to_col_row_indices(not_zero_mask);
compressed_indices = at::_convert_indices_from_coo_to_csr(
row_indices, not_zero_mask.size(0), false /*out_int32*/);
{
@ -1328,8 +1315,8 @@ static Tensor dense_to_sparse_compressed(
values = values.flatten(0, 1).index_select(0, mask_indices);
}
} else {
std::tie(row_indices, col_indices) = _not_zero_mask_to_col_row_indices(
not_zero_mask.transpose(1, 0), at::kLong, not_zero_mask.device());
std::tie(row_indices, col_indices) =
_not_zero_mask_to_col_row_indices(not_zero_mask.transpose(1, 0));
compressed_indices = at::_convert_indices_from_coo_to_csr(
col_indices, not_zero_mask.size(-1), false /*out_int32*/);
{

View File

@ -3366,7 +3366,7 @@ static std::vector<Tensor> _pad_chunk(
std::vector<int64_t> view_sizes(
tensor_size.begin(), tensor_size.begin() + dim);
view_sizes.insert(view_sizes.end(), {num_chunks, -1});
padded_tensors.push_back(padded_tensor.view(view_sizes));
padded_tensors.push_back(padded_tensor.reshape(view_sizes));
}
return padded_tensors;
}
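The switch from `view` to `reshape` above matters when `padded_tensor` ends up non-contiguous: `reshape` falls back to a copy where `view` would throw, which is presumably the situation this change addresses. A quick illustration of that difference in isolation:

```
#include <torch/torch.h>

void view_vs_reshape_example() {
  // A transposed tensor is a classic non-contiguous case.
  auto t = torch::arange(6).reshape({2, 3}).transpose(0, 1);  // 3x2, strides {1, 3}
  // t.view({6});            // would throw: size/stride not compatible with view
  auto r = t.reshape({6});   // fine: materializes a contiguous copy first
  (void)r;
}
```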

View File

@ -347,7 +347,8 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
#endif
// if lt path fails, we recurse back into this function here and force the lt path to off
disable_addmm_cuda_lt |= disable_addmm_cuda_lt_override;
at::ScalarType scalar_type = self.scalar_type();
at::ScalarType scalar_type = mat1.scalar_type();
bool is_float_output_with_half_input = (scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;
c10::MaybeOwned<Tensor> self_;
if (&result != &self) {
#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || defined(USE_ROCM)
@ -442,7 +443,10 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
if (useLtInterface) {
#if defined(USE_ROCM)
bool okay = true;
AT_DISPATCH_FLOATING_TYPES_AND2(
if (is_float_output_with_half_input) {
TORCH_CHECK(false, "float output with half input is not enabled for ROCm");
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(
at::ScalarType::Half,
at::ScalarType::BFloat16,
scalar_type,
@ -456,26 +460,27 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
(&result != &self) ? self.const_data_ptr<scalar_t>() : nullptr,
activation_to_gemm_and_blas_arg(activation));
}
else {
okay = at::cuda::blas::gemm_and_bias<scalar_t>(
args.transa == 't',
args.transb == 't',
args.m,
args.n,
args.k,
alpha.to<at::opmath_type<scalar_t>>(),
args.mata->const_data_ptr<scalar_t>(),
args.lda,
args.matb->const_data_ptr<scalar_t>(),
args.ldb,
// This condition is needed for mm case on ROCm for hipblasLt path.
// Passing the bias ptr as null to avoid accuracy issues for mm case.
(&result != &self) ? self.const_data_ptr<scalar_t>() : nullptr,
args.result->data_ptr<scalar_t>(),
args.result_ld,
activation_to_gemm_and_blas_arg(activation)
);
}});
okay = at::cuda::blas::gemm_and_bias<scalar_t>(
args.transa == 't',
args.transb == 't',
args.m,
args.n,
args.k,
alpha.to<at::opmath_type<scalar_t>>(),
args.mata->const_data_ptr<scalar_t>(),
args.lda,
args.matb->const_data_ptr<scalar_t>(),
args.ldb,
// This condition is needed for mm case on ROCm for hipblasLt path.
// Passing the bias ptr as null to avoid accuracy issues for mm case.
(&result != &self) ? self.const_data_ptr<scalar_t>() : nullptr,
args.result->data_ptr<scalar_t>(),
args.result_ld,
activation_to_gemm_and_blas_arg(activation)
);
});
}
if (!okay) {
// lt path failed; recurse but disable lt path
return addmm_out_cuda_impl(result, self, mat1, mat2, beta, alpha, activation, true);
@ -492,7 +497,35 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
#endif
bool okay = true;
AT_DISPATCH_FLOATING_TYPES_AND2(
if (is_float_output_with_half_input) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(
scalar_type,
"addmm_cuda_lt",
[&] {
auto tuning_ctx = at::cuda::tunable::getTuningContext();
if (tuning_ctx->IsTunableOpEnabled()) {
TORCH_CHECK(false, "Tunable GEMM is not supported for float output with reduced float input");
}
else {
okay = at::cuda::blas::gemm_and_bias<scalar_t, float>(
args.transa == 't',
args.transb == 't',
args.m,
args.n,
args.k,
alpha.to<at::opmath_type<scalar_t>>(),
args.mata->const_data_ptr<scalar_t>(),
args.lda,
args.matb->const_data_ptr<scalar_t>(),
args.ldb,
self.const_data_ptr<scalar_t>(),
args.result->data_ptr<float>(),
args.result_ld,
activation_epilogue
);
}});
} else {
AT_DISPATCH_FLOATING_TYPES_AND2(
at::ScalarType::Half,
at::ScalarType::BFloat16,
scalar_type,
@ -523,7 +556,8 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
args.result_ld,
activation_epilogue
);
}});
}});
}
if (!okay) {
// lt path failed; recurse but disable lt path
return addmm_out_cuda_impl(result, self, mat1, mat2, beta, alpha, activation, true);
@ -531,7 +565,35 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
#endif
} else
{
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
if (is_float_output_with_half_input) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(
scalar_type,
"addmm_cuda",
[&] {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t alpha_val = alpha.to<opmath_t>();
opmath_t beta_val = beta.to<opmath_t>();
const scalar_t* mat1_ptr = args.mata->const_data_ptr<scalar_t>();
const scalar_t* mat2_ptr = args.matb->const_data_ptr<scalar_t>();
float* result_ptr = args.result->mutable_data_ptr<float>();
at::cuda::blas::gemm<scalar_t, float>(
args.transa,
args.transb,
args.m,
args.n,
args.k,
alpha_val,
mat1_ptr,
args.lda,
mat2_ptr,
args.ldb,
beta_val,
result_ptr,
args.result_ld);
});
} else {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(
at::ScalarType::Half,
at::ScalarType::BFloat16,
scalar_type,
@ -558,6 +620,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma
result_ptr,
args.result_ld);
});
}
switch (activation) {
case Activation::RELU:
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
@ -630,39 +693,77 @@ const Tensor& baddbmm_out_cuda_impl(const Tensor& result, const Tensor& self, co
int64_t num_batches = result_->sizes()[0];
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!result_->is_conj());
bool is_float_output_with_half_input = (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16) && result.scalar_type() == at::ScalarType::Float;
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "baddbmm_cuda", [&] {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t alpha_val = alpha.to<opmath_t>();
opmath_t beta_val = beta.to<opmath_t>();
const scalar_t* batch1_ptr = batch1_->const_data_ptr<scalar_t>();
const scalar_t* batch2_ptr = batch2_->const_data_ptr<scalar_t>();
scalar_t* result_ptr = result_->mutable_data_ptr<scalar_t>();
const auto transa = transpose_batch1 ? batch1_->is_conj() ? 'c' : 't' : 'n';
const auto transb = transpose_batch2 ? batch2_->is_conj() ? 'c' : 't' : 'n';
// If batch is 1 call gemm rather than bgemm
if (num_batches == 1) {
at::cuda::blas::gemm<scalar_t>(
if (is_float_output_with_half_input) {
AT_DISPATCH_REDUCED_FLOATING_TYPES(batch1.scalar_type(), "baddbmm_cuda", [&] {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t alpha_val = alpha.to<opmath_t>();
opmath_t beta_val = beta.to<opmath_t>();
const scalar_t* batch1_ptr = batch1_->const_data_ptr<scalar_t>();
const scalar_t* batch2_ptr = batch2_->const_data_ptr<scalar_t>();
const auto transa = transpose_batch1 ? batch1_->is_conj() ? 'c' : 't' : 'n';
const auto transb = transpose_batch2 ? batch2_->is_conj() ? 'c' : 't' : 'n';
float* result_ptr = result_->mutable_data_ptr<float>();
// If batch is 1 call gemm rather than bgemm
if (num_batches == 1) {
at::cuda::blas::gemm<scalar_t, float>(
transa, transb,
m, n, k,
alpha_val,
batch1_ptr, lda,
batch2_ptr, ldb,
beta_val,
result_ptr, ldc);
} else {
at::cuda::blas::bgemm<scalar_t, float>(
transa, transb,
m, n, k,
alpha_val,
batch1_ptr, lda, batch1_->strides()[0],
batch2_ptr, ldb, batch2_->strides()[0],
beta_val,
result_ptr, ldc, result_->strides()[0],
num_batches
);
}
});
} else {
AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, batch1.scalar_type(), "baddbmm_cuda", [&] {
using opmath_t = at::opmath_type<scalar_t>;
opmath_t alpha_val = alpha.to<opmath_t>();
opmath_t beta_val = beta.to<opmath_t>();
const scalar_t* batch1_ptr = batch1_->const_data_ptr<scalar_t>();
const scalar_t* batch2_ptr = batch2_->const_data_ptr<scalar_t>();
const auto transa = transpose_batch1 ? batch1_->is_conj() ? 'c' : 't' : 'n';
const auto transb = transpose_batch2 ? batch2_->is_conj() ? 'c' : 't' : 'n';
scalar_t* result_ptr = result_->mutable_data_ptr<scalar_t>();
// If batch is 1 call gemm rather than bgemm
if (num_batches == 1) {
at::cuda::blas::gemm<scalar_t>(
transa, transb,
m, n, k,
alpha_val,
batch1_ptr, lda,
batch2_ptr, ldb,
beta_val,
result_ptr, ldc);
} else {
at::cuda::blas::bgemm<scalar_t>(
transa, transb,
m, n, k,
alpha_val,
batch1_ptr, lda,
batch2_ptr, ldb,
batch1_ptr, lda, batch1_->strides()[0],
batch2_ptr, ldb, batch2_->strides()[0],
beta_val,
result_ptr, ldc);
} else {
at::cuda::blas::bgemm<scalar_t>(
transa, transb,
m, n, k,
alpha_val,
batch1_ptr, lda, batch1_->strides()[0],
batch2_ptr, ldb, batch2_->strides()[0],
beta_val,
result_ptr, ldc, result_->strides()[0],
num_batches
);
}
});
result_ptr, ldc, result_->strides()[0],
num_batches
);
}
});
}
if (!result.is_same(*result_)) {
result.copy_(*result_);
}
@ -1588,6 +1689,88 @@ std::optional<c10::ScalarType> out_dtype) {
#endif
}
Tensor _bmm_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype) {
IntArrayRef batch1_sizes = batch1.sizes();
IntArrayRef batch2_sizes = batch2.sizes();
Tensor out = at::empty({batch1_sizes[0], batch1_sizes[1], batch2_sizes[2]}, batch1.options().dtype(out_dtype));
return _bmm_out_dtype_cuda(batch1, batch2, out_dtype, out);
}
Tensor& _bmm_out_dtype_cuda(const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, Tensor &out) {
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
TORCH_CHECK(out_dtype == batch1.scalar_type() ||
(out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
Scalar beta(0.0);
Scalar alpha(1.0);
{
NoNamesGuard guard;
baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha);
}
return out;
}
Tensor _baddbmm_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
// We need to copy the tensor
Tensor out = self.clone().to(self.options().dtype(out_dtype));
return _baddbmm_out_dtype_cuda(out, batch1, batch2, out_dtype, beta, alpha, out);
}
Tensor& _baddbmm_out_dtype_cuda(const Tensor& self, const Tensor& batch1, const Tensor& batch2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
TORCH_CHECK(out_dtype == batch1.scalar_type() ||
(out_dtype == at::ScalarType::Float && (batch1.scalar_type() == at::ScalarType::Half || batch1.scalar_type() == at::ScalarType::BFloat16)),
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
{
NoNamesGuard guard;
baddbmm_out_cuda_impl(out, out, batch1, batch2, beta, alpha);
}
return out;
}
Tensor _mm_dtype_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarType out_dtype) {
Tensor result = at::empty({self.size(0), mat2.size(1)}, self.options().dtype(out_dtype));
return _mm_dtype_out_cuda(self, mat2, out_dtype, result);
}
Tensor& _mm_dtype_out_cuda(const Tensor& self, const Tensor& mat2, const at::ScalarType out_dtype, Tensor &out) {
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
TORCH_CHECK(self.scalar_type() == mat2.scalar_type(), "input dtypes must be the same");
TORCH_CHECK(out_dtype == self.scalar_type() ||
(out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)),
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
addmm_out_cuda_impl(const_cast<Tensor&>(out), out, self, mat2, 0, 1);
return out;
}
Tensor _addmm_dtype_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha) {
Tensor result = at::empty(self.sizes(), self.options().dtype(out_dtype));
return _addmm_dtype_out_cuda(self, mat1, mat2, out_dtype, beta, alpha, result);
}
Tensor& _addmm_dtype_out_cuda(const Tensor& self, const Tensor& mat1, const Tensor& mat2, const at::ScalarType out_dtype, const Scalar& beta, const Scalar& alpha, Tensor &out) {
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
TORCH_CHECK(out_dtype == self.scalar_type() ||
(out_dtype == at::ScalarType::Float && (self.scalar_type() == at::ScalarType::Half || self.scalar_type() == at::ScalarType::BFloat16)),
"out_dtype must be the same as input dtype or fp32 for fp16/bf16 inputs");
TORCH_CHECK(out_dtype == out.scalar_type(), "out_dtype must be the same as the dtype of the provided out tensor");
addmm_out_cuda_impl(out, self, mat1, mat2, beta, alpha);
return out;
}
} // namespace at::native
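The *_dtype entry points above produce an fp32 result from fp16/bf16 inputs without the caller materializing upcast copies. Purely as a reference for the intended numerics (not the new API itself, whose public binding is not shown in this diff), a sketch with the existing stable API, assuming a CUDA-enabled libtorch build:

#include <torch/torch.h>

int main() {
  if (!torch::cuda::is_available()) {
    return 0;  // nothing to demonstrate without a CUDA device
  }
  auto opts = torch::TensorOptions().device(torch::kCUDA).dtype(torch::kHalf);
  auto a = torch::randn({64, 32}, opts);
  auto b = torch::randn({32, 16}, opts);
  // Upcast-then-mm baseline: fp32 accumulation and an fp32 result.
  auto ref = torch::mm(a.to(torch::kFloat), b.to(torch::kFloat));
  TORCH_CHECK(ref.scalar_type() == torch::kFloat);
  return 0;
}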


@ -612,28 +612,41 @@ struct check_binary_functor_types_for_specialization<
};
// The following is a list of type specializations for vectorized_templated
// elementwise kernel. It refers to the first and second runtime types of the
// arguments of a binary functor.
// elementwise kernel. The three types refer to runtime types of the output
// tensor, first tensor argument, and the second tensor argument used for a
// binary functor.
constexpr std::array rt_binary_specializations = {
std::array<c10::ScalarType, 2>(
std::array<c10::ScalarType, 3>(
{c10::CppTypeToScalarType<float>::value,
c10::CppTypeToScalarType<float>::value,
c10::CppTypeToScalarType<BFloat16>::value}),
std::array<c10::ScalarType, 2>(
{c10::CppTypeToScalarType<BFloat16>::value,
c10::CppTypeToScalarType<float>::value}),
std::array<c10::ScalarType, 2>(
std::array<c10::ScalarType, 3>(
{c10::CppTypeToScalarType<float>::value,
c10::CppTypeToScalarType<BFloat16>::value,
c10::CppTypeToScalarType<float>::value}),
std::array<c10::ScalarType, 3>(
{c10::CppTypeToScalarType<BFloat16>::value,
c10::CppTypeToScalarType<BFloat16>::value,
c10::CppTypeToScalarType<float>::value}),
std::array<c10::ScalarType, 3>(
{c10::CppTypeToScalarType<float>::value,
c10::CppTypeToScalarType<float>::value,
c10::CppTypeToScalarType<Half>::value}),
std::array<c10::ScalarType, 2>(
std::array<c10::ScalarType, 3>(
{c10::CppTypeToScalarType<float>::value,
c10::CppTypeToScalarType<Half>::value,
c10::CppTypeToScalarType<float>::value}),
std::array<c10::ScalarType, 3>(
{c10::CppTypeToScalarType<Half>::value,
c10::CppTypeToScalarType<Half>::value,
c10::CppTypeToScalarType<float>::value})};
bool check_binary_rt_types_for_specialization(TensorIteratorBase& iter) {
if (iter.ninputs() != 2)
return false;
for (auto spec : rt_binary_specializations)
if (iter.input_dtype(0) == spec[0] && iter.input_dtype(1) == spec[1])
if (iter.dtype(0) == spec[0] && iter.input_dtype(0) == spec[1] &&
iter.input_dtype(1) == spec[2])
return true;
return false;
}
@ -648,6 +661,7 @@ struct type_specialized_kernel_launcher {
typename loader_t,
typename storer_t>
static void apply(
ScalarType ret_t,
ScalarType arg0_t,
ScalarType arg1_t,
int64_t numel,
@ -657,10 +671,9 @@ struct type_specialized_kernel_launcher {
out_calc_t output_offset_calculator,
loader_t loader,
storer_t storer) {
using traits = function_traits<func_t>;
using return_t = typename traits::result_type;
if (arg0_t == rt_binary_specializations[arg_index][0] &&
arg1_t == rt_binary_specializations[arg_index][1])
if (ret_t == rt_binary_specializations[arg_index][0] &&
arg0_t == rt_binary_specializations[arg_index][1] &&
arg1_t == rt_binary_specializations[arg_index][2])
launch_vectorized_templated_kernel<
func_t,
array_t,
@ -668,11 +681,12 @@ struct type_specialized_kernel_launcher {
out_calc_t,
loader_t,
storer_t,
return_t,
decltype(c10::impl::ScalarTypeToCPPType<
rt_binary_specializations[arg_index][0]>::t),
decltype(c10::impl::ScalarTypeToCPPType<
rt_binary_specializations[arg_index][1]>::t)>(
rt_binary_specializations[arg_index][1]>::t),
decltype(c10::impl::ScalarTypeToCPPType<
rt_binary_specializations[arg_index][2]>::t)>(
numel,
f,
data,
@ -712,7 +726,6 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
#ifdef USE_ROCM
// Attempt to call specialized vectorized elementwise kernel
// that enables interleaving.
if (check_binary_rt_types_for_specialization(iter) &&
memory::can_vectorize_up_to<func_t>(data) > 1) {
// constexpr to reduce the amount of kernels generated for
@ -740,6 +753,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) {
type_specialized_kernel_launcher,
rt_binary_specializations.size()>::
with_args(
iter.dtype(0),
iter.input_dtype(0),
iter.input_dtype(1),
numel,
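The specialization table is now keyed on (output dtype, input0 dtype, input1 dtype). A dtype-level sketch of the kind of mixed-precision elementwise call it targets (stable API, CPU here; whether the ROCm fast path is actually taken depends on the build and tensor layout):

#include <torch/torch.h>

int main() {
  auto a = torch::randn({1024}, torch::kBFloat16);
  auto b = torch::randn({1024}, torch::kFloat);
  auto out = torch::empty({1024}, torch::kFloat);
  // (float out, bfloat16 in, float in) is one of the listed combinations.
  at::add_out(out, a, b);
  TORCH_CHECK(out.scalar_type() == torch::kFloat);
  return 0;
}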


@ -418,8 +418,7 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) {
auto* ptr = (dst_device == kCPU ? dst : src);
auto* ctx = host_tensor.storage().data_ptr().get_context();
// TODO: warn on the return value.
CachingHostAllocator_recordEvent(ptr, ctx, stream);
at::getHostAllocator(at::kCUDA)->record_event(ptr, ctx, stream.unwrap());
} else {
at::cuda::memcpy_and_sync(dst, src, nbytes, kind, stream);
}


@ -407,8 +407,8 @@ struct vectorized_templated {
// float(float,bfloat16) and functor add on float(float,float).
template <typename scalar_t>
__device__ inline void store(scalar_t* from, int idx) {
using vec_t = aligned_vector<scalar_t, vec_size>;
scalar_t* to = reinterpret_cast<scalar_t*>(data[0]) + block_work_size * idx;
using vec_t = aligned_vector<CastToT, vec_size>;
CastToT* to = reinterpret_cast<CastToT*>(data[0]) + block_work_size * idx;
vec_t* to_ = reinterpret_cast<vec_t*>(to);
int thread_idx = threadIdx.x;
#pragma unroll


@ -422,11 +422,12 @@ static __global__ void chunk_cat_cuda_kernel(
}
bool all_contiguous(TensorList tensors) {
bool contiguous = true;
for (const auto& t : tensors) {
contiguous &= t.is_non_overlapping_and_dense();
if (!t.is_contiguous()) {
return false;
}
}
return contiguous;
return true;
}
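is_contiguous() is strictly stronger than is_non_overlapping_and_dense(); a transposed tensor satisfies the latter but not the former, which is what the tightened predicate above rules out. A short sketch (standalone libtorch):

#include <torch/torch.h>
#include <iostream>

int main() {
  auto t = torch::randn({2, 3}).t();                           // transposed view
  std::cout << t.is_contiguous() << std::endl;                 // 0
  std::cout << t.is_non_overlapping_and_dense() << std::endl;  // 1
  return 0;
}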
// Get leading dimensions before `dim`-th dimension.


@ -42,6 +42,7 @@
#include <cutlass/half.h>
#include <cutlass/numeric_conversion.h>
#include <cutlass/numeric_types.h>
#include <cutlass/version.h>
/////////////////////////////////////////////////////////////////////////////////////////////////
@ -69,6 +70,7 @@ __forceinline__ __device__ float tanh_opt(float x)
}
/////////////////////////////////////////////////////////////////////////////////////////////////
#if CUTLASS_VERSION <= 380
template<>
struct GELU_taylor<float> {
static const bool kIsHeavy = true;
@ -92,6 +94,7 @@ struct GELU_taylor<float> {
return this->operator()(scalar);
}
};
#endif
} // namespace thread
} // namespace epilogue


@ -127,6 +127,14 @@ inline __host__ __device__ uint32_t getAlignmentRoundUp(const void* p) {
return diff == 0 ? 0 : uint32_t(Align) - diff;
}
#if defined (__gfx90a__) || defined(__gfx942__)
#define CDNA2_OR_LATER 1
#else
#define CDNA2_OR_LATER 0
#endif
#if (defined(USE_ROCM) && ROCM_VERSION >= 50700) || ((defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)))
#if defined(USE_ROCM)
// TODO: Support RDNA
constexpr int32_t kWarpSize = 64;
@ -142,14 +150,6 @@ static bool isCDNA2orLater(int index) {
constexpr int32_t kWarpSize = 32;
#endif
#if defined (__gfx90a__) || defined(__gfx942__)
#define CDNA2_OR_LATER 1
#else
#define CDNA2_OR_LATER 0
#endif
#if (defined(USE_ROCM) && ROCM_VERSION >= 50700) || ((defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)))
// f16 vector types
struct __align__(2) f16x1 {
__half vals[1];


@ -5,8 +5,8 @@
namespace at::native {
template <typename Dtype>
inline void bgemm_internal_ck(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
template <typename Dtype, typename C_Dtype = Dtype>
inline void bgemm_internal_ck(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
static_assert(false&&sizeof(Dtype),"at::cuda::blas_bgemm_internal_ck: not implemented");
}


@ -16,6 +16,7 @@
#include <ATen/ops/empty_like.h>
#include <ATen/ops/empty_like_native.h>
#include <ATen/ops/layer_norm_native.h>
#include <ATen/ops/_fused_rms_norm.h>
#include <ATen/ops/native_batch_norm.h>
#include <ATen/ops/native_layer_norm.h>
#include <ATen/ops/native_layer_norm_backward_native.h>
@ -27,7 +28,6 @@
#endif
#ifdef USE_MPS
#include <ATen/native/mps/operations/RMSNorm.h>
#include <c10/core/GradMode.h>
#endif
@ -281,7 +281,7 @@ Tensor rms_norm_symint(
if (!(GradMode::is_enabled() && any_inputs_require_grad) && !any_nested && is_input_fp && is_weight_fp) {
auto eps_val = eps.value_or(std::numeric_limits<double>::epsilon());
return mps::rms_norm_mps_kernel(input.contiguous(), normalized_shape, weight.contiguous(), eps_val);
return at::_fused_rms_norm(input.contiguous(), normalized_shape.size(), weight.contiguous(), eps_val);
}
}
#endif


@ -19,6 +19,7 @@ std::vector<int64_t> pool_output_sizes(
output_size[1] = input_size[1];
for (const auto i : c10::irange(2, input_size.size())) {
TORCH_CHECK_VALUE(stride[i -2] > 0, "Strides must be positive!");
output_size[i] = pooling_output_shape_pad_lr<int64_t>(
input_size[i],
kernel_size[i - 2],


@ -117,6 +117,44 @@ class QConvoneDNNXPU final {
/*unary_algorithm*/ algorithm);
}
static at::Tensor run_pointwise_tensor(
at::Tensor act,
at::Tensor act_scale,
at::Tensor act_zero_point,
at::Tensor weight,
at::Tensor weight_scales,
at::Tensor weight_zero_points,
std::optional<at::Tensor> bias,
torch::List<int64_t> stride,
torch::List<int64_t> padding,
torch::List<int64_t> dilation,
int64_t groups,
double output_scale,
int64_t output_zero_point,
std::optional<c10::ScalarType> output_dtype,
std::string_view attr,
torch::List<std::optional<at::Scalar>> scalars,
std::optional<std::string_view> algorithm) {
return run_pointwise(
act,
act_scale.item().toDouble(),
act_zero_point.item().toLong(),
weight,
weight_scales,
weight_zero_points,
bias,
stride,
padding,
dilation,
groups,
output_scale,
output_zero_point,
output_dtype,
/*unary_attr*/ attr,
/*unary_scalars*/ scalars,
/*unary_algorithm*/ algorithm);
}
static at::Tensor run_pointwise_binary(
at::Tensor act,
double act_scale,
@ -223,6 +261,12 @@ TORCH_LIBRARY_IMPL(onednn, XPU, m) {
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv2d_pointwise.binary"),
QConvoneDNNXPU::run_pointwise_binary);
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv_pointwise"),
QConvoneDNNXPU::run_pointwise);
m.impl(
TORCH_SELECTIVE_NAME("onednn::qconv_pointwise.tensor"),
QConvoneDNNXPU::run_pointwise_tensor);
}
} // namespace at::native::xpu


@ -147,6 +147,7 @@ MPSGraphTensorData* getMPSGraphTensorData(MPSGraph* mpsGraph, MPSStream* mpsStre
MPSGraphTensorData* getMPSGraphTensorFromScalar(MPSStream* mpsStream, MPSScalar& scalar);
MPSGraph* make_mps_graph();
MPSKernel* make_mps_kernel();
void printTensorNDArray(const TensorBase& t);
MPSNDArray* ndArrayFromTensor(const TensorBase& tensor, MPSShape* shape, MPSDataType mpsType);
@ -160,8 +161,22 @@ string get_mem_format_string(c10::MemoryFormat memory_format);
using MPSCacheKey = uint64_t;
// derive this class to cache a graph and its inputs/outputs
// can be used to store any NSObject
struct MPSCachedKernel {
MPSCachedKernel(NSObject* object) : _object([object retain]) {}
virtual ~MPSCachedKernel() {
[_object release];
_object = nullptr;
}
template <typename T>
inline T* kernel() const {
return (T*)_object;
}
private:
NSObject* _object = nullptr;
};
struct MPSCachedGraph {
MPSCachedGraph(NSObject* object) : _object([object retain]) {}
virtual ~MPSCachedGraph() {
@ -214,6 +229,101 @@ struct MPSBinaryGradCachedGraph : public MPSCachedGraph {
MPSGraphTensor* gradInputTensor_ = nil;
};
struct MPSKernelCache {
typedef MPSCachedKernel* (^CreateCachedKernelBlock)();
struct CacheEntry {
CacheEntry(const std::string& key, MPSCachedKernel* cachedKernel) : cachedKernel_(cachedKernel), key_(key) {}
// CacheEntry(const std::string& key, MPSKernel* cachedKernel) : cachedKernel_(cachedKernel), key_(key) {}
MPSCachedKernel* cachedKernel_ = nullptr;
std::string key_;
};
public:
static MPSKernelCache* getInstance() {
if (_instance_cache == nullptr) {
_instance_cache = new MPSKernelCache();
}
return _instance_cache;
}
~MPSKernelCache() {
dispatch_release(serialQueue_);
for (const auto& i : cache_) {
delete i.second.cachedKernel_;
}
}
// Disallow the copy constructor and operator= functions
MPSKernelCache(const MPSKernelCache&) = delete;
void operator=(const MPSKernelCache&) = delete;
MPSCachedKernel* CreateCachedKernel(const std::string& key, CreateCachedKernelBlock createCacheBlock) {
__block MPSCachedKernel* cachedKernel = nil;
MPSCacheKey hash = std::hash<std::string>{}(key);
dispatch_sync_with_rethrow(serialQueue_, ^() {
if(cache_.count(hash) != 0) {
auto& entry = cache_.at(hash);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS kernel cache!\n");
cachedKernel = entry.cachedKernel_;
} else {
cachedKernel = createCacheBlock();
CacheEntry entry(key, cachedKernel);
cache_.emplace(hash, entry);
profileCachedKernel(entry);
}
});
return cachedKernel;
}
template <typename T>
inline T* CreateCachedKernelAs(const std::string& key, CreateCachedKernelBlock createCacheBlock) {
return static_cast<T*>(CreateCachedKernel(key, createCacheBlock));
}
MPSCachedKernel* LookUp(const std::string& key) const {
__block MPSCachedKernel* cachedKernel = nil;
MPSCacheKey hash = std::hash<std::string>{}(key);
dispatch_sync_with_rethrow(serialQueue_, ^() {
if(cache_.count(hash) != 0) {
auto& entry = cache_.at(hash);
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(key == entry.key_, "Key collision in the MPS kernel cache!\n");
cachedKernel = entry.cachedKernel_;
}
});
return cachedKernel;
}
template<typename T>
inline T* LookUpAs(const std::string& key) const {
return static_cast<T*>(LookUp(key));
}
private:
MPSKernelCache() {
serialQueue_ = dispatch_queue_create("cache queue", DISPATCH_QUEUE_SERIAL);
}
void profileCachedKernel(const CacheEntry& cacheEntry) const;
static MPSKernelCache* _instance_cache;
std::unordered_map<MPSCacheKey, CacheEntry> cache_;
dispatch_queue_t serialQueue_ = nullptr;
};
// Common template for creating graph with a specified cache if missing
template <typename T>
inline T* LookUpOrCreateCachedKernel(const std::string& key, std::function<MPSKernel*()> instantiate) {
auto cache_ = MPSKernelCache::getInstance();
if (auto rc = cache_->LookUpAs<T>(key)) {
return rc;
}
return cache_->CreateCachedKernelAs<T>(key, ^mps::MPSCachedKernel*() {
auto k_ = new mps::MPSCachedKernel(instantiate());
return k_;
});
}
// TODO: Improve the overall design of MPSGraphCache.
// https://github.com/pytorch/pytorch/issues/77176
// Cache holding various keys mapped to graphs


@ -315,7 +315,7 @@ std::string getArrayRefString(const IntArrayRef s) {
std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype, bool exclude_shape) {
std::string str;
// The key format per tensor would look like ":Float32[1,1,1,10]:"
// The key format per tensor would look like ":Float32[1,1,1,10,]:"
for (const Tensor& tensor : tensors) {
str += ":";
if (tensor.defined()) {
@ -328,7 +328,7 @@ std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype, boo
str += "-1";
} else {
str +=
std::string([[getMPSShape(tensor) valueForKey:@"description"] componentsJoinedByString:@","].UTF8String);
getArrayRefString(tensor.sizes());
}
}
str += "]";
@ -778,6 +778,19 @@ string get_mem_format_string(c10::MemoryFormat memory_format) {
MPSGraphCache* MPSGraphCache::_instance_cache = nullptr;
void MPSKernelCache::profileCachedKernel(const CacheEntry& cacheEntry) const {
auto& profiler = getMPSProfiler();
if (profiler.isOperationProfilingEnabled()) {
std::string graphKey = cacheEntry.key_;
// for interval-based signpost tracing, we begin the interval here to be able
// to measure the time it takes to compile the graphs (if graph newly created),
// and also the time potentially spent on gather/scatter of graph's input tensors
// profiler.beginProfileKernel(cacheEntry.cachedKernel_->kernel(), graphKey, true);
}
}
MPSKernelCache* MPSKernelCache::_instance_cache = nullptr;
void MPSGraphCache::profileCachedGraph(const CacheEntry& cacheEntry) const {
auto& profiler = getMPSProfiler();
if (profiler.isOperationProfilingEnabled()) {


@ -82,6 +82,13 @@ struct hermite_polynomial_h_functor {
}
};
struct hermite_polynomial_he_functor {
template <typename T>
inline T operator()(const T a, const T b) {
return static_cast<T>(c10::metal::hermite_polynomial_he_forward(a, b));
}
};
struct nextafter_functor {
#if __METAL_VERSION__ < 310
template <typename U>
@ -173,6 +180,8 @@ REGISTER_BINARY_OP(chebyshev_polynomial_w, float, float);
REGISTER_BINARY_OP(chebyshev_polynomial_w, half, half);
REGISTER_BINARY_OP(hermite_polynomial_h, float, float);
REGISTER_BINARY_OP(hermite_polynomial_h, half, half);
REGISTER_BINARY_OP(hermite_polynomial_he, float, float);
REGISTER_BINARY_OP(hermite_polynomial_he, half, half);
#if __METAL_VERSION__ >= 310
REGISTER_BINARY_OP(copysign, bfloat, bfloat);
@ -186,6 +195,7 @@ REGISTER_BINARY_OP(chebyshev_polynomial_u, bfloat, bfloat);
REGISTER_BINARY_OP(chebyshev_polynomial_v, bfloat, bfloat);
REGISTER_BINARY_OP(chebyshev_polynomial_w, bfloat, bfloat);
REGISTER_BINARY_OP(hermite_polynomial_h, bfloat, bfloat);
REGISTER_BINARY_OP(hermite_polynomial_he, bfloat, bfloat);
#endif
// Complex binary functions


@ -67,9 +67,29 @@ struct sqrt_functor {
}
};
struct bitwise_not_functor {
template <typename T>
inline enable_if_t<!is_same_v<T, bool> && is_scalar_integral_v<T>, T>
operator()(const T x) {
return ~x;
}
template <typename T>
inline enable_if_t<is_same_v<T, bool>, T> operator()(const T x) {
return !x;
}
};
DEFINE_UNARY_FLOATING_FUNCTOR(erfinv);
DEFINE_UNARY_FLOATING_FUNCTOR(sinc);
REGISTER_UNARY_OP(bitwise_not, int, int);
REGISTER_UNARY_OP(bitwise_not, long, long);
REGISTER_UNARY_OP(bitwise_not, short, short);
REGISTER_UNARY_OP(bitwise_not, char, char);
REGISTER_UNARY_OP(bitwise_not, uchar, uchar);
REGISTER_UNARY_OP(bitwise_not, bool, bool);
#define INSTANTIATE_UNARY_KERNELS2(DTYPE0, DTYPE1) \
REGISTER_UNARY_OP(erfinv, DTYPE1, DTYPE0); \
REGISTER_UNARY_OP(exp, DTYPE1, DTYPE0); \


@ -1,58 +1,8 @@
#include <c10/metal/atomic.h>
#include <metal_stdlib>
using namespace metal;
// Atomic operations helper
template <typename T>
struct AtomicType {};
template <typename T>
using AtomicType_t = typename AtomicType<T>::type;
template <>
struct AtomicType<float> {
using type = atomic<float>;
static inline void atomic_add(device type* data, long offset, float value) {
atomic_fetch_add_explicit(data + offset, value, memory_order_relaxed);
}
};
// As of Metal 3.2, atomic operations are not supported on half-precision floats,
// so they must be simulated using atomic compare-and-exchange over a 32-bit
// atomic type.
template <typename T>
static inline void atomic_add_helper(
device atomic<int>* data,
long offset,
float value) {
auto ptr = data + (offset >> 1);
auto old = atomic_load_explicit(ptr, memory_order_relaxed);
union {
int i;
T t[2];
} val;
do {
val.i = old;
val.t[offset & 1] += static_cast<T>(value);
} while (!atomic_compare_exchange_weak_explicit(
ptr, &old, val.i, memory_order_relaxed, memory_order_relaxed));
}
template <>
struct AtomicType<half> {
using type = atomic<int>;
static inline void atomic_add(device type* data, long offset, float value) {
atomic_add_helper<half>(data, offset, value);
}
};
#if __METAL_VERSION__ >= 310
template <>
struct AtomicType<bfloat> {
using type = atomic<int>;
static inline void atomic_add(device type* data, long offset, float value) {
atomic_add_helper<bfloat>(data, offset, value);
}
};
#endif
using namespace c10::metal;
// Based on
// https://en.wikipedia.org/wiki/Bicubic_interpolation#Bicubic_convolution_algorithm


@ -116,6 +116,12 @@ static void hermite_polynomial_h_mps_kernel(TensorIteratorBase& iter) {
lib.exec_binary_kernel(iter, "hermite_polynomial_h");
}
static void hermite_polynomial_he_mps_kernel(TensorIteratorBase& iter) {
TORCH_CHECK_TYPE(isFloatingType(iter.common_dtype()),
"hermite_polynomial_he_mps not implemented for non-floating types");
lib.exec_binary_kernel(iter, "hermite_polynomial_he");
}
static void polar_mps_kernel(TensorIterator& iter) {
lib.exec_binary_kernel(iter, "polar");
}
@ -135,6 +141,7 @@ REGISTER_DISPATCH(chebyshev_polynomial_u_stub, &chebyshev_polynomial_u_mps_kerne
REGISTER_DISPATCH(chebyshev_polynomial_v_stub, &chebyshev_polynomial_v_mps_kernel)
REGISTER_DISPATCH(chebyshev_polynomial_w_stub, &chebyshev_polynomial_w_mps_kernel)
REGISTER_DISPATCH(hermite_polynomial_h_stub, &hermite_polynomial_h_mps_kernel)
REGISTER_DISPATCH(hermite_polynomial_he_stub, &hermite_polynomial_he_mps_kernel)
REGISTER_DISPATCH(polar_stub, &polar_mps_kernel);
REGISTER_DISPATCH(complex_stub, &complex_mps_kernel);
} // namespace at::native
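The registrations above route the probabilists' Hermite polynomials to an MPS kernel. A CPU-side sketch (my own check, assuming a libtorch build) of the function being dispatched, using He_2(x) = x^2 - 1:

#include <torch/torch.h>

int main() {
  auto x = torch::linspace(-2.0, 2.0, 9);
  auto n = torch::full_like(x, 2.0);
  auto he = at::special_hermite_polynomial_he(x, n);
  TORCH_CHECK(torch::allclose(he, x * x - 1.0));
  return 0;
}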


@ -5,7 +5,6 @@
#include <ATen/native/Resize.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/bitwise_and_native.h>
#include <ATen/ops/bitwise_not_native.h>
#include <ATen/ops/bitwise_or_native.h>
#include <ATen/ops/bitwise_xor_native.h>
#include <ATen/ops/logical_not_native.h>
@ -100,11 +99,6 @@ kernel void bitwise_rshift_scalar_tensor(device {0} *out [[buffer(0)]],
out[offset] = static_cast<{0}>(a) >> b[offset];
}}
kernel void bitwise_not(device {0} *out [[buffer(0)]],
constant {1} *a [[buffer(1)]],
uint offset [[thread_position_in_grid]]) {{
out[offset] = ~a[offset];
}}
)METAL",
3);
@ -200,54 +194,6 @@ static void _bitwise_op_out_mps(const Tensor& self,
return;
}
static void _bitwise_not_out_mps(const Tensor& self, const Tensor& output_) {
// Handle boolean tensor using logical not
if (self.scalar_type() == c10::ScalarType::Bool) {
logical_not_out_mps(self, const_cast<Tensor&>(output_));
return;
}
Tensor output = output_;
bool needs_output_copy = false;
resize_output(output, self.sizes());
if (needsGather(output)) {
output = output.contiguous();
needs_output_copy = true;
}
if (self.dim() == 0) {
if (self.scalar_type() == c10::ScalarType::Byte) {
// Unsigned types need a special handling to keep result of operation in 0..255 output
output.fill_(c10::Scalar(static_cast<uint8_t>(~self.item<uint8_t>())));
} else {
output.fill_(c10::Scalar(~self.item<int64_t>()));
}
return;
}
uint32_t length = output.numel();
if (length == 0) {
return;
}
using namespace at::mps;
MPSStream* stream = getCurrentMPSStream();
auto cplState = getCPLState(output, self, self, "bitwise_not");
dispatch_sync(stream->queue(), ^() {
getMPSProfiler().beginProfileKernel(cplState, "bitwise_not", {self});
id<MTLComputeCommandEncoder> commandEncoder = stream->commandEncoder();
[commandEncoder pushDebugGroup:@"Dispatch bitwise_not kernel"];
[commandEncoder setComputePipelineState:cplState];
mtl_setArgs(commandEncoder, output, self);
mtl_dispatch1DJob(commandEncoder, cplState, length);
getMPSProfiler().endProfileKernel(cplState);
});
if (needs_output_copy) {
output_.copy_(output);
}
}
} // namespace mps
namespace {
void lshift_kernel_mps(TensorIteratorBase& iter) {
@ -272,10 +218,6 @@ TORCH_IMPL_FUNC(bitwise_xor_out_mps)(const Tensor& self, const Tensor& other, co
mps::_bitwise_op_out_mps(self, other, output, "xor");
}
TORCH_IMPL_FUNC(bitwise_not_out_mps)(const Tensor& self, const Tensor& output) {
mps::_bitwise_not_out_mps(self, output);
}
REGISTER_MPS_DISPATCH(lshift_stub, &lshift_kernel_mps)
REGISTER_MPS_DISPATCH(rshift_stub, &rshift_kernel_mps)


@ -4,12 +4,101 @@
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/ops/linear_backward_native.h>
#include <ATen/ops/linear_native.h>
#include <ATen/mps/MPSProfiler.h>
namespace at::native {
using namespace mps;
static Tensor _mps_linear_new(const Tensor& input, const Tensor& weight, const std::optional<Tensor>& bias_opt) {
using namespace mps;
if (input.scalar_type() == kComplexFloat || input.scalar_type() == kComplexHalf) {
TORCH_CHECK(false, "mps linear does not support complex types");
}
TORCH_CHECK(supportedFloatingOrComplexType(input), "MPS device does not support linear for non-float inputs");
TORCH_CHECK(input.is_mps(), "Tensor for argument input is on ", input.device(), " but expected on mps");
TORCH_CHECK(supportedFloatingOrComplexType(weight), "MPS device does not support linear for non-float weights");
TORCH_CHECK(weight.is_mps(), "Tensor for argument weight is on ", weight.device(), " but expected on mps");
Tensor bias;
bool has_bias = bias_opt.has_value();
if(has_bias){
bias = bias_opt.value();
TORCH_CHECK(bias.is_mps(), "Tensor for argument bias is on ", bias.device(), " but expected on mps");
TORCH_CHECK(supportedFloatingOrComplexType(bias), "MPS device does not support linear for non-float bias");
}
if (input.numel() == 0 || weight.numel() == 0) {
auto input_size = input.sizes();
std::vector<int64_t> output_size(input_size.begin(), input_size.end() - 1);
output_size.push_back(weight.size(0));
Tensor output = at::empty(output_size, input.scalar_type(), std::nullopt, kMPS, std::nullopt, input.suggest_memory_format());
return output;
}
auto input_size = input.sizes();
std::vector<int64_t> output_size(input_size.begin(), input_size.end() - 1);
output_size.push_back(weight.size(0));
Tensor output = at::empty(output_size, input.scalar_type(), std::nullopt, kMPS, std::nullopt, input.suggest_memory_format());
MPSStream* mpsStream = getCurrentMPSStream();
id<MTLDevice> device = MPSDevice::getInstance()->device();
id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder();
const string key = "mps_linear" + getTensorsStringKey({input, weight, bias}, true);
dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
@autoreleasepool {
mpsStream->endKernelCoalescing();
MPSDataType mpsDataType = getMPSDataType(weight.scalar_type());
auto inputNDArray = getMPSNDArray(input, input.sizes(), input.strides());
auto outNDArray = getMPSNDArray(output, output.sizes(), output.strides());
id<MTLBuffer> weightBuf = getMTLBufferStorage(weight);
MPSNDArrayDescriptor* weightTensorDesc =
[MPSNDArrayDescriptor descriptorWithDataType:mpsDataType shape:getMPSShape(weight.sizes())];
weightTensorDesc.preferPackedRows = YES;
[weightTensorDesc transposeDimension:0 withDimension:1];
MPSNDArray* weightNDArray = [[MPSNDArray alloc] initWithBuffer:weightBuf
offset:weight.storage_offset() * weight.element_size()
descriptor:weightTensorDesc];
id<MTLCommandBuffer> commandBuffer = mpsStream->commandBuffer();
if(has_bias){
auto biasNDArray = getMPSNDArray(bias, bias.sizes(), bias.strides());
auto cachedKernel = LookUpOrCreateCachedKernel<MPSCachedKernel>(key, [&]() {
return [[MPSNDArrayMatrixMultiplication alloc] initWithDevice:device sourceCount:3];
});
auto kernel = cachedKernel->kernel<MPSNDArrayMatrixMultiplication>();
getMPSProfiler().beginProfileKernel(kernel, "mps_linear", {input, weight, bias});
[kernel encodeToCommandEncoder:computeEncoder
commandBuffer:commandBuffer
sourceArrays:@[ inputNDArray, weightNDArray, biasNDArray]
destinationArray:outNDArray];
getMPSProfiler().endProfileKernel(kernel);
} else {
auto cachedKernel = LookUpOrCreateCachedKernel<MPSCachedKernel>(key, [&]() {
return [[MPSNDArrayMatrixMultiplication alloc] initWithDevice:device sourceCount:2];
});
auto kernel = cachedKernel->kernel<MPSNDArrayMatrixMultiplication>();
getMPSProfiler().beginProfileKernel(kernel, "mps_linear", {input, weight, bias});
[kernel encodeToCommandEncoder:computeEncoder
commandBuffer:commandBuffer
sourceArrays:@[ inputNDArray, weightNDArray]
destinationArray:outNDArray];
getMPSProfiler().endProfileKernel(kernel);
}
}
});
return output;
}
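A usage-level sketch (my own, assuming a libtorch build; runs on MPS when available, e.g. Apple silicon, and falls back to CPU otherwise) of the op served by the direct MPSNDArrayMatrixMultiplication path above:

#include <torch/torch.h>

int main() {
  torch::Device device = at::hasMPS() ? torch::Device("mps") : torch::Device("cpu");
  auto x = torch::randn({8, 16}, device);
  auto w = torch::randn({4, 16}, device);
  auto b = torch::randn({4}, device);
  auto y = at::linear(x, w, b);  // y = x * w^T + b
  TORCH_CHECK(y.size(0) == 8 && y.size(1) == 4);
  return 0;
}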
Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const std::optional<Tensor>& bias_opt) {
return _mps_linear_new(input, weight_arg, bias_opt);
// wT = transpose(weight);
// y=x*wT+b
@ -67,6 +156,7 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const std::opt
@autoreleasepool {
string key = "mps_linear" + getTensorsStringKey({input, weight, bias});
auto cachedGraph = LookUpOrCreateCachedGraph<CachedGraph>(key, [&](auto* mpsGraph, auto* newCachedGraph) {
MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input);
MPSGraphTensor* weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight);


@ -1,14 +0,0 @@
#pragma once
#include <ATen/core/Tensor.h>
#include <c10/core/SymIntArrayRef.h>
namespace at::native::mps {
Tensor rms_norm_mps_kernel(
const Tensor& input,
c10::SymIntArrayRef normalized_shape,
const Tensor& weight,
const double eps);
} // namespace at::native::mps


@ -4,13 +4,14 @@
#include <ATen/Functions.h>
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/_fused_rms_norm_native.h>
#include <ATen/ops/empty_like.h>
#endif
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/operations/RMSNorm.h>
#include <fmt/format.h>
namespace at::native::mps {
namespace at::native {
using namespace mps;
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = MetalShaderLibrary::getBundledLibrary();
@ -18,13 +19,9 @@ static auto& lib = MetalShaderLibrary::getBundledLibrary();
#include <ATen/native/mps/RMSNorm_metallib.h>
#endif
Tensor rms_norm_mps_kernel(const Tensor& input,
c10::SymIntArrayRef normalized_shape,
const Tensor& weight,
const double eps) {
Tensor _fused_rms_norm_mps(const Tensor& input, const int64_t normalized_ndim, const Tensor& weight, const double eps) {
TORCH_CHECK(input.is_contiguous() && weight.is_contiguous(), "Expected contiguous input and weight tensors");
auto output = at::empty_like(input);
const int normalized_ndim = normalized_shape.size();
const auto input_shape = input.sizes();
const auto input_ndim = input.dim();
const int axis = input_ndim - normalized_ndim;
@ -64,4 +61,4 @@ Tensor rms_norm_mps_kernel(const Tensor& input,
return output;
}
} // namespace at::native::mps
} // namespace at::native
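With the kernel now registered as _fused_rms_norm, the public rms_norm op reaches it through normal dispatch. A CPU-side numerics sketch (my own, assuming a libtorch version that ships at::rms_norm) of what the kernel computes:

#include <torch/torch.h>

int main() {
  auto x = torch::randn({8});
  auto w = torch::ones({8});
  auto y = at::rms_norm(x, {8}, w, 1e-6);
  // Reference: x / sqrt(mean(x^2) + eps) * w
  auto ref = x / torch::sqrt((x * x).mean() + 1e-6) * w;
  TORCH_CHECK(torch::allclose(y, ref, 1e-4, 1e-6));
  return 0;
}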


@ -16,6 +16,7 @@
#include <ATen/ops/isin_native.h>
#include <ATen/ops/nan_to_num_native.h>
#include <ATen/ops/ones_like_native.h>
#include <ATen/ops/result_type.h>
#include <ATen/ops/where_native.h>
#endif
@ -293,23 +294,22 @@ static void isin_Tensor_Tensor_out_mps(const Tensor& elements,
return;
}
const auto common_type = at::result_type(elements, test_elements);
TORCH_CHECK(elements.is_mps() && test_elements.is_mps());
TORCH_CHECK(elements.dtype() == test_elements.dtype());
TORCH_CHECK(
!(!is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS) && !supportedFloatingType(elements.scalar_type())),
"isin_Tensor_Tensor_out only works on floating types on MPS for pre MacOS_14_0. Received dtype: ",
elements.scalar_type());
TORCH_CHECK(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS) || supportedFloatingType(common_type),
"isin_Tensor_Tensor_out only works on floating types on MPS for pre MacOS_14_0. Received dtype: ",
common_type);
@autoreleasepool {
string key =
op_name + getTensorsStringKey({elements}) + getTensorsStringKey({test_elements}) + std::to_string(invert);
string key = op_name + getTensorsStringKey({elements, test_elements}) + std::to_string(invert);
auto cachedGraph = LookUpOrCreateCachedGraph<MPSBinaryCachedGraph>(key, [&](auto mpsGraph, auto newCachedGraph) {
MPSGraphTensor* inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(elements.scalar_type()));
MPSGraphTensor* otherTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(test_elements.scalar_type()));
newCachedGraph->inputTensor_ = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(elements.scalar_type()));
newCachedGraph->otherTensor_ = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(test_elements.scalar_type()));
newCachedGraph->inputTensor_ = inputTensor;
newCachedGraph->otherTensor_ = otherTensor;
// Cast to common type
auto inputTensor = castMPSTensor(mpsGraph, newCachedGraph->inputTensor_, common_type);
auto otherTensor = castMPSTensor(mpsGraph, newCachedGraph->otherTensor_, common_type);
MPSShape* outputShape = getMPSShape(out);
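With the common dtype computed via at::result_type and both graph inputs cast to it, isin no longer requires matching dtypes. A usage sketch (standalone libtorch, CPU here for simplicity):

#include <torch/torch.h>

int main() {
  auto elements = torch::tensor({1, 2, 3}, torch::kInt);
  auto test_values = torch::tensor({2.0, 5.0});
  auto mask = at::isin(elements, test_values);  // dtypes promoted via result_type
  TORCH_CHECK(torch::equal(mask, torch::tensor({false, true, false})));
  return 0;
}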


@ -38,10 +38,15 @@ static void sqrt_kernel_mps(TensorIteratorBase& iter) {
lib.exec_unary_kernel(iter, "sqrt");
}
static void bitwise_not_kernel_mps(TensorIteratorBase& iter) {
lib.exec_unary_kernel(iter, "bitwise_not");
}
REGISTER_DISPATCH(exp_stub, exp_kernel);
REGISTER_DISPATCH(erfinv_stub, erfinv_kernel);
REGISTER_DISPATCH(sinc_stub, sinc_kernel);
REGISTER_DISPATCH(tanh_stub, tanh_kernel);
REGISTER_DISPATCH(round_decimals_stub, round_decimals_kernel);
REGISTER_DISPATCH(sqrt_stub, sqrt_kernel_mps);
REGISTER_DISPATCH(bitwise_not_stub, bitwise_not_kernel_mps);
} // namespace at::native
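bitwise_not on MPS is now a Metal unary kernel registered through the common dispatch stub; the functor treats bool as logical negation and everything else as ~x. A semantics sketch (standalone libtorch, CPU here):

#include <torch/torch.h>

int main() {
  auto ints = torch::tensor({0, 1, -2}, torch::kInt);
  auto bools = torch::tensor({true, false});
  TORCH_CHECK(torch::equal(at::bitwise_not(ints),
                           torch::tensor({-1, -2, 1}, torch::kInt)));
  TORCH_CHECK(torch::equal(at::bitwise_not(bools),
                           torch::tensor({false, true})));
  return 0;
}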


@ -305,14 +305,12 @@ Tensor angle_mps(const Tensor& self) {
}
TORCH_IMPL_FUNC(sigmoid_out_mps)(const Tensor& self, const Tensor& output) {
TORCH_CHECK(self.scalar_type() != ScalarType::Long, "MPS does not support sigmoid op with int64 input");
mps::unary_op(self, output, "sigmoid_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
return [mpsGraph sigmoidWithTensor:inputTensor name:nil];
});
}
TORCH_IMPL_FUNC(log1p_out_mps)(const Tensor& self, const Tensor& output) {
TORCH_CHECK(self.scalar_type() != ScalarType::Long, "MPS does not support log1p op with int64 input");
mps::unary_op(self, output, "log1p_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) {
return mps::log1p(mpsGraph, inputTensor);
});
@ -373,7 +371,11 @@ Tensor& logit_out_mps(const Tensor& self, std::optional<double> eps, Tensor& res
}
Tensor logit_mps(const Tensor& self, std::optional<double> eps) {
Tensor result = at::empty(self.sizes(), ScalarType::Float, std::nullopt, kMPS, std::nullopt, std::nullopt);
auto out_dtype = self.scalar_type();
if (c10::isIntegralType(out_dtype, /*includeBool*/ true)) {
out_dtype = kFloat;
}
Tensor result = at::empty(self.sizes(), out_dtype, std::nullopt, kMPS, std::nullopt, std::nullopt);
logit_mps_impl(self, eps, result, "logit_mps");
return result;
}
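The change above makes logit_mps allocate a float result for integral inputs while preserving floating dtypes, matching the promotion rule elsewhere. A dtype sketch (standalone libtorch, CPU):

#include <torch/torch.h>

int main() {
  auto floats = torch::rand({4});                  // float32 input stays float32
  auto ints = torch::tensor({0, 1}, torch::kLong);
  TORCH_CHECK(torch::logit(floats).scalar_type() == torch::kFloat);
  TORCH_CHECK(torch::logit(ints).scalar_type() == torch::kFloat);  // promoted
  return 0;
}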


@ -1073,6 +1073,16 @@
XPU: baddbmm_out_xpu
SparseCsrCUDA: baddbmm_out_sparse_csr_cuda
- func: baddbmm.dtype(Tensor self, Tensor batch1, Tensor batch2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1) -> Tensor
variants: function
dispatch:
CUDA: _baddbmm_dtype_cuda
- func: baddbmm.dtype_out(Tensor self, Tensor batch1, Tensor batch2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
variants: function
dispatch:
CUDA: _baddbmm_out_dtype_cuda
- func: bartlett_window(int window_length, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor
dispatch:
CompositeExplicitAutograd: bartlett_window
@ -1211,8 +1221,7 @@
structured: True
structured_inherits: TensorIteratorBase
dispatch:
CPU, CUDA: bitwise_not_out
MPS: bitwise_not_out_mps
CPU, CUDA, MPS: bitwise_not_out
tags: pointwise
- func: copysign.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)
@ -1375,6 +1384,16 @@
SparseCUDA: bmm_out_sparse_cuda
SparseCsrCUDA: bmm_out_sparse_csr_cuda
- func: bmm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
variants: function
dispatch:
CUDA: _bmm_dtype_cuda
- func: bmm.dtype_out(Tensor self, Tensor mat2, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)
variants: function
dispatch:
CUDA: _bmm_out_dtype_cuda
- func: broadcast_tensors(Tensor[] tensors) -> Tensor[]
device_check: NoCheck
device_guard: False
@ -3302,6 +3321,10 @@
dispatch:
CompositeImplicitAutograd: rms_norm_symint
- func: _fused_rms_norm(Tensor input, int normalized_shape_ndim, Tensor weight, float eps) -> Tensor
dispatch:
MPS: _fused_rms_norm_mps
- func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor
variants: function, method
dispatch:
@ -4145,6 +4168,14 @@
SparseCPU, SparseCUDA: _sparse_mm_out
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _sparse_csr_mm_out
- func: mm.dtype(Tensor self, Tensor mat2, ScalarType out_dtype) -> Tensor
dispatch:
CUDA: _mm_dtype_cuda
- func: mm.dtype_out(Tensor self, Tensor mat2, ScalarType out_dtype, *, Tensor(a!) out) -> Tensor(a!)
dispatch:
CUDA: _mm_dtype_out_cuda
- func: _int_mm(Tensor self, Tensor mat2) -> Tensor
dispatch:
CPU: _int_mm_cpu
@ -7044,6 +7075,14 @@
SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: addmm_sparse_compressed_dense
tags: core
- func: addmm.dtype(Tensor self, Tensor mat1, Tensor mat2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1) -> Tensor
dispatch:
CUDA: _addmm_dtype_cuda
- func: addmm.dtype_out(Tensor self, Tensor mat1, Tensor mat2, ScalarType out_dtype, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)
dispatch:
CUDA: _addmm_dtype_out_cuda
- func: addmm_(Tensor(a!) self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1) -> Tensor(a!)
structured_delegate: addmm.out
variants: method
@ -15324,7 +15363,7 @@
- func: special_hermite_polynomial_he.out(Tensor x, Tensor n, *, Tensor(a!) out) -> Tensor(a!)
device_check: NoCheck
dispatch:
CPU, CUDA: special_hermite_polynomial_he_out
CPU, CUDA, MPS: special_hermite_polynomial_he_out
python_module: special
structured_inherits: TensorIteratorBase
structured: True

Some files were not shown because too many files have changed in this diff.