Compare commits


3 Commits

713 changed files with 10247 additions and 27310 deletions

View File

@@ -1,60 +1,82 @@
 #!/usr/bin/env bash
 # Script used only in CD pipeline
-set -exou pipefail
+set -eou pipefail
 image="$1"
 shift
 if [ -z "${image}" ]; then
-  echo "Usage: $0 IMAGENAME:ARCHTAG"
+  echo "Usage: $0 IMAGE"
   exit 1
 fi
-# Go from imagename:tag to tag
-DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
-CUDA_VERSION=""
-if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
-    # extract cuda version from image name and tag. e.g. manylinux2_28-builder:cuda12.8 returns 12.8
-    CUDA_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
-fi
-case ${DOCKER_TAG_PREFIX} in
-    cpu)
-        BASE_TARGET=base
-        ;;
-    cuda*)
-        BASE_TARGET=cuda${CUDA_VERSION}
-        ;;
-    *)
-        echo "ERROR: Unknown docker tag ${DOCKER_TAG_PREFIX}"
-        exit 1
-        ;;
-esac
-# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
-# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
-sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
-sudo systemctl daemon-reload
-sudo systemctl restart docker
+DOCKER_IMAGE_NAME="pytorch/${image}"
 export DOCKER_BUILDKIT=1
 TOPDIR=$(git rev-parse --show-toplevel)
-tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
-docker build \
-    --target final \
-    --progress plain \
-    --build-arg "BASE_TARGET=${BASE_TARGET}" \
-    --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
-    --build-arg "DEVTOOLSET_VERSION=11" \
-    -t ${tmp_tag} \
-    $@ \
-    -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
-    ${TOPDIR}/.ci/docker/
-if [ -n "${CUDA_VERSION}" ]; then
+CUDA_VERSION=${CUDA_VERSION:-12.1}
+case ${CUDA_VERSION} in
+    cpu)
+        BASE_TARGET=base
+        DOCKER_TAG=cpu
+        ;;
+    all)
+        BASE_TARGET=all_cuda
+        DOCKER_TAG=latest
+        ;;
+    *)
+        BASE_TARGET=cuda${CUDA_VERSION}
+        DOCKER_TAG=cuda${CUDA_VERSION}
+        ;;
+esac
+(
+    set -x
+    # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
+    # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
+    sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
+    sudo systemctl daemon-reload
+    sudo systemctl restart docker
+    docker build \
+        --target final \
+        --progress plain \
+        --build-arg "BASE_TARGET=${BASE_TARGET}" \
+        --build-arg "CUDA_VERSION=${CUDA_VERSION}" \
+        --build-arg "DEVTOOLSET_VERSION=11" \
+        -t ${DOCKER_IMAGE_NAME} \
+        $@ \
+        -f "${TOPDIR}/.ci/docker/almalinux/Dockerfile" \
+        ${TOPDIR}/.ci/docker/
+)
+if [[ "${DOCKER_TAG}" =~ ^cuda* ]]; then
     # Test that we're using the right CUDA compiler
-    docker run --rm "${tmp_tag}" nvcc --version | grep "cuda_${CUDA_VERSION}"
+    (
+        set -x
+        docker run --rm "${DOCKER_IMAGE_NAME}" nvcc --version | grep "cuda_${CUDA_VERSION}"
+    )
 fi
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE_NAME}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE_NAME}-${GIT_COMMIT_SHA}
+if [[ "${WITH_PUSH:-}" == true ]]; then
+    (
+        set -x
+        docker push "${DOCKER_IMAGE_NAME}"
+        if [[ -n ${GITHUB_REF} ]]; then
+            docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_BRANCH_TAG}
+            docker tag ${DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_SHA_TAG}
+            docker push "${DOCKER_IMAGE_BRANCH_TAG}"
+            docker push "${DOCKER_IMAGE_SHA_TAG}"
+        fi
+    )
+fi
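Both sides of this script derive extra image tags from git metadata. As a sketch (input values assumed for illustration) of the ${GITHUB_REF##*/} expansion used above:

# Sketch: behavior of the branch-name suffix expansion (assumed inputs).
GITHUB_REF="refs/heads/release/2.7"
GIT_BRANCH_NAME=${GITHUB_REF##*/}              # longest-match strip through the last "/" -> "2.7"
DOCKER_IMAGE_NAME="pytorch/almalinux-builder"  # hypothetical image name
echo "${DOCKER_IMAGE_NAME}-${GIT_BRANCH_NAME}" # -> pytorch/almalinux-builder-2.7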

View File

@@ -1 +1 @@
-381ae5d57d35c165d98df728380b20fbde350392
+7e487c24e1c20c3f4606c2d8aca2778873b00b4c

View File

@@ -19,13 +19,6 @@ install_ubuntu() {
     apt-get install -y libc++1
     apt-get install -y libc++abi1
-    # Make sure rocm packages from repo.radeon.com have highest priority
-    cat << EOF > /etc/apt/preferences.d/rocm-pin-600
-Package: *
-Pin: release o=repo.radeon.com
-Pin-Priority: 600
-EOF
     # Add amdgpu repository
     UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
     echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list

View File

@@ -25,7 +25,9 @@ python3 -m pip install meson ninja
 ###########################
 ### clone repo
 ###########################
-GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
+# TEMPORARY FIX: https://gitlab.freedesktop.org/mesa/drm.git is down until 2025/03/22
+# GIT_SSL_NO_VERIFY=true git clone https://gitlab.freedesktop.org/mesa/drm.git
+GIT_SSL_NO_VERIFY=true git clone git://anongit.freedesktop.org/mesa/drm
 pushd drm
 ###########################

View File

@@ -17,14 +17,10 @@ function do_install() {
     tmp_dir=$(mktemp -d)
     pushd ${tmp_dir}
     curl -OLs https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}
-    if tar -xvf "${magma_archive}"
-    then
-        mkdir -p "${rocm_dir}/magma"
-        mv include "${rocm_dir}/magma/include"
-        mv lib "${rocm_dir}/magma/lib"
-    else
-        echo "${magma_archive} not found, skipping magma install"
-    fi
+    tar -xvf "${magma_archive}"
+    mkdir -p "${rocm_dir}/magma"
+    mv include "${rocm_dir}/magma/include"
+    mv lib "${rocm_dir}/magma/lib"
     popd
   )
}
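One reason for the guarded extract on the left-hand side: curl -OLs has no --fail, so a missing S3 object writes an HTTP error body to disk and the failure only surfaces when tar runs. A minimal sketch of that failure mode (archive name hypothetical):

# Sketch: curl without --fail saves an error page; tar is the first command to notice.
magma_archive="magma-rocm999-nonexistent.tar.bz2"   # hypothetical, for illustration
curl -OLs "https://ossci-linux.s3.us-east-1.amazonaws.com/${magma_archive}"
if tar -xvf "${magma_archive}"; then
    echo "extracted"
else
    echo "${magma_archive} not found, skipping magma install"
fi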

View File

@@ -89,7 +89,7 @@ ADD ./common/install_rocm_magma.sh install_rocm_magma.sh
 # gfortran and python needed for building magma from source for ROCm
 RUN apt-get update -y && \
     apt-get install gfortran -y && \
-    apt-get install python3 python-is-python3 -y && \
+    apt-get install python -y && \
     apt-get clean
 RUN bash ./install_rocm_drm.sh && rm install_rocm_drm.sh

View File

@@ -1,63 +1,83 @@
 #!/usr/bin/env bash
 # Script used only in CD pipeline
-set -eoux pipefail
+set -eou pipefail
 image="$1"
 shift
 if [ -z "${image}" ]; then
-  echo "Usage: $0 IMAGENAME:ARCHTAG"
+  echo "Usage: $0 IMAGE"
   exit 1
 fi
+DOCKER_IMAGE="pytorch/${image}"
 TOPDIR=$(git rev-parse --show-toplevel)
+GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
+GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
+WITH_PUSH=${WITH_PUSH:-}
 DOCKER=${DOCKER:-docker}
-# Go from imagename:tag to tag
-DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
-GPU_ARCH_VERSION=""
-if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
-    # extract cuda version from image name. e.g. manylinux2_28-builder:cuda12.8 returns 12.8
-    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
-elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
-    # extract rocm version from image name. e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
-    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
-fi
-case ${DOCKER_TAG_PREFIX} in
+case ${GPU_ARCH_TYPE} in
     cpu)
         BASE_TARGET=cpu
+        DOCKER_TAG=cpu
         GPU_IMAGE=ubuntu:20.04
         DOCKER_GPU_BUILD_ARG=""
         ;;
-    cuda*)
+    cuda)
         BASE_TARGET=cuda${GPU_ARCH_VERSION}
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
         GPU_IMAGE=ubuntu:20.04
         DOCKER_GPU_BUILD_ARG=""
         ;;
-    rocm*)
+    rocm)
         BASE_TARGET=rocm
-        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
+        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
+        GPU_IMAGE=rocm/dev-ubuntu-20.04:${GPU_ARCH_VERSION}-complete
         PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
         DOCKER_GPU_BUILD_ARG="--build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg ROCM_VERSION=${GPU_ARCH_VERSION}"
         ;;
     *)
-        echo "ERROR: Unrecognized DOCKER_TAG_PREFIX: ${DOCKER_TAG_PREFIX}"
+        echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
         exit 1
         ;;
 esac
-tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
-DOCKER_BUILDKIT=1 ${DOCKER} build \
-    --target final \
-    ${DOCKER_GPU_BUILD_ARG} \
-    --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
-    --build-arg "BASE_TARGET=${BASE_TARGET}" \
-    -t "${tmp_tag}" \
-    $@ \
-    -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
-    "${TOPDIR}/.ci/docker/"
+(
+    set -x
+    DOCKER_BUILDKIT=1 ${DOCKER} build \
+        --target final \
+        ${DOCKER_GPU_BUILD_ARG} \
+        --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
+        --build-arg "BASE_TARGET=${BASE_TARGET}" \
+        -t "${DOCKER_IMAGE}" \
+        $@ \
+        -f "${TOPDIR}/.ci/docker/libtorch/Dockerfile" \
+        "${TOPDIR}/.ci/docker/"
+)
+GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
+if [[ "${WITH_PUSH}" == true ]]; then
+    (
+        set -x
+        ${DOCKER} push "${DOCKER_IMAGE}"
+        if [[ -n ${GITHUB_REF} ]]; then
+            ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
+            ${DOCKER} tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
+            ${DOCKER} push "${DOCKER_IMAGE_BRANCH_TAG}"
+            ${DOCKER} push "${DOCKER_IMAGE_SHA_TAG}"
+        fi
+    )
+fi
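The deleted side of both build scripts recovers the GPU toolchain version from the image tag rather than from environment variables; the awk pipeline behaves as the inline comments describe. A sketch with assumed example tags:

# Sketch: tag-prefix parsing from the deleted side (example inputs assumed).
image="manylinux2_28-builder:cuda12.8"
DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')   # -> cuda12.8
echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}'         # -> 12.8
echo "manylinux2_28-builder:rocm6.2.4" | awk -F':' '{print $2}' | awk -F'rocm' '{print $2}'   # -> 6.2.4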

View File

@@ -1,7 +1,7 @@
 #!/usr/bin/env bash
 # Script used only in CD pipeline
-set -exou pipefail
+set -eou pipefail
 TOPDIR=$(git rev-parse --show-toplevel)
@@ -9,110 +9,152 @@ image="$1"
 shift
 if [ -z "${image}" ]; then
-  echo "Usage: $0 IMAGE:ARCHTAG"
+  echo "Usage: $0 IMAGE"
   exit 1
 fi
-# Go from imagename:tag to tag
-DOCKER_TAG_PREFIX=$(echo "${image}" | awk -F':' '{print $2}')
-GPU_ARCH_VERSION=""
-if [[ "${DOCKER_TAG_PREFIX}" == cuda* ]]; then
-    # extract cuda version from image name. e.g. manylinux2_28-builder:cuda12.8 returns 12.8
-    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'cuda' '{print $2}')
-elif [[ "${DOCKER_TAG_PREFIX}" == rocm* ]]; then
-    # extract rocm version from image name. e.g. manylinux2_28-builder:rocm6.2.4 returns 6.2.4
-    GPU_ARCH_VERSION=$(echo "${DOCKER_TAG_PREFIX}" | awk -F'rocm' '{print $2}')
-fi
+DOCKER_IMAGE="pytorch/${image}"
+DOCKER_REGISTRY="${DOCKER_REGISTRY:-docker.io}"
+GPU_ARCH_TYPE=${GPU_ARCH_TYPE:-cpu}
+GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}
 MANY_LINUX_VERSION=${MANY_LINUX_VERSION:-}
 DOCKERFILE_SUFFIX=${DOCKERFILE_SUFFIX:-}
+WITH_PUSH=${WITH_PUSH:-}
-case ${image} in
-    manylinux2_28-builder:cpu)
+case ${GPU_ARCH_TYPE} in
+    cpu)
         TARGET=cpu_final
+        DOCKER_TAG=cpu
+        GPU_IMAGE=centos:7
+        DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
+        ;;
+    cpu-manylinux_2_28)
+        TARGET=cpu_final
+        DOCKER_TAG=cpu
         GPU_IMAGE=amd64/almalinux:8
         DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
         MANY_LINUX_VERSION="2_28"
         ;;
-    manylinuxaarch64-builder:cpu-aarch64)
+    cpu-aarch64)
         TARGET=final
+        DOCKER_TAG=cpu-aarch64
         GPU_IMAGE=arm64v8/centos:7
         DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=10"
         MANY_LINUX_VERSION="aarch64"
         ;;
-    manylinux2_28_aarch64-builder:cpu-aarch64)
+    cpu-aarch64-2_28)
         TARGET=final
+        DOCKER_TAG=cpu-aarch64
         GPU_IMAGE=arm64v8/almalinux:8
         DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11 --build-arg NINJA_VERSION=1.12.1"
         MANY_LINUX_VERSION="2_28_aarch64"
         ;;
-    manylinuxcxx11-abi-builder:cpu-cxx11-abi)
+    cpu-cxx11-abi)
         TARGET=final
+        DOCKER_TAG=cpu-cxx11-abi
         GPU_IMAGE=""
         DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=9"
         MANY_LINUX_VERSION="cxx11-abi"
         ;;
-    manylinuxs390x-builder:cpu-s390x)
+    cpu-s390x)
         TARGET=final
+        DOCKER_TAG=cpu-s390x
         GPU_IMAGE=s390x/almalinux:8
         DOCKER_GPU_BUILD_ARG=""
         MANY_LINUX_VERSION="s390x"
         ;;
-    manylinux2_28-builder:cuda*)
+    cuda)
         TARGET=cuda_final
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
+        # Keep this up to date with the minimum version of CUDA we currently support
+        GPU_IMAGE=centos:7
+        DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=9"
+        ;;
+    cuda-manylinux_2_28)
+        TARGET=cuda_final
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
         GPU_IMAGE=amd64/almalinux:8
         DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
         MANY_LINUX_VERSION="2_28"
         ;;
-    manylinuxaarch64-builder:cuda*)
+    cuda-aarch64)
         TARGET=cuda_final
+        DOCKER_TAG=cuda${GPU_ARCH_VERSION}
         GPU_IMAGE=arm64v8/centos:7
         DOCKER_GPU_BUILD_ARG="--build-arg BASE_CUDA_VERSION=${GPU_ARCH_VERSION} --build-arg DEVTOOLSET_VERSION=11"
         MANY_LINUX_VERSION="aarch64"
         DOCKERFILE_SUFFIX="_cuda_aarch64"
         ;;
-    manylinux2_28-builder:rocm*)
+    rocm|rocm-manylinux_2_28)
         TARGET=rocm_final
+        DOCKER_TAG=rocm${GPU_ARCH_VERSION}
         GPU_IMAGE=rocm/dev-centos-7:${GPU_ARCH_VERSION}-complete
         DEVTOOLSET_VERSION="9"
-        MANY_LINUX_VERSION="2_28"
-        DEVTOOLSET_VERSION="11"
-        GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
+        if [ ${GPU_ARCH_TYPE} == "rocm-manylinux_2_28" ]; then
+            MANY_LINUX_VERSION="2_28"
+            DEVTOOLSET_VERSION="11"
+            GPU_IMAGE=rocm/dev-almalinux-8:${GPU_ARCH_VERSION}-complete
+        fi
         PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
         DOCKER_GPU_BUILD_ARG="--build-arg ROCM_VERSION=${GPU_ARCH_VERSION} --build-arg PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} --build-arg DEVTOOLSET_VERSION=${DEVTOOLSET_VERSION}"
         ;;
-    manylinux2_28-builder:xpu)
+    xpu)
         TARGET=xpu_final
+        DOCKER_TAG=xpu
         GPU_IMAGE=amd64/almalinux:8
         DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
         MANY_LINUX_VERSION="2_28"
         ;;
     *)
-        echo "ERROR: Unrecognized image name: ${image}"
+        echo "ERROR: Unrecognized GPU_ARCH_TYPE: ${GPU_ARCH_TYPE}"
         exit 1
         ;;
 esac
+IMAGES=''
 if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then
     DOCKERFILE_SUFFIX=_${MANY_LINUX_VERSION}
 fi
-# Only activate this if in CI
-if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
-    # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
-    # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
-    sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
-    sudo systemctl daemon-reload
-    sudo systemctl restart docker
-fi
-tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
-DOCKER_BUILDKIT=1 docker build \
-    ${DOCKER_GPU_BUILD_ARG} \
-    --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
-    --target "${TARGET}" \
-    -t "${tmp_tag}" \
-    $@ \
-    -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
-    "${TOPDIR}/.ci/docker/"
+(
+    set -x
+    # Only activate this if in CI
+    if [ "$(uname -m)" != "s390x" ] && [ -v CI ]; then
+        # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
+        # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
+        sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
+        sudo systemctl daemon-reload
+        sudo systemctl restart docker
+    fi
+    DOCKER_BUILDKIT=1 docker build \
+        ${DOCKER_GPU_BUILD_ARG} \
+        --build-arg "GPU_IMAGE=${GPU_IMAGE}" \
+        --target "${TARGET}" \
+        -t "${DOCKER_IMAGE}" \
+        $@ \
+        -f "${TOPDIR}/.ci/docker/manywheel/Dockerfile${DOCKERFILE_SUFFIX}" \
+        "${TOPDIR}/.ci/docker/"
+)
+GITHUB_REF=${GITHUB_REF:-"dev"}
+GIT_BRANCH_NAME=${GITHUB_REF##*/}
+GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
+DOCKER_IMAGE_BRANCH_TAG=${DOCKER_IMAGE}-${GIT_BRANCH_NAME}
+DOCKER_IMAGE_SHA_TAG=${DOCKER_IMAGE}-${GIT_COMMIT_SHA}
+if [[ "${WITH_PUSH}" == true ]]; then
+    (
+        set -x
+        docker push "${DOCKER_IMAGE}"
+        if [[ -n ${GITHUB_REF} ]]; then
+            docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_BRANCH_TAG}
+            docker tag ${DOCKER_IMAGE} ${DOCKER_IMAGE_SHA_TAG}
+            docker push "${DOCKER_IMAGE_BRANCH_TAG}"
+            docker push "${DOCKER_IMAGE_SHA_TAG}"
+        fi
+    )
+fi

View File

@@ -1,20 +1,15 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
--e git+https://github.com/pytorch/pytorch_sphinx_theme.git@a98ffecb792d50df495be401becbf5c414421423#egg=pytorch_sphinx_theme2
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought is probably
 # something related to Docker setup. We can investigate this later
 sphinxcontrib.katex==0.8.6
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 0.8.6
-sphinxext-opengraph==0.9.1
-#Description: This is used to generate PyTorch docs
-#Pinned versions: 0.9.1
 matplotlib==3.5.3
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 3.5.3
@@ -51,6 +46,5 @@ myst-nb==0.17.2
 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
 python-etcd==0.4.5
 sphinx-copybutton==0.5.0
-sphinx-design==0.4.0
-sphinxcontrib-mermaid==1.0.0
+sphinx-panels==0.4.1
 myst-parser==0.18.1

View File

@@ -1,7 +1,7 @@
 SHELL=/usr/bin/env bash
 DOCKER_CMD ?= docker
-DESIRED_ROCM ?= 6.4
+DESIRED_ROCM ?= 6.3
 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
 PACKAGE_NAME = magma-rocm
 # inherit this from underlying docker image, do not pass this env var to docker
@@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 	magma-rocm/build_magma.sh
 .PHONY: all
-all: magma-rocm64
 all: magma-rocm63
 all: magma-rocm624
@@ -25,11 +24,6 @@ clean:
 	$(RM) -r magma-*
 	$(RM) -r output
-.PHONY: magma-rocm64
-magma-rocm64: DESIRED_ROCM := 6.4
-magma-rocm64:
-	$(DOCKER_RUN)
 .PHONY: magma-rocm63
 magma-rocm63: DESIRED_ROCM := 6.3
 magma-rocm63:
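Each magma-rocm target pins DESIRED_ROCM through a target-specific variable before launching the containerized build. A plausible invocation, assuming the Makefile lives under .ci/magma-rocm (the path is an assumption, not shown in this diff):

# Hypothetical usage of the Makefile above:
make -C .ci/magma-rocm magma-rocm63          # builds with target-specific DESIRED_ROCM := 6.3
make -C .ci/magma-rocm DOCKER_CMD=podman     # DOCKER_CMD ?= docker, so it can be overridden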

View File

@@ -301,18 +301,6 @@ else
 fi
 pip_install_whl "$(echo dist/*.whl)"
-if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
-    echo "Checking that xpu is compiled"
-    pushd dist/
-    if python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)'; then
-        echo "XPU support is compiled in."
-    else
-        echo "XPU support is NOT compiled in."
-        exit 1
-    fi
-    popd
-fi
 # TODO: I'm not sure why, but somehow we lose verbose commands
 set -x
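The deleted check relies on a python -c one-liner whose exit status drives the shell branch. The same probe pattern works for other backends; a sketch using torch.cuda.is_available(), a public API:

# Sketch: exit-code probe pattern, analogous to the deleted XPU check.
if python -c 'import torch; exit(0 if torch.cuda.is_available() else 1)'; then
    echo "CUDA runtime is usable."
else
    echo "CUDA runtime is NOT usable."
fi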

View File

@@ -216,14 +216,6 @@ else
   fi
 fi
-###############################################################################
-# Check XPU configured correctly
-###############################################################################
-if [[ "$DESIRED_CUDA" == 'xpu' && "$PACKAGE_TYPE" != 'libtorch' ]]; then
-    echo "Checking that xpu is compiled"
-    python -c 'import torch; exit(0 if torch.xpu._is_compiled() else 1)'
-fi
 ###############################################################################
 # Check CUDA configured correctly
 ###############################################################################

View File

@@ -34,14 +34,11 @@ if which sccache > /dev/null; then
 fi
 print_cmake_info
-if [[ ${BUILD_ENVIRONMENT} == *"distributed"* ]]; then
-    # Needed for inductor benchmarks, as lots of HF networks make `torch.distributed` calls
-    USE_DISTRIBUTED=1 USE_OPENMP=1 WERROR=1 python setup.py bdist_wheel
-else
-    # Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
-    # that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
-    USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
-fi
+# Explicitly set USE_DISTRIBUTED=0 to align with the default build config on mac. This also serves as the sole CI config that tests
+# that building with USE_DISTRIBUTED=0 works at all. See https://github.com/pytorch/pytorch/issues/86448
+USE_DISTRIBUTED=0 USE_OPENMP=1 MACOSX_DEPLOYMENT_TARGET=11.0 WERROR=1 BUILD_TEST=OFF USE_PYTORCH_METAL=1 python setup.py bdist_wheel
 if which sccache > /dev/null; then
   print_sccache_stats
 fi
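Whichever branch builds the wheel, the effect of USE_DISTRIBUTED can be confirmed after installation; torch.distributed.is_available() is a stable public API:

# Sketch: verify whether the installed wheel was built with distributed support.
python -c "import torch.distributed as dist; print(dist.is_available())"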

View File

@@ -221,39 +221,25 @@ test_torchbench_smoketest() {
   TEST_REPORTS_DIR=$(pwd)/test/test-reports
   mkdir -p "$TEST_REPORTS_DIR"
+  local backend=eager
+  local dtype=notset
   local device=mps
-  local models=(hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152 sam pytorch_unet stable_diffusion_text_encoder moco speech_transformer)
-  for backend in eager inductor; do
-    for dtype in notset float16 bfloat16; do
-      echo "Launching torchbench inference performance run for backend ${backend} and dtype ${dtype}"
-      local dtype_arg="--${dtype}"
-      if [ "$dtype" == notset ]; then
-        dtype_arg="--float32"
-      fi
-      touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
-      for model in "${models[@]}"; do
-        PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-          --performance --only "$model" --backend "$backend" --inference --devices "$device" "$dtype_arg" \
-          --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv" || true
-      done
-    done
-    for dtype in notset amp; do
-      echo "Launching torchbench training performance run for backend ${backend} and dtype ${dtype}"
-      touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
-      local dtype_arg="--${dtype}"
-      if [ "$dtype" == notset ]; then
-        dtype_arg="--float32"
-      fi
-      for model in "${models[@]}"; do
-        PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
-          --performance --only "$model" --backend "$backend" --training --devices "$device" "$dtype_arg" \
-          --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv" || true
-      done
-    done
+  touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
+  touch "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
+  echo "Setup complete, launching torchbench training performance run"
+  for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
+    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
+      --performance --only "$model" --backend "$backend" --training --devices "$device" \
+      --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_training_${device}_performance.csv"
+  done
+  echo "Launching torchbench inference performance run"
+  for model in hf_T5 llama BERT_pytorch dcgan hf_GPT2 yolov3 resnet152; do
+    PYTHONPATH="$(pwd)"/torchbench python benchmarks/dynamo/torchbench.py \
+      --performance --only "$model" --backend "$backend" --inference --devices "$device" \
+      --output "$TEST_REPORTS_DIR/inductor_${backend}_torchbench_${dtype}_inference_${device}_performance.csv"
   done
   echo "Pytorch benchmark on mps device completed"

View File

@@ -119,6 +119,12 @@ popd
 git rm -rf "$install_path" || true
 mv "$pt_checkout/docs/build/html" "$install_path"
+# Prevent Google from indexing $install_path/_modules. This folder contains
+# generated source files.
+# NB: the following only works on gnu sed. The sed shipped with mac os is different.
+# One can `brew install gnu-sed` on a mac and then use "gsed" instead of "sed".
+find "$install_path/_modules" -name "*.html" -print0 | xargs -0 sed -i '/<head>/a \ \ <meta name="robots" content="noindex">'
 git add "$install_path" || true
 git status
 git config user.email "soumith+bot@pytorch.org"
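The added find/sed pipeline uses GNU sed's a (append) command to insert a line after each <head> match. A throwaway sketch of the same edit on a scratch file:

# Sketch: GNU sed append-after-match, as used above (scratch file assumed).
printf '<html>\n<head>\n</head>\n</html>\n' > /tmp/page.html
sed -i '/<head>/a \ \ <meta name="robots" content="noindex">' /tmp/page.html
cat /tmp/page.html   # <meta name="robots" content="noindex"> now follows <head>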

View File

@@ -0,0 +1,22 @@
#!/bin/bash
set -eux -o pipefail
source "${BINARY_ENV_FILE:-/c/w/env}"
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
export USE_SCCACHE=1
export SCCACHE_IGNORE_SERVER_IO_ERROR=1
echo "Free space on filesystem before build:"
df -h
export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT"
if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
pytorch/.ci/pytorch/windows/arm64/build_libtorch.bat
elif [[ "$PACKAGE_TYPE" == 'wheel' ]]; then
pytorch/.ci/pytorch/windows/arm64/build_pytorch.bat
fi
echo "Free space on filesystem after build:"
df -h

View File

@@ -0,0 +1,6 @@
#!/bin/bash
set -eux -o pipefail
source "${BINARY_ENV_FILE:-/c/w/env}"
pytorch/.ci/pytorch/windows/arm64/smoke_test.bat

View File

@@ -4,13 +4,11 @@ set -eux -o pipefail
 source "${BINARY_ENV_FILE:-/c/w/env}"
 mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
-if [[ "$OS" != "windows-arm64" ]]; then
-    export CUDA_VERSION="${DESIRED_CUDA/cu/}"
-    export USE_SCCACHE=1
-    export SCCACHE_BUCKET=ossci-compiler-cache
-    export SCCACHE_IGNORE_SERVER_IO_ERROR=1
-    export VC_YEAR=2022
-fi
+export CUDA_VERSION="${DESIRED_CUDA/cu/}"
+export USE_SCCACHE=1
+export SCCACHE_BUCKET=ossci-compiler-cache
+export SCCACHE_IGNORE_SERVER_IO_ERROR=1
+export VC_YEAR=2022
 if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
     export USE_SCCACHE=0
@@ -23,16 +21,7 @@ df -h
 pushd "$PYTORCH_ROOT/.ci/pytorch/"
 export NIGHTLIES_PYTORCH_ROOT="$PYTORCH_ROOT"
-if [[ "$OS" == "windows-arm64" ]]; then
-    if [[ "$PACKAGE_TYPE" == 'libtorch' ]]; then
-        ./windows/arm64/build_libtorch.bat
-    elif [[ "$PACKAGE_TYPE" == 'wheel' ]]; then
-        ./windows/arm64/build_pytorch.bat
-    fi
-else
-    ./windows/internal/build_wheels.bat
-fi
+./windows/internal/build_wheels.bat
 echo "Free space on filesystem after build:"
 df -h

View File

@@ -11,11 +11,6 @@ if [[ "$DESIRED_CUDA" == 'xpu' ]]; then
 fi
 pushd "$PYTORCH_ROOT/.ci/pytorch/"
-if [[ "$OS" == "windows-arm64" ]]; then
-    ./windows/arm64/smoke_test.bat
-else
-    ./windows/internal/smoke_test.bat
-fi
+./windows/internal/smoke_test.bat
 popd

View File

@@ -45,14 +45,10 @@ self-hosted-runner:
   - windows.g5.4xlarge.nvidia.gpu
   # Windows ARM64 runners
   - windows-11-arm64
-  # Organization-wide AMD-hosted runners
-  # MI2xx runners
+  # Organization-wide AMD hosted runners
   - linux.rocm.gpu
   - linux.rocm.gpu.2
   - linux.rocm.gpu.4
-  # MI300 runners
-  - linux.rocm.gpu.mi300.2
-  - linux.rocm.gpu.mi300.4
   - rocm-docker
   # Repo-specific Apple hosted runners
   - macos-m1-ultra

View File

@@ -1,70 +0,0 @@
name: Binary docker build
description: Build docker image for binary builds
inputs:
docker-image-name:
description: Docker image name for PR builds
required: true
docker-build-dir:
description: Location of the build.sh relative to .ci/docker
required: true
custom-tag-prefix:
description: Custom tag prefix for the docker image
required: false
DOCKER_TOKEN:
description: Docker token for authentication
required: true
DOCKER_ID:
description: Docker ID for authentication
required: true
runs:
using: composite
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: ${{ inputs.docker-image-name }}
docker-build-dir: .ci/docker
custom-tag-prefix: ${{ inputs.custom-tag-prefix }}
docker-build-script: ${{ inputs.docker-build-dir }}/build.sh
always-rebuild: true
push: true
- name: Tag and (if WITH_PUSH) push docker image to docker.io
env:
DOCKER_TOKEN: ${{ inputs.DOCKER_TOKEN }}
DOCKER_ID: ${{ inputs.DOCKER_ID }}
DOCKER_IMAGE_NAME: ${{ inputs.docker-image-name }}
DOCKER_IMAGE_PREFIX: ${{ inputs.custom-tag-prefix }}
CREATED_FULL_DOCKER_IMAGE_NAME: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
run: |
set -euox pipefail
GITHUB_REF=${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}
GIT_BRANCH_NAME=${GITHUB_REF##*/}
GIT_COMMIT_SHA=${GITHUB_SHA:-$(git rev-parse HEAD)}
CI_FOLDER_SHA=$(git rev-parse HEAD:.ci/docker)
DOCKER_IMAGE_NAME_PREFIX=docker.io/pytorch/${DOCKER_IMAGE_NAME}:${DOCKER_IMAGE_PREFIX}
docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}
docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}
docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}
docker tag ${CREATED_FULL_DOCKER_IMAGE_NAME} ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}
# Pretty sure Github will mask tokens and I'm not sure if it will even be
# printed due to pipe, but just in case
set +x
if [[ ${WITH_PUSH:-false} == "true" ]]; then
echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
docker push ${DOCKER_IMAGE_NAME_PREFIX}
docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}
docker push ${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}
docker push ${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}
fi

.github/labeler.yml
View File

@@ -112,22 +112,3 @@
 - torch/csrc/inductor/aoti_include/xpu.h
 - torch/csrc/inductor/cpp_wrapper/device_internal/xpu.h
 - torch/csrc/inductor/cpp_wrapper/xpu.h
-"release notes: inductor (aoti)":
-- torch/_C/_aoti.pyi
-- torch/_dynamo/repro/aoti.py
-- torch/_export/serde/aoti_schema.py
-- torch/_higher_order_ops/aoti_call_delegate.py
-- torch/_inductor/codegen/aoti_runtime/**
-- torch/_inductor/codegen/aoti_hipify_utils.py
-- torch/_inductor/codegen/cpp_wrapper_cpu.py
-- torch/_inductor/codegen/cpp_wrapper_gpu.py
-- torch/_inductor/aoti_eager.py
-- torch/csrc/inductor/aoti_runtime/**
-- torch/csrc/inductor/aoti_torch/**
-- torch/csrc/inductor/aoti_runner/**
-- torch/csrc/inductor/aoti_eager/**
-- torch/csrc/inductor/aoti_package/**
-- torch/csrc/inductor/aoti_include/**
-- torchgen/aoti/**
-- torchgen/gen_aoti_c_shim.py

View File

@@ -16,7 +16,6 @@ ciflow_push_tags:
 - ciflow/mps
 - ciflow/nightly
 - ciflow/periodic
-- ciflow/periodic-rocm-mi300
 - ciflow/rocm
 - ciflow/rocm-mi300
 - ciflow/s390

View File

@@ -30,7 +30,7 @@ CUDA_ARCHES_CUDNN_VERSION = {
 }
 # NOTE: Also update the ROCm sources in tools/nightly.py when changing this list
-ROCM_ARCHES = ["6.3", "6.4"]
+ROCM_ARCHES = ["6.2.4", "6.3"]
 XPU_ARCHES = ["xpu"]
@@ -173,7 +173,7 @@ WHEEL_CONTAINER_IMAGES = {
     "xpu": f"pytorch/manylinux2_28-builder:xpu-{DEFAULT_TAG}",
     "cpu": f"pytorch/manylinux2_28-builder:cpu-{DEFAULT_TAG}",
     "cpu-aarch64": f"pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
-    "cpu-s390x": "pytorch/manylinuxs390x-builder:cpu-s390x",
+    "cpu-s390x": f"pytorch/manylinuxs390x-builder:cpu-s390x-{DEFAULT_TAG}",
 }
 RELEASE = "release"

View File

@@ -227,6 +227,42 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
             isolated_workflow=True,
         ),
     ),
+]
+
+WINDOWS_BINARY_SMOKE_WORKFLOWS = [
+    BinaryBuildWorkflow(
+        os=OperatingSystem.WINDOWS,
+        package_type="libtorch",
+        build_variant=generate_binary_build_matrix.RELEASE,
+        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
+            OperatingSystem.WINDOWS,
+            generate_binary_build_matrix.RELEASE,
+            arches=["cpu"],
+            libtorch_variants=["shared-with-deps"],
+        ),
+        branches="main",
+        ciflow_config=CIFlowConfig(
+            isolated_workflow=True,
+        ),
+    ),
+    BinaryBuildWorkflow(
+        os=OperatingSystem.WINDOWS,
+        package_type="libtorch",
+        build_variant=generate_binary_build_matrix.DEBUG,
+        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
+            OperatingSystem.WINDOWS,
+            generate_binary_build_matrix.DEBUG,
+            arches=["cpu"],
+            libtorch_variants=["shared-with-deps"],
+        ),
+        branches="main",
+        ciflow_config=CIFlowConfig(
+            isolated_workflow=True,
+        ),
+    ),
+]
+
+WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS = [
     BinaryBuildWorkflow(
         os=OperatingSystem.WINDOWS_ARM64,
         package_type="wheel",
@@ -272,39 +308,6 @@ WINDOWS_BINARY_BUILD_WORKFLOWS = [
         ),
     ),
 ]
-
-WINDOWS_BINARY_SMOKE_WORKFLOWS = [
-    BinaryBuildWorkflow(
-        os=OperatingSystem.WINDOWS,
-        package_type="libtorch",
-        build_variant=generate_binary_build_matrix.RELEASE,
-        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
-            OperatingSystem.WINDOWS,
-            generate_binary_build_matrix.RELEASE,
-            arches=["cpu"],
-            libtorch_variants=["shared-with-deps"],
-        ),
-        branches="main",
-        ciflow_config=CIFlowConfig(
-            isolated_workflow=True,
-        ),
-    ),
-    BinaryBuildWorkflow(
-        os=OperatingSystem.WINDOWS,
-        package_type="libtorch",
-        build_variant=generate_binary_build_matrix.DEBUG,
-        build_configs=generate_binary_build_matrix.generate_libtorch_matrix(
-            OperatingSystem.WINDOWS,
-            generate_binary_build_matrix.DEBUG,
-            arches=["cpu"],
-            libtorch_variants=["shared-with-deps"],
-        ),
-        branches="main",
-        ciflow_config=CIFlowConfig(
-            isolated_workflow=True,
-        ),
-    ),
-]
 MACOS_BINARY_BUILD_WORKFLOWS = [
     BinaryBuildWorkflow(
         os=OperatingSystem.MACOS_ARM64,
@@ -399,6 +402,10 @@ def main() -> None:
             jinja_env.get_template("windows_binary_build_workflow.yml.j2"),
             WINDOWS_BINARY_SMOKE_WORKFLOWS,
         ),
+        (
+            jinja_env.get_template("windows_arm64_binary_build_workflow.yml.j2"),
+            WINDOWS_ARM64_BINARY_BUILD_WORKFLOWS,
+        ),
         (
             jinja_env.get_template("macos_binary_build_workflow.yml.j2"),
             MACOS_BINARY_BUILD_WORKFLOWS,

View File

@@ -434,7 +434,7 @@ query ($owner: String!, $name: String!) {
 RE_GHSTACK_HEAD_REF = re.compile(r"^(gh/[^/]+/[0-9]+/)head$")
 RE_GHSTACK_DESC = re.compile(r"Stack.*:\r?\n(\* [^\r\n]+\r?\n)+", re.MULTILINE)
 RE_PULL_REQUEST_RESOLVED = re.compile(
-    r"(Pull Request resolved|Pull-Request-resolved): "
+    r"Pull Request resolved: "
    r"https://github.com/(?P<owner>[^/]+)/(?P<repo>[^/]+)/pull/(?P<number>[0-9]+)",
     re.MULTILINE,
 )
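The left-hand pattern accepts both the spaced and the hyphenated commit trailer; the right-hand one only the spaced form. A quick sketch of the difference with grep -E:

# Sketch: what the broader (left-hand) alternation matches.
printf 'Pull Request resolved: https://github.com/pytorch/pytorch/pull/1\n' \
    | grep -E '(Pull Request resolved|Pull-Request-resolved): '    # matches
printf 'Pull-Request-resolved: https://github.com/pytorch/pytorch/pull/1\n' \
    | grep -E '(Pull Request resolved|Pull-Request-resolved): '    # also matches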

View File

@@ -0,0 +1,197 @@
{% import 'common.yml.j2' as common %}
{% import 'upload.yml.j2' as upload %}
{%- block name -%}
# Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2
# Generation script: .github/scripts/generate_ci_workflows.py
name: !{{ build_environment }}
{%- endblock %}
{%- macro set_runner_specific_vars() -%}
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: cmd
run: |
echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
{%- endmacro %}
on:
push:
branches:
- !{{ branches }}
{%- if branches == "nightly" %}
tags:
# NOTE: Binary build pipelines should only get triggered on release candidate builds
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
{%- endif %}
{%- for label in ciflow_config.labels | sort %}
{%- if loop.first and branches != "nightly" %}
tags:
{%- endif %}
- '!{{ label }}/*'
{%- endfor %}
workflow_dispatch:
env:
BUILD_ENVIRONMENT: !{{ build_environment }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SKIP_ALL_TESTS: 1
PYTORCH_ROOT: /pytorch
DOWNLOADS_DIR: c:\temp\downloads
DEPENDENCIES_DIR: c:\temp\dependencies
ENABLE_APL: 1
ENABLE_OPENBLAS: 0
MSVC_VERSION : 14.42
AWS_DEFAULT_REGION: us-east-1
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
{%- for config in build_configs %}
!{{ config["build_name"] }}-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "windows-11-arm64"
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config, True) }}
{%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
{%- endif %}
steps:
!{{ set_runner_specific_vars() }}
- name: Bootstrap folders
shell: cmd
run: |
mkdir "%NIGHTLIES_PYTORCH_ROOT%"
mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
- name: Bootstrap Build Tools
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
- name: Bootstrap Git
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
- name: Remove Pytorch folder
shell: cmd
run: |
rmdir /s /q "pytorch"
- name: Git checkout PyTorch - recursive
uses: actions/checkout@v4
with:
path: "pytorch"
submodules: recursive
- name: Bootstrap Python
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
- name: Bootstrap APL
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
- name: Bootstrap Rust
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
- name: Bootstrap sccache
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"
- name: Bootstrap Libuv
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"
- name: Populate binary env
shell: bash
run: |
"pytorch/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
- uses: !{{ common.upload_artifact_action }}
if: always()
with:
name: !{{ config["build_name"] }}
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
!{{ config["build_name"] }}-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- !{{ config["build_name"] }}-build
- get-label-type
runs-on: "windows-11-arm64"
timeout-minutes: !{{ common.timeout_minutes }}
!{{ upload.binary_env(config, True) }}
steps:
!{{ set_runner_specific_vars() }}
- uses: !{{ common.download_artifact_action }}
name: Download Build Artifacts
with:
name: !{{ config["build_name"] }}
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
- name: Bootstrap Git
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
- name: Remove Pytorch folder
shell: cmd
run: |
rmdir /s /q "pytorch"
- name: Git checkout PyTorch
uses: actions/checkout@v4
with:
path: "pytorch"
submodules: recursive
- name: Bootstrap APL
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
- name: Bootstrap Python
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
- name: Bootstrap Build Tools
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
- name: Bootstrap Rust
shell: cmd
run: |
"pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
- name: Populate binary env
shell: bash
run: |
"pytorch/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
{%- if branches == "nightly" %}
!{{ upload.upload_binaries(config, True) }}
{%- endif %}
{%- endfor %}

View File

@@ -49,15 +49,6 @@ env:
   PR_NUMBER: ${{ github.event.pull_request.number }}
   SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
   SKIP_ALL_TESTS: 1
-  OS: !{{ os }}
-{%- if os == "windows-arm64" %}
-  PYTORCH_ROOT: /pytorch
-  DOWNLOADS_DIR: c:\temp\downloads
-  DEPENDENCIES_DIR: c:\temp\dependencies
-  ENABLE_APL: 1
-  ENABLE_OPENBLAS: 0
-  MSVC_VERSION : 14.42
-{%- endif %}
 !{{ common.concurrency(build_environment) }}
 jobs:
@@ -75,79 +66,20 @@ jobs:
   !{{ config["build_name"] }}-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     needs: get-label-type
-{%- if os == "windows-arm64" %}
-    runs-on: "windows-11-arm64"
-{%- else %}
 {%- if branches == "nightly" %}
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
 {%- else %}
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
 {%- endif %}
-{%- endif %}
     timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
     !{{ upload.binary_env(config, True) }}
 {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0 %}
       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }}
 {%- endif %}
     steps:
-{%- if os == "windows-arm64" %}
-      - name: Populate binary env
-        shell: cmd
-        run: |
-          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
-          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
-          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
-      - name: Bootstrap folders
-        shell: cmd
-        run: |
-          mkdir "%NIGHTLIES_PYTORCH_ROOT%"
-          mkdir "%PYTORCH_FINAL_PACKAGE_DIR%"
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
-      - name: Bootstrap Git
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
-      - name: Git checkout PyTorch - recursive
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-          submodules: recursive
-      - name: Bootstrap Python
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
-      - name: Bootstrap APL
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
-      - name: Bootstrap Rust
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
-      - name: Bootstrap sccache
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_sccache.bat"
-      - name: Bootstrap Libuv
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_libuv.bat"
-{%- else %}
-      !{{ set_runner_specific_vars() }}
       !{{ common.setup_ec2_windows() }}
+      !{{ set_runner_specific_vars() }}
       !{{ common.checkout(deep_clone=False, directory="pytorch") }}
-{%- endif %}
       - name: Populate binary env
         shell: bash
         run: |
@@ -163,17 +95,12 @@ jobs:
           retention-days: 14
           if-no-files-found: error
          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
-{%- if os != "windows-arm64" %}
       !{{ common.wait_and_kill_ssh_windows('pytorch') }}
-{% endif %}
   !{{ config["build_name"] }}-test:  # Testing
     if: ${{ github.repository_owner == 'pytorch' }}
     needs:
       - !{{ config["build_name"] }}-build
       - get-label-type
-{%- if os == "windows-arm64" %}
-    runs-on: "windows-11-arm64"
-{%- else %}
 {%- if config["gpu_arch_type"] == "cuda" %}
 {%- if branches == "nightly" %}
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
@@ -186,61 +113,18 @@ jobs:
 {%- else %}
     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
 {%- endif %}
-{%- endif %}
 {%- endif %}
     timeout-minutes: !{{ common.timeout_minutes_windows_binary }}
     !{{ upload.binary_env(config, True) }}
     steps:
-{%- if os == "windows-arm64" %}
-      - name: Populate binary env
-        shell: cmd
-        run: |
-          echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
-          echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
-          echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-      - name: Populate binary env
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
-      - name: Remove Pytorch folder
-        shell: cmd
-        run: |
-          rmdir /s /q "pytorch"
-      - name: Git checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          path: "pytorch"
-          submodules: recursive
-      - name: Bootstrap APL
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_apl.bat"
-      - name: Bootstrap Python
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_python.bat"
-      - name: Bootstrap Build Tools
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_buildtools.bat"
-      - name: Bootstrap Rust
-        shell: cmd
-        run: |
-          "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
-{%- else %}
       !{{ common.setup_ec2_windows() }}
-      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
       !{{ set_runner_specific_vars() }}
-{%- endif %}
       - uses: !{{ common.download_artifact_action }}
         name: Download Build Artifacts
        with:
           name: !{{ config["build_name"] }}
           path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+      !{{ common.checkout(deep_clone=False, directory="pytorch") }}
       - name: Populate binary env
         shell: bash
         run: |
@@ -249,10 +133,8 @@ jobs:
         shell: bash
         run: |
           "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
-{%- if os != "windows-arm64" %}
       !{{ common.wait_and_kill_ssh_windows('pytorch') }}
-{%- endif %}
 {%- if branches == "nightly" %}
 !{{ upload.upload_binaries(config, True) }}
 {%- endif %}
 {%- endfor %}

View File

@@ -33,10 +33,6 @@ on:
         default: "linux.large"
         description: Runner type
-permissions:
-  id-token: write
-  contents: read
 env:
   GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@@ -84,13 +80,6 @@ jobs:
       - name: Setup Linux
         uses: ./.github/actions/setup-linux
-      - name: Configure AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
-          role-session-name: gha-bazel-build
-          aws-region: us-east-1
       - name: Calculate docker image
         id: calculate-docker-image
         uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
@@ -213,13 +202,6 @@ jobs:
         uses: ./.github/actions/chown-workspace
         if: always()
-      - name: Configure AWS Credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_pytorch_artifacts
-          role-session-name: gha-bazel-build-upload-artifacts
-          aws-region: us-east-1
       - name: Upload test artifacts
         uses: ./.github/actions/upload-test-artifacts
         if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'

View File

@@ -38,11 +38,6 @@ on:
         required: false
         type: boolean
         default: true
-    secrets:
-      HUGGING_FACE_HUB_TOKEN:
-        required: false
-        description: |
-          HF Auth token to avoid rate limits when downloading models or datasets from hub
 jobs:
   test:
@@ -171,7 +166,6 @@ jobs:
           JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
           JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
           REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
-          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
         run: |
           # shellcheck disable=SC1090
           set -ex

View File

@@ -47,10 +47,6 @@ on:
       type: boolean
       default: true
-permissions:
-  id-token: write
-  contents: read
 env:
   GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}

View File

@@ -11,14 +11,14 @@ on:
       # Release candidate tags look like: v1.11.0-rc1
       - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
     paths:
-      - .ci/docker/**
+      - '.ci/docker/almalinux/*'
+      - '.ci/docker/common/*'
       - .github/workflows/build-almalinux-images.yml
-      - .github/actions/binary-docker-build/**
   pull_request:
     paths:
-      - .ci/docker/**
+      - '.ci/docker/almalinux/*'
+      - '.ci/docker/common/*'
       - .github/workflows/build-almalinux-images.yml
-      - .github/actions/binary-docker-build/**
 env:
   DOCKER_REGISTRY: "docker.io"
@@ -37,12 +37,37 @@ jobs:
     strategy:
       matrix:
         cuda_version: ["11.8", "12.4", "12.6", "cpu"]
+    env:
+      CUDA_VERSION: ${{ matrix.cuda_version }}
     steps:
-      - name: Build docker image
-        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
         with:
-          docker-image-name: almalinux-builder
-          custom-tag-prefix: ${{ matrix.cuda_version != 'cpu' && 'cuda' || '' }}${{matrix.cuda_version}}
-          docker-build-dir: almalinux
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: almalinux-builder${{ matrix.cuda_version == 'cpu' && '-' || '-cuda' }}${{matrix.cuda_version}}
+          docker-build-dir: .ci/docker/almalinux
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        env:
           DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
           DOCKER_ID: ${{ secrets.DOCKER_ID }}
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/almalinux/build.sh almalinux-builder${{ matrix.cuda_version == 'cpu' && ':' || ':cuda' }}${{matrix.cuda_version}}

View File

@ -10,14 +10,14 @@ on:
# Release candidate tags look like: v1.11.0-rc1 # Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
paths: paths:
- .ci/docker/** - '.ci/docker/libtorch/*'
- '.ci/docker/common/*'
- .github/workflows/build-libtorch-images.yml - .github/workflows/build-libtorch-images.yml
- .github/actions/binary-docker-build/**
pull_request: pull_request:
paths: paths:
- .ci/docker/** - '.ci/docker/libtorch/*'
- '.ci/docker/common/*'
- .github/workflows/build-libtorch-images.yml - .github/workflows/build-libtorch-images.yml
- .github/actions/binary-docker-build/**
env: env:
DOCKER_REGISTRY: "docker.io" DOCKER_REGISTRY: "docker.io"
@@ -39,29 +39,123 @@ jobs:
       curr_branch: ${{ github.head_ref || github.ref_name }}
       curr_ref_type: ${{ github.ref_type }}

-  build:
+  build-docker-cuda:
     environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
     needs: get-label-type
-    runs-on: ${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral
-    name: libtorch-cxx11-builder:${{ matrix.tag }}
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
     strategy:
-      fail-fast: false
       matrix:
-        include: [
-          { tag: "cuda12.8" },
-          { tag: "cuda12.6" },
-          { tag: "cuda12.4" },
-          { tag: "cuda11.8" },
-          { tag: "rocm6.3" },
-          { tag: "rocm6.4" },
-          { tag: "cpu" },
-        ]
+        cuda_version: ["12.8", "12.6", "12.4", "11.8"]
+    env:
+      GPU_ARCH_TYPE: cuda
+      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
     steps:
-      - name: Build docker image
-        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
         with:
-          docker-image-name: libtorch-cxx11-builder
-          custom-tag-prefix: ${{ matrix.tag }}
-          docker-build-dir: libtorch
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: libtorch-cxx11-builder-cuda${{matrix.cuda_version}}
+          docker-build-dir: .ci/docker/libtorch
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        env:
           DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
           DOCKER_ID: ${{ secrets.DOCKER_ID }}
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cuda${{matrix.cuda_version}}
+  build-docker-rocm:
+    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
+    strategy:
+      matrix:
+        rocm_version: ["6.2.4", "6.3"]
+    env:
+      GPU_ARCH_TYPE: rocm
+      GPU_ARCH_VERSION: ${{ matrix.rocm_version }}
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: libtorch-cxx11-builder-rocm${{matrix.rocm_version}}
+          docker-build-dir: .ci/docker/libtorch
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        env:
+          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+          DOCKER_ID: ${{ secrets.DOCKER_ID }}
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/libtorch/build.sh libtorch-cxx11-builder:rocm${{matrix.rocm_version}}
+  build-docker-cpu:
+    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: libtorch-cxx11-builder-cpu
+          docker-build-dir: .ci/docker/libtorch
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        env:
+          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+          DOCKER_ID: ${{ secrets.DOCKER_ID }}
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/libtorch/build.sh libtorch-cxx11-builder:cpu
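
The three jobs above differ only in the tag handed to the same `.ci/docker/libtorch/build.sh`. A hedged sketch of reproducing the full fan-out locally (the tag list is collected from the matrices above; sequential, unlike the parallel CI jobs):

    #!/usr/bin/env bash
    # Drive the libtorch builder matrix sequentially, one build.sh call per tag.
    set -euo pipefail

    tags=(cuda12.8 cuda12.6 cuda12.4 cuda11.8 rocm6.2.4 rocm6.3 cpu)
    for tag in "${tags[@]}"; do
      .ci/docker/libtorch/build.sh "libtorch-cxx11-builder:${tag}"
    done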

View File

@@ -34,7 +34,7 @@ jobs:
       id-token: write
     strategy:
       matrix:
-        rocm_version: ["64", "63"]
+        rocm_version: ["63", "624"]
     steps:
       - name: Checkout PyTorch
         uses: actions/checkout@v4

View File

@@ -11,11 +11,15 @@ on:
       # Release candidate tags look like: v1.11.0-rc1
       - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
     paths:
-      - .ci/docker/**
+      - '.ci/docker/manywheel/*'
+      - '.ci/docker/manywheel/build_scripts/*'
+      - '.ci/docker/common/*'
       - .github/workflows/build-manywheel-images-s390x.yml
   pull_request:
     paths:
-      - .ci/docker/**
+      - '.ci/docker/manywheel/*'
+      - '.ci/docker/manywheel/build_scripts/*'
+      - '.ci/docker/common/*'
       - .github/workflows/build-manywheel-images-s390x.yml
@@ -33,45 +37,26 @@ jobs:
     if: github.repository_owner == 'pytorch'
     environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
     runs-on: linux.s390x
+    env:
+      GPU_ARCH_TYPE: cpu-s390x
     steps:
       - name: Checkout PyTorch
         uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
         with:
           submodules: false
           no-sudo: true
-      - name: Build Docker Image
-        run: |
-          .ci/docker/manywheel/build.sh manylinuxs390x-builder:cpu-s390x -t manylinuxs390x-builder:cpu-s390x
-      - name: Tag and (if WITH_PUSH) push docker image to docker.io
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
         env:
           DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
           DOCKER_ID: ${{ secrets.DOCKER_ID }}
-          CREATED_FULL_DOCKER_IMAGE_NAME: manylinuxs390x-builder:cpu-s390x
-        shell: bash
         run: |
-          set -euox pipefail
-          GITHUB_REF="${GITHUB_REF:-$(git symbolic-ref -q HEAD || git describe --tags --exact-match)}"
-          GIT_BRANCH_NAME="${GITHUB_REF##*/}"
-          GIT_COMMIT_SHA="${GITHUB_SHA:-$(git rev-parse HEAD)}"
-          CI_FOLDER_SHA="$(git rev-parse HEAD:.ci/docker)"
-          DOCKER_IMAGE_NAME_PREFIX="docker.io/pytorch/${CREATED_FULL_DOCKER_IMAGE_NAME}"
-          docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}"
-          docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}"
-          docker tag "${CREATED_FULL_DOCKER_IMAGE_NAME}" "${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}"
-          # Pretty sure GitHub will mask tokens, and I'm not sure it will even be
-          # printed due to the pipe, but just in case
-          set +x
-          if [[ "${WITH_PUSH:-false}" == "true" ]]; then
+          if [[ "${WITH_PUSH}" == true ]]; then
             echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
-            docker push "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_BRANCH_NAME}"
-            docker push "${DOCKER_IMAGE_NAME_PREFIX}-${GIT_COMMIT_SHA}"
-            docker push "${DOCKER_IMAGE_NAME_PREFIX}-${CI_FOLDER_SHA}"
           fi
+      - name: Build Docker Image
+        run: |
+          .ci/docker/manywheel/build.sh manylinuxs390x-builder:cpu-s390x
       - name: Cleanup docker
         if: cancelled()
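
The deleted tag-and-push step above published one locally built image under three suffixes: the branch name, the commit SHA, and the SHA of the `.ci/docker` tree (which only changes when the Docker sources change). A condensed sketch of that scheme, assuming the same environment:

    #!/usr/bin/env bash
    # Tag the s390x builder image by branch, commit, and .ci/docker folder SHA, then push.
    set -euo pipefail

    image="manylinuxs390x-builder:cpu-s390x"
    prefix="docker.io/pytorch/${image}"

    branch="${GITHUB_REF##*/}"
    commit_sha="$(git rev-parse HEAD)"
    ci_folder_sha="$(git rev-parse HEAD:.ci/docker)"  # stable across unrelated commits

    for suffix in "${branch}" "${commit_sha}" "${ci_folder_sha}"; do
      docker tag "${image}" "${prefix}-${suffix}"
      docker push "${prefix}-${suffix}"
    done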

View File

@@ -11,14 +11,17 @@ on:
       # Release candidate tags look like: v1.11.0-rc1
       - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
     paths:
-      - .ci/docker/**
+      - '.ci/docker/common/*'
+      - '.ci/docker/manywheel/*'
+      - '.ci/docker/manywheel/build_scripts/*'
       - .github/workflows/build-manywheel-images.yml
-      - .github/actions/binary-docker-build/**
   pull_request:
     paths:
-      - .ci/docker/**
+      - '.ci/docker/common/*'
+      - '.ci/docker/manywheel/*'
+      - '.ci/docker/manywheel/build_scripts/*'
       - .github/workflows/build-manywheel-images.yml
-      - .github/actions/binary-docker-build/**

 env:
   DOCKER_REGISTRY: "docker.io"
@@ -40,34 +43,322 @@ jobs:
       curr_branch: ${{ github.head_ref || github.ref_name }}
       curr_ref_type: ${{ github.ref_type }}

-  build:
+  build-docker-cuda-manylinux_2_28:
     environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
     needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
     strategy:
-      fail-fast: false
       matrix:
-        include: [
-          { name: "manylinux2_28-builder", tag: "cuda12.8", runner: "linux.9xlarge.ephemeral" },
-          { name: "manylinux2_28-builder", tag: "cuda12.6", runner: "linux.9xlarge.ephemeral" },
-          { name: "manylinux2_28-builder", tag: "cuda12.4", runner: "linux.9xlarge.ephemeral" },
-          { name: "manylinux2_28-builder", tag: "cuda11.8", runner: "linux.9xlarge.ephemeral" },
-          { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
-          { name: "manylinux2_28-builder", tag: "rocm6.3", runner: "linux.9xlarge.ephemeral" },
-          { name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
-          { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
-          { name: "manylinuxaarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
-          { name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
-          { name: "manylinuxcxx11-abi-builder", tag: "cpu-cxx11-abi", runner: "linux.9xlarge.ephemeral" },
-          { name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" },
-        ]
-    runs-on: ${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}
-    name: ${{ matrix.name }}:${{ matrix.tag }}
+        cuda_version: ["12.8", "12.6", "12.4", "11.8"]
+    env:
+      GPU_ARCH_TYPE: cuda-manylinux_2_28
+      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
     steps:
-      - name: Build docker image
-        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
+      - name: Purge tools folder (free space for build)
+        run: rm -rf /opt/hostedtoolcache
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
         with:
-          docker-image-name: ${{ matrix.name }}
-          custom-tag-prefix: ${{ matrix.tag }}
-          docker-build-dir: manywheel
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinux2_28-builder-cuda${{matrix.cuda_version}}
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        env:
           DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
           DOCKER_ID: ${{ secrets.DOCKER_ID }}
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinux2_28-builder:cuda${{matrix.cuda_version}}
+  build-docker-cuda-aarch64:
+    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
+    strategy:
+      matrix:
+        cuda_version: ["12.8"]
+    env:
+      GPU_ARCH_TYPE: cuda-aarch64
+      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
+    steps:
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinuxaarch64-builder-cuda${{matrix.cuda_version}}
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        env:
+          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+          DOCKER_ID: ${{ secrets.DOCKER_ID }}
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cuda${{matrix.cuda_version}}
+  build-docker-rocm-manylinux_2_28:
+    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
+    strategy:
+      matrix:
+        rocm_version: ["6.2.4", "6.3"]
+    env:
+      GPU_ARCH_TYPE: rocm-manylinux_2_28
+      GPU_ARCH_VERSION: ${{ matrix.rocm_version }}
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinux2_28-builder-rocm${{matrix.rocm_version}}
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        env:
+          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+          DOCKER_ID: ${{ secrets.DOCKER_ID }}
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinux2_28-builder:rocm${{matrix.rocm_version}}
+  build-docker-cpu-manylinux_2_28:
+    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
+    env:
+      GPU_ARCH_TYPE: cpu-manylinux_2_28
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinux2_28-builder-cpu
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        env:
+          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+          DOCKER_ID: ${{ secrets.DOCKER_ID }}
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinux2_28-builder:cpu
+  build-docker-cpu-aarch64:
+    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
+    env:
+      GPU_ARCH_TYPE: cpu-aarch64
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinuxaarch64-builder-cpu-aarch64
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        env:
+          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+          DOCKER_ID: ${{ secrets.DOCKER_ID }}
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinuxaarch64-builder:cpu-aarch64
+  build-docker-cpu-aarch64-2_28:
+    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge.ephemeral"
+    env:
+      GPU_ARCH_TYPE: cpu-aarch64-2_28
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinux2_28_aarch64-builder-cpu-aarch64
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        env:
+          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+          DOCKER_ID: ${{ secrets.DOCKER_ID }}
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        env:
+          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+          DOCKER_ID: ${{ secrets.DOCKER_ID }}
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinux2_28_aarch64-builder:cpu-aarch64
+  build-docker-cpu-cxx11-abi:
+    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
+    env:
+      GPU_ARCH_TYPE: cpu-cxx11-abi
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinuxcxx11-abi-builder-cpu-cxx11-abi
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        env:
+          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+          DOCKER_ID: ${{ secrets.DOCKER_ID }}
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinuxcxx11-abi-builder:cpu-cxx11-abi
+  build-docker-xpu:
+    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
+    needs: get-label-type
+    runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
+    env:
+      GPU_ARCH_TYPE: xpu
+    steps:
+      - name: Checkout PyTorch
+        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+        with:
+          submodules: false
+      - name: Calculate docker image
+        if: env.WITH_PUSH == 'false'
+        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
+        with:
+          docker-image-name: manylinux2_28-builder-xpu
+          docker-build-dir: .ci/docker/manywheel
+          always-rebuild: true
+          push: true
+      - name: Authenticate if WITH_PUSH
+        if: env.WITH_PUSH == 'true'
+        env:
+          DOCKER_TOKEN: ${{ secrets.DOCKER_TOKEN }}
+          DOCKER_ID: ${{ secrets.DOCKER_ID }}
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        if: env.WITH_PUSH == 'true'
+        uses: nick-fields/retry@v3.0.0
+        with:
+          shell: bash
+          timeout_minutes: 90
+          max_attempts: 3
+          retry_wait_seconds: 90
+          command: |
+            .ci/docker/manywheel/build.sh manylinux2_28-builder:xpu
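
Each build step above wraps `build.sh` in `nick-fields/retry` with a 90-minute per-attempt timeout, 3 attempts, and a 90-second wait. A rough bash approximation of those semantics (this mirrors the action's inputs; it is not the action itself):

    #!/usr/bin/env bash
    # Approximate retry: max_attempts tries, each under a timeout, fixed wait between.
    set -uo pipefail

    max_attempts=3
    timeout_minutes=90
    retry_wait_seconds=90

    attempt=1
    until timeout "$(( timeout_minutes * 60 ))" \
        .ci/docker/manywheel/build.sh manylinux2_28-builder:cpu; do
      if (( attempt >= max_attempts )); then
        echo "All ${max_attempts} attempts failed" >&2
        exit 1
      fi
      echo "Attempt ${attempt} failed; retrying in ${retry_wait_seconds}s" >&2
      sleep "${retry_wait_seconds}"
      attempt=$(( attempt + 1 ))
    done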

View File

@ -54,7 +54,7 @@ jobs:
docker-image: ["pytorch/manylinux2_28-builder:cpu"] docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include: include:
- device: "rocm" - device: "rocm"
rocm_version: "6.4" rocm_version: "6.3"
runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
- device: "cuda" - device: "cuda"
rocm_version: "" rocm_version: ""

View File

@@ -79,7 +79,7 @@ jobs:
         ]
         include:
           - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11
-            runner: linux.arm64.m7g.4xlarge
+            runner: linux.arm64.2xlarge
           - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
             runner: linux.arm64.m7g.4xlarge
     timeout-minutes: 600

View File

@@ -301,6 +301,98 @@
       github-token: ${{ secrets.GITHUB_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
+  libtorch-rocm6_2_4-shared-with-deps-release-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: rocm6.2.4
+      GPU_ARCH_VERSION: 6.2.4
+      GPU_ARCH_TYPE: rocm
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: libtorch-rocm6_2_4-shared-with-deps-release
+      build_environment: linux-binary-libtorch
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  libtorch-rocm6_2_4-shared-with-deps-release-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - libtorch-rocm6_2_4-shared-with-deps-release-build
+      - get-label-type
+    runs-on: linux.rocm.gpu
+    timeout-minutes: 240
+    env:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: rocm6.2.4
+      GPU_ARCH_VERSION: 6.2.4
+      GPU_ARCH_TYPE: rocm
+      SKIP_ALL_TESTS: 1
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+    steps:
+      - name: Setup ROCm
+        uses: ./.github/actions/setup-rocm
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: libtorch-rocm6_2_4-shared-with-deps-release
+          path: "${{ runner.temp }}/artifacts/"
+      - name: Checkout PyTorch
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+          submodules: recursive
+          path: pytorch
+          show-progress: false
+      - name: Clean PyTorch checkout
+        run: |
+          # Remove any artifacts from the previous checkouts
+          git clean -fxd
+        working-directory: pytorch
+      - name: ROCm set GPU_FLAG
+        run: |
+          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
+      - name: Pull Docker image
+        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
+        with:
+          docker-image: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
+      - name: Test Pytorch binary
+        uses: ./pytorch/.github/actions/test-pytorch-binary
+      - name: Teardown ROCm
+        uses: ./.github/actions/teardown-rocm
+  libtorch-rocm6_2_4-shared-with-deps-release-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: libtorch-rocm6_2_4-shared-with-deps-release-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      # favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: rocm6.2.4
+      GPU_ARCH_VERSION: 6.2.4
+      GPU_ARCH_TYPE: rocm
+      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.2.4-main
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      build_name: libtorch-rocm6_2_4-shared-with-deps-release
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
   libtorch-rocm6_3-shared-with-deps-release-build:
     if: ${{ github.repository_owner == 'pytorch' }}
     uses: ./.github/workflows/_binary-build-linux.yml
@@ -392,95 +484,3 @@
     secrets:
       github-token: ${{ secrets.GITHUB_TOKEN }}
     uses: ./.github/workflows/_binary-upload.yml
-  libtorch-rocm6_4-shared-with-deps-release-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      # favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.4
-      GPU_ARCH_VERSION: 6.4
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.4-main
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: libtorch-rocm6_4-shared-with-deps-release
-      build_environment: linux-binary-libtorch
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  libtorch-rocm6_4-shared-with-deps-release-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - libtorch-rocm6_4-shared-with-deps-release-build
-      - get-label-type
-    runs-on: linux.rocm.gpu
-    timeout-minutes: 240
-    env:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      # favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.4
-      GPU_ARCH_VERSION: 6.4
-      GPU_ARCH_TYPE: rocm
-      SKIP_ALL_TESTS: 1
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.4-main
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-    steps:
-      - name: Setup ROCm
-        uses: ./.github/actions/setup-rocm
-      - uses: actions/download-artifact@v4.1.7
-        name: Download Build Artifacts
-        with:
-          name: libtorch-rocm6_4-shared-with-deps-release
-          path: "${{ runner.temp }}/artifacts/"
-      - name: Checkout PyTorch
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-          submodules: recursive
-          path: pytorch
-          show-progress: false
-      - name: Clean PyTorch checkout
-        run: |
-          # Remove any artifacts from the previous checkouts
-          git clean -fxd
-        working-directory: pytorch
-      - name: ROCm set GPU_FLAG
-        run: |
-          echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
-      - name: Pull Docker image
-        uses: pytorch/test-infra/.github/actions/pull-docker-image@main
-        with:
-          docker-image: pytorch/libtorch-cxx11-builder:rocm6.4-main
-      - name: Test Pytorch binary
-        uses: ./pytorch/.github/actions/test-pytorch-binary
-      - name: Teardown ROCm
-        uses: ./.github/actions/teardown-rocm
-  libtorch-rocm6_4-shared-with-deps-release-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: libtorch-rocm6_4-shared-with-deps-release-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: libtorch
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      # favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: rocm6.4
-      GPU_ARCH_VERSION: 6.4
-      GPU_ARCH_TYPE: rocm
-      DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.4-main
-      LIBTORCH_CONFIG: release
-      LIBTORCH_VARIANT: shared-with-deps
-      build_name: libtorch-rocm6_4-shared-with-deps-release
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
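
Both ROCm test jobs export `GPU_FLAG` so the test container sees the kernel devices ROCm needs. A sketch of how such a flag lands on a `docker run` invocation (the trailing command is illustrative):

    #!/usr/bin/env bash
    # Run a container with the ROCm device flags the workflow stores in GPU_FLAG.
    set -euo pipefail

    GPU_FLAG="--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon"

    # shellcheck disable=SC2086  # GPU_FLAG is intentionally split into separate flags
    docker run --rm ${GPU_FLAG} \
      pytorch/libtorch-cxx11-builder:rocm6.2.4-main \
      rocminfo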

File diff suppressed because it is too large

View File

@@ -55,7 +55,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.9"
       runs_on: linux.s390x
@@ -79,7 +79,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.9"
       build_name: manywheel-py3_9-cpu-s390x
@@ -101,7 +101,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.9"
       build_name: manywheel-py3_9-cpu-s390x
@@ -120,7 +120,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.10"
       runs_on: linux.s390x
@@ -144,7 +144,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.10"
       build_name: manywheel-py3_10-cpu-s390x
@@ -166,7 +166,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.10"
       build_name: manywheel-py3_10-cpu-s390x
@@ -185,7 +185,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.11"
       runs_on: linux.s390x
@@ -209,7 +209,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.11"
       build_name: manywheel-py3_11-cpu-s390x
@@ -231,7 +231,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.11"
       build_name: manywheel-py3_11-cpu-s390x
@@ -250,7 +250,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.12"
       runs_on: linux.s390x
@@ -274,7 +274,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.12"
       build_name: manywheel-py3_12-cpu-s390x
@@ -296,7 +296,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.12"
       build_name: manywheel-py3_12-cpu-s390x
@@ -315,7 +315,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.13"
       runs_on: linux.s390x
@@ -339,7 +339,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.13"
       build_name: manywheel-py3_13-cpu-s390x
@@ -361,7 +361,7 @@ jobs:
       # favor of GPU_ARCH_VERSION
       DESIRED_CUDA: cpu
       GPU_ARCH_TYPE: cpu-s390x
-      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x
+      DOCKER_IMAGE: pytorch/manylinuxs390x-builder:cpu-s390x-main
       use_split_build: False
       DESIRED_PYTHON: "3.13"
       build_name: manywheel-py3_13-cpu-s390x
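
All of these s390x jobs now pin `DOCKER_IMAGE` to the `-main`-suffixed tag instead of the bare `cpu-s390x` tag. If one needed to produce that tag from a locally built image, a minimal sketch (names taken from the lines above):

    # Re-tag the plain builder image with the branch-suffixed name the jobs expect.
    docker tag pytorch/manylinuxs390x-builder:cpu-s390x \
               pytorch/manylinuxs390x-builder:cpu-s390x-main
    docker push pytorch/manylinuxs390x-builder:cpu-s390x-main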

View File

@@ -1,12 +1,11 @@
 # @generated DO NOT EDIT MANUALLY
-# Template is at: .github/templates/windows_binary_build_workflow.yml.j2
+# Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: windows-arm64-binary-libtorch-debug
 on:
   push:
-    # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build
     branches:
       - nightly
     tags:
@@ -18,24 +17,18 @@ on:
   workflow_dispatch:

 env:
-  # Needed for conda builds
-  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
-  AWS_DEFAULT_REGION: us-east-1
   BUILD_ENVIRONMENT: windows-arm64-binary-libtorch-debug
   GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
   PR_NUMBER: ${{ github.event.pull_request.number }}
   SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
   SKIP_ALL_TESTS: 1
-  OS: windows-arm64
   PYTORCH_ROOT: /pytorch
   DOWNLOADS_DIR: c:\temp\downloads
   DEPENDENCIES_DIR: c:\temp\dependencies
   ENABLE_APL: 1
   ENABLE_OPENBLAS: 0
   MSVC_VERSION : 14.42
-concurrency:
-  group: windows-arm64-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
+  AWS_DEFAULT_REGION: us-east-1

 jobs:
   get-label-type:
@@ -51,7 +44,7 @@ jobs:
     if: ${{ github.repository_owner == 'pytorch' }}
     needs: get-label-type
     runs-on: "windows-11-arm64"
-    timeout-minutes: 300
+    timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -66,6 +59,9 @@ jobs:
       # without this value pip does not get installed for some reason
       DESIRED_PYTHON: "3.9"
     steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      # They are also here because setting them at a workflow level doesn't give us access to the
+      # runner.temp variable, which we need.
       - name: Populate binary env
         shell: cmd
         run: |
@@ -121,11 +117,11 @@ jobs:
       - name: Populate binary env
         shell: bash
         run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+          "pytorch/.circleci/scripts/binary_populate_env.sh"
       - name: Build PyTorch binary
         shell: bash
         run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
+          "pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
       - uses: actions/upload-artifact@v4.4.0
         if: always()
         with:
@@ -139,7 +135,7 @@ jobs:
       - libtorch-cpu-shared-with-deps-debug-build
       - get-label-type
     runs-on: "windows-11-arm64"
-    timeout-minutes: 300
+    timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -154,17 +150,25 @@ jobs:
       # without this value pip does not get installed for some reason
       DESIRED_PYTHON: "3.9"
     steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      # They are also here because setting them at a workflow level doesn't give us access to the
+      # runner.temp variable, which we need.
       - name: Populate binary env
         shell: cmd
         run: |
           echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
           echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
           echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: libtorch-cpu-shared-with-deps-debug
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Git checkout PyTorch
         uses: actions/checkout@v4
         with:
           path: "pytorch"
-      - name: Populate binary env
+      - name: Bootstrap Git
         shell: cmd
         run: |
           "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
@@ -193,19 +197,14 @@ jobs:
         shell: cmd
         run: |
           "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
-      - uses: actions/download-artifact@v4.1.7
-        name: Download Build Artifacts
-        with:
-          name: libtorch-cpu-shared-with-deps-debug
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Populate binary env
         shell: bash
         run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+          "pytorch/.circleci/scripts/binary_populate_env.sh"
       - name: Test PyTorch binary
         shell: bash
         run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
+          "pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
   libtorch-cpu-shared-with-deps-debug-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
     permissions:
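
The `Populate binary env` steps above persist `RUNNER_TEMP`-derived paths by appending `KEY=value` lines to the file `$GITHUB_ENV` points at; the runner then exports them to every later step. A bash sketch of the mechanism (note that the non-arm64 workflow's bash variant below omits the redirection on its last `echo`, so `WIN_PACKAGE_WORK_DIR` is printed rather than persisted):

    #!/usr/bin/env bash
    # Append KEY=value lines to $GITHUB_ENV; later steps see them as env vars.
    set -euo pipefail

    echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
    echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
    echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"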

View File

@@ -1,12 +1,11 @@
 # @generated DO NOT EDIT MANUALLY
-# Template is at: .github/templates/windows_binary_build_workflow.yml.j2
+# Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: windows-arm64-binary-libtorch-release
 on:
   push:
-    # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build
     branches:
       - nightly
     tags:
@@ -18,24 +17,18 @@ on:
   workflow_dispatch:

 env:
-  # Needed for conda builds
-  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
-  AWS_DEFAULT_REGION: us-east-1
   BUILD_ENVIRONMENT: windows-arm64-binary-libtorch-release
   GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
   PR_NUMBER: ${{ github.event.pull_request.number }}
   SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
   SKIP_ALL_TESTS: 1
-  OS: windows-arm64
   PYTORCH_ROOT: /pytorch
   DOWNLOADS_DIR: c:\temp\downloads
   DEPENDENCIES_DIR: c:\temp\dependencies
   ENABLE_APL: 1
   ENABLE_OPENBLAS: 0
   MSVC_VERSION : 14.42
-concurrency:
-  group: windows-arm64-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
+  AWS_DEFAULT_REGION: us-east-1

 jobs:
   get-label-type:
@@ -51,7 +44,7 @@ jobs:
     if: ${{ github.repository_owner == 'pytorch' }}
     needs: get-label-type
     runs-on: "windows-11-arm64"
-    timeout-minutes: 300
+    timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -66,6 +59,9 @@ jobs:
       # without this value pip does not get installed for some reason
       DESIRED_PYTHON: "3.9"
     steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      # They are also here because setting them at a workflow level doesn't give us access to the
+      # runner.temp variable, which we need.
       - name: Populate binary env
         shell: cmd
         run: |
@@ -121,11 +117,11 @@ jobs:
       - name: Populate binary env
         shell: bash
         run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+          "pytorch/.circleci/scripts/binary_populate_env.sh"
       - name: Build PyTorch binary
         shell: bash
         run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
+          "pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
       - uses: actions/upload-artifact@v4.4.0
         if: always()
         with:
@@ -139,7 +135,7 @@ jobs:
       - libtorch-cpu-shared-with-deps-release-build
       - get-label-type
     runs-on: "windows-11-arm64"
-    timeout-minutes: 300
+    timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: libtorch
@@ -154,17 +150,25 @@ jobs:
       # without this value pip does not get installed for some reason
       DESIRED_PYTHON: "3.9"
     steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      # They are also here because setting them at a workflow level doesn't give us access to the
+      # runner.temp variable, which we need.
       - name: Populate binary env
         shell: cmd
         run: |
           echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
           echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
           echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: libtorch-cpu-shared-with-deps-release
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Git checkout PyTorch
         uses: actions/checkout@v4
         with:
           path: "pytorch"
-      - name: Populate binary env
+      - name: Bootstrap Git
         shell: cmd
         run: |
           "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
@@ -193,19 +197,14 @@ jobs:
         shell: cmd
         run: |
           "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
-      - uses: actions/download-artifact@v4.1.7
-        name: Download Build Artifacts
-        with:
-          name: libtorch-cpu-shared-with-deps-release
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Populate binary env
         shell: bash
         run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+          "pytorch/.circleci/scripts/binary_populate_env.sh"
       - name: Test PyTorch binary
         shell: bash
         run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
+          "pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
   libtorch-cpu-shared-with-deps-release-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
     permissions:

View File

@@ -1,12 +1,11 @@
 # @generated DO NOT EDIT MANUALLY
-# Template is at: .github/templates/windows_binary_build_workflow.yml.j2
+# Template is at: .github/templates/windows_arm64_binary_build_workflow.yml.j2
 # Generation script: .github/scripts/generate_ci_workflows.py
 name: windows-arm64-binary-wheel
 on:
   push:
-    # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build
     branches:
       - nightly
     tags:
@@ -18,24 +17,18 @@ on:
   workflow_dispatch:

 env:
-  # Needed for conda builds
-  ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine"
-  AWS_DEFAULT_REGION: us-east-1
   BUILD_ENVIRONMENT: windows-arm64-binary-wheel
   GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
   PR_NUMBER: ${{ github.event.pull_request.number }}
   SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
   SKIP_ALL_TESTS: 1
-  OS: windows-arm64
   PYTORCH_ROOT: /pytorch
   DOWNLOADS_DIR: c:\temp\downloads
   DEPENDENCIES_DIR: c:\temp\dependencies
   ENABLE_APL: 1
   ENABLE_OPENBLAS: 0
   MSVC_VERSION : 14.42
-concurrency:
-  group: windows-arm64-binary-wheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
-  cancel-in-progress: true
+  AWS_DEFAULT_REGION: us-east-1

 jobs:
   get-label-type:
@@ -51,7 +44,7 @@ jobs:
     if: ${{ github.repository_owner == 'pytorch' }}
     needs: get-label-type
     runs-on: "windows-11-arm64"
-    timeout-minutes: 300
+    timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: wheel
@@ -63,6 +56,9 @@ jobs:
       DESIRED_PYTHON: "3.12"
       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.5.1.17; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.6.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.26.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
     steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      # They are also here because setting them at a workflow level doesn't give us access to the
+      # runner.temp variable, which we need.
       - name: Populate binary env
         shell: cmd
         run: |
@@ -118,11 +114,11 @@ jobs:
       - name: Populate binary env
         shell: bash
         run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+          "pytorch/.circleci/scripts/binary_populate_env.sh"
       - name: Build PyTorch binary
         shell: bash
         run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
+          "pytorch/.circleci/scripts/binary_windows_arm64_build.sh"
       - uses: actions/upload-artifact@v4.4.0
         if: always()
         with:
@@ -136,7 +132,7 @@ jobs:
       - wheel-py3_12-cpu-build
       - get-label-type
     runs-on: "windows-11-arm64"
-    timeout-minutes: 300
+    timeout-minutes: 240
     env:
       PYTORCH_ROOT: ${{ github.workspace }}/pytorch
       PACKAGE_TYPE: wheel
@@ -147,17 +143,25 @@ jobs:
       SKIP_ALL_TESTS: 1
       DESIRED_PYTHON: "3.12"
     steps:
+      # NOTE: These environment variables are put here so that they can be applied on every job equally
+      # They are also here because setting them at a workflow level doesn't give us access to the
+      # runner.temp variable, which we need.
       - name: Populate binary env
         shell: cmd
         run: |
           echo BINARY_ENV_FILE=%RUNNER_TEMP%/env>> %GITHUB_ENV%
           echo PYTORCH_FINAL_PACKAGE_DIR=%RUNNER_TEMP%/artifacts>> %GITHUB_ENV%
           echo WIN_PACKAGE_WORK_DIR=%RUNNER_TEMP%>> %GITHUB_ENV%
+      - uses: actions/download-artifact@v4.1.7
+        name: Download Build Artifacts
+        with:
+          name: wheel-py3_12-cpu
+          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Git checkout PyTorch
         uses: actions/checkout@v4
         with:
           path: "pytorch"
-      - name: Populate binary env
+      - name: Bootstrap Git
         shell: cmd
         run: |
           "pytorch/.ci/pytorch/windows/arm64/bootstrap_git.bat"
@@ -186,19 +190,14 @@ jobs:
         shell: cmd
         run: |
           "pytorch/.ci/pytorch/windows/arm64/bootstrap_rust.bat"
-      - uses: actions/download-artifact@v4.1.7
-        name: Download Build Artifacts
-        with:
-          name: wheel-py3_12-cpu
-          path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
       - name: Populate binary env
         shell: bash
         run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
+          "pytorch/.circleci/scripts/binary_populate_env.sh"
       - name: Test PyTorch binary
         shell: bash
         run: |
-          "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
+          "pytorch/.circleci/scripts/binary_windows_arm64_test.sh"
   wheel-py3_12-cpu-upload:  # Uploading
     if: ${{ github.repository_owner == 'pytorch' }}
     permissions:

View File

@@ -19,7 +19,6 @@ env:
 PR_NUMBER: ${{ github.event.pull_request.number }}
 SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
 SKIP_ALL_TESTS: 1
-OS: windows
 concurrency:
 group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
 cancel-in-progress: true
@@ -53,15 +52,6 @@ jobs:
 # without this value pip does not get installed for some reason
 DESIRED_PYTHON: "3.9"
 steps:
-# NOTE: These environment variables are put here so that they can be applied on every job equally
-# They are also here because setting them at a workflow level doesn't give us access to the
-# runner.temp variable, which we need.
-- name: Populate binary env
-shell: bash
-run: |
-echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Display EC2 information
 shell: bash
 run: |
@@ -106,6 +96,15 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+# NOTE: These environment variables are put here so that they can be applied on every job equally
+# They are also here because setting them at a workflow level doesn't give us access to the
+# runner.temp variable, which we need.
+- name: Populate binary env
+shell: bash
+run: |
+echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Checkout PyTorch
 uses: actions/checkout@v4
 with:
@@ -146,7 +145,6 @@ jobs:
 if: always()
 run: |
 .github\scripts\kill_active_ssh_sessions.ps1
 libtorch-cpu-shared-with-deps-debug-test: # Testing
 if: ${{ github.repository_owner == 'pytorch' }}
 needs:
@@ -212,18 +210,6 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-- name: Checkout PyTorch
-uses: actions/checkout@v4
-with:
-ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-submodules: recursive
-path: pytorch
-show-progress: false
-- name: Clean PyTorch checkout
-run: |
-# Remove any artifacts from the previous checkouts
-git clean -fxd
-working-directory: pytorch
 # NOTE: These environment variables are put here so that they can be applied on every job equally
 # They are also here because setting them at a workflow level doesn't give us access to the
 # runner.temp variable, which we need.
@@ -238,6 +224,18 @@ jobs:
 with:
 name: libtorch-cpu-shared-with-deps-debug
 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+- name: Checkout PyTorch
+uses: actions/checkout@v4
+with:
+ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+submodules: recursive
+path: pytorch
+show-progress: false
+- name: Clean PyTorch checkout
+run: |
+# Remove any artifacts from the previous checkouts
+git clean -fxd
+working-directory: pytorch
 - name: Populate binary env
 shell: bash
 run: |
@@ -26,7 +26,6 @@ env:
 PR_NUMBER: ${{ github.event.pull_request.number }}
 SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
 SKIP_ALL_TESTS: 1
-OS: windows
 concurrency:
 group: windows-binary-libtorch-debug-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
 cancel-in-progress: true
@@ -60,15 +59,6 @@ jobs:
 # without this value pip does not get installed for some reason
 DESIRED_PYTHON: "3.9"
 steps:
-# NOTE: These environment variables are put here so that they can be applied on every job equally
-# They are also here because setting them at a workflow level doesn't give us access to the
-# runner.temp variable, which we need.
-- name: Populate binary env
-shell: bash
-run: |
-echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Display EC2 information
 shell: bash
 run: |
@@ -113,6 +103,15 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+# NOTE: These environment variables are put here so that they can be applied on every job equally
+# They are also here because setting them at a workflow level doesn't give us access to the
+# runner.temp variable, which we need.
+- name: Populate binary env
+shell: bash
+run: |
+echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Checkout PyTorch
 uses: actions/checkout@v4
 with:
@@ -153,7 +152,6 @@ jobs:
 if: always()
 run: |
 .github\scripts\kill_active_ssh_sessions.ps1
 libtorch-cpu-shared-with-deps-debug-test: # Testing
 if: ${{ github.repository_owner == 'pytorch' }}
 needs:
@@ -219,18 +217,6 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-- name: Checkout PyTorch
-uses: actions/checkout@v4
-with:
-ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-submodules: recursive
-path: pytorch
-show-progress: false
-- name: Clean PyTorch checkout
-run: |
-# Remove any artifacts from the previous checkouts
-git clean -fxd
-working-directory: pytorch
 # NOTE: These environment variables are put here so that they can be applied on every job equally
 # They are also here because setting them at a workflow level doesn't give us access to the
 # runner.temp variable, which we need.
@@ -245,6 +231,18 @@ jobs:
 with:
 name: libtorch-cpu-shared-with-deps-debug
 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+- name: Checkout PyTorch
+uses: actions/checkout@v4
+with:
+ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+submodules: recursive
+path: pytorch
+show-progress: false
+- name: Clean PyTorch checkout
+run: |
+# Remove any artifacts from the previous checkouts
+git clean -fxd
+working-directory: pytorch
 - name: Populate binary env
 shell: bash
 run: |
@@ -308,15 +306,6 @@ jobs:
 # without this value pip does not get installed for some reason
 DESIRED_PYTHON: "3.9"
 steps:
-# NOTE: These environment variables are put here so that they can be applied on every job equally
-# They are also here because setting them at a workflow level doesn't give us access to the
-# runner.temp variable, which we need.
-- name: Populate binary env
-shell: bash
-run: |
-echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Display EC2 information
 shell: bash
 run: |
@@ -361,6 +350,15 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+# NOTE: These environment variables are put here so that they can be applied on every job equally
+# They are also here because setting them at a workflow level doesn't give us access to the
+# runner.temp variable, which we need.
+- name: Populate binary env
+shell: bash
+run: |
+echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Checkout PyTorch
 uses: actions/checkout@v4
 with:
@@ -401,7 +399,6 @@ jobs:
 if: always()
 run: |
 .github\scripts\kill_active_ssh_sessions.ps1
 libtorch-cuda11_8-shared-with-deps-debug-test: # Testing
 if: ${{ github.repository_owner == 'pytorch' }}
 needs:
@@ -468,18 +465,6 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-- name: Checkout PyTorch
-uses: actions/checkout@v4
-with:
-ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-submodules: recursive
-path: pytorch
-show-progress: false
-- name: Clean PyTorch checkout
-run: |
-# Remove any artifacts from the previous checkouts
-git clean -fxd
-working-directory: pytorch
 # NOTE: These environment variables are put here so that they can be applied on every job equally
 # They are also here because setting them at a workflow level doesn't give us access to the
 # runner.temp variable, which we need.
@@ -494,6 +479,18 @@ jobs:
 with:
 name: libtorch-cuda11_8-shared-with-deps-debug
 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+- name: Checkout PyTorch
+uses: actions/checkout@v4
+with:
+ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+submodules: recursive
+path: pytorch
+show-progress: false
+- name: Clean PyTorch checkout
+run: |
+# Remove any artifacts from the previous checkouts
+git clean -fxd
+working-directory: pytorch
 - name: Populate binary env
 shell: bash
 run: |
@@ -558,15 +555,6 @@ jobs:
 # without this value pip does not get installed for some reason
 DESIRED_PYTHON: "3.9"
 steps:
-# NOTE: These environment variables are put here so that they can be applied on every job equally
-# They are also here because setting them at a workflow level doesn't give us access to the
-# runner.temp variable, which we need.
-- name: Populate binary env
-shell: bash
-run: |
-echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Display EC2 information
 shell: bash
 run: |
@@ -611,6 +599,15 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+# NOTE: These environment variables are put here so that they can be applied on every job equally
+# They are also here because setting them at a workflow level doesn't give us access to the
+# runner.temp variable, which we need.
+- name: Populate binary env
+shell: bash
+run: |
+echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Checkout PyTorch
 uses: actions/checkout@v4
 with:
@@ -651,7 +648,6 @@ jobs:
 if: always()
 run: |
 .github\scripts\kill_active_ssh_sessions.ps1
 libtorch-cuda12_6-shared-with-deps-debug-test: # Testing
 if: ${{ github.repository_owner == 'pytorch' }}
 needs:
@@ -718,18 +714,6 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-- name: Checkout PyTorch
-uses: actions/checkout@v4
-with:
-ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-submodules: recursive
-path: pytorch
-show-progress: false
-- name: Clean PyTorch checkout
-run: |
-# Remove any artifacts from the previous checkouts
-git clean -fxd
-working-directory: pytorch
 # NOTE: These environment variables are put here so that they can be applied on every job equally
 # They are also here because setting them at a workflow level doesn't give us access to the
 # runner.temp variable, which we need.
@@ -744,6 +728,18 @@ jobs:
 with:
 name: libtorch-cuda12_6-shared-with-deps-debug
 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+- name: Checkout PyTorch
+uses: actions/checkout@v4
+with:
+ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+submodules: recursive
+path: pytorch
+show-progress: false
+- name: Clean PyTorch checkout
+run: |
+# Remove any artifacts from the previous checkouts
+git clean -fxd
+working-directory: pytorch
 - name: Populate binary env
 shell: bash
 run: |
@@ -808,15 +804,6 @@ jobs:
 # without this value pip does not get installed for some reason
 DESIRED_PYTHON: "3.9"
 steps:
-# NOTE: These environment variables are put here so that they can be applied on every job equally
-# They are also here because setting them at a workflow level doesn't give us access to the
-# runner.temp variable, which we need.
-- name: Populate binary env
-shell: bash
-run: |
-echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Display EC2 information
 shell: bash
 run: |
@@ -861,6 +848,15 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+# NOTE: These environment variables are put here so that they can be applied on every job equally
+# They are also here because setting them at a workflow level doesn't give us access to the
+# runner.temp variable, which we need.
+- name: Populate binary env
+shell: bash
+run: |
+echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Checkout PyTorch
 uses: actions/checkout@v4
 with:
@@ -901,7 +897,6 @@ jobs:
 if: always()
 run: |
 .github\scripts\kill_active_ssh_sessions.ps1
 libtorch-cuda12_8-shared-with-deps-debug-test: # Testing
 if: ${{ github.repository_owner == 'pytorch' }}
 needs:
@@ -968,18 +963,6 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-- name: Checkout PyTorch
-uses: actions/checkout@v4
-with:
-ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-submodules: recursive
-path: pytorch
-show-progress: false
-- name: Clean PyTorch checkout
-run: |
-# Remove any artifacts from the previous checkouts
-git clean -fxd
-working-directory: pytorch
 # NOTE: These environment variables are put here so that they can be applied on every job equally
 # They are also here because setting them at a workflow level doesn't give us access to the
 # runner.temp variable, which we need.
@@ -994,6 +977,18 @@ jobs:
 with:
 name: libtorch-cuda12_8-shared-with-deps-debug
 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+- name: Checkout PyTorch
+uses: actions/checkout@v4
+with:
+ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+submodules: recursive
+path: pytorch
+show-progress: false
+- name: Clean PyTorch checkout
+run: |
+# Remove any artifacts from the previous checkouts
+git clean -fxd
+working-directory: pytorch
 - name: Populate binary env
 shell: bash
 run: |
@@ -19,7 +19,6 @@ env:
 PR_NUMBER: ${{ github.event.pull_request.number }}
 SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
 SKIP_ALL_TESTS: 1
-OS: windows
 concurrency:
 group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
 cancel-in-progress: true
@@ -53,15 +52,6 @@ jobs:
 # without this value pip does not get installed for some reason
 DESIRED_PYTHON: "3.9"
 steps:
-# NOTE: These environment variables are put here so that they can be applied on every job equally
-# They are also here because setting them at a workflow level doesn't give us access to the
-# runner.temp variable, which we need.
-- name: Populate binary env
-shell: bash
-run: |
-echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Display EC2 information
 shell: bash
 run: |
@@ -106,6 +96,15 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+# NOTE: These environment variables are put here so that they can be applied on every job equally
+# They are also here because setting them at a workflow level doesn't give us access to the
+# runner.temp variable, which we need.
+- name: Populate binary env
+shell: bash
+run: |
+echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Checkout PyTorch
 uses: actions/checkout@v4
 with:
@@ -146,7 +145,6 @@ jobs:
 if: always()
 run: |
 .github\scripts\kill_active_ssh_sessions.ps1
 libtorch-cpu-shared-with-deps-release-test: # Testing
 if: ${{ github.repository_owner == 'pytorch' }}
 needs:
@@ -212,18 +210,6 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-- name: Checkout PyTorch
-uses: actions/checkout@v4
-with:
-ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-submodules: recursive
-path: pytorch
-show-progress: false
-- name: Clean PyTorch checkout
-run: |
-# Remove any artifacts from the previous checkouts
-git clean -fxd
-working-directory: pytorch
 # NOTE: These environment variables are put here so that they can be applied on every job equally
 # They are also here because setting them at a workflow level doesn't give us access to the
 # runner.temp variable, which we need.
@@ -238,6 +224,18 @@ jobs:
 with:
 name: libtorch-cpu-shared-with-deps-release
 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+- name: Checkout PyTorch
+uses: actions/checkout@v4
+with:
+ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+submodules: recursive
+path: pytorch
+show-progress: false
+- name: Clean PyTorch checkout
+run: |
+# Remove any artifacts from the previous checkouts
+git clean -fxd
+working-directory: pytorch
 - name: Populate binary env
 shell: bash
 run: |
@@ -26,7 +26,6 @@ env:
 PR_NUMBER: ${{ github.event.pull_request.number }}
 SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
 SKIP_ALL_TESTS: 1
-OS: windows
 concurrency:
 group: windows-binary-libtorch-release-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
 cancel-in-progress: true
@@ -60,15 +59,6 @@ jobs:
 # without this value pip does not get installed for some reason
 DESIRED_PYTHON: "3.9"
 steps:
-# NOTE: These environment variables are put here so that they can be applied on every job equally
-# They are also here because setting them at a workflow level doesn't give us access to the
-# runner.temp variable, which we need.
-- name: Populate binary env
-shell: bash
-run: |
-echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Display EC2 information
 shell: bash
 run: |
@@ -113,6 +103,15 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+# NOTE: These environment variables are put here so that they can be applied on every job equally
+# They are also here because setting them at a workflow level doesn't give us access to the
+# runner.temp variable, which we need.
+- name: Populate binary env
+shell: bash
+run: |
+echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Checkout PyTorch
 uses: actions/checkout@v4
 with:
@@ -153,7 +152,6 @@ jobs:
 if: always()
 run: |
 .github\scripts\kill_active_ssh_sessions.ps1
 libtorch-cpu-shared-with-deps-release-test: # Testing
 if: ${{ github.repository_owner == 'pytorch' }}
 needs:
@@ -219,18 +217,6 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-- name: Checkout PyTorch
-uses: actions/checkout@v4
-with:
-ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-submodules: recursive
-path: pytorch
-show-progress: false
-- name: Clean PyTorch checkout
-run: |
-# Remove any artifacts from the previous checkouts
-git clean -fxd
-working-directory: pytorch
 # NOTE: These environment variables are put here so that they can be applied on every job equally
 # They are also here because setting them at a workflow level doesn't give us access to the
 # runner.temp variable, which we need.
@@ -245,6 +231,18 @@ jobs:
 with:
 name: libtorch-cpu-shared-with-deps-release
 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+- name: Checkout PyTorch
+uses: actions/checkout@v4
+with:
+ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+submodules: recursive
+path: pytorch
+show-progress: false
+- name: Clean PyTorch checkout
+run: |
+# Remove any artifacts from the previous checkouts
+git clean -fxd
+working-directory: pytorch
 - name: Populate binary env
 shell: bash
 run: |
@@ -308,15 +306,6 @@ jobs:
 # without this value pip does not get installed for some reason
 DESIRED_PYTHON: "3.9"
 steps:
-# NOTE: These environment variables are put here so that they can be applied on every job equally
-# They are also here because setting them at a workflow level doesn't give us access to the
-# runner.temp variable, which we need.
-- name: Populate binary env
-shell: bash
-run: |
-echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Display EC2 information
 shell: bash
 run: |
@@ -361,6 +350,15 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+# NOTE: These environment variables are put here so that they can be applied on every job equally
+# They are also here because setting them at a workflow level doesn't give us access to the
+# runner.temp variable, which we need.
+- name: Populate binary env
+shell: bash
+run: |
+echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Checkout PyTorch
 uses: actions/checkout@v4
 with:
@@ -401,7 +399,6 @@ jobs:
 if: always()
 run: |
 .github\scripts\kill_active_ssh_sessions.ps1
 libtorch-cuda11_8-shared-with-deps-release-test: # Testing
 if: ${{ github.repository_owner == 'pytorch' }}
 needs:
@@ -468,18 +465,6 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-- name: Checkout PyTorch
-uses: actions/checkout@v4
-with:
-ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-submodules: recursive
-path: pytorch
-show-progress: false
-- name: Clean PyTorch checkout
-run: |
-# Remove any artifacts from the previous checkouts
-git clean -fxd
-working-directory: pytorch
 # NOTE: These environment variables are put here so that they can be applied on every job equally
 # They are also here because setting them at a workflow level doesn't give us access to the
 # runner.temp variable, which we need.
@@ -494,6 +479,18 @@ jobs:
 with:
 name: libtorch-cuda11_8-shared-with-deps-release
 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+- name: Checkout PyTorch
+uses: actions/checkout@v4
+with:
+ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+submodules: recursive
+path: pytorch
+show-progress: false
+- name: Clean PyTorch checkout
+run: |
+# Remove any artifacts from the previous checkouts
+git clean -fxd
+working-directory: pytorch
 - name: Populate binary env
 shell: bash
 run: |
@@ -558,15 +555,6 @@ jobs:
 # without this value pip does not get installed for some reason
 DESIRED_PYTHON: "3.9"
 steps:
-# NOTE: These environment variables are put here so that they can be applied on every job equally
-# They are also here because setting them at a workflow level doesn't give us access to the
-# runner.temp variable, which we need.
-- name: Populate binary env
-shell: bash
-run: |
-echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Display EC2 information
 shell: bash
 run: |
@@ -611,6 +599,15 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+# NOTE: These environment variables are put here so that they can be applied on every job equally
+# They are also here because setting them at a workflow level doesn't give us access to the
+# runner.temp variable, which we need.
+- name: Populate binary env
+shell: bash
+run: |
+echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Checkout PyTorch
 uses: actions/checkout@v4
 with:
@@ -651,7 +648,6 @@ jobs:
 if: always()
 run: |
 .github\scripts\kill_active_ssh_sessions.ps1
 libtorch-cuda12_6-shared-with-deps-release-test: # Testing
 if: ${{ github.repository_owner == 'pytorch' }}
 needs:
@@ -718,18 +714,6 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-- name: Checkout PyTorch
-uses: actions/checkout@v4
-with:
-ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-submodules: recursive
-path: pytorch
-show-progress: false
-- name: Clean PyTorch checkout
-run: |
-# Remove any artifacts from the previous checkouts
-git clean -fxd
-working-directory: pytorch
 # NOTE: These environment variables are put here so that they can be applied on every job equally
 # They are also here because setting them at a workflow level doesn't give us access to the
 # runner.temp variable, which we need.
@@ -744,6 +728,18 @@ jobs:
 with:
 name: libtorch-cuda12_6-shared-with-deps-release
 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+- name: Checkout PyTorch
+uses: actions/checkout@v4
+with:
+ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+submodules: recursive
+path: pytorch
+show-progress: false
+- name: Clean PyTorch checkout
+run: |
+# Remove any artifacts from the previous checkouts
+git clean -fxd
+working-directory: pytorch
 - name: Populate binary env
 shell: bash
 run: |
@@ -808,15 +804,6 @@ jobs:
 # without this value pip does not get installed for some reason
 DESIRED_PYTHON: "3.9"
 steps:
-# NOTE: These environment variables are put here so that they can be applied on every job equally
-# They are also here because setting them at a workflow level doesn't give us access to the
-# runner.temp variable, which we need.
-- name: Populate binary env
-shell: bash
-run: |
-echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
-echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
-echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Display EC2 information
 shell: bash
 run: |
@@ -861,6 +848,15 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
+# NOTE: These environment variables are put here so that they can be applied on every job equally
+# They are also here because setting them at a workflow level doesn't give us access to the
+# runner.temp variable, which we need.
+- name: Populate binary env
+shell: bash
+run: |
+echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
+echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
+echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
 - name: Checkout PyTorch
 uses: actions/checkout@v4
 with:
@@ -901,7 +897,6 @@ jobs:
 if: always()
 run: |
 .github\scripts\kill_active_ssh_sessions.ps1
 libtorch-cuda12_8-shared-with-deps-release-test: # Testing
 if: ${{ github.repository_owner == 'pytorch' }}
 needs:
@@ -968,18 +963,6 @@ jobs:
 # Let's both exclude the path and disable Windows Defender completely just to be sure
 # that it doesn't interfere
 Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
-- name: Checkout PyTorch
-uses: actions/checkout@v4
-with:
-ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-submodules: recursive
-path: pytorch
-show-progress: false
-- name: Clean PyTorch checkout
-run: |
-# Remove any artifacts from the previous checkouts
-git clean -fxd
-working-directory: pytorch
 # NOTE: These environment variables are put here so that they can be applied on every job equally
 # They are also here because setting them at a workflow level doesn't give us access to the
 # runner.temp variable, which we need.
@@ -994,6 +977,18 @@ jobs:
 with:
 name: libtorch-cuda12_8-shared-with-deps-release
 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
+- name: Checkout PyTorch
+uses: actions/checkout@v4
+with:
+ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+submodules: recursive
+path: pytorch
+show-progress: false
+- name: Clean PyTorch checkout
+run: |
+# Remove any artifacts from the previous checkouts
+git clean -fxd
+working-directory: pytorch
 - name: Populate binary env
 shell: bash
 run: |
(diff of one file suppressed because it is too large)
@@ -1,4 +1,5 @@
-name: inductor-perf-nightly-macos
+name: perf-nightly-macos
+# Technically not an inductor test, but uses it as a template for tracking macos performance
 on:
 schedule:
@@ -23,7 +24,6 @@ on:
 pull_request:
 paths:
 - .github/workflows/inductor-perf-test-nightly-macos.yml
-- .ci/pytorch/macos-test.sh
 concurrency:
 group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@@ -38,7 +38,7 @@ jobs:
 uses: ./.github/workflows/_mac-build.yml
 with:
 sync-tag: macos-perf-py3-arm64-build
-build-environment: macos-py3-arm64-distributed
+build-environment: macos-py3-arm64
 runner-type: macos-m1-stable
 build-generates-artifacts: true
 # To match the one pre-installed in the m1 runners
@@ -54,7 +54,7 @@ jobs:
 uses: ./.github/workflows/_mac-test.yml
 needs: macos-perf-py3-arm64-build
 with:
-build-environment: macos-py3-arm64-distributed
+build-environment: macos-py3-arm64
 # Same as the build job
 python-version: 3.9.12
 test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}
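Both hunks rename the same build-environment string, once in the _mac-build invocation and once in the _mac-test invocation; the "# Same as the build job" comment marks that the two are meant to stay in sync, since the test job consumes the build job's outputs. A reduced sketch of the paired jobs after the change (only fields shown in the diff; surrounding keys elided):

    macos-perf-py3-arm64-build:
      uses: ./.github/workflows/_mac-build.yml
      with:
        sync-tag: macos-perf-py3-arm64-build
        build-environment: macos-py3-arm64
    macos-perf-py3-arm64-test:
      uses: ./.github/workflows/_mac-test.yml
      needs: macos-perf-py3-arm64-build
      with:
        build-environment: macos-py3-arm64      # must match the build job
        test-matrix: ${{ needs.macos-perf-py3-arm64-build.outputs.test-matrix }}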
@@ -36,11 +36,11 @@ jobs:
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
 test-matrix: |
 { include: [
-{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-{ config: "inductor_distributed", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.12xlarge.nvidia.gpu" },
+{ config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
-{ config: "inductor_cpp_wrapper", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+{ config: "inductor_cpp_wrapper", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-{ config: "inductor_cpp_wrapper", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+{ config: "inductor_cpp_wrapper", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
 ]}
 secrets: inherit
@@ -65,8 +65,8 @@ jobs:
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
 test-matrix: |
 { include: [
-{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
 ]}
 secrets: inherit
@@ -90,7 +90,7 @@ jobs:
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
 test-matrix: |
 { include: [
-{ config: "inductor-halide", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
+{ config: "inductor-halide", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
 ]}
 secrets: inherit
@@ -114,7 +114,7 @@ jobs:
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
 test-matrix: |
 { include: [
-{ config: "inductor-triton-cpu", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
+{ config: "inductor-triton-cpu", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
 ]}
 secrets: inherit
@@ -138,10 +138,10 @@ jobs:
 runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
 test-matrix: |
 { include: [
-{ config: "inductor_amx", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+{ config: "inductor_amx", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
-{ config: "inductor_amx", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+{ config: "inductor_amx", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
-{ config: "inductor_avx2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" },
+{ config: "inductor_avx2", shard: 1, num_shards: 2, runner: "linux.10xlarge.avx2" },
-{ config: "inductor_avx2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" },
+{ config: "inductor_avx2", shard: 2, num_shards: 2, runner: "linux.10xlarge.avx2" },
 ]}
 secrets: inherit
@@ -165,8 +165,8 @@ jobs:
 cuda-arch-list: '8.6'
 test-matrix: |
 { include: [
-{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
 ]}
 secrets: inherit
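This hunk group, and the runner changes in the files below, all revert the same templating: test-matrix entries drop the ${{ needs.get-label-type.outputs.label-type }} prefix and pin the stock runner label directly. The prefixed form composes the label at runtime by concatenating the string emitted by the runner-determinator job with the base label; a hedged sketch of the two forms (the "lf." value is an illustrative example of a computed label type, not taken from this diff):

    # prefixed: effective label is <label-type> + <base label>,
    # e.g. "lf." + "linux.g5.4xlarge.nvidia.gpu" -> "lf.linux.g5.4xlarge.nvidia.gpu"
    runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu"

    # unprefixed: the job always targets the stock fleet label
    runner: "linux.g5.4xlarge.nvidia.gpu"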
@@ -53,11 +53,11 @@ jobs:
 sync-tag: linux-focal-cuda12_6-py3_10-gcc9-inductor-build
 test-matrix: |
 { include: [
-{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
-{ config: "inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+{ config: "inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-{ config: "inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+{ config: "inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-{ config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+{ config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
-{ config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+{ config: "inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
 ]}
 secrets: inherit
@@ -82,14 +82,14 @@ jobs:
 sync-tag: linux-jammy-cpu-py3_9-gcc11-inductor-build
 test-matrix: |
 { include: [
-{ config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+{ config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
-{ config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+{ config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
-{ config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+{ config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.8xlarge.amx" },
-{ config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+{ config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
-{ config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+{ config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
-{ config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+{ config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.8xlarge.amx" },
-{ config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
+{ config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.8xlarge.amx" },
-{ config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.24xl.spr-metal" },
+{ config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
 ]}
 secrets: inherit
@@ -37,13 +37,13 @@ jobs:
 runner: linux.arm64.2xlarge
 test-matrix: |
 { include: [
-{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge" },
+{ config: "default", shard: 1, num_shards: 4, runner: "linux.arm64.2xlarge" },
-{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge" },
+{ config: "default", shard: 2, num_shards: 4, runner: "linux.arm64.2xlarge" },
-{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge" },
+{ config: "default", shard: 3, num_shards: 4, runner: "linux.arm64.2xlarge" },
-{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.2xlarge" },
+{ config: "default", shard: 4, num_shards: 4, runner: "linux.arm64.2xlarge" },
-{ config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m7g.4xlarge" },
+{ config: "default", shard: 1, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
-{ config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m7g.4xlarge" },
+{ config: "default", shard: 2, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
-{ config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.arm64.m7g.4xlarge" },
+{ config: "default", shard: 3, num_shards: 3, runner: "linux.arm64.m7g.4xlarge" },
 ]}
 secrets: inherit
@@ -1,81 +0,0 @@
-name: periodic-rocm-mi300
-on:
-schedule:
-# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
-# Also run less frequently on weekends.
-- cron: 45 0,8,16 * * 1-5
-- cron: 45 4 * * 0,6
-- cron: 45 4,12,20 * * 1-5
-- cron: 45 12 * * 0,6
-- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
-push:
-tags:
-- ciflow/periodic-rocm-mi300/*
-branches:
-- release/*
-workflow_dispatch:
-concurrency:
-group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
-cancel-in-progress: true
-permissions: read-all
-jobs:
-llm-td:
-if: github.repository_owner == 'pytorch'
-name: before-test
-uses: ./.github/workflows/llm_td_retrieval.yml
-permissions:
-id-token: write
-contents: read
-target-determination:
-name: before-test
-uses: ./.github/workflows/target_determination.yml
-needs: llm-td
-permissions:
-id-token: write
-contents: read
-get-label-type:
-name: get-label-type
-uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
-if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'
-with:
-triggering_actor: ${{ github.triggering_actor }}
-issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
-curr_branch: ${{ github.head_ref || github.ref_name }}
-curr_ref_type: ${{ github.ref_type }}
-linux-focal-rocm-py3_10-build:
-name: linux-focal-rocm-py3.10
-uses: ./.github/workflows/_linux-build.yml
-needs: get-label-type
-with:
-runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-build-environment: linux-focal-rocm-py3.10
-docker-image-name: pytorch-linux-focal-rocm-n-py3
-test-matrix: |
-{ include: [
-{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
-{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
-{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.4", owners: ["module:rocm", "oncall:distributed"] },
-]}
-secrets: inherit
-linux-focal-rocm-py3_10-test:
-permissions:
-id-token: write
-contents: read
-name: linux-focal-rocm-py3.10
-uses: ./.github/workflows/_rocm-test.yml
-needs:
-- linux-focal-rocm-py3_10-build
-- target-determination
-with:
-build-environment: linux-focal-rocm-py3.10
-docker-image: ${{ needs.linux-focal-rocm-py3_10-build.outputs.docker-image }}
-test-matrix: ${{ needs.linux-focal-rocm-py3_10-build.outputs.test-matrix }}
-secrets: inherit
@@ -182,14 +182,14 @@ jobs:
 cuda-arch-list: 8.6
 test-matrix: |
 { include: [
-{ config: "default", shard: 1, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+{ config: "default", shard: 1, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
-{ config: "default", shard: 2, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+{ config: "default", shard: 2, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
-{ config: "default", shard: 3, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+{ config: "default", shard: 3, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
-{ config: "default", shard: 4, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+{ config: "default", shard: 4, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
-{ config: "default", shard: 5, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+{ config: "default", shard: 5, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
-{ config: "default", shard: 6, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+{ config: "default", shard: 6, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
-{ config: "default", shard: 7, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+{ config: "default", shard: 7, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
-{ config: "default", shard: 8, num_shards: 8, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
+{ config: "default", shard: 8, num_shards: 8, runner: "linux.g5.4xlarge.nvidia.gpu", owners: ["module:slowgradcheck"] },
 ]}
 secrets: inherit
View File

@ -184,7 +184,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml uses: ./.github/workflows/_linux-build.yml
needs: get-label-type needs: get-label-type
with: with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-focal-py3.9-clang10 build-environment: linux-focal-py3.9-clang10
docker-image-name: pytorch-linux-focal-py3.9-clang10 docker-image-name: pytorch-linux-focal-py3.9-clang10
test-matrix: | test-matrix: |
@ -385,9 +385,6 @@ jobs:
name: linux-focal-cpu-py3.10-gcc11-bazel-test name: linux-focal-cpu-py3.10-gcc11-bazel-test
uses: ./.github/workflows/_bazel-build-test.yml uses: ./.github/workflows/_bazel-build-test.yml
needs: get-label-type needs: get-label-type
permissions:
id-token: write
contents: read
with: with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.large" runner: "${{ needs.get-label-type.outputs.label-type }}linux.large"
build-environment: linux-focal-cuda12.6-py3.10-gcc11-bazel-test build-environment: linux-focal-cuda12.6-py3.10-gcc11-bazel-test

View File

@ -21,6 +21,6 @@ jobs:
uses: ./.github/workflows/_linux-build.yml uses: ./.github/workflows/_linux-build.yml
with: with:
build-environment: linux-s390x-binary-manywheel build-environment: linux-s390x-binary-manywheel
docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x-main
runner: linux.s390x runner: linux.s390x
secrets: inherit secrets: inherit

View File

@ -42,7 +42,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml uses: ./.github/workflows/_linux-build.yml
with: with:
build-environment: linux-s390x-binary-manywheel build-environment: linux-s390x-binary-manywheel
docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x-main
runner: linux.s390x runner: linux.s390x
test-matrix: | test-matrix: |
{ include: [ { include: [
@ -70,7 +70,7 @@ jobs:
- target-determination - target-determination
with: with:
build-environment: linux-s390x-binary-manywheel build-environment: linux-s390x-binary-manywheel
docker-image: pytorch/manylinuxs390x-builder:cpu-s390x docker-image: pytorch/manylinuxs390x-builder:cpu-s390x-main
test-matrix: ${{ needs.linux-manylinux-2_28-py3-cpu-s390x-build.outputs.test-matrix }} test-matrix: ${{ needs.linux-manylinux-2_28-py3-cpu-s390x-build.outputs.test-matrix }}
timeout-minutes: 600 timeout-minutes: 600
use-gha: "yes" use-gha: "yes"

View File

@ -143,9 +143,9 @@ jobs:
docker-image-name: pytorch-linux-jammy-py3-clang15-asan docker-image-name: pytorch-linux-jammy-py3-clang15-asan
test-matrix: | test-matrix: |
{ include: [ { include: [
{ config: "slow", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "slow", shard: 1, num_shards: 3, runner: "linux.4xlarge" },
{ config: "slow", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "slow", shard: 2, num_shards: 3, runner: "linux.4xlarge" },
{ config: "slow", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" }, { config: "slow", shard: 3, num_shards: 3, runner: "linux.4xlarge" },
]} ]}
sync-tag: asan-build sync-tag: asan-build
secrets: inherit secrets: inherit

View File

@ -2,7 +2,7 @@ name: Upload test stats
on: on:
workflow_run: workflow_run:
workflows: [pull, trunk, periodic, periodic-rocm-mi300, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, rocm-mi300, inductor-micro-benchmark, inductor-micro-benchmark-x86, inductor-cu124, inductor-rocm, inductor-rocm-mi300, mac-mps] workflows: [pull, trunk, periodic, inductor, unstable, slow, unstable-periodic, inductor-periodic, rocm, rocm-mi300, inductor-micro-benchmark, inductor-micro-benchmark-x86, inductor-cu124, inductor-rocm, inductor-rocm-mi300, mac-mps]
types: types:
- completed - completed

View File

@ -2,7 +2,7 @@ name: Upload torch dynamo performance stats
on: on:
workflow_run: workflow_run:
workflows: [inductor-A100-perf-nightly, inductor-perf-nightly-A10g, inductor-perf-nightly-aarch64, inductor-perf-nightly-x86, inductor-perf-nightly-macos, inductor-perf-nightly-rocm, inductor-perf-nightly-h100] workflows: [inductor-A100-perf-nightly, inductor-perf-nightly-A10g, inductor-perf-nightly-aarch64, inductor-perf-nightly-x86, perf-nightly-macos, inductor-perf-nightly-rocm, inductor-perf-nightly-h100]
types: types:
- completed - completed

.gitignore
View File

@ -178,7 +178,6 @@ compile_commands.json
*.egg-info/ *.egg-info/
docs/source/scripts/activation_images/ docs/source/scripts/activation_images/
docs/source/scripts/quantization_backend_configs/ docs/source/scripts/quantization_backend_configs/
docs/source/scripts/lr_scheduler_images/
## General ## General

View File

@ -1165,6 +1165,14 @@ exclude_patterns = [
'test/quantization/core/test_utils.py', 'test/quantization/core/test_utils.py',
'test/quantization/core/test_workflow_module.py', 'test/quantization/core/test_workflow_module.py',
'test/quantization/core/test_workflow_ops.py', 'test/quantization/core/test_workflow_ops.py',
'test/quantization/eager/__init__.py',
'test/quantization/eager/test_bias_correction_eager.py',
'test/quantization/eager/test_equalize_eager.py',
'test/quantization/eager/test_fuse_eager.py',
'test/quantization/eager/test_model_numerics.py',
'test/quantization/eager/test_numeric_suite_eager.py',
'test/quantization/eager/test_quantize_eager_ptq.py',
'test/quantization/eager/test_quantize_eager_qat.py',
'test/quantization/fx/__init__.py', 'test/quantization/fx/__init__.py',
'test/quantization/fx/test_equalize_fx.py', 'test/quantization/fx/test_equalize_fx.py',
'test/quantization/fx/test_model_report_fx.py', 'test/quantization/fx/test_model_report_fx.py',
@ -1715,7 +1723,7 @@ command = [
'@{{PATHSFILE}}' '@{{PATHSFILE}}'
] ]
include_patterns = [ include_patterns = [
'torch/_inductor/**/*.py' 'torch/**/not-exist.py'
] ]
is_formatter = false is_formatter = false

View File

@ -1,5 +1,4 @@
load("@bazel_skylib//lib:paths.bzl", "paths") load("@bazel_skylib//lib:paths.bzl", "paths")
load("@com_github_google_flatbuffers//:build_defs.bzl", "flatbuffer_cc_library")
load("@pybind11_bazel//:build_defs.bzl", "pybind_extension") load("@pybind11_bazel//:build_defs.bzl", "pybind_extension")
load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test") load("@rules_cc//cc:defs.bzl", "cc_binary", "cc_library", "cc_test")
load("@rules_python//python:defs.bzl", "py_library", "py_test") load("@rules_python//python:defs.bzl", "py_library", "py_test")
@ -660,15 +659,6 @@ cc_library(
# torch # torch
torch_cuda_headers = glob(["torch/csrc/cuda/*.h"]) torch_cuda_headers = glob(["torch/csrc/cuda/*.h"])
flatbuffer_cc_library(
name = "torch_flatbuffers",
srcs = [
"torch/csrc/jit/serialization/mobile_bytecode.fbs",
],
flatc_args = ["--cpp", "--gen-mutable", "--scoped-enums"],
out_prefix = "torch/csrc/jit/serialization/",
)
cc_library( cc_library(
name = "torch_headers", name = "torch_headers",
hdrs = if_cuda( hdrs = if_cuda(
@ -682,7 +672,6 @@ cc_library(
], ],
exclude = [ exclude = [
"torch/csrc/*/generated/*.h", "torch/csrc/*/generated/*.h",
"torch/csrc/jit/serialization/mobile_bytecode_generated.h",
] + torch_cuda_headers, ] + torch_cuda_headers,
) + GENERATED_AUTOGRAD_CPP + [":version_h"], ) + GENERATED_AUTOGRAD_CPP + [":version_h"],
includes = [ includes = [
@ -697,7 +686,6 @@ cc_library(
deps = [ deps = [
":aten_headers", ":aten_headers",
":caffe2_headers", ":caffe2_headers",
":torch_flatbuffers",
"//c10", "//c10",
"@com_github_google_flatbuffers//:flatbuffers", "@com_github_google_flatbuffers//:flatbuffers",
"@local_config_python//:python_headers", "@local_config_python//:python_headers",

View File

@ -165,9 +165,9 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd
/torch/_export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi /torch/_export/ @avikchaudhuri @tugsbayasgalan @zhxchen17 @ydwu4 @angelayi
# Dynamic Shapes # Dynamic Shapes
/torch/fx/experimental/symbolic_shapes.py @bobrenjc93 @laithsakka /torch/fx/experimental/symbolic_shapes.py @bobren @laithsakka
/torch/fx/experimental/sym_node.py @bobrenjc93 @laithsakka /torch/fx/experimental/sym_node.py @bobren @laithsakka
/torch/fx/experimental/recording.py @bobrenjc93 @laithsakka /torch/fx/experimental/recording.py @bobren @laithsakka
# serialization-related files # serialization-related files
/aten/src/ATen/MapAllocator* @mikaylagawarecki /aten/src/ATen/MapAllocator* @mikaylagawarecki

View File

@ -182,7 +182,7 @@ NestedTensorImpl::NestedTensorImpl(
"coverage, and works with torch.compile."); "coverage, and works with torch.compile.");
auto storage_device = storage_.device(); auto storage_device = storage_.device();
TORCH_INTERNAL_ASSERT( TORCH_INTERNAL_ASSERT(
storage_device.is_cpu() || storage_device.is_cuda() || storage_device.is_xpu() || storage_device.is_hpu() || storage_device.is_privateuseone(), storage_device.is_cpu() || storage_device.is_cuda() || storage_device.is_xpu() || storage_device.is_privateuseone(),
"NestedTensorImpl storage must be either CUDA, CPU, XPU or ", get_privateuse1_backend(), " but got ", "NestedTensorImpl storage must be either CUDA, CPU, XPU or ", get_privateuse1_backend(), " but got ",
storage_device); storage_device);
validate_nested_tensor_metadata(nested_sizes_, nested_strides_, storage_offsets_); validate_nested_tensor_metadata(nested_sizes_, nested_strides_, storage_offsets_);

View File

@ -29,20 +29,12 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl {
bool is_non_overlapping_and_dense = true) bool is_non_overlapping_and_dense = true)
: TensorImpl(key_set, data_type, device), : TensorImpl(key_set, data_type, device),
opaque_handle_(std::move(opaque_handle)) { opaque_handle_(std::move(opaque_handle)) {
constructor_impl(sizes, is_non_overlapping_and_dense); set_storage_access_should_throw();
} set_custom_sizes_strides(SizesStridesPolicy::CustomStrides);
sizes_and_strides_.set_sizes(sizes);
OpaqueTensorImpl( refresh_numel();
TensorImpl::ImplType impl_type, // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer)
c10::Storage&& storage, is_non_overlapping_and_dense_ = is_non_overlapping_and_dense;
at::DispatchKeySet key_set,
const caffe2::TypeMeta data_type,
OpaqueHandle opaque_handle,
c10::IntArrayRef sizes,
bool is_non_overlapping_and_dense = true)
: TensorImpl(impl_type, std::move(storage), key_set, data_type),
opaque_handle_(std::move(opaque_handle)) {
constructor_impl(sizes, is_non_overlapping_and_dense);
} }
// Destructor doesn't call release_resources because it's // Destructor doesn't call release_resources because it's
@ -189,17 +181,6 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl {
return "OpaqueTensorImpl"; return "OpaqueTensorImpl";
} }
void constructor_impl(
c10::IntArrayRef sizes,
bool is_non_overlapping_and_dense) {
set_storage_access_should_throw();
set_custom_sizes_strides(SizesStridesPolicy::CustomStrides);
sizes_and_strides_.set_sizes(sizes);
refresh_numel();
// NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer)
is_non_overlapping_and_dense_ = is_non_overlapping_and_dense;
}
OpaqueHandle opaque_handle_; OpaqueHandle opaque_handle_;
}; };
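The structural change in this hunk is whether the shared constructor body lives inline or in a private constructor_impl() helper that both constructors call. A self-contained sketch of the helper pattern, with hypothetical names standing in for the TensorImpl machinery:

#include <cstdint>
#include <utility>
#include <vector>

class OpaqueLike {
 public:
  explicit OpaqueLike(std::vector<int64_t> sizes) {
    constructor_impl(std::move(sizes), /*dense=*/true);
  }
  OpaqueLike(std::vector<int64_t> sizes, bool dense) {
    constructor_impl(std::move(sizes), dense);
  }

 private:
  // Shared initialization, analogous to constructor_impl() in the hunk above.
  void constructor_impl(std::vector<int64_t> sizes, bool dense) {
    sizes_ = std::move(sizes);
    numel_ = 1;
    for (int64_t s : sizes_) numel_ *= s;  // refresh_numel() analogue
    dense_ = dense;
  }

  std::vector<int64_t> sizes_;
  int64_t numel_ = 0;
  bool dense_ = true;
};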

View File

@ -10,13 +10,15 @@
#include <mkl.h> #include <mkl.h>
#endif #endif
#if AT_MKLDNN_ENABLED()
#include <ATen/native/mkldnn/IDeepRegistration.h>
#endif
#include <caffe2/utils/threadpool/pthreadpool-cpp.h> #include <caffe2/utils/threadpool/pthreadpool-cpp.h>
namespace at { namespace at {
#if AT_MKLDNN_ENABLED()
namespace native::mkldnn {
// NOLINTNEXTLINE(misc-use-internal-linkage)
void clear_computation_cache();
} // namespace native::mkldnn
#endif
namespace { namespace {
// Number of threads set by the user // Number of threads set by the user

View File

@ -222,8 +222,8 @@ inline Tensor applySlice(
? (*self_sizes)[dim] ? (*self_sizes)[dim]
: self.sym_size(dim); : self.sym_size(dim);
if (!disable_slice_optimization && if (!disable_slice_optimization &&
TORCH_STATICALLY_KNOWN_TRUE(start.sym_eq(0)) && TORCH_GUARD_SIZE_OBLIVIOUS(start.sym_eq(0)) &&
TORCH_STATICALLY_KNOWN_TRUE(length.sym_eq(stop)) && step == 1) { TORCH_GUARD_SIZE_OBLIVIOUS(length.sym_eq(stop)) && step == 1) {
return self; return self;
} }
} }
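Behaviorally, both macro variants gate the same no-op-slice optimization: if the slice starts at 0, covers the full extent, and has step 1, applySlice can return self unchanged; the two macros differ only in how conservatively the symbolic comparisons are resolved. A plain-integer sketch of the predicate, with the symbolic-shape machinery omitted:

#include <cstdint>

// Hedged sketch of the applySlice fast path with concrete integers; the real
// code asks the symbolic-shapes layer (via the macro this hunk swaps) whether
// start == 0 and length == stop may be assumed for the current shapes.
bool slice_is_noop(int64_t start, int64_t stop, int64_t step, int64_t length) {
  return start == 0 && length == stop && step == 1;
}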

View File

@ -1,33 +0,0 @@
#include <ATen/core/CachingHostAllocator.h>
#include <array>
namespace at {
namespace {
static std::array<HostAllocator*, at::COMPILE_TIME_MAX_DEVICE_TYPES>
allocator_array{};
static std::array<uint8_t, at::COMPILE_TIME_MAX_DEVICE_TYPES>
allocator_priority{};
} // anonymous namespace
void setHostAllocator(
at::DeviceType device_type,
at::HostAllocator* allocator,
uint8_t priority) {
if (priority >= allocator_priority[static_cast<int>(device_type)]) {
allocator_array[static_cast<int>(device_type)] = allocator;
allocator_priority[static_cast<int>(device_type)] = priority;
}
}
at::HostAllocator* getHostAllocator(at::DeviceType device_type) {
auto* allocator = allocator_array[static_cast<int>(device_type)];
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
allocator, "Host Allocator for ", device_type, " is not set.");
return allocator;
}
} // namespace at
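The deleted file is a small priority-keyed registry: a registration only replaces the current allocator when its priority is at least the one recorded for that device type. A dependency-free sketch of the same mechanism, with hypothetical names:

#include <array>
#include <cassert>
#include <cstdint>

constexpr int kMaxDeviceTypes = 8;  // stand-in for COMPILE_TIME_MAX_DEVICE_TYPES
struct Allocator {};                // stand-in for at::HostAllocator

std::array<Allocator*, kMaxDeviceTypes> g_alloc{};
std::array<uint8_t, kMaxDeviceTypes> g_prio{};

void set_allocator(int device_type, Allocator* a, uint8_t priority) {
  // An equal or higher priority replaces the current registration.
  if (priority >= g_prio[device_type]) {
    g_alloc[device_type] = a;
    g_prio[device_type] = priority;
  }
}

Allocator* get_allocator(int device_type) {
  assert(g_alloc[device_type] && "allocator not set");
  return g_alloc[device_type];
}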

View File

@ -1,5 +1,4 @@
#include <c10/core/Allocator.h> #include <c10/core/Allocator.h>
#include <c10/core/Stream.h>
#include <c10/core/thread_pool.h> #include <c10/core/thread_pool.h>
#include <c10/util/flat_hash_map.h> #include <c10/util/flat_hash_map.h>
#include <c10/util/llvmMathExtras.h> #include <c10/util/llvmMathExtras.h>
@ -47,7 +46,7 @@ namespace {
} }
// Struct containing memory allocator summary statistics for host. // Struct containing memory allocator summary statistics for host.
struct TORCH_API HostStats { struct HostStats {
// COUNT: allocations requested by client code. Note that active // COUNT: allocations requested by client code. Note that active
// count can be extracted by looking at current allocations // count can be extracted by looking at current allocations
Stat allocation; Stat allocation;
@ -275,8 +274,7 @@ struct CachingHostAllocatorImpl {
} }
} }
virtual bool record_event(void* ptr, void* ctx, c10::Stream s) { virtual bool record_event(void* ptr, void* ctx, S stream) {
S stream = S(s);
auto* block = reinterpret_cast<B*>(ctx); auto* block = reinterpret_cast<B*>(ctx);
// Note: we need to check if the passed-in `ctx` is valid. This is because // Note: we need to check if the passed-in `ctx` is valid. This is because
@ -622,49 +620,24 @@ protected:
alignas(64) HostStatsStaged stats_; alignas(64) HostStatsStaged stats_;
}; };
struct TORCH_API HostAllocator : public at::Allocator { template <typename T>
// Associates the pinned memory allocation with a stream to track struct CachingHostAllocatorInterface : public at::Allocator {
// dependencies. This ensures the memory won't be reused until the stream's
// operations complete
virtual bool record_event(void* ptr, void* ctx, c10::Stream stream) = 0;
// Frees all cached pinned memory and returns it to the system, clearing the
// allocator's internal cache
virtual void empty_cache() = 0;
// Returns comprehensive statistics about the allocator's memory usage,
// allocation patterns, and timing metrics
virtual HostStats get_stats() = 0;
// Resets the cumulative allocation statistics
virtual void reset_accumulated_stats() = 0;
// Resets the peak memory usage metrics
virtual void reset_peak_stats() = 0;
};
template <typename T, c10::DeleterFnPtr deleteFunc>
struct CachingHostAllocatorInterface : public HostAllocator {
CachingHostAllocatorInterface() : impl_(std::make_unique<T>()) {} CachingHostAllocatorInterface() : impl_(std::make_unique<T>()) {}
at::DataPtr allocate(size_t size) override { at::DataPtr allocate(size_t size) override {
auto ptr_and_ctx = impl_->allocate(size); TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for allocate");
return {
ptr_and_ctx.first,
ptr_and_ctx.second,
deleteFunc, // Use the template parameter deleter function
at::DeviceType::CPU};
} }
void free(void* ctx) { void free(void* ctx) {
impl_->free(ctx); impl_->free(ctx);
} }
bool record_event(void* ptr, void* ctx, c10::Stream stream) override { template <typename S>
bool record_event(void* ptr, void* ctx, S stream) {
return impl_->record_event(ptr, ctx, stream); return impl_->record_event(ptr, ctx, stream);
} }
void empty_cache() override { void empty_cache() {
impl_->empty_cache(); impl_->empty_cache();
} }
@ -673,54 +646,20 @@ struct CachingHostAllocatorInterface : public HostAllocator {
impl_->copy_data(dest, src, count); impl_->copy_data(dest, src, count);
} }
HostStats get_stats() override { HostStats getStats() {
return impl_->getStats(); return impl_->getStats();
} }
void reset_accumulated_stats() override { void resetAccumulatedStats() {
impl_->resetAccumulatedStats(); impl_->resetAccumulatedStats();
} }
void reset_peak_stats() override { void resetPeakStats() {
impl_->resetPeakStats(); impl_->resetPeakStats();
} }
std::unique_ptr<T> impl_; std::unique_ptr<T> impl_;
}; };
#define DECLARE_HOST_ALLOCATOR(name, impl, deleter, instance) \
void deleter(void* ptr); \
struct name final \
: public at::CachingHostAllocatorInterface<impl, deleter> {}; \
static name instance; \
void deleter(void* ptr) { \
instance.free(ptr); \
}
/**
* Set the host allocator for DeviceType `device_type`. This allocator manages
* pinned memory on the host that can be accessed efficiently by the specified
* device type. Note that this function is not thread-safe.
*/
TORCH_API void setHostAllocator(
at::DeviceType device_type,
at::HostAllocator* allocator,
uint8_t priority = 0);
TORCH_API at::HostAllocator* getHostAllocator(at::DeviceType device_type);
template <DeviceType device_type>
struct HostAllocatorRegistry {
explicit HostAllocatorRegistry(HostAllocator* allocator) {
at::setHostAllocator(device_type, allocator);
}
};
#define REGISTER_HOST_ALLOCATOR(device_type, allocator) \
namespace { \
static at::HostAllocatorRegistry<device_type> \
g_host_allocator_registry_instance(allocator); \
}
} // namespace at } // namespace at
C10_DIAGNOSTIC_POP() C10_DIAGNOSTIC_POP()
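The DECLARE_HOST_ALLOCATOR macro on one side of this hunk exists to break a cycle: the deleter function is a template argument of the interface class, yet its body needs the static instance of that class. The macro therefore forward-declares the deleter, defines the class and the instance, and only then defines the deleter. A self-contained sketch of one expansion, with stand-in types in place of the real impl:

#include <memory>

// Stand-ins so the expansion below is self-contained.
struct MyImpl {
  void free(void* /*ptr*/) {}
};
template <typename T, void (*Deleter)(void*)>
struct CachingHostAllocatorInterface {
  std::unique_ptr<T> impl_ = std::make_unique<T>();
  void free(void* ptr) { impl_->free(ptr); }
};

// What DECLARE_HOST_ALLOCATOR(MyAllocator, MyImpl, my_deleter, my_instance)
// expands to, following the macro body in this hunk.
void my_deleter(void* ptr);
struct MyAllocator final
    : public CachingHostAllocatorInterface<MyImpl, my_deleter> {};
static MyAllocator my_instance;
void my_deleter(void* ptr) { my_instance.free(ptr); }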

View File

@ -41,15 +41,9 @@ FunctionSchema FunctionSchema::cloneWithRealTypes(bool with_symint) const {
} }
}; };
std::vector<Argument> new_arguments, new_returns; std::vector<Argument> new_arguments, new_returns;
new_arguments.reserve(arguments().size()); std::transform(arguments().begin(), arguments().end(), std::back_inserter(new_arguments), cloneWithRealTypes);
for (const auto& arg: arguments()) {
new_arguments.push_back(cloneWithRealTypes(arg));
}
// NB: SymInt returns are always SymInt // NB: SymInt returns are always SymInt
new_returns.reserve(returns().size()); std::transform(returns().begin(), returns().end(), std::back_inserter(new_returns), alwaysCloneWithRealTypes);
for (const auto& ret: returns()) {
new_returns.push_back(alwaysCloneWithRealTypes(ret));
}
return FunctionSchema( return FunctionSchema(
name(), name(),
overload_name(), overload_name(),
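Both sides of this hunk perform the same element-wise mapping; the explicit-loop form just adds reserve() to pre-size the output instead of growing it through std::back_inserter. A minimal sketch of the two equivalent forms:

#include <algorithm>
#include <iterator>
#include <string>
#include <vector>

int main() {
  const std::vector<std::string> args{"int", "SymInt"};
  const auto clone = [](const std::string& a) { return a + "'"; };

  // One side of the hunk: std::transform with a back_inserter.
  std::vector<std::string> out_a;
  std::transform(args.begin(), args.end(), std::back_inserter(out_a), clone);

  // The other side: explicit loop, with reserve() to avoid reallocation.
  std::vector<std::string> out_b;
  out_b.reserve(args.size());
  for (const auto& a : args) {
    out_b.push_back(clone(a));
  }
  return out_a == out_b ? 0 : 1;  // both produce the same vector
}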

View File

@ -1,7 +1,6 @@
#include <torch/library.h> #include <torch/library.h>
#include <ATen/core/dispatch/Dispatcher.h> #include <ATen/core/dispatch/Dispatcher.h>
#include <fmt/format.h>
namespace torch { namespace torch {
@ -12,7 +11,7 @@ namespace {
#ifdef STRIP_ERROR_MESSAGES #ifdef STRIP_ERROR_MESSAGES
return std::string(); return std::string();
#else #else
return fmt::format("registered at {}:{}", file, line); return c10::str("registered at ", file, ":", line);
#endif #endif
} }
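Only the string construction changes here, fmt::format versus c10::str; the STRIP_ERROR_MESSAGES gate is untouched. A dependency-free sketch of the same helper using only the standard library:

#include <cstdint>
#include <string>

// Sketch: build a "registered at file:line" string unless error messages
// are compiled out, mirroring the gate in this hunk.
std::string debug_location(const char* file, uint32_t line) {
#ifdef STRIP_ERROR_MESSAGES
  return std::string();
#else
  return std::string("registered at ") + file + ":" + std::to_string(line);
#endif
}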

View File

@ -248,6 +248,7 @@ namespace at::cuda::blas {
CUDABLAS_NONNEGINT_CHECK(bgemm<Dtype>, num_batches); \ CUDABLAS_NONNEGINT_CHECK(bgemm<Dtype>, num_batches); \
} while (0) } while (0)
namespace { namespace {
// Following the pattern of CuSparseDescriptor // Following the pattern of CuSparseDescriptor
// Defined here for now because this is the only place cublas_lt interface is // Defined here for now because this is the only place cublas_lt interface is
@ -333,10 +334,9 @@ class CuBlasLtMatmulPreference : public CuBlasLtDescriptor<
} // namespace } // namespace
template <typename Dtype, typename C_Dtype = Dtype> template <typename Dtype>
static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
cudaDataType_t abType = CUDA_R_32F; cudaDataType_t abcType = CUDA_R_32F;
cudaDataType_t cType = CUDA_R_32F;
cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
cudaDataType_t scaleType = CUDA_R_32F; cudaDataType_t scaleType = CUDA_R_32F;
#ifndef USE_ROCM #ifndef USE_ROCM
@ -346,8 +346,7 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
void * alpha_ptr = &alpha; void * alpha_ptr = &alpha;
void * beta_ptr = &beta; void * beta_ptr = &beta;
if constexpr (std::is_same_v<Dtype, double>) { if constexpr (std::is_same_v<Dtype, double>) {
abType = CUDA_R_64F; abcType = CUDA_R_64F;
cType = CUDA_R_64F;
computeType = CUBLAS_COMPUTE_64F; computeType = CUBLAS_COMPUTE_64F;
scaleType = CUDA_R_64F; scaleType = CUDA_R_64F;
} else if constexpr (std::is_same_v<Dtype, float>) { } else if constexpr (std::is_same_v<Dtype, float>) {
@ -355,13 +354,11 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
computeType = CUBLAS_COMPUTE_32F_FAST_TF32; computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
} }
} else if constexpr (std::is_same_v<Dtype, c10::complex<double>>) { } else if constexpr (std::is_same_v<Dtype, c10::complex<double>>) {
abType = CUDA_C_64F; abcType = CUDA_C_64F;
cType = CUDA_C_64F;
computeType = CUBLAS_COMPUTE_64F; computeType = CUBLAS_COMPUTE_64F;
scaleType = CUDA_C_64F; scaleType = CUDA_C_64F;
} else if constexpr (std::is_same_v<Dtype, c10::complex<float>>) { } else if constexpr (std::is_same_v<Dtype, c10::complex<float>>) {
abType = CUDA_C_32F; abcType = CUDA_C_32F;
cType = CUDA_C_32F;
scaleType = CUDA_C_32F; scaleType = CUDA_C_32F;
} else if constexpr (std::is_same_v<Dtype, at::Half>) { } else if constexpr (std::is_same_v<Dtype, at::Half>) {
#ifndef USE_ROCM #ifndef USE_ROCM
@ -374,11 +371,9 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
beta_ptr = &hbeta; beta_ptr = &hbeta;
} }
#endif #endif
abType = CUDA_R_16F; abcType = CUDA_R_16F;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) { } else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
abType = CUDA_R_16BF; abcType = CUDA_R_16BF;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
} else { } else {
static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented"); static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublaslt: not implemented");
} }
@ -400,9 +395,9 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
at::globalContext()._SMCarveout_EXPERIMENTAL().value()); at::globalContext()._SMCarveout_EXPERIMENTAL().value());
} }
#endif #endif
CuBlasLtMatrixLayout Adesc(abType, m, k, lda, opa == CUBLAS_OP_T); CuBlasLtMatrixLayout Adesc(abcType, m, k, lda, opa == CUBLAS_OP_T);
CuBlasLtMatrixLayout Bdesc(abType, k, n, ldb, opb == CUBLAS_OP_T); CuBlasLtMatrixLayout Bdesc(abcType, k, n, ldb, opb == CUBLAS_OP_T);
CuBlasLtMatrixLayout Cdesc(cType, m, n, ldc); CuBlasLtMatrixLayout Cdesc(abcType, m, n, ldc);
if (num_batches > 1) { if (num_batches > 1) {
int num_batches_as_int = static_cast<int>(num_batches); int num_batches_as_int = static_cast<int>(num_batches);
@ -487,10 +482,8 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
ldb, ldb,
" ldc ", " ldc ",
ldc, ldc,
" abType ", " abcType ",
abType, abcType,
" cType ",
cType,
" computeType ", " computeType ",
computeType, computeType,
" scaleType ", " scaleType ",
@ -502,9 +495,9 @@ static inline bool bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(D
} }
template <typename Dtype, typename C_Dtype = Dtype> template <typename Dtype>
inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
TORCH_CHECK(false, "at::cuda::blas::bgemm: not implemented for input type ", typeid(Dtype).name(), " and output type ", typeid(C_Dtype).name()); static_assert(false && sizeof(Dtype), "at::cuda::blas::bgemm_internal_cublas: not implemented");
} }
template <> template <>
@ -563,8 +556,8 @@ void bgemm_internal_cublas<c10::complex<float>>(CUDABLAS_BGEMM_ARGTYPES(c10::com
reinterpret_cast<cuComplex*>(c), ldc, stridec, num_batches)); reinterpret_cast<cuComplex*>(c), ldc, stridec, num_batches));
} }
template <typename C_Dtype> template <>
inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) { void bgemm_internal_cublas<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)) {
// See Note [Writing Nondeterministic Operations] // See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic(); globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
@ -609,33 +602,23 @@ inline void bgemm_internal_cublas_half_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYP
handle, opa, opb, m, n, k, handle, opa, opb, m, n, k,
alpha_ptr, a, CUDA_R_16F, lda, stridea, alpha_ptr, a, CUDA_R_16F, lda, stridea,
b, CUDA_R_16F, ldb, strideb, beta_ptr, b, CUDA_R_16F, ldb, strideb, beta_ptr,
c, std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16F, ldc, stridec, c, CUDA_R_16F, ldc, stridec,
num_batches, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); num_batches, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} else { } else {
for (const auto i : c10::irange(num_batches)) { for (const auto i : c10::irange(num_batches)) {
if (std::is_same_v<C_Dtype, float>) { at::cuda::blas::gemm<at::Half>(
float* c_ptr = (float*)(c + i * stridec); transa, transb,
at::cuda::blas::gemm<at::Half, float>( m, n, k,
transa, transb, alpha, (a + i * stridea), lda,
m, n, k, (b + i * strideb), ldb, beta,
alpha, (a + i * stridea), lda, (c + i * stridec), ldc);
(b + i * strideb), ldb, beta,
c_ptr, ldc);
} else {
at::cuda::blas::gemm<at::Half>(
transa, transb,
m, n, k,
alpha, (a + i * stridea), lda,
(b + i * strideb), ldb, beta,
(c + i * stridec), ldc);
}
} }
} }
#endif // USE_ROCM #endif // USE_ROCM
} }
template <typename C_Dtype> template <>
inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) { void bgemm_internal_cublas<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
// See Note [Writing Nondeterministic Operations] // See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic(); globalContext().alertCuBLASConfigNotDeterministic();
BGEMM_CHECK_ARGVALUES(at::BFloat16); BGEMM_CHECK_ARGVALUES(at::BFloat16);
@ -652,37 +635,15 @@ inline void bgemm_internal_cublas_bfloat16_helper(CUDABLAS_BGEMM_ARGTYPES_AND_C_
auto compute_type = CUDA_R_32F; auto compute_type = CUDA_R_32F;
#endif #endif
TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedEx(handle, TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedEx(handle,
opa, opb, (int)m, (int)n, (int)k, opa, opb, (int)m, (int)n, (int)k,
(void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea, (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea,
b, CUDA_R_16BF, (int)ldb, strideb, b, CUDA_R_16BF, (int)ldb, strideb,
(void*)&fbeta, c, std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16BF, (void*)&fbeta, c, CUDA_R_16BF, (int)ldc, stridec,
(int)ldc, stridec, (int)num_batches, (int)num_batches,
compute_type, compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
} }
template <>
void bgemm_internal_cublas<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)) {
bgemm_internal_cublas_half_helper<at::Half>(CUDABLAS_BGEMM_ARGS(at::Half));
}
template <>
void bgemm_internal_cublas<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) {
bgemm_internal_cublas_half_helper<float>(CUDABLAS_BGEMM_ARGS(at::Half));
}
template <>
void bgemm_internal_cublas<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
bgemm_internal_cublas_bfloat16_helper<at::BFloat16>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
}
template <>
void bgemm_internal_cublas<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) {
bgemm_internal_cublas_bfloat16_helper<float>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
}
template <> template <>
void bgemm_internal<double>(CUDABLAS_BGEMM_ARGTYPES(double)) void bgemm_internal<double>(CUDABLAS_BGEMM_ARGTYPES(double))
{ {
@ -781,50 +742,9 @@ void bgemm_internal<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16))
} }
} }
template<> template <typename DType>
void bgemm_internal<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES(DType)) {
{ tunable::GemmStridedBatchedParams<DType> params;
if (at::globalContext().allowFP16AccumulationCuBLAS()) {
// Do not allow fp16 reductions with fp32 output
TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS");
}
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
if (!bgemm_internal_cublaslt<at::Half, float>(CUDABLAS_BGEMM_ARGS(at::Half))) {
bgemm_internal_cublas<at::Half, float>(CUDABLAS_BGEMM_ARGS(at::Half));
}
}
#if defined(USE_ROCM) && !defined(_MSC_VER)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");
}
#endif
else {
bgemm_internal_cublas<at::Half, float>(CUDABLAS_BGEMM_ARGS(at::Half));
}
}
template<>
void bgemm_internal<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float))
{
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
if (!bgemm_internal_cublaslt<at::BFloat16, float>(CUDABLAS_BGEMM_ARGS(at::BFloat16))) {
bgemm_internal_cublas<at::BFloat16, float>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
}
}
#if defined(USE_ROCM) && !defined(_MSC_VER)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm");
}
#endif
else {
bgemm_internal_cublas<at::BFloat16, float>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
}
}
template <typename Dtype, typename C_Dtype = Dtype>
inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
tunable::GemmStridedBatchedParams<Dtype> params;
params.transa = transa; params.transa = transa;
params.transb = transb; params.transb = transb;
params.m = m; params.m = m;
@ -847,19 +767,19 @@ inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
bool transb_ = ((transb != 'n') && (transb != 'N')); bool transb_ = ((transb != 'n') && (transb != 'N'));
if (transa_ && transb_) { if (transa_ && transb_) {
static tunable::GemmStridedBatchedTunableOp<Dtype, tunable::BlasOp::T, tunable::BlasOp::T> bgemm{}; static tunable::GemmStridedBatchedTunableOp<DType, tunable::BlasOp::T, tunable::BlasOp::T> bgemm{};
bgemm(&params); bgemm(&params);
} }
else if (transa_ && !transb_) { else if (transa_ && !transb_) {
static tunable::GemmStridedBatchedTunableOp<Dtype, tunable::BlasOp::T, tunable::BlasOp::N> bgemm{}; static tunable::GemmStridedBatchedTunableOp<DType, tunable::BlasOp::T, tunable::BlasOp::N> bgemm{};
bgemm(&params); bgemm(&params);
} }
else if (!transa_ && transb_) { else if (!transa_ && transb_) {
static tunable::GemmStridedBatchedTunableOp<Dtype, tunable::BlasOp::N, tunable::BlasOp::T> bgemm{}; static tunable::GemmStridedBatchedTunableOp<DType, tunable::BlasOp::N, tunable::BlasOp::T> bgemm{};
bgemm(&params); bgemm(&params);
} }
else if (!transa_ && !transb_) { else if (!transa_ && !transb_) {
static tunable::GemmStridedBatchedTunableOp<Dtype, tunable::BlasOp::N, tunable::BlasOp::N> bgemm{}; static tunable::GemmStridedBatchedTunableOp<DType, tunable::BlasOp::N, tunable::BlasOp::N> bgemm{};
bgemm(&params); bgemm(&params);
} }
else { else {
@ -933,35 +853,9 @@ void bgemm<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) {
} }
} }
template <> template <typename Dtype>
void bgemm<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) { inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
#ifdef USE_ROCM static_assert(false && sizeof(Dtype), "at::cuda::blas::gemm_internal_cublas: not implemented");
TORCH_CHECK(false, "bgemm input type at::Half and output type float is not supported for ROCm");
#endif
// TODO: Support tuning for Half inputs and FP32 output
bgemm_internal<at::Half, float>(CUDABLAS_BGEMM_ARGS(at::Half));
}
template <>
void bgemm<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) {
#ifdef USE_ROCM
TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is not supported for ROCm");
#else
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
if (prop->major < 8)
TORCH_CHECK(false, "bgemm input type at::BFloat16 and output type float is only supported for CUDA devices with compute capability 8.0 or higher");
#endif
// TODO: Support tuning for BFloat16 inputs and FP32 output
bgemm_internal<at::BFloat16, float>(CUDABLAS_BGEMM_ARGS(at::BFloat16));
}
template <typename Dtype, typename C_Dtype = Dtype>
inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
TORCH_CHECK(false, "at::cuda::blas::gemm: not implemented for input type ", typeid(Dtype).name(), " and output type ", typeid(C_Dtype).name());
} }
template <> template <>
@ -1020,8 +914,8 @@ void gemm_internal_cublas<c10::complex<float>>(CUDABLAS_GEMM_ARGTYPES(c10::compl
reinterpret_cast<cuComplex*>(c), ldc)); reinterpret_cast<cuComplex*>(c), ldc));
} }
template <typename C_Dtype> template <>
inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, C_Dtype)) { void gemm_internal_cublas<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) {
// See Note [Writing Nondeterministic Operations] // See Note [Writing Nondeterministic Operations]
globalContext().alertCuBLASConfigNotDeterministic(); globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
@ -1100,7 +994,7 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
ldb, ldb,
beta_ptr, beta_ptr,
c, c,
std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16F, CUDA_R_16F,
ldc, ldc,
compute_type, compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
@ -1122,14 +1016,14 @@ inline void gemm_internal_cublas_half_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(
ldb, ldb,
&fbeta, &fbeta,
c, c,
std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16F, CUDA_R_16F,
ldc)); ldc));
} }
#endif #endif
} }
template <typename C_Dtype> template <>
inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, C_Dtype)) { void gemm_internal_cublas<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
globalContext().alertCuBLASConfigNotDeterministic(); globalContext().alertCuBLASConfigNotDeterministic();
cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
cublasOperation_t opa = _cublasOpFromChar(transa); cublasOperation_t opa = _cublasOpFromChar(transa);
@ -1166,35 +1060,15 @@ inline void gemm_internal_cublas_bfloat16_helper(CUDABLAS_GEMM_ARGTYPES_AND_C_DT
ldb, ldb,
&fbeta, &fbeta,
c, c,
std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16BF, CUDA_R_16BF,
ldc, ldc,
compute_type, compute_type,
CUBLAS_GEMM_DEFAULT_TENSOR_OP)); CUBLAS_GEMM_DEFAULT_TENSOR_OP));
TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH));
} }
template <> template <typename Dtype>
void gemm_internal_cublas<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)) { inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
gemm_internal_cublas_half_helper<at::Half>(CUDABLAS_GEMM_ARGS(at::Half));
}
template <>
void gemm_internal_cublas<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) {
gemm_internal_cublas_half_helper<float>(CUDABLAS_GEMM_ARGS(at::Half));
}
template <>
void gemm_internal_cublas<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
gemm_internal_cublas_bfloat16_helper<at::BFloat16>(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
template <>
void gemm_internal_cublas<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) {
gemm_internal_cublas_bfloat16_helper<float>(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
template <typename Dtype, typename C_Dtype = Dtype>
inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
// forward to bgemm implementation but set strides and batches to 0 // forward to bgemm implementation but set strides and batches to 0
if (!bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0)) { if (!bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0)) {
gemm_internal_cublas(CUDABLAS_GEMM_ARGS(Dtype)); gemm_internal_cublas(CUDABLAS_GEMM_ARGS(Dtype));
@ -1306,45 +1180,8 @@ void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16))
} }
} }
template<> template <typename DType>
void gemm_internal<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) inline void gemm_tunable(CUDABLAS_GEMM_ARGTYPES(DType)) {
{
if (at::globalContext().allowFP16AccumulationCuBLAS()) {
// Do not allow fp16 reductions with fp32 output
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS");
}
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
gemm_internal_cublaslt<at::Half, float>(CUDABLAS_GEMM_ARGS(at::Half));
}
#if defined(USE_ROCM) && !defined(_MSC_VER)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");
}
#endif
else {
gemm_internal_cublas<at::Half, float>(CUDABLAS_GEMM_ARGS(at::Half));
}
}
template<>
void gemm_internal<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float))
{
if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) {
gemm_internal_cublaslt<at::BFloat16, float>(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
#if defined(USE_ROCM) && !defined(_MSC_VER)
else if (at::globalContext().blasPreferredBackend() == BlasBackend::Ck) {
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");
}
#endif
else {
gemm_internal_cublas<at::BFloat16, float>(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
}
template <typename DType, typename C_Dtype>
inline void gemm_tunable(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(DType, C_Dtype)) {
tunable::GemmParams<DType> params; tunable::GemmParams<DType> params;
params.transa = transa; params.transa = transa;
params.transb = transb; params.transb = transb;
@ -1450,32 +1287,8 @@ void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) {
} }
} }
template <>
void gemm<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float)) {
#ifdef USE_ROCM
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");
#endif
// TODO: Support Tuning for fp16-fp32 gemm
gemm_internal<at::Half, float>(CUDABLAS_GEMM_ARGS(at::Half));
}
template <typename Dtype>
template <>
void gemm<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float)) {
#ifdef USE_ROCM
TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm");
#else
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
if (prop->major < 8)
TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is only supported for CUDA devices with compute capability 8.0 or higher");
#endif
// TODO: Support Tuning for bf16-fp32 gemm
gemm_internal<at::BFloat16, float>(CUDABLAS_GEMM_ARGS(at::BFloat16));
}
template <typename Dtype, typename C_Dtype>
bool gemm_and_bias( bool gemm_and_bias(
bool transpose_mat1, bool transpose_mat1,
bool transpose_mat2, bool transpose_mat2,
@ -1488,27 +1301,13 @@ bool gemm_and_bias(
const Dtype* mat2_ptr, const Dtype* mat2_ptr,
int64_t mat2_ld, int64_t mat2_ld,
const Dtype* bias, const Dtype* bias,
C_Dtype* result_ptr, Dtype* result_ptr,
int64_t result_ld, int64_t result_ld,
GEMMAndBiasActivationEpilogue activation) { GEMMAndBiasActivationEpilogue activation) {
if (std::is_same_v<C_Dtype, float> && std::is_same_v<Dtype, at::BFloat16>) {
#ifdef USE_ROCM
TORCH_CHECK(false, "gemm input type at::BFloat16 and output type float is not supported for ROCm");
#endif
} else if (std::is_same_v<C_Dtype, float> && std::is_same_v<Dtype, at::Half>) {
#ifdef USE_ROCM
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported for ROCm");
#endif
if (at::globalContext().allowFP16AccumulationCuBLAS())
TORCH_CHECK(false, "gemm input type at::Half and output type float is not supported with allowFP16AccumulationCuBLAS");
}
using opmath_t = at::opmath_type<Dtype>; using opmath_t = at::opmath_type<Dtype>;
opmath_t beta_val = 0; // bias is added in epilogue opmath_t beta_val = 0; // bias is added in epilogue
cudaDataType_t abType = CUDA_R_32F; cudaDataType_t abcType = CUDA_R_32F;
cudaDataType_t cType = CUDA_R_32F;
cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; cublasComputeType_t computeType = CUBLAS_COMPUTE_32F;
cudaDataType_t scaleType = CUDA_R_32F; cudaDataType_t scaleType = CUDA_R_32F;
void * alpha_ptr = &alpha_val; void * alpha_ptr = &alpha_val;
@ -1518,14 +1317,14 @@ bool gemm_and_bias(
at::Half hbeta_val; at::Half hbeta_val;
#endif #endif
if constexpr (std::is_same_v<Dtype, double>) { if constexpr (std::is_same_v<Dtype, double>) {
abType = CUDA_R_64F; abcType = CUDA_R_64F;
cType = CUDA_R_64F;
computeType = CUBLAS_COMPUTE_64F; computeType = CUBLAS_COMPUTE_64F;
scaleType = CUDA_R_64F; scaleType = CUDA_R_64F;
} else if constexpr (std::is_same_v<Dtype, float>) { } else if constexpr (std::is_same_v<Dtype, float>) {
if (at::globalContext().allowTF32CuBLAS()) { if (at::globalContext().allowTF32CuBLAS()) {
computeType = CUBLAS_COMPUTE_32F_FAST_TF32; computeType = CUBLAS_COMPUTE_32F_FAST_TF32;
} }
abcType = CUDA_R_32F;
} else if constexpr (std::is_same_v<Dtype, at::Half>) { } else if constexpr (std::is_same_v<Dtype, at::Half>) {
#ifndef USE_ROCM #ifndef USE_ROCM
cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties();
@ -1538,11 +1337,9 @@ bool gemm_and_bias(
beta_ptr = &hbeta_val; beta_ptr = &hbeta_val;
} }
#endif #endif
abType = CUDA_R_16F; abcType = CUDA_R_16F;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16F;
} else if constexpr (std::is_same_v<Dtype, at::BFloat16>) { } else if constexpr (std::is_same_v<Dtype, at::BFloat16>) {
abType = CUDA_R_16BF; abcType = CUDA_R_16BF;
cType = (std::is_same_v<C_Dtype, float>) ? CUDA_R_32F : CUDA_R_16BF;
} }
CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType);
@ -1572,9 +1369,9 @@ bool gemm_and_bias(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias);
} }
CuBlasLtMatrixLayout Adesc(abType, m, k, mat1_ld, transpose_mat1); CuBlasLtMatrixLayout Adesc(abcType, m, k, mat1_ld, transpose_mat1);
CuBlasLtMatrixLayout Bdesc(abType, k, n, mat2_ld, transpose_mat2); CuBlasLtMatrixLayout Bdesc(abcType, k, n, mat2_ld, transpose_mat2);
CuBlasLtMatrixLayout Cdesc(cType, m, n, result_ld); CuBlasLtMatrixLayout Cdesc(abcType, m, n, result_ld);
CuBlasLtMatmulPreference preference; CuBlasLtMatmulPreference preference;
// See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind
@ -1652,10 +1449,8 @@ bool gemm_and_bias(
mat2_ld, mat2_ld,
" result_ld ", " result_ld ",
result_ld, result_ld,
" abType ", " abcType ",
abType, abcType,
" cType ",
cType,
" computeType ", " computeType ",
computeType, computeType,
" scaleType ", " scaleType ",
@ -1714,22 +1509,6 @@ template bool gemm_and_bias(
int64_t result_ld, int64_t result_ld,
GEMMAndBiasActivationEpilogue activation); GEMMAndBiasActivationEpilogue activation);
template bool gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<at::Half> alpha_val,
const at::Half* mat1_ptr,
int64_t mat1_ld,
const at::Half* mat2_ptr,
int64_t mat2_ld,
const at::Half* bias,
float* result_ptr,
int64_t result_ld,
GEMMAndBiasActivationEpilogue activation);
template bool gemm_and_bias( template bool gemm_and_bias(
bool transpose_mat1, bool transpose_mat1,
bool transpose_mat2, bool transpose_mat2,
@ -1746,22 +1525,6 @@ template bool gemm_and_bias(
int64_t result_ld, int64_t result_ld,
GEMMAndBiasActivationEpilogue activation); GEMMAndBiasActivationEpilogue activation);
template bool gemm_and_bias(
bool transpose_mat1,
bool transpose_mat2,
int64_t m,
int64_t n,
int64_t k,
at::opmath_type<at::BFloat16> alpha_val,
const at::BFloat16* mat1_ptr,
int64_t mat1_ld,
const at::BFloat16* mat2_ptr,
int64_t mat2_ld,
const at::BFloat16* bias,
float* result_ptr,
int64_t result_ld,
GEMMAndBiasActivationEpilogue activation);
void scaled_gemm( void scaled_gemm(
char transa, char transa,
char transb, char transb,
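The edit repeated throughout this file splits the single abcType descriptor into independent abType/cType values, so at::Half or at::BFloat16 inputs can accumulate into a float output. A hedged sketch of just that dtype-selection step (CUDA toolkit headers assumed; the template default mirrors the hunk):

#include <library_types.h>  // cudaDataType_t, CUDA_R_* (CUDA toolkit)
#include <type_traits>

// Inputs pick their own CUDA dtype; the output dtype widens to fp32
// whenever C_Dtype == float, as in the bgemm/gemm paths above.
template <typename Dtype, typename C_Dtype = Dtype>
void pick_dtypes(cudaDataType_t& abType, cudaDataType_t& cType) {
  if constexpr (std::is_same_v<Dtype, double>) {
    abType = CUDA_R_64F;
    cType = CUDA_R_64F;
  } else if constexpr (std::is_same_v<Dtype, float>) {
    abType = CUDA_R_32F;
    cType = CUDA_R_32F;
  } else {
    // Half-precision stand-in; bf16 inputs would use CUDA_R_16BF instead.
    abType = CUDA_R_16F;
    cType = std::is_same_v<C_Dtype, float> ? CUDA_R_32F : abType;
  }
}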

View File

@ -39,26 +39,18 @@ private:
/* LEVEL 3 BLAS FUNCTIONS */ /* LEVEL 3 BLAS FUNCTIONS */
#define CUDABLAS_GEMM_ARGTYPES(Dtype) CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype) #define CUDABLAS_GEMM_ARGTYPES(Dtype) \
#define CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype) \
char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type<Dtype> alpha, \ char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type<Dtype> alpha, \
const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, at::opmath_type<Dtype> beta,\ const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, at::opmath_type<Dtype> beta,\
C_Dtype *c, int64_t ldc Dtype *c, int64_t ldc
#define CUDABLAS_GEMM_ARGS(Dtype) transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc #define CUDABLAS_GEMM_ARGS(Dtype) transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc
#define CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT \ template <typename Dtype>
((std::is_same<Dtype, at::Half>::value || std::is_same<Dtype, at::BFloat16>::value) && std::is_same<C_Dtype, float>::value) inline void gemm(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
template <typename Dtype, typename C_Dtype = Dtype, typename std::enable_if<!CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT, Dtype>::type* = nullptr>
inline void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) {
static_assert(false&&sizeof(Dtype),"at::cuda::blas::gemm: not implemented"); static_assert(false&&sizeof(Dtype),"at::cuda::blas::gemm: not implemented");
} }
template <typename Dtype, typename C_Dtype, typename std::enable_if<CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT, Dtype>::type* = nullptr>
void gemm(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype));
template <> template <>
void gemm<double>(CUDABLAS_GEMM_ARGTYPES(double)); void gemm<double>(CUDABLAS_GEMM_ARGTYPES(double));
template <> template <>
@ -71,13 +63,9 @@ template <>
void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)); void gemm<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
template <> template <>
void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); void gemm<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
template<>
void gemm<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float));
template<>
void gemm<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float));
template <typename Dtype, typename C_Dtype = Dtype> template <typename Dtype>
inline void gemm_internal(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { inline void gemm_internal(CUDABLAS_GEMM_ARGTYPES(Dtype)) {
static_assert(false&&sizeof(Dtype),"at::cuda::blas::gemm_internal: not implemented"); static_assert(false&&sizeof(Dtype),"at::cuda::blas::gemm_internal: not implemented");
} }
@ -93,10 +81,6 @@ template <>
void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half)); void gemm_internal<at::Half>(CUDABLAS_GEMM_ARGTYPES(at::Half));
template <> template <>
void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); void gemm_internal<at::BFloat16>(CUDABLAS_GEMM_ARGTYPES(at::BFloat16));
template<>
void gemm_internal<at::Half, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::Half, float));
template<>
void gemm_internal<at::BFloat16, float>(CUDABLAS_GEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float));
enum GEMMAndBiasActivationEpilogue { enum GEMMAndBiasActivationEpilogue {
None, None,
@ -106,7 +90,7 @@ enum GEMMAndBiasActivationEpilogue {
// NOTE: GELU activation is not supported prior to CUDA 11.4 and will // NOTE: GELU activation is not supported prior to CUDA 11.4 and will
// do nothing if passed in that case. // do nothing if passed in that case.
template <typename Dtype, typename C_Dtype = Dtype> template <typename Dtype>
bool gemm_and_bias( bool gemm_and_bias(
bool transpose_mat1, bool transpose_mat1,
bool transpose_mat2, bool transpose_mat2,
@ -119,7 +103,7 @@ bool gemm_and_bias(
const Dtype* mat2_ptr, const Dtype* mat2_ptr,
int64_t mat2_ld, int64_t mat2_ld,
const Dtype* bias, const Dtype* bias,
C_Dtype* result_ptr, Dtype* result_ptr,
int64_t result_ld, int64_t result_ld,
GEMMAndBiasActivationEpilogue activation = GEMMAndBiasActivationEpilogue::None); GEMMAndBiasActivationEpilogue activation = GEMMAndBiasActivationEpilogue::None);
@ -161,25 +145,20 @@ void scaled_gemm(
bool use_fast_accum, bool use_fast_accum,
bool use_rowwise); bool use_rowwise);
#define CUDABLAS_BGEMM_ARGTYPES(Dtype) CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, Dtype) #define CUDABLAS_BGEMM_ARGTYPES(Dtype) \
#define CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype) \
char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type<Dtype> alpha, \ char transa, char transb, int64_t m, int64_t n, int64_t k, at::opmath_type<Dtype> alpha, \
const Dtype *a, int64_t lda, int64_t stridea, \ const Dtype *a, int64_t lda, int64_t stridea, \
const Dtype *b, int64_t ldb, int64_t strideb, \ const Dtype *b, int64_t ldb, int64_t strideb, \
at::opmath_type<Dtype> beta, C_Dtype *c, int64_t ldc, int64_t stridec, int64_t num_batches at::opmath_type<Dtype> beta, Dtype *c, int64_t ldc, int64_t stridec, int64_t num_batches
#define CUDABLAS_BGEMM_ARGS(Dtype) \ #define CUDABLAS_BGEMM_ARGS(Dtype) \
transa, transb, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, beta, c, ldc, stridec, num_batches transa, transb, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, beta, c, ldc, stridec, num_batches
template <typename Dtype, typename C_Dtype = Dtype, typename std::enable_if<!CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT, Dtype>::type* = nullptr> template <typename Dtype>
inline void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { inline void bgemm(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
static_assert(false&&sizeof(Dtype),"at::cuda::blas::bgemm: not implemented"); static_assert(false&&sizeof(Dtype),"at::cuda::blas::bgemm: not implemented");
} }
template <typename Dtype, typename C_Dtype, typename std::enable_if<CUDABLAS_GEMM_DTYPE_IS_FLOAT_TYPE_AND_C_DTYPE_IS_FLOAT, Dtype>::type* = nullptr>
void bgemm(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype));
template <> template <>
void bgemm<double>(CUDABLAS_BGEMM_ARGTYPES(double)); void bgemm<double>(CUDABLAS_BGEMM_ARGTYPES(double));
template <> template <>
@ -192,13 +171,9 @@ template <>
void bgemm<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)); void bgemm<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half));
template <> template <>
void bgemm<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); void bgemm<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16));
template<>
void bgemm<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float));
template<>
void bgemm<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float));
template <typename Dtype, typename C_Dtype = Dtype> template <typename Dtype>
inline void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(Dtype, C_Dtype)) { inline void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(Dtype)) {
static_assert(false&&sizeof(Dtype),"at::cuda::blas::bgemm_internal: not implemented"); static_assert(false&&sizeof(Dtype),"at::cuda::blas::bgemm_internal: not implemented");
} }
@ -214,10 +189,6 @@ template <>
void bgemm_internal<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half)); void bgemm_internal<at::Half>(CUDABLAS_BGEMM_ARGTYPES(at::Half));
template <> template <>
void bgemm_internal<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); void bgemm_internal<at::BFloat16>(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16));
template<>
void bgemm_internal<at::Half, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::Half, float));
template<>
void bgemm_internal<at::BFloat16, float>(CUDABLAS_BGEMM_ARGTYPES_AND_C_DTYPE(at::BFloat16, float));
#define CUDABLAS_TRSM_ARGTYPES(Dtype) \ #define CUDABLAS_TRSM_ARGTYPES(Dtype) \
cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, \ cublasHandle_t handle, cublasSideMode_t side, cublasFillMode_t uplo, \
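The header-side counterpart of the C_Dtype change is the SFINAE split: the primary gemm template keeps its static_assert fallback, while a second overload is only viable when a half-precision input pairs with a float output. A simplified, self-contained sketch of that gating, with signatures reduced to the relevant pointers:

#include <type_traits>

struct Half {};      // stand-ins for at::Half / at::BFloat16
struct BFloat16 {};

template <typename Dtype, typename C_Dtype>
constexpr bool kMixedFloatOut =
    (std::is_same_v<Dtype, Half> || std::is_same_v<Dtype, BFloat16>) &&
    std::is_same_v<C_Dtype, float>;

// Fallback: fires only on instantiation for unsupported pairs, like the
// dependent static_assert pattern in this header.
template <typename Dtype, typename C_Dtype = Dtype,
          std::enable_if_t<!kMixedFloatOut<Dtype, C_Dtype>, int> = 0>
void gemm(const Dtype*, C_Dtype*) {
  static_assert(false && sizeof(Dtype), "gemm: not implemented");
}

// Declared-only overload for the supported mixed pairs; the definitions
// live in the .cpp, as with the explicit specializations above.
template <typename Dtype, typename C_Dtype,
          std::enable_if_t<kMixedFloatOut<Dtype, C_Dtype>, int> = 0>
void gemm(const Dtype*, C_Dtype*);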

View File

@@ -249,13 +249,58 @@ struct CUDACachingHostAllocatorImpl
   }
 };
-DECLARE_HOST_ALLOCATOR(
-    CUDACachingHostAllocator,
-    CUDACachingHostAllocatorImpl,
-    raw_local_deleter,
-    caching_host_allocator);
-REGISTER_HOST_ALLOCATOR(at::kCUDA, &caching_host_allocator)
+void raw_local_deleter(void* ptr);
+struct CUDACachingHostAllocator final
+    : public CachingHostAllocatorInterface<CUDACachingHostAllocatorImpl> {
+  at::DataPtr allocate(size_t size) override {
+    auto ptr_and_ctx = impl_->allocate(size);
+    return {
+        ptr_and_ctx.first,
+        ptr_and_ctx.second,
+        &raw_local_deleter,
+        at::DeviceType::CPU};
+  }
+};
+CUDACachingHostAllocator caching_host_allocator;
+static inline CUDACachingHostAllocator& getCUDACachingHostAllocator() {
+  return caching_host_allocator;
+}
+void raw_local_deleter(void* ptr) {
+  getCUDACachingHostAllocator().free(ptr);
+}
 } // anonymous namespace
+bool CachingHostAllocator_recordEvent(
+    void* ptr,
+    void* ctx,
+    at::cuda::CUDAStream stream) {
+  return getCUDACachingHostAllocator().record_event(ptr, ctx, stream);
+}
+// Releases cached pinned memory allocations via cudaHostFree
+void CachingHostAllocator_emptyCache() {
+  getCUDACachingHostAllocator().empty_cache();
+}
+at::Allocator* getCachingHostAllocator() {
+  return &getCUDACachingHostAllocator();
+}
+at::HostStats CachingHostAllocator_getStats() {
+  return getCUDACachingHostAllocator().getStats();
+}
+void CachingHostAllocator_resetAccumulatedStats() {
+  return getCUDACachingHostAllocator().resetAccumulatedStats();
+}
+void CachingHostAllocator_resetPeakStats() {
+  return getCUDACachingHostAllocator().resetPeakStats();
+}
 } // namespace at::cuda
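For orientation, a sketch of how this interface is typically consumed: pinned host memory comes from the caching allocator, and `record_event` defers reuse of a block until the stream's in-flight work completes. The copy itself and all error handling are elided; this assumes a CUDA-enabled build:

    #include <ATen/cuda/CachingHostAllocator.h>
    #include <c10/cuda/CUDAStream.h>

    void pinned_copy_sketch(size_t nbytes) {
      // Pinned allocation served from the cache, not cudaHostAlloc directly.
      at::DataPtr ptr = at::cuda::getCachingHostAllocator()->allocate(nbytes);
      c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream();
      // ... enqueue an async H2D copy from ptr.get() on `stream` here ...
      // Block reuse of this allocation until the stream's pending work runs.
      at::cuda::CachingHostAllocator_recordEvent(ptr.get(), ptr.get_context(), stream);
    } // ptr's deleter returns the block to the cache, not to cudaFreeHost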


@@ -18,52 +18,25 @@ namespace at::cuda {
 // call between host and device, and passed the corresponding context from the
 // allocation. This is currently invoked by at::native::copy_kernel_cuda.
 //
-C10_DEPRECATED_MESSAGE(
-    "at::cuda::getCachingHostAllocator() is deprecated. Please use at::getHostAllocator(at::kCUDA) instead.")
-inline TORCH_CUDA_CPP_API at::HostAllocator* getCachingHostAllocator() {
-  return at::getHostAllocator(at::kCUDA);
-}
+TORCH_CUDA_CPP_API c10::Allocator* getCachingHostAllocator();
 // Records an event in the specified stream. The allocation corresponding to the
 // input `ptr`/`ctx` will not be re-used until the event has occurred.
-C10_DEPRECATED_MESSAGE(
-    "at::cuda::CachingHostAllocator_recordEvent(...) is deprecated. Please use at::getHostAllocator(at::kCUDA)->record_event(...) instead.")
-inline TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent(
+TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent(
     void* ptr,
     void* ctx,
-    c10::cuda::CUDAStream stream) {
-  return getHostAllocator(at::kCUDA)->record_event(ptr, ctx, stream.unwrap());
-}
+    c10::cuda::CUDAStream stream);
 // Releases cached pinned memory allocations via cudaHostFree
-C10_DEPRECATED_MESSAGE(
-    "at::cuda::CachingHostAllocator_emptyCache() is deprecated. Please use at::getHostAllocator(at::kCUDA)->empty_cache() instead.")
-inline TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache() {
-  getHostAllocator(at::kCUDA)->empty_cache();
-}
-C10_DEPRECATED_MESSAGE(
-    "at::cuda::HostAlloc(...) is deprecated. Please use at::getHostAllocator(at::kCUDA)->allocate(...) instead.")
+TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache();
 inline TORCH_CUDA_CPP_API at::DataPtr HostAlloc(size_t size) {
-  return getHostAllocator(at::kCUDA)->allocate(size);
+  return getCachingHostAllocator()->allocate(size);
 }
-C10_DEPRECATED_MESSAGE(
-    "at::cuda::CachingHostAllocator_getStats() is deprecated. Please use at::getHostAllocator(at::kCUDA)->get_stats() instead.")
-inline TORCH_CUDA_CPP_API at::HostStats CachingHostAllocator_getStats() {
-  return getHostAllocator(at::kCUDA)->get_stats();
-}
-C10_DEPRECATED_MESSAGE(
-    "at::cuda::CachingHostAllocator_resetAccumulatedStats() is deprecated. Please use at::getHostAllocator(at::kCUDA)->reset_accumulated_stats() instead.")
-inline TORCH_CUDA_CPP_API void CachingHostAllocator_resetAccumulatedStats() {
-  getHostAllocator(at::kCUDA)->reset_accumulated_stats();
-}
-C10_DEPRECATED_MESSAGE(
-    "at::cuda::CachingHostAllocator_resetPeakStats() is deprecated. Please use at::getHostAllocator(at::kCUDA)->reset_peak_stats() instead.")
-inline TORCH_CUDA_CPP_API void CachingHostAllocator_resetPeakStats() {
-  getHostAllocator(at::kCUDA)->reset_peak_stats();
-}
+TORCH_CUDA_CPP_API at::HostStats CachingHostAllocator_getStats();
+TORCH_CUDA_CPP_API void CachingHostAllocator_resetAccumulatedStats();
+TORCH_CUDA_CPP_API void CachingHostAllocator_resetPeakStats();
 } // namespace at::cuda
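The deprecated wrappers on the `-` side all forward to the device-generic accessor named in their own deprecation messages, so the correspondence between the two APIs is mechanical. A sketch under that assumption (the exact include path may differ by version):

    #include <ATen/core/CachingHostAllocator.h> // assumed home of at::getHostAllocator

    // at::cuda::getCachingHostAllocator()         -> at::getHostAllocator(at::kCUDA)
    // at::cuda::CachingHostAllocator_emptyCache() -> ...->empty_cache()
    // at::cuda::HostAlloc(n)                      -> ...->allocate(n)
    void host_allocator_sketch() {
      auto* host_alloc = at::getHostAllocator(at::kCUDA);
      at::DataPtr p = host_alloc->allocate(1024); // pinned host allocation
      host_alloc->empty_cache();                  // drop cached blocks
    }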


@@ -1,10 +1,11 @@
 #pragma once
+#include <c10/core/Allocator.h>
 #include <ATen/cuda/CachingHostAllocator.h>
 namespace at::cuda {
-inline TORCH_CUDA_CPP_API at::HostAllocator* getPinnedMemoryAllocator() {
-  return at::getHostAllocator(at::kCUDA);
+inline TORCH_CUDA_CPP_API at::Allocator* getPinnedMemoryAllocator() {
+  return getCachingHostAllocator();
 }
 } // namespace at::cuda


@@ -469,7 +469,7 @@ private:
   bool duplicate_inputs_{false};
 };
-template <typename T, typename C_Dtype = T>
+template <typename T>
 struct GemmStridedBatchedParams : OpParams {
   std::string BLASSignature() const override {
     std::string alpha_str = to_string_opmath<T>(alpha);
@@ -477,7 +477,7 @@ struct GemmStridedBatchedParams : OpParams {
     return fmt::sprintf("- { function: matmul, M: %ld, N: %ld, K: %ld, lda: %ld, ldb: %ld, ldc: %ld, ldd: %ld, stride_a: %ld, stride_b: %ld, stride_c: %ld, stride_d: %ld, "
         "alpha: %s, beta: %s, transA: %c, transB: %c, batch_count: %ld, a_type: %s, b_type: %s, c_type: %s, d_type: %s, scale_type: %s, compute_type: %s }",
         m, n, k, lda, ldb, ldc, ldc, stride_a, stride_b, stride_c, stride_c, alpha_str, beta_str, transa, transb, batch,
-        BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<C_Dtype>(C_Dtype{}), BLASTypeName<T>(T{}), ComputeTypeFor<T>(), ComputeTypeFor<T>());
+        BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), BLASTypeName<T>(T{}), ComputeTypeFor<T>(), ComputeTypeFor<T>());
   }
   std::string Signature() const override {
@@ -517,7 +517,7 @@ struct GemmStridedBatchedParams : OpParams {
     c10::DeviceIndex device = 0;
     AT_CUDA_CHECK(c10::cuda::GetDevice(&device));
     size_t c_size = GetSizeC();
-    copy->c = static_cast<C_Dtype*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
+    copy->c = static_cast<T*>(c10::cuda::CUDACachingAllocator::raw_alloc(c_size));
     AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync(
         copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true));
     if (duplicate_inputs) {
@@ -544,7 +544,7 @@ struct GemmStridedBatchedParams : OpParams {
   }
   TuningStatus NumericalCheck(GemmStridedBatchedParams<T> *other) {
-    auto c_dtype = c10::CppTypeToScalarType<C_Dtype>::value;
+    auto c_dtype = c10::CppTypeToScalarType<T>::value;
     return detail::NumericalCheck(c_dtype, c, other->c, GetSizeC()/sizeof(T)) ? OK : FAIL;
   }
@@ -561,7 +561,7 @@ struct GemmStridedBatchedParams : OpParams {
   int64_t ldb{};
   int64_t stride_b{};
   at::opmath_type<T> beta;
-  C_Dtype* c{};
+  T* c{};
   int64_t ldc{};
   int64_t stride_c{};
   int64_t batch{};
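The `C_Dtype` parameter dropped in this hunk matters mainly for validation: when A/B are half but C is float, candidate kernels must be compared at C's real dtype. A hedged sketch of that one idea, using only the c10 trait named in the diff:

    #include <c10/core/ScalarType.h>

    // Sketch: derive the comparison dtype from the C matrix's element type,
    // not from T. With T = Half and C_Dtype = float this yields kFloat.
    template <typename T, typename C_Dtype = T>
    c10::ScalarType numerical_check_dtype() {
      return c10::CppTypeToScalarType<C_Dtype>::value;
    }

    int main() {
      return numerical_check_dtype<c10::Half, float>() == c10::ScalarType::Float ? 0 : 1;
    }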


@@ -849,7 +849,10 @@ namespace at::native {
 // linear algebra operations
 template<class scalar_t>
-static void lapackCholeskySolve(char uplo, int n, int nrhs, scalar_t *a, int lda, scalar_t *b, int ldb, int *info);
+void lapackCholeskySolve(char uplo, int n, int nrhs, scalar_t *a, int lda, scalar_t *b, int ldb, int *info);
+template<class scalar_t, class value_t=scalar_t>
+void lapackSymeig(char jobz, char uplo, int n, scalar_t *a, int lda, value_t *w, scalar_t *work, int lwork, value_t *rwork, int *info);
 template<> void lapackLu<c10::complex<double>>(int m, int n, c10::complex<double> *a, int lda, int *ipiv, int *info) {
   zgetrf_(&m, &n, reinterpret_cast<std::complex<double>*>(a), &lda, ipiv, info);


@@ -1383,35 +1383,35 @@ Tensor bitwise_right_shift(const Scalar& self, const Tensor& other) {
 }
 template <typename Stub>
-static Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Tensor& other, Stub& stub) {
+Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Tensor& other, Stub& stub) {
   auto iter = TensorIterator::comparison_op(result, self, other);
   stub(iter.device_type(), iter);
   return result;
 }
 template <typename OutImpl>
-static Tensor comparison_op(const Tensor& self, const Tensor& other, OutImpl& out_impl) {
+Tensor comparison_op(const Tensor& self, const Tensor& other, OutImpl& out_impl) {
   Tensor result = at::empty({0}, self.options().dtype(kBool));
   return out_impl(result, self, other);
 }
 template <typename OutImpl>
-static Tensor& comparison_op_(Tensor& self, const Tensor& other, OutImpl& out_impl) {
+Tensor& comparison_op_(Tensor& self, const Tensor& other, OutImpl& out_impl) {
   return out_impl(self, self, other);
 }
 template <typename OutImpl>
-static Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Scalar& other, OutImpl& out_impl) {
+Tensor& comparison_op_out(Tensor& result, const Tensor& self, const Scalar& other, OutImpl& out_impl) {
   return out_impl(result, self, wrapped_scalar_tensor(other));
 }
 template <typename OutImpl>
-static Tensor comparison_op(const Tensor& self, const Scalar& other, OutImpl& out_impl) {
+Tensor comparison_op(const Tensor& self, const Scalar& other, OutImpl& out_impl) {
   return comparison_op(self, wrapped_scalar_tensor(other), out_impl);
 }
 template <typename OutImpl>
-static Tensor& comparison_op_(Tensor& self, const Scalar& other, OutImpl& out_impl) {
+Tensor& comparison_op_(Tensor& self, const Scalar& other, OutImpl& out_impl) {
   return out_impl(self, self, wrapped_scalar_tensor(other));
 }
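These helpers exist so each comparison op only supplies a kernel stub while the output-allocation and iterator plumbing is shared. A standalone sketch of the same pattern with simplified types and a hypothetical stub:

    #include <cstdio>
    #include <functional>
    #include <vector>

    // One generic driver; per-op stubs do the elementwise work.
    using Stub = std::function<void(const std::vector<int>&, const std::vector<int>&,
                                    std::vector<bool>&)>;

    std::vector<bool> comparison_op(const std::vector<int>& a,
                                    const std::vector<int>& b, Stub& stub) {
      std::vector<bool> out(a.size()); // shared "result allocation"
      stub(a, b, out);                 // op-specific kernel
      return out;
    }

    int main() {
      Stub eq_stub = [](auto& a, auto& b, auto& out) {
        for (size_t i = 0; i < a.size(); ++i) out[i] = (a[i] == b[i]);
      };
      std::vector<int> x{1, 2, 3}, y{1, 0, 3};
      auto r = comparison_op(x, y, eq_stub);
      std::printf("%d %d %d\n", (int)r[0], (int)r[1], (int)r[2]); // 1 0 1
    }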


@@ -7,11 +7,6 @@
 #include <ATen/Config.h>
 #include <ATen/native/mkldnn/Matmul.h>
-#include <ATen/native/mkldnn/Linear.h>
-#include <ATen/native/Resize.h>
-#if !defined(__s390x__) && !defined(__powerpc__)
-#include <cpuinfo.h>
-#endif
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/CPUFunctions.h>
@@ -29,9 +24,6 @@
 #include <ATen/ops/mv_native.h>
 #include <ATen/ops/scalar_tensor_native.h>
 #include <ATen/ops/vdot_native.h>
-#include <ATen/ops/_scaled_mm_native.h>
-#include <ATen/ops/mul.h>
-#include <ATen/ops/matmul.h>
 #endif
 namespace at::meta {
@@ -230,92 +222,4 @@ Tensor vdot(const Tensor &self, const Tensor &other){
 }
-static Tensor&
-_scaled_mm_out_cpu_emulated(const Tensor& mat1, const Tensor& mat2,
-          const Tensor& scale_a,
-          const Tensor& scale_b,
-          const std::optional<at::Tensor>& bias,
-          const std::optional<at::Tensor>& scale_result,
-          std::optional<c10::ScalarType> out_dtype,
-          bool use_fast_accum,
-          Tensor& out) {
-  TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix");
-  TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix");
-  TORCH_CHECK(
-      mat1.sizes()[1] == mat2.sizes()[0], "mat1 and mat2 shapes cannot be multiplied (",
-      mat1.sizes()[0], "x", mat1.sizes()[1], " and ", mat2.sizes()[0], "x", mat2.sizes()[1], ")");
-  TORCH_INTERNAL_ASSERT((scale_a.numel() == 1 && scale_b.numel() == 1), "Now _scaled_mm only supports per-tensor scaling for CPU backend.");
-  TORCH_CHECK(!bias || bias->numel() == mat2.sizes()[1], "Bias must be size ", mat2.sizes()[1],
-      " but got ", bias->numel());
-  // Check types
-  TORCH_CHECK(!out_dtype || *out_dtype == out.scalar_type(), "out_dtype must match output matrix type");
-  TORCH_CHECK(isFloat8Type(mat1.scalar_type()), "Expected mat1 to be Float8 matrix got ", mat1.scalar_type());
-  TORCH_CHECK(isFloat8Type(mat2.scalar_type()), "Expected mat2 to be Float8 matrix got ", mat2.scalar_type());
-  auto mat1_c = mat1.contiguous();
-  auto mat2_c = mat2.contiguous();
-  IntArrayRef mat1_sizes = mat1_c.sizes();
-  IntArrayRef mat2_sizes = mat2_c.sizes();
-  at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]});
-  float input_scale = scale_a.item<float>();
-  float weight_scale = scale_b.item<float>();
-  auto fp32_mat1 = at::mul(mat1.to(kFloat), input_scale);
-  auto fp32_mat2 = at::mul(mat2_c.to(kFloat), weight_scale);
-  auto out_tmp = at::matmul(fp32_mat1, fp32_mat2);
-  if (bias) {
-    out_tmp.add_(bias.value());
-  }
-  out_tmp = out_tmp.to(out.scalar_type());
-  out.copy_(out_tmp);
-  return out;
-}
-Tensor&
-_scaled_mm_out_cpu(const Tensor& mat1, const Tensor& mat2,
-          const Tensor& scale_a,
-          const Tensor& scale_b,
-          const std::optional<at::Tensor>& bias,
-          const std::optional<at::Tensor>& scale_result,
-          std::optional<c10::ScalarType> out_dtype,
-          bool use_fast_accum,
-          Tensor& out) {
-#if AT_MKLDNN_ENABLED()
-  if (at::globalContext().userEnabledMkldnn()) {
-    bool mixed_dtype = mat1.scalar_type() != mat2.scalar_type();
-    if ((!mixed_dtype && cpuinfo_has_x86_amx_int8()) ||
-        (mixed_dtype && cpuinfo_has_x86_amx_fp16())) {
-      return mkldnn_scaled_mm(
-          mat1,
-          mat2,
-          scale_a,
-          scale_b,
-          bias,
-          scale_result,
-          out_dtype,
-          use_fast_accum,
-          out);
-    }
-  }
-#endif
-  {
-    return _scaled_mm_out_cpu_emulated(mat1, mat2, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
-  }
-}
-Tensor
-_scaled_mm_cpu(const Tensor& mat_a, const Tensor& mat_b,
-          const Tensor& scale_a,
-          const Tensor& scale_b,
-          const std::optional<at::Tensor>& bias,
-          const std::optional<at::Tensor>& scale_result,
-          std::optional<c10::ScalarType> out_dtype,
-          bool use_fast_accum) {
-  const auto out_dtype_ = out_dtype.value_or(mat_a.scalar_type());
-  Tensor out = at::empty({0}, mat_a.options().dtype(out_dtype_));
-  return _scaled_mm_out_cpu(mat_a, mat_b, scale_a, scale_b, bias, scale_result, out_dtype, use_fast_accum, out);
-}
 } // namespace at::native
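The removed emulation computes a dequantize-then-matmul reference: out = (mat1 * scale_a) @ (mat2 * scale_b) + bias, cast to the output dtype. The same arithmetic in a few self-contained lines, with plain float loops standing in for fp8 tensors:

    #include <cstdio>
    #include <vector>

    // Reference semantics of the removed _scaled_mm_out_cpu_emulated path:
    // dequantize per-tensor, matmul in fp32, add bias.
    std::vector<float> scaled_mm_ref(const std::vector<float>& a, // M x K
                                     const std::vector<float>& b, // K x N
                                     int M, int K, int N,
                                     float scale_a, float scale_b,
                                     const std::vector<float>& bias) {
      std::vector<float> out(M * N, 0.f);
      for (int m = 0; m < M; ++m)
        for (int n = 0; n < N; ++n) {
          float acc = 0.f;
          for (int k = 0; k < K; ++k)
            acc += (a[m * K + k] * scale_a) * (b[k * N + n] * scale_b);
          out[m * N + n] = acc + (bias.empty() ? 0.f : bias[n]);
        }
      return out;
    }

    int main() {
      // A scaled by 0.5 times identity scaled by 2 gives A back.
      auto out = scaled_mm_ref({1, 2, 3, 4}, {1, 0, 0, 1}, 2, 2, 2, 0.5f, 2.f, {});
      std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // 1 2 3 4
    }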


@@ -116,44 +116,21 @@ void fp16_gemv_trans(
   fp16_gemv_trans_stub(kCPU, m, n, alpha, a, lda, x, incx, beta, y, incy);
 }
-void bf16_gemv_trans(
-    const int m,
-    const int n,
-    const at::BFloat16 alpha,
-    const at::BFloat16* a,
-    const int lda,
-    const at::BFloat16* x,
-    const int incx,
-    const at::BFloat16 beta,
-    at::BFloat16* y,
-    const int incy);
 #endif // !defined(C10_MOBILE)
 #if defined(__aarch64__) && !defined(C10_MOBILE)
-#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
-static void fp16_gemv_notrans_fp16_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) {
-  for (auto j = 0; j < n; j++) {
-    auto vecCol = vdup_n_f16(x[j]);
-    const auto* column = a + lda * j;
-    for (auto i = 0; i < m; i += 4) {
-      auto yf16 = y + i;
-      auto matRow = vld1_f16(column + i);
-      auto resVec = j != 0 ? vld1_f16(yf16) : vdup_n_f16(0);
-      resVec = vfma_lane_f16(resVec, matRow, vecCol, 0);
-      vst1_f16(yf16, resVec);
-    }
-  }
-}
-#endif
-static void fp16_gemv_notrans_fp32_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) {
-  std::vector<float> sum(m);
-  for (auto j = 0; j < n; j++) {
-    auto vecCol = vdup_n_f32(x[j]);
-    const auto* column = a + lda * j;
-    for (auto i = 0; i < m; i += 4) {
-      auto sf32 = sum.data() + i;
-      auto matRow = vcvt_f32_f16(vld1_f16(column + i));
-      auto resVec = j != 0 ? vld1q_f32(sf32) : vdupq_n_f32(0);
-      resVec = vfmaq_lane_f32(resVec, matRow, vecCol, 0);
-      vst1q_f32(sf32, resVec);
-    }
-  }
-  for (auto i = 0; i < m; i+= 4) {
-    vst1_f16(y + i, vcvt_f16_f32(vld1q_f32(sum.data() + i)));
-  }
-}
 void fp16_gemv_notrans(
     const int m,
     const int n,
@@ -166,55 +143,17 @@ void fp16_gemv_notrans(
     Half* y,
     const int incy);
-void fp16_gemv_notrans(
-    const int m,
-    const int n,
-    const float alpha,
-    const Half* a,
-    const int lda,
-    const Half* x,
-    const int incx,
-    const float beta,
-    Half* y,
-    const int incy) {
-  if (incx == 1 && alpha == 1.0 && beta == 0.0 && m % 4 == 0 && incy == 1) {
-#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
-    if (at::globalContext().allowFP16ReductionCPU()) {
-      return fp16_gemv_notrans_fp16_arith(m, n, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(x), reinterpret_cast<float16_t*>(y));
-    }
-#endif
-    return fp16_gemv_notrans_fp32_arith(m, n, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(x), reinterpret_cast<float16_t*>(y));
-  }
-  std::vector<float> sum(m);
-  for (const auto j : c10::irange(n)) {
-    const auto* column_ = a + lda * j;
-    auto z = alpha * x[j * incx];
-    for (const auto i : c10::irange(m)) {
-      sum[i] += z * column_[i];
-    }
-  }
-  if (beta == 0.0) {
-    for (const auto i : c10::irange(m)) {
-      y[i * incy] = sum[i];
-    }
-  } else {
-    for (const auto i : c10::irange(m)) {
-      y[i * incy] += sum[i];
-    }
-  }
-}
 #endif // defined(__aarch64__) && !defined(C10_MOBILE)
 template <typename scalar_t>
-static bool scal_use_fast_path(
+bool scal_use_fast_path(
     [[maybe_unused]] int64_t n,
     [[maybe_unused]] int64_t incx) {
   return false;
 }
 template <typename scalar_t>
-static bool gemv_use_fast_path(
+bool gemv_use_fast_path(
     [[maybe_unused]] char trans,
     [[maybe_unused]] int64_t m,
    [[maybe_unused]] int64_t n,
@@ -227,7 +166,7 @@ static bool gemv_use_fast_path(
 }
 template <typename scalar_t>
-static void scal_fast_path(
+void scal_fast_path(
     [[maybe_unused]] int* n,
     [[maybe_unused]] scalar_t* a,
     [[maybe_unused]] scalar_t* x,
@@ -237,7 +176,7 @@ static void scal_fast_path(
 }
 template <typename scalar_t>
-static void gemv_fast_path(
+void gemv_fast_path(
     [[maybe_unused]] const char* trans,
     [[maybe_unused]] const int* m,
     [[maybe_unused]] const int* n,
@@ -319,6 +258,10 @@ template <>
 void gemv_fast_path<float>(const char *trans, const int *m, const int *n, const float *alpha, const float *a, const int *lda, const float *x, const int *incx, const float *beta, float *y, const int *incy) {
   sgemv_(remove_const(trans), remove_const(m), remove_const(n), remove_const(alpha), remove_const(a), remove_const(lda), remove_const(x), remove_const(incx), remove_const(beta), y, remove_const(incy));
 }
+#else
+INSTANTIATE(float)
+INSTANTIATE(double)
+#endif // AT_BUILD_WITH_BLAS
 INSTANTIATE(uint8_t)
 INSTANTIATE(int8_t)
@@ -340,7 +283,7 @@ bool gemv_use_fast_path<at::BFloat16>(
       beta == 0.0;
 }
-static void bf16_gemv_trans(
+void bf16_gemv_trans(
     const int m,
     const int n,
     const at::BFloat16 alpha,
@@ -425,7 +368,14 @@ void gemv_fast_path<at::Half>(
       y,
       *incy);
 }
-#else // !defined(__aarch64__))
+#else
+template <>
+bool scal_use_fast_path<at::Half>(
+    [[maybe_unused]] int64_t n,
+    [[maybe_unused]] int64_t incx) {
+  return false;
+}
 template <>
 bool gemv_use_fast_path<at::Half>(
     char trans,
@@ -441,6 +391,79 @@ bool gemv_use_fast_path<at::Half>(
       (c10::detail::fp16_from_bits(beta.x) == 0.0f || trans == 't' || trans == 'T');
 }
+#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
+static void fp16_gemv_notrans_fp16_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) {
+  for (auto j = 0; j < n; j++) {
+    auto vecCol = vdup_n_f16(x[j]);
+    const auto* column = a + lda * j;
+    for (auto i = 0; i < m; i += 4) {
+      auto yf16 = y + i;
+      auto matRow = vld1_f16(column + i);
+      auto resVec = j != 0 ? vld1_f16(yf16) : vdup_n_f16(0);
+      resVec = vfma_lane_f16(resVec, matRow, vecCol, 0);
+      vst1_f16(yf16, resVec);
+    }
+  }
+}
+#endif
+static void fp16_gemv_notrans_fp32_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) {
+  std::vector<float> sum(m);
+  for (auto j = 0; j < n; j++) {
+    auto vecCol = vdup_n_f32(x[j]);
+    const auto* column = a + lda * j;
+    for (auto i = 0; i < m; i += 4) {
+      auto sf32 = sum.data() + i;
+      auto matRow = vcvt_f32_f16(vld1_f16(column + i));
+      auto resVec = j != 0 ? vld1q_f32(sf32) : vdupq_n_f32(0);
+      resVec = vfmaq_lane_f32(resVec, matRow, vecCol, 0);
+      vst1q_f32(sf32, resVec);
+    }
+  }
+  for (auto i = 0; i < m; i+= 4) {
+    vst1_f16(y + i, vcvt_f16_f32(vld1q_f32(sum.data() + i)));
+  }
+}
+void fp16_gemv_notrans(
+    const int m,
+    const int n,
+    const float alpha,
+    const Half* a,
+    const int lda,
+    const Half* x,
+    const int incx,
+    const float beta,
+    Half* y,
+    const int incy) {
+  if (incx == 1 && alpha == 1.0 && beta == 0.0 && m % 4 == 0 && incy == 1) {
+#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC
+    if (at::globalContext().allowFP16ReductionCPU()) {
+      return fp16_gemv_notrans_fp16_arith(m, n, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(x), reinterpret_cast<float16_t*>(y));
+    }
+#endif
+    return fp16_gemv_notrans_fp32_arith(m, n, reinterpret_cast<const float16_t*>(a), lda, reinterpret_cast<const float16_t*>(x), reinterpret_cast<float16_t*>(y));
+  }
+  std::vector<float> sum(m);
+  for (const auto j : c10::irange(n)) {
+    const auto* column_ = a + lda * j;
+    auto z = alpha * x[j * incx];
+    for (const auto i : c10::irange(m)) {
+      sum[i] += z * column_[i];
+    }
+  }
+  if (beta == 0.0) {
+    for (const auto i : c10::irange(m)) {
+      y[i * incy] = sum[i];
+    }
+  } else {
+    for (const auto i : c10::irange(m)) {
+      y[i * incy] += sum[i];
+    }
+  }
+}
 template <>
 void gemv_fast_path<at::Half>(
     const char* trans,
@@ -488,7 +511,6 @@ void gemv_fast_path<at::Half>(
 INSTANTIATE(c10::Half)
 INSTANTIATE(c10::BFloat16)
 #endif // !defined(C10_MOBILE)
-#endif // AT_BUILD_WITH_BLAS
 #undef INSTANTIATE
 } // namespace blas_impl
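As a reference point for the fast paths above: the notrans case computes y = alpha * A * x + beta * y with A stored column-major, accumulating per-column contributions exactly as the scalar fallback does. A minimal scalar version of that loop structure:

    #include <cstdio>
    #include <vector>

    // Scalar reference for gemv notrans: y = alpha * A * x + beta * y,
    // A column-major with leading dimension lda >= m.
    void gemv_notrans_ref(int m, int n, float alpha, const float* a, int lda,
                          const float* x, float beta, float* y) {
      std::vector<float> sum(m, 0.f);
      for (int j = 0; j < n; ++j) {
        const float* column = a + lda * j;
        float z = alpha * x[j];
        for (int i = 0; i < m; ++i) sum[i] += z * column[i];
      }
      for (int i = 0; i < m; ++i) y[i] = (beta == 0.f ? 0.f : beta * y[i]) + sum[i];
    }

    int main() {
      float a[4] = {1, 2, 3, 4}; // 2x2 column-major: [[1,3],[2,4]]
      float x[2] = {1, 1}, y[2] = {0, 0};
      gemv_notrans_ref(2, 2, 1.f, a, 2, x, 0.f, y);
      std::printf("%g %g\n", y[0], y[1]); // 4 6
    }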


@@ -554,7 +554,7 @@ using is_blas_library_type = std::integral_constant<bool,
     std::is_same_v<scalar_t, c10::complex<float>>>;
 template <typename scalar_t>
-static void gemm_batched_generic(
+void gemm_batched_generic(
     TransposeType transa, TransposeType transb,
     int64_t batch_size, int64_t m, int64_t n, int64_t k,
     scalar_t alpha,
@@ -568,7 +568,7 @@ static void gemm_batched_generic(
 }
 template <typename scalar_t>
-static void gemm_batched(
+void gemm_batched(
     TransposeType transa, TransposeType transb,
     int64_t batch_size, int64_t m, int64_t n, int64_t k,
     scalar_t alpha,
@@ -596,7 +596,7 @@ static void gemm_batched(
 }
 template <typename scalar_t>
-static void gemm_batched_with_stride_generic(
+void gemm_batched_with_stride_generic(
     TransposeType transa, TransposeType transb,
     int64_t batch_size, int64_t m, int64_t n, int64_t k,
     scalar_t alpha,
@@ -945,7 +945,7 @@ struct PackKey {
   }
 };
-static inline dnnl::memory::data_type get_dnnl_dtype(ScalarType dtype) {
+inline dnnl::memory::data_type get_dnnl_dtype(ScalarType dtype) {
   if (dtype == ScalarType::Float) {
     return dnnl::memory::data_type::f32;
   } else if (dtype == ScalarType::BFloat16) {


@@ -13,13 +13,15 @@ class Tensor;
 namespace native {
 template<typename O, typename C>
-static void _assert_match(const O& original, const C& compared, const std::string& name) {
+void _assert_match(const O& original, const C& compared, const std::string& name) {
   if (compared) {
     bool equal = (original == compared.value());
     if (!equal) {
       std::stringstream msg;
-      msg << "Tensor " << name << " mismatch! Expected: " << compared.value() << ", Got: " << original;
-      throw std::runtime_error(msg.str());
+      msg << "Tensor " << name << " mismatch!";
+      if (!equal) {
+        throw std::runtime_error(msg.str());
+      }
     }
   }
 }
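Usage sketch: the helper compares an actual tensor property against an optional expected value and throws on mismatch, e.g. checking a dtype or device against optional metadata. A standalone equivalent of the pattern:

    #include <optional>
    #include <sstream>
    #include <stdexcept>
    #include <string>

    // Compare a property against an optional expectation; throw on mismatch.
    template <typename O, typename C>
    void assert_match(const O& original, const std::optional<C>& compared,
                      const std::string& name) {
      if (compared && !(original == compared.value())) {
        std::stringstream msg;
        msg << "Tensor " << name << " mismatch!";
        throw std::runtime_error(msg.str());
      }
    }

    int main() {
      assert_match(3, std::optional<int>{3}, "dim");   // matches: no throw
      assert_match(3, std::optional<int>{}, "dim");    // nothing expected: no throw
      try { assert_match(3, std::optional<int>{4}, "dim"); }
      catch (const std::runtime_error&) { return 0; }  // mismatch throws
      return 1;
    }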


@@ -437,19 +437,4 @@ inline bool xpu_conv_use_channels_last(const at::Tensor& input, const at::Tensor
   return is_channel_last(input) || is_channel_last(weight);
 }
-inline bool mps_conv_use_channels_last(const at::Tensor& input, const at::Tensor& weight) {
-  // check layout only for mps tensor.
-  if (!input.is_mps() || !weight.is_mps()) {
-    return false;
-  }
-  if (!input.defined() || input.is_sparse()) {
-    // suggest channels_first
-    return false;
-  }
-  auto fmt = input.suggest_memory_format();
-  return fmt == at::MemoryFormat::ChannelsLast || fmt == at::MemoryFormat::ChannelsLast3d;
-}
 } // namespace at::native


@@ -30,10 +30,6 @@
 #include <ATen/native/mkldnn/Utils.h>
 #endif
-#ifdef USE_MPS
-#include <ATen/mps/MPSDevice.h>
-#endif
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/Functions.h>
 #include <ATen/NativeFunctions.h>
@@ -97,7 +93,7 @@ static bool conv_benchmark_empty_cache = true;
 // Check workload to activate fast depthwise FP16 cudnn conv kernels
 template <typename T>
-static bool check_cudnn_depthwise_workload(const at::Tensor& input, T stride) {
+bool check_cudnn_depthwise_workload(const at::Tensor& input, T stride) {
   auto w = at::symint::size<T>(input, 3); // same as h
   auto ch = at::symint::size<T>(input, 1);
   auto bs = at::symint::size<T>(input, 0);
@@ -220,7 +216,7 @@ static bool check_cudnn_depthwise_workload(const at::Tensor& input, T stride) {
 // simplified version for cudnn 8.2 and above
 template <typename T>
-static bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, T stride, const at::Tensor& weight) {
+bool check_cudnn_depthwise_workload_with_filter(const at::Tensor& input, T stride, const at::Tensor& weight) {
   // 1D conv
   if(at::symint::size<T>(input, 2) == 1 && stride == 1){
     return true;
@@ -640,7 +636,7 @@ REGISTER_NO_CPU_DISPATCH(miopen_convolution_transpose_backward_stub)
 REGISTER_NO_CPU_DISPATCH(miopen_depthwise_convolution_backward_stub)
 template <typename T>
-static std::ostream& operator<<(std::ostream & out, const ConvParams<T>& params) {
+std::ostream& operator<<(std::ostream & out, const ConvParams<T>& params) {
   out << "ConvParams {"
       << "  stride = " << IntArrayRef{params.stride}
       << "  padding = " << ArrayRef<T>{params.padding}
@@ -1203,7 +1199,7 @@ at::Tensor convolution_overrideable(
 // a bool indicating whether the bias is defined. This is done to save memory by
 // avoiding saving the full bias tensor for backward.
 template <typename T>
-static ConvBackend _select_conv_backend(
+ConvBackend _select_conv_backend(
     const Tensor& input,
     const Tensor& weight,
     const std::optional<Tensor>& bias,
@@ -1417,7 +1413,7 @@ static inline at::MemoryFormat determine_backend_memory_format(
     const Tensor& input,
     const Tensor& weight,
     const ConvBackend backend) {
-  auto backend_memory_format = at::MemoryFormat::Contiguous;
+  at::MemoryFormat backend_memory_format = at::MemoryFormat::Contiguous;
 #if !defined(C10_MOBILE)
   auto k = weight.ndimension();
   // See Note [Mobile check segfaults]
@@ -1455,17 +1451,6 @@ static inline at::MemoryFormat determine_backend_memory_format(
         backend_memory_format = (k == 5) ? at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast;
       }
       break;
-    case ConvBackend::Mps:
-    case ConvBackend::MpsTranspose:
-      if (mps_conv_use_channels_last(input, weight)) {
-#ifdef USE_MPS
-        if (!mps::is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_15_0_PLUS)) {
-          break;
-        }
-#endif
-        backend_memory_format = (k == 5) ? MemoryFormat::ChannelsLast3d : MemoryFormat::ChannelsLast;
-      }
-      break;
     default:
       backend_memory_format = at::MemoryFormat::Contiguous;
   }


@@ -1059,7 +1059,7 @@ static Tensor apply_bag_size_backward(
 }
 template <typename scalar_t>
-static void embedding_bag_cpu_max_out(
+void embedding_bag_cpu_max_out(
     Tensor* max_indices,
     const Tensor& weight,
     const Tensor& indices,
@@ -1505,7 +1505,7 @@ static std::vector<index_t> compute_counts_uniq(
 }
 template <typename scalar_t>
-static void _embedding_bag_dense_backward_cpu_sum_mean(
+void _embedding_bag_dense_backward_cpu_sum_mean(
     const Tensor& grad,
     const Tensor& indices_,
     const Tensor& offset2bag_,
@@ -1641,7 +1641,7 @@ Tensor _embedding_bag_dense_backward_cpu(const Tensor &grad_, const Tensor &indi
 }
 template<typename scalar_t>
-static Tensor _embedding_bag_per_sample_weights_backward_cpu_template(
+Tensor _embedding_bag_per_sample_weights_backward_cpu_template(
     const Tensor& grad,
     const Tensor& weight, // NB: embedding table, not per_sample_weights
     const Tensor& indices_,


@@ -5,7 +5,6 @@
 #include <ATen/WrapDimUtilsMulti.h>
 #include <ATen/TensorOperators.h>
 #include <c10/util/irange.h>
-#include <c10/core/GradMode.h>
 #include <c10/core/SymInt.h>
 #include <c10/util/MaybeOwned.h>
 #include <ATen/TensorSubclassLikeUtils.h>
@@ -159,11 +158,11 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
   Tensor left = left_;
   Tensor right = right_;
   for (const auto i : c10::irange(dim)) {
-    auto sl = TORCH_GUARD_SIZE_OBLIVIOUS(left.sym_size(i).sym_ne(1));
-    auto sr = TORCH_GUARD_SIZE_OBLIVIOUS(right.sym_size(i).sym_ne(1));
+    auto sl = left.sym_size(i)!=1;
+    auto sr = right.sym_size(i)!=1;
     if (sum_dims[i]) { // first dimensions that will be summed over after multiplication
       if (sl && sr) { // dimensions nontrivially in both left and right must be of the same size
-        TORCH_SYM_CHECK(left.sym_size(i).sym_eq(right.sym_size(i)), "non-broadcast dimensions must match");
+        TORCH_CHECK(left.sym_size(i)==right.sym_size(i), "non-broadcast dimensions must match");
         sum_size *= left.sym_size(i);
       } else if (sl) { // if it is only in one of left and right, we can sum right away
         left = left.sum(i, true);
@@ -172,7 +171,7 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra
       }
     } else if (sl && sr) { // now deal with dimensions that will be in the output
       // dimensions nontrivially in both left and right must be of the same size
-      TORCH_SYM_CHECK(left.sym_size(i).sym_eq(right.sym_size(i)), "non-broadcast dimensions must match");
+      TORCH_CHECK(left.sym_size(i)==right.sym_size(i), "non-broadcast dimensions must match");
       lro.push_back(i);
       lro_size *= left.sym_size(i);
     } else if (sl) { // keep track of dimensions appearing only once
@@ -482,10 +481,10 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr
         // Iterate over each dimension covered by ellipsis
         const auto ndim = operands[i].ndimension() - (static_cast<int64_t>(op_labels[i].size()) - 1);
         for (auto j = ell_num_dim - ndim; j < ell_num_dim; ++j) {
-          if (TORCH_GUARD_SIZE_OBLIVIOUS(op.sym_size(dim).sym_ne(1))) {
+          if (op.sym_size(dim) != 1) {
             // Update ellipsis size
-            TORCH_SYM_CHECK(
-                ell_sizes[j].sym_eq(1).sym_or(ell_sizes[j].sym_eq(op.sym_size(dim))),
+            TORCH_CHECK(
+                ell_sizes[j] == 1 || ell_sizes[j] == op.sym_size(dim),
                 "einsum(): dimension ",
                 dim,
                 " covered by ellipsis in operand ",
@@ -501,10 +500,10 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr
           permutation[ell_index + j] = dim++;
         }
       } else if (permutation[label_perm_index[s]] == -1) {
-        if (TORCH_GUARD_SIZE_OBLIVIOUS(op.sym_size(dim).sym_ne(1))) {
+        if (op.sym_size(dim) != 1) {
           // Update subscript
-          TORCH_SYM_CHECK(
-              label_size[s].sym_eq(1).sym_or(label_size[s].sym_eq(op.sym_size(dim))),
+          TORCH_CHECK(
+              label_size[s] == 1 || label_size[s] == op.sym_size(dim),
              "einsum(): subscript ",
              subscript_to_label(s),
              " has size ",
@@ -579,17 +578,16 @@ Tensor einsum(std::string_view equation, TensorList operands, at::OptionalIntArr
     SmallVector<int64_t, 5> a_dims_to_sum;
     SmallVector<int64_t, 5> b_dims_to_sum;
     for (auto dim = out_num_dim; dim < perm_index; ++dim) {
-      if (TORCH_GUARD_SIZE_OBLIVIOUS(a.sym_size(dim).sym_ne(1))
-          && TORCH_GUARD_SIZE_OBLIVIOUS(b.sym_size(dim).sym_ne(1))) {
+      if (a.sym_size(dim) != 1 && b.sym_size(dim) != 1) {
        if (--dim_counts[dim] == 1) {
          sum_dims.push_back(dim);
          dim_counts[dim] = 0;
        }
      } else if (dim_counts[dim] == 1) {
-        if (TORCH_GUARD_SIZE_OBLIVIOUS(a.sym_size(dim).sym_ne(1))) {
+        if (a.sym_size(dim) != 1) {
          a_dims_to_sum.push_back(dim);
          dim_counts[dim] = 0;
-        } else if (TORCH_GUARD_SIZE_OBLIVIOUS(b.sym_size(dim).sym_ne(1))) {
+        } else if (b.sym_size(dim) != 1) {
          b_dims_to_sum.push_back(dim);
          dim_counts[dim] = 0;
        }
@@ -833,14 +831,6 @@ Tensor &tensordot_out(const Tensor& input1, const Tensor& input2, IntArrayRef di
   auto output_device = result.device();
   auto input1_device = input1.device();
   auto input2_device = input2.device();
-  if(result.defined()) {
-    TORCH_CHECK(
-      !(result.requires_grad() && at::GradMode::is_enabled() && result.sizes() != result_tmp.sizes()),
-      "tensordot(): the 'out' tensor was specified and requires gradients, and its shape does not match the expected result. "
-      "Either remove the 'out' argument, ensure it does not require gradients, or make sure its shape matches the expected output."
-    );
-  }
   // check if the input & output tensors are on the same device.
   TORCH_CHECK(
     (output_device == input1_device) && (input1_device == input2_device),
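The `-` side routes these size tests through `TORCH_GUARD_SIZE_OBLIVIOUS` so that symbolic shapes with unbacked dimensions do not force a guard on `size != 1`; the `+` side reverts to eager comparisons that require a concrete value. A toy sketch of the behavioral difference, with a hand-rolled stand-in for a SymInt (nothing here is the real torch machinery):

    #include <cstdio>
    #include <optional>

    // Toy stand-in for a symbolic size whose value may be unknown ("unbacked").
    struct SymSize { std::optional<long> hint; };

    // Eager test: must know the value; throws if unknown.
    bool ne_one_eager(SymSize s) { return s.hint.value() != 1; }

    // Size-oblivious test: when unknown, assume the general case (!= 1)
    // instead of guarding. This is the TORCH_GUARD_SIZE_OBLIVIOUS idea,
    // drastically simplified.
    bool ne_one_oblivious(SymSize s) { return !s.hint || *s.hint != 1; }

    int main() {
      SymSize unknown{std::nullopt};
      std::printf("%d\n", (int)ne_one_oblivious(unknown)); // 1: proceeds, no guard
      // ne_one_eager(unknown) would throw std::bad_optional_access.
      return 0;
    }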


@@ -285,7 +285,7 @@ TORCH_META_FUNC(_linalg_slogdet)(const Tensor& A) {
 }
 template <typename Meta>
-static void common_checks_baddbmm_bmm(Meta& meta, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, bool is_bmm, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
+void common_checks_baddbmm_bmm(Meta& meta, const Tensor& batch1, const Tensor& batch2, const Scalar& beta, const Scalar& alpha, bool is_bmm, const std::optional<Tensor>& self_baddbmm = std::nullopt) {
   TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor");
   TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor");
@@ -1639,7 +1639,7 @@ TORCH_IMPL_FUNC(mm_out_cpu)(const Tensor & self, const Tensor & mat2, const Tens
 }
 template <typename scalar_t, bool is_bmm>
-static inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const Tensor& mat2, const Scalar& beta_, const Scalar& alpha_) {
+inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const Tensor& mat2, const Scalar& beta_, const Scalar& alpha_) {
   int64_t bs = result.size(0);
   int64_t is = result.size(1);
   int64_t js = result.size(2);


@@ -126,7 +126,6 @@ std::tuple<Tensor, Tensor, size_t, std::vector<int64_t>> ctc_loss_allocate_outpu
 // the alphas from the user by only returning the loss.
 template<typename scalar_t, ScalarType target_scalar_type>
 std::tuple<Tensor, Tensor> ctc_loss_cpu_template(const Tensor& log_probs, const Tensor& targets, IntArrayRef input_lengths, IntArrayRef target_lengths, int64_t BLANK) {
-  TORCH_CHECK(log_probs.numel() > 0, "log_probs tensor must not be empty");
   // log_probs: input_len x batch_size x num_labels
   // targets [int64]: batch_size x target_length OR sum(target_lengths)
   constexpr scalar_t neginf = -std::numeric_limits<scalar_t>::infinity();


@@ -20,6 +20,9 @@
 namespace at::native {
+template<typename scalar_t>
+void gemv(char trans, int64_t m, int64_t n, scalar_t alpha, scalar_t *a, int64_t lda, scalar_t *x, int64_t incx, scalar_t beta, scalar_t *y, int64_t incy);
 namespace {
 static inline void slow_conv_transpose3d_shape_check(


@@ -132,7 +132,7 @@ static inline MemoryFormat suggest_memory_format_contig(const Tensor& t) {
 }
 template<typename scalar_t, typename param_t>
-static std::tuple<Tensor,Tensor,Tensor> batch_norm_cpu_transform_input_template(
+std::tuple<Tensor,Tensor,Tensor> batch_norm_cpu_transform_input_template(
     const Tensor& input, const Tensor& weight, const Tensor& bias,
     const Tensor& save_mean /* optional */, const Tensor& save_invstd /* optional */,
     const Tensor& running_mean /* optional */, const Tensor& running_var /* optional */,
@@ -197,7 +197,7 @@ static std::tuple<Tensor,Tensor,Tensor> batch_norm_cpu_transform_input_template(
 }
 template<typename scalar_t, typename param_t, template<typename T> class VarTransform>
-static std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
+std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
     const Tensor& input, const Tensor& running_mean, const Tensor& running_var,
     double momentum, double eps, Tensor& save_mean, Tensor& save_var_transform) {
@@ -287,7 +287,7 @@ static std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
 }
 template<typename scalar_t, typename param_t, template<typename T> class VarTransform>
-static std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
+std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
     const Tensor& input, const Tensor& running_mean, const Tensor& running_var,
     double momentum, double eps) {
   int64_t n_input = input.size(1);
@@ -306,7 +306,7 @@ static std::tuple<Tensor,Tensor> batch_norm_cpu_update_stats_template(
 }
 template<typename scalar_t, typename param_t>
-static std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu_template(
+std::tuple<Tensor, Tensor, Tensor> batch_norm_backward_cpu_template(
     const Tensor& grad_out_, const Tensor& input, const Tensor& weight,
     const Tensor& running_mean, const Tensor& running_var, const Tensor& save_mean, const Tensor& save_invstd,
     bool train, double eps, std::array<bool,3> grad_input_mask) {


@@ -472,7 +472,7 @@ Tensor& logcumsumexp_out(const Tensor& self, int64_t dim, Tensor& result) {
 }
 template <class Stub>
-static void impl_func_cum_ops(
+void impl_func_cum_ops(
     const Tensor& self,
     int64_t dim,
     const Tensor& result,
@@ -769,7 +769,7 @@ inline bool isnan_(T x) {
 }
 template<typename T1, typename T2, typename Operation>
-static void cummax_cummin_helper(const T1* self_data, T1* values_data, T2* indices_data,
+void cummax_cummin_helper(const T1* self_data, T1* values_data, T2* indices_data,
     int self_dim_size, int self_stride, int values_stride, int indices_stride) {
   Operation op;
   T1 out = c10::load(self_data);
@@ -1182,7 +1182,7 @@ std::vector<Tensor> gradient(const Tensor& self, IntArrayRef dim, int64_t edge_o
 // ALL REDUCE #################################################################
-static inline bool should_use_acc_buffer(at::TensorIterator& iter) {
+inline bool should_use_acc_buffer(at::TensorIterator& iter) {
   const auto ndim = iter.ndim();
   if (!iter.device().is_cpu() || iter.noutputs() != 1) {
     return false;
@@ -1591,7 +1591,7 @@ Tensor norm(const Tensor& self, const Scalar& p) {
   return at::norm(self, p, IntArrayRef{}, false);
 }
-static inline TensorIterator get_allany_iter(
+inline TensorIterator get_allany_iter(
     const Tensor& self,
     const Tensor& result,
     OptionalIntArrayRef dims,
@@ -1608,7 +1608,7 @@ static inline TensorIterator get_allany_iter(
 }
 template <int identity, typename Stub>
-static inline void allany_impl(
+inline void allany_impl(
     const Tensor& self,
     const Tensor& result,
     OptionalIntArrayRef dims,
@@ -1653,7 +1653,7 @@ TORCH_IMPL_FUNC(any_all_out)(const Tensor& self, const Tensor& result) {
 }
 template <bool is_all>
-static Tensor allany_dims_default(const Tensor &self, OptionalIntArrayRef dim, bool keepdim) {
+Tensor allany_dims_default(const Tensor &self, OptionalIntArrayRef dim, bool keepdim) {
   // Default implementation in terms of all-reduce or single dim reduce
   if (!dim) {
     Tensor out;
@@ -1732,7 +1732,7 @@ TORCH_IMPL_FUNC(amax_out) (const Tensor& self, IntArrayRef dim, bool keepdim, co
 }
 template <class Stub>
-static void argmax_argmin_impl(
+void argmax_argmin_impl(
     const Tensor& self,
     std::optional<int64_t> dim,
     bool keepdim,


@@ -9,7 +9,6 @@
 #include <ATen/NativeFunctions.h>
 #else
 #include <ATen/ops/resize_as_native.h>
-#include <ATen/ops/resize_as_sparse_native.h>
 #include <ATen/ops/resize_native.h>
 #include <ATen/ops/resize.h>
 #include <ATen/ops/_resize_output.h>
@@ -22,7 +21,7 @@ namespace at::native {
 // Returns true if resize is necessary
 template <typename T>
-static bool _resize_output_check(const Tensor& output, ArrayRef<T> shape) {
+bool _resize_output_check(const Tensor& output, ArrayRef<T> shape) {
   // Tests for resizing of tensors with one or more elements
   if (at::symint::sizes<T>(output).equals(shape)) {
     return false;
@@ -57,7 +56,7 @@ static void native_resize_(const Tensor& output, SymIntArrayRef shape) {
 }
 template <typename T>
-static bool _resize_output(const Tensor& output, ArrayRef<T> shape) {
+bool _resize_output(const Tensor& output, ArrayRef<T> shape) {
   if (_resize_output_check<T>(output, shape)) {
     // avoid a redispatch for cpu and cuda.
     // TODO: when resize_cuda_ is re-written to be unified with resize_,
@@ -197,7 +196,7 @@ static void _maybe_resize_storage(TensorImpl* self, c10::SymInt new_size_bytes)
 }
 template <typename T>
-static TensorImpl* _resize_impl_(
+TensorImpl* _resize_impl_(
     TensorImpl* self,
     ArrayRef<T> size,
     at::OptionalArrayRef<T> stride,
@@ -235,7 +234,7 @@ TensorImpl* resize_impl_cpu_(
 }
 template <typename T>
-static const Tensor& _resize_(
+const Tensor& _resize_(
     const Tensor& self,
     ArrayRef<T> size,
     std::optional<MemoryFormat> optional_memory_format) {


@@ -147,6 +147,7 @@
 namespace at::native {
+std::string shapes_as_str(TensorList tensors);
 AdvancedIndex make_info(Tensor self, IOptTensorListRef orig);
 } // namespace at::native
@@ -185,7 +186,7 @@ TORCH_META_FUNC(gather)
 }
 template <bool use_new_options = false, typename Meta>
-static void scatter_meta_impl(
+void scatter_meta_impl(
     Meta& meta,
     const Tensor& self,
     int64_t dim,
@@ -357,7 +358,7 @@ TORCH_PRECOMPUTE_META_FUNC(index_copy)
 }
 template <typename Meta>
-static void index_func_meta_impl(
+void index_func_meta_impl(
     Meta& meta,
     const Tensor& self,
     int64_t dim,
@@ -592,6 +593,21 @@ static bool all_strides_match(TensorList tensors) {
   return true;
 }
+inline std::string shapes_as_str(TensorList tensors) {
+  std::ostringstream os;
+  bool first = true;
+  for (auto& tensor : tensors) {
+    if (tensor.defined()) {
+      if (!first) {
+        os << ", ";
+      }
+      os << tensor.sizes();
+      first = false;
+    }
+  }
+  return os.str();
+}
 // Replace indexed dimensions in src with stride 0 and the size of the result
 // tensor. The offset in these dimensions is computed by the kernel using the
 // index tensor's values and the stride of src. The new shape is not meaningful.
@@ -2233,7 +2249,7 @@ template <
     typename T,
     typename ReduceStub,
     typename FillStub>
-static void scatter_impl(
+void scatter_impl(
     const Tensor& self,
     int64_t dim,
     const Tensor& index,
@@ -2806,7 +2822,7 @@ Tensor _gather_sparse_backward(
 }
 template <typename scalar_t>
-static int64_t count_nonzero_impl(TensorIteratorBase& iter, Range range) {
+int64_t count_nonzero_impl(TensorIteratorBase& iter, Range range) {
   int64_t num_nonzero = 0;
   auto loop = [&](char** data, const int64_t* strides, int64_t n) {
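The forward-declared `shapes_as_str` simply renders each defined tensor's sizes as a comma-separated list for error messages, e.g. `[2, 3], [3]`. A plain-C++ equivalent over size vectors:

    #include <sstream>
    #include <string>
    #include <vector>

    // Equivalent of shapes_as_str over plain size vectors: "[2, 3], [3]".
    std::string shapes_as_str(const std::vector<std::vector<long>>& shapes) {
      std::ostringstream os;
      bool first = true;
      for (const auto& s : shapes) {
        if (!first) os << ", ";
        os << "[";
        for (size_t i = 0; i < s.size(); ++i) os << (i ? ", " : "") << s[i];
        os << "]";
        first = false;
      }
      return os.str();
    }

    int main() { return shapes_as_str({{2, 3}, {3}}) == "[2, 3], [3]" ? 0 : 1; }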


@@ -569,7 +569,7 @@ static void isin_sorting(
 }
 template <typename... Args>
-static Device out_device(Args&... inps) {
+Device out_device(Args&... inps) {
   for (const auto& i : {inps...}) {
     if (i.device() != at::kCPU) {
       return i.device();
@@ -739,7 +739,7 @@ std::tuple<Tensor&, Tensor&> mode_out(
 }
 template <class Stub>
-static void minmax_out_impl(
+void minmax_out_impl(
     const Tensor& self,
     int64_t dim,
     bool keepdim,


@ -806,7 +806,7 @@ Tensor sparse_compressed_to_dense(
// Computes the strides for view_dtype output when the view dtype is // Computes the strides for view_dtype output when the view dtype is
// smaller than the original dtype // smaller than the original dtype
static inline SymDimVector compute_strides_for_view_dtype_downsize( inline SymDimVector compute_strides_for_view_dtype_downsize(
SymIntArrayRef old_strides, SymIntArrayRef old_strides,
int64_t size_ratio, int64_t size_ratio,
ScalarType old_dtype, ScalarType old_dtype,
@ -832,7 +832,7 @@ static inline SymDimVector compute_strides_for_view_dtype_downsize(
// Computes the strides for view_dtype output when the view dtype is // Computes the strides for view_dtype output when the view dtype is
// larger than the original dtype // larger than the original dtype
static inline SymDimVector compute_strides_for_view_dtype_upsize( inline SymDimVector compute_strides_for_view_dtype_upsize(
SymIntArrayRef old_strides, SymIntArrayRef old_strides,
int64_t size_ratio, int64_t size_ratio,
ScalarType old_dtype, ScalarType old_dtype,
@ -1023,9 +1023,22 @@ static Tensor _mask_to_indices(const Tensor& mask) {
} }
static std::pair<Tensor, Tensor> _not_zero_mask_to_col_row_indices( static std::pair<Tensor, Tensor> _not_zero_mask_to_col_row_indices(
Tensor not_zero_mask) { Tensor not_zero_mask,
auto nz = not_zero_mask.nonzero(); ScalarType index_dtype,
return {nz.select(1, 1), nz.select(1, 0)}; Device index_device) {
auto col_indices =
at::native::arange(
not_zero_mask.size(-1), index_dtype, kStrided, index_device)
.view({1, not_zero_mask.size(-1)})
.expand_as(not_zero_mask)
.masked_select(not_zero_mask);
auto row_indices =
at::native::arange(
not_zero_mask.size(-2), index_dtype, kStrided, index_device)
.view({not_zero_mask.size(-2), 1})
.expand_as(not_zero_mask)
.masked_select(not_zero_mask);
return std::pair<Tensor, Tensor>(col_indices, row_indices);
} }
// Sparse layout conversions Start // Sparse layout conversions Start
@ -1306,8 +1319,8 @@ static Tensor dense_to_sparse_compressed(
Tensor col_indices; Tensor col_indices;
Tensor compressed_indices; Tensor compressed_indices;
if (compressed_rows_layout) { if (compressed_rows_layout) {
std::tie(col_indices, row_indices) = std::tie(col_indices, row_indices) = _not_zero_mask_to_col_row_indices(
_not_zero_mask_to_col_row_indices(not_zero_mask); not_zero_mask, at::kLong, not_zero_mask.device());
compressed_indices = at::_convert_indices_from_coo_to_csr( compressed_indices = at::_convert_indices_from_coo_to_csr(
row_indices, not_zero_mask.size(0), false /*out_int32*/); row_indices, not_zero_mask.size(0), false /*out_int32*/);
{ {
@ -1315,8 +1328,8 @@ static Tensor dense_to_sparse_compressed(
values = values.flatten(0, 1).index_select(0, mask_indices); values = values.flatten(0, 1).index_select(0, mask_indices);
} }
} else { } else {
std::tie(row_indices, col_indices) = std::tie(row_indices, col_indices) = _not_zero_mask_to_col_row_indices(
_not_zero_mask_to_col_row_indices(not_zero_mask.transpose(1, 0)); not_zero_mask.transpose(1, 0), at::kLong, not_zero_mask.device());
compressed_indices = at::_convert_indices_from_coo_to_csr( compressed_indices = at::_convert_indices_from_coo_to_csr(
col_indices, not_zero_mask.size(-1), false /*out_int32*/); col_indices, not_zero_mask.size(-1), false /*out_int32*/);
{ {
@ -1976,7 +1989,7 @@ TORCH_IMPL_FUNC(_convert_indices_from_csr_to_coo_structured_cpu)
* Modified to ensure sorted BSR column indices. * Modified to ensure sorted BSR column indices.
*/ */
template <class index_t, class scalar_t, bool compressed_rows> template <class index_t, class scalar_t, bool compressed_rows>
static void _compressed_to_block_compressed_cpu_kernel( void _compressed_to_block_compressed_cpu_kernel(
const index_t n_compressed, // Tensor size along compressed dimension const index_t n_compressed, // Tensor size along compressed dimension
const index_t n_plain, // Tensor size along plain dimension const index_t n_plain, // Tensor size along plain dimension
const index_t C, // Block size along compressed dimensions const index_t C, // Block size along compressed dimensions
@@ -2073,7 +2086,7 @@ static void _compressed_to_block_compressed_cpu_kernel(
  * https://github.com/scipy/scipy/blob/8a64c938ddf1ae4c02a08d2c5e38daeb8d061d38/scipy/sparse/sparsetools/csr.h
  */
 template <class index_t>
-static index_t compressed_count_blocks(
+index_t compressed_count_blocks(
     const index_t n_compressed, // Tensor size along compressed dimension
     const index_t n_plain, // Tensor size along plain dimension
     const index_t C, // Block size along compressed dimensions
@@ -2097,7 +2110,7 @@ static index_t compressed_count_blocks(
 }
 
 template <Layout target_layout>
-static Tensor _compressed_to_block_compressed_cpu(
+Tensor _compressed_to_block_compressed_cpu(
     const Tensor& self,
     IntArrayRef blocksize) {
   static_assert(
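
These kernels follow the SciPy csr.h source linked above. In particular, the counting pass can be sketched like scipy's csr_count_blocks: a scratch array records, per block column, the last block row that touched it (sketch only, assuming a signed index type).

    #include <vector>

    template <class index_t>
    index_t count_nonzero_blocks(
        index_t n_compressed, index_t n_plain, // tensor sizes
        index_t C, index_t P,                  // block sizes
        const index_t* compressed_ptrs,        // CSR/CSC pointer array
        const index_t* plain_indices) {        // CSR/CSC index array
      // scratch: last block row seen per block column (-1 = none yet)
      std::vector<index_t> last_seen(n_plain / P + 1, index_t(-1));
      index_t n_blocks = 0;
      for (index_t i = 0; i < n_compressed; i++) {
        index_t bi = i / C; // block coordinate along the compressed dim
        for (index_t jj = compressed_ptrs[i]; jj < compressed_ptrs[i + 1]; jj++) {
          index_t bj = plain_indices[jj] / P; // block coordinate, plain dim
          if (last_seen[bj] != bi) {
            last_seen[bj] = bi; // first nonzero seen in block (bi, bj)
            n_blocks++;
          }
        }
      }
      return n_blocks;
    }
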
View File
@@ -2072,24 +2072,22 @@ Tensor vander(const Tensor& x, std::optional<int64_t> N, bool increasing) {
 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ tensor ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 template <typename T>
-static Tensor tensor_cpu(ArrayRef<T> values, const TensorOptions& options) {
+Tensor tensor_cpu(ArrayRef<T> values, const TensorOptions& options) {
   return at::detail::tensor_cpu(values, options);
 }
 
 template <typename T>
-static Tensor tensor_backend(ArrayRef<T> values, const TensorOptions& options) {
+Tensor tensor_backend(ArrayRef<T> values, const TensorOptions& options) {
   return at::detail::tensor_backend(values, options);
 }
 
 template <typename T>
-static Tensor tensor_complex_cpu(
-    ArrayRef<T> values,
-    const TensorOptions& options) {
+Tensor tensor_complex_cpu(ArrayRef<T> values, const TensorOptions& options) {
   return at::detail::tensor_complex_cpu(values, options);
 }
 
 template <typename T>
-static Tensor tensor_complex_backend(
+Tensor tensor_complex_backend(
     ArrayRef<T> values,
     const TensorOptions& options) {
   return at::detail::tensor_complex_backend(values, options);
View File
@@ -216,7 +216,7 @@
 
 namespace at::meta {
 
-static inline c10::MemoryFormat cat_compute_output_memory_format(
+inline c10::MemoryFormat cat_compute_output_memory_format(
     const MaterializedITensorListRef& inputs) {
   std::optional<c10::MemoryFormat> format = std::nullopt;
   for (const Tensor& t : inputs) {
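
cat_compute_output_memory_format reduces the inputs' suggested memory formats to a single output format. A self-contained sketch of that reduction shape (hypothetical stand-in enum, not the c10 types): keep a format only while every input suggests the same one, and fall back to contiguous on any disagreement.

    #include <optional>
    #include <vector>

    enum class Format { Contiguous, ChannelsLast }; // hypothetical stand-in

    Format reduce_formats(const std::vector<Format>& suggestions) {
      std::optional<Format> format;
      for (Format f : suggestions) {
        if (!format.has_value()) {
          format = f;          // first suggestion wins provisionally
        } else if (*format != f) {
          return Format::Contiguous; // disagreement: fall back
        }
      }
      return format.value_or(Format::Contiguous);
    }
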
@@ -1119,7 +1119,7 @@ std::vector<Tensor> tensor_split_sections_symint(
 }
 
 template <typename T>
-static std::vector<Tensor> _tensor_split_indices(
+std::vector<Tensor> _tensor_split_indices(
     const Tensor& self,
     ArrayRef<T> indices,
     int64_t dim) {
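
_tensor_split_indices backs tensor_split's indices overload, which cuts self along dim at each given index. Minimal usage sketch (assumes a libtorch build):

    #include <ATen/ATen.h>

    int main() {
      auto t = at::arange(10);
      // indices overload: explicit split points, not a section count
      auto parts = at::tensor_split(t, {2, 5}, /*dim=*/0);
      // parts[0] = [0,2), parts[1] = [2,5), parts[2] = [5,10)
      return parts.size() == 3 ? 0 : 1;
    }
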
@@ -1417,7 +1417,7 @@ Tensor as_strided_tensorimpl(
 }
 
 template <typename T>
-static inline void setStridedUnchecked(
+inline void setStridedUnchecked(
     const Tensor& self,
     ArrayRef<T> size,
     ArrayRef<T> stride,
@@ -1922,7 +1922,7 @@ Tensor tile_symint(const Tensor& self, SymIntArrayRef reps) {
 // templated for ArrayRef<int64_t> and SmallVector<int64_t> use cases
 //
 template <typename Vec>
-static Tensor alias_with_sizes_and_strides(
+Tensor alias_with_sizes_and_strides(
     const Tensor& self,
     const Vec& sizes,
     const Vec& strides) {
@@ -1958,7 +1958,7 @@ static Tensor alias_with_sizes_and_strides(
 // SymIntArrayRef/ArrayRef<c10::SymInt> and
 // SmallVector<c10::SymInt>/SymDimVector
 template <template <typename...> typename Container>
-static Tensor alias_with_sizes_and_strides(
+Tensor alias_with_sizes_and_strides(
     const Tensor& self,
     const Container<c10::SymInt>& sizes,
     const Container<c10::SymInt>& strides) {
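
This second overload is parameterized over the container template itself, so one definition serves the symbolic-int container types named in the comment. A freestanding illustration of the template-template pattern (toy example, not the ATen code):

    #include <deque>
    #include <vector>

    // One definition, any container template holding the right element type.
    template <template <typename...> typename Container>
    long long product(const Container<long long>& xs) {
      long long p = 1;
      for (long long x : xs) p *= x;
      return p;
    }

    int main() {
      std::vector<long long> v{2, 3, 4};
      std::deque<long long> d{5, 6};
      return product(v) == 24 && product(d) == 30 ? 0 : 1;
    }
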
@@ -3290,7 +3290,7 @@ static inline std::vector<Tensor> get_stack_inputs(
   return inputs;
 }
 
-static bool inline maybe_native_stack(
+bool inline maybe_native_stack(
     Tensor& result,
     TensorList tensors,
     int64_t dim) {
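
maybe_native_stack tries to dispatch stacking to a fused kernel when the inputs allow it; semantically, stack is unsqueeze-then-cat along the new dimension. Equivalence sketch (assumes a libtorch build):

    #include <ATen/ATen.h>

    int main() {
      auto a = at::ones({2, 3});
      auto b = at::zeros({2, 3});
      auto fused = at::stack({a, b}, /*dim=*/1);                  // {2, 2, 3}
      auto manual = at::cat({a.unsqueeze(1), b.unsqueeze(1)}, 1); // {2, 2, 3}
      return fused.equal(manual) ? 0 : 1;
    }
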
@@ -3366,7 +3366,7 @@ static std::vector<Tensor> _pad_chunk(
     std::vector<int64_t> view_sizes(
         tensor_size.begin(), tensor_size.begin() + dim);
     view_sizes.insert(view_sizes.end(), {num_chunks, -1});
-    padded_tensors.push_back(padded_tensor.reshape(view_sizes));
+    padded_tensors.push_back(padded_tensor.view(view_sizes));
   }
   return padded_tensors;
 }
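
The reshape-to-view change matters because view never copies and throws when the strides cannot express the new shape, whereas reshape silently falls back to a copy. A small demonstration of that difference (assumes a libtorch build):

    #include <ATen/ATen.h>

    int main() {
      auto t = at::arange(6).view({2, 3}); // contiguous: view is free
      auto nc = t.t();                     // transpose: non-contiguous
      auto copied = nc.reshape({6});       // reshape falls back to a copy
      // nc.view({6});                     // view would throw here instead
      return copied.is_contiguous() ? 0 : 1;
    }
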
@@ -4021,7 +4021,7 @@ Tensor& squeeze_(Tensor& self, IntArrayRef dims) {
 
 // This is a hack because in-place operations on tensors treated like views
 // can be much more expensive than the same operations on non-view tensors.
-static inline Tensor view_impl(const Tensor& self, IntArrayRef size) {
+inline Tensor view_impl(const Tensor& self, IntArrayRef size) {
   at::DimVector inferred_size = at::infer_size_dv(size, self.numel());
   auto stride =
       at::detail::computeStride(self.sizes(), self.strides(), inferred_size);
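
view_impl first infers any -1 dimension from numel(), then asks computeStride whether the view is expressible. A plain sketch of the -1 inference step (semantics only, not at::infer_size_dv):

    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    std::vector<int64_t> infer_size(std::vector<int64_t> shape, int64_t numel) {
      int64_t known = 1;
      int64_t infer_dim = -1;
      for (size_t i = 0; i < shape.size(); ++i) {
        if (shape[i] == -1) {
          infer_dim = static_cast<int64_t>(i); // remember the -1 slot
        } else {
          known *= shape[i];
        }
      }
      if (infer_dim >= 0) {
        if (known == 0 || numel % known != 0) {
          throw std::runtime_error("shape is invalid for input size");
        }
        shape[infer_dim] = numel / known; // fill in the inferred extent
      } else if (known != numel) {
        throw std::runtime_error("shape is invalid for input size");
      }
      return shape;
    }
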
Some files were not shown because too many files have changed in this diff.