Summary:
Re-enable the `test_triton_wrapper.py` test
# Why
We want this to run internally
# What
- fix python path issue on the test
- reenable the test
# Background
It appears that the parent process does not pass its full module search path down to the child process. In particular, if some setup makes the effective `sys.path` differ from, say, `PYTHONPATH`, the child will not inherit that setup. To avoid needing to keep track of specific setups, we pass the parent's effective `sys.path` to the child through the `PYTHONPATH` environment variable.
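A minimal sketch of the approach (the child script name here is hypothetical):
```python
import os
import subprocess
import sys

# Propagate the parent's *effective* sys.path to the child, since the child
# would otherwise only see whatever PYTHONPATH was set to at launch time.
env = dict(os.environ)
env["PYTHONPATH"] = os.pathsep.join(p for p in sys.path if p)
subprocess.run([sys.executable, "child_test.py"], env=env, check=True)
```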
Test Plan: buck2 test 'fbcode//mode/opt' fbcode//caffe2/test/inductor:triton_wrapper
Differential Revision: D63438186
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136721
Approved by: https://github.com/henrylhtsang
## Motivation
The FSDP common code for FSDP UT execution is mostly written with CUDA devices in mind. However, other devices such as Intel Gaudi support most of the functionality. We are generalizing the base content so that the UT content can be used for non-CUDA device execution.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133209
Approved by: https://github.com/kwen2501
Move `get-job-id` steps before running the tests and copy-n-paste environment variables from `_mac-test.yml` added in https://github.com/pytorch/pytorch/pull/113099
Should fix the following warning during MPS test run:
```
/Users/ec2-user/runner/_work/pytorch/pytorch/tools/stats/upload_metrics.py:147: UserWarning: Not emitting metrics for td_test_failure_stats_v2. Missing job_id. Please set the JOB_ID environment variable to pass in this value.
warn(f"Not emitting metrics for {metric_name}. {e}")
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136791
Approved by: https://github.com/albanD, https://github.com/izaitsevfb
PyTorch community members have reported issues with building PyTorch from source for ROCm in an environment that doesn't have aotriton pre-installed, because aotriton is only installed in the [CI](a8ed873ba2/.ci/docker/manywheel/Dockerfile (L197)) docker images. Building aotriton from source can take ~45 minutes.
This PR fixes the issue by downloading the aotriton tarball in such scenarios, *unless the user explicitly wants to build aotriton from source using the AOTRITON_INSTALL_FROM_SOURCE=1 env var*
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136603
Approved by: https://github.com/atalman
Co-authored-by: Xinya Zhang <Xinya.Zhang@amd.com>
Summary:
With empty graphs, the `graph.inserting_before(first_user_input = None)` call turns into a `graph.inserting_after(root)` call, inverting the order of constant input nodes being inserted.
This fixes the issue by initializing to the first node in the graph (still valid if not a user input - only used for insertion).
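A rough sketch of the insertion-point logic described above (simplified and illustrative, not the exact diff):
```python
import torch
from torch.fx import symbolic_trace

gm = symbolic_trace(torch.nn.Linear(2, 2))

# Prefer the first placeholder; for an "empty" graph with no user inputs,
# fall back to the first node so inserting_before() still has a valid anchor
# and constant input nodes keep their original insertion order.
first_user_input = next(
    (n for n in gm.graph.nodes if n.op == "placeholder"), None
)
insertion_anchor = first_user_input or next(iter(gm.graph.nodes))
with gm.graph.inserting_before(insertion_anchor):
    pass  # insert constant-input placeholders here
```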
Test Plan: test_export
Differential Revision: D63403514
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136658
Approved by: https://github.com/avikchaudhuri
This file hasn't had an overhaul in a few years, so this is long overdue. Most of the credit goes to @orionr for gathering all of this info.
The main rules we followed:
- No code contributor is removed, they're all placed as emeritus
- Break down overly large categories so this document is useful for knowing who to ping
- No category whose code is still in the codebase is removed
- We did not rework the categories (for example to be closer to module: labels) and leave that for later
- All non-emeritus names are ordered by their number of comments on issues related to their topic
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136672
Approved by: https://github.com/eqy, https://github.com/ezyang, https://github.com/seemethere, https://github.com/malfet
Not sure why `isinf` is a composite op, but it needs to be implemented by hand here.
Implementation is a trivial call to
```objc
[mpsGraph equalWithPrimaryTensor:input
                 secondaryTensor:[mpsGraph constantWithScalar:std::numeric_limits<T>::infinity()
                                                     dataType:input.dataType]]
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136689
Approved by: https://github.com/Skylion007
Prior to this PR, calling `reshape()` under `inference_mode()` would throw a `NotImplementedError`. This is because `inference_mode()` disables autograd key dispatch, incidentally preventing the decomposition of reshape for NJT.
This PR fixes this by redispatching on the `CompositeImplicitAutogradNestedTensor` key whenever a composite implicit op is encountered in `NJT.__torch_dispatch__()`. This fixes reshape and any other composite implicit ops underneath `inference_mode()`.
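A usage sketch of what this enables (previously this raised `NotImplementedError` under `inference_mode()`):
```python
import torch

nt = torch.nested.nested_tensor(
    [torch.randn(3, 4), torch.randn(5, 4)], layout=torch.jagged
)
with torch.inference_mode():
    # reshape is composite implicit; it now redispatches on
    # CompositeImplicitAutogradNestedTensor even with autograd dispatch disabled
    out = nt.reshape(2, -1, 4)
```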
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134683
Approved by: https://github.com/soulitzer, https://github.com/albanD
ghstack dependencies: #136566
Fixes #136565
This PR makes the python fallback robust to the case where there are no active modes & no tensors with the Python key. In this case, simply redispatch with the Python key disabled.
This was found when trying to use reentrant dispatch for NJT to get decompositions under `inference_mode()` when the autograd key is disabled.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136566
Approved by: https://github.com/bdhirsh
**Summary**
Optimize the WOQ int8 AMX performance by changing the int8 -> bf16 conversion.
Earlier, 16 int8 elements were being loaded at a time & converted to 16 BF16 elements.
With this change, 32 int8 elements will be loaded at a time, and converted to a cache-line of 32 BF16 elements more efficiently.
Performance before
```
AUTOTUNE _weight_int8pack_mm(4096x4096, 4096x4096, 4096)
cpp_packed_gemm_0 38.0439 ms 100.0%
_weight_int8pack_mm 50.2524 ms 75.7%
SingleProcess AUTOTUNE benchmarking takes 1.1087 seconds and 1.9791 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x4096, 11008x4096, 11008)
cpp_packed_gemm_4 78.2038 ms 100.0%
_weight_int8pack_mm 119.1962 ms 65.6%
SingleProcess AUTOTUNE benchmarking takes 1.9274 seconds and 1.9949 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x11008, 4096x11008, 4096)
cpp_packed_gemm_6 79.2368 ms 100.0%
_weight_int8pack_mm 118.3212 ms 67.0%
SingleProcess AUTOTUNE benchmarking takes 1.9200 seconds and 2.0015 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x4096, 32000x4096, 32000)
cpp_packed_gemm_224 225.7201 ms 100.0%
_weight_int8pack_mm 388.5588 ms 58.1%
```
Performance after this PR
```
AUTOTUNE _weight_int8pack_mm(4096x4096, 4096x4096, 4096)
cpp_packed_gemm_0 11.0086 ms 100.0%
_weight_int8pack_mm 50.2918 ms 21.9%
SingleProcess AUTOTUNE benchmarking takes 1.0837 seconds and 2.0301 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x4096, 11008x4096, 11008)
cpp_packed_gemm_4 24.3528 ms 100.0%
_weight_int8pack_mm 119.8492 ms 20.3%
SingleProcess AUTOTUNE benchmarking takes 1.8303 seconds and 1.8195 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x11008, 4096x11008, 4096)
cpp_packed_gemm_6 24.6148 ms 100.0%
_weight_int8pack_mm 119.1908 ms 20.7%
SingleProcess AUTOTUNE benchmarking takes 1.8315 seconds and 1.8352 seconds precompiling
AUTOTUNE _weight_int8pack_mm(4096x4096, 32000x4096, 32000)
cpp_packed_gemm_224 78.1369 ms 100.0%
_weight_int8pack_mm 387.6289 ms 20.2%
SingleProcess AUTOTUNE benchmarking takes 4.5059 seconds and 1.8010 seconds precompiling
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136630
Approved by: https://github.com/jgong5
ghstack dependencies: #136353
Summary:
We have a user report on a BA model that raised "AttributeError: 'SymFloat' object has no attribute 'shape'", so we add a type check for the meta node.
See more context in the post
https://fb.workplace.com/groups/1075192433118967/permalink/1510477489590457/
Test Plan:
# local reproduce
```
CUDA_VISIBLE_DEVICES=3 OC_CAUSE=1 buck2 run mode/opt //scripts/jackiexu0313/pt2:local_model_with_pt2 -- --test_mode split-batch-decompose --flow_id 646303196
```
P1609807876
# E2E
before fix
f646303196
after fix
Differential Revision: D63399959
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136650
Approved by: https://github.com/ezyang
Fixes #133683, fixes #133684, fixes #133688
This PR introduces a new base class `_ArglessActivation` and refactors five existing activation functions to inherit from it. This change aims to improve documentation consistency and also API consistency with other activation functions that do have parameters and explicitly call `super().__init__()`
Key changes and considerations:
1. Added a new class `_ArglessActivation` (a minimal sketch follows this list)
2. Refactored the following classes to inherit from `_ArglessActivation`:
- Sigmoid
- Tanh
- Softsign
- Tanhshrink
- Softmax2d
3. Performance consideration:
- This change introduces a slight overhead for creating a new stack frame and handling an additional function call on every instance creation
- The impact is expected to be minimal in most use cases
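Roughly what the new base class and a refactored subclass look like, per the description above (a sketch, not the exact diff; docstring handling omitted):
```python
import torch
from torch.nn import Module

class _ArglessActivation(Module):
    """Base class for activation modules that take no constructor arguments."""

    def __init__(self) -> None:
        super().__init__()

class Sigmoid(_ArglessActivation):
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        return torch.sigmoid(input)
```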
Docs view before:
<img width="425" alt="Screen Shot 2024-09-18 at 3 00 22 PM" src="https://github.com/user-attachments/assets/ca0d1000-44c5-4c52-b344-68f7e170bafe">
Docs view after:
<img width="431" alt="Screen Shot 2024-09-18 at 3 00 52 PM" src="https://github.com/user-attachments/assets/f7ceb8f3-a2a2-4fd6-a2b8-39105a02bcbd">
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136296
Approved by: https://github.com/mikaylagawarecki
Fixes https://github.com/pytorch/pytorch/issues/136177
The motivation is that torch::deploy doesn't handle this well. The
workaround for users is to use C++ custom ops.
All torch.library APIs ultimately go through the torch.library.Library
object, so we add checks to noop for torch::deploy there.
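A minimal sketch of the kind of guard added (assuming a helper such as `torch._running_with_deploy()` is available for the check; the actual hook point is `torch.library.Library`):
```python
import torch

def _noop_under_deploy(register_fn):
    # Sketch: turn torch.library registrations into no-ops under torch::deploy,
    # mirroring the check described above. Illustrative only, not the real diff.
    def wrapper(*args, **kwargs):
        if torch._running_with_deploy():  # assumed helper for the deploy check
            return None
        return register_fn(*args, **kwargs)
    return wrapper
```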
Test Plan:
- new test
- going to test this internally and hope nothing breaks.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136645
Approved by: https://github.com/ezyang
Fix two more leaks of the same variety as #136507 (see that PR desc and attached gdoc for debug details).
This time, also add a test-time check that helped to discover new leaks and ensure we won't accidently regress.
Adds `check_tensor_leak` util which internally asserts no tensors are being kept alive by other objects involved in py ref cycles.
Uses objgraph for a nice debug utility when a leak is found.
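A rough sketch of what such a checker can look like (not the exact util added here; `objgraph.show_backrefs` is the real objgraph API, and rendering the image requires graphviz):
```python
import gc
import warnings

import torch

def check_tensor_leak(render_graph: bool = False) -> None:
    # Keep unreachable objects in gc.garbage so we can inspect them.
    gc.set_debug(gc.DEBUG_SAVEALL)
    gc.collect()
    leaked = [o for o in gc.garbage if isinstance(o, torch.Tensor)]
    if leaked:
        warnings.warn(
            f"{len(leaked)} tensors were found in the garbage. "
            "Did you introduce a reference cycle?"
        )
        if render_graph:
            import objgraph  # optional, used only for debugging output
            objgraph.show_backrefs(leaked[:1], filename="leaked_tensor.png")
    gc.set_debug(0)
```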
Credit to @H-Huang for pointing out objdump and helping debug the `param_group["intermediates"]` leak.
I manually confirmed that all 3 of the leaks identified/fixed so far are caught by the unit test and checker.
Sample output, if I re-introduce a leak by commenting out `del param_group["intermediates"]` in _backward.py,
and run `python test/distributed/pipelining/test_schedule_multiproc.py -k test_schedule_with_native_zero_bubble`:
```
warnings.warn(
/data/users/whc/pytorch/torch/testing/_internal/common_utils.py:5341: UserWarning: 34 tensors were found in the garbage. Did you introduce a reference cycle?
warnings.warn(
/data/users/whc/pytorch/torch/testing/_internal/common_utils.py:5347: UserWarning: Dumping first 1 objgraphs of leaked tensors rendered to png
Graph written to /tmp/objgraph-ztz642h3.dot (19 nodes)
Graph viewer (xdot) not found, generating a png instead
Image generated as /tmp/objgraph-ztz642h3.png
```
rendering of ` /tmp/objgraph-ztz642h3.png`:
<img width="1671" alt="image" src="https://github.com/user-attachments/assets/9098ff29-224c-4533-935b-83c210ac2e22">
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136584
Approved by: https://github.com/kwen2501, https://github.com/H-Huang
ghstack dependencies: #136507
Co-authored-by: Howard Huang <howardhuang@fb.com>
Fixes #131701
Use CMake imported targets more consistently to eliminate hardcode paths.
Here is the new relevant sections of Caffe2Targets.cmake:
```
set_target_properties(c10_hip PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
INTERFACE_LINK_LIBRARIES "c10;hip::amdhip64"
)
```
```
set_target_properties(torch_hip PROPERTIES
INTERFACE_COMPILE_DEFINITIONS "USE_C10D_NCCL"
INTERFACE_COMPILE_OPTIONS "-fPIC;-D__HIP_PLATFORM_AMD__=1;-DCUDA_HAS_FP16=1;-DUSE_ROCM;-D__HIP_NO_HALF_OPERATORS__=1;-D__HIP_NO_HALF_CONVERSIONS__=1;-DTORCH_HIP_VERSION=602;-Wno-shift-count-negative;-Wno-shift-count-overflow;-Wno-duplicate-decl-specifier;-DCAFFE2_USE_MIOPEN;-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP;-std=c++17;-DHIPBLAS_V2;-DHIP_NEW_TYPE_ENUMS"
INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include"
INTERFACE_LINK_LIBRARIES "c10_hip;torch_cpu_library;hip::amdhip64;MIOpen;hiprtc::hiprtc;roc::hipblaslt;roc::hipblas;hip::hipfft;hip::hiprand;roc::hipsparse;roc::hipsolver"
)
```
HIPCUB dependency was not actually used; which is why it is removed here as the imported target had undesirable side effects.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136283
Approved by: https://github.com/jeffdaily, https://github.com/Skylion007, https://github.com/jithunnair-amd, https://github.com/atalman
This reverts commit 7743149b2be4a9eba7e0997ccdc6abe552bec266.
Reverts
* https://github.com/pytorch/pytorch/pull/135503
* https://github.com/pytorch/pytorch/pull/135502
* https://github.com/pytorch/pytorch/pull/135422
This passes this test. Earlier, the getitem would stay a getitem in the FX graph. But now fake tensor propagation fails, saying that `.item()` is called. It seems that torch function is not being triggered during fake tensor propagation.
```
import torch
from torch.nn.attention.flex_attention import BlockMask, _mask_mod_signature, _score_mod_signature, flex_attention
from torch._inductor.lowering import make_pointwise, register_lowering
from torch._inductor.virtualized import ops
from torch.nn.attention.flex_attention import create_block_mask
torch.set_default_device('cuda')
flex_attention = torch.compile(flex_attention, dynamic=False)
prefix_lengths = torch.arange(8)
def prefix_lm(b, h, q, kv):
    return prefix_lengths[b] >= kv
mask = create_block_mask(prefix_lm, 8, None, 512, 512, _compile=True)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136590
Approved by: https://github.com/Chillee
Summary: If you actually import the module, you might end up with some import cycle situation where a module is imported too early and accesses things that are not initialized yet.
Test Plan:
sandcastle and ossci
```
TORCH_LOGS=+torch._inductor.codecache buck run mode/opt caffe2/benchmarks/dynamo:torchbench
```
Differential Revision: D63330224
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136548
Approved by: https://github.com/Skylion007
Summary: Previously the `_inline_module` helper function only worked with submodules that have args specified. This diff updates the util function to look for input arguments from submodule kwargs first, using placeholder node names, then fall back to the list of args if the node name is not found.
Test Plan:
```
buck2 run @//mode/{opt,mtia,inplace} //glow/fb/fx/fba/tests:test_fba_inductor -- -r test_connected_fusions
```
Differential Revision: D63347675
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136631
Approved by: https://github.com/jfix71
AMD devices have 64 elements per warp; this PR makes the handling of "ELEMENTS_PER_WARP_32" generic and uses DeviceProperties.warp_size to determine the warp size instead of hard-coding it as 32. It also renames the enum value. Added a unit test for this.
Note: I left the old enum option (ELEMENTS_PER_WARP_32) as is instead of renaming it. I'm not sure whether we should expect caches to get invalidated here; if this concern is valid, then there's a risk that this would get updated but some model could use the cached inductor code, which would reference "ELEMENTS_PER_WARP_32", which would no longer exist.
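A small sketch of the generic warp-size query this relies on (shown here via the CUDA runtime device properties as an illustration, with a fallback in case the attribute is unavailable):
```python
import torch

if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    # 32 on NVIDIA GPUs, 64 on AMD wavefronts; fall back to 32 if unavailable.
    warp_size = getattr(props, "warp_size", 32)
else:
    warp_size = 32
elements_per_warp = warp_size  # instead of hard-coding 32
```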
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136472
Approved by: https://github.com/jansel
Summary: Title
Test Plan: CI
This fixes some breaking tests in executorch. I think the root cause is that for aten::matmul, which we are not preserving, we register a meta implementation from the C++ side. It seems the C++ kernel doesn't work well with a mix of FakeTensor and real tensors. This PR sidesteps the problem by always preferring the Python CIA decomp over the C++ CIA decomp.
Differential Revision: D63297050
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136492
Approved by: https://github.com/bdhirsh
Summary: Previously we had a very bad bug where we didn't allow any decomp on CIA. This never mattered before because we never had to actually push a CIA decomp to the Python key level in export.
Test Plan: CI
Differential Revision: D63363749
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136600
Approved by: https://github.com/bdhirsh
Fixes #136504
If you have a tl.constexpr parameter to a triton kernel, and you pass in a SymNode, then, right now, you run into failures (see under 'constants'):
```
File "/tmp/torchinductor_dberard/na/cnax67r5zmslz7bvdfizteaepj7fajpjallb3bu2gyetjcdqtbzj.py", line 14, in <module>
triton_meta={'signature': {0: '*fp32', 1: '*fp32'}, 'device': DeviceProperties(type='cuda', index=0, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, multi_processor_count=132, warp_size=32), 'constants': {2: s0, 3: 256}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1), equal_to_1=())]},
torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised:
NameError: name 's0' is not defined
```
To fix this, we specialize on the value during dynamo tracing, so that we have a real integer when we do codegen.
Alternatives: specialize somewhere else (e.g. inductor); or figure out how to actually pass the value dynamically into the user-written kernel. However, if we try to pass a dynamic value, then we wouldn't be able to precompile the triton kernels in inductor or use AOTI.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136512
Approved by: https://github.com/oulgen, https://github.com/jansel, https://github.com/eellison
The test is failing in trunk atm with the following error:
```
test_serialization.py::TestSerialization::test_skip_data_serialization_materialize_fake_False - AssertionError: "Can't pickle local object 'WeakValueDictionary.__init__.<locals>.remove'" does not match "Can't get local object 'WeakValueDictionary.__init__.<locals>.remove'"
```
for example, 36f0e61166
This comes from this cpython commit a3076c734d, and manifests in python 3.12.5 currently used in CI. The failure doesn't happen when I try it out with 3.12.3 and 3.12.4. Looking at the commit logs of https://github.com/python/cpython/commits/main/Lib/pickle.py, it looks like the exception message is changing back and forth, so I guess a regex match would capture both.
Fixes the compilation error of max-autotune for `maml_omniglot` (AMP and FP32) and `soft_actor_critic` (AMP) in Torchbench for single-thread dynamic shapes case:
```
/tmp/torchinductor_user/uv/cuvq6wenwp7us423onuvntkfx4cspmagha5beiknob7tiebzhupa.cpp: In function ‘void kernel(const bfloat16*, const bfloat16*, const bfloat16*, bfloat16*, int64_t)’:
/tmp/torchinductor_user/uv/cuvq6wenwp7us423onuvntkfx4cspmagha5beiknob7tiebzhupa.cpp:279:41: error: the value of ‘Mr_blocks’ is not usable in a constant expression
279 | constexpr int64_t m_block_end = Mr_blocks;
| ^~~~~~~~~
/tmp/torchinductor_user/uv/cuvq6wenwp7us423onuvntkfx4cspmagha5beiknob7tiebzhupa.cpp:237:19: note: ‘Mr_blocks’ was not initialized with a constant expression
237 | const int64_t Mr_blocks = (M + Mr - 1) / Mr;
| ^~~~~~~~~
```
The PR also updates the UT to add a test for `BS`=512 in single thread.
The previous case has `BS`=1024 equal to the `K` and `N` value. The generated code does not have symbolic shapes thus fails to capture the above issue.
By adding a case of `BS`=512, the generated code will have symbolic shape for the M dim and is able to reproduce the issue that this PR is addressing.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136418
Approved by: https://github.com/jgong5
https://github.com/pytorch/pytorch/pull/136087 updated pybind11 to 2.13.6, and that new release has a feature expressed by [a new function](https://pybind11.readthedocs.io/en/latest/changelog.html#version-2-13-6-september-13-2024) `_pybind11_conduit_v1_`. The presence of this function breaks the serialization mechanisms used by Triton and by PyTorch itself.
Possible errors that have been noticed due to this change:
<details>
<summary> the first error </summary>
```bash
_________ KernelTests.test_layout_constraint_needs_fixed_stride_order __________
Traceback (most recent call last):
File "/runner/_work/intel-xpu-backend-for-triton/intel-xpu-backend-for-triton/pytorch/test/inductor/test_triton_kernels.py", line 1072, in test_layout_constraint_needs_fixed_stride_order
eager_out = f(x)
File "/runner/_work/intel-xpu-backend-for-triton/intel-xpu-backend-for-triton/pytorch/test/inductor/test_triton_kernels.py", line 1068, in f
arange_out(x, y)
File "/runner/_work/intel-xpu-backend-for-triton/intel-xpu-backend-for-triton/pytorch/test/inductor/test_triton_kernels.py", line 1059, in arange_out
kernel[grid](x, out, n_elements, BLOCK_SIZE=4)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/triton/runtime/jit.py", line 330, in <lambda>
return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/triton/runtime/jit.py", line 657, in run
kernel = self.compile(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/triton/compiler/compiler.py", line 315, in compile
metadata_group[metadata_filename] = fn_cache_manager.put(json.dumps(metadata, default=vars), metadata_filename,
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/json/__init__.py", line 234, in dumps
return cls(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/json/encoder.py", line 199, in encode
chunks = self.iterencode(o, _one_shot=True)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/json/encoder.py", line 257, in iterencode
return _iterencode(o, 0)
TypeError: vars() argument must have __dict__ attribute
```
</details>
<details>
<summary> the second error </summary>
```bash
________________ TestTritonWrapper.test_wrapper_using_gpu_seed _________________
Traceback (most recent call last):
File "/cache/pytorch-c5e9d03a2da4b93481737594cbe2f5931fa569aa833f206a638189cad2c36d3c-11/test/inductor/test_triton_wrapper.py", line 40, in test_wrapper_using_gpu_seed
out = f(x, y)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/eval_frame.py", line 465, in _fn
return fn(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 1292, in __call__
return self._torchdynamo_orig_callable(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 1087, in __call__
result = self._inner_convert(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 530, in __call__
return _compile(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 933, in _compile
guarded_code = compile_inner(code, one_graph, hooks, transform)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 675, in compile_inner
return _compile_inner(code, one_graph, hooks, transform)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_utils_internal.py", line 87, in wrapper_function
return function(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 708, in _compile_inner
out_code = transform_code_object(code, transform)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/bytecode_transformation.py", line 1322, in transform_code_object
transformations(instructions, code_options)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 220, in _fn
return fn(*args, **kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/convert_frame.py", line 643, in transform
tracer.run()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 2776, in run
super().run()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 979, in run
while self.step():
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 891, in step
self.dispatch_table[inst.opcode](self, inst)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 2967, in RETURN_VALUE
self._return(inst)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/symbolic_convert.py", line 2952, in _return
self.output.compile_subgraph(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1117, in compile_subgraph
self.compile_and_call_fx_graph(tx, list(reversed(stack_values)), root)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1369, in compile_and_call_fx_graph
compiled_fn = self.call_user_compiler(gm)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1416, in call_user_compiler
return self._call_user_compiler(gm)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1465, in _call_user_compiler
raise BackendCompilerFailed(self.compiler_fn, e).with_traceback(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/output_graph.py", line 1446, in _call_user_compiler
compiled_fn = compiler_fn(gm, self.example_inputs())
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/repro/after_dynamo.py", line 130, in __call__
compiled_gm = compiler_fn(gm, example_inputs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/__init__.py", line 2235, in __call__
return compile_fx(model_, inputs_, config_patches=self.config)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 1528, in compile_fx
return aot_autograd(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/backends/common.py", line 72, in __call__
cg = aot_module_simplified(gm, example_inputs, **self.kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 1071, in aot_module_simplified
compiled_fn = dispatch_and_compile()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 1056, in dispatch_and_compile
compiled_fn, _ = create_aot_dispatcher_function(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 522, in create_aot_dispatcher_function
return _create_aot_dispatcher_function(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/aot_autograd.py", line 759, in _create_aot_dispatcher_function
compiled_fn, fw_metadata = compiler_fn(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_functorch/_aot_autograd/jit_compile_runtime_wrappers.py", line 179, in aot_dispatch_base
compiled_fw = compiler(fw_module, updated_flat_args)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 1357, in fw_compiler_base
return _fw_compiler_base(model, example_inputs, is_inference)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 1428, in _fw_compiler_base
return inner_compile(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 479, in compile_fx_inner
return wrap_compiler_debug(_compile_fx_inner, compiler_name="inductor")(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_dynamo/repro/after_aot.py", line 85, in debug_wrapper
inner_compiled_fn = compiler_fn(gm, example_inputs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 665, in _compile_fx_inner
compiled_graph = FxGraphCache.load(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 1341, in load
compiled_graph = compile_fx_fn(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 574, in codegen_and_compile
compiled_graph = fx_codegen_and_compile(gm, example_inputs, **fx_kwargs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/compile_fx.py", line 882, in fx_codegen_and_compile
compiled_fn = graph.compile_to_fn()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/graph.py", line 1952, in compile_to_fn
return self.compile_to_module().call
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/graph.py", line 1878, in compile_to_module
return self._compile_to_module()
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/graph.py", line 1906, in _compile_to_module
mod = PyCodeCache.load_by_key_path(
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 2866, in load_by_key_path
mod = _reload_python_module(key, path)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/runtime/compile_tasks.py", line 45, in _reload_python_module
exec(code, mod.__dict__, mod.__dict__)
File "/tmp/tmps59zkbew/kg/ckgkb4gt5fs5pll4o7fqawppsmdezu5h52cq6nmrvi3yy6j7ddq4.py", line 45, in <module>
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/async_compile.py", line 198, in triton
kernel = TritonCodeCache.load(kernel_name, source_code)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 2916, in load
return _module_to_triton_kernel(PyCodeCache.load(source_code), kernel_name)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 2853, in load
return cls.load_by_key_path(key, path, linemap, attrs)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/codecache.py", line 2866, in load_by_key_path
mod = _reload_python_module(key, path)
File "/opt/hostedtoolcache/Python/3.9.20/x64/lib/python3.9/site-packages/torch/_inductor/runtime/compile_tasks.py", line 39, in _reload_python_module
raise RuntimeError(
torch._dynamo.exc.BackendCompilerFailed: backend='inductor' raised:
RuntimeError: Failed to import /tmp/tmps59zkbew/g3/cg3zgxsidsjhdlz2lzvajvubdq6kg2x2hzd2kznfj43qwvlv33du.py
SyntaxError: invalid syntax (cg3zgxsidsjhdlz2lzvajvubdq6kg2x2hzd2kznfj43qwvlv33du.py, line 14)
```
</details>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136280
Approved by: https://github.com/etaf, https://github.com/jansel, https://github.com/EikanWang
Co-authored-by: Henry Schreiner <HenrySchreinerIII@gmail.com>
Fix the correctness issue of https://github.com/pytorch/ao/pull/884/. The current implementation for converting between `Half/BFloat16` and `int8/uint8` incorrectly assumes that 1/4 of the int8/uint8 vector lane maps to 1/2 of the Half/BFloat16 vector lane. This assumption leads to accuracy issues after the full bit-width vectorization of the Half data type was introduced. When converting between int8 weights and the half data type, the generated code is as follows:
```
#include "/tmp/torchinductor_leslie/xw/cxww3s7wxrujoyxna7mlcjktid2uu6nntixqwm542xfkd756gl3x.h"
extern "C" void kernel(const int8_t* in_ptr0,
half* out_ptr0)
{
{
for(int64_t x0=static_cast<int64_t>(0L); x0<static_cast<int64_t>(2048L); x0+=static_cast<int64_t>(32L))
{
auto tmp0 = at::vec::Vectorized<int8_t>::loadu(in_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(32));
auto tmp1 = at::vec::convert<half>(tmp0);
tmp1.store(out_ptr0 + static_cast<int64_t>(x0), static_cast<int64_t>(32));
}
}
}
```
In this PR, we address the issue by changing the implementation to convert 1/2 of the int8/uint8 vector lane into a full vector lane of Half/BFloat16.
**TestPlan**
* AO: `python test/integration/test_integration.py -k test_int8_weight_only_quant_subclass_api`
* `python -u -m pytest -s -v test/inductor/test_cpu_repro.py -k test_convert_int8_to_half_vec`
* Due to the CPP backend legalization pass, we are unable to create a unit test to simulate the conversion from `Half` to `int8`. Instead, we rely on a C++ test case.
* `./build/bin/vec_test_all_types_AVX512 --gtest_filter="VecConvertTestsReducedFloat/*.ConvertReduced"`
* `./build/bin/vec_test_all_types_AVX2 --gtest_filter="VecConvertTestsReducedFloat/*.ConvertReduced"`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136353
Approved by: https://github.com/jgong5, https://github.com/jerryzh168
Seems like some other tests are holding onto memory that is not gc'able (e.g., cuBLAS workspaces), so these tests, while passing in isolation, fail when run as e.g. `python test/test_cuda.py -k able`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136496
Approved by: https://github.com/ezyang
TL;DR: found that forward activation tensors were being kept alive "forever"
(or until GC ran), and tracked it down to a cycle involving
`stage_backward.<locals>.extract_tensors_with_grads`.
The reference cycle in question is below. (constructed using gc.get_referrers after doing a gc.collect in gc debug mode)
tensor is kept alive by
`[(<class 'cell'>, '0x7f7360234400')]`
tuple of cell objects
`(<cell at 0x7f73602343d0: function object at 0x7f734fff0ee0>, <cell at 0x7f7360234400: list object at 0x7f734e4d9a80>, <cell at 0x7f73602a4190: list object at 0x7f734eff8b00>)`
is kept alive by
`[(<class 'function'>, '0x7f734fff0ee0')]`
`<function stage_backward.<locals>.extract_tensors_with_grads at 0x7f734fff0ee0>`
is kept alive by
`[(<class 'cell'>, '0x7f73602343d0')]`
Put into more plain terms,
```
def stage_backward(...):
    ...
    stage_output_tensors = []
    # a cell object will exist that contains the variables defined in stage_backward
    # and used by both stage_backward and nested functions; in this case, the cell
    # object contains 'stage_output_tensors'.
    # this function object will hold a reference to that 'cell', which contains any
    # vars from the parent scope not explicitly passed into the function as args.
    def extract_tensors_with_grads(...):
        ...
        # extract_tensors_with_grads refers to stage_output_tensors, so
        # stage_output_tensors is in the cell
        stage_output_tensors.append(output_val)
        ...
        # but extract_tensors_with_grads ALSO refers to itself (extract_tensors_with_grads),
        # so `extract_tensors_with_grads` will be in the cell
        extract_tensors_with_grads(...)
```
More debug details:
https://docs.google.com/document/d/1QPH1Lz0tnieIFPM2tyHrjVB-bjlnHuDgjx1p2am3cmE/edit?usp=sharing
In pdb:
```
gc.collect()
g = gc.garbage
g[-1]
[rank0]:(Pdb) [rank0]:<function
stage_backward.<locals>.extract_tensors_with_grads at 0x7fee5c3392d0>
g[-2]
[rank0]:(Pdb) [rank0]:(<cell at 0x7fee7abbcf40: function object at
0x7fee5c3392d0>, <cell at 0x7fee7abbcf70: list object at
0x7fee7ab68940>, <cell at 0x7fee5c3210c0: list object at 0x7fee5e1
d6340>)
g[-3]
[rank0]:(Pdb) [rank0]:[tensor([[[-4.1127e-06, -3.3826e-06, 2.6226e-06,
..., 6.4969e-06,
[rank0]: -4.4405e-06, -4.7684e-06],
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136507
Approved by: https://github.com/awgu, https://github.com/kwen2501
Related: #132695
This PR uses padded dense <-> jagged conversions to handle binary pointwise broadcasting of (NT, T) and (T, NT). This includes:
* `(B, j0, D) + (1, 1, 1)`
* `(B, j0, D) + (B, 1, 1)`
* `(B, j0, D) + (B, 1, D)`
* etc.
This PR also adds (hacky) support for bool inputs to the jagged <-> padded dense conversions. The underlying CUDA kernels do not support integer / bool inputs; so the following workaround is employed: `convert input -> half, run conversion kernel, convert output -> bool`. Note that this bool support is needed specifically for the backward formula of `fmax`, and likely others.
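A usage sketch of the (NT, T) broadcasting this enables (CUDA assumed here, since the underlying padded/jagged conversion kernels are CUDA kernels):
```python
import torch

device = "cuda"
nt = torch.nested.nested_tensor(
    [torch.randn(3, 8), torch.randn(5, 8)], layout=torch.jagged, device=device
)  # shape (B=2, j0, D=8)
bias = torch.randn(2, 1, 1, device=device)  # (B, 1, 1)
out = nt + bias  # handled via a padded dense <-> jagged round trip
```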
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133021
Approved by: https://github.com/cpuhrsch
More or less literal copy-n-paste of c33b0580e6/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu (L24)
and
c33b0580e6/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu (L99)
The missing `uint8` implementation mimics CUDA behavior.
Initial version coded live in https://www.youtube.com/watch?v=shi6Kb5xxvk
Later refinements:
- Switch from 2D dispatch to 1D one (to match CUDA behavior)
- Added batch + channel loops
- Fixed scale computation to match align corners behavior
- Added backward implementation
The backward implementation again mimics CUDA, so it has precision issues for `torch.half`, as well as a somewhat slow simulation of atomic adds using an atomic compare-and-exchange over a pair of adjacent values, i.e.
```metal
template <typename T>
static inline void atomic_add_helper(
    device atomic<int>* data,
    long offset,
    float value) {
  auto ptr = data + (offset >> 1);
  auto old = atomic_load_explicit(ptr, memory_order_relaxed);
  union {
    int i;
    T t[2];
  } val;
  do {
    val.i = old;
    val.t[offset & 1] += static_cast<T>(value);
  } while (!atomic_compare_exchange_weak_explicit(
      ptr, &old, val.i, memory_order_relaxed, memory_order_relaxed));
}
```
Bump basic Metal language version to 3.0, as it's supported on MacOS13 and that's the first version that has `atomic_float`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136123
Approved by: https://github.com/albanD
Summary: Now that we have subprocess parallel compile on by default, we can change the internal compile_threads default to > 1 with a killswitch. Some jankiness so we can avoid evaluating the justknob at import.
Test Plan: Ran codecache tests with JK on, then canaried locally with JK off
Differential Revision: D62913998
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136246
Approved by: https://github.com/eellison
- Set the new tolerances ~= N * eps(bfloat16), which should be a comfortable upper bound, where N is the inner dimension of the matmul.
Logic behind choice of tolerance:
The maximum error of the summation of a series of N numbers in bfloat16 should be `N * epsilon(bfloat16)` , I confirmed by sampling different random seeds that the maximum observed error doesn't exceed this value and is usually much less.
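For reference, the tolerance can be computed directly from the dtype (N below is an illustrative stand-in for the matmul inner dimension):
```python
import torch

N = 1024  # hypothetical inner dimension of the matmul
atol = N * torch.finfo(torch.bfloat16).eps  # eps(bfloat16) == 0.0078125
```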
Fixes test failures on Arm® Neoverse™ V1 ( not raised as an issue as this hardware type is not currently covered by linux-aarch64 workflow )
```
Traceback (most recent call last):
File "/var/lib/jenkins/workspace/test/test_torch.py", line 2478, in test_cdist_large
self.assertEqual(expected, actual)
File "/opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/testing/_internal/common_utils.py", line 3885, in assertEqual
raise error_metas.pop()[0].to_error(
AssertionError: Tensor-likes are not close!
Mismatched elements: 134118 / 1000000 (13.4%)
Greatest absolute difference: 0.03829193115234375 at index (291, 726) (up to 0.005 allowed)
Greatest relative difference: 0.03519868478178978 at index (291, 726) (up to 1.3e-06 allowed)
```
@malfet @jondea
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136315
Approved by: https://github.com/albanD
Summary:
- Added TORCH_LOGS=cache to dump cache stats on exit - supported by RemoteCache.
- Split REMOTE_CACHE_VERSION - it was used for both JKs fx_graph_memcache_version and autotune_memcache_version but they really should be separate (just in case we need to change one but not the other)
- Prepare `_ManifoldCache` for use with other subpath keys
- Move create_cache to be more public and use it in codecache
- Add _InductorMetaTy alias (still just a dict)
- Cleaned up some common cached_autotune calls in triton_heuristics
Test Plan: unit tests
Reviewed By: oulgen
Differential Revision: D62648249
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136456
Approved by: https://github.com/oulgen
Original issue:
https://github.com/pytorch/ao/issues/890
The problem:
TracingContext.flat_params contain the original params, with subclasses not yet desugared,
while the inductor freezing API works on AOT graphs, where subclasses are already desugared.
flat_params are used only for this logic, and storing the desugared subclasses in them fixes the issue.
Testing:
```
python test/functorch/test_aotdispatch.py -k test_inductor_freezing_with_subclasses
```
Torch AO original failure:
```
python test/integration/test_integration.py -k test_int8_weight_only_quant_with_freeze
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136265
Approved by: https://github.com/bdhirsh
## Description
Fixes the accuracy failure of FP32 `jx_nest_base` of max-autotune.
The current epilogue fusion implementation in GEMM template assumes that the read of template buffer and the write of epilogue output in the epilogue node have the same index (the layout could be different but the index should be the same).
If the condition is not satisfied, the computation is wrong, leading to correctness issue for FP32 `jx_nest_base`.
This PR disabled the epilogue fusion with GEMM template when the above condition is not satisfied.
### Unsupported epilogue:
`buf1` is the template buffer and `buf2` is the epilogue output buffer.
The store of `buf2`:
401408 * d0 + 100352 * d1 + **7168 * d2** + **1792 * d3** + 128 * d4 + d5
The load of `buf1` in the epilogue node:
401408 * d0 + 100352 * d1 + **1792 * d2** + **25088 * d3** + 128 * d4 + d5
The above two indexes are different.
```
CppTemplateBuffer(name='buf1', layout=FixedLayout('cpu', torch.float32, size=[25088, 128], stride=[128, 1]))
ComputedBuffer(name='buf2', layout=FixedLayout('cpu', torch.float32, size=[8, 4, 14, 4, 14, 128], stride=[401408, 100352, 7168, 1792, 128, 1]), data=Pointwise(
'cpu',
torch.float32,
def inner_fn(index):
i0, i1, i2, i3, i4, i5 = index
tmp0 = ops.load(arg5_1, i5 + 128 * i4 + 1792 * i2 + 25088 * i3 + 100352 * i1 + 401408 * i0)
tmp1 = ops.load(buf0, i5 + 128 * i4 + 1792 * i2 + 25088 * i3 + 100352 * i1 + 401408 * i0)
tmp2 = tmp0 + tmp1
tmp3 = ops.load(buf1, i5 + 128 * i4 + 1792 * i2 + 25088 * i3 + 100352 * i1 + 401408 * i0)
tmp4 = tmp2 + tmp3
return tmp4
,
ranges=[8, 4, 14, 4, 14, 128],
origin_node=clone,
origins=OrderedSet([clone])
))
```
### Supported epilogue:
`buf1` is the template buffer and `buf2` is the epilogue output buffer.
The store of `buf2`:
d0 + 576 * d1 + 32 * d2
The load of `buf1` in the epilogue node:
d0 + 576 * d1 + 32 * d2
The above two indexes are the same.
The layout of `buf2` and `buf1` are different though which is handled by the reindexer:
`buf1`: `size=[324, 32], stride=[32, 1]`
`buf2`: `size=[1, 32, 18, 18], stride=[10368, 1, 576, 32]`
```
CppTemplateBuffer(name='buf1', layout=FixedLayout('cpu', torch.bfloat16, size=[324, 32], stride=[32, 1]))
ComputedBuffer(name='buf2', layout=FixedLayout('cpu', torch.bfloat16, size=[1, 32, 18, 18], stride=[10368, 1, 576, 32]), data=Pointwise(
'cpu',
torch.bfloat16,
def inner_fn(index):
_, i1, i2, i3 = index
tmp0 = ops.load(buf1, i1 + 32 * i3 + 576 * i2)
tmp1 = ops.to_dtype(tmp0, torch.float32, src_dtype=torch.bfloat16)
tmp2 = ops.load(_frozen_param4, i1)
tmp3 = tmp1 * tmp2
tmp4 = ops.load(arg7_1, i1 + 32 * i3 + 576 * i2)
tmp5 = tmp3 + tmp4
tmp6 = ops.to_dtype(tmp5, torch.bfloat16, src_dtype=torch.float32)
return tmp6
,
ranges=[1, 32, 18, 18],
origin_node=convert_element_type_4,
origins=OrderedSet([add, mul, convert_element_type_4])
))
```
## TODO
Add the support for fusions when the indexes are different in a follow-up PR.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135661
Approved by: https://github.com/leslie-fang-intel, https://github.com/jgong5
At the moment, this lowers torch._scaled_mm with tensorwise scaling and with rowwise scaling for both A and B.
We probably also want to support mixed combinations of tensorwise and rowwise scaling for A and B, as well as bias support.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136337
Approved by: https://github.com/chenyang78
Summary:
Add a customizable loss function callback to NodeAccuracySummary to
allow users to pass in their own loss function.
Also, fix some type errors and propagate better exception messages when
unexpected tensor comparisons occur. Finally, enhance the robustness of
`generate_numeric_debug_handle` in the case where it is called multiple
times on the same model, by avoiding reuse of the same IDs.
Test Plan: Added a test for this case in `test_numeric_debugger`.
Reviewed By: jerryzh168
Differential Revision: D62898297
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136282
Approved by: https://github.com/jerryzh168
This PR changes how `reconstruct` is done for a ConstDict. As of today, it works as follows:
(1) codegen(...) each pair of key/value
(2) create a new dictionary to hold the new items
(3) clear the original dictionary
(4) update the original dict with the one created in (2)
We do a micro-optimization in the generated bytecode (sketched below) to:
- Only codegen the items that changed.
- Only clear the original dictionary if a key was removed.
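A Python-level sketch of the optimized reconstruction (the real change emits bytecode; this just illustrates the two rules above):
```python
def reconstruct_dict(original: dict, traced: dict) -> None:
    # Only clear if a key was removed during tracing.
    if set(original) - set(traced):
        original.clear()
        original.update(traced)
        return
    # Otherwise, only write back the items that changed.
    for key, value in traced.items():
        if key not in original or original[key] is not value:
            original[key] = value
```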
Fixes: #133487
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134876
Approved by: https://github.com/zou3519
1. We want to take option 3 as discussed in https://github.com/pytorch/pytorch/issues/135712, so every time we retry, we first create a new TCPStore server; that way we don't need to append the attempt count as a prefix and we avoid eventual TCPStore sync failures. (This is only for the case where TCPStore sharing is enabled.)
2. We start a new server bound to an ephemeral port (i.e. 0) so it gets assigned to a free port. We then pass that downstream (trainer or c10d). By doing so, TCPStore is managed by the elastic agent rather than having a race condition on binding to a specific port in the trainer.
3. Then the port be broadcasted for dynamic_rendezvous.
Only one more question: what do we do about the store created from `_create_tcp_store` in torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py? Are we OK with creating a duplicate TCPStore server?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135957
Approved by: https://github.com/d4l3k, https://github.com/c-p-i-o
Fixes #93843
`EmbeddingBag()` / `embedding_bag()` support 1D inputs with offsets to handle raggedness. NJT is a natural fit here as it already maintains offsets of the same form. This PR updates the python-side to support NJT and adds corresponding OpInfo-based NJT tests.
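For context, the classic 1D-input-plus-offsets form looks like this; an NJT carries offsets of exactly this shape, which is what the new support builds on:
```python
import torch
import torch.nn as nn

bag = nn.EmbeddingBag(num_embeddings=10, embedding_dim=4, mode="sum")
flat_input = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9])  # ragged values, flattened
offsets = torch.tensor([0, 4])                        # bag boundaries: [0:4], [4:8]
out = bag(flat_input, offsets)                        # shape (2, 4)
```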
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135888
Approved by: https://github.com/cpuhrsch
Summary:
After the previous refactor, we can now call load_with_key directly from AOTAutogradCache to use the remote FXGraphCache.
This does *not* implement a remote AOTAutogradCache. It just allows AOTAutogradCache to work with remote FXGraphCache.
Test Plan: (Meta only tests)
Reviewed By: aorenste
Differential Revision: D62384944
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136173
Approved by: https://github.com/oulgen
- Sometimes having access to the `MixedPrecisionPolicy` in the `fsdp_pre_all_gather` is useful. See [here](https://github.com/pytorch/ao/pull/748/files#r1760375325) in the torchao INT8 mixed precision training PR.
- Sometimes having access to the owning `nn.Module` allows for using it for saving state. See [here](https://github.com/pytorch/pytorch/issues/114299#issuecomment-2298692762) for an example.
The major pain point here is how to deal with backward compatibility. For now, we use `inspect.signature` to check whether the user subclass follows the old vs. new signature. However, for the new signature, the `param_dtype` in the post-all-gather is redundant, since if the user needed it, they could save it from the `mp_policy` now passed into the pre-all-gather.
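A hedged sketch of the signature check (the exact old/new parameter lists are not spelled out here, so the arity test below is illustrative only):
```python
import inspect

def _uses_new_pre_all_gather_signature(fsdp_pre_all_gather) -> bool:
    # Old extension point: fewer parameters; new one also receives e.g. the
    # MixedPrecisionPolicy and owning module (names illustrative, not exact).
    params = inspect.signature(fsdp_pre_all_gather).parameters
    return len(params) > 1
```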
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136129
Approved by: https://github.com/weifengpy
Summary: Create another wrapper codegen class to handle ArrayRef for CPU. The goal is to simplify the regular cpp wrapper codegen logic and the generated cpp code.
Test Plan: CI
Differential Revision: D62961885
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136318
Approved by: https://github.com/frank-wei
Improves and enables a commented out test originally introduced in #131912
In `test_custom_tag_metadata_re_export()`, we check that the "custom" metadata added to given nodes is preserved and not copied to other nodes after re-exporting
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136048
Approved by: https://github.com/zhxchen17
**Summary**
Fix circular import in `torch/distributed/utils.py` found when running an internal test; see D62901023. Curious why this wasn't causing any issue before. Is the relevant code deprecated and no longer used?
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136286
Approved by: https://github.com/Skylion007
Fixes #131337
- add `arg_type` for workspace_arg, the type is consistent with the type in `generate_workspace_allocation()`.
- do not generate example tensors for `workspace`, and use `generate_workspace_allocation()` instead.
- add workspace allocation generation code to `kernel_autotune_calls`. e.g.
```python
workspace = empty_strided_cuda((1280, ), (1, ), torch.uint8)
workspace.zero_()
.....
triton_spl_fused_add_cumprod_0.run(buf2, arg0_1, arg1_1, workspace, 1, 10000, grid=split_scan_grid(1, 10000), stream=stream0)
del buf2, arg0_1, arg1_1, workspace
```
- add `empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda` to the header of triton autotune code.
The generated cpp has lines like the ones below, so we also implement a `zero_()` for `AtenTensorHandle`.
```cpp
static constexpr int64_t int_array_0[] = {1280L, };
static constexpr int64_t int_array_1[] = {1L, };
AtenTensorHandle workspace_handle;
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(1, int_array_0, int_array_1, cached_torch_dtype_uint8, cached_torch_device_type_cuda, 0, &workspace_handle));
RAIIAtenTensorHandle workspace(workspace_handle);
workspace.zero_();
```
- Fix grid_fn handling for grid computation: pass "RBLOCK" to `split_scan_grid`
- Fix dynamic shapes:
Without the fix we generate code that looks like this `workspace = empty_strided_cuda((32*((255 + s0) // 256), ), (1, ), torch.uint8)` when doing triton autotune and `s0` is not defined.
The solution approach is to use `V.graph.sizevars.size_hint(nbytes)` to realize the workspace size for triton autotune. Note that we only realize it for triton autotune code, but not for the cpp cuda code.
- We also generate slightly different cpp code depending on if `abi_compatible` is turned on.
```cpp
RAIIAtenTensorHandle workspace(workspace_handle);
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_zero_(workspace.get()));
```
vs
```cpp
at::Tensor workspace = at::detail::empty_strided_cuda({8L*(c10::div_floor_integer(static_cast<int64_t>((255L + s0)), static_cast<int64_t>(256L))), }, {1L, }, at::kByte, c10::DeviceType::CUDA);
workspace.zero_();
```
Test Plan:
```
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor.py -k GPUTests.test_consecutive_split_cumprod_cuda
python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_consecutive_split_cumprod_cuda_cuda_wrapper
python test/inductor/test_cuda_cpp_wrapper.py DynamicShapesCudaWrapperCudaTests.test_consecutive_split_cumprod_cuda_dynamic_shapes_cuda_wrapper
TORCHINDUCTOR_ABI_COMPATIBLE=1 python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_consecutive_split_cumprod_cuda_cuda_wrapper
TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor.py -k GPUTests.test_consecutive_split_cumprod_cuda
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135552
Approved by: https://github.com/desertfire
Summary:
- Clean up cache test code a bit.
- Removed patch_fbcode() - it turned out to cause flaky issues (imagine if it set fbcode=False and then loaded a module for the first time which had a top-level fbcode check).
Test Plan: unit tests
Reviewed By: oulgen
Differential Revision: D62648248
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136215
Approved by: https://github.com/bobrenjc93
**Motivations**:
A topological order of the scheduler nodes that optimize the liveness of buffers can reduce the peak memory utilization. This has been observed and studied e.g., [here](https://arxiv.org/pdf/1910.02653) and [here](https://proceedings.mlr.press/v202/steiner23a/steiner23a.pdf).
**Solutions**:
1. implement a peak memory estimator via liveness analysis (a rough sketch follows this list)
2. implement a few memory aware topological sorting algorithms and pick the one with the lowest peak memory
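A rough sketch of the liveness-based estimator from item 1 (simplified: each buffer is assumed freed right after its last use, and operator working memory is ignored):
```python
def estimate_peak_memory(nodes, buf_bytes, last_use):
    """nodes: topologically ordered node names
    buf_bytes: bytes of the buffer each node produces
    last_use: for each buffer, the index of the node that uses it last"""
    live, peak = 0, 0
    for i, node in enumerate(nodes):
        live += buf_bytes[node]  # allocate this node's output
        peak = max(peak, live)
        live -= sum(             # free buffers whose last use is at this node
            buf_bytes[b] for b, last in last_use.items() if last == i
        )
    return peak
```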
**Results**:
On some models we can reduce the peak memory significantly:
| model | batch size | peak_memory baseline | peak_memory new | ratio |
|:-----------------------------:|:----------:|:--------------------:|:---------------:|:-----:|
| alexnet | 128 | 1.17 | 0.99 | 1.19 |
| vgg16 | 64 | 4.10 | 3.57 | 1.15 |
| DebertaV2ForQuestionAnswering | 1 | 11.60 | 10.56 | 1.10 |
In the presence of compiler based AC, peak memory can be further reduced:
| model | batch size | peak_memory baseline | peak_memory new | ratio |
|:------------------------------:|:----------:|:--------------------:|:---------------:|:-----:|
| AlbertForMaskedLM | 4 | 6.87 | 6.43 | 1.07 |
| AlbertForQuestionAnswering | 4 | 8.69 | 7.76 | 1.12 |
| MobileBertForQuestionAnswering | 128 | 4.67 | 3.90 | 1.20 |
[Here](https://fb.workplace.com/groups/1075192433118967/posts/1499920537312819/?comment_id=1499938843977655&reply_comment_id=1499951630643043) is an internal use case.
**Other infos:**
* neutral model runtime, because the reordering happens after fusion. So the memory saving is _for free_.
* minimal compile time overhead, as the algorithm is linear in the number of edges of the inductor graph. For all HuggingFace benchmark models, the additional compile time is less than 1 second.
* no peak memory regression, since we only adopt a new order if the peak memory is reduced based on the estimator. The estimator is unaware of operators' working memory, but for large models the working memory should be negligible. We haven't observed any significant regressions in our tests.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134874
Approved by: https://github.com/yf225
Fixes #134848
For BF16/FP16, when a tensor is specified via the `out` parameter of `mean`, the mean kernel should use that tensor's storage for the output, but it doesn't: an `at::to` in the current code allocates storage again, while the `out` tensor's storage never gets updated, so `out` ends up not holding the mean output.
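A minimal illustration of the intended behavior (device omitted; after the fix, `out` holds the result):
```python
import torch

x = torch.randn(4, 8, dtype=torch.bfloat16)
out = torch.empty(4, dtype=torch.bfloat16)
torch.mean(x, dim=1, out=out)  # previously `out` was left untouched for BF16/FP16
```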
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135174
Approved by: https://github.com/soulitzer
Avoid allocating memory or dry-running the submodule during stage init.
Save user-provided input/output metadata during stage init, to allow
lazily initializing the buffers before the first step call.
Later, we plan to build on top of this to add lazy shape inference
(#130856) so that no input/output shapes are required at stage init.
For now, we require input/output tensors for stage init, but these
should be on meta device and stage should not allocate any real memory.
Note: this needs more thorough testing and review, but it worked on the
torchtitan 3d test.
TODO:
- delete 'device' arg from PipelineStage ctor? (move it to inferred from
args tensors passed to first step call? separate PR.
- delete 'output_args' from PipelineStage ctor? we don't actually need
it, but we use it to do shape validation, which is why I didn't remove
it in this PR. Proposal: leave it until we add lazy shape inference?
Fixes #136225, #136226
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136243
Approved by: https://github.com/H-Huang, https://github.com/kwen2501
Summary: Internal profiler behaves differently after turning on triton.autotune_at_compile_time. Needs more investigation but turning it off for this test for now.
Reviewed By: henrylhtsang
Differential Revision: D63035855
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136356
Approved by: https://github.com/henrylhtsang
Summary:
Return from functions instead of using `skipTest`.
This is mostly to make our test report happier.
Skipped tests still show up in our Broken test report.
```
OK (skipped=1)
I0917 16:14:24.749060 1018907 StorageDemandControl.cpp:572] Flushing Demand Control ODS counters
Skipped: Store doesn't support extended APIs
```
Test Plan:
Tested locally.
Test shows up as passed instead of skipped.
```
Cache hits: 99%. Commands: 125048 (cached: 124961, remote: 10, local: 77)
Tests finished: Pass 1. Fail 0. Fatal 0. Skip 0. Build failure 0
```
Differential Revision: D62912065
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136244
Approved by: https://github.com/XilunWu
Original issue:
https://github.com/pytorch/ao/issues/890
The problem:
TracingContext.flat_params contains the original params, with subclasses not desugared,
while the inductor freezing API works on AOT graphs, where subclasses have already been desugared.
flat_params are used only for this logic, and storing the desugared subclasses in them fixes the issue.
Testing:
```
python test/functorch/test_aotdispatch.py -k test_inductor_freezing_with_subclasses
```
Torch AO original failure:
```
python test/integration/test_integration.py -k test_int8_weight_only_quant_with_freeze
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136265
Approved by: https://github.com/bdhirsh
Summary:
Add a customizable loss function callback to NodeAccuracySummary to
allow users to pass in their own loss function.
Also, fix some type errors and propagate better exception messages when
unexpected tensor comparisons occur. Finally, enhance the robustness of
`generate_numeric_debug_handle` in the case where it is called multiple
times on the same model, by avoiding reuse of the same IDs.
Test Plan: Added a test for this case in `test_numeric_debugger`.
Reviewed By: jerryzh168
Differential Revision: D62898297
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136282
Approved by: https://github.com/jerryzh168
Fixes https://github.com/pytorch/pytorch/issues/132331
We need another barrier here to ensure that the main thread doesn't stop the profiler while other threads are still using it (and crash). I can reliably reproduce the issue with `pytest -v test/profiler/test_cpp_thread.py -k test_profile_memory --flake-finder`.
### Testing
`pytest -v test/profiler/test_cpp_thread.py --flake-finder` all passes.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136304
Approved by: https://github.com/briancoutinho
Summary: The change involves passing the expired timers to the `log_debug_info_for_expired_timers` function after `to_json()` has been applied. This change provides a better debugging experience for the user.
Test Plan: unit tests
Reviewed By: gag1jain
Differential Revision: D62408767
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135913
Approved by: https://github.com/gag1jain
Summary:
This logs all operations when tracing log level is enabled for the `TCPStoreLibUvBackend`. This is very useful for debugging collective operations when issues occur as it logs all hosts and the keys that they're modifying. To minimize total data we only log the keys and not the values
This changes the C10D_* macros to be much more efficient -- previously we would always format the log string even if they would never be printed which is very wasteful for detailed tracing. This now gates them with an if statement to achieve the same behavior with no overhead
Test Plan:
```
TORCH_DISTRIBUTED_DEBUG=DETAIL torchrun --nnodes 1 --nproc_per_node 1 --no-python /bin/bash -c "echo foo"
```
```
I0919 09:26:52.352013 34271 TCPStore.cpp:285] [c10d - debug] The server has started on port = 29500.
I0919 09:26:52.352246 34271 socket.cpp:783] [c10d - debug] The client socket will attempt to connect to an IPv6 address of (127.0.0.1, 29500).
I0919 09:26:52.352241 36903 TCPStoreLibUvBackend.cpp:1173] [c10d - debug] Uv main loop running
I0919 09:26:52.352308 34271 socket.cpp:854] [c10d - trace] The client socket is attempting to connect to [localhost]:29500.
I0919 09:26:52.353633 34271 socket.cpp:945] [c10d] The client socket has connected to [localhost]:29500 on SocketImpl(fd=41, addr=[localhost]:45646, remote=[localhost]:29500).
I0919 09:26:52.354422 34271 TCPStore.cpp:321] [c10d - debug] TCP client connected to host 127.0.0.1:29500
I0919 09:26:52.354558 36903 TCPStoreLibUvBackend.cpp:774] [c10d - trace] validate magic:1015412686 address:[localhost]:45646
I0919 09:26:52.354638 36903 TCPStoreLibUvBackend.cpp:789] [c10d - trace] ping nonce:34271 address:[localhost]:45646
I0919 09:26:52.356122 36903 TCPStoreLibUvBackend.cpp:866] [c10d - trace] add key:init/ val:1 address:[localhost]:45646
I0919 09:26:52.356308 36903 TCPStoreLibUvBackend.cpp:930] [c10d - trace] wait key_count:1 address:[localhost]:45646
I0919 09:26:52.356410 36903 TCPStoreLibUvBackend.cpp:846] [c10d - trace] get key:init/ address:[localhost]:45646
I0919 09:26:52.358688 36903 TCPStoreLibUvBackend.cpp:808] [c10d - trace] set key:/none/torchelastic/role_info/0 address:[localhost]:45646
I0919 09:26:52.360177 36903 TCPStoreLibUvBackend.cpp:930] [c10d - trace] wait key_count:1 address:[localhost]:45646
I0919 09:26:52.360296 36903 TCPStoreLibUvBackend.cpp:1004] [c10d - trace] multi_get key_count:1 address:[localhost]:45646
I0919 09:26:52.362076 36903 TCPStoreLibUvBackend.cpp:1036] [c10d - trace] multi_set key_count:1 address:[localhost]:45646
I0919 09:26:52.364001 36903 TCPStoreLibUvBackend.cpp:930] [c10d - trace] wait key_count:1 address:[localhost]:45646
I0919 09:26:52.364091 36903 TCPStoreLibUvBackend.cpp:846] [c10d - trace] get key:/none/torchelastic/assigned_ranks/0 address:[localhost]:45646
```
Differential Revision: D62924454
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136320
Approved by: https://github.com/c-p-i-o, https://github.com/XilunWu
Summary:
Add a third mode where we only print kernel names without dumping any intermediate actual tensor value info.
It can be helpful in quickly identifying the troublesome kernels in CUDA IMA issues.
thanks ColinPeppler and henrylhtsang for this "feature request".
Test Plan:
The output can look like this if set the `AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=3`:
{F1871629091}
Differential Revision: D62791371
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136182
Approved by: https://github.com/henrylhtsang
Summary:
X-link: https://github.com/pytorch/benchmark/pull/2454
This adds structured logging overhead at a per compile basis to compilation metrics.
To do so, we track the frame_id_frame_compile_id that trace_structured uses to categorize compiles, and use that as the key in our timing table.
Implementation notes:
- If there are times we call trace_structured without a compile id, that time won't be measured. There isn't really a good way around that today given the compile id framework of compilation metrics. Strobelight is still the best way to measure on a per-job basis.
- We don't actually measure the time it takes to log the compilation metrics itself. Fundamentally, it's not possible to log this properly if we're storing the logging number *in* compilation metrics, since there's no way to measure it before we do it (unless we want discrepancies between dynamo_compile and tlparse, which seems suboptimal). Hopefully for a large job, the cost of structured_logging compilation metrics itself is small.
- I wanted to use frame_phase_timing here, but there's a bunch of ids to iron out, and I don't really want to deal with that headache. compilation_time_metrics is sort of what I want, but that isn't by frame/compile id, so it's also a bit off. Putting it into torch.logging as a separate thing so logging tracks its own overhead seems fine, though.
Test Plan:
Run benchmarks/nanogpt and staging logger. See that the new compilation metric is logged to the staged dynamo_compile table:
https://fburl.com/scuba/logger_staging_jjwu_30582a48f1ff9cf5f4ac50a4c40af/xazjg5xq
Note that the sum(structured_logging_overhead_s) / sum(entire_frame_compile_time) = 8.387 / 124.278 = 6%, which seems reasonable as the overhead for a small compilation like this.
You can also look at samples for a more detailed log of this.
Reviewed By: oulgen
Differential Revision: D62643611
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136142
Approved by: https://github.com/bobrenjc93
Summary:
To facilitate the PSS-2 upgrade, this uses `ndt.NDArray` instead of `np.ndarray` in type annotations. In Numpy-1.19 (PSS-1) it's an alias to `np.ndarray` -- a noop.
In Numpy-1.24, `ndt.NDArray` is a proper generic type, and without this change uses of `np.ndarray` generate this Pyre type error:
```counterexample
Invalid type parameters [24]: Generic type `np.ndarray` expects 2 type parameters.
```
Test Plan: Sandcastle plus visual inspection
Differential Revision: D62977370
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136288
Approved by: https://github.com/kit1980
When tensor folding occurs during a matmul operation, the returned tensor is a view. This can cause issues when matmul is used inside a custom function and such a view is then returned as output: it cannot be modified in place afterwards, which causes errors.
It can be especially problematic when an in-place allreduce is performed after such a function.
The issue is resolved when unsafe_view is returned from matmul instead. This solution aligns the matmul decomposition with the eager implementation in that a non-view tensor is returned.
Test included in this PR reproduces the issue.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134568
Approved by: https://github.com/zou3519
Fixes #127049
There's already a meta func in `meta_registrations.py` for the `add_` and `sub_` methods. I added a second meta function for error checking, i.e. `int.add/sub_(float)` and `bool.add/sub_(other types)`.
The corresponding test now also passes with Dynamo, so I removed `@xfailIfTorchDynamo`.
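For illustration, a hedged sketch of the kind of invalid in-place combination the new meta function is meant to reject (the error shown is eager's; the meta function is expected to match it):
```
import torch

t = torch.tensor([1, 2, 3])  # int64 tensor
try:
    t.add_(1.5)  # eager raises: result type Float can't be cast to the desired output type Long
except RuntimeError as e:
    print("in-place add rejected:", e)
```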
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135864
Approved by: https://github.com/williamwen42
Changes in this PR:
- Monkey-patching `F.scaled_dot_product_attention` with a lambda seems to not work in some cases. This PR avoids using a lambda.
- Running `fullgraph=True` and `fullgraph=False` in the same unit test seems to cause the two cases to interfere with each other and causes error. This PR splits them into two separate unit tests.
- The checks in the unit tests might not work with compile cache. This PR turns off the cache in order to have a more predictable compile behavior to do unit test on.
Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor_fullgraph_True`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor_fullgraph_False`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor_fullgraph_True`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor_fullgraph_False`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136219
Approved by: https://github.com/yifuwang
Summary:
Quite a few times, we see the NCCL PG abort taking too long. There's no easy way to measure this, so let's add a counter to measure it across the stack.
This will help us measure how much time the NCCL abort takes.
Test Plan:
Unit tests
Reviewed By: c-p-i-o
Differential Revision: D62675010
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136067
Approved by: https://github.com/fduwjj
skip_if_rocm is used only in the multiprocess case (when the UT test class is a child of MultiProcessTestCase), where each individual process can exit with a skip code. If used for a single-process UT, it will cause the UT to fail because the process returns a non-zero exit code; use skipIfRocm in single-process UTs instead.
To avoid the above confusion, this PR renames skip_if_rocm to skip_if_rocm_multiprocess.
Fixes #ISSUE_NUMBER
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136161
Approved by: https://github.com/jithunnair-amd, https://github.com/kwen2501, https://github.com/fegin
Continuation of https://github.com/pytorch/pytorch/pull/131909. This PR makes numpy tests compatible with numpy>=2.0.0. Specifically it deals with APIs that have been removed from numpy-2.0.
Changes in this PR:
1. Use `numpy.exceptions.ComplexWarning` if `numpy.exceptions` namespace is present. In numpy-2.0 `numpy.ComplexWarning` has been removed in favor of using `numpy.exceptions.ComplexWarning` (see [numpy-2.0 migration guide](https://numpy.org/devdocs/numpy_2_0_migration_guide.html#changes-to-namespaces)). Note that `numpy.exceptions` was introduced in numpy-1.25.0 hence does not exist in numpy<=1.24.x.
2. Do the same for `numpy.exceptions.VisibleDeprecationWarning`
3. Use `np.sort(...,axis=0)` over `np.msort()`(`np.msort()` removed in numpy-2.0)
4. Use `np.pad()` over `np.lib.pad()` (`np.lib` removed in numpy-2.0)
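A small compatibility sketch of the pattern used by these changes (relying only on the public numpy APIs listed above):
```
import numpy as np

# numpy.exceptions exists from numpy-1.25 onward; fall back for older numpy.
try:
    from numpy.exceptions import ComplexWarning, VisibleDeprecationWarning
except ImportError:
    ComplexWarning = np.ComplexWarning
    VisibleDeprecationWarning = np.VisibleDeprecationWarning

a = np.array([3, 1, 2])
np.sort(a, axis=0)   # replaces the removed np.msort(a)
np.pad(a, (1, 1))    # replaces the removed np.lib.pad(a, (1, 1))
```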
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136152
Approved by: https://github.com/atalman
Summary:
Remove sleep from the `watchdogHandler` function. This sleep unnecessary slows things down during a NCCL timeout.
Flight recorder is configured to take at most a minute to dump out its buffer.
This sleep ends up waiting for `8` minutes before destroy is called.
Test Plan: Unit tests.
Differential Revision: D62529875
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135760
Approved by: https://github.com/fduwjj, https://github.com/shuqiangzhang
Summary: Currently we process events in the regular allocation path, calling cudaEventQuery to check on the events, and this path can take some locks in the libcuda driver. It's not strictly necessary to process events in the allocation path: we could move this to a background thread, keep processing events regularly, and put the freed blocks on the free list.
Differential Revision: D62396585
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135524
Approved by: https://github.com/zyan0
Summary:
This diff adds an option to round the non-split blocks in caching allocator so that they can be reused without causing lots of fragmentation for large memory segments.
For example, if we specify the max_split memory size as 400MB, then all allocations larger than 400MB will not be split. Say we allocated some 1024MB blocks and these are cached in the allocator. If we request a new 500MB block, we round it to the nearest power-of-2 division, that's 512MB, and add the default kLargeBuffer of 20MB, giving 532MB; since 532MB is less than the existing 1024MB block, the 1024MB block will not be used for this allocation and a new 512MB block will be created instead. In this diff, we make the rounding buffer configurable and expose it as an option (max_non_split_rounding_size), so if 512MB + max_non_split_rounding_size is greater than 1024MB, we will reuse the 1024MB block and won't create a new 512MB block using cudaMalloc. This option is added so that we can pre-allocate some large blocks and reuse them as much as possible, without stalling on cudaMalloc.
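A hedged usage sketch; the exact option name comes from this diff and may differ, so treat `max_non_split_rounding_mb` below as illustrative:
```
import os

# max_split_size_mb is the existing split threshold; the new knob controls how
# far a non-split request may be rounded up to reuse an existing cached block.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
    "max_split_size_mb:400,max_non_split_rounding_mb:512"
)

import torch  # the allocator reads this config when it is first initialized
```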
Differential Revision: D62758758
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136174
Approved by: https://github.com/zyan0
Summary:
# context
* for the root cause and background please refer to this [post](https://fb.workplace.com/groups/1028545332188949/permalink/1042204770823005/)
* basic idea of this diff is to **short circuit the pytree flatten-unflatten function pairs** between two preserved modules, i.e., EBC/fpEBC and KTRegroupAsDict.
NOTE: There could be multiple EBCs and one single KTRegroupAsDict as shown in the [pic](https://fburl.com/gslide/lcyt8eh3) {F1864810545}
* short-circuiting the EBC-KTRegroupAsDict pairs is very special and a must in most cases due to the EBC key-order issue with distributed table lookup.
* hide all the operations behind a control flag `short_circuit_pytree_ebc_regroup` to the torchrec main api call `decapsulate_ir_modules`, which should only be visible to the infra layer, not to the users.
# details
* The `_short_circuit_pytree_ebc_regroup` function finds all the EBC/fpEBC and KTRegroupAsDict modules in an unflattened module, retrieves their fqns, and sorts them into in_fqns (regroup_fqns) and out_fqns (ebc_fqns). Because currently the fpEBC is swapped as a whole, we do some extra fqn logic to filter out the EBCs that belong to an up-level fpEBC.
* a util function `prune_pytree_flatten_unflatten` removes the incoming and outgoing pytree flatten/unflatten function calls in the graph module, based on the given fqns.
WARNING: The flag `short_circuit_pytree_ebc_regroup` should be turned on if EBCs are used and EBC sharding is needed. Assertions are also added if can't find a `KTRegroupAsDict` module, or `finalize_interpreter_modules` is not `True`.
# additional changes
* absorb the `finalize_interpreter_modules` process inside the torchrec main api `decapsulate_ir_modules`.
* set `graph.owning_module` in export.unflatten as required by the graph modification
* add one more layer of `sparse_module` for closely mimicking the APF model structure.
Test Plan:
# run test
* serializer
```
buck2 run fbcode//mode/opt fbcode//torchrec/ir/tests:test_serializer
```
* apf
```
buck2 run fbcode//mode/opt fbcode//aps_models/ads/gmp/tests/ne/e2e_deterministic_tests:gmp_e2e_ne_tests -- --filter-text 'test_mtml_instagram_model_562438350_single_gpu_with_ir'
```
* local mp run
```
==== Finished E2E deterministic test for mtml_instagram_model_gmp_474023725_non_kjt_unary ====
finished
test_mtml_instagram_model_562438350_single_gpu_with_ir
Imports took: 6.0s! Profile with --import-profiler. --_ |""---__
Executed 1 example in 203.1s: |'.| || . """|
Successful: 1 | || || /|\""-. |
Failed: 0 | || || | | |
Skipped: 0 | || || | \|/ |
Not executed: 8 |."| || --"" '__|
https://testslide.readthedocs.io/ --" |__---"""
```
Differential Revision: D62606738
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136045
Approved by: https://github.com/angelayi
Currently when we deserialize inputs to nodes, we deserialize arguments with default values as kwargs. So deserializing `aten.uniform`, which has the signature `uniform(Tensor(a!) self, float from=0, float to=1, *, Generator? generator=None) -> Tensor(a!)`, becomes `uniform(x, from=0, to=1)`. However, this fails when running in Python because `from` is a Python keyword. The solution here is to not deserialize it as a kwarg.
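A small illustration of why the kwarg form cannot be emitted (the exact overload name used below is an assumption for the sketch):
```
import torch

x = torch.empty(3)

# `from` is a reserved keyword, so generated code like
#     torch.ops.aten.uniform.default(x, from=0.0, to=1.0)
# is a SyntaxError. Passing the defaults positionally avoids the problem:
torch.ops.aten.uniform.default(x, 0.0, 1.0)
```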
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136036
Approved by: https://github.com/zhxchen17
`rms_norm()` is a nice-to-have for ViT :)
This PR:
* SymInt-ifies `rms_norm()`, allowing NJT to use the same decomp.
* Adds torch_function-based input validation logic for nested-specific stuff (no normalization supported over the ragged dim for now) on the python NJT side.
* Adds multi-dim support (on non-ragged, non-batch dims) to `mean()` for NJT.
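A usage sketch, assuming the jagged layout and normalization over the last, non-ragged dim:
```
import torch
import torch.nn.functional as F

nt = torch.nested.nested_tensor(
    [torch.randn(2, 8), torch.randn(3, 8)], layout=torch.jagged
)
out = F.rms_norm(nt, normalized_shape=(8,))  # normalizing over the ragged dim is unsupported
```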
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135872
Approved by: https://github.com/mikaylagawarecki
ghstack dependencies: #125947
The previous implementation of the `numpy()` method returns `fp64` when the tensor is `fp32`. This is unexpected and seems to be caused by calling `__array__(dtype=None)` on the numpy array. I updated the implementation to implement the `numpy()` method explicitly and added tests to guard the behavior.
This needs to be cherry-picked into torch 2.5
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136162
Approved by: https://github.com/gramalingam, https://github.com/xadupre
When stub files (`*.pyi`) were removed from `optim` (#125556, #125452), some types that existed are no longer available. This pull request adds them back.
Just for reference, these types are used in `pytorch-lightning`'s `LightningCLI`. Command line interfaces are created automatically, and having type hints makes them nicer.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136185
Approved by: https://github.com/janeyx99
## Motivation
The device for tensor.device, both for sharded as well as non-sharded tensors, is set to cuda by default. Hence while running the FSDP UTs we see the following errors. This change derives the actual device type from the created tensor.
```
[rank3] File "/root/repos/pytorch-training-tests/tests/pytorch/v2.4.0/distributed_hpu/fsdp/test_fsdp_dtensor_state_dict.py", line 143, in test_dtensor_sharded_tensor_state_dict_identical
[rank3] sharded_tensor_sd = ref_model.state_dict()
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1944, in state_dict
[rank3] hook_result = hook(self, destination, prefix, local_metadata)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context
[rank3] return func(*args, **kwargs)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/fsdp/_state_dict_utils.py", line 752, in _post_state_dict_hook
[rank3] tensor.device,
[rank3] File "/usr/local/lib/python3.10/dist-packages/typing_extensions.py", line 2853, in wrapper
[rank3] return arg(*args, **kwargs)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/_shard/sharded_tensor/api.py", line 1152, in __torch_function__
[rank3] return dispatch(st_instance, func)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/_shard/sharded_tensor/api.py", line 1134, in dispatch
[rank3] return _SHARDED_OPS[func](types, args, kwargs, st._process_group)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/_shard/op_registry_utils.py", line 33, in wrapper
[rank3] return wrapped_func(types, args, kwargs, process_group)
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py", line 52, in tensor_device
[rank3] dev = torch.device(torch.cuda.current_device())
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py", line 878, in current_device
[rank3] _lazy_init()
[rank3] File "/usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py", line 305, in _lazy_init
[rank3] raise AssertionError("Torch not compiled with CUDA enabled")
[rank3] AssertionError: Torch not compiled with CUDA enabled
```
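A minimal sketch of the generalization described above (function and variable names are illustrative, not the actual code):
```
import torch

def infer_device(local_tensor: torch.Tensor) -> torch.device:
    # Derive the device from the tensor that was actually created instead of
    # hard-coding torch.cuda.current_device().
    dev = local_tensor.device
    if dev.type == "cuda" and dev.index is None:
        dev = torch.device("cuda", torch.cuda.current_device())
    return dev

print(infer_device(torch.empty(2)))  # cpu when CUDA is unavailable
```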
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134994
Approved by: https://github.com/fegin
Fixes https://github.com/pytorch/pytorch/issues/136064
In the linked repro, this issue was that there was some code like this:
```
# x has dtype torch.float32
def f(x):
y = x.view(torch.float32)
y.copy_(...)
```
Where because `view.dtype` is implemented today to potentially directly return its input, we would end up directly clobbering the proxy for our graph input (replacing its FX proxy value from `arg0_1` to `view_1`). This is not desirable, because we have careful assertions in AOTDispatcher that mutations only ever happen on graph inputs - but this clobbering caused the mutation to appear, from the perspective of the FX graph, like it was happening on a view of the input.
Why is this normally not a problem? Ordinarily, the `ADInplaceOrView` kernel for `view.dtype` will take the output of the view kernel, [and detach() it](https://github.com/pytorch/pytorch/blob/main/tools/autograd/gen_inplace_or_view_type.py#L466) (properly creating a fresh `TensorImpl`).
This does **not** happen, though, if you are executing the kernel from within a `__torch_dispatch__` region: the `ADInplaceOrView` logic has already run above you, so that key will be in the TLS exclude set.
This PR changes eager behavior - at first I considered trying to only change behavior under compile. But this problem isn't technically specific to PT2: if you ever rely on tensor identity from inside of a __torch_dispatch__ call, then we need to make sure the raw `view.dtype` kernel doesn't directly return the input.
I am also making the assumption that "`view.dtype` no-op'ing when the dtype is the same" is not a case worth optimizing in eager mode, and that the overhead of the `TensorImpl` creation is relatively negligible.
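A small illustration of the intended post-fix behavior (a sketch, not the kernel change itself):
```
import torch

x = torch.randn(4)             # already float32
y = x.view(torch.float32)      # same-dtype view

# After this change, `y` is a fresh TensorImpl aliasing the same storage, so
# identity-based bookkeeping (e.g. proxies under __torch_dispatch__) is not
# clobbered, while the data is still shared.
print(y.data_ptr() == x.data_ptr())  # True: same storage
```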
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136074
Approved by: https://github.com/Skylion007, https://github.com/ezyang, https://github.com/albanD
ghstack dependencies: #136041
As in the title.
Tackles https://github.com/pytorch/ao/pull/821/files#r1759821413
The PR assumes that the existing tuning parameters are good also when using scaling arguments. This needs to be verified as a follow-up task.
Also, this PR redefines triton-contiguous tensors: the tensor must have strides not larger than 1. This will now allow zero strides that previously triggered `contiguous` call although the underlying memory buffer was contiguous.
Re: "a considerable slow-down occurs because tensor data is copied element-wise rather than chunk-wise" - this note should refer to a code (torch or triton?) that implements the element/chunk-wise copy so that we could verify that allowing zero strides indeed would not trigger element-wise copies. Atm, the performance increase in ViT-H benchmarks (that involve using 0 strides) is an evidence that allowing zero strides does not lead to slow-downs.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136104
Approved by: https://github.com/cpuhrsch
We introduced the dispatchable backend for a ProcessGroup and collective in https://github.com/pytorch/pytorch/issues/86225. This PR is a follow-up to clean up the options of a ProcessGroup and ask users to either set the timeout or backend later on, or directly create the backend after creating a PG.
Also, PGNCCL is using the Options class from ProcessGroup, but we actually should use Options from the Backend class. So this PR aligns the type and name with what we do on the cpp side. I don't change the signature of the public API, so it still uses args named "pg_options".
We need to make changes to the tests to align them with this change.
This is an attempt to reland D62008954 by fixing internal errors.
Differential Revision: [D62483294](https://our.internmc.facebook.com/intern/diff/D62483294/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135653
Approved by: https://github.com/wz337, https://github.com/H-Huang
Summary:
We refactor FxGraphCache.load into three phases:
- prepare_key, which checks that an inductor input is cacheable and bypasses otherwise
- load_with_key, which tries to lookup the key in the cache
- post compile, where we do some logging and run post compile steps
Splitting it along these lines will allow AOTAutogradCache to use load_with_key and still get access to all of the observability + remote cache logic when accessing FxGraphCache, without needing to pass key components, etc.
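A rough sketch of the resulting control flow; the phase names follow the description above, while the arguments and bodies are illustrative placeholders, not the real implementation:
```
def prepare_key(gm, example_inputs, fx_kwargs):
    # Returns None to bypass the cache if the input isn't cacheable.
    return None if fx_kwargs.get("uncacheable") else ("key", ["debug line"])

def load_with_key(key, example_inputs, local=True, remote=False):
    return None  # cache miss in this sketch

def post_compile(compiled_graph, example_inputs):
    return compiled_graph  # logging and post-compile steps would go here

def load(gm, example_inputs, fx_kwargs):
    prepared = prepare_key(gm, example_inputs, fx_kwargs)
    if prepared is None:          # bypass: not cacheable
        return None
    key, debug_lines = prepared
    hit = load_with_key(key, example_inputs)
    return post_compile(hit, example_inputs) if hit is not None else None
```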
Differential Revision: D62314862
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135491
Approved by: https://github.com/oulgen
Fixes #136090
* Add support for isin to tensor half dtypes for CPU (just add a few extra dispatches).
* Seems like the CUDA implementation for bfloat16 was mostly compiled and available all along (it just calls sort internally AND unique). To enable it, we just need to remove an assert to access it (since sort's functionality was updated since the assert was added) and add missing dtype support to unique.
* This unlocks more GPU functionality with minimal code bloat. I also added CPU kernels for the dtypes for parity.
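A small usage example of the newly dispatched dtypes (fp16 on CPU shown; bf16 works similarly):
```
import torch

elements = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float16)
test_values = torch.tensor([2.0, 4.0], dtype=torch.float16)
print(torch.isin(elements, test_values))  # tensor([False,  True, False])
```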
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136114
Approved by: https://github.com/malfet
By default inductor promotes arguments to the common highest dtype.
Having an empty token with dtype=torch.float32 results in dtype promotion for effectful ops during lowering of with_effects.
Disabling dtype promotion for this lowering.
Removing previous workaround making token dtype torch.bool.
Testing:
```
python test/distributed/test_c10d_functional_native.py -k test_inductor_dtypeview_memory_lea
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136039
Approved by: https://github.com/bdhirsh, https://github.com/eellison, https://github.com/zou3519
Summary: This implements a default backend proxy that tries to look up a backend via dlsym. What this enables is dynamically loading a module with a backend implementation without having it statically linked with the application.
Differential Revision: D62549295
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135967
Approved by: https://github.com/c-p-i-o
In this PR, we deprecate the _preserve_ops feature in the run_decompositions API. We can't kill this API completely because the Executorch team depends on it. As syncing between the two repos is non-trivial, I just leave this argument as deprecated for now; in the next PR, I will remove it.
After this PR, run_decompositions will only decompose what's inside the decomp table and preserve the rest by default. Note that this feature is only rolled out to OSS for now; the old code path is protected under the IS_FBCODE flag.
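A hedged sketch of the new default behavior (the module and table contents are illustrative):
```
import torch
from torch.export import export

class M(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.layer_norm(x, [4])

ep = export(M(), (torch.randn(2, 4),))
# Only ops present in the provided decomp table are decomposed; everything
# else is preserved by default, so an empty table preserves all ops.
ep = ep.run_decompositions(decomp_table={})
```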
Differential Revision: [D62163161](https://our.internmc.facebook.com/intern/diff/D62163161/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135080
Approved by: https://github.com/justinchuby, https://github.com/avikchaudhuri, https://github.com/bdhirsh
> Ignore FSDP2 forward hook side-effects in AC
Under AC, FSDP2 does not rely on forward hook to all-gather weights to do recomputation, instead it relies on pre-backward hook to do this job:
451eaf0ff2/torch/distributed/_composable/fsdp/_fsdp_state.py (L219-L220)
So when we use `speculate_subgraph` to trace the utils.checkpoint AC region, we don't actually need to worry about the FSDP2 forward hook's side effects and can safely ignore them, because we do not (and don't expect to) re-run the FSDP2 forward hook during backward recomputation.
----
Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134997
Approved by: https://github.com/zou3519
ghstack dependencies: #135727
Running Torchbench llama with dynamic size failed with
```
File "/localdisk/leslie/torch_inductor_community/pytorch/torch/fx/experimental/symbolic_shapes.py", line 4182, in produce_guards
raise ConstraintViolationError(
torch.fx.experimental.symbolic_shapes.ConstraintViolationError: Constraints violated (L['inputs'][0].size()[0])! For more information, run with TORCH_LOGS="+dynamic".
- Not all values of RelaxedUnspecConstraint(L['inputs'][0].size()[0]) are valid because L['inputs'][0].size()[0] was inferred to be a constant (32).
```
Skip this model for marking dynamic dim.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135960
Approved by: https://github.com/ezyang
PR changes how `reconstruct` is done for a ConstDict. As of today, it works as follow:
(1) codegen(...) each pair of key/value
(2) create a new dictionary to hold the new items
(3) clear the original dictionary
(4) update the original dict with the one created in (2)
We do a micro optimization in the generated bytecode to:
- Only codegen the items that changed.
- Only clear the original dictionary if a key was removed.
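A pseudo-Python sketch of the optimized reconstruction logic described above (the real implementation emits bytecode; names here are illustrative):
```
def reconstruct_dict(original, traced):
    # Only "codegen" the items that changed.
    changed = {k: v for k, v in traced.items()
               if k not in original or original[k] is not v}
    # Only clear the original dict if a key was removed.
    if original.keys() - traced.keys():
        original.clear()
        original.update(traced)
    else:
        original.update(changed)
    return original

d = {"a": 1, "b": 2}
print(reconstruct_dict(d, {"a": 1, "b": 3}))  # {'a': 1, 'b': 3}, no clear needed
```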
Fixes: #133487
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134876
Approved by: https://github.com/zou3519
Split out and modified from https://github.com/pytorch/pytorch/pull/130228. There were a bunch of subtle bugs, e.g. sometimes we need to use torch.ops.aten.{operator}.Tensor vs other times using torch.ops.aten.{operator}.default, or in the case of pow we need to use Tensor_Tensor. I figured it'd be easier to split out adding TensorReferenceAnalysis, add some tests, and do the actual integration in a separate diff.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135886
Approved by: https://github.com/ezyang
Updates the pybind11 submodule. The major patch note is an experimental new function added to all pybind11 objects, `cpp_conduit`, which makes them more compatible across pybind11 versions, settings, and frameworks (such as nanobind). No code changes are needed on our end except updating the submodule.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/136087
Approved by: https://github.com/malfet
This PR implements tracing of with contexts with TorchFunction modes which have the default enter/exit behavior (ie pushing/popping the mode)
Typically the bytecode for a context manager looks like this during a graph break:
1. graph call
2. enter context
3. unsupported code
4. exit context
5. resume call
resume fn structure:
1. enter context
2. jump
...
3. exit context
The issue with torch function modes is that side effects will replay any mutations to the torch function stack performed during tracing. So, we do not need to enter and exit around the unsupported code in the original function (doing so would result in a duplicate torch function mode entry during execution of the unsupported code), and we don't need to enter again in the resume function (the mode that was pushed from the side effects bytecode would still be on the stack).
So for torch function modes the structure of our output code is this:
1. graph call
2. mutate tf mode stack to replay mutations
4. unsupported code
5. on exception restore stack
6. resume function
Then our resume fn looks like this:
1. no-op enter torch function mode
2. jump
3. exit tf mode
To implement the no-op enter of the torch function mode I added torch function mode in polyfill which no-op enters, but normally exits. This is needed because we still want to trace the with context in the resume function, and exit properly (the exit instructions will still be in the function, so we need to generate instructions to set up the context).
Separately from the bytecode, dynamo also tracks contexts on the block stack, which is how the SETUP_* instructions are implemented. Naturally at a graph break, we exit these block stacks to properly reset the contexts entirely, so that we can re-enter around the unsupported code soundly. However once again, in the torch function mode case, in the event of a graph we do not want to perform any exit side effects because we want to preserve the state of the mode stack as is so that we will properly update the stack with bytecode mentioned in the first section. If we exited here, dynamo would pop the mode off of the symbolic stack, and not update the true python torch function mode stack with the suffix bytecode. All in all, for torch function modes we enter exactly once, update the global torch function mode stack with side effects bytecode, re-read this stack when compiling the resume function, and exit exactly once in the resume function. This matches the semantics of eager exactly.
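A hedged sketch of the polyfill idea described above (not the actual dynamo polyfill): a torch function mode whose enter is a no-op, because the mode has already been pushed by the side-effect bytecode, while exit pops normally:
```
from torch.overrides import TorchFunctionMode

class NoEnterTorchFunctionMode(TorchFunctionMode):
    def __enter__(self):
        # no-op: the mode is assumed to already be on the torch function stack
        return self
    # __exit__ is inherited from TorchFunctionMode and pops the mode as usual
```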
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135422
Approved by: https://github.com/williamwen42
ghstack dependencies: #134732, #133137, #135443, #135444
In preparation for tracing through DeviceContext (defb515306/torch/utils/_device.py (L66))
This PR adds support for calling the setattr of thread local objects. These objects have a slots impl, and since this doesn't appear to have any side effects, we call this setattr impl when replaying mutations, since calling `object.__setattr__` on these objects results in a type error.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135443
Approved by: https://github.com/anijain2305
ghstack dependencies: #134732, #133137
For tracing cond/while in eager, we trace the HOP with the eager backend with metadata torchfunction mode enabled. HOPs disallow the mutation that occurs in this torch function mode, so it is not able to be traced. As a result, we use a custom backend which enters this mode for tracing these HOPs. Thanks to @ydwu4 for the help with implementing this
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134732
Approved by: https://github.com/ydwu4
All of the previous benchmarks are similar; ListOfLinears should be representative enough.
I originally copied the previous benchmarks from unit tests without any particular intention; I was just
trying to create a large number of benchmarks to better observe noise.
This PR keeps only one; we can add more as we see value and regressions in the future.
This diff also adds a GPU version.
```
collecting compile time instruction count for basic_modules_ListOfLinears_eager
compile time instruction count for iteration 0 is 6479525851
compile time instruction count for iteration 1 is 1024432680
compile time instruction count for iteration 2 is 1019417317
compile time instruction count for iteration 3 is 1013603566
compile time instruction count for iteration 4 is 1008853980
compile time instruction count for iteration 5 is 1009541481
compile time instruction count for iteration 6 is 1005025533
compile time instruction count for iteration 7 is 1004116323
compile time instruction count for iteration 8 is 1000828633
compile time instruction count for iteration 9 is 999788323
collecting compile time instruction count for basic_modules_ListOfLinears_inductor
compile time instruction count for iteration 0 is 40837529730
compile time instruction count for iteration 1 is 18411921909
compile time instruction count for iteration 2 is 18383665161
compile time instruction count for iteration 3 is 18348983522
compile time instruction count for iteration 4 is 18349276590
compile time instruction count for iteration 5 is 18353046274
compile time instruction count for iteration 6 is 18346818581
compile time instruction count for iteration 7 is 18340057998
compile time instruction count for iteration 8 is 18331267320
compile time instruction count for iteration 9 is 18328381338
collecting compile time instruction count for basic_modules_ListOfLinears_inductor_gpu
compile time instruction count for iteration 0 is 15408870979
compile time instruction count for iteration 1 is 10949520859
compile time instruction count for iteration 2 is 11058786167
compile time instruction count for iteration 3 is 11003606719
compile time instruction count for iteration 4 is 10896406770
compile time instruction count for iteration 5 is 10982875189
compile time instruction count for iteration 6 is 10931848275
compile time instruction count for iteration 7 is 10956345008
compile time instruction count for iteration 8 is 11045384499
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135730
Approved by: https://github.com/ezyang, https://github.com/anijain2305
Summary:
Move towards consolidating strobelight profiler implementations between OSS and fbcode. This change is a first step towards that.
- Created a new function to abstract out compile time profiling enablement. This function allows profiler to switch between different function profilers (e.g. Thrift based or CLI based)
- Both OSS and Fbcode now use one compile time profiler in torch/_strobelight
Test Plan:
Tested OSS with following commands:
```
python torch/_strobelight/examples/compile_time_profile_example.py
python torch/_strobelight/examples/cli_function_profiler_example.py
TORCH_COMPILE_STROBELIGHT=TRUE TORCHINDUCTOR_FORCE_DISABLE_CACHES=1 python benchmarks/dynamo/huggingface.py --ci --accuracy --timing --explain --inductor --device cuda --training --amp --only XLNetLMHeadModel
```
See test commands for fbcode in comments.
Differential Revision: D62444551
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135953
Approved by: https://github.com/laithsakka
If a node is an AC region output and has a backward hook on it, we intentionally choose to save it.
This is to work around circular dependencies in Traceable FSDP2+AC.
Example:
```
out = fully_shard(utils.checkpoint(module))(x)
norm_out = layer_norm(out)
```
and there is a circular dependency:
1. In backward, grad_input of layer_norm aka. `out_grad` is actually dependent on `out`.
2. `out` depends on `out`'s backward hook created by FSDP2 (which does all-gather for `module` weights) in order to be recomputed.
3. `out`'s FSDP2 backward hook, as is the case for all eager backward hooks, depends on `out_grad` -> circular dependency with (1)!
Solution: check whether `out` has a backward hook, and if so, intentionally save `out` in forward graph outputs. With this, we can break the above circular dependency.
----
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135727
Approved by: https://github.com/Chillee
During enablement of Traceable FSDP2 on internal models, sometimes the user only applies torch.compile to some of the FSDP2 instances but not all of them. Such a mixed usage pattern is not supported by compiled autograd. Here we try to catch such usage and throw an error, so that the user can fix it.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135824
Approved by: https://github.com/awgu
When we measure the compile time instruction count, in most cases we probably do not want to measure gc instructions, so gc is disabled here by default.
If it is needed, we can add an option to allow it, or one can use the regular total instruction count instead of the compile time instruction count.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135768
Approved by: https://github.com/ezyang, https://github.com/anijain2305
https://github.com/pytorch/pytorch/pull/133012 caused a regression on ROCm causing pointwise scan tests to fail
```
ERROR: test_pointwise_associative_scan_tuple_reverse_True_combine_mode_pointwise_cuda
ERROR: test_pointwise_associative_scan_tuple_reverse_False_combine_mode_pointwise_cuda
ERROR: test_pointwise_associative_scan_complex_pytree_reverse_True_combine_mode_pointwise_cuda
ERROR: test_pointwise_associative_scan_complex_pytree_reverse_False_combine_mode_pointwise_cuda
ERROR: test_pointwise_associative_scan_binary_operator_reverse_True_combine_mode_pointwise_cuda
ERROR: test_pointwise_associative_scan_binary_operator_reverse_False_combine_mode_pointwise_cuda
```
Skipping temporarily while triage is underway.
Full log: https://ossci-raw-job-status.s3.amazonaws.com/log/30067645445
```
File "/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/_inductor/graph.py", line 1020, in call_function
out = lowerings[target](*args, **kwargs) # type: ignore[index]
File "/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/_inductor/lowering.py", line 363, in wrapped
out = decomp_fn(*args, **kwargs)
File "/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/_inductor/lowering.py", line 6245, in associative_scan
raise RuntimeError("Unable to generate code for associative_scan op")
torch._inductor.exc.LoweringException: RuntimeError: Unable to generate code for associative_scan op
```
NOTE: even "eager" backend fails
```
File "/opt/conda/envs/py_3.8/lib/python3.8/site-packages/torch/_higher_order_ops/associative_scan.py", line 338, in associative_scan_op_dense
raise NotImplementedError("associative_scan is not implemented for eager")
NotImplementedError: associative_scan is not implemented for eager
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135995
Approved by: https://github.com/malfet
This PR solves two problems with `sum()` support in NJT:
* `sum()` over a dim with `keepdim=True` returns the wrong shape (i.e. it'll keep the wrong dim). This is a long-standing bug from way back in #112519.
* Historically, we've only supported `sum()` over a dim and not a full reduction. This PR adds the full reduction form (forward only, backward still fails).
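A usage sketch with the jagged layout (shapes are illustrative):
```
import torch

nt = torch.nested.nested_tensor(
    [torch.randn(2, 4), torch.randn(3, 4)], layout=torch.jagged
)
total = nt.sum()                        # full reduction (forward only)
per_row = nt.sum(dim=-1, keepdim=True)  # keepdim now keeps the correct dim
```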
Pull Request resolved: https://github.com/pytorch/pytorch/pull/131945
Approved by: https://github.com/davidberard98, https://github.com/jananisriram
Summary:
Previously we only checked dtype and is_dynamic to decide if two quantization specs are equivalent.
This may not work in some cases, e.g. when people use a different qscheme or quant_min/quant_max.
This PR adds checks for the other fields as well.
Test Plan:
regression tests
Differential Revision: [D62530974](https://our.internmc.facebook.com/intern/diff/D62530974)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135736
Approved by: https://github.com/sxu
There was recently some strange noise (+5%, -5%).
Using only compile time:
1) avoids gc time.
2) avoids other operations that are not what we try to measure here. ==> less probable noise.
```
collecting compile time instruction count for sum_floordiv_regression
compile time instruction count for iteration 0 is 8899290248
compile time instruction count for iteration 1 is 1188830489
compile time instruction count for iteration 2 is 1180579615
compile time instruction count for iteration 3 is 1176263131
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135785
Approved by: https://github.com/avikchaudhuri, https://github.com/anijain2305
I am thinking maybe 3 iterations are enough for this one?
- I am keeping eager and inductor, since inductor is 2X eager time.
- Eager dynamic is 2X eager, so keeping this as well.
- Inductor has three tests (dynamic gpu, gpu and cpu).
I am unsure if I am over-profiling here; happy to trim if anyone has suggestions.
```
collecting compile time instruction count for add_loop_eager
compile time instruction count for iteration 0 is 8213664211
compile time instruction count for iteration 1 is 2798628246
compile time instruction count for iteration 2 is 2796811362
compile time instruction count for iteration 3 is 2794438188
compile time instruction count for iteration 4 is 2794634117
collecting compile time instruction count for add_loop_eager_dynamic
compile time instruction count for iteration 0 is 5724108021
compile time instruction count for iteration 1 is 5499908609
compile time instruction count for iteration 2 is 5569101366
compile time instruction count for iteration 3 is 5493806364
compile time instruction count for iteration 4 is 5493169851
collecting compile time instruction count for add_loop_inductor
compile time instruction count for iteration 0 is 49789381222
compile time instruction count for iteration 1 is 25769347393
compile time instruction count for iteration 2 is 25772594322
compile time instruction count for iteration 3 is 25768695952
compile time instruction count for iteration 4 is 25768032314
collecting compile time instruction count for add_loop_inductor_gpu
compile time instruction count for iteration 0 is 23966942581
compile time instruction count for iteration 1 is 23771950919
compile time instruction count for iteration 2 is 23770784286
compile time instruction count for iteration 3 is 23780160875
compile time instruction count for iteration 4 is 23774634465
collecting compile time instruction count for add_loop_inductor_dynamic_gpu
compile time instruction count for iteration 0 is 41505055086
compile time instruction count for iteration 1 is 41293654089
compile time instruction count for iteration 2 is 41301016100
compile time instruction count for iteration 3 is 41306056207
compile time instruction count for iteration 4 is 41308171566
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135809
Approved by: https://github.com/ezyang, https://github.com/anijain2305
Summary:
Fixes https://github.com/pytorch/pytorch/issues/134778
The previous D62304294 broke some executorch tests. It has already been reverted.
In this diff, `_collect_param_buffer_metadata()` is modified so that when a `call_function` node is encountered and its input nodes include `get_attr`, we skip the fields that have been collected previously and only collect the rest of the fields. This prevents over-writing.
Test Plan:
```
buck2 test 'fbcode//mode/dev-nosan' fbcode//executorch/backends/xnnpack/test:test_xnnpack_ops
buck2 test 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_re_export_preserve_handle
buck2 test 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r test_run_decompositions_preserve_handle
```
Differential Revision: D62514208
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135720
Approved by: https://github.com/zhxchen17, https://github.com/jerryzh168
Fixes #134564
Root cause:
The `lintrunner` wheel released on [pypi.org](https://pypi.org/project/lintrunner/#files) only supports Windows 32bit and Linux 64bit. Since compilation of pytorch requires a 64bit env, on Windows `lintrunner` has to be compiled from the source distribution. `Rust` is its dependency for compilation, as indicated in the error message. Meanwhile, a Visual Studio environment is needed for linking libraries.

Issue when performing `pip install lintrunner` without a Visual Studio environment activated is shown below.
```bash
>python -m pip install lintrunner
Collecting lintrunner
Downloading lintrunner-0.12.5.tar.gz (62 kB)
Installing build dependencies ... done
Getting requirements to build wheel ... done
Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: lintrunner
Building wheel for lintrunner (pyproject.toml) ... error
error: subprocess-exited-with-error
× Building wheel for lintrunner (pyproject.toml) did not run successfully.
│ exit code: 1
╰─> [137 lines of output]
Running `maturin pep517 build-wheel -i C:\Users\\miniforge3\envs\py310\python.exe --compatibility off`
📡 Using build options bindings from pyproject.toml
Compiling proc-macro2 v1.0.79
Compiling unicode-ident v1.0.12
Compiling version_check v0.9.4
Compiling windows_x86_64_msvc v0.52.4
Compiling winapi v0.3.9
Compiling serde v1.0.197
Compiling autocfg v1.2.0
Compiling syn v1.0.109
Compiling lazy_static v1.4.0
Compiling libc v0.2.153
Compiling equivalent v1.0.1
Compiling hashbrown v0.14.3
Compiling memchr v2.7.2
Compiling yansi v1.0.1
Compiling unicode-width v0.1.11
Compiling regex-syntax v0.8.3
Compiling encode_unicode v0.3.6
Compiling cfg-if v1.0.0
Compiling winnow v0.6.5
Compiling cc v1.0.92
error: could not compile `windows_x86_64_msvc` (build script) due to 2 previous errors
warning: build failed, waiting for other jobs to finish...
error: could not compile `serde` (build script) due to 2 previous errors
error: could not compile `proc-macro2` (build script) due to 2 previous errors
error: could not compile `syn` (build script) due to 2 previous errors
error: could not compile `libc` (build script) due to 2 previous errors
error: could not compile `winapi` (build script) due to 2 previous errors
💥 maturin failed
Caused by: Failed to build a native library through cargo
Caused by: Cargo build finished with "exit code: 101": `cargo rustc --manifest-path Cargo.toml --message-format json --release --bins --`
📦 Including license file "LICENSE"
🔗 Found bin bindings
error: linker `link.exe` not found
|
= note: program not found
note: the msvc targets depend on the msvc linker but `link.exe` was not found
note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.
note: VS Code is a different product, and is not sufficient.
error: aborting due to 1 previous error
error: linker `link.exe` not found
|
= note: program not found
note: the msvc targets depend on the msvc linker but `link.exe` was not found
note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.
note: VS Code is a different product, and is not sufficient.
error: aborting due to 1 previous error
error: linker `link.exe` not found
|
= note: program not found
note: the msvc targets depend on the msvc linker but `link.exe` was not found
note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.
note: VS Code is a different product, and is not sufficient.
error: aborting due to 1 previous error
error: linker `link.exe` not found
|
= note: program not found
note: the msvc targets depend on the msvc linker but `link.exe` was not found
note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.
note: VS Code is a different product, and is not sufficient.
error: aborting due to 1 previous error
error: linker `link.exe` not found
|
= note: program not found
note: the msvc targets depend on the msvc linker but `link.exe` was not found
note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.
note: VS Code is a different product, and is not sufficient.
error: aborting due to 1 previous error
error: linker `link.exe` not found
|
= note: program not found
note: the msvc targets depend on the msvc linker but `link.exe` was not found
note: please ensure that Visual Studio 2017 or later, or Build Tools for Visual Studio were installed with the Visual C++ option.
note: VS Code is a different product, and is not sufficient.
error: aborting due to 1 previous error
Error: command ['maturin', 'pep517', 'build-wheel', '-i', 'C:\\Users\\\\miniforge3\\envs\\py310\\python.exe', '--compatibility', 'off'] returned non-zero exit status 1
[end of output]
note: This error originates from a subprocess, and is likely not a problem with pip.
ERROR: Failed building wheel for lintrunner
Failed to build lintrunner
ERROR: ERROR: Failed to build installable wheels for some pyproject.toml based projects (lintrunner)
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134567
Approved by: https://github.com/malfet
Summary:
As title. Follow up to add stats summary (mean/min/max, etc) for jit inductor tensor value printing as well.
The inductor python wrapper code level printing would look something like this:
{F1859224287}
Test Plan: CI
Reviewed By: chenyang78
Differential Revision: D62415575
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135887
Approved by: https://github.com/chenyang78
We previously only supported the same v_head dim and qk_head dim. When we allowed different head dims, I accidentally kept the same query strides for the output. This PR fixes that bug and also ensures that we always produce output in the same stride order as the input query.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135882
Approved by: https://github.com/yanboliang, https://github.com/Chillee
Summary:
Record remote cache time saved via frame_phase_timing
We add to the "phase" when remote cache hits and saves us time, so that we have a 1:1 correspondence between a frame and time saved.
Test Plan:
Internally run benchmark, see that it's populated in sandbox table after previous diff lands and logger config is actualized.
Show that column exists in table:
https://fburl.com/scuba/logger_staging_jjwu_30582a48f1ff9cf5f4ac50a4c40af/fp2te0ff
Note that an earlier version of D62105258 had the column as a string so the staging table is a bit messed up. But you can see the most recent samples have the column populated as a float.
Reviewed By: aorenste
Differential Revision: D62106921
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135490
Approved by: https://github.com/aorenste
Summary:
Since https://www.internalfb.com/diff/D62215095 landed there have been many silent errors due to the dependency between functional_tensor and config.
```
File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/export/__init__.py", line 64, in <module>
File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/export/dynamic_shapes.py", line 23, in <module>
File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/export/exported_program.py", line 26, in <module>
File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/_higher_order_ops/__init__.py", line 1, in <module>
File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/_higher_order_ops/cond.py", line 6, in <module>
File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/_subclasses/functional_tensor.py", line 9, in <module>
File "/tmp/torch_deploy_zip5YRJC1/torch_python_modules.zip/torch/_inductor/config.py", line 44, in <module>
```
https://fburl.com/logarithm/ol5kx0ee
complaining about a cyclic dependency; this PR fixes it.
Test Plan: buck test multipy/runtime:test_deploy_embedded_cuda_interp_without_cuda_available -- --run-disabled TorchpyTest.AcquireMultipleSessionsInDifferentPackages
Reviewed By: aorenste
Differential Revision: D62616765
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135926
Approved by: https://github.com/aorenste, https://github.com/oulgen, https://github.com/Skylion007
This PR implements tracing of with contexts with TorchFunction modes which have the default enter/exit behavior (ie pushing/popping the mode)
Typically the bytecode for a context manager looks like this during a graph break:
1. graph call
2. enter context
3. unsupported code
4. exit context
5. resume call
resume fn structure:
1. enter context
2. jump
...
3. exit context
The issue with torch function modes is that side effects will replay any mutations to the torch function stack performed during tracing. So, we do not need to enter and exit around the unsupported code in the original function (doing so would result in a duplicate torch function mode entry during execution of the unsupported code), and we don't need to enter again in the resume function (the mode that was pushed from the side effects bytecode would still be on the stack).
So for torch function modes the structure of our output code is this:
1. graph call
2. mutate tf mode stack to replay mutations
3. unsupported code
4. on exception restore stack
5. resume function
Then our resume fn looks like this:
1. no-op enter torch function mode
2. jump
3. exit tf mode
To implement the no-op enter of the torch function mode, I added a torch function mode polyfill which no-op enters but exits normally. This is needed because we still want to trace the with context in the resume function and exit properly (the exit instructions will still be in the function, so we need to generate instructions to set up the context).
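A minimal, self-contained sketch of that "no-op enter, normal exit" shape (illustrative only; this is not the actual Dynamo polyfill, and a plain list stands in for the torch function mode stack):
```python
# Illustrative only: the mode is assumed to already be on the stack (re-installed by
# the side-effects bytecode), so __enter__ deliberately does not push, while __exit__
# still pops so the resume function's exit instructions remain sound.
mode_stack = []  # stand-in for the real torch function mode stack

class NoOpEnterMode:
    def __init__(self, mode):
        self.mode = mode

    def __enter__(self):
        return self.mode              # no push here

    def __exit__(self, exc_type, exc, tb):
        assert mode_stack and mode_stack[-1] is self.mode
        mode_stack.pop()              # normal exit behavior
        return False

mode = "MyMode"
mode_stack.append(mode)               # simulates the replayed stack mutation
with NoOpEnterMode(mode):
    pass                              # body traced in the resume function
assert not mode_stack
```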
Separately from the bytecode, dynamo also tracks contexts on the block stack, which is how the SETUP_* instructions are implemented. Naturally, at a graph break we exit these block stacks to properly reset the contexts entirely, so that we can re-enter around the unsupported code soundly. However, once again, in the torch function mode case, in the event of a graph break we do not want to perform any exit side effects, because we want to preserve the state of the mode stack as is so that we will properly update the stack with the bytecode mentioned in the first section. If we exited here, dynamo would pop the mode off of the symbolic stack and not update the true python torch function mode stack with the suffix bytecode.
All in all, for torch function modes we enter exactly once, update the global torch function mode stack with side effects bytecode, re-read this stack when compiling the resume function, and exit exactly once in the resume function. This matches the semantics of eager exactly.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135422
Approved by: https://github.com/williamwen42
ghstack dependencies: #134732, #133137, #135443, #135444
In preparation for tracing through DeviceContext (defb515306/torch/utils/_device.py (L66))
This PR adds support for calling the setattr of thread local objects. These objects have a slots impl, and since this doesn't appear to have any side effects, we call this setattr impl when replaying mutations, since calling `object.__setattr__` on these objects results in a type error.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135443
Approved by: https://github.com/anijain2305
ghstack dependencies: #134732, #133137
For tracing cond/while in eager, we trace the HOP with the eager backend with metadata torchfunction mode enabled. HOPs disallow the mutation that occurs in this torch function mode, so it is not able to be traced. As a result, we use a custom backend which enters this mode for tracing these HOPs. Thanks to @ydwu4 for the help with implementing this
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134732
Approved by: https://github.com/ydwu4
Fix https://github.com/pytorch/pytorch/issues/134095
This is a workaround for loading full state dict into a FSDP1+TP 2D model.
Since named_parameters() in FSDP1 does not return DTensor, we don't have the information to shard the full_state_dict and load it directly into the 2D model. In order to load a full state dict into an FSDP1+TP 2D model, we need to do the following (a sketch follows the list):
- load the full state dict into a 1D FSDP model
- dcp.save the full/shard state dict into storage
- initialize a 2D FSDP1+TP model
- get the default sharded state dict for the 2D model (full_state_dict=False)
- dcp.load the state dict from storage
- load the state dict into the 2D model
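One possible shape of that workflow in code, sketched with `torch.distributed.checkpoint` (the model/checkpoint arguments are placeholders and the exact `StateDictOptions` usage is an assumption, not taken from this PR):
```python
import torch.distributed.checkpoint as dcp
from torch.distributed.checkpoint.state_dict import (
    StateDictOptions,
    get_model_state_dict,
    set_model_state_dict,
)

def load_full_sd_into_2d_model(model_1d, model_2d, full_state_dict, ckpt_dir):
    # 1) load the full state dict into the 1D FSDP model, then dcp.save to storage
    model_1d.load_state_dict(full_state_dict)
    dcp.save(get_model_state_dict(model_1d), checkpoint_id=ckpt_dir)
    # 2) get the default sharded state dict for the 2D FSDP1+TP model
    sharded_sd = get_model_state_dict(
        model_2d, options=StateDictOptions(full_state_dict=False)
    )
    # 3) dcp.load from storage, then load the result into the 2D model
    dcp.load(sharded_sd, checkpoint_id=ckpt_dir)
    set_model_state_dict(model_2d, sharded_sd)
```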
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135763
Approved by: https://github.com/fegin
ghstack dependencies: #135725
Fix https://github.com/pytorch/pytorch/issues/134095
This fixes the distributed state dict full_state_dict option hanging during set_state_dict. We switch `_distribute_tensors` in _state_dict_utils.py to use `DTensor.from_local` instead of `distribute_tensor` to support the FSDP2+TP 2D strided sharding use case, as `distribute_tensor` cannot handle strided sharding yet. `distribute_tensor` incurs a scatter behind the scenes, while `DTensor.from_local` takes the local slice from the full tensor on each rank to create the DTensor (no collective). This means it's the user's responsibility to make sure the full_tensor from the full_state_dict is the same across all ranks.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135725
Approved by: https://github.com/fegin
Summary: Fixed a bunch of fbcode imports that happened to work but confused autodeps. After this autodeps still suggests "improvements" to TARGETS (which breaks our builds) but at least it can find all the imports.
Test Plan:
```
fbpython fbcode/tools/build/buck/linters/lint_autoformat.py --linter=autodeps --default-exec-timeout=1800 -- fbcode/caffe2/TARGETS fbcode/caffe2/test/TARGETS
```
Before:
```
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/testing.py:229) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https://fbur$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_export.py:87) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https://fburl$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/test_serdes.py:9) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https://fb$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_serdes.py:10) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https://fburl$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_retraceability.py:7) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See https:$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/test_retraceability.py:6) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See ht$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_export_nonstrict.py:7) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See http$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/test_export_nonstrict.py:6) when processing rule "test_export". Please make sure it's listed in the srcs parameter of another rule. See $
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "test_export" (from caffe2/test/export/test_export_training_ir_to_run_decomp.py:8) when processing rule "test_export". Please make sure it's listed in the srcs parameter of an$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "testing" (from caffe2/test/export/test_export_training_ir_to_run_decomp.py:10) when processing rule "test_export". Please make sure it's listed in the srcs parameter of anoth$
ERROR while processing caffe2/test/TARGETS: Found "//python/typeshed_internal:typeshed_internal_library" owner for "cv2" but it is protected by visibility rules: [] (from caffe2/test/test_bundled_images.py:7) when processing rule "test_bundled_$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "caffe2.test.profiler_test_cpp_thread_lib" (from caffe2/test/profiler/test_cpp_thread.py:29) when processing rule "profiler_test_cpp_thread". Please make sure it's listed in t$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "torch._utils_internal.get_file_path_2" (from caffe2/test/test_custom_ops.py:23) when processing rule "custom_ops". Please make sure it's listed in the srcs parameter of anoth$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "torch._utils_internal.get_file_path_2" (from caffe2/test/test_public_bindings.py:13) when processing rule "public_bindings". Please make sure it's listed in the srcs paramete$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "torch._C._profiler.symbolize_tracebacks" (from caffe2/test/test_cuda.py:3348) when processing rule "test_cuda". Please make sure it's listed in the srcs parameter of another $
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for "torch._C._profiler.gather_traceback" (from caffe2/test/test_cuda.py:3348) when processing rule "test_cuda". Please make sure it's listed in the srcs parameter of another rule$
ERROR while processing caffe2/test/TARGETS: Cannot find an owner for include <torch/csrc/autograd/profiler_kineto.h> (from caffe2/test/profiler/test_cpp_thread.cpp:2) when processing profiler_test_cpp_thread_lib. Some things to try:
```
Differential Revision: D62049222
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135614
Approved by: https://github.com/oulgen, https://github.com/laithsakka
Fixes #131337
- add `arg_type` for workspace_arg, the type is consistent with the type in `generate_workspace_allocation()`.
- do not generate example tensors for `workspace`, and use `generate_workspace_allocation()` instead.
- add workspace allocation generation code to `kernel_autotune_calls`. e.g.
```python
workspace = empty_strided_cuda((1280, ), (1, ), torch.uint8)
workspace.zero_()
.....
triton_spl_fused_add_cumprod_0.run(buf2, arg0_1, arg1_1, workspace, 1, 10000, grid=split_scan_grid(1, 10000), stream=stream0)
del buf2, arg0_1, arg1_1, workspace
```
- add `empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda` to the header of triton autotune code.
The generated cpp has lines like the ones below, so we also implement a `zero_()` for `AtenTensorHandle`.
```cpp
static constexpr int64_t int_array_0[] = {1280L, };
static constexpr int64_t int_array_1[] = {1L, };
AtenTensorHandle workspace_handle;
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_empty_strided(1, int_array_0, int_array_1, cached_torch_dtype_uint8, cached_torch_device_type_cuda, 0, &workspace_handle));
RAIIAtenTensorHandle workspace(workspace_handle);
workspace.zero_();
```
- Fix handle grid_fn for grid computation. Pass in "RBLOCK" to `split_scan_grid`
- Fix dynamic shapes:
Without the fix we generate code that looks like this `workspace = empty_strided_cuda((32*((255 + s0) // 256), ), (1, ), torch.uint8)` when doing triton autotune and `s0` is not defined.
The solution approach is to use `V.graph.sizevars.size_hint(nbytes)` to realize the workspace size for triton autotune. Note that we only realize it for triton autotune code, but not for the cpp cuda code.
- We also generate slightly different cpp code depending on if `abi_compatible` is turned on.
```cpp
RAIIAtenTensorHandle workspace(workspace_handle);
AOTI_TORCH_ERROR_CODE_CHECK(aoti_torch_zero_(workspace.get()));
```
vs
```cpp
at::Tensor workspace = at::detail::empty_strided_cuda({8L*(c10::div_floor_integer(static_cast<int64_t>((255L + s0)), static_cast<int64_t>(256L))), }, {1L, }, at::kByte, c10::DeviceType::CUDA);
workspace.zero_();
```
Test Plan:
```
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor.py -k GPUTests.test_consecutive_split_cumprod_cuda
python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_consecutive_split_cumprod_cuda_cuda_wrapper
python test/inductor/test_cuda_cpp_wrapper.py DynamicShapesCudaWrapperCudaTests.test_consecutive_split_cumprod_cuda_dynamic_shapes_cuda_wrapper
TORCHINDUCTOR_ABI_COMPATIBLE=1 python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_consecutive_split_cumprod_cuda_cuda_wrapper
TORCHINDUCTOR_CPP_WRAPPER=1 python test/inductor/test_torchinductor.py -k GPUTests.test_consecutive_split_cumprod_cuda
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135552
Approved by: https://github.com/desertfire
Previously we were accommodating `torch._dynamo.mark_dynamic()` for export's dynamic shapes. Here we clean things up and ignore it, requiring users to specify an export input for `dynamic_shapes`.
Note: there are 4 decorators relevant to export: `mark_dynamic`, `maybe_mark_dynamic`, `mark_static`, and `mark_unbacked`. User calls that involve export have only been `mark_dynamic()`, and we use `maybe_mark_dynamic` under the hood for `Dim.AUTO`, but we could start using the others. One reason I decided not to warn and just silently ignore is that these decorators cause the tensors to carry dynamic info, and it'll be hard to tell whether the markers are from export or user calls when re-exporting with the same inputs.
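For reference, a minimal sketch of requesting dynamism through the export input itself instead of `mark_dynamic()` (the module and dim name are placeholders):
```python
import torch
from torch.export import Dim, export

class M(torch.nn.Module):
    def forward(self, x):
        return x * 2

x = torch.randn(4, 8)
# The batch dimension is declared dynamic via dynamic_shapes, not via mark_dynamic().
ep = export(M(), (x,), dynamic_shapes={"x": {0: Dim("batch")}})
```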
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135536
Approved by: https://github.com/avikchaudhuri
Optimized dynamic quantization for aarch64 was enabled by #126687 and #134897
This PR fixes an issue for aarch64 where on a [cache miss](https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp#L592) (e.g. if input dimensions change) [ideep::matmul_forward::compute ](https://github.com/intel/ideep/blob/pytorch-rls-v3.5.3-2/include/ideep/operators/matmul.hpp#L160) (wrongly) runs with the [default lowp_kind (u8s8)](https://github.com/intel/ideep/blob/pytorch-rls-v3.5.3-2/include/ideep/operators/matmul.hpp#L174) which is not supported by oneDNN+ACL (Arm Compute Library), causing the workload to fall back to a much slower oneDNN gemm:jit kernel
Example:
```python
import torch

DIM = 4096
INPUT_SIZE1 = 32
INPUT_SIZE2 = 16

class LinearNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = torch.nn.Linear(DIM, DIM, bias=False)

    def forward(self, x):
        x = self.fc1(x)
        return x

input1 = torch.randn(size=(INPUT_SIZE1, DIM))
input2 = torch.randn(size=(INPUT_SIZE2, DIM))

with torch.no_grad():
    model = LinearNet()
    model = torch.ao.quantization.quantize_dynamic(model, {torch.nn.Linear})
    model(input1)  # this goes to ACL lowp_gemm
    print("=" * 50)
    model(input2)  # this goes to gemm:jit without this PR, and to ACL with this PR
```
In the code snippet above:
- The matmul from `model(input1)` goes to oneDNN+ACL (in both cases, with and without the PR)
- The matmul from `model(input2)`: **Without this PR**: there's a cache miss (different input shapes) and matmul_forward::compute is run with the default lowp_kind (u8s8). Hence the matmul falls back to gemm:jit in oneDNN. However, **With this PR** the matmul goes to oneDNN+ACL which is around 10x faster than oneDNN+jit.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135058
Approved by: https://github.com/jondea, https://github.com/malfet
Summary:
Sometimes we only want to generate a replacement for a matched pattern
once we know some information about the nodes in the pattern.
So far, we have found this the most useful to do matches based on specific
shapes of tensors flowing into functions.
Use a callback function similar to `match_filters`. By default this isn't used.
Had to make `replacement` a None-able parameter because Callable was
already used to detect a case where a graph needed to be traced.
Differential Revision: D62412628
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135553
Approved by: https://github.com/SherlockNoMad
This PR:
* Implements the pre-existing `nt.to_padded_tensor(padding_val)` ATen op via the FBGEMM kernel + appropriate view gymnastics (since that kernel only handles 2D values)
* Introduces a new `_nested_from_padded_tensor` op for the reverse conversion, implemented via the reverse FBGEMM kernel + view gymnastics
* Note: there is currently no public API for this; design booted to a future PR
TODO:
* ~~Propagate min / max sequence length via the new factory function `_nested_from_padded_tensor`~~
* ~~Verify that Inductor does computation fusion via test logic~~
Pull Request resolved: https://github.com/pytorch/pytorch/pull/125947
Approved by: https://github.com/soulitzer
This PR resolves #134408. An additional test has been added and passes locally.
Do you think we should add a post-check to ensure `args` and `kwargs` are not both `None`? It seems to be possible to have modules without inputs.
This PR does not include any such post-check.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134643
Approved by: https://github.com/zou3519
**Summary**
1. This PR removes the public API `compute_local_shape` and replace its use with the more general API `compute_local_shape_and_global_offset`.
2. To keep `compute_local_shape_and_global_offset` consistent with `compute_local_shape` on empty shards, it now returns local tensor shape `(0,)` for empty shards which is more aligned with DTensor's semantics on non-participating ranks.
**Test**
`pytest test/distributed/_tensor/test_dtensor.py`
`pytest test/distributed/_tensor/test_init.py`
`pytest test/distributed/_tensor/test_tensor_ops.py`
Differential Revision: [D62415591](https://our.internmc.facebook.com/intern/diff/D62415591)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135554
Approved by: https://github.com/tianyu-l, https://github.com/wz337
When the input format for group norm is NHWC and the device is privateuseone, it introduces an additional transpose operation. To avoid this issue, a check for the privateuseone device needs to be added here.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135575
Approved by: https://github.com/ezyang
Summary:
Skip test_prepare_qat_conv_bn_fusion_getitem_placeholder when we use training ir, since it's only for bn-getitem pattern, but the pattern doesn't exist in training ir.
Remove BLOCK_LIST since it's empty.
Now all internal unittests will use training ir.
Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' caffe2/test/quantization:test_quantization -- -r test_prepare_qat_conv_bn_fusion_getitem_placeholder
buck2 run 'fbcode//mode/dev-nosan' caffe2/test:quantization_pt2e_qat -- -r test_prepare_qat_conv_bn_fusion_getitem_placeholder
```
Differential Revision: D62387987
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135729
Approved by: https://github.com/tugsbayasgalan
Summary:
We observed another long computation issue for OBA_AFOC pyper model, thus adding a pattern to avoid the perf regression
- Only happens in A100
- We do not want to use force_shape_pad since it will pad all GEMMs, which may not be optimal. The Optimus pass has more flexibility to target specific GEMM shapes and do the corresponding padding
- To enable it, we add the pass to the config, where `k_threshold_to_pad` can be customized:
inductor_config.patch(post_grad_fusion_options={"pad_aten_mm_pass": {"k_threshold_to_pad" : 8388608}})
Test Plan:
# unit test
```
buck2 test mode/opt //caffe2/test/inductor:pad_mm
```
Buck UI: https://www.internalfb.com/buck2/58b0f272-f405-45be-bc8d-aec2dc4d5841
Test UI: https://www.internalfb.com/intern/testinfra/testrun/10133099209954651
Network: Up: 9.0KiB Down: 142B (reSessionID-8eb71a37-a5ca-4aff-a4f1-93ade3e47e4e)
Jobs completed: 9. Time elapsed: 3:18.0s.
Cache hits: 0%. Commands: 3 (cached: 0, remote: 0, local: 3)
Tests finished: Pass 17. Fail 0. Fatal 0. Skip 0. Build failure 0
# e2e test
see [D62388582](https://www.internalfb.com/diff/D62388582)
Differential Revision: D62220158
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135167
Approved by: https://github.com/jackiexu1992
When CPU offloading is enabled, if a user loads a GPU state dict, FSDP2 throws a less obvious error during backward:
```
RuntimeError: attempting to assign a gradient with device type 'cpu' to a tensor with device type 'cuda'. Please ensure that the gradient and the tensor are on the same device
```
This PR throws the error more explicitly by specifying which parameters should be moved because of CPU offloading:
```
FSDP parameters should be materialized on cpu when enabling cpu offloading. For example, load cpu state dict or call module.to_empty(device="cpu"). Found following parameters on non-cpu device: ['0.weight']
```
`pytest -s test/distributed/_composable/fsdp/test_fully_shard_state_dict.py -k test_dp_state_dict_cpu_offload`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135156
Approved by: https://github.com/awgu
Using `fsdp.set_` for the unsharded_param in-place update causes difficult-to-debug errors when enabling Traceable FSDP2 on TorchTune models. In this PR, we change it to use `fsdp.copy_`, which fixes the error and also strictly follows eager semantics (i.e. if the user explicitly stores an alias of the unsharded_param during execution of the user's module code, that alias will get updated correctly when the unsharded_param is written via copy_; whereas if we just swap out the unsharded_param storage via set_, that user-saved alias will not get updated, which is not good).
This PR also implements the graph pass to remove the resizes and copy if there is a resize_(full) -> copy_ -> resize_(0) pattern.
------
Test commands:
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_transformer_backend_inductor`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_nested_fully_shard_backend_inductor`
- `pytest -rA test/distributed/_composable/fsdp/test_fully_shard_compile.py::TestFullyShardCompile::test_trace_fsdp_copy_`
- `pytest -rA test/dynamo/test_repros.py::ReproTests::test_partitioner_cse_respects_mutation_boundaries`
- `pytest -rA test/dynamo/test_repros.py::ReproTests::test_fsdp_set_input_mutation_applied_when_input_gets_no_gradients`
- `pytest -rA test/inductor/test_pattern_matcher.py::TestPatternMatcher::test_mutation_op_matching`
- `python test/inductor/test_distributed_patterns.py DistributedPatternTests.test_fake_distributed_aot_eager`
- `PYTORCH_OPINFO_SAMPLE_INPUT_INDEX=1 PYTORCH_TEST_WITH_CROSSREF=1 python test/functorch/test_aotdispatch.py TestEagerFusionOpInfoCPU.test_aot_autograd_exhaustive_norm_cpu_float32`
- `python test/distributed/test_inductor_collectives.py TestCollectivesInductor.test_backwards`
Pull Request resolved: https://github.com/pytorch/pytorch/pull/133730
Approved by: https://github.com/bdhirsh
This PR is a replacement for https://github.com/pytorch/pytorch/pull/133085 for pushing a quick fix for RMSNorm.
The original author is @kkontny
Previous PR summary:
Since FP16 has quite a small dynamic range, it is very easy to overflow while computing `at::pow(input, 2)`, and this happens in real-world computation.
I tried to use the fused `nn.RMSNorm` implementation instead of `LlamaRMSNorm` inside the `transformers` implementation of Llama (`src/transformers/models/llama/modeling_llama.py`). It started to give wrong answers in FP16 while still giving good ones in FP32. I figured out this happens due to overflow while computing the square of the input tensor.
The original `LlamaRMSNorm` implementation upcasts the input to FP32 to prevent this and give better numerical stability.
```
class LlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        LlamaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)
```
The proposed commit fixes the issue. FP16 in RMSNorm has to be treated in a special way to be usable in real-world implementations.
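A minimal, self-contained illustration of the overflow described above (the values are chosen only to exceed the FP16 max of ~65504):
```python
import torch

x = torch.full((1, 8), 300.0, dtype=torch.float16)
print(x.pow(2).mean(-1, keepdim=True))           # inf: 300**2 = 90000 overflows FP16
print(x.float().pow(2).mean(-1, keepdim=True))   # finite once upcast to FP32 first
```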
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134106
Approved by: https://github.com/mikaylagawarecki, https://github.com/eqy
Notable changes:
1. Enable CudaGraph related tests
2. Fix UT problems
3. EXPERIMENTAL Navi31 support. User should enable Navi31 support with Env Var `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1`
Known problems:
1. `test/test_transformers.py` will have massive failures and/or NaN outputs with `--use-pytest`
+ Update: Confirmed that skipping `class TestSDPAPrivateUse1Only` fixes the problem with `--use-pytest`
Note:
AOTriton 0.7b adds support for nested tensors + SDPA but needs more work (and consequently a separate PR) to enable it.
Fixes #133540
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134498
Approved by: https://github.com/pruthvistony, https://github.com/jeffdaily, https://github.com/malfet
Summary: Update the SDPA decomposition to match the updated strides from D62009189, which aligns strides with `aten._scaled_dot_product_attention_math.default` and makes `t.permute().contiguous().permute()` no longer necessary.
Test Plan: CI
Differential Revision: D62278378
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135297
Approved by: https://github.com/drisspg
Summary: as title
Test Plan:
```
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test:test_export -- -r test_conv_dynamic
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test:fx -- -r matcher
buck2 run 'fbcode//mode/dev-nosan' fbcode//caffe2/test/quantization:test_quantization -- -r x86
```
CI
Differential Revision: D62448302
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135623
Approved by: https://github.com/tugsbayasgalan
* Add pytorchbot to list of approvers for file
* Add labels to the auto created PR
The auto generated PR is currently not merging due to some failing tests on slow workflow that were supposed to be moved back to normal
idk if this has much value, clearly we've been managing without the update
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135390
Approved by: https://github.com/ZainRizvi
Summary: For S444023
Test Plan:
Revert prevented the NaN errors - f639391901
Training job ran for 7767 iterations. NaN errors show up within the first 1k.
Reviewed By: nmacchioni
Differential Revision: D62224747
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135186
Approved by: https://github.com/kit1980
Summary:
These are still utilized directly when using relu/sigmoid/tanh tensors from here: https://fburl.com/code/k6n7ofzd
However, on Mac Catalyst we were always returning `nil`, which in most cases rendered the entire graph completely useless and most often left just stray `MPSTemporaryImage` references that were never written into.
This fixes the issue completely by making sure that we always return the valid kernels back, so they can be executed.
Test Plan: Test with segmentation net that uses a combination of relu and other tensors together - run this via Mac Catalyst build - it works! {F1858576745}
Reviewed By: MichaelTay
Differential Revision: D62430010
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135595
Approved by: https://github.com/MichaelTay
Some customers would like to run the NaN checks on the fly, so we are improving its efficiency.
## Benchmarking
Allreduce 2G floats. `TORCH_NCCL_NAN_CHECK=1`
Red kernel: ncclAllreduce
Blue kernel: Nan check
<img width="1093" alt="Screenshot 2024-09-06 at 10 00 05 PM" src="https://github.com/user-attachments/assets/5501bc31-024f-4115-adb2-dd66eb4025d3">
## Comparison with torch ops:
Let's say a user manually check for NaNs with the following torch ops before all-reduce:
```
torch.any(torch.isnan(x))
```
<img width="1091" alt="Screenshot 2024-09-06 at 10 14 53 PM" src="https://github.com/user-attachments/assets/1f8b5f63-c955-4612-bb96-241b6c69959b">
So our perf is on-par with torch ops.
## Changes
- Load from vidmem using "big packs" of 16 bytes
- Bump `blockDim.x` from 256 to 512
- Separate loads and checks into two loops, each of 8 iterations
- Unroll the loops
- Templated functions for checking NaN in a "big pack" based on dtype
Special thanks to @jbachan from NCCL!
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135414
Approved by: https://github.com/wconstab
While designing something else that needs TCPStore, I spent some time digging into the codebase of TCPStore and found that the code is a little bit challenging to understand without proper documentation. Although people from the OSS community must be smarter than me, I still want to document my findings in the code so that devs and users can use them as a reference down the road.
Also, for libuv, we need to prefix private variables with a "_", so this is a pure renaming of private variables such as `tcpServer`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/130496
Approved by: https://github.com/wconstab
When FileCheck is destructed without execution, it should output all rules.
For example:
```
>>> fc = FileCheck().check("test")
>>> del fc
You have not run this instance of FileCheck!
FileCheck checks:
CHECK: test
```
Additionally, unit tests for the Python interface of FileCheck will be added.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135345
Approved by: https://github.com/eellison
Fixes #127519
Currently in torchrun rendezvous, there are only two rendezvous backends supported out of the box: `C10d` and `Etcd`. The changes in this PR enable distributed elastic users to bring their own out-of-tree rendezvous backend implementations as Python packages.
#### AUTHORING NEW PLUGIN
Any new plugin will be a python package exposing entry-points. For example, the structure of redis plugin is as follows:
```
plugin_root
|_ pyproject.toml
|_ src
|_ redis
|_ __init__.py
|_ redis_store.py
|_ redis_backend.py
```
The contents of the `pyproject.toml` should indicate that it exposes a torchrun entry point by mentioning the group name `torchrun.plugins`. The `pyproject.toml` for the redis plugin would be as follows:
```
[project]
name = "redis"
version = "0.0.1"
[project.entry-points.'torchrun.plugins']
redis = 'redis'
```
The `src/redis/__init__.py` file would contain functions that return the plugin name and plugin handler. The contents of `__init__.py` for redis would be as follows:
```
def getPluginHandler():
    def _create_redis_handler(params: RendezvousParameters):
        from redis_rendezvous_backend import create_backend

        backend, store = create_backend(params)
        return create_handler(store, backend, params)

    return _create_redis_handler
```
The files `redis_store` and `redis_backend` contain the implementation of [Store](41189b0da4/torch/_C/_distributed_c10d.pyi (L171)) and [RendezvousBackend](e782918b8e/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py (L61)) respectively.
#### USER EXPERIENCE
Before using the plugin for the first time, the user has to install the plugin packages. For example, published packages can be installed using `pip3 install <plugin-name>`, and a plugin in the local file system can be installed using `pip3 install -e <plugin-location>`.
Once installed, the new backend can be used in torchrun as follows:
```
torchrun --rdzv-backend=redis --rdzv-endpoint=redis-container:6379 --nnodes=3 --nproc-per-node=1 --max-restarts=3 --rdzv-id=1 test.py
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132633
Approved by: https://github.com/fduwjj
* Added a cpp loader, AOTIModelPackageLoader, which can load the .pt2, build the .so, and create a runner. The python-facing API is that users can directly call the `run` function, whereas in cpp users can directly access the `runner_` if they are more familiar with that. I couldn't figure out how to bind the `get_runner()` function to python...
* Added a new config, `aot_inductor.package_cpp_only` which will **not** package the so. This means that whenever the package is loaded, we will need to build the so. This is turned off by default so that new environments do not need to rebuild their so. The `package_cpp_only` is a feature which torchchat intends to use to provide flexibility to users.
* Added a new config, `aot_inductor.metadata` which stores user-provided metadata, serialized to the pt2 as a json file. It also stores the device used when exporting, "cuda" or "cpu", so that during load time, we can use that data to determine which AOTIModelContainerRunner to use. The metadata can be accessed through `loader.get_metadata()`. TODO is to move this metadata to the toplevel `package_aoti` function so that we can remove the metadata as a config.
* Separated out `package_aoti` as a standalone function, instead of it automatically being called in inductor. This is to prepare for the case where users will compile multiple models, and want to bundle it in one package. The specific use case is in torchchat, where we want to package the separately-exported encoder and decoder layers. An example of how to use this is in `test_multiple_methods`.
* `load_package` will load a singular model, given the model name.
* The loader doesn't support windows for now, I think I need to add some more casing to make the build commands work on windows?
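A hedged usage sketch of the loading side; the `load_package`, `run`, and `get_metadata` names come from the description above, but the module path and argument shapes are assumptions:
```python
import torch
from torch._inductor.package import load_package  # module path is an assumption

loader = load_package("model.pt2")        # loads the archive, building the .so if needed
print(loader.get_metadata())              # user-provided metadata plus the export device
out = loader.run((torch.randn(2, 8),))    # placeholder example inputs
```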
Differential Revision: [D62329906](https://our.internmc.facebook.com/intern/diff/D62329906)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135374
Approved by: https://github.com/desertfire, https://github.com/malfet
Previously, Inductor was allowed to modify the stride/storage_offset
(layout) for inputs to user-defined triton kernels. This can cause
silent incorrectness because most triton kernels are written for a
specific striding pattern (usually contiguous).
This PR adds a config to allow the user to choose Inductor's behavior on
this. The options are:
- "flexible_layout" (default): Inductor can modify the layout for inputs
to user-defined triton kernels as much as it wants.
- "needs_fixed_stride_order": Inductor must preserve the stride order
(when compared to tracing) for inputs to user-defined triton kernels.
This matches our handling for custom operators. In the future, we'll
want a "needs_exact_strides" option (this is the safest option).
Test Plan:
- new test
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135530
Approved by: https://github.com/FindHao, https://github.com/oulgen
Fixes #132964
This change is to optimize torch.sum() performance by increasing max_values_per_thread in setReduceConfig() for ROCm platform.
By increasing this parameter, it uses fewer threadblocks and improved the performance.
Test:
Tested on MI300x and H100, and now the MI300x perf improved to 3205GByte/s from ~1690GByte/s for the test case and is slightly better than H100 (3136GByte/s).
Also tested with other different sizes of tensors and also see perf improvement.
```python
import torch
from triton.testing import do_bench
x = torch.randn(2**30, device='cuda')
ms = do_bench(lambda: x.sum(dim=-1))
bandwidth_gbyte = x.numel() * x.dtype.itemsize / (10**9)
time_s = ms / 1000
bw_per_second = bandwidth_gbyte / time_s
print(bw_per_second)
```
Co-author: @carlobertolli
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135397
Approved by: https://github.com/eqy, https://github.com/malfet
When a kernel does not have mutated args (this is quite common?), benchmarking the cost of cloning actually benchmarks a no-op. This still takes >100ms since triton.testing.do_bench will allocate 100 ms budget to run the kernel.
Skipping this benchmarking can save quite some compilation time if the code path is hit multiple times. Let's say, if the code path is hit 100 times when the graph is large, we would save >10s.
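A sketch of that short-circuit (the helper name is hypothetical):
```python
def clone_overhead_ms(mutated_args, bench_fn):
    # With no mutated args, cloning is a no-op, but benchmarking it would still
    # burn do_bench's ~100 ms budget, so skip the measurement entirely.
    if not mutated_args:
        return 0.0
    return bench_fn(lambda: [arg.clone() for arg in mutated_args])
```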
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135533
Approved by: https://github.com/jansel
ghstack dependencies: #135531
Fix https://github.com/pytorch/pytorch/issues/134768 .
When we benchmark the latency for a fused node set, we do benchmarking twice:
1. benchmark the latency of the kernel including cloning mutated args
2. benchmark the latency of cloning mutated args without running the kernel
We subtract result 2 from result 1 to get the latency of the kernel itself.
But when the tensors are not on cuda device 0, we get equal numbers for result 1 and result 2 no matter how much work the kernel does. The root cause is that in `triton.testing.do_bench` the `torch.cuda.synchronize` call syncs the current cuda device (which is device 0 if it's not overridden). But since the tensors and kernels are located on another device, the sync actually does nothing (unless there happen to be other kernels on device 0).
The fix is to set the correct current device in our benchmarking code.
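A minimal sketch of the fix (the helper name is hypothetical): make the tensors' device current before benchmarking so the `torch.cuda.synchronize` inside `do_bench` waits on the right device:
```python
import torch
from triton.testing import do_bench

def bench_on_device(fn, device_index):
    # Without this device context, do_bench's synchronize targets device 0 and the
    # measurement degenerates to the cost of cloning mutated args.
    with torch.cuda.device(device_index):
        return do_bench(fn)
```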
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135531
Approved by: https://github.com/jansel
This PR adds a private API `_set_unshard_async_op` that allows for running pre-forward and pre-backward all-gathers using the `async_op=True` path so that all-gather allocations happen in the default stream to avoid inter-stream fragmentation.
If using this option, forward requires explicit prefetching e.g. via the `unshard(async_op=True)` API for overlap. fp32 -> bf16 casts and the all-gather copy-in will not overlap with compute.
Differential Revision: [D62401551](https://our.internmc.facebook.com/intern/diff/D62401551)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135523
Approved by: https://github.com/weifengpy
Summary:
In S445839, we had HTA break because of the "stream" parameter that was added to gpu traces. This brought up discussions regarding hardening our post-processing of said inputs so as not to break the JSON schema or downstream tools. For this reason, this diff does the following (a small sketch follows the list).
1. Only allow int, double, bool and string values to be processed as kwinputs for JSON output. We can handle lists if needed in the future.
2. Make sure that any boolean is lowercase when a string so that the JSON does not break when parsing it
3. Force stream parameter to be an int
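A small illustrative sanitizer matching those rules (the helper name is hypothetical, not the actual profiler code):
```python
def sanitize_kwinputs(kwinputs):
    out = {}
    for key, value in kwinputs.items():
        if key == "stream":
            out[key] = int(value)              # rule 3: force stream to be an int
        elif isinstance(value, bool):
            out[key] = str(value).lower()      # rule 2: "true"/"false", never "True"
        elif isinstance(value, (int, float, str)):
            out[key] = value                   # rule 1: only simple scalar types pass through
    return out
```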
Test Plan: Added unit tests to ensure that the list of requirements above is true for kwargs only.
Differential Revision: D62304843
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135365
Approved by: https://github.com/aaronenyeshi
Summary: This test is flaky when run after `test_dynamic_shapes_persistent_reduction_mixed_x_dim_cuda_cuda_wrapper` because the TestCase sets config options globally in its setUp() that stick around for subsequent tests. For test isolation, we use a contextlib.ExitStack pattern in other tests to patch the config options and restore them in tearDown(). Update all TestCases in `test/inductor/test_combo_kernels.py` to use that pattern.
Test Plan:
```
python test/inductor/test_combo_kernels.py
python test/inductor/test_cuda_cpp_wrapper.py TestCudaWrapper.test_dynamic_shapes_persistent_reduction_mixed_x_dim_cuda_cuda_wrapper TestCudaWrapper.test_randint_cuda_cuda_wrapper
```
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135370
Approved by: https://github.com/jansel
## Description
Fixes the FP32 accuracy failure of `resmlp_12_224` and BF16 accuracy failure of `volo_d1_224` in timm.
In this PR, we check whether input is contiguous using the following way:
If it has `FixedLayout`, we know the accurate strides. For `FlexibleLayout`, if its data is a `ComputedBuffer`, we could get the fill order of the buffer to decide whether it's contiguous. For the other cases, we won't use GEMM template as we can't infer whether it's contiguous.
## Additional context
The current GEMM template only supports this case: `input.get_stride()[-1] == 1`. In `resmlp_12_224`, when we run into this check, the layout of `input` is a `FlexibleLayout`. The reason is that when realizing the input which is a `View` IR, the `convert_to_reinterpret_view` call fails:
d14fe3ffed/torch/_inductor/ir.py (L4712-L4715)
And it finally runs into this `copy_input` and returns a `FlexibleLayout`.
d14fe3ffed/torch/_inductor/ir.py (L4722)
When checking its stride, this `FlexibleLayout` indeed satisfies `input.get_stride()[-1] == 1` but it is later decided as a `FixedLayout` with `size = (3072, 196), stride = (1, 3072)`, which is not supported by the GEMM template, thus causing accuracy issue in this model.
The `FlexibleLayout` is converted to `FixedLayout` during [CppPackedGemmTemplate.add_choices](d14fe3ffed/torch/_inductor/mkldnn_lowerings.py (L1051)) which calls [slice_nd](d14fe3ffed/torch/_inductor/codegen/cpp_template_kernel.py (L150)) when rendering the kernel (`slice_nd(X)`). When creating the `SliceView` IR, [as_storage_and_layout](d14fe3ffed/torch/_inductor/ir.py (L2288)) invokes
[decide_layout](d14fe3ffed/torch/_inductor/ir.py (L2135)) and converts it to a `FixedLayout` with `size = (3072, 196), stride = (1, 3072)`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/134982
Approved by: https://github.com/jgong5, https://github.com/leslie-fang-intel, https://github.com/jansel
Fixes #135432
In the current implementation, if we try to store a symbolic number in Tensor's constructor, it assumes that the tensor's dtype and the symbolic number's type are matched, which is not the case.
In other words, if we try to store a `SymInt`, current implementation assumes tensor's dtype is `torch.int32`, `torch.int64` or something. And if we try to store a `SymFloat`, it assumes tensor's dtype is `torch.float32` or `torch.float64`. However, the tensor's dtype could also be `torch.float32` or something else when we try to store `SymInt`, which would be wrong.
This PR stores symbolic numbers according to the tensor's scalar type by wrapping the `SymInt` or `SymFloat`'s guarded number into a PyObject.
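An illustrative way such a symbolic number can reach a tensor constructor under dynamic shapes (this is a sketch, not necessarily the repro from #135432):
```python
import torch

def f(x):
    n = x.shape[0]                                      # a SymInt under dynamic-shape compilation
    return torch.tensor(n, dtype=torch.float32) + x.sum()

# The stored value should follow the tensor's scalar type (float32), not the symbol's type.
print(torch.compile(f, dynamic=True)(torch.randn(5)))
```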
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135433
Approved by: https://github.com/ezyang
Fixes two things:
- For regular PyTorch ops, the default layout constraint tag is always
flexible_layout. This was a bug with #135238
- Mark the new quantized _wrapped_linear_prepack ops as flexible_layout.
The metas for these are incorrect, I didn't want to fix them (and
changing the default requires the metas actually be correct).
Test Plan:
- The next PR up in the stack. The PRs are split because the next one is
riskier.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135391
Approved by: https://github.com/albanD
This PR extends the current ring attention to support load-balancing shards -- the context/sequence is divided into `2 * world_size` shards and each rank gets `rank` and `(world_size * 2 - rank - 1)` shards. The data re-shuffling is done in the `context_parallel` API.
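A tiny helper illustrating the shard assignment described above (purely illustrative):
```python
def load_balanced_shards(rank, world_size):
    # With 2 * world_size shards, each rank pairs an "early" and a "late" shard so
    # the causal-attention work is roughly even across ranks.
    return [rank, 2 * world_size - 1 - rank]

for r in range(4):
    print(r, load_balanced_shards(r, 4))  # 0:[0, 7]  1:[1, 6]  2:[2, 5]  3:[3, 4]
```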
Pull Request resolved: https://github.com/pytorch/pytorch/pull/132442
Approved by: https://github.com/wconstab
Fix #134686.
PR https://github.com/pytorch/pytorch/pull/132729 makes GEMM template faster for one of the GEMMs in xcit_large_24_p8_224:
SingleProcess AUTOTUNE benchmarking takes 1.7088 seconds and 1.9207 seconds precompiling
AUTOTUNE linear_unary(12544x3072, 768x3072, 768)
cpp_packed_gemm_2 2.9371 ms 100.0%
_linear_pointwise 3.1584 ms 93.0%
But it is slower than Aten in the e2e run due to different cache behavior. The access to the input data (12544x3072) is LLC latency bound and bottlenecks seen due to the memory synchronization (data transfers and coherence updates across processors). This PR tries to mitigate the problem by cooperatively loading different chunks of input data from different processors that share the input data.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135438
Approved by: https://github.com/leslie-fang-intel
Fix example: Address broadcasting error in the addition of `attn_bias` and `attn_mask`, and correct device assignment for newly created variables in the method.
1. Adding `attn_bias += attn_mask` results in a broadcasting error. The expected shape of `attn_bias` is (L, S), so the output should also have the shape (L, S). However, when the input shape is (N, num_heads, L, S), broadcasting occurs, leading to an output shape of (N, num_heads, L, S), which is not desired.
2. `attn_bias` is a newly created variable within the method, but it is not assigned to the correct device.
**This is my retry of PR #130209 . The PR has been merged into commit `d4a79d4a7c746068d25fe5cf9333495561f4ce1f`, but the modifications were overwritten by subsequent commits.**
Co-authored-by: mikaylagawarecki <mikaylagawarecki@gmail.com>
@mikaylagawarecki provided a more elegant implementation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/135427
Approved by: https://github.com/ezyang
if ! lintrunner --force-color --all-files --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null;then
if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null;then
echo""
echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner -m origin/main\`. (If you don't get the same results, run \'lintrunner init\' to update your local linter)\e[0m"
echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"
f"Based on rollout percentage of {experiment_settings.rollout_perc}%, enabling experiment {experiment_name}."
)
enabled=True
exceptExceptionase:
ifenabled:
label=experiment_name
ifexperiment_name==LF_FLEET_EXPERIMENT:
# We give some special treatment to the "lf" experiment since determines the fleet we use
# - If it's enabled, then we always list it's prefix first
# - If we're in the canary branch, then we append ".c" to the lf prefix
ifis_canary:
label+=CANARY_FLEET_SUFFIX
fleet_prefix=label
else:
prefixes.append(label)
iflen(prefixes)>1:
log.error(
f'Failed to determine if user has opted-in to feature {feature}. Using fallback "{fallback}". Exception: {e}'
f"Only a fleet and one other experiment can be enabled for a job at any time. Enabling {prefixes[0]} and ignoring the rest, which are {', '.join(prefixes[1:])}"
Used to dynamically opt jobs in to specific runner-type variants.
settings = parse_settings(rollout_state)
user_optins = parse_users(rollout_state)
Returns:
The runner-type's variant name if the user has opted in to the feature, otherwise returns an empty string.
This variant name is prefixed to the runner-type in the label.
"""
try:
userlist = {u.lstrip("#").strip("\n\t@ ") for u in rollout_state.split()}
all_opted_in_users = set()
for user in userlist:
for i in user.split(","):
if i == feature:
all_opted_in_users.add(user.split(",")[0])
opted_in_requestors = {
usr for usr in workflow_requestors if usr in all_opted_in_users
}
fleet_prefix = ""
prefixes = []
for experiment_name, experiment_settings in settings.experiments.items():
enabled = False
if opted_in_requestors:
# Is any workflow_requestor opted in to this experiment?
opted_in_users = [
requestor
for requestor in workflow_requestors
if is_user_opted_in(requestor, user_optins, experiment_name)
]
if opted_in_users:
log.info(
f"Feature {feature} is enabled for {', '.join(opted_in_requestors)}. Using feature {feature}."
f"{', '.join(opted_in_users)} have opted into experiment {experiment_name}."
)
return feature
else:
log.info(
f"Feature {feature} is disabled for {', '.join(workflow_requestors)}. Using fallback \"{fallback}\"."
)
return fallback
enabled = True
elif experiment_settings.rollout_perc:
# If no user is opted in, then we randomly enable the experiment based on the rollout percentage
if random.uniform(0, 100) <= experiment_settings.rollout_perc:
log.info(
f"Based on rollout percentage of {experiment_settings.rollout_perc}%, enabling experiment {experiment_name}."
)
enabled = True
except Exception as e:
if enabled:
label = experiment_name
if experiment_name == LF_FLEET_EXPERIMENT:
# We give some special treatment to the "lf" experiment since determines the fleet we use
# - If it's enabled, then we always list it's prefix first
# - If we're in the canary branch, then we append ".c" to the lf prefix
if is_canary:
label += CANARY_FLEET_SUFFIX
fleet_prefix = label
else:
prefixes.append(label)
if len(prefixes) > 1:
log.error(
f'Failed to determine if user has opted-in to feature {feature}. Using fallback "{fallback}". Exception: {e}'
f"Only a fleet and one other experiment can be enabled for a job at any time. Enabling {prefixes[0]} and ignoring the rest, which are {', '.join(prefixes[1:])}"
)
return fallback
prefixes = prefixes[:1]
# Fleet always comes first
if fleet_prefix:
prefixes.insert(0, fleet_prefix)
return ".".join(prefixes) + "." if prefixes else ""
- [Building PyTorch with ASAN](#building-pytorch-with-asan)
- [Getting `ccache` to work](#getting-ccache-to-work)
@@ -1132,38 +1131,6 @@ CUDA, MSVC, and PyTorch versions are interdependent; please install matching ver
Note: There's a [compilation issue](https://github.com/oneapi-src/oneDNN/issues/812) in several Visual Studio 2019 versions since 16.7.1, so please make sure your Visual Studio 2019 version is not in 16.7.1 ~ 16.7.5
## Running clang-tidy
[Clang-Tidy](https://clang.llvm.org/extra/clang-tidy/index.html) is a C++
linter and static analysis tool based on the clang compiler. We run clang-tidy
in our CI to make sure that new C++ code is safe, sane and efficient. See the
@@ -161,9 +161,34 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv)
#### Prerequisites
If you are installing from source, you will need:
- Python 3.8 or later (for Linux, Python 3.8.1+ is needed)
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required)
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required, on Linux)
- Visual Studio or Visual Studio Build Tool on Windows
We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.
\* PyTorch CI uses Visual C++ BuildTools, which come with Visual Studio Enterprise,
Professional, or Community Editions. You can also install the build tools from
https://visualstudio.microsoft.com/visual-cpp-build-tools/. The build tools *do not*
come with Visual Studio Code by default.
\* We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.
If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following:
@@ -194,12 +219,23 @@ If you want to compile with Intel GPU support, follow these
If you want to disable Intel GPU support, export the environment variable `USE_XPU=0`.
Other potentially useful environment variables may be found in `setup.py`.
PyTorch CI uses Visual C++ BuildTools, which come with Visual Studio Enterprise,
Professional, or Community Editions. You can also install the build tools from
https://visualstudio.microsoft.com/visual-cpp-build-tools/. The build tools *do not*
come with Visual Studio Code by default.
If you want to build legacy python code, please refer to [Building on legacy code and CUDA](https://github.com/pytorch/pytorch/blob/main/CONTRIBUTING.md#building-on-legacy-code-and-cuda)
**CPU-only builds**
@@ -298,7 +318,6 @@ If you want to build legacy python code, please refer to [Building on legacy cod
In this mode PyTorch computations will run on your CPU, not your GPU
@@ -380,7 +382,7 @@ Patch release process takes around 4-5 weeks to complete.
### Issue Tracker for Patch releases
For patch releases issue tracker needs to be created. For patch release, we require all cherry-pick changes to have links to either a high-priority GitHub issue or a CI failure from previous RC. An example of this would look like: