Update (base update)

[ghstack-poisoned]
This commit is contained in:
yanbing-j
2024-10-10 14:33:09 +08:00
1444 changed files with 34310 additions and 31446 deletions

View File

@ -21,6 +21,3 @@
cxx = /usr/bin/clang++
cxxpp = /usr/bin/clang++
ld = /usr/bin/clang++
[project]
default_flavors_mode=all

View File

@ -291,7 +291,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.0
ROCM_VERSION=6.1
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
@ -302,7 +302,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.1
ROCM_VERSION=6.2
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
@ -355,6 +355,12 @@ case "$image" in
CONDA_CMAKE=yes
VISION=yes
;;
pytorch-linux-jammy-py3-clang18-asan)
ANACONDA_PYTHON_VERSION=3.10
CLANG_VERSION=18
CONDA_CMAKE=yes
VISION=yes
;;
pytorch-linux-jammy-py3.9-gcc11)
ANACONDA_PYTHON_VERSION=3.9
GCC_VERSION=11
@ -379,6 +385,14 @@ case "$image" in
GCC_VERSION=11
CONDA_CMAKE=yes
HALIDE=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.12-triton-cpu)
CUDA_VERSION=12.4
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
CONDA_CMAKE=yes
TRITON_CPU=yes
;;
pytorch-linux-focal-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
@ -509,6 +523,7 @@ docker build \
--build-arg "UCC_COMMIT=${UCC_COMMIT}" \
--build-arg "CONDA_CMAKE=${CONDA_CMAKE}" \
--build-arg "TRITON=${TRITON}" \
--build-arg "TRITON_CPU=${TRITON_CPU}" \
--build-arg "ONNX=${ONNX}" \
--build-arg "DOCS=${DOCS}" \
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \

View File

@ -0,0 +1 @@
6a333f1b05671f6fada4ba7bbfae4a02a9d96f4f

View File

@ -1 +1 @@
5fe38ffd73c2ac6ed6323b554205186696631c6f
cf34004b8a67d290a962da166f5aa2fc66751326

View File

@ -13,11 +13,17 @@ if [ -n "$CLANG_VERSION" ]; then
elif [[ $UBUNTU_VERSION == 22.04 ]]; then
# work around ubuntu apt-get conflicts
sudo apt-get -y -f install
wget --no-check-certificate -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
if [[ $CLANG_VERSION == 18 ]]; then
apt-add-repository "deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-18 main"
fi
fi
sudo apt-get update
apt-get install -y --no-install-recommends clang-"$CLANG_VERSION"
apt-get install -y --no-install-recommends llvm-"$CLANG_VERSION"
apt-get install -y --no-install-recommends clang-"$CLANG_VERSION" llvm-"$CLANG_VERSION"
if [[ $CLANG_VERSION == 18 ]]; then
apt-get install -y --no-install-recommends libomp-18-dev
fi
# Install dev version of LLVM.
if [ -n "$LLVMDEV" ]; then

View File

@ -105,7 +105,7 @@ function install_121 {
}
function install_124 {
echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
rm -rf /usr/local/cuda-12.4 /usr/local/cuda
# install CUDA 12.4.1 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run

View File

@ -5,19 +5,19 @@ set -ex
NCCL_VERSION=v2.21.5-1
function install_cusparselt_052 {
function install_cusparselt_062 {
# cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && pushd tmp_cusparselt
wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
tar xf libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/include/* /usr/local/cuda/include/
cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.6.2.3-archive.tar.xz
tar xf libcusparse_lt-linux-sbsa-0.6.2.3-archive.tar.xz
cp -a libcusparse_lt-linux-sbsa-0.6.2.3-archive/include/* /usr/local/cuda/include/
cp -a libcusparse_lt-linux-sbsa-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/
popd
rm -rf tmp_cusparselt
}
function install_124 {
echo "Installing CUDA 12.4.1 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2"
echo "Installing CUDA 12.4.1 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.6.2"
rm -rf /usr/local/cuda-12.4 /usr/local/cuda
# install CUDA 12.4.1 in the same container
wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux_sbsa.run
@ -44,7 +44,7 @@ function install_124 {
cd ..
rm -rf nccl
install_cusparselt_052
install_cusparselt_062
ldconfig
}

View File

@ -5,7 +5,7 @@ set -ex
# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && cd tmp_cusparselt
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[2-4]$ ]]; then
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[2-6]$ ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
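For reference, the widened guard compares only the first four characters of CUDA_VERSION against 12.2 through 12.6. A standalone check over a few illustrative values behaves as follows:

```
# Illustrative only: exercise the widened version guard on sample values.
for CUDA_VERSION in 12.1 12.4 12.4.1 12.6 12.7; do
  if [[ ${CUDA_VERSION:0:4} =~ ^12\.[2-6]$ ]]; then
    echo "$CUDA_VERSION -> matches"
  else
    echo "$CUDA_VERSION -> no match"
  fi
done
# 12.4, 12.4.1 and 12.6 match; 12.1 and 12.7 do not.
```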

View File

@ -10,6 +10,21 @@ if [[ -z $ROCM_VERSION ]]; then
exit 1;
fi
IS_UBUNTU=0
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
ubuntu)
IS_UBUNTU=1
;;
centos)
IS_UBUNTU=0
;;
*)
echo "Unable to determine OS..."
exit 1
;;
esac
# To make version comparison easier, create an integer representation.
save_IFS="$IFS"
IFS=. ROCM_VERSION_ARRAY=(${ROCM_VERSION})
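The `$ROCM_INT` checks further down (60200, 60300, ...) compare against this integer form. A minimal sketch of how the dotted version maps to that integer, assuming the major*10000 + minor*100 + patch scheme those thresholds imply:

```
# Sketch only: derive a comparable integer from a dotted ROCm version,
# assuming the major*10000 + minor*100 + patch scheme implied by checks
# such as `[[ $ROCM_INT -ge 60200 ]]` (6.2.0 -> 60200, 6.3.1 -> 60301).
ROCM_VERSION=6.2.0   # example value
IFS=. read -r MAJOR MINOR PATCH <<< "$ROCM_VERSION"
ROCM_INT=$(( MAJOR * 10000 + MINOR * 100 + ${PATCH:-0} ))
echo "ROCM_VERSION=$ROCM_VERSION -> ROCM_INT=$ROCM_INT"
```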
@ -57,9 +72,11 @@ MIOPEN_CMAKE_COMMON_FLAGS="
-DMIOPEN_BUILD_DRIVER=OFF
"
# Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version
if [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60300 ]]; then
echo "ROCm 6.2 MIOpen does not need any patches, do not build from source"
if [[ $ROCM_INT -ge 60300 ]]; then
echo "ROCm 6.3+ MIOpen does not need any patches, do not build from source"
exit 0
elif [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60300 ]]; then
MIOPEN_BRANCH="release/rocm-rel-6.2-staging"
elif [[ $ROCM_INT -ge 60100 ]] && [[ $ROCM_INT -lt 60200 ]]; then
echo "ROCm 6.1 MIOpen does not need any patches, do not build from source"
exit 0
@ -93,12 +110,21 @@ else
exit 1
fi
yum remove -y miopen-hip
if [[ ${IS_UBUNTU} == 1 ]]; then
apt-get remove -y miopen-hip
else
yum remove -y miopen-hip
fi
git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH}
pushd MIOpen
# remove .git to save disk space since CI runner was running out
rm -rf .git
# Don't build CK to save docker build time
if [[ $ROCM_INT -ge 60200 ]]; then
sed -i '/composable_kernel/d' requirements.txt
fi
# Don't build MLIR to save docker build time
# since we are disabling MLIR backend for MIOpen anyway
if [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then
@ -111,10 +137,15 @@ cmake -P install_deps.cmake --minimum
# clean up since CI runner was running out of disk space
rm -rf /tmp/*
yum clean all
rm -rf /var/cache/yum
rm -rf /var/lib/yum/yumdb
rm -rf /var/lib/yum/history
if [[ ${IS_UBUNTU} == 1 ]]; then
apt-get autoclean && apt-get clean
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
else
yum clean all
rm -rf /var/cache/yum
rm -rf /var/lib/yum/yumdb
rm -rf /var/lib/yum/history
fi
## Build MIOpen
mkdir -p build
@ -131,7 +162,11 @@ make -j $(nproc) package
# clean up since CI runner was running out of disk space
rm -rf /usr/local/cget
yum install -y miopen-*.rpm
if [[ ${IS_UBUNTU} == 1 ]]; then
sudo dpkg -i miopen-hip*.deb
else
yum install -y miopen-*.rpm
fi
popd
rm -rf MIOpen

View File

@ -32,7 +32,7 @@ pip_install coloredlogs packaging
pip_install onnxruntime==1.18.1
pip_install onnx==1.16.2
pip_install onnxscript==0.1.0.dev20240831 --no-deps
pip_install onnxscript==0.1.0.dev20241008 --no-deps
# required by onnxscript
pip_install ml_dtypes

View File

@ -15,8 +15,11 @@ conda_reinstall() {
if [ -n "${XPU_VERSION}" ]; then
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
TRITON_TEXT_FILE="triton-xpu"
elif [ -n "${TRITON_CPU}" ]; then
TRITON_REPO="https://github.com/triton-lang/triton-cpu"
TRITON_TEXT_FILE="triton-cpu"
else
TRITON_REPO="https://github.com/openai/triton"
TRITON_REPO="https://github.com/triton-lang/triton"
TRITON_TEXT_FILE="triton"
fi
@ -44,9 +47,10 @@ chown -R jenkins /var/lib/jenkins/triton
chgrp -R jenkins /var/lib/jenkins/triton
pushd /var/lib/jenkins/
as_jenkins git clone ${TRITON_REPO} triton
as_jenkins git clone --recursive ${TRITON_REPO} triton
cd triton
as_jenkins git checkout ${TRITON_PINNED_COMMIT}
as_jenkins git submodule update --init --recursive
cd python
# TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527

View File

@ -37,6 +37,12 @@ esac
(
set -x
# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
sudo systemctl daemon-reload
sudo systemctl restart docker
docker build \
--target final \
--progress plain \
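If you need to confirm the override took effect, one illustrative check (not part of the CI script) is to query the unit property after the restart:

```
# Illustrative verification, not part of the CI script: after the sed patch
# and restart above, the docker unit should no longer report "infinity".
systemctl show docker --property=LimitNOFILE
# expected on a patched host: LimitNOFILE=1048576
```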

View File

@ -10,6 +10,7 @@ ENV LANG en_US.UTF-8
ENV LANGUAGE en_US.UTF-8
ARG DEVTOOLSET_VERSION=9
# Note: This patch is required since CentOS has reached EOL
# otherwise any yum install step will fail
RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo

View File

@ -124,7 +124,14 @@ if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then
fi
(
set -x
DOCKER_BUILDKIT=1 docker build \
# TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712
# is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023.
sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service
sudo systemctl daemon-reload
sudo systemctl restart docker
DOCKER_BUILDKIT=1 docker build \
${DOCKER_GPU_BUILD_ARG} \
--build-arg "GPU_IMAGE=${GPU_IMAGE}" \
--target "${TARGET}" \

View File

@ -139,9 +139,9 @@ opt-einsum==3.3
#Pinned versions: 3.3
#test that import: test_linalg.py
optree==0.12.1
optree==0.13.0
#Description: A library for tree manipulation
#Pinned versions: 0.12.1
#Pinned versions: 0.13.0
#test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
#test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py,
#common_utils.py, test_eager_transforms.py, test_python_dispatch.py,

View File

@ -68,6 +68,8 @@ RUN rm install_rocm.sh
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh
RUN rm install_rocm_magma.sh
ADD ./common/install_miopen.sh install_miopen.sh
RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh
ENV ROCM_PATH /opt/rocm
ENV PATH /opt/rocm/bin:$PATH
ENV PATH /opt/rocm/hcc/bin:$PATH
@ -121,5 +123,8 @@ RUN bash ./install_cache.sh && rm install_cache.sh
ARG BUILD_ENVIRONMENT
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
# Install LLVM dev version (Defined in the pytorch/builder github repository)
COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
USER jenkins
CMD ["bash"]

View File

@ -147,6 +147,13 @@ COPY ci_commit_pins/triton.txt triton.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton.txt
ARG TRITON_CPU
COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton-cpu.txt triton-cpu.txt
RUN if [ -n "${TRITON_CPU}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-cpu.txt
ARG EXECUTORCH
# Build and install executorch
COPY ./common/install_executorch.sh install_executorch.sh

View File

@ -49,13 +49,8 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
fi
# Enable LLVM dependency for TensorExpr testing
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
export USE_LLVM=/opt/rocm/llvm
export LLVM_DIR=/opt/rocm/llvm/lib/cmake/llvm
else
export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
fi
export USE_LLVM=/opt/llvm
export LLVM_DIR=/opt/llvm/lib/cmake/llvm
if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then
# To build test_edge_op_registration
@ -183,7 +178,7 @@ fi
# sccache will fail for CUDA builds if all cores are used for compiling
# gcc 7 with sccache seems to have intermittent OOM issue if all cores are used
if [ -z "$MAX_JOBS" ]; then
if { [[ "$BUILD_ENVIRONMENT" == *cuda* ]] || [[ "$BUILD_ENVIRONMENT" == *gcc7* ]]; } && which sccache > /dev/null; then
if { [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; } && which sccache > /dev/null; then
export MAX_JOBS=$(($(nproc) - 1))
fi
fi
@ -223,10 +218,6 @@ if [[ "${BUILD_ENVIRONMENT}" == *-pch* ]]; then
export USE_PRECOMPILED_HEADERS=1
fi
if [[ "${BUILD_ENVIRONMENT}" == *linux-focal-py3.7-gcc7-build* ]]; then
export USE_GLOO_WITH_OPENSSL=ON
fi
if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]; then
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
fi
@ -237,7 +228,7 @@ fi
# Do not change workspace permissions for ROCm CI jobs
# as it can leave workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* ]]; then
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
cleanup_workspace() {
@ -283,6 +274,7 @@ else
# set only when building other architectures
# or building non-XLA tests.
if [[ "$BUILD_ENVIRONMENT" != *rocm* &&
"$BUILD_ENVIRONMENT" != *s390x* &&
"$BUILD_ENVIRONMENT" != *xla* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
# Install numpy-2.0.2 for builds which are backward compatible with 1.X
@ -345,11 +337,11 @@ else
CUSTOM_OP_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-op-build"
CUSTOM_OP_TEST="$PWD/test/custom_operator"
python --version
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
SITE_PACKAGES="$(python -c 'import site; print(";".join([x for x in site.getsitepackages()] + [x + "/torch" for x in site.getsitepackages()]))')"
mkdir -p "$CUSTOM_OP_BUILD"
pushd "$CUSTOM_OP_BUILD"
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
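The new SITE_PACKAGES value already contains every site-packages directory plus its torch/ subdirectory, joined with semicolons, so the old "$SITE_PACKAGES/torch;$SITE_PACKAGES" prefix is no longer needed. A standalone run of the same command (output paths are illustrative and depend on the interpreter) shows the string handed to CMAKE_PREFIX_PATH:

```
# Illustrative only: print the semicolon-joined prefix path the new command builds.
python -c 'import site; print(";".join([x for x in site.getsitepackages()] + [x + "/torch" for x in site.getsitepackages()]))'
# e.g. /usr/lib/python3.10/site-packages;/usr/lib/python3.10/site-packages/torch
```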
@ -359,10 +351,10 @@ else
JIT_HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build"
JIT_HOOK_TEST="$PWD/test/jit_hooks"
python --version
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
SITE_PACKAGES="$(python -c 'import site; print(";".join([x for x in site.getsitepackages()] + [x + "/torch" for x in site.getsitepackages()]))')"
mkdir -p "$JIT_HOOK_BUILD"
pushd "$JIT_HOOK_BUILD"
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@ -374,7 +366,7 @@ else
python --version
mkdir -p "$CUSTOM_BACKEND_BUILD"
pushd "$CUSTOM_BACKEND_BUILD"
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@ -407,6 +399,6 @@ fi
# snadampal: skipping it till sccache support added for aarch64
# https://github.com/pytorch/pytorch/issues/121559
if [[ "$BUILD_ENVIRONMENT" != *aarch64* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *aarch64* && "$BUILD_ENVIRONMENT" != *s390x* ]]; then
print_sccache_stats
fi

View File

@ -191,9 +191,22 @@ function install_torchrec_and_fbgemm() {
pip_uninstall torchrec-nightly
pip_uninstall fbgemm-gpu-nightly
pip_install setuptools-git-versioning scikit-build pyre-extensions
# TODO (huydhn): I still have no clue on why sccache doesn't work with only fbgemm_gpu here, but it
# seems to be an sccache-related issue
if [[ "$IS_A100_RUNNER" == "1" ]]; then
unset CMAKE_CUDA_COMPILER_LAUNCHER
sudo mv /opt/cache/bin /opt/cache/bin-backup
fi
# See https://github.com/pytorch/pytorch/issues/106971
CUDA_PATH=/usr/local/cuda-12.1 pip_install --no-use-pep517 --user "git+https://github.com/pytorch/FBGEMM.git@${fbgemm_commit}#egg=fbgemm-gpu&subdirectory=fbgemm_gpu"
pip_install --no-use-pep517 --user "git+https://github.com/pytorch/torchrec.git@${torchrec_commit}"
if [[ "$IS_A100_RUNNER" == "1" ]]; then
export CMAKE_CUDA_COMPILER_LAUNCHER=/opt/cache/bin/sccache
sudo mv /opt/cache/bin-backup /opt/cache/bin
fi
}
function clone_pytorch_xla() {

View File

@ -1,4 +1,4 @@
from datetime import datetime, timedelta
from datetime import datetime, timedelta, timezone
from tempfile import mkdtemp
from cryptography import x509
@ -42,10 +42,10 @@ def create_cert(path, C, ST, L, O, key):
.issuer_name(issuer)
.public_key(key.public_key())
.serial_number(x509.random_serial_number())
.not_valid_before(datetime.utcnow())
.not_valid_before(datetime.now(timezone.utc))
.not_valid_after(
# Our certificate will be valid for 10 days
datetime.utcnow()
datetime.now(timezone.utc)
+ timedelta(days=10)
)
.add_extension(
@ -88,10 +88,10 @@ def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key):
.issuer_name(ca_cert.subject)
.public_key(csr_cert.public_key())
.serial_number(x509.random_serial_number())
.not_valid_before(datetime.utcnow())
.not_valid_before(datetime.now(timezone.utc))
.not_valid_after(
# Our certificate will be valid for 10 days
datetime.utcnow()
datetime.now(timezone.utc)
+ timedelta(days=10)
# Sign our certificate with our private key
)

View File

@ -375,9 +375,8 @@ test_inductor_cpp_wrapper_abi_compatible() {
mkdir -p "$TEST_REPORTS_DIR"
echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
# cpu stack allocation causes segfault and needs more investigation
PYTORCH_TESTING_DEVICE_ONLY_FOR="" python test/run_test.py --include inductor/test_cpu_cpp_wrapper
python test/run_test.py --include inductor/test_cuda_cpp_wrapper
python test/run_test.py --include inductor/test_cuda_cpp_wrapper inductor/test_cpu_repro inductor/test_extension_backend
TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
--training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
@ -404,7 +403,7 @@ pr_time_benchmarks() {
PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks source benchmarks/dynamo/pr_time_benchmarks/benchmark_runner.sh "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" "benchmarks/dynamo/pr_time_benchmarks/benchmarks"
echo "benchmark results on current PR: "
cat "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv"
PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks python benchmarks/dynamo/pr_time_benchmarks/check_results.py "benchmarks/dynamo/pr_time_benchmarks/expected_results.csv" "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" "$TEST_REPORTS_DIR/new_expected_results.csv"
}
if [[ "${TEST_CONFIG}" == *pr_time_benchmarks* ]]; then
@ -607,6 +606,11 @@ test_inductor_halide() {
assert_git_not_dirty
}
test_inductor_triton_cpu() {
python test/run_test.py --include inductor/test_triton_cpu_backend.py --verbose
assert_git_not_dirty
}
test_dynamo_benchmark() {
# Usage: test_dynamo_benchmark huggingface 0
TEST_REPORTS_DIR=$(pwd)/test/test-reports
@ -661,15 +665,6 @@ test_inductor_torchbench_smoketest_perf() {
# The threshold value needs to be actively maintained to make this check useful
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
TORCHINDUCTOR_ABI_COMPATIBLE=1 python benchmarks/dynamo/torchbench.py --device cuda --performance --bfloat16 --inference \
--export-aot-inductor --only nanogpt --output "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv"
# The threshold value needs to be actively maintained to make this check useful
# The perf number of nanogpt seems not very stable, e.g.
# https://github.com/pytorch/pytorch/actions/runs/7158691360/job/19491437314,
# and thus we lower its threshold to reduce flakiness. If this continues to be a problem,
# we switch to use some other model.
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.9
# Check memory compression ratio for a few models
for test in hf_Albert timm_vision_transformer; do
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --amp --training \
@ -713,6 +708,10 @@ test_inductor_set_cpu_affinity(){
export KMP_BLOCKTIME=1
fi
cores=$(test_inductor_get_core_number)
# Set number of cores to 16 on Aarch64 for performance runs.
if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then
cores=16
fi
export OMP_NUM_THREADS=$cores
end_core=$((cores-1))
export TASKSET="taskset -c 0-$end_core"
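With the aarch64 cap, a larger Graviton runner is pinned to 16 threads. A worked example, assuming a hypothetical 64-core aarch64 runner:

```
# Worked example assuming a hypothetical 64-core aarch64 runner.
TEST_CONFIG="inductor_aarch64"; cores=64   # illustrative values
if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then
  cores=16
fi
end_core=$((cores-1))
echo "OMP_NUM_THREADS=$cores TASKSET='taskset -c 0-$end_core'"
# prints: OMP_NUM_THREADS=16 TASKSET='taskset -c 0-15'
```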
@ -1402,7 +1401,7 @@ test_linux_aarch64() {
inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \
inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \
inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \
inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes \
inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes inductor/test_memory \
--shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose
}
@ -1436,6 +1435,8 @@ elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
test_inductor_halide
elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
test_inductor_triton_cpu
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
@ -1459,7 +1460,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
# https://github.com/opencv/opencv-python/issues/885
pip_install opencv-python==4.8.0.74
if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
checkout_install_torchbench hf_Bert hf_Albert nanogpt timm_vision_transformer
checkout_install_torchbench hf_Bert hf_Albert timm_vision_transformer
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \

View File

@ -26,7 +26,7 @@ fi
export SCRIPT_HELPERS_DIR=$SCRIPT_PARENT_DIR/win-test-helpers
set +ex
grep -E -R 'PyLong_(From|As)(Unsigned|)Long\(' --exclude=python_numbers.h --exclude=eval_frame.c torch/
grep -E -R 'PyLong_(From|As)(Unsigned|)Long\(' --exclude=python_numbers.h --exclude=pythoncapi_compat.h --exclude=eval_frame.c torch/
PYLONG_API_CHECK=$?
if [[ $PYLONG_API_CHECK == 0 ]]; then
echo "Usage of PyLong_{From,As}{Unsigned}Long API may lead to overflow errors on Windows"

View File

@ -27,12 +27,11 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
source activate testenv >/dev/null
elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
python_path="/opt/python/cp\$python_nodot-cp\${python_nodot}"
# Prior to Python 3.8 paths were suffixed with an 'm'
if [[ -d "\${python_path}/bin" ]]; then
export PATH="\${python_path}/bin:\$PATH"
elif [[ -d "\${python_path}m/bin" ]]; then
export PATH="\${python_path}m/bin:\$PATH"
if [[ "\$python_nodot" = *t ]]; then
python_digits="\$(echo $DESIRED_PYTHON | tr -cd [:digit:])"
python_path="/opt/python/cp\$python_digits-cp\${python_digits}t"
fi
export PATH="\${python_path}/bin:\$PATH"
fi
EXTRA_CONDA_FLAGS=""
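For the new free-threaded wheels, a DESIRED_PYTHON value such as "3.13t" is reduced to its digits and mapped onto the cp313-cp313t layout under /opt/python. A standalone sketch of that mapping (it assumes python_nodot is simply DESIRED_PYTHON with the dot removed, which is how it is used above):

```
# Sketch of the free-threaded interpreter path selection; values are illustrative.
DESIRED_PYTHON="3.13t"                                   # example value
python_nodot=$(echo "$DESIRED_PYTHON" | tr -d '.')       # assumed definition -> 313t
if [[ "$python_nodot" = *t ]]; then
  python_digits="$(echo "$DESIRED_PYTHON" | tr -cd '[:digit:]')"   # -> 313
  python_path="/opt/python/cp${python_digits}-cp${python_digits}t"
fi
echo "$python_path"   # /opt/python/cp313-cp313t
```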

View File

@ -44,7 +44,9 @@ ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ForEachMacros: [ FOR_EACH_RANGE, FOR_EACH, ]
ForEachMacros:
- FOR_EACH_RANGE
- FOR_EACH
IncludeCategories:
- Regex: '^<.*\.h(pp)?>'
Priority: 1
@ -58,6 +60,24 @@ IndentWrappedFunctionNames: false
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
Macros:
- >-
PyObject_HEAD_INIT(type)={
/* this does not exactly match PyObject_HEAD_INIT in the Python source code,
* but it is enough for clang-format */
{ 0xFFFFFFFF },
(type)
},
- >-
PyVarObject_HEAD_INIT(type, size)={
{
/* manually expand PyObject_HEAD_INIT(type) above
* because clang-format does not support recursive expansion */
{ 0xFFFFFFFF },
(type)
},
(size)
},
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
PenaltyBreakBeforeFirstCallParameter: 1
@ -79,7 +99,11 @@ SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
Standard: c++17
StatementMacros:
- PyObject_HEAD
- PyObject_VAR_HEAD
- PyException_HEAD
TabWidth: 8
UseTab: Never
---

View File

@ -1,38 +0,0 @@
If you have a question or would like help and support, please ask at our
[forums](https://discuss.pytorch.org/).
If you are submitting a feature request, please preface the title with [feature request].
If you are submitting a bug report, please fill in the following details.
## Issue description
Provide a short description.
## Code example
Please try to provide a minimal example to repro the bug.
Error messages and stack traces are also helpful.
## System Info
Please copy and paste the output from our
[environment collection script](https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py)
(or fill out the checklist below manually).
You can get the script and run it with:
```
wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py
# For security purposes, please check the contents of collect_env.py before running it.
python collect_env.py
```
- PyTorch or Caffe2:
- How you installed PyTorch (conda, pip, source):
- Build command you used (if compiling from source):
- OS:
- PyTorch version:
- Python version:
- CUDA/cuDNN version:
- GPU models and configuration:
- GCC version (if compiling from source):
- CMake version:
- Versions of any other relevant libraries:

View File

@ -5,7 +5,8 @@ about: Tracking incidents for PyTorch's CI infra.
> NOTE: Remember to label this issue with "`ci: sev`"
**MERGE BLOCKING** <!-- remove this line if you don't want this SEV to block merges -->
<!-- uncomment the below line if you don't want this SEV to block merges -->
<!-- **MERGE BLOCKING** -->
## Current Status
*Status could be: preemptive, ongoing, mitigated, closed. Also tell people if they need to take action to fix it (i.e. rebase)*.

View File

@ -32,30 +32,6 @@ self-hosted-runner:
- lf.linux.8xlarge.nvidia.gpu
- lf.linux.16xlarge.nvidia.gpu
- lf.linux.g5.4xlarge.nvidia.gpu
# Organization-wide AWS Linux Runners with new Amazon 2023 AMI
- amz2023.linux.large
- amz2023.linux.2xlarge
- amz2023.linux.4xlarge
- amz2023.linux.12xlarge
- amz2023.linux.24xlarge
- amz2023.linux.arm64.2xlarge
- amz2023.linux.arm64.m7g.4xlarge
- amz2023.linux.arm64.m7g.4xlarge.ephemeral
- amz2023.linux.4xlarge.nvidia.gpu
- amz2023.linux.8xlarge.nvidia.gpu
- amz2023.linux.16xlarge.nvidia.gpu
- amz2023.linux.g5.4xlarge.nvidia.gpu
# Pytorch/pytorch AWS Linux Runners with the new Amazon 2023 AMI on Linux Foundation account
- amz2023.lf.linux.large
- amz2023.lf.linux.2xlarge
- amz2023.lf.linux.4xlarge
- amz2023.lf.linux.12xlarge
- amz2023.lf.linux.24xlarge
- amz2023.lf.linux.arm64.2xlarge
- amz2023.lf.linux.4xlarge.nvidia.gpu
- amz2023.lf.linux.8xlarge.nvidia.gpu
- amz2023.lf.linux.16xlarge.nvidia.gpu
- amz2023.lf.linux.g5.4xlarge.nvidia.gpu
# Repo-specific IBM hosted S390x runner
- linux.s390x
# Organization wide AWS Windows runners

View File

@ -18,8 +18,14 @@ inputs:
runs:
using: composite
steps:
- name: Check if in a container runner
shell: bash
id: check_container_runner
run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
- name: Clean workspace
shell: bash
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
env:
NO_SUDO: ${{ inputs.no-sudo }}
run: |
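The probe above keys off two marker files: /.inarc (the existing ARC-runner marker) and the newer /.incontainer. Run by hand, purely for illustration, the same expression prints true on such a runner and false elsewhere:

```
# Standalone illustration of the container-runner probe used in the step above.
if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true; else echo false; fi
```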

View File

@ -85,15 +85,25 @@ runs:
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Check if in a ARC runner
- name: Check if in a container runner
shell: bash
id: check_arc_runner
run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"
id: check_container_runner
run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
- name: Setup GPU_FLAG for docker run
id: setup-gpu-flag
run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
id: setup-sscache-port-flag
run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
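Each container runner derives its own sccache server port from its UID, presumably so that runners sharing a host do not collide on the default port. A worked example with a hypothetical RUNNER_UID:

```
# Worked example assuming a hypothetical RUNNER_UID of 1001.
RUNNER_UID=1001
echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))"
# prints: SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=5227
```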
- name: Lock NVIDIA A100 40GB Frequency
shell: bash
@ -101,7 +111,7 @@ runs:
sudo nvidia-smi -pm 1
sudo nvidia-smi -ac 1215,1410
nvidia-smi
if: contains(matrix.runner, 'a100')
if: ${{ contains(matrix.runner, 'a100') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
- name: Start monitoring script
id: monitor-script
@ -172,6 +182,7 @@ runs:
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
@ -181,6 +192,9 @@ runs:
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
shell: bash
run: |
set -x
@ -199,6 +213,7 @@ runs:
# shellcheck disable=SC2086,SC2090
container_name=$(docker run \
${GPU_FLAG:-} \
${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e PR_NUMBER \
-e GITHUB_ACTIONS \
@ -227,6 +242,7 @@ runs:
-e PR_LABELS \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e SCCACHE_REGION \
-e SCCACHE_S3_KEY_PREFIX \
-e XLA_CUDA \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
@ -234,7 +250,9 @@ runs:
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e HUGGING_FACE_HUB_TOKEN \
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
-e DASHBOARD_TAG \
-e IS_A100_RUNNER \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
@ -305,7 +323,7 @@ runs:
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'
# NB: We are currently having an intermittent GPU-related issue on G5 runners with
# A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does

View File

@ -28,14 +28,14 @@ runs:
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: Check if in a ARC runner
- name: Check if in a container runner
shell: bash
id: check_arc_runner
run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> $GITHUB_OUTPUT
id: check_container_runner
run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
- name: Start docker if docker daemon is not running
shell: bash
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
run: |
if systemctl is-active --quiet docker; then
echo "Docker daemon is running...";
@ -73,7 +73,7 @@ runs:
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Kill any existing containers, clean up images
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
shell: bash
run: |
# ignore expansion of "docker ps -q" since it could be empty
@ -116,7 +116,7 @@ runs:
- name: Check that the docker daemon is running
shell: bash
continue-on-error: true
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'true' }}
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
run: |
set +x

View File

@ -1 +1 @@
ba696ea3dfec4cbe693bf06a84c75dc196077f5b
3f0569939c4369bec943fc27d1c9d8dfbc828c26

View File

@ -35,38 +35,35 @@ runner_types:
is_ephemeral: false
max_available: 1000
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
variants:
amz2023:
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.10xlarge.avx2:
disk_size: 200
instance_type: m4.10xlarge
is_ephemeral: false
max_available: 450
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.24xl.spr-metal:
disk_size: 200
instance_type: c7i.metal-24xl
is_ephemeral: false
max_available: 150
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.16xlarge.spr:
disk_size: 200
instance_type: c7i.16xlarge
is_ephemeral: false
max_available: 150
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.9xlarge.ephemeral:
disk_size: 200
instance_type: c5.9xlarge
is_ephemeral: true
max_available: 50
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
variants:
am2:
ami: amzn2-ami-hvm-2.0.20240306.2-x86_64-ebs
@ -76,149 +73,140 @@ runner_types:
is_ephemeral: true
max_available: 300
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.16xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.16xlarge
is_ephemeral: false
max_available: 150
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.24xlarge:
disk_size: 150
instance_type: c5.24xlarge
is_ephemeral: false
max_available: 500
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.24xlarge.ephemeral:
disk_size: 150
instance_type: c5.24xlarge
is_ephemeral: true
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: false
max_available: 3120
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.4xlarge:
disk_size: 150
instance_type: c5.4xlarge
is_ephemeral: false
max_available: 1000
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
variants:
amz2023:
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.4xlarge
is_ephemeral: false
max_available: 1000
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.8xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.8xlarge
is_ephemeral: false
max_available: 400
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
variants:
amz2023:
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.g4dn.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g4dn.12xlarge
is_ephemeral: false
max_available: 250
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.g4dn.metal.nvidia.gpu:
disk_size: 150
instance_type: g4dn.metal
is_ephemeral: false
max_available: 300
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.g5.48xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.48xlarge
is_ephemeral: false
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.g5.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.12xlarge
is_ephemeral: false
max_available: 150
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.g5.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 2400
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.g6.4xlarge.experimental.nvidia.gpu:
disk_size: 150
instance_type: g6.4xlarge
is_ephemeral: false
max_available: 50
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
variants:
amz2023:
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.large:
max_available: 1200
disk_size: 15
instance_type: c5.large
is_ephemeral: false
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.c.linux.arm64.2xlarge:
disk_size: 256
instance_type: t4g.2xlarge
is_ephemeral: false
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64
ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
lf.c.linux.arm64.m7g.4xlarge:
disk_size: 256
instance_type: m7g.4xlarge
is_ephemeral: false
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64
ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
lf.c.linux.arm64.2xlarge.ephemeral:
disk_size: 256
instance_type: t4g.2xlarge
is_ephemeral: true
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64
ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
lf.c.linux.arm64.m7g.4xlarge.ephemeral:
disk_size: 256
instance_type: m7g.4xlarge
is_ephemeral: true
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64
ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
lf.c.linux.arm64.m7g.metal:
disk_size: 256
instance_type: m7g.metal
is_ephemeral: false
max_available: 100
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64
ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
lf.c.windows.g4dn.xlarge:
disk_size: 256
instance_type: g4dn.xlarge

View File

@ -35,38 +35,35 @@ runner_types:
is_ephemeral: false
max_available: 1000
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
variants:
amz2023:
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.10xlarge.avx2:
disk_size: 200
instance_type: m4.10xlarge
is_ephemeral: false
max_available: 450
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.24xl.spr-metal:
disk_size: 200
instance_type: c7i.metal-24xl
is_ephemeral: false
max_available: 150
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.16xlarge.spr:
disk_size: 200
instance_type: c7i.16xlarge
is_ephemeral: false
max_available: 150
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.9xlarge.ephemeral:
disk_size: 200
instance_type: c5.9xlarge
is_ephemeral: true
max_available: 50
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
variants:
am2:
ami: amzn2-ami-hvm-2.0.20240306.2-x86_64-ebs
@ -76,149 +73,140 @@ runner_types:
is_ephemeral: true
max_available: 300
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.16xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.16xlarge
is_ephemeral: false
max_available: 150
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.24xlarge:
disk_size: 150
instance_type: c5.24xlarge
is_ephemeral: false
max_available: 500
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.24xlarge.ephemeral:
disk_size: 150
instance_type: c5.24xlarge
is_ephemeral: true
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: false
max_available: 3120
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.4xlarge:
disk_size: 150
instance_type: c5.4xlarge
is_ephemeral: false
max_available: 1000
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
variants:
amz2023:
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.4xlarge
is_ephemeral: false
max_available: 1000
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.8xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.8xlarge
is_ephemeral: false
max_available: 400
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
variants:
amz2023:
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.g4dn.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g4dn.12xlarge
is_ephemeral: false
max_available: 250
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.g4dn.metal.nvidia.gpu:
disk_size: 150
instance_type: g4dn.metal
is_ephemeral: false
max_available: 300
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.g5.48xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.48xlarge
is_ephemeral: false
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.g5.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.12xlarge
is_ephemeral: false
max_available: 150
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.g5.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 2400
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.g6.4xlarge.experimental.nvidia.gpu:
disk_size: 150
instance_type: g6.4xlarge
is_ephemeral: false
max_available: 50
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
variants:
amz2023:
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.large:
max_available: 1200
disk_size: 15
instance_type: c5.large
is_ephemeral: false
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64
lf.linux.arm64.2xlarge:
disk_size: 256
instance_type: t4g.2xlarge
is_ephemeral: false
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64
ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
lf.linux.arm64.m7g.4xlarge:
disk_size: 256
instance_type: m7g.4xlarge
is_ephemeral: false
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64
ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
lf.linux.arm64.2xlarge.ephemeral:
disk_size: 256
instance_type: t4g.2xlarge
is_ephemeral: true
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64
ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
lf.linux.arm64.m7g.4xlarge.ephemeral:
disk_size: 256
instance_type: m7g.4xlarge
is_ephemeral: true
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64
ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
lf.linux.arm64.m7g.metal:
disk_size: 256
instance_type: m7g.metal
is_ephemeral: false
max_available: 100
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64
ami: al2023-ami-2023.5.202*-kernel-6.1-arm64
lf.windows.g4dn.xlarge:
disk_size: 256
instance_type: g4dn.xlarge

View File

@ -544,6 +544,7 @@
- anijain2305
- bdhirsh
- zou3519
- isuruf
mandatory_checks_name:
- EasyCLA
- Lint

View File

@ -16,6 +16,7 @@ ciflow_push_tags:
- ciflow/nightly
- ciflow/periodic
- ciflow/rocm
- ciflow/s390
- ciflow/slow
- ciflow/trunk
- ciflow/unstable

View File

@ -1,4 +1,4 @@
# iOS simulator requirements
coremltools==5.0b5
protobuf==3.20.2
optree==0.12.1
optree==0.13.0

View File

@ -27,7 +27,7 @@ pytest-cpp==2.3.0
rockset==1.0.3
z3-solver==4.12.2.0
tensorboard==2.13.0
optree==0.12.1
optree==0.13.0
# NB: test_hparams_* from test_tensorboard is failing with protobuf 5.26.0 in
# which the stringify metadata is wrong when escaping double quote
protobuf==3.20.2

View File

@ -333,7 +333,7 @@ def generate_wheels_matrix(
package_type = "manywheel"
if python_versions is None:
python_versions = FULL_PYTHON_VERSIONS + ["3.13"]
python_versions = FULL_PYTHON_VERSIONS + ["3.13", "3.13t"]
if arches is None:
# Define default compute architectures
@ -369,7 +369,13 @@ def generate_wheels_matrix(
# TODO: Enable python 3.13 on rocm, aarch64, windows
if (
gpu_arch_type == "rocm" or (os != "linux" and os != "linux-s390x")
) and python_version == "3.13":
) and (python_version == "3.13" or python_version == "3.13t"):
continue
# TODO: Enable python 3.13t on xpu and cpu-s390x
if (
gpu_arch_type == "xpu" or gpu_arch_type == "cpu-s390x"
) and python_version == "3.13t":
continue
if use_split_build and (

View File

@ -17,6 +17,11 @@ if [[ -d "${CACHE_DIRECTORY}" ]]; then
cp -r "${CACHE_DIRECTORY}" . || true
fi
# if lintrunner is not installed, install it
if ! command -v lintrunner &> /dev/null; then
python3 -m pip install lintrunner==0.12.5
fi
# This has already been cached in the docker image
lintrunner init 2> /dev/null
@ -33,7 +38,7 @@ python3 torch/utils/data/datapipes/gen_pyi.py
RC=0
# Run lintrunner on all files
if ! lintrunner --force-color --all-files --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then
if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then
echo ""
echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner -m origin/main\`. (If you don't get the same results, run \'lintrunner init\' to update your local linter)\e[0m"
echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"

View File

@ -1,5 +1,9 @@
# flake8: noqa: G004
# Note: Copies of this script in runner_determinator.py and _runner-determinator.yml
# must be kept in sync. You can do it easily by running the following command:
# python .github/scripts/update_runner_determinator.py
"""
This runner determinator is used to determine which set of runners to run a
GitHub job on. It uses the first comment of a GitHub issue (by default
@ -79,6 +83,9 @@ class Experiment(NamedTuple):
rollout_perc: float = (
0 # Percentage of workflows to experiment on when user is not opted-in.
)
all_branches: bool = (
False # If True, the experiment is also enabled on the exception branches
)
# Add more fields as needed
@ -212,7 +219,7 @@ def get_potential_pr_author(
def is_exception_branch(branch: str) -> bool:
"""
Branches that get opted out of all experiments and should always use Meta runners
Branches that get opted out of experiments by default, until they're explicitly enabled.
"""
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
@ -338,7 +345,10 @@ def is_user_opted_in(user: str, user_optins: UserOptins, experiment_name: str) -
def get_runner_prefix(
rollout_state: str, workflow_requestors: Iterable[str], is_canary: bool = False
rollout_state: str,
workflow_requestors: Iterable[str],
branch: str,
is_canary: bool = False,
) -> str:
settings = parse_settings(rollout_state)
user_optins = parse_users(rollout_state)
@ -348,6 +358,12 @@ def get_runner_prefix(
for experiment_name, experiment_settings in settings.experiments.items():
enabled = False
if not experiment_settings.all_branches and is_exception_branch(branch):
log.info(
f"Branch {branch} is an exception branch. Not enabling experiment {experiment_name}."
)
continue
# Is any workflow_requestor opted in to this experiment?
opted_in_users = [
requestor
@ -407,35 +423,34 @@ def get_rollout_state_from_issue(github_token: str, repo: str, issue_num: int) -
def main() -> None:
args = parse_args()
if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
log.info(
f"Exception branch: '{args.github_branch}', using Meta runners and no experiments."
runner_label_prefix = DEFAULT_LABEL_PREFIX
try:
rollout_state = get_rollout_state_from_issue(
args.github_token, args.github_issue_repo, args.github_issue
)
runner_label_prefix = DEFAULT_LABEL_PREFIX
else:
try:
rollout_state = get_rollout_state_from_issue(
args.github_token, args.github_issue_repo, args.github_issue
)
username = get_potential_pr_author(
args.github_token,
args.github_repo,
args.github_actor,
args.github_ref_type,
args.github_branch,
)
username = get_potential_pr_author(
args.github_token,
args.github_repo,
args.github_actor,
args.github_ref_type,
args.github_branch,
)
is_canary = args.github_repo == "pytorch/pytorch-canary"
is_canary = args.github_repo == "pytorch/pytorch-canary"
runner_label_prefix = get_runner_prefix(
rollout_state, (args.github_issue_owner, username), is_canary
)
runner_label_prefix = get_runner_prefix(
rollout_state,
(args.github_issue_owner, username),
args.github_branch,
is_canary,
)
except Exception as e:
log.error(
f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
)
except Exception as e:
log.error(
f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
)
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix)

View File

@ -4,6 +4,10 @@ from unittest.mock import Mock, patch
import runner_determinator as rd
USER_BRANCH = "somebranch"
EXCEPTION_BRANCH = "main"
class TestRunnerDeterminatorIssueParser(TestCase):
def test_parse_settings(self) -> None:
settings_text = """
@ -66,6 +70,40 @@ class TestRunnerDeterminatorIssueParser(TestCase):
"otherExp settings not parsed correctly",
)
def test_parse_all_branches_setting(self) -> None:
settings_text = """
```
experiments:
lf:
rollout_perc: 25
all_branches: true
otherExp:
all_branches: True
rollout_perc: 0
```
---
Users:
@User1,lf
@User2,lf,otherExp
"""
settings = rd.parse_settings(settings_text)
self.assertTupleEqual(
rd.Experiment(rollout_perc=25, all_branches=True),
settings.experiments["lf"],
"lf settings not parsed correctly",
)
self.assertTrue(settings.experiments["otherExp"].all_branches)
self.assertTupleEqual(
rd.Experiment(rollout_perc=0, all_branches=True),
settings.experiments["otherExp"],
"otherExp settings not parsed correctly",
)
def test_parse_users(self) -> None:
settings_text = """
experiments:
@ -119,7 +157,7 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
@User2,lf,otherExp
"""
prefix = rd.get_runner_prefix(settings_text, ["User1"])
prefix = rd.get_runner_prefix(settings_text, ["User1"], USER_BRANCH)
self.assertEqual("lf.", prefix, "Runner prefix not correct for User1")
def test_opted_in_user_two_experiments(self) -> None:
@ -136,7 +174,7 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
@User2,lf,otherExp
"""
prefix = rd.get_runner_prefix(settings_text, ["User2"])
prefix = rd.get_runner_prefix(settings_text, ["User2"], USER_BRANCH)
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for User2")
@patch("random.uniform", return_value=50)
@ -154,7 +192,7 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
@User2,lf,otherExp
"""
prefix = rd.get_runner_prefix(settings_text, ["User3"])
prefix = rd.get_runner_prefix(settings_text, ["User3"], USER_BRANCH)
self.assertEqual("", prefix, "Runner prefix not correct for user")
@patch("random.uniform", return_value=10)
@ -174,7 +212,7 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
"""
# User3 is opted out, but is pulled into both experiments by the 10% rollout
prefix = rd.get_runner_prefix(settings_text, ["User3"])
prefix = rd.get_runner_prefix(settings_text, ["User3"], USER_BRANCH)
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
def test_lf_prefix_always_comes_first(self) -> None:
@ -192,7 +230,7 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
"""
prefix = rd.get_runner_prefix(settings_text, ["User2"])
prefix = rd.get_runner_prefix(settings_text, ["User2"], USER_BRANCH)
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
def test_ignores_commented_users(self) -> None:
@ -210,7 +248,7 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
"""
prefix = rd.get_runner_prefix(settings_text, ["User1"])
prefix = rd.get_runner_prefix(settings_text, ["User1"], USER_BRANCH)
self.assertEqual("", prefix, "Runner prefix not correct for user")
def test_ignores_extra_experiments(self) -> None:
@ -229,9 +267,44 @@ class TestRunnerDeterminatorGetRunnerPrefix(TestCase):
"""
prefix = rd.get_runner_prefix(settings_text, ["User1"])
prefix = rd.get_runner_prefix(settings_text, ["User1"], USER_BRANCH)
self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user")
def test_disables_experiment_on_exception_branches_when_not_explicitly_opted_in(
self,
) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 100
---
Users:
@User,lf,otherExp
"""
prefix = rd.get_runner_prefix(settings_text, ["User1"], EXCEPTION_BRANCH)
self.assertEqual("", prefix, "Runner prefix not correct for user")
def test_allows_experiment_on_exception_branches_when_explicitly_opted_in(
self,
) -> None:
settings_text = """
experiments:
lf:
rollout_perc: 100
all_branches: true
---
Users:
@User,lf,otherExp
"""
prefix = rd.get_runner_prefix(settings_text, ["User1"], EXCEPTION_BRANCH)
self.assertEqual("lf.", prefix, "Runner prefix not correct for user")
if __name__ == "__main__":
main()

View File

@ -12,7 +12,7 @@ import json
import os
import warnings
from hashlib import sha256
from typing import Any, Dict, List, Optional
from typing import Any, List, Optional
from unittest import main, mock, skip, TestCase
from urllib.error import HTTPError
@ -24,7 +24,6 @@ from trymerge import (
find_matching_merge_rule,
get_classifications,
get_drci_classifications,
get_rockset_results,
gh_get_team_members,
GitHubPR,
JobCheckState,
@ -42,7 +41,6 @@ if "GIT_REMOTE_URL" not in os.environ:
os.environ["GIT_REMOTE_URL"] = "https://github.com/pytorch/pytorch"
GQL_MOCKS = "gql_mocks.json.gz"
ROCKSET_MOCKS = "rockset_mocks.json.gz"
DRCI_MOCKS = "drci_mocks.json.gz"
@ -77,16 +75,11 @@ def mock_query(
if err.code == 401 or err.code == 403:
err_msg = f"If you are seeing this message during workflow run, please make sure to update {file_name}"
err_msg += f" locally, by deleting it and running {os.path.basename(__file__)} with"
err_msg += " GitHub Personal Access Token passed via GITHUB_TOKEN,"
err_msg += " the rockset api key passed via ROCKSET_API_KEY,"
err_msg += " GitHub Personal Access Token passed via GITHUB_TOKEN"
err_msg += " and drci api key passed via DRCI_BOT_KEY environment variables"
if (
os.getenv("GITHUB_TOKEN") is None
or os.getenv("ROCKSET_API_KEY") is None
or os.getenv("DRCI_BOT_KEY") is None
):
if os.getenv("GITHUB_TOKEN") is None or os.getenv("DRCI_BOT_KEY") is None:
err_msg = (
"Failed to update cached queries as GITHUB_TOKEN or ROCKSET_API_KEY or DRCI_BOT_KEY "
"Failed to update cached queries as GITHUB_TOKEN or DRCI_BOT_KEY "
+ "is not defined. "
+ err_msg
)
@ -110,16 +103,6 @@ def mocked_gh_graphql(query: str, **kwargs: Any) -> Any:
return mock_query(gh_graphql_wrapper, GQL_MOCKS, key_function, query, kwargs)
def mocked_rockset_results(head_sha: str, merge_base: str, num_retries: int = 3) -> Any:
return mock_query(
get_rockset_results,
ROCKSET_MOCKS,
lambda x, y: f"{x} {y}",
head_sha,
merge_base,
)
def mocked_drci_classifications(pr_num: int, project: str, num_retries: int = 3) -> Any:
return mock_query(
get_drci_classifications,
@ -273,10 +256,6 @@ def xla_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule]:
]
def empty_rockset_results(head_sha: str, merge_base: str) -> List[Dict[str, Any]]:
return []
class DummyGitRepo(GitRepo):
def __init__(self) -> None:
super().__init__(get_git_repo_dir(), get_git_remote_name())
@ -288,7 +267,6 @@ class DummyGitRepo(GitRepo):
return "super awsome commit message"
@mock.patch("trymerge.get_rockset_results", side_effect=empty_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch(
"trymerge.get_drci_classifications", side_effect=mocked_drci_classifications
@ -604,7 +582,6 @@ class TestTryMerge(TestCase):
mocked_gh_fetch_merge_base.assert_called_once()
@mock.patch("trymerge.get_rockset_results", side_effect=mocked_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch(
@ -843,7 +820,7 @@ class TestBypassFailures(TestCase):
checks = pr.get_checkrun_conclusions()
# Known flaky failure takes precedence over ignore current (need to set the
# merge base here to get the results from Rockset, and that categorize the
# merge base here to get the results from Dr. CI, and that categorize the
# broken trunk failure too
checks = get_classifications(
pr.pr_num,
@ -929,7 +906,6 @@ class TestBypassFailures(TestCase):
)
@mock.patch("trymerge.get_rockset_results", side_effect=mocked_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch("trymerge.get_drci_classifications", return_value={})
@ -1008,7 +984,6 @@ class TestBypassFailuresOnSandCastle(TestCase):
self.assertTrue(len(failed) == 2)
@mock.patch("trymerge.get_rockset_results", side_effect=mocked_rockset_results)
@mock.patch("trymerge.gh_graphql", side_effect=mocked_gh_graphql)
@mock.patch("trymerge.gh_fetch_merge_base", return_value="")
@mock.patch(

View File

@ -452,8 +452,6 @@ RE_DIFF_REV = re.compile(r"^Differential Revision:.+?(D[0-9]+)", re.MULTILINE)
CIFLOW_LABEL = re.compile(r"^ciflow/.+")
CIFLOW_TRUNK_LABEL = re.compile(r"^ciflow/trunk")
MERGE_RULE_PATH = Path(".github") / "merge_rules.yaml"
ROCKSET_MERGES_COLLECTION = "merges"
ROCKSET_MERGES_WORKSPACE = "commons"
REMOTE_MAIN_BRANCH = "origin/main"
DRCI_CHECKRUN_NAME = "Dr.CI"
INTERNAL_CHANGES_CHECKRUN_NAME = "Meta Internal-Only Changes Check"
@ -1180,7 +1178,7 @@ class GitHubPR:
merge_commit_sha = repo.rev_parse(name=self.default_branch())
if comment_id and self.pr_num:
# Finally, upload the record to Rockset. The list of pending and failed
# Finally, upload the record to s3. The list of pending and failed
# checks are at the time of the merge
save_merge_record(
comment_id=comment_id,
@ -1202,7 +1200,7 @@ class GitHubPR:
ignore_current=bool(ignore_current_checks),
)
else:
print("Missing comment ID or PR number, couldn't upload to Rockset")
print("Missing comment ID or PR number, couldn't upload to s3")
# Usually Github will see that the commit has "resolves <pr_num>" in the
# commit message and close the PR, but sometimes it doesn't, leading to
@ -1481,7 +1479,7 @@ def find_matching_merge_rule(
# Categorize all checks when skip_mandatory_checks (force merge) is set. Do it here
# where the list of checks is readily available. These records will be saved into
# Rockset merge records
# s3 merge records
(
pending_mandatory_checks,
failed_mandatory_checks,
@ -1568,7 +1566,7 @@ def save_merge_record(
This saves the merge records as a json, which can later be uploaded to s3
"""
# Prepare the record to be written into Rockset
# Prepare the record to be written into s3
data = [
{
"comment_id": comment_id,
@ -1590,7 +1588,8 @@ def save_merge_record(
"ignore_current": ignore_current,
"error": error,
# This is a unique identifier for the record for deduping purposes
# in rockset. Any unique string would work
# in Rockset. Any unique string would work. This will not be used
# after we migrate off Rockset
"_id": f"{project}-{pr_num}-{comment_id}-{os.environ.get('GITHUB_RUN_ID')}",
}
]
@ -1600,36 +1599,6 @@ def save_merge_record(
json.dump(data, f)
@retries_decorator(rc=[])
def get_rockset_results(head_sha: str, merge_base: str) -> List[Dict[str, Any]]:
query = f"""
SELECT
w.name as workflow_name,
j.id,
j.name,
j.conclusion,
j.completed_at,
j.html_url,
j.head_sha,
j.torchci_classification.captures as failure_captures,
LENGTH(j.steps) as steps,
FROM
commons.workflow_job j join commons.workflow_run w on w.id = j.run_id
where
j.head_sha in ('{head_sha}','{merge_base}')
"""
try:
import rockset # type: ignore[import]
res = rockset.RocksetClient(
host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
).sql(query)
return cast(List[Dict[str, Any]], res.results)
except ModuleNotFoundError:
print("Could not use RockSet as rocket dependency is missing")
return []
@retries_decorator()
def get_drci_classifications(pr_num: int, project: str = "pytorch") -> Any:
"""
@ -2067,7 +2036,7 @@ def categorize_checks(
pending_checks: List[Tuple[str, Optional[str], Optional[int]]] = []
failed_checks: List[Tuple[str, Optional[str], Optional[int]]] = []
# failed_checks_categorization is used to keep track of all ignorable failures when saving the merge record on Rockset
# failed_checks_categorization is used to keep track of all ignorable failures when saving the merge record on s3
failed_checks_categorization: Dict[str, List[Any]] = defaultdict(list)
# If required_checks is not set or empty, consider all names are relevant
@ -2126,7 +2095,7 @@ def categorize_checks(
):
failed_checks = failed_checks + flaky_or_broken_trunk
# The list of failed_checks_categorization is returned so that it can be saved into the Rockset merge record
# The list of failed_checks_categorization is returned so that it can be saved into the s3 merge record
return (pending_checks, failed_checks, failed_checks_categorization)
@ -2410,7 +2379,7 @@ def main() -> None:
handle_exception(e)
if args.comment_id and args.pr_num:
# Finally, upload the record to Rockset, we don't have access to the
# Finally, upload the record to s3, we don't have access to the
# list of pending and failed checks here, but they are not really
# needed at the moment
save_merge_record(
@ -2433,7 +2402,7 @@ def main() -> None:
error=str(e),
)
else:
print("Missing comment ID or PR number, couldn't upload to Rockset")
print("Missing comment ID or PR number, couldn't upload to s3")
finally:
if not args.check_mergeability:
gh_remove_label(

.github/scripts/update_runner_determinator.py vendored Executable file
View File

@ -0,0 +1,31 @@
#!/usr/bin/env python3
import re
# Read the contents of runner_determinator.py
with open(".github/scripts/runner_determinator.py") as script_file:
script_content = script_file.read()
# Indent the script content by 10 spaces to match destination indentation
indented_script_content = "\n".join(
[" " * 10 + line if line else line for line in script_content.splitlines()]
)
# Read the contents of _runner-determinator.yml
with open(".github/workflows/_runner-determinator.yml") as yml_file:
yml_content = yml_file.read()
# Replace the content between the markers
new_yml_content = re.sub(
r"(cat <<EOF > runner_determinator.py\n)(.*?)(\n\s+EOF)",
lambda match: match.group(1) + indented_script_content + match.group(3),
yml_content,
flags=re.DOTALL,
)
# Save the modified content back to _runner-determinator.yml
with open(".github/workflows/_runner-determinator.yml", "w") as yml_file:
yml_file.write(new_yml_content)
print("Updated _runner-determinator.yml with the contents of runner_determinator.py")

View File

@ -68,6 +68,7 @@ jobs:
needs: get-label-type
with:!{{ upload.binary_env_as_input(config) }}
{%- if "aarch64" in build_environment %}
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
{%- elif "s390x" in build_environment %}
@ -102,6 +103,7 @@ jobs:
build_name: !{{ config["build_name"] }}
build_environment: !{{ build_environment }}
{%- if "aarch64" in build_environment %}
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
{%- elif "s390x" in build_environment %}

View File

@ -91,14 +91,14 @@ jobs:
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Check if in a ARC runner
- name: Check if in a container runner
shell: bash
id: check_arc_runner
run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"
id: check_container_runner
run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
if: ${{ inputs.cuda-version != 'cpu' && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
if: ${{ inputs.cuda-version != 'cpu' && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
- name: Output disk space left
run: |

View File

@ -109,6 +109,7 @@ jobs:
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
@ -118,13 +119,16 @@ jobs:
# checkout. In other cases you should prefer a local checkout.
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
no-sudo: ${{ inputs.build-environment == 'linux-s390x-binary-manywheel' }}
- name: Setup Linux
uses: ./.github/actions/setup-linux
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
- name: configure aws credentials
uses: aws-actions/configure-aws-credentials@v3
if: ${{ inputs.aws-role-to-assume != '' }}
if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
with:
role-to-assume: ${{ inputs.aws-role-to-assume }}
role-session-name: gha-linux-build
@ -133,11 +137,13 @@ jobs:
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
docker-image-name: ${{ inputs.docker-image-name }}
- name: Use following to pull public copy of the image
id: print-ghcr-mirror
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
env:
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
@ -147,6 +153,7 @@ jobs:
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
@ -174,6 +181,7 @@ jobs:
- name: Download pytest cache
uses: ./.github/actions/pytest-cache-download
continue-on-error: true
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
cache_dir: .pytest_cache
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
@ -195,6 +203,7 @@ jobs:
PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
DOCKER_IMAGE_S390X: ${{ inputs.docker-image-name }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
DEBUG: ${{ inputs.build-with-debug && '1' || '0' }}
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
@ -202,7 +211,21 @@ jobs:
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
run: |
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
JENKINS_USER=
USED_IMAGE="${DOCKER_IMAGE_S390X}"
# since some steps are skipped on s390x, if they are necessary, run them here
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
else
JENKINS_USER="--user jenkins"
USED_IMAGE="${DOCKER_IMAGE}"
fi
# detached container should get cleaned up by teardown_ec2_linux
# Used for JENKINS_USER, which can be empty
# shellcheck disable=SC2086
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS="$(nproc --ignore=2)" \
@ -225,10 +248,10 @@ jobs:
--cap-add=SYS_PTRACE \
--tty \
--detach \
--user jenkins \
${JENKINS_USER} \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
"${USED_IMAGE}"
)
docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh'
@ -239,7 +262,7 @@ jobs:
- name: Store PyTorch Build Artifacts on S3
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
name: ${{ inputs.build-environment }}
retention-days: 14
@ -249,7 +272,7 @@ jobs:
- name: Store PyTorch Build Artifacts on S3 for split build
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
name: ${{ inputs.build-environment }}-experimental-split-build
retention-days: 14
@ -257,8 +280,26 @@ jobs:
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}
- name: Store PyTorch Build Artifacts for s390x
uses: actions/upload-artifact@v3
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
with:
name: ${{ inputs.build-environment }}
retention-days: 14
if-no-files-found: error
path: artifacts.zip
- name: Store PyTorch Build Artifacts for s390x for split build
uses: actions/upload-artifact@v3
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build && inputs.build-environment == 'linux-s390x-binary-manywheel'
with:
name: ${{ inputs.build-environment }}-experimental-split-build
retention-days: 14
if-no-files-found: error
path: artifacts.zip
- name: Upload sccache stats
if: steps.build.outcome != 'skipped'
if: steps.build.outcome != 'skipped' && inputs.build-environment != 'linux-s390x-binary-manywheel'
uses: seemethere/upload-artifact-s3@v5
with:
s3-prefix: |
@ -270,4 +311,13 @@ jobs:
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
if: always() && inputs.build-environment != 'linux-s390x-binary-manywheel'
- name: Cleanup docker
if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel'
shell: bash
run: |
# on s390x stop the container for clean worker stop
# ignore expansion of "docker ps -q" since it could be empty
# shellcheck disable=SC2046
docker stop $(docker ps -q) || true

View File

@ -114,22 +114,32 @@ jobs:
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Check if in a ARC runner
- name: Check if in a container runner
shell: bash
id: check_arc_runner
run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"
id: check_container_runner
run: echo "IN_CONTAINER_RUNNER=$(if [ -f /.inarc ] || [ -f /.incontainer ]; then echo true ; else echo false; fi)" >> "$GITHUB_OUTPUT"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
- name: Setup GPU_FLAG for docker run
id: setup-gpu-flag
run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
- name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
id: setup-sscache-port-flag
run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
- name: Lock NVIDIA A100 40GB Frequency
run: |
sudo nvidia-smi -pm 1
sudo nvidia-smi -ac 1215,1410
nvidia-smi
if: contains(matrix.runner, 'a100')
if: ${{ contains(matrix.runner, 'a100') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
- name: Start monitoring script
id: monitor-script
@ -208,6 +218,7 @@ jobs:
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
@ -218,6 +229,7 @@ jobs:
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
run: |
set -x
@ -236,6 +248,7 @@ jobs:
# shellcheck disable=SC2086,SC2090
container_name=$(docker run \
${GPU_FLAG:-} \
${SCCACHE_SERVER_PORT_DOCKER_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e PR_NUMBER \
-e GITHUB_ACTIONS \
@ -265,6 +278,7 @@ jobs:
-e PR_LABELS \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e SCCACHE_REGION \
-e SCCACHE_S3_KEY_PREFIX \
-e XLA_CUDA \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
@ -274,6 +288,7 @@ jobs:
-e HUGGING_FACE_HUB_TOKEN \
-e SCRIBE_GRAPHQL_ACCESS_TOKEN \
-e DASHBOARD_TAG \
-e IS_A100_RUNNER \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
@ -343,7 +358,7 @@ jobs:
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
if: always() && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false'
# NB: We are currently having an intermittent GPU-related issue on G5 runners with
# A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does

View File

@ -88,6 +88,13 @@ jobs:
environment-file: .github/requirements/conda-env-${{ runner.os }}-${{ runner.arch }}
pip-requirements-file: .github/requirements/pip-requirements-${{ runner.os }}.txt
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Install PyTorch and run MPS tests
id: test
env:
@ -103,6 +110,14 @@ jobs:
NO_TEST_TIMEOUT: ${{ needs.filter.outputs.ci-no-test-timeout }}
NO_TD: ${{ needs.filter.outputs.ci-no-td }}
PIP_REQUIREMENTS_FILE: .github/requirements/pip-requirements-${{ runner.os }}.txt
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_WORKFLOW: ${{ github.workflow }}
GITHUB_JOB: ${{ github.job }}
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
REENABLED_ISSUES: ${{ needs.filter.outputs.reenabled-issues }}
run: |
# shellcheck disable=SC1090
@ -144,13 +159,6 @@ jobs:
run: |
cat test/**/*_toprint.log || true
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ secrets.GITHUB_TOKEN }}
- name: Upload test artifacts
uses: ./.github/actions/upload-test-artifacts
if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'

View File

@ -59,6 +59,10 @@ jobs:
cat <<EOF > runner_determinator.py
# flake8: noqa: G004
# Note: Copies of this script in runner_determinator.py and _runner-determinator.yml
# must be kept in sync. You can do it easily by running the following command:
# python .github/scripts/update_runner_determinator.py
"""
This runner determinator is used to determine which set of runners to run a
GitHub job on. It uses the first comment of a GitHub issue (by default
@ -138,6 +142,9 @@ jobs:
rollout_perc: float = (
0 # Percentage of workflows to experiment on when user is not opted-in.
)
all_branches: bool = (
False # If True, the experiment is also enabled on the exception branches
)
# Add more fields as needed
@ -271,7 +278,7 @@ jobs:
def is_exception_branch(branch: str) -> bool:
"""
Branches that get opted out of all experiments and should always use Meta runners
Branches that get opted out of experiments by default, until they're explicitly enabled.
"""
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
@ -397,7 +404,10 @@ jobs:
def get_runner_prefix(
rollout_state: str, workflow_requestors: Iterable[str], is_canary: bool = False
rollout_state: str,
workflow_requestors: Iterable[str],
branch: str,
is_canary: bool = False,
) -> str:
settings = parse_settings(rollout_state)
user_optins = parse_users(rollout_state)
@ -407,6 +417,12 @@ jobs:
for experiment_name, experiment_settings in settings.experiments.items():
enabled = False
if not experiment_settings.all_branches and is_exception_branch(branch):
log.info(
f"Branch {branch} is an exception branch. Not enabling experiment {experiment_name}."
)
continue
# Is any workflow_requestor opted in to this experiment?
opted_in_users = [
requestor
@ -466,35 +482,34 @@ jobs:
def main() -> None:
args = parse_args()
if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
log.info(
f"Exception branch: '{args.github_branch}', using Meta runners and no experiments."
runner_label_prefix = DEFAULT_LABEL_PREFIX
try:
rollout_state = get_rollout_state_from_issue(
args.github_token, args.github_issue_repo, args.github_issue
)
runner_label_prefix = DEFAULT_LABEL_PREFIX
else:
try:
rollout_state = get_rollout_state_from_issue(
args.github_token, args.github_issue_repo, args.github_issue
)
username = get_potential_pr_author(
args.github_token,
args.github_repo,
args.github_actor,
args.github_ref_type,
args.github_branch,
)
username = get_potential_pr_author(
args.github_token,
args.github_repo,
args.github_actor,
args.github_ref_type,
args.github_branch,
)
is_canary = args.github_repo == "pytorch/pytorch-canary"
is_canary = args.github_repo == "pytorch/pytorch-canary"
runner_label_prefix = get_runner_prefix(
rollout_state, (args.github_issue_owner, username), is_canary
)
runner_label_prefix = get_runner_prefix(
rollout_state,
(args.github_issue_owner, username),
args.github_branch,
is_canary,
)
except Exception as e:
log.error(
f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
)
except Exception as e:
log.error(
f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}"
)
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix)
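
Because the old and new bodies of main() are interleaved above without diff markers, the net behaviour is easier to see in isolation. A small runnable sketch of the new fallback path, with the GitHub lookup stubbed out and DEFAULT_LABEL_PREFIX assumed to be the empty Meta prefix:

DEFAULT_LABEL_PREFIX = ""  # assumed: no prefix means plain Meta runner labels

def get_rollout_state_from_issue(token: str, repo: str, issue: int) -> str:
    # Stub for the real GitHub issue fetch; raising exercises the fallback path.
    raise RuntimeError("could not reach the rollout issue")

runner_label_prefix = DEFAULT_LABEL_PREFIX
try:
    rollout_state = get_rollout_state_from_issue("<token>", "<issue repo>", 0)
    # get_potential_pr_author(...) and get_runner_prefix(..., branch, is_canary)
    # would run here and override runner_label_prefix on success.
except Exception as e:
    print(f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}")

print(repr(runner_label_prefix))  # '' -> Meta runners, no experiments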

View File

@ -189,7 +189,7 @@ jobs:
run: |
pushd "${PYTORCH_FINAL_PACKAGE_DIR}"
# shellcheck disable=SC2046,SC2102
python3 -mpip install $(echo *.whl)[opt-einsum,optree] optree==0.12.1
python3 -mpip install $(echo *.whl)[opt-einsum,optree] optree==0.13.0
popd
.ci/pytorch/win-test.sh

View File

@ -32,7 +32,7 @@ concurrency:
jobs:
build-docker:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
runs-on: am2.linux.9xlarge.ephemeral
runs-on: linux.9xlarge.ephemeral
strategy:
matrix:
cuda_version: ["11.8", "12.1", "12.4", "cpu"]

View File

@ -45,7 +45,7 @@ jobs:
build-docker-cuda:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
strategy:
matrix:
cuda_version: ["12.4", "12.1", "11.8"]
@ -156,7 +156,7 @@ jobs:
build-docker-rocm:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
strategy:
matrix:
rocm_version: ["6.1", "6.2"]
@ -192,7 +192,7 @@ jobs:
build-docker-cpu:
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}am2.linux.9xlarge.ephemeral"
runs-on: "${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral"
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main

View File

@ -43,7 +43,7 @@ jobs:
strategy:
fail-fast: false
matrix:
py_vers: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
py_vers: [ "3.9", "3.10", "3.11", "3.12" ]
device: ["cuda", "rocm", "xpu"]
include:
- device: "rocm"
@ -91,9 +91,6 @@ jobs:
# Determine python executable for given version
case $PY_VERS in
3.8)
PYTHON_EXECUTABLE=/opt/python/cp38-cp38/bin/python
;;
3.9)
PYTHON_EXECUTABLE=/opt/python/cp39-cp39/bin/python
;;
@ -214,7 +211,7 @@ jobs:
strategy:
fail-fast: false
matrix:
py_vers: [ "3.8", "3.9", "3.10", "3.11", "3.12" ]
py_vers: [ "3.9", "3.10", "3.11", "3.12" ]
timeout-minutes: 40
env:
DOCKER_IMAGE: pytorch/conda-builder:cpu

View File

@ -30,6 +30,9 @@ concurrency:
jobs:
check-labels:
permissions:
contents: read
pull-requests: write
name: Check labels
if: github.repository_owner == 'pytorch'
runs-on: linux.20_04.4x

View File

@ -67,6 +67,7 @@ jobs:
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-xpu-2024.0-py3,
pytorch-linux-jammy-py3-clang15-asan,
pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-focal-py3-clang10-onnx,
pytorch-linux-focal-linter,
pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter,
@ -78,7 +79,9 @@ jobs:
- docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11-inductor-benchmarks
runner: linux.arm64.m7g.4xlarge
timeout-minutes: 600
runs-on: "${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}"
# Docker uploads fail from LF runners, see https://github.com/pytorch/pytorch/pull/137358
# runs-on: "${{ needs.get-label-type.outputs.label-type }}${{ matrix.runner }}"
runs-on: "${{ matrix.runner }}"
env:
DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }}
steps:

View File

@ -60,6 +60,7 @@ jobs:
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cpu-aarch64
@ -86,6 +87,7 @@ jobs:
DESIRED_PYTHON: "3.9"
build_name: manywheel-py3_9-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
secrets:
@ -130,6 +132,7 @@ jobs:
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.9"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cuda-aarch64
@ -177,6 +180,7 @@ jobs:
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cpu-aarch64
@ -203,6 +207,7 @@ jobs:
DESIRED_PYTHON: "3.10"
build_name: manywheel-py3_10-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
secrets:
@ -247,6 +252,7 @@ jobs:
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.10"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64
@ -294,6 +300,7 @@ jobs:
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cpu-aarch64
@ -320,6 +327,7 @@ jobs:
DESIRED_PYTHON: "3.11"
build_name: manywheel-py3_11-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
secrets:
@ -364,6 +372,7 @@ jobs:
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.11"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64
@ -411,6 +420,7 @@ jobs:
DOCKER_IMAGE: pytorch/manylinuxaarch64-builder:cpu-aarch64-main
use_split_build: False
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cpu-aarch64
@ -437,6 +447,7 @@ jobs:
DESIRED_PYTHON: "3.12"
build_name: manywheel-py3_12-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
secrets:
@ -481,6 +492,7 @@ jobs:
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.12"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64

View File

@ -3324,3 +3324,353 @@ jobs:
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main
use_split_build: False
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cpu
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13t-cpu-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cpu-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cpu-cxx11-abi-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cpu-cxx11-abi
build_environment: linux-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cpu-cxx11-abi-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13t-cpu-cxx11-abi-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu-cxx11-abi
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cpu-cxx11-abi-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cpu-cxx11-abi-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu-cxx11-abi
GPU_ARCH_TYPE: cpu-cxx11-abi
DOCKER_IMAGE: pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-main
DESIRED_DEVTOOLSET: cxx11-abi
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu-cxx11-abi
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda11_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: False
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda11_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13t-cuda11_8-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda11_8
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda11_8-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda11_8-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda11_8
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda12_1-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_1-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13t-cuda12_1-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda12_1
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_1-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda12_1-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda12_1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda12_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: False
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_4-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13t-cuda12_4-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda12_4
build_environment: linux-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_4-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda12_4-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: False
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda12_4
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -1514,3 +1514,283 @@ jobs:
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda11_8-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: True
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda11_8
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda11_8-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13t-cuda11_8-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: True
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda11_8
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda11_8-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda11_8-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: True
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda11_8
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda12_1-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_1
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_1-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13t-cuda12_1-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda12_1
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_1-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda12_1-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda12_1
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cuda12_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: True
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_4
build_environment: linux-binary-manywheel-split
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_4-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13t-cuda12_4-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: True
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda12_4
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_4-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cuda12_4-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: True
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cuda12_4
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_13t-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main
use_split_build: True
DESIRED_PYTHON: "3.13t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cpu
build_environment: linux-binary-manywheel-split
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cpu-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_13t-cpu-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main
use_split_build: True
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu
build_environment: linux-binary-manywheel-split
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.4xlarge
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_13t-cpu-test
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main
use_split_build: True
DESIRED_PYTHON: "3.13t"
build_name: manywheel-py3_13t-cpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }}
conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -5,9 +5,7 @@ on:
# - cron: 0 7 * * 1-6
# - cron: 0 7 * * 0
# Does not perform max_autotune on CPU, so skip the weekly run setup
# Run 6 times every day to see if perf instability can be reproduced
# Will change this back
- cron: 0 */4 * * *
- cron: 0 7 * * *
# NB: GitHub has an upper limit of 10 inputs here
workflow_dispatch:
inputs:
@ -116,7 +114,7 @@ jobs:
name: linux-jammy-aarch64-py3.10-inductor
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-aarch64-py3_10-inductor-build
if: github.event.schedule == '0 */4 * * *'
if: github.event.schedule == '0 7 * * *'
with:
build-environment: linux-jammy-aarch64-py3.10
# Turn off dynamic-shapes and aotinductor tests for now, to have faster iteration for debugging perf instability.

View File

@ -31,13 +31,13 @@ jobs:
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-rocm6_1-py3_10-inductor-build:
name: rocm6.1-py3.10-inductor
linux-focal-rocm6_2-py3_10-inductor-build:
name: rocm6.2-py3.10-inductor
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm6.1-py3.10
build-environment: linux-focal-rocm6.2-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
test-matrix: |
{ include: [
@ -45,14 +45,14 @@ jobs:
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" },
]}
linux-focal-rocm6_1-py3_10-inductor-test:
linux-focal-rocm6_2-py3_10-inductor-test:
permissions:
id-token: write
contents: read
name: rocm6.1-py3.10-inductor
name: rocm6.2-py3.10-inductor
uses: ./.github/workflows/_rocm-test.yml
needs: linux-focal-rocm6_1-py3_10-inductor-build
needs: linux-focal-rocm6_2-py3_10-inductor-build
with:
build-environment: linux-focal-rocm6.1-py3.10
docker-image: ${{ needs.linux-focal-rocm6_1-py3_10-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_10-inductor-build.outputs.test-matrix }}
build-environment: linux-focal-rocm6.2-py3.10
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-inductor-build.outputs.test-matrix }}

View File

@ -58,8 +58,7 @@ jobs:
{ config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_cpp_wrapper_abi_compatible", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
secrets: inherit
linux-focal-cuda12_1-py3_10-gcc9-inductor-test:
name: cuda12.1-py3.10-gcc9-sm86
@ -69,8 +68,7 @@ jobs:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
secrets: inherit
linux-focal-cuda12_1-py3_12-gcc9-inductor-build:
name: cuda12.1-py3.12-gcc9-sm86
@ -86,6 +84,7 @@ jobs:
{ config: "inductor", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets: inherit
linux-focal-cuda12_1-py3_12-gcc9-inductor-test:
name: cuda12.1-py3.12-gcc9-sm86
@ -95,6 +94,7 @@ jobs:
build-environment: linux-focal-cuda12.1-py3.12-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_12-inductor-halide-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
@ -108,6 +108,7 @@ jobs:
{ include: [
{ config: "inductor-halide", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
secrets: inherit
linux-jammy-cpu-py3_12-inductor-halide-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
@ -117,6 +118,29 @@ jobs:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cpu-py3_12-inductor-triton-cpu-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image-name: pytorch-linux-jammy-py3.12-triton-cpu
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: |
{ include: [
{ config: "inductor-triton-cpu", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
]}
linux-jammy-cpu-py3_12-inductor-triton-cpu-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-triton-cpu
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cpu-py3_12-inductor-triton-cpu-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-triton-cpu-build.outputs.test-matrix }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
# Should be synced with the one in inductor-periodic.yml but this only runs inductor_timm
@ -134,8 +158,7 @@ jobs:
{ config: "inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
secrets: inherit
linux-focal-cuda12_4-py3_10-gcc9-inductor-test:
name: cuda12.4-py3.10-gcc9-sm86
@ -146,8 +169,7 @@ jobs:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-sm86
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-inductor-build.outputs.test-matrix }}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-inductor-build:
name: linux-jammy-cpu-py3.9-gcc11-inductor
@ -201,8 +223,7 @@ jobs:
{ config: "cpu_inductor_freezing_avx2_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" },
{ config: "cpu_inductor_freezing_avx2_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
secrets: inherit
linux-jammy-cpu-py3_9-gcc11-inductor-test:
name: linux-jammy-cpu-py3.9-gcc11-inductor
@ -212,5 +233,4 @@ jobs:
build-environment: linux-jammy-py3.9-gcc11-build
docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
secrets: inherit

45
.github/workflows/lint-autoformat.yml vendored Normal file
View File

@ -0,0 +1,45 @@
name: Apply lint suggestions
on:
pull_request:
types: [opened, synchronize, reopened]
jobs:
lintrunner-autoformat:
permissions:
contents: read
pull-requests: write
runs-on: lf.linux.2xlarge
if: ${{ github.repository_owner == 'pytorch' }}
steps:
- name: Checkout pytorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
submodules: true
fetch-depth: 0
- name: Setup miniconda
uses: pytorch/test-infra/.github/actions/setup-miniconda@main
with:
python-version: "3.10"
- name: Run lintrunner (nonretryable)
continue-on-error: true
# we can't run on all files here because suggestions can only be shown around the changed lines in the PR UI
run: |
export ADDITIONAL_LINTRUNNER_ARGS="format"
bash .github/scripts/lintrunner.sh
- name: Check for changes
id: git-check
continue-on-error: true
run: |
git diff --exit-code || echo "changes=true" >> "$GITHUB_OUTPUT"
- name: Suggest changes
if: steps.git-check.outputs.changes == 'true'
continue-on-error: true
uses: parkerbxyz/suggest-changes@v1
with:
comment: "Please commit the suggested changes from pytorch's linter."
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true

View File

@ -36,7 +36,7 @@ jobs:
submodules: true
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT"
export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT --all-files"
export CLANG=1
.github/scripts/lintrunner.sh
@ -53,7 +53,7 @@ jobs:
submodules: true
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
export ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT"
export ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT --all-files"
.github/scripts/lintrunner.sh
quick-checks:
@ -215,14 +215,15 @@ jobs:
with:
submodules: false
fetch-depth: 1
- name: Setup Python 3.8
- name: Setup Python 3.9
uses: actions/setup-python@v4
with:
python-version: '3.8'
python-version: '3.9'
architecture: x64
cache: pip
- name: Install dependencies
run: |
python3 -m pip install --upgrade pip
pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.2.* fbscribelogger==0.1.* numpy==1.24.*
pip install torch --pre --index-url https://download.pytorch.org/whl/nightly/cpu/
- name: Run run_test.py (nonretryable)

View File

@ -57,10 +57,10 @@ jobs:
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_10-gcc9-test:
@ -89,10 +89,10 @@ jobs:
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
@ -118,9 +118,10 @@ jobs:
docker-image-name: pytorch-linux-jammy-py3.9-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]}
parallelnative-linux-jammy-py3_9-gcc11-test:
@ -218,7 +219,9 @@ jobs:
# TODO: Figure out how to migrate this job to M1 runner
ios-build-test:
name: ios-build-test
if: github.event_name != 'schedule' || github.event.schedule == '45 0,8,16 * * 1-5' || github.event.schedule == '45 4 * * 0,6' || github.event.schedule == '29 8 * * *'
# Has been broken for a while, see https://github.com/pytorch/pytorch/issues/136284
# if: github.event_name != 'schedule' || github.event.schedule == '45 0,8,16 * * 1-5' || github.event.schedule == '45 4 * * 0,6' || github.event.schedule == '29 8 * * *'
if: false
uses: ./.github/workflows/_ios-build-test.yml
with:
trigger-event: ${{ github.event_name }}
@ -297,13 +300,13 @@ jobs:
docker-image: ${{ needs.linux-vulkan-focal-py3_11-clang10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-vulkan-focal-py3_11-clang10-build.outputs.test-matrix }}
linux-focal-rocm6_1-py3_10-build:
name: linux-focal-rocm6.1-py3.10
linux-focal-rocm6_2-py3_10-build:
name: linux-focal-rocm6.2-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm6.1-py3.10
build-environment: linux-focal-rocm6.2-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
test-matrix: |
{ include: [
@ -312,19 +315,19 @@ jobs:
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu" },
]}
linux-focal-rocm6_1-py3_10-test:
linux-focal-rocm6_2-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm6.1-py3.10
name: linux-focal-rocm6.2-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-focal-rocm6_1-py3_10-build
- linux-focal-rocm6_2-py3_10-build
- target-determination
with:
build-environment: linux-focal-rocm6.1-py3.10
docker-image: ${{ needs.linux-focal-rocm6_1-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_10-build.outputs.test-matrix }}
build-environment: linux-focal-rocm6.2-py3.10
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.test-matrix }}
linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build:
name: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
@ -337,10 +340,10 @@ jobs:
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}

View File

@ -185,10 +185,10 @@ jobs:
docker-image-name: pytorch-linux-focal-py3.9-clang10
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
@ -217,10 +217,10 @@ jobs:
docker-image-name: pytorch-linux-focal-py3.11-clang10
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
@ -251,10 +251,10 @@ jobs:
docker-image-name: pytorch-linux-focal-py3.12-clang10
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "dynamo", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
@ -383,7 +383,7 @@ jobs:
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-py3.9-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.1-lite
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.3-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
@ -503,15 +503,15 @@ jobs:
]}
secrets: inherit
linux-focal-rocm6_1-py3_10-build:
linux-focal-rocm6_2-py3_10-build:
# don't run build twice on main
if: github.event_name == 'pull_request'
name: linux-focal-rocm6.1-py3.10
name: linux-focal-rocm6.2-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm6.1-py3.10
build-environment: linux-focal-rocm6.2-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
@ -588,9 +588,9 @@ jobs:
docker-image-name: pytorch-linux-focal-py3.12-clang10
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 1, num_shards: 3, runner: "linux.4xlarge" },
{ config: "default", shard: 2, num_shards: 3, runner: "linux.4xlarge" },
{ config: "default", shard: 3, num_shards: 3, runner: "linux.4xlarge" },
{ config: "dynamo", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
{ config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
{ config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" },

View File

@ -25,11 +25,11 @@ jobs:
id-token: write
contents: read
linux-focal-rocm6_1-py3_10-build:
name: linux-focal-rocm6.1-py3.10
linux-focal-rocm6_2-py3_10-build:
name: linux-focal-rocm6.2-py3.10
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-rocm6.1-py3.10
build-environment: linux-focal-rocm6.2-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
@ -42,16 +42,16 @@ jobs:
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" },
]}
linux-focal-rocm6_1-py3_10-test:
linux-focal-rocm6_2-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm6.1-py3.10
name: linux-focal-rocm6.2-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-focal-rocm6_1-py3_10-build
- linux-focal-rocm6_2-py3_10-build
- target-determination
with:
build-environment: linux-focal-rocm6.1-py3.10
docker-image: ${{ needs.linux-focal-rocm6_1-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_10-build.outputs.test-matrix }}
build-environment: linux-focal-rocm6.2-py3.10
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.test-matrix }}

24
.github/workflows/s390.yml vendored Normal file
View File

@ -0,0 +1,24 @@
name: s390
on:
push:
branches:
- main
tags:
- ciflow/s390/*
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
linux-manylinux-2_28-py3-cpu-s390x-build:
name: linux-manylinux-2_28-py3-cpu-s390x
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-s390x-binary-manywheel
docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x-main
runner: linux.s390x

View File

@ -130,13 +130,13 @@ jobs:
docker-image: ${{ needs.linux-focal-py3_9-clang10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-py3_9-clang10-build.outputs.test-matrix }}
linux-focal-rocm6_1-py3_10-build:
name: linux-focal-rocm6.1-py3.10
linux-focal-rocm6_2-py3_10-build:
name: linux-focal-rocm6.2-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm6.1-py3.10
build-environment: linux-focal-rocm6.2-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
test-matrix: |
{ include: [
@ -144,19 +144,19 @@ jobs:
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
]}
linux-focal-rocm6_1-py3_10-test:
linux-focal-rocm6_2-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm6.1-py3.10
name: linux-focal-rocm6.2-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-focal-rocm6_1-py3_10-build
- linux-focal-rocm6_2-py3_10-build
- target-determination
with:
build-environment: linux-focal-rocm6.1-py3.10
docker-image: ${{ needs.linux-focal-rocm6_1-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_10-build.outputs.test-matrix }}
build-environment: linux-focal-rocm6.2-py3.10
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.test-matrix }}
linux-jammy-py3_10-clang15-asan-build:
name: linux-jammy-py3.10-clang15-asan

View File

@ -223,13 +223,13 @@ jobs:
cuda-version: "12.1"
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
linux-focal-rocm6_1-py3_10-build:
name: linux-focal-rocm6.1-py3.10
linux-focal-rocm6_2-py3_10-build:
name: linux-focal-rocm6.2-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-focal-rocm6.1-py3.10
build-environment: linux-focal-rocm6.2-py3.10
docker-image-name: pytorch-linux-focal-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
@ -240,19 +240,19 @@ jobs:
]}
secrets: inherit
linux-focal-rocm6_1-py3_10-test:
linux-focal-rocm6_2-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm6.1-py3.10
name: linux-focal-rocm6.2-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-focal-rocm6_1-py3_10-build
- linux-focal-rocm6_2-py3_10-build
- target-determination
with:
build-environment: linux-focal-rocm6.1-py3.10
docker-image: ${{ needs.linux-focal-rocm6_1-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_10-build.outputs.test-matrix }}
build-environment: linux-focal-rocm6.2-py3.10
docker-image: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_2-py3_10-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build:
@ -266,10 +266,10 @@ jobs:
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_AVX512", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "nogpu_NO_AVX2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },

View File

@ -28,7 +28,7 @@ jobs:
check-latest: false
cache: pip
architecture: x64
- run: pip install pyyaml==6.0 rockset==1.0.3
- run: pip install pyyaml==6.0
- name: Setup committer id
run: |
@ -43,7 +43,6 @@ jobs:
COMMENT_ID: ${{ github.event.client_payload.comment_id }}
REBASE: ${{ github.event.client_payload.rebase }}
IGNORE_CURRENT: ${{ github.event.client_payload.ignore_current }}
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
DRCI_BOT_KEY: ${{ secrets.DRCI_BOT_KEY }}
GITHUB_RUN_ID: ${{ github.run_id }}
run: |

View File

@ -11,15 +11,39 @@ concurrency:
jobs:
do_update_viablestrict:
permissions:
id-token: write
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ubuntu-20.04
environment: ${{ (github.event_name == 'schedule') && 'mergebot' || '' }}
steps:
- name: Update viable/strict
uses: pytorch/test-infra/.github/actions/update-viablestrict@main
id: update_viablestrict
with:
repository: pytorch/pytorch
stable-branch: viable/strict
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]'
secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
rockset-api-key: ${{ secrets.ROCKSET_API_KEY }}
- name: Authenticate to AWS with OIDC
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/upload_to_ossci_raw_job_status
aws-region: us-east-1
- name: Print sha
env:
LATEST_SHA: ${{ steps.update_viablestrict.outputs.latest_viable_sha }}
PUSH_RESULT: ${{ steps.update_viablestrict.outputs.push_result }}
TIME: ${{ steps.update_viablestrict.outputs.time }}
run: |
echo "${PUSH_RESULT}"
if [ "$PUSH_RESULT" = "Everything up-to-date" ]; then
echo "No update pushed"
else
echo "{\"sha\": \"${LATEST_SHA}\", \"repository\":\"pytorch/pytorch\", \"timestamp\": ${TIME}}" > "/tmp/${LATEST_SHA}.json"
pip install awscli==1.29.40
aws s3 cp "/tmp/${LATEST_SHA}.json" "s3://ossci-raw-job-status/stable_pushes/pytorch/pytorch/${LATEST_SHA}.json"
fi

View File

@ -1,55 +0,0 @@
# upload alerts every 10 minutes
name: Upload Alerts to AWS/Rockset
on:
schedule:
- cron: '*/10 * * * *'
pull_request:
paths:
- 'tools/alerts/create_alerts.py'
- '.github/workflows/upload-alerts.yml'
jobs:
upload-alerts:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ubuntu-22.04
environment: upload-stats
steps:
- name: Checkout repo
uses: actions/checkout@v3
with:
fetch-depth: 1
- uses: actions/setup-python@v4
with:
python-version: '3.11'
cache: pip
- name: Install Python Packages
run: |
pip3 install rockset==1.0.3 boto3==1.19.12 requests==2.32.2
- name: Create alerts
run: |
output=$(PYTHONPATH=$PYTHONPATH:$(pwd) python3 "tools/alerts/create_alerts.py")
echo "uploading following alerts"
echo "$output"
echo "script-output=$output" >> "$GITHUB_OUTPUT"
id: alert_creation_step
- name: Upload alerts
env:
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
uses: pytorch/test-infra/.github/actions/upload-alerts@main
with:
alerts: '${{ steps.alert_creation_step.outputs.script-output }}'
organization: "pytorch"
repo: "pytorch"
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true

View File

@ -153,7 +153,7 @@ init_command = [
'junitparser==2.1.1',
'rich==10.9.0',
'pyyaml==6.0.1',
'optree==0.12.1',
'optree==0.13.0',
]
[[linter]]
@ -195,6 +195,7 @@ include_patterns = [
# and excluding most sub-directories for now.
'aten/src/ATen/*.h',
'aten/src/ATen/*.cpp',
'aten/src/ATen/cuda/*.cpp',
'aten/src/ATen/cpu/*.h',
'aten/src/ATen/cpu/*.cpp',
'aten/src/ATen/core/*.h',
@ -215,6 +216,10 @@ include_patterns = [
'torch/csrc/*.cpp',
'torch/csrc/**/*.h',
'torch/csrc/**/*.cpp',
'torch/csrc/distributed/autograd/**/*.cpp',
'torch/csrc/distributed/autograd/**/*.h',
'torch/csrc/distributed/rpc/**/*.cpp',
'torch/csrc/distributed/rpc/**/*.h',
'torch/csrc/jit/serialization/*.h',
'torch/csrc/jit/serialization/*.cpp',
]
@ -224,7 +229,6 @@ exclude_patterns = [
# CUDA files are also excluded.
'**/fb/**',
'**/*pb.h',
'aten/**/cuda/*pp',
'c10/xpu/**/*.h',
'c10/xpu/**/*.cpp',
'c10/cuda/CUDAAlgorithm.h',
@ -246,7 +250,6 @@ exclude_patterns = [
'torch/csrc/inductor/aoti_torch/c/shim.h',
'torch/csrc/jit/**/*',
'torch/csrc/jit/serialization/mobile_bytecode_generated.h',
'torch/csrc/lazy/**/*',
]
init_command = [
'python3',
@ -1255,7 +1258,6 @@ exclude_patterns = [
'torch/fx/experimental/refinement_types.py',
'torch/fx/experimental/rewriter.py',
'torch/fx/experimental/schema_type_annotation.py',
'torch/fx/experimental/symbolic_shapes.py',
'torch/fx/experimental/unification/__init__.py',
'torch/fx/experimental/unification/core.py',
'torch/fx/experimental/unification/dispatch.py',
@ -1271,7 +1273,6 @@ exclude_patterns = [
'torch/fx/experimental/unification/utils.py',
'torch/fx/experimental/unification/variable.py',
'torch/fx/experimental/unify_refinements.py',
'torch/fx/experimental/validator.py',
'torch/fx/graph.py',
'torch/fx/graph_module.py',
'torch/fx/interpreter.py',
@ -1585,6 +1586,27 @@ command = [
]
is_formatter = true
[[linter]]
code = 'META_NO_CREATE_UNBACKED'
include_patterns = [
"torch/_meta_registrations.py"
]
command = [
'python3',
'tools/linter/adapters/grep_linter.py',
'--pattern=create_unbacked',
'--linter-name=META_NO_CREATE_UNBACKED',
'--error-name=no create_unbacked in meta registrations',
"""--error-description=\
Data-dependent operators should have their meta \
registration in torch/_subclasses/fake_impls.py, \
not torch/_meta_registrations.py
""",
'--',
'@{{PATHSFILE}}'
]
[[linter]]
code = 'ATEN_CPU_GPU_AGNOSTIC'
include_patterns = [

View File

@ -305,7 +305,6 @@ if(NOT DEFINED USE_VULKAN)
cmake_dependent_option(USE_VULKAN "Use Vulkan GPU backend" ON "ANDROID" OFF)
endif()
option(USE_SLEEF_FOR_ARM_VEC256 "Use sleef for arm" OFF)
option(USE_SOURCE_DEBUG_ON_MOBILE "Enable" ON)
option(USE_LITE_INTERPRETER_PROFILER "Enable" ON)
cmake_dependent_option(
@ -369,7 +368,7 @@ cmake_dependent_option(
USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF)
cmake_dependent_option(
USE_TENSORPIPE "Use TensorPipe. Only available if USE_DISTRIBUTED is on." ON
"USE_DISTRIBUTED" OFF)
"USE_DISTRIBUTED AND NOT WIN32" OFF)
option(ONNX_ML "Enable traditional ONNX ML API." ON)
option(HAVE_SOVERSION "Whether to add SOVERSION to the shared objects" OFF)
option(BUILD_LIBTORCH_CPU_WITH_DEBUG
@ -912,11 +911,6 @@ if(USE_PYTORCH_QNNPACK)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_PYTORCH_QNNPACK")
endif()
if(USE_SLEEF_FOR_ARM_VEC256)
string(APPEND CMAKE_CXX_FLAGS " -DAT_BUILD_ARM_VEC256_WITH_SLEEF")
add_definitions(-DAT_BUILD_ARM_VEC256_WITH_SLEEF)
endif()
# Enable sleef on macOS with Apple silicon by default
if((${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") AND ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64"))
message(STATUS "Running on macOS with Apple silicon")
@ -924,6 +918,14 @@ if((${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") AND ("${CMAKE_SYSTEM_PROCESSOR}" STR
add_definitions(-DAT_BUILD_ARM_VEC256_WITH_SLEEF)
endif()
# Enable sleef on Arm(R) architecture by default (except Android)
if((NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
AND("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "aarch64"))
string(APPEND CMAKE_CXX_FLAGS " -DAT_BUILD_ARM_VEC256_WITH_SLEEF")
add_definitions(-DAT_BUILD_ARM_VEC256_WITH_SLEEF)
endif()
if(USE_XNNPACK)
string(APPEND CMAKE_CXX_FLAGS " -DUSE_XNNPACK")
endif()
@ -1081,8 +1083,16 @@ if(NOT MSVC)
append_cxx_flag_if_supported("-Wno-unused-but-set-variable" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-fstandalone-debug" CMAKE_CXX_FLAGS_DEBUG)
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-omit-frame-pointer -O0")
string(APPEND CMAKE_LINKER_FLAGS_DEBUG " -fno-omit-frame-pointer -O0")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64" AND CMAKE_CXX_COMPILER_ID MATCHES "GNU")
if(CMAKE_BUILD_TYPE MATCHES Debug)
message(WARNING "Applying -Og optimization for aarch64 GCC debug build to work around an ICE")
endif()
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-omit-frame-pointer -Og")
string(APPEND CMAKE_LINKER_FLAGS_DEBUG " -fno-omit-frame-pointer -Og")
else()
string(APPEND CMAKE_CXX_FLAGS_DEBUG " -fno-omit-frame-pointer -O0")
string(APPEND CMAKE_LINKER_FLAGS_DEBUG " -fno-omit-frame-pointer -O0")
endif()
append_cxx_flag_if_supported("-fno-math-errno" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-fno-trapping-math" CMAKE_CXX_FLAGS)
append_cxx_flag_if_supported("-Werror=format" CMAKE_CXX_FLAGS)

View File

@ -98,6 +98,10 @@ test/test_type_promotion.py @mruberry
test/functorch/test_ops.py @zou3519 @chillee @kshitij12345
test/functorch/test_vmap.py @zou3519 @chillee @kshitij12345
# HOPs
torch/_higher_order_ops/*.py @zou3519
torch/_dynamo/variables/higher_order_ops.py @zou3519
# torch MPS
test/test_mps.py @kulinseth @malfet
aten/src/ATen/mps/ @kulinseth @malfet
@ -117,7 +121,7 @@ torch/profiler/ @aaronenyeshi @sraikund16
test/functorch/test_aotdispatch.py @ezyang @Chillee
# Dataloader
torch/utils/data/ @andrewkho @gokulavasan
torch/utils/data/ @andrewkho @divyanshk
# hipify
torch/utils/hipify/ @jeffdaily @jithunnair-amd

View File

@ -208,6 +208,8 @@ If you want to compile with ROCm support, install
- [AMD ROCm](https://rocm.docs.amd.com/en/latest/deploy/linux/quick_start.html) 4.0 and above installation
- ROCm is currently supported only for Linux systems.
By default the build system expects ROCm to be installed in `/opt/rocm`. If ROCm is installed in a different directory, the `ROCM_PATH` environment variable must be set to the ROCm installation directory. The build system automatically detects the AMD GPU architecture. Optionally, the AMD GPU architecture can be set explicitly with the `PYTORCH_ROCM_ARCH` environment variable (see the list of [supported AMD GPU architectures](https://rocm.docs.amd.com/projects/install-on-linux/en/latest/reference/system-requirements.html#supported-gpus)).
If you want to disable ROCm support, export the environment variable `USE_ROCM=0`.
Other potentially useful environment variables may be found in `setup.py`.
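For illustration only (not part of the upstream README), a minimal sketch of driving a from-source build with these variables set from Python; the ROCm path and architecture list are placeholder assumptions:

```python
import os
import subprocess

env = os.environ.copy()
env["ROCM_PATH"] = "/opt/rocm"              # only needed when ROCm is not in /opt/rocm
env["PYTORCH_ROCM_ARCH"] = "gfx90a;gfx942"  # optional: bypass auto-detection (placeholder list)
# env["USE_ROCM"] = "0"                     # uncomment to build without ROCm support

# `python setup.py develop` is the usual from-source build entry point.
subprocess.check_call(["python", "setup.py", "develop"], env=env)
```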

View File

@ -48,16 +48,16 @@
Following is the Release Compatibility Matrix for PyTorch releases:
| PyTorch version | Python | Stable CUDA | Experimental CUDA | Stable ROCm |
| --- | --- | --- | --- | --- |
| 2.5 | >=3.9, <=3.12, (3.13 experimental) | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70 | None | ROCm 6.2 |
| 2.4 | >=3.8, <=3.12 | CUDA 11.8, CUDA 12.1, CUDNN 9.1.0.70 | CUDA 12.4, CUDNN 9.1.0.70 | ROCm 6.1 |
| 2.3 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 6.0 |
| 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.7 |
| 2.1 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.6 |
| 2.0 | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 | ROCm 5.4 |
| 1.13 | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 |
| 1.12 | >=3.7, <=3.10 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 |
| PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm |
| --- | --- | --- | --- | --- | --- |
| 2.5 | >=3.9, <=3.12, (3.13 experimental) | C++17 | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70 | None | ROCm 6.2 |
| 2.4 | >=3.8, <=3.12 | C++17 | CUDA 11.8, CUDA 12.1, CUDNN 9.1.0.70 | CUDA 12.4, CUDNN 9.1.0.70 | ROCm 6.1 |
| 2.3 | >=3.8, <=3.11, (3.12 experimental) | C++17 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 6.0 |
| 2.2 | >=3.8, <=3.11, (3.12 experimental) | C++17 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.7 |
| 2.1 | >=3.8, <=3.11 | C++17 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | ROCm 5.6 |
| 2.0 | >=3.8, <=3.11 | C++14 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 | ROCm 5.4 |
| 1.13 | >=3.7, <=3.10 | C++14 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | ROCm 5.2 |
| 1.12 | >=3.7, <=3.10 | C++14 | CUDA 11.3, CUDNN 8.3.2.44 | CUDA 11.6, CUDNN 8.3.2.44 | ROCm 5.0 |
## Release Cadence
@ -234,7 +234,7 @@ Typically, within a release cycle fixes are necessary for regressions, test fixe
For fixes that are to go into a release after the release branch has been cut we typically employ the use of a cherry pick tracker.
An example of this would look like:
* https://github.com/pytorch/pytorch/issues/51886
* https://github.com/pytorch/pytorch/issues/128436
Please also make sure to add milestone target to the PR/issue, especially if it needs to be considered for inclusion into the dot release.
@ -243,7 +243,9 @@ Please also make sure to add milestone target to the PR/issue, especially if it
#### How to do Cherry Picking
You can now use `pytorchbot` to cherry pick a PyTorch PR that has been committed
to the main branch using `@pytorchbot cherry-pick` command as follows.
to the main branch using the `@pytorchbot cherry-pick` command as follows (make sure
that the cherry-pick tracker issue for the target release is labelled as "release tracker" -
this will allow the bot to find it and post comments).
```
usage: @pytorchbot cherry-pick --onto ONTO [--fixes FIXES] -c
@ -380,7 +382,7 @@ Patch release process takes around 4-5 weeks to complete.
### Issue Tracker for Patch releases
For patch releases, an issue tracker needs to be created. For a patch release, we require all cherry-pick changes to have links to either a high-priority GitHub issue or a CI failure from a previous RC. An example of this would look like:
* https://github.com/pytorch/pytorch/issues/51886
* https://github.com/pytorch/pytorch/issues/128436
Only the following issues are accepted:
1. Fixes to regressions against previous major version (e.g. regressions introduced in 1.13.0 from 1.12.0 are pickable for 1.13.1)

View File

@ -467,6 +467,9 @@ if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
endif()
if(USE_CUDA AND NOT USE_ROCM)
add_definitions(-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1)
add_definitions(-DCUTLASS_ENABLE_SM90_EXTENDED_MMA_SHAPES=1)
add_definitions(-DCUTE_SM90_EXTENDED_MMA_SHAPES_ENABLED)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include)
if($ENV{ATEN_STATIC_CUDA})

View File

@ -145,6 +145,14 @@ void Context::setSDPUseMath(bool e) {
enabled_mathSDP = e;
}
bool Context::allowFP16BF16ReductionMathSDP() const {
return allow_fp16_bf16_reduction_mathSDP;
}
void Context::setAllowFP16BF16ReductionMathSDP(bool e) {
allow_fp16_bf16_reduction_mathSDP = e;
}
bool Context::userEnabledCuDNNSDP() const {
return enabled_cudnnSDP;
}
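This context flag is typically mirrored by a Python-level toggle; a hedged sketch of how it might be exercised, assuming the `torch.backends.cuda` binding names below (they are assumptions, not confirmed by this diff):

```python
import torch

# Assumed binding names for the new allow_fp16_bf16_reduction_mathSDP flag.
torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
print(torch.backends.cuda.fp16_bf16_reduction_math_sdp_allowed())  # expected: True
```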

View File

@ -39,25 +39,16 @@ class TORCH_API Context {
const Generator& defaultGenerator(Device device) {
c10::DeviceType device_type = device.type();
initCUDAIfNeeded(device_type);
initHIPIfNeeded(device_type);
lazyInitDevice(device_type);
if (device_type == at::kCPU) {
return at::detail::getDefaultCPUGenerator();
} else if (device_type == at::kCUDA) {
return at::detail::getCUDAHooks().getDefaultCUDAGenerator(device.index());
} else if (device_type == at::kMPS) {
return at::detail::getMPSHooks().getDefaultMPSGenerator();
} else if (device_type == at::kXPU) {
return at::detail::getXPUHooks().getDefaultXPUGenerator(device.index());
} else if (device_type == at::kIPU) {
return at::detail::getIPUHooks().getDefaultIPUGenerator(device.index());
} else if (device_type == at::kPrivateUse1) {
return at::detail::getPrivateUse1Hooks().getDefaultGenerator(
device.index());
} else {
AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
return getAcceleratorHooksInterface(device_type)
.getDefaultGenerator(device.index());
}
}
const AcceleratorHooksInterface& getAcceleratorHooksInterface(
std::optional<c10::DeviceType> opt_device_type = std::nullopt) {
c10::DeviceType device_type = opt_device_type.has_value()
@ -80,10 +71,10 @@ class TORCH_API Context {
c10::DeviceTypeName(device_type), " device type not an accelerator.");
}
}
Device getDeviceFromPtr(void* data, c10::DeviceType device_type) {
initCUDAIfNeeded(device_type);
initHIPIfNeeded(device_type);
initXPUIfNeeded(device_type);
lazyInitDevice(device_type);
if (device_type == at::kCPU) {
return c10::DeviceType::CPU;
} else if (device_type == at::kCUDA) {
@ -96,6 +87,7 @@ class TORCH_API Context {
AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
}
}
bool isPinnedPtr(
const void* data,
std::optional<c10::DeviceType> device_type = std::nullopt) {
@ -106,13 +98,22 @@ class TORCH_API Context {
opt_device_type.value())) { // passed device not an accelerator
return false;
}
return getAcceleratorHooksInterface(opt_device_type.value())
.isPinnedPtr(data);
return getAcceleratorHooksInterface(opt_device_type).isPinnedPtr(data);
}
Allocator* getPinnedMemoryAllocator(
std::optional<c10::DeviceType> device_type = std::nullopt) {
return getAcceleratorHooksInterface(device_type).getPinnedMemoryAllocator();
}
void lazyInitDevice(c10::DeviceType device_type) {
if (device_type != at::kCPU) {
c10::call_once(init_[static_cast<int8_t>(device_type)], [&] {
getAcceleratorHooksInterface(device_type).init();
});
}
}
static bool hasOpenMP();
static bool hasMKL();
static bool hasLAPACK();
@ -165,27 +166,6 @@ class TORCH_API Context {
static bool hasMAIA() {
return c10::impl::hasDeviceGuardImpl(c10::DeviceType::MAIA);
}
// defined in header so that getNonVariableType has ability to inline
// call_once check. getNonVariableType is called fairly frequently
void lazyInitCUDA() {
c10::call_once(thc_init, [&] { detail::getCUDAHooks().initCUDA(); });
}
void lazyInitHIP() {
c10::call_once(thh_init, [&] { detail::getHIPHooks().initHIP(); });
}
void lazyInitXPU() {
c10::call_once(thx_init, [&] { detail::getXPUHooks().initXPU(); });
}
void lazyInitMTIA() {
c10::call_once(th_mtia_init, [&] { detail::getMTIAHooks().initMTIA(); });
}
void lazyInitPrivateUse1() {
c10::call_once(thp_init, [&] {
if (isPrivateUse1HooksRegistered()) {
at::detail::getPrivateUse1Hooks().initPrivateUse1();
}
});
}
static const at::cuda::NVRTC& getNVRTC() {
return detail::getCUDAHooks().nvrtc();
}
@ -234,6 +214,9 @@ class TORCH_API Context {
void setSDPUseCuDNN(bool);
bool userEnabledCuDNNSDP() const;
void setAllowFP16BF16ReductionMathSDP(bool);
bool allowFP16BF16ReductionMathSDP() const;
void setSDPUseOverrideable(bool);
bool userEnabledOverrideableSDP() const;
@ -358,27 +341,8 @@ class TORCH_API Context {
void setAllowFP16ReductionCPU(bool);
private:
void initCUDAIfNeeded(c10::DeviceType p) {
if (p == c10::DeviceType::CUDA) {
lazyInitCUDA();
}
}
void initHIPIfNeeded(c10::DeviceType p) {
if (p == c10::DeviceType::HIP) {
lazyInitHIP();
}
}
void initXPUIfNeeded(c10::DeviceType p) {
if (p == c10::DeviceType::XPU) {
lazyInitXPU();
}
}
static bool checkCuBLASConfigDeterministic();
c10::once_flag thc_init;
c10::once_flag thh_init;
c10::once_flag thx_init;
c10::once_flag th_mtia_init;
c10::once_flag thp_init;
std::array<c10::once_flag, at::COMPILE_TIME_MAX_DEVICE_TYPES> init_;
bool enabled_cudnn = true;
bool deterministic_cudnn = false;
bool deterministic_mkldnn = false;
@ -390,6 +354,7 @@ class TORCH_API Context {
bool enabled_mathSDP = true;
bool enabled_cudnnSDP = true;
bool enabled_overrideable = true;
bool allow_fp16_bf16_reduction_mathSDP = false;
#ifdef USE_ROCM
bool benchmark_cudnn = true;
#else
@ -509,7 +474,7 @@ inline size_t getNumGPUs() {
"to be CUDA (e.g., when you say CUDA, on a HIP build of ATen, this actually "
"means HIP. Rebuild PyTorch with one or the other disabled.");
} else if (hasCUDA()) {
return detail::getCUDAHooks().getNumGPUs();
return detail::getCUDAHooks().deviceCount();
} else if (hasHIP()) {
return detail::getHIPHooks().getNumGPUs();
} else {
@ -546,7 +511,7 @@ inline void manual_seed(uint64_t seed) {
}
// NB: Sometimes we build with CUDA, but we don't have any GPUs
// available. In that case, we must not seed CUDA; it will fail!
const auto cuda_num_gpus = detail::getCUDAHooks().getNumGPUs();
const auto cuda_num_gpus = detail::getCUDAHooks().deviceCount();
if (hasCUDA() && cuda_num_gpus > 0) {
for (const auto i : c10::irange(cuda_num_gpus)) {
auto cuda_gen = globalContext().defaultGenerator(
@ -559,7 +524,7 @@ inline void manual_seed(uint64_t seed) {
}
}
const auto xpu_num_gpus = detail::getXPUHooks().getNumGPUs();
const auto xpu_num_gpus = detail::getXPUHooks().deviceCount();
if (hasXPU() && xpu_num_gpus) {
for (const auto i : c10::irange(xpu_num_gpus)) {
auto xpu_gen = globalContext().defaultGenerator(

View File

@ -18,6 +18,8 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) {
// To properly support this, see https://github.com/pytorch/pytorch/issues/14560
if (at::globalContext().hasCUDA()) {
return at::detail::getCUDAHooks().getPinnedMemoryAllocator();
} else if (at::globalContext().hasMTIA()) {
return at::detail::getMTIAHooks().getPinnedMemoryAllocator();
} else if (at::globalContext().hasXPU()) {
return at::detail::getXPUHooks().getPinnedMemoryAllocator();
} else if(at::isPrivateUse1HooksRegistered()) {

View File

@ -420,15 +420,15 @@ inline c10::MaybeOwned<Tensor> expand_size(
inline std::vector<Tensor> expand_outplace(TensorList to_expand) {
// expands a list of Tensors; ignores undefined (null) tensors
bool first = true;
DimVector sizes;
SymDimVector sizes;
for (const auto i : c10::irange(to_expand.size())) {
if (!to_expand[i].defined()) {
continue;
} else if (first) {
sizes = to_expand[i].sizes();
sizes = to_expand[i].sym_sizes();
first = false;
} else {
sizes = infer_size_dimvector(sizes, to_expand[i].sizes());
sizes = infer_size_symdimvector(sizes, to_expand[i].sym_sizes());
}
}
@ -436,10 +436,10 @@ inline std::vector<Tensor> expand_outplace(TensorList to_expand) {
for (const auto i : c10::irange(to_expand.size())) {
if (!to_expand[i].defined()) {
continue;
} else if (to_expand[i].sizes().equals(sizes)) {
} else if (to_expand[i].sym_sizes().equals(sizes)) {
result[i] = to_expand[i];
} else {
result[i] = to_expand[i].expand(sizes);
result[i] = to_expand[i].expand_symint(sizes);
}
}
return result;

View File

@ -61,9 +61,8 @@ void set_num_threads(int nthreads) {
#endif
#ifdef USE_PTHREADPOOL
// because PyTorch uses caffe2::pthreadpool() in QNNPACK
caffe2::PThreadPool* const pool = caffe2::pthreadpool();
caffe2::PThreadPool* const pool = caffe2::pthreadpool(nthreads);
TORCH_INTERNAL_ASSERT(pool, "Invalid thread pool!");
pool->set_thread_count(nthreads);
#endif
#if AT_MKLDNN_ENABLED()
at::native::mkldnn::clear_computation_cache();

View File

@ -19,7 +19,7 @@ Tensor& scalar_fill(Tensor& self, const Scalar& value) {
AT_DISPATCH_V2(
self.scalar_type(), "fill_out", AT_WRAP([&]() {
fill_inplace<scalar_t>(self, value);
}), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
}), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
return self;
}

View File

@ -144,8 +144,8 @@ class CheckSparseTensorInvariants {
bool old_state;
public:
CheckSparseTensorInvariants(bool state) {
old_state = at::globalContext().checkSparseTensorInvariants();
CheckSparseTensorInvariants(bool state)
: old_state(at::globalContext().checkSparseTensorInvariants()) {
at::globalContext().setCheckSparseTensorInvariants(state);
}

View File

@ -255,7 +255,9 @@ inline Tensor applySelect(
// the other hand, indexing wrapping is valid for all negative int64_t
// values, as x[INT64_MIN] is the same as x[INT64_MAX]
TORCH_CHECK_INDEX(
size > -1 - index && size > index,
size.sym_gt(-1 - index)
.sym_and(size.sym_gt(index))
.expect_true(__FILE__, __LINE__),
"index ",
index,
" is out of bounds for dimension ",

View File

@ -82,7 +82,7 @@ class TORCH_API ThreadLocalState {
!defined(BUILD_LITE_INTERPRETER)
// TLS for autocast dtypes
std::array<at::ScalarType, at::COMPILE_TIME_MAX_DEVICE_TYPES>
autocast_dtypes_;
autocast_dtypes_{};
#endif
friend class ThreadLocalStateGuard;

View File

@ -111,17 +111,6 @@ template <
typename E,
typename B = HostBlock<S>>
struct CachingHostAllocatorImpl {
CachingHostAllocatorImpl() {
// Launch the background thread and process events in a loop.
if (pinned_use_background_threads()) {
getBackgroundThreadPool()->run([&]() {
while (true) {
process_events();
std::this_thread::sleep_for(std::chrono::microseconds(100));
}
});
}
}
virtual ~CachingHostAllocatorImpl() = default;
public:
@ -155,6 +144,17 @@ struct CachingHostAllocatorImpl {
if (block) {
return {block->ptr_, reinterpret_cast<void*>(block)};
}
// Launch the background thread and process events in a loop.
static c10::once_flag background_thread_flag;
c10::call_once(background_thread_flag, [this] {
getBackgroundThreadPool()->run([&]() {
while (true) {
process_events();
std::this_thread::sleep_for(std::chrono::microseconds(100));
}
});
});
}
// Slow path: if we can't allocate from the cached free list, we need

View File

@ -13,8 +13,6 @@
#include <ATen/core/Array.h>
#include <c10/macros/Macros.h>
#include <c10/util/Exception.h>
#include <c10/util/Half.h>
#include <cmath>
#include <cstdint>

View File

@ -45,7 +45,7 @@ private:
c10::impl::LocalDispatchKeySet saved_;
};
void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
void pythonFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) {
TORCH_INTERNAL_ASSERT(tls_on_entry.has_value());
// c10::impl::ForceDispatchKeyGuard dispatcher_guard(tls_on_entry.value());
// StashTLSOnEntryGuard stash_guard;
@ -68,12 +68,20 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
// we actually run dispatch(), we will take out PyObjects in the context
// of that interpreter, and this will ensure that everyone is on the same
// interpreter.
bool tensors_with_python_key_present = false;
c10::impl::PyInterpreter* interpreter = nullptr;
for (const auto& ivalue : torch::jit::last(*stack, num_arguments)) {
if (ivalue.isTensor()) {
auto* interpreter = ivalue.unsafeToTensorImpl()->pyobj_slot()->pyobj_interpreter();
if (interpreter) {
(*interpreter)->dispatch(op, stack);
return;
auto* t = ivalue.unsafeToTensorImpl();
if (t->key_set().has(c10::DispatchKey::Python)) {
tensors_with_python_key_present = true;
}
if (!interpreter) {
auto* t_interpreter = t->pyobj_slot()->pyobj_interpreter();
if (t_interpreter) {
interpreter = t_interpreter;
}
}
} else if (ivalue.isTensorList() || ivalue.isOptionalTensorList()) {
// NB: use toListRef as it doesn't induce refcount bumps (toTensorListRef
@ -82,14 +90,43 @@ void pythonFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) {
if (nv.isNone()) {
continue;
}
auto* interpreter = nv.unsafeToTensorImpl()->pyobj_slot()->pyobj_interpreter();
if (interpreter) {
(*interpreter)->dispatch(op, stack);
return;
auto* t = nv.unsafeToTensorImpl();
if (t->key_set().has(c10::DispatchKey::Python)) {
tensors_with_python_key_present = true;
}
if (!interpreter) {
auto* t_interpreter = t->pyobj_slot()->pyobj_interpreter();
if (t_interpreter) {
interpreter = t_interpreter;
}
}
}
}
}
if (interpreter) {
if (tensors_with_python_key_present) {
(*interpreter)->dispatch(op, stack);
} else {
// At this point, there are no modes in the stack and no tensors with the Python key,
// so disable the Python key before redispatching.
// See https://github.com/pytorch/pytorch/issues/136565
c10::DispatchKeySet keyset = dispatch_keys.remove(c10::DispatchKey::Python);
// Remove Python key from the included set as well (modes add it there).
c10::impl::LocalDispatchKeySet local_keyset = c10::impl::tls_local_dispatch_key_set();
c10::impl::ForceDispatchKeyGuard no_python_guard(
local_keyset.included_.remove(c10::DispatchKey::Python),
local_keyset.excluded_
);
op.redispatchBoxed(keyset, stack);
}
return;
}
TORCH_INTERNAL_ASSERT(0, "Hit Python dispatch key but no arguments had PyInterpreter (no tensor args?)");
}
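
The rewritten fallback separates two questions the old code conflated: does any argument actually carry the Python dispatch key (so Python should handle the op), and is any interpreter attached at all (so there is somewhere to send it). Only when the key is present does it call into Python; otherwise the Python key is stripped from both the key set and the TLS include set and the op is redispatched, per the linked issue. The sketch below is a simplified standalone analogue of that control flow, not PyTorch's dispatcher API; all types and names are hypothetical:

    #include <cstdint>
    #include <vector>

    enum Key : uint32_t { kPython = 1u << 0, kCPU = 1u << 1 };

    struct Arg {
      uint32_t key_set;   // dispatch keys this argument participates in
      void* interpreter;  // non-null if an interpreter is attached to this argument
    };

    void dispatch_to_python(void*, const std::vector<Arg>&) { /* hand the op to Python */ }
    void redispatch(uint32_t, const std::vector<Arg>&) { /* continue with the reduced key set */ }

    void python_fallback(uint32_t dispatch_keys, const std::vector<Arg>& args) {
      bool python_key_present = false;
      void* interpreter = nullptr;
      for (const Arg& a : args) {
        if (a.key_set & kPython) python_key_present = true;
        if (!interpreter && a.interpreter) interpreter = a.interpreter;
      }
      if (interpreter == nullptr) {
        return;  // the real code asserts here: no tensor argument had an interpreter
      }
      if (python_key_present) {
        dispatch_to_python(interpreter, args);
      } else {
        // Nothing asked for Python handling: mask the key out and fall back to the regular path.
        redispatch(dispatch_keys & ~static_cast<uint32_t>(kPython), args);
      }
    }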

View File

@ -17,8 +17,22 @@ TORCH_SDT_DEFINE_SEMAPHORE(operator_end)
#endif
bool show_dispatch_trace() {
static char const* temp = getenv("TORCH_SHOW_DISPATCH_TRACE");
return temp != nullptr;
static auto envar = std::getenv("TORCH_SHOW_DISPATCH_TRACE");
if (envar) {
if (strcmp(envar, "0") == 0) {
return false;
}
if (strcmp(envar, "1") == 0) {
return true;
}
TORCH_WARN(
"ignoring invalid value for TORCH_SHOW_DISPATCH_TRACE: ",
envar,
" valid values are 0 or 1.");
}
return false;
}
static thread_local int64_t dispatch_trace_nesting_value_;
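
The replacement helper reads the environment variable once (the `static auto envar`) and then accepts only the literal strings "0" and "1", warning on anything else. A minimal standalone version of the same tri-state parse, defaulting to off, might be:

    #include <cstdio>
    #include <cstdlib>
    #include <cstring>

    // Reads an on/off environment variable; unset or invalid values mean "off".
    bool env_flag_enabled(const char* name) {
      const char* value = std::getenv(name);
      if (value == nullptr) return false;
      if (std::strcmp(value, "0") == 0) return false;
      if (std::strcmp(value, "1") == 0) return true;
      std::fprintf(stderr, "ignoring invalid value for %s: %s (valid values are 0 or 1)\n",
                   name, value);
      return false;
    }

A caller such as `static const bool show_trace = env_flag_enabled("TORCH_SHOW_DISPATCH_TRACE");` would cache the parsed result for the life of the process; the patched function caches only the getenv result and re-compares the string on each call.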

View File

@ -261,7 +261,7 @@ public:
Vectorized<double> nextafter(const Vectorized<double> &b) const {
USE_SLEEF(
{
return Vectorized<double>(Sleef_nextafterfx_sve(values, b));
return Vectorized<double>(Sleef_nextafterdx_sve(values, b));
},
{
__at_align__ double tmp[size()];
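
The fix here is a one-letter API mix-up: the double-precision SVE path had been calling the single-precision SLEEF entry point, and `Sleef_nextafterdx_sve` is the double variant. The non-SLEEF branch that follows falls back to the scalar routine; a minimal sketch of that fallback over plain buffers, using std::nextafter, is:

    #include <cmath>
    #include <cstddef>

    // Element-wise nextafter over two equally sized buffers (scalar fallback path).
    void nextafter_elementwise(const double* a, const double* b, double* out, std::size_t n) {
      for (std::size_t i = 0; i < n; ++i) {
        out[i] = std::nextafter(a[i], b[i]);
      }
    }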

View File

@ -208,8 +208,27 @@ struct VecConvert<
(is_reduced_floating_point_v<src_t> && is_8bit_integer_v<dst_t>),
void>> {
static inline VectorizedN<dst_t, 1> apply(const VectorizedN<src_t, 1>& src) {
VectorizedN<float, 1> tmp_fp32 = VecConvert<float, 1, src_t, 1>::apply(src);
return VecConvert<dst_t, 1, float, 1>::apply(tmp_fp32);
VectorizedN<float, 2> tmp_fp32 = VecConvert<float, 2, src_t, 1>::apply(src);
return VecConvert<dst_t, 1, float, 2>::apply(tmp_fp32);
}
};
template <typename dst_t>
struct VecConvert<
dst_t,
1,
float,
2,
typename std::enable_if_t<is_8bit_integer_v<dst_t>,
void>> {
static inline VectorizedN<dst_t, 1> apply(const VectorizedN<float, 2>& src) {
at::vec::Vectorized<dst_t> vec1 = convert_float_to_int8<dst_t>(src[0]);
at::vec::Vectorized<dst_t> vec2 = convert_float_to_int8<dst_t>(src[1]);
__m128 lane2 = _mm256_castps256_ps128(_mm256_castsi256_ps(vec2));
__m256 combined = _mm256_insertf128_ps(_mm256_castsi256_ps(vec1), lane2, 1);
// Shuffle bits [191:128] of combined into bits [127:64] of the result
__m256i result = _mm256_permute4x64_epi64(_mm256_castps_si256(combined), 0b11011000);
return at::vec::Vectorized<dst_t>(result);
}
};
@ -226,6 +245,25 @@ struct VecConvert<
}
};
template <typename src_t>
struct VecConvert<
float,
2,
src_t,
1,
typename std::enable_if_t<is_8bit_integer_v<src_t>,
void>> {
static inline VectorizedN<float, 2> apply(const VectorizedN<src_t, 1>& src) {
// Shuffle bits [127:64] of src[0] into bits [191:128] of shuffled
__m256i shuffled = _mm256_permute4x64_epi64(src[0], 0b11011000);
__m256i src2 = _mm256_castsi128_si256(
_mm_castps_si128(
_mm256_extractf128_ps(_mm256_castsi256_ps(shuffled), 1) // Extract the second 128-bit lane
)
);
return VectorizedN<float, 2>(convert_int8_to_float<src_t>(src[0]), convert_int8_to_float<src_t>(src2));
}
};
template <typename dst_t>
struct VecConvert<
@ -268,11 +306,10 @@ struct VecConvert<float, 1, BFloat16, 1> {
const VectorizedN<BFloat16, 1>& src) {
VectorizedN<float, 1> result;
uint16x8_t u16_8 = vld1q_u16(reinterpret_cast<const uint16_t*>(&src[0]));
int32x4_t shift = vdupq_n_s32(16);
auto u16_low1 = vget_low_u16(u16_8);
auto u16_high1 = vget_high_u16(u16_8);
float32x4_t f32x4_0 = vreinterpretq_f32_u32(vshlq_u32(vmovl_u16(u16_low1), shift));
float32x4_t f32x4_1 = vreinterpretq_f32_u32(vshlq_u32(vmovl_u16(u16_high1), shift));
float32x4_t f32x4_0 = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_low1), 16));
float32x4_t f32x4_1 = vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(u16_high1), 16));
result[0] = {f32x4_0, f32x4_1};
return result;
}
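
The NEON change replaces a vector shift (which needed a vdupq_n_s32(16) count register) with the immediate form vshlq_n_u32. The conversion itself relies on the fact that a bfloat16 value is the upper 16 bits of an IEEE-754 float32, so widening is a left shift by 16 followed by a bit reinterpretation. A scalar sketch of that widening:

    #include <cstdint>
    #include <cstring>

    // Widen one bfloat16 (given as its raw 16-bit pattern) to float32.
    inline float bf16_to_f32(uint16_t raw) {
      uint32_t bits = static_cast<uint32_t>(raw) << 16;  // bf16 occupies the high 16 bits of an f32
      float out;
      std::memcpy(&out, &bits, sizeof(out));             // bit-level reinterpretation
      return out;
    }

Using the `_n` form lets the shift count be encoded as an instruction immediate instead of first materializing a vector of 16s.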

View File

@ -216,27 +216,27 @@ public:
}
Vectorized<float> exp_u20() const {
// A faster version of exp with ULP=20
static __m256 vec_factorial_1 =
const __m256 vec_factorial_1 =
_mm256_set1_ps(0.999999701f); // 1/factorial(1)
static __m256 vec_factorial_2 =
const __m256 vec_factorial_2 =
_mm256_set1_ps(0.499991506f); // 1/factorial(2)
static __m256 vec_factorial_3 =
const __m256 vec_factorial_3 =
_mm256_set1_ps(0.166676521f); // 1/factorial(3)
static __m256 vec_factorial_4 =
const __m256 vec_factorial_4 =
_mm256_set1_ps(0.0418978221f); // 1/factorial(4)
static __m256 vec_factorial_5 =
const __m256 vec_factorial_5 =
_mm256_set1_ps(0.00828929059f); // 1/factorial(5)
static __m256 vec_exp_log2ef =
const __m256 vec_exp_log2ef =
_mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e)
static __m256 vec_half = _mm256_set1_ps(0.5f);
static __m256 vec_one = _mm256_set1_ps(1.f);
static __m256 vec_zero = _mm256_set1_ps(0.f);
static __m256 vec_two = _mm256_set1_ps(2.f);
static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2)
static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50));
static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218));
static __m256i vec_127 = _mm256_set1_epi32(0x0000007f);
static int n_mantissa_bits = 23;
const __m256 vec_half = _mm256_set1_ps(0.5f);
const __m256 vec_one = _mm256_set1_ps(1.f);
const __m256 vec_zero = _mm256_set1_ps(0.f);
const __m256 vec_two = _mm256_set1_ps(2.f);
const __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2)
const __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50));
const __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218));
const __m256i vec_127 = _mm256_set1_epi32(0x0000007f);
const int n_mantissa_bits = 23;
// exp(x) =
// = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem
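
The constants above feed the standard range-reduction scheme: write x = n·ln(2) + r, approximate exp(r) with the degree-5 polynomial whose 1/k! coefficients are listed, then scale by 2^n through the exponent field (hence vec_127 and the 23 mantissa bits). The switch from `static` to `const` matters because C++11 function-local statics are initialized thread-safely, so a non-constant-initialized static typically costs a guard-variable check on every call, while a plain `const` local built from _mm256_set1_ps is just a constant the compiler can fold or hoist. A small sketch of the difference, with a hypothetical helper:

    #include <immintrin.h>

    // Guarded: the first call initializes `half` under the magic-static guard,
    // and later calls typically still test the guard variable.
    __m256 scale_half_static(__m256 x) {
      static const __m256 half = _mm256_set1_ps(0.5f);
      return _mm256_mul_ps(x, half);
    }

    // Unguarded: `half` is an ordinary local; the broadcast folds to a constant load.
    __m256 scale_half_const(__m256 x) {
      const __m256 half = _mm256_set1_ps(0.5f);
      return _mm256_mul_ps(x, half);
    }

The same reasoning applies to the AVX-512 copy of exp_u20 further down.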

View File

@ -75,7 +75,7 @@ inline __m256i pack_saturate_and_clamp<int32_t>(
int32_t /*min_val*/,
int32_t /*max_val*/) {
// This function is for linkage only, will not be used
AT_ERROR("pack_saturate_and_clamp<int32_t> is not supported");
TORCH_CHECK(false, "pack_saturate_and_clamp<int32_t> is not supported");
}
template <>

View File

@ -209,8 +209,25 @@ struct VecConvert<
(is_reduced_floating_point_v<src_t> && is_8bit_integer_v<dst_t>),
void>> {
static inline VectorizedN<dst_t, 1> apply(const VectorizedN<src_t, 1>& src) {
VectorizedN<float, 1> tmp_fp32 = VecConvert<float, 1, src_t, 1>::apply(src);
return VecConvert<dst_t, 1, float, 1>::apply(tmp_fp32);
VectorizedN<float, 2> tmp_fp32 = VecConvert<float, 2, src_t, 1>::apply(src);
return VecConvert<dst_t, 1, float, 2>::apply(tmp_fp32);
}
};
template <typename dst_t>
struct VecConvert<
dst_t,
1,
float,
2,
typename std::enable_if_t<is_8bit_integer_v<dst_t>,
void>> {
static inline VectorizedN<dst_t, 1> apply(const VectorizedN<float, 2>& src) {
at::vec::Vectorized<dst_t> vec1 = convert_float_to_int8<dst_t>(src[0]);
at::vec::Vectorized<dst_t> vec2 = convert_float_to_int8<dst_t>(src[1]);
__m128 lane2 = _mm512_castps512_ps128(_mm512_castsi512_ps(vec2));
__m512 result = _mm512_insertf32x4(_mm512_castsi512_ps(vec1), lane2, 1); // Insert lane2 into the second 128-bit lane
return at::vec::Vectorized<dst_t>(_mm512_castps_si512(result));
}
};
@ -227,6 +244,24 @@ struct VecConvert<
}
};
template <typename src_t>
struct VecConvert<
float,
2,
src_t,
1,
typename std::enable_if_t<is_8bit_integer_v<src_t>,
void>> {
static inline VectorizedN<float, 2> apply(const VectorizedN<src_t, 1>& src) {
__m512i src2 = _mm512_castsi128_si512(
_mm_castps_si128(
_mm512_extractf32x4_ps(_mm512_castsi512_ps(src[0]), 1) // Extract the second 128-bit lane
)
);
return VectorizedN<float, 2>(convert_int8_to_float<src_t>(src[0]), convert_int8_to_float<src_t>(src2));
}
};
template <typename src_t>
struct VecConvert<
float,

View File

@ -236,27 +236,27 @@ public:
}
Vectorized<float> exp_u20() const {
// A faster version of exp with ULP=20
static __m512 vec_factorial_1 =
const __m512 vec_factorial_1 =
_mm512_set1_ps(0.999999701f); // 1/factorial(1)
static __m512 vec_factorial_2 =
const __m512 vec_factorial_2 =
_mm512_set1_ps(0.499991506f); // 1/factorial(2)
static __m512 vec_factorial_3 =
const __m512 vec_factorial_3 =
_mm512_set1_ps(0.166676521f); // 1/factorial(3)
static __m512 vec_factorial_4 =
const __m512 vec_factorial_4 =
_mm512_set1_ps(0.0418978221f); // 1/factorial(4)
static __m512 vec_factorial_5 =
const __m512 vec_factorial_5 =
_mm512_set1_ps(0.00828929059f); // 1/factorial(5)
static __m512 vec_exp_log2ef =
const __m512 vec_exp_log2ef =
_mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e)
static __m512 vec_half = _mm512_set1_ps(0.5f);
static __m512 vec_one = _mm512_set1_ps(1.f);
static __m512 vec_zero = _mm512_set1_ps(0.f);
static __m512 vec_two = _mm512_set1_ps(2.f);
static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2)
static __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
static __m512i vec_127 = _mm512_set1_epi32(0x0000007f);
static int n_mantissa_bits = 23;
const __m512 vec_half = _mm512_set1_ps(0.5f);
const __m512 vec_one = _mm512_set1_ps(1.f);
const __m512 vec_zero = _mm512_set1_ps(0.f);
const __m512 vec_two = _mm512_set1_ps(2.f);
const __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2)
const __m512 vec_ln_flt_min = _mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50));
const __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218));
const __m512i vec_127 = _mm512_set1_epi32(0x0000007f);
const int n_mantissa_bits = 23;
// exp(x) =
// = exp(n * ln(2) + r) // divide x by ln(2) and get quot and rem

View File

@ -77,7 +77,7 @@ inline __m512i pack_saturate_and_clamp<int32_t>(
int32_t min_val [[maybe_unused]],
int32_t max_val [[maybe_unused]]) {
// This function is for linkage only, will not be used
AT_ERROR("pack_saturate_and_clamp<int32_t> is not supported");
TORCH_CHECK(false, "pack_saturate_and_clamp<int32_t> is not supported");
return __m512i{};
}

View File

@ -125,7 +125,7 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt
// due to the capture status being updated _after_ a capture had already started.
c10::cuda::CUDACachingAllocator::beginAllocateToPool(capture_dev_, mempool_id_, [this](cudaStream_t stream) {
cudaStreamCaptureStatus status;
CaptureId_t stream_capture_id;
CaptureId_t stream_capture_id = 0;
AT_CUDA_CHECK(cudaStreamGetCaptureInfo(stream, &status, &stream_capture_id));
return status == cudaStreamCaptureStatus::cudaStreamCaptureStatusActive && stream_capture_id == capture_id_;
});
@ -160,7 +160,7 @@ void CUDAGraph::capture_end() {
c10::cuda::CUDACachingAllocator::endAllocateToPool(capture_dev_, mempool_id_);
TORCH_CHECK(graph_ != NULL, "Invalid capture.");
TORCH_CHECK(graph_ != nullptr, "Invalid capture.");
has_graph_ = true;
// In typical graph usage some tensors (e.g. the tensors used for graph IO) are not freed
@ -175,7 +175,7 @@ void CUDAGraph::capture_end() {
// cudaGraphInstantiateWithFlags
// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH_1ga2c652a24ba93e52b99a47bec0888233
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11040)
int version;
int version = 0;
AT_CUDA_CHECK(cudaDriverGetVersion(&version));
if (version < 11040) {
#endif
@ -203,7 +203,7 @@ void CUDAGraph::capture_end() {
}
size_t numCUDAGraphNodes = 0;
AT_CUDA_CHECK(cudaGraphGetNodes(graph_, NULL, &numCUDAGraphNodes));
AT_CUDA_CHECK(cudaGraphGetNodes(graph_, nullptr, &numCUDAGraphNodes));
if (numCUDAGraphNodes == 0) {
TORCH_WARN("The CUDA Graph is empty. This usually means that the graph was ",
"attempted to be captured on wrong device or stream.");
@ -233,7 +233,7 @@ void CUDAGraph::replay() {
// graph_exec_ may be replayed in any stream.
AT_CUDA_CHECK(cudaGraphLaunch(graph_exec_, at::cuda::getCurrentCUDAStream()));
int version;
int version = 0;
AT_CUDA_CHECK(cudaDriverGetVersion(&version));
if (version < 11040) {
// Workaround for bug in libcuda.so that causes replayed graphs with
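
The remaining edits in this file initialize out-parameters (`version`, `stream_capture_id`) before handing them to the CUDA runtime and replace NULL with nullptr. Initializing an out-parameter keeps its value well defined even on an error path where the API never writes it, which is what static analyzers flag. A generic sketch of the pattern with a hypothetical C-style query (query_version is a stub, not a CUDA API):

    #include <cstdio>

    // Stub standing in for a C-style query that may fail before writing its result.
    int query_version(int* out) { *out = 12040; return 0; }

    void report_version() {
      int version = 0;  // well defined even if query_version fails before writing
      if (query_version(&version) != 0) {
        std::fprintf(stderr, "query failed, version stays 0\n");
        return;
      }
      std::printf("driver version: %d\n", version);
    }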

Some files were not shown because too many files have changed in this diff.