Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-01 04:54:55 +08:00)
Compare commits: cslpull90...yiming/boo (2 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | b9eb47372e |  |
|  | 747a6b8230 |  |
| @ -1,5 +1,5 @@ | ||||
| 0.7b | ||||
| 0.6b | ||||
| manylinux_2_17 | ||||
| rocm6.2 | ||||
| 9be04068c3c0857a4cfd17d7e39e71d0423ebac2 | ||||
| 3e9e1959d23b93d78a08fcc5f868125dc3854dece32fd9458be9ef4467982291 | ||||
| rocm6.1 | ||||
| 7f07e8a1cb1f99627eb6d77f5c0e9295c775f3c7 | ||||
| 77c29fa3f3b614e187d7213d745e989a92708cee2bc6020419ab49019af399d1 | ||||
|  | ||||
| @ -92,7 +92,7 @@ _UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b | ||||
| # from scratch | ||||
| case "$image" in | ||||
|   pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9) | ||||
|     CUDA_VERSION=12.4.1 | ||||
|     CUDA_VERSION=12.4.0 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=9 | ||||
| @ -120,7 +120,7 @@ case "$image" in | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.4.1 | ||||
|     CUDA_VERSION=12.4.0 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=9 | ||||
| @ -165,7 +165,7 @@ case "$image" in | ||||
|     INDUCTOR_BENCHMARKS=yes | ||||
|     ;; | ||||
|   pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks) | ||||
|     CUDA_VERSION=12.4.1 | ||||
|     CUDA_VERSION=12.4.0 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.12 | ||||
|     GCC_VERSION=9 | ||||
| @ -194,7 +194,7 @@ case "$image" in | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9) | ||||
|     CUDA_VERSION=12.4.1 | ||||
|     CUDA_VERSION=12.4.0 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=9 | ||||
| @ -222,7 +222,7 @@ case "$image" in | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9) | ||||
|     CUDA_VERSION=12.4.1 | ||||
|     CUDA_VERSION=12.4.0 | ||||
|     CUDNN_VERSION=9 | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=9 | ||||
| @ -236,7 +236,7 @@ case "$image" in | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-focal-py3-clang10-onnx) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     ANACONDA_PYTHON_VERSION=3.8 | ||||
|     CLANG_VERSION=10 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
| @ -245,7 +245,7 @@ case "$image" in | ||||
|     ONNX=yes | ||||
|     ;; | ||||
|   pytorch-linux-focal-py3-clang9-android-ndk-r21e) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     ANACONDA_PYTHON_VERSION=3.8 | ||||
|     CLANG_VERSION=9 | ||||
|     LLVMDEV=yes | ||||
|     PROTOBUF=yes | ||||
| @ -254,8 +254,8 @@ case "$image" in | ||||
|     GRADLE_VERSION=6.8.3 | ||||
|     NINJA_VERSION=1.9.0 | ||||
|     ;; | ||||
|   pytorch-linux-focal-py3.9-clang10) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|   pytorch-linux-focal-py3.8-clang10) | ||||
|     ANACONDA_PYTHON_VERSION=3.8 | ||||
|     CLANG_VERSION=10 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
| @ -276,8 +276,8 @@ case "$image" in | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-focal-py3.9-gcc9) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|   pytorch-linux-focal-py3.8-gcc9) | ||||
|     ANACONDA_PYTHON_VERSION=3.8 | ||||
|     GCC_VERSION=9 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
| @ -286,7 +286,18 @@ case "$image" in | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-focal-rocm-n-1-py3) | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     ANACONDA_PYTHON_VERSION=3.8 | ||||
|     GCC_VERSION=9 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     ROCM_VERSION=6.0 | ||||
|     NINJA_VERSION=1.9.0 | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-focal-rocm-n-py3) | ||||
|     ANACONDA_PYTHON_VERSION=3.8 | ||||
|     GCC_VERSION=9 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
| @ -296,19 +307,8 @@ case "$image" in | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-focal-rocm-n-py3) | ||||
|     ANACONDA_PYTHON_VERSION=3.10 | ||||
|     GCC_VERSION=9 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
|     VISION=yes | ||||
|     ROCM_VERSION=6.2 | ||||
|     NINJA_VERSION=1.9.0 | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-xpu-2024.0-py3) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     ANACONDA_PYTHON_VERSION=3.8 | ||||
|     GCC_VERSION=11 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
| @ -318,8 +318,8 @@ case "$image" in | ||||
|     CONDA_CMAKE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|     pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|     pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks) | ||||
|     ANACONDA_PYTHON_VERSION=3.8 | ||||
|     GCC_VERSION=11 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
| @ -330,8 +330,8 @@ case "$image" in | ||||
|     DOCS=yes | ||||
|     INDUCTOR_BENCHMARKS=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-clang12) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|   pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12) | ||||
|     ANACONDA_PYTHON_VERSION=3.8 | ||||
|     CUDA_VERSION=11.8 | ||||
|     CUDNN_VERSION=9 | ||||
|     CLANG_VERSION=12 | ||||
| @ -355,8 +355,8 @@ case "$image" in | ||||
|     CONDA_CMAKE=yes | ||||
|     VISION=yes | ||||
|     ;; | ||||
|   pytorch-linux-jammy-py3.9-gcc11) | ||||
|     ANACONDA_PYTHON_VERSION=3.9 | ||||
|   pytorch-linux-jammy-py3.8-gcc11) | ||||
|     ANACONDA_PYTHON_VERSION=3.8 | ||||
|     GCC_VERSION=11 | ||||
|     PROTOBUF=yes | ||||
|     DB=yes | ||||
| @ -379,7 +379,6 @@ case "$image" in | ||||
|     GCC_VERSION=11 | ||||
|     CONDA_CMAKE=yes | ||||
|     HALIDE=yes | ||||
|     TRITON=yes | ||||
|     ;; | ||||
|   pytorch-linux-focal-linter) | ||||
|     # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627. | ||||
|  | ||||
| @ -108,10 +108,10 @@ ENV CMAKE_C_COMPILER cc | ||||
| ENV CMAKE_CXX_COMPILER c++ | ||||
| COPY ./common/install_triton.sh install_triton.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ci_commit_pins/triton.txt triton.txt | ||||
| COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt | ||||
| COPY triton_version.txt triton_version.txt | ||||
| RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi | ||||
| RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt | ||||
| RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt | ||||
|  | ||||
| # Install AOTriton (Early fail) | ||||
| COPY ./aotriton_version.txt aotriton_version.txt | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| cd1c833b079adb324871dcbbe75b43d42ffc0ade | ||||
| 91298923a0076c1b41059efb6dad2876426e4b03 | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| 461c12871f336fe6f57b55d6a297f13ef209161b | ||||
| 340136fec6d3ebc73e7a19eba1663e9b0ba8ab2d | ||||
| @ -1 +1 @@ | ||||
| ac3470188b914c5d7a5058a7e28b9eb685a62427 | ||||
| 730b907b4d45a4713cbc425cbf224c46089fd514 | ||||
|  | ||||

.ci/docker/ci_commit_pins/triton-rocm.txt (new file, 1 line)
							| @ -0,0 +1 @@ | ||||
| 21eae954efa5bf584da70324b640288c3ee7aede | ||||
| @ -1 +1 @@ | ||||
| 91b14bf5593cf58a8541f3e6b9125600a867d4ef | ||||
| 1b2f15840e0d70eec50d84c7a0575cb835524def | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| 5fe38ffd73c2ac6ed6323b554205186696631c6f | ||||
| dedb7bdf339a3546896d4820366ca562c586bfa0 | ||||
|  | ||||

.ci/docker/common/aotriton_version.txt (new file, 5 lines)
							| @ -0,0 +1,5 @@ | ||||
| 0.6b | ||||
| manylinux_2_17 | ||||
| rocm6.1 | ||||
| 04b5df8c8123f90cba3ede7e971e6fbc6040d506 | ||||
| 77c29fa3f3b614e187d7213d745e989a92708cee2bc6020419ab49019af399d1 | ||||
| @ -4,12 +4,12 @@ set -ex | ||||
|  | ||||
| source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh" | ||||
|  | ||||
| TARBALL='aotriton.tar.gz' | ||||
| TARBALL='aotriton.tar.bz2' | ||||
| # This read command always returns with exit code 1 | ||||
| read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true | ||||
| ARCH=$(uname -m) | ||||
| AOTRITON_INSTALL_PREFIX="$1" | ||||
| AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.gz" | ||||
| AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.bz2" | ||||
|  | ||||
| cd "${AOTRITON_INSTALL_PREFIX}" | ||||
| # Must use -L to follow redirects | ||||
|  | ||||
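
The `|| true` in the hunk above works around a bash quirk that its comment alludes to. A minimal standalone sketch of the idiom, with made-up file contents:

```bash
#!/bin/bash
set -e
# The delimiter passed to read is the first character of the two-character
# string \n, i.e. a backslash. The input contains no backslash, so read
# consumes everything up to EOF, splits it on IFS into the variables, and
# still exits 1. "|| true" keeps that nonzero status from killing the
# script under `set -e`.
printf '0.6b\nmanylinux_2_17\nrocm6.1\n' > /tmp/version.txt
read -d "\n" VER MANYLINUX ROCMBASE < /tmp/version.txt || true
echo "VER=${VER} MANYLINUX=${MANYLINUX} ROCMBASE=${ROCMBASE}"
```
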
| @ -5,22 +5,32 @@ set -ex | ||||
| # Optionally install conda | ||||
| if [ -n "$ANACONDA_PYTHON_VERSION" ]; then | ||||
|   BASE_URL="https://repo.anaconda.com/miniconda" | ||||
|   CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh" | ||||
|   if [[ $(uname -m) == "aarch64" ]] || [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then | ||||
|     BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download" | ||||
|     CONDA_FILE="Miniforge3-Linux-$(uname -m).sh" | ||||
|   fi | ||||
|  | ||||
|   MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1) | ||||
|   MINOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 2) | ||||
|  | ||||
| if [[ $(uname -m) == "aarch64" ]]; then | ||||
|   BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download" | ||||
|   case "$MAJOR_PYTHON_VERSION" in | ||||
|     3);; | ||||
|     3) | ||||
|       CONDA_FILE="Miniforge3-Linux-aarch64.sh" | ||||
|     ;; | ||||
|     *) | ||||
|       echo "Unsupported ANACONDA_PYTHON_VERSION: $ANACONDA_PYTHON_VERSION" | ||||
|       exit 1 | ||||
|       ;; | ||||
|   esac | ||||
| else | ||||
|   case "$MAJOR_PYTHON_VERSION" in | ||||
|     3) | ||||
|       CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh" | ||||
|     ;; | ||||
|     *) | ||||
|       echo "Unsupported ANACONDA_PYTHON_VERSION: $ANACONDA_PYTHON_VERSION" | ||||
|       exit 1 | ||||
|       ;; | ||||
|   esac | ||||
| fi | ||||
|  | ||||
|   mkdir -p /opt/conda | ||||
|   chown jenkins:jenkins /opt/conda | ||||
| @ -68,20 +78,19 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then | ||||
|     CONDA_COMMON_DEPS="astunparse pyyaml setuptools openblas==0.3.25=*openmp* ninja==1.11.1 scons==4.5.2" | ||||
|  | ||||
|     if [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then | ||||
|       NUMPY_VERSION=1.24.4 | ||||
|       conda_install numpy=1.24.4 ${CONDA_COMMON_DEPS} | ||||
|     else | ||||
|       NUMPY_VERSION=1.26.2 | ||||
|       conda_install numpy=1.26.2 ${CONDA_COMMON_DEPS} | ||||
|     fi | ||||
|   else | ||||
|     CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools" | ||||
|  | ||||
|     if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.13" ]; then | ||||
|       NUMPY_VERSION=1.26.0 | ||||
|       conda_install numpy=1.26.0 ${CONDA_COMMON_DEPS} | ||||
|     else | ||||
|       NUMPY_VERSION=1.21.2 | ||||
|       conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS} | ||||
|     fi | ||||
|   fi | ||||
|   conda_install ${CONDA_COMMON_DEPS} | ||||
|  | ||||
|   # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source | ||||
|   # and libpython-static for torch deploy | ||||
| @ -103,7 +112,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then | ||||
|  | ||||
|   # Install some other packages, including those needed for Python test reporting | ||||
|   pip_install -r /opt/conda/requirements-ci.txt | ||||
|   pip_install numpy=="$NUMPY_VERSION" | ||||
|  | ||||
|   pip_install -U scikit-learn | ||||
|  | ||||
|   if [ -n "$DOCS" ]; then | ||||
|  | ||||
| @ -7,7 +7,7 @@ PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/hea | ||||
| GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py | ||||
|  | ||||
| # Python versions to be installed in /opt/$VERSION_NO | ||||
| CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"} | ||||
| CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.8.1 3.9.0 3.10.1 3.11.0 3.12.0 3.13.0"} | ||||
|  | ||||
| function check_var { | ||||
|     if [ -z "$1" ]; then | ||||
| @ -22,13 +22,6 @@ function do_cpython_build { | ||||
|     check_var $py_ver | ||||
|     check_var $py_folder | ||||
|     tar -xzf Python-$py_ver.tgz | ||||
|  | ||||
|     local additional_flags="" | ||||
|     if [ "$py_ver" == "3.13.0t" ]; then | ||||
|         additional_flags=" --disable-gil" | ||||
|         mv cpython-3.13/ cpython-3.13t/ | ||||
|     fi | ||||
|  | ||||
|     pushd $py_folder | ||||
|  | ||||
|     local prefix="/opt/_internal/cpython-${py_ver}" | ||||
| @ -44,10 +37,8 @@ function do_cpython_build { | ||||
|         local openssl_flags="--with-openssl=${WITH_OPENSSL} --with-openssl-rpath=auto" | ||||
|     fi | ||||
|  | ||||
|  | ||||
|  | ||||
|     # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6 | ||||
|     CFLAGS="-Wformat" ./configure --prefix=${prefix} ${openssl_flags} ${shared_flags} ${additional_flags} > /dev/null | ||||
|     CFLAGS="-Wformat" ./configure --prefix=${prefix} ${openssl_flags} ${shared_flags} > /dev/null | ||||
|  | ||||
|     make -j40 > /dev/null | ||||
|     make install > /dev/null | ||||
| @ -67,8 +58,7 @@ function do_cpython_build { | ||||
|     if [ -e ${prefix}/bin/pip3 ] && [ ! -e ${prefix}/bin/pip ]; then | ||||
|         ln -s pip3 ${prefix}/bin/pip | ||||
|     fi | ||||
|     # install setuptools since python 3.12 no longer bundles distutils | ||||
|     ${prefix}/bin/pip install wheel==0.34.2 setuptools==68.2.2 | ||||
|     ${prefix}/bin/pip install wheel==0.34.2 | ||||
|     local abi_tag=$(${prefix}/bin/python -c "from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag; print('{0}{1}-{2}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))") | ||||
|     ln -s ${prefix} /opt/python/${abi_tag} | ||||
| } | ||||
| @ -78,14 +68,7 @@ function build_cpython { | ||||
|     check_var $py_ver | ||||
|     check_var $PYTHON_DOWNLOAD_URL | ||||
|     local py_ver_folder=$py_ver | ||||
|  | ||||
|     if [ "$py_ver" = "3.13.0t" ]; then | ||||
|         PY_VER_SHORT="3.13" | ||||
|         PYT_VER_SHORT="3.13t" | ||||
|         check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH | ||||
|         wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz | ||||
|         do_cpython_build $py_ver cpython-$PYT_VER_SHORT | ||||
|     elif [ "$py_ver" = "3.13.0" ]; then | ||||
|     if [ "$py_ver" = "3.13.0" ]; then | ||||
|         PY_VER_SHORT="3.13" | ||||
|         check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH | ||||
|         wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz | ||||
|  | ||||
| @ -27,17 +27,6 @@ function install_cusparselt_052 { | ||||
|     rm -rf tmp_cusparselt | ||||
| } | ||||
|  | ||||
| function install_cusparselt_062 { | ||||
|     # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html | ||||
|     mkdir tmp_cusparselt && pushd tmp_cusparselt | ||||
|     wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz | ||||
|     tar xf libcusparse_lt-linux-x86_64-0.6.2.3-archive.tar.xz | ||||
|     cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/include/* /usr/local/cuda/include/ | ||||
|     cp -a libcusparse_lt-linux-x86_64-0.6.2.3-archive/lib/* /usr/local/cuda/lib64/ | ||||
|     popd | ||||
|     rm -rf tmp_cusparselt | ||||
| } | ||||
|  | ||||
| function install_118 { | ||||
|     echo "Installing CUDA 11.8 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.4.0" | ||||
|     rm -rf /usr/local/cuda-11.8 /usr/local/cuda | ||||
| @ -105,13 +94,13 @@ function install_121 { | ||||
| } | ||||
|  | ||||
| function install_124 { | ||||
|   echo "Installing CUDA 12.4.1 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2" | ||||
|   echo "Installing CUDA 12.4 and cuDNN ${CUDNN_VERSION} and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2" | ||||
|   rm -rf /usr/local/cuda-12.4 /usr/local/cuda | ||||
|   # install CUDA 12.4.1 in the same container | ||||
|   wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux.run | ||||
|   chmod +x cuda_12.4.1_550.54.15_linux.run | ||||
|   ./cuda_12.4.1_550.54.15_linux.run --toolkit --silent | ||||
|   rm -f cuda_12.4.1_550.54.15_linux.run | ||||
|   # install CUDA 12.4.0 in the same container | ||||
|   wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux.run | ||||
|   chmod +x cuda_12.4.0_550.54.14_linux.run | ||||
|   ./cuda_12.4.0_550.54.14_linux.run --toolkit --silent | ||||
|   rm -f cuda_12.4.0_550.54.14_linux.run | ||||
|   rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda | ||||
|  | ||||
|   # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement | ||||
| @ -132,7 +121,7 @@ function install_124 { | ||||
|   cd .. | ||||
|   rm -rf nccl | ||||
|  | ||||
|   install_cusparselt_062 | ||||
|   install_cusparselt_052 | ||||
|  | ||||
|   ldconfig | ||||
| } | ||||
|  | ||||
| @ -17,13 +17,13 @@ function install_cusparselt_052 { | ||||
| } | ||||
|  | ||||
| function install_124 { | ||||
|   echo "Installing CUDA 12.4.1 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2" | ||||
|   echo "Installing CUDA 12.4 and cuDNN 9.1 and NCCL ${NCCL_VERSION} and cuSparseLt-0.5.2" | ||||
|   rm -rf /usr/local/cuda-12.4 /usr/local/cuda | ||||
|   # install CUDA 12.4.1 in the same container | ||||
|   wget -q https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda_12.4.1_550.54.15_linux_sbsa.run | ||||
|   chmod +x cuda_12.4.1_550.54.15_linux_sbsa.run | ||||
|   ./cuda_12.4.1_550.54.15_linux_sbsa.run --toolkit --silent | ||||
|   rm -f cuda_12.4.1_550.54.15_linux_sbsa.run | ||||
|   # install CUDA 12.4.0 in the same container | ||||
|   wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux_sbsa.run | ||||
|   chmod +x cuda_12.4.0_550.54.14_linux_sbsa.run | ||||
|   ./cuda_12.4.0_550.54.14_linux_sbsa.run --toolkit --silent | ||||
|   rm -f cuda_12.4.0_550.54.14_linux_sbsa.run | ||||
|   rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda | ||||
|  | ||||
|   # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement | ||||
|  | ||||
| @ -1,25 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| # cudss license: https://docs.nvidia.com/cuda/cudss/license.html | ||||
| mkdir tmp_cudss && cd tmp_cudss | ||||
|  | ||||
| if [[ ${CUDA_VERSION:0:4} =~ ^12\.[1-4]$ ]]; then | ||||
|     arch_path='sbsa' | ||||
|     export TARGETARCH=${TARGETARCH:-$(uname -m)} | ||||
|     if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then | ||||
|         arch_path='x86_64' | ||||
|     fi | ||||
|     CUDSS_NAME="libcudss-linux-${arch_path}-0.3.0.9_cuda12-archive" | ||||
|     curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudss/redist/libcudss/linux-${arch_path}/${CUDSS_NAME}.tar.xz | ||||
|  | ||||
|     # only for cuda 12 | ||||
|     tar xf ${CUDSS_NAME}.tar.xz | ||||
|     cp -a ${CUDSS_NAME}/include/* /usr/local/cuda/include/ | ||||
|     cp -a ${CUDSS_NAME}/lib/* /usr/local/cuda/lib64/ | ||||
| fi | ||||
|  | ||||
| cd .. | ||||
| rm -rf tmp_cudss | ||||
| ldconfig | ||||
| @ -5,15 +5,7 @@ set -ex | ||||
| # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html | ||||
| mkdir tmp_cusparselt && cd tmp_cusparselt | ||||
|  | ||||
| if [[ ${CUDA_VERSION:0:4} =~ ^12\.[2-6]$ ]]; then | ||||
|     arch_path='sbsa' | ||||
|     export TARGETARCH=${TARGETARCH:-$(uname -m)} | ||||
|     if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then | ||||
|         arch_path='x86_64' | ||||
|     fi | ||||
|     CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.6.2.3-archive" | ||||
|     curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz | ||||
| elif [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then | ||||
| if [[ ${CUDA_VERSION:0:4} =~ ^12\.[1-4]$ ]]; then | ||||
|     arch_path='sbsa' | ||||
|     export TARGETARCH=${TARGETARCH:-$(uname -m)} | ||||
|     if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then | ||||
|  | ||||
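
The guard above pairs a bash substring expansion with a regex match. A small sketch of how the gate behaves (the loop values are illustrative only):

```bash
#!/bin/bash
# ${CUDA_VERSION:0:4} keeps the first four characters ("12.4.1" -> "12.4");
# the regex then admits only minor versions 12.1 through 12.4.
for CUDA_VERSION in 11.8 12.1 12.4.1 12.6.0; do
    if [[ ${CUDA_VERSION:0:4} =~ ^12\.[1-4]$ ]]; then
        echo "${CUDA_VERSION}: takes the cuSPARSELt branch"
    else
        echo "${CUDA_VERSION}: falls through"
    fi
done
```
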
| @ -10,21 +10,6 @@ if [[ -z $ROCM_VERSION ]]; then | ||||
|     exit 1; | ||||
| fi | ||||
|  | ||||
| IS_UBUNTU=0 | ||||
| ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') | ||||
| case "$ID" in | ||||
|   ubuntu) | ||||
|     IS_UBUNTU=1 | ||||
|     ;; | ||||
|   centos) | ||||
|     IS_UBUNTU=0 | ||||
|     ;; | ||||
|   *) | ||||
|     echo "Unable to determine OS..." | ||||
|     exit 1 | ||||
|     ;; | ||||
| esac | ||||
|  | ||||
| # To make version comparison easier, create an integer representation. | ||||
| save_IFS="$IFS" | ||||
| IFS=. ROCM_VERSION_ARRAY=(${ROCM_VERSION}) | ||||
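
The `IFS=.` line above splits the dotted version into an array so the script can fold it into a single integer (`ROCM_INT`) for the `-ge`/`-lt` checks in the next hunk. A sketch of the idea; the exact arithmetic in install_miopen.sh may differ:

```bash
#!/bin/bash
ROCM_VERSION="6.2.1"
save_IFS="$IFS"
IFS=. ROCM_VERSION_ARRAY=(${ROCM_VERSION})   # ("6" "2" "1")
IFS="$save_IFS"
MAJOR=${ROCM_VERSION_ARRAY[0]:-0}
MINOR=${ROCM_VERSION_ARRAY[1]:-0}
PATCH=${ROCM_VERSION_ARRAY[2]:-0}
ROCM_INT=$((MAJOR * 10000 + MINOR * 100 + PATCH))
echo "$ROCM_INT"   # 60201, so "is 6.2.1 >= 6.2?" becomes 60201 -ge 60200
```
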
| @ -72,12 +57,7 @@ MIOPEN_CMAKE_COMMON_FLAGS=" | ||||
| -DMIOPEN_BUILD_DRIVER=OFF | ||||
| " | ||||
| # Pull MIOpen repo and set DMIOPEN_EMBED_DB based on ROCm version | ||||
| if [[ $ROCM_INT -ge 60300 ]]; then | ||||
|     echo "ROCm 6.3+ MIOpen does not need any patches, do not build from source" | ||||
|     exit 0 | ||||
| elif [[ $ROCM_INT -ge 60200 ]] && [[ $ROCM_INT -lt 60300 ]]; then | ||||
|     MIOPEN_BRANCH="release/rocm-rel-6.2-staging" | ||||
| elif [[ $ROCM_INT -ge 60100 ]] && [[ $ROCM_INT -lt 60200 ]]; then | ||||
| if [[ $ROCM_INT -ge 60100 ]] && [[ $ROCM_INT -lt 60200 ]]; then | ||||
|     echo "ROCm 6.1 MIOpen does not need any patches, do not build from source" | ||||
|     exit 0 | ||||
| elif [[ $ROCM_INT -ge 60000 ]] && [[ $ROCM_INT -lt 60100 ]]; then | ||||
| @ -110,21 +90,12 @@ else | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
|  | ||||
| if [[ ${IS_UBUNTU} == 1 ]]; then | ||||
|   apt-get remove -y miopen-hip | ||||
| else | ||||
|   yum remove -y miopen-hip | ||||
| fi | ||||
| yum remove -y miopen-hip | ||||
|  | ||||
| git clone https://github.com/ROCm/MIOpen -b ${MIOPEN_BRANCH} | ||||
| pushd MIOpen | ||||
| # remove .git to save disk space since CI runner was running out | ||||
| rm -rf .git | ||||
| # Don't build CK to save docker build time | ||||
| if [[ $ROCM_INT -ge 60200 ]]; then | ||||
|     sed -i '/composable_kernel/d' requirements.txt | ||||
| fi | ||||
| # Don't build MLIR to save docker build time | ||||
| # since we are disabling MLIR backend for MIOpen anyway | ||||
| if [[ $ROCM_INT -ge 50400 ]] && [[ $ROCM_INT -lt 50500 ]]; then | ||||
| @ -137,15 +108,10 @@ cmake -P install_deps.cmake --minimum | ||||
|  | ||||
| # clean up since CI runner was running out of disk space | ||||
| rm -rf /tmp/* | ||||
| if [[ ${IS_UBUNTU} == 1 ]]; then | ||||
|   apt-get autoclean && apt-get clean | ||||
|   rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* | ||||
| else | ||||
|   yum clean all | ||||
|   rm -rf /var/cache/yum | ||||
|   rm -rf /var/lib/yum/yumdb | ||||
|   rm -rf /var/lib/yum/history | ||||
| fi | ||||
| yum clean all | ||||
| rm -rf /var/cache/yum | ||||
| rm -rf /var/lib/yum/yumdb | ||||
| rm -rf /var/lib/yum/history | ||||
|  | ||||
| ## Build MIOpen | ||||
| mkdir -p build | ||||
| @ -162,11 +128,7 @@ make -j $(nproc) package | ||||
| # clean up since CI runner was running out of disk space | ||||
| rm -rf /usr/local/cget | ||||
|  | ||||
| if [[ ${IS_UBUNTU} == 1 ]]; then | ||||
|   sudo dpkg -i miopen-hip*.deb | ||||
| else | ||||
|   yum install -y miopen-*.rpm | ||||
| fi | ||||
| yum install -y miopen-*.rpm | ||||
|  | ||||
| popd | ||||
| rm -rf MIOpen | ||||
|  | ||||
| @ -1,20 +0,0 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| function install_nvpl { | ||||
|  | ||||
|     mkdir -p /opt/nvpl/lib /opt/nvpl/include | ||||
|  | ||||
|     wget https://developer.download.nvidia.com/compute/nvpl/redist/nvpl_blas/linux-sbsa/nvpl_blas-linux-sbsa-0.3.0-archive.tar.xz | ||||
|     tar xf nvpl_blas-linux-sbsa-0.3.0-archive.tar.xz | ||||
|     cp -r nvpl_blas-linux-sbsa-0.3.0-archive/lib/* /opt/nvpl/lib/ | ||||
|     cp -r nvpl_blas-linux-sbsa-0.3.0-archive/include/* /opt/nvpl/include/ | ||||
|  | ||||
|     wget https://developer.download.nvidia.com/compute/nvpl/redist/nvpl_lapack/linux-sbsa/nvpl_lapack-linux-sbsa-0.2.3.1-archive.tar.xz | ||||
|     tar xf nvpl_lapack-linux-sbsa-0.2.3.1-archive.tar.xz | ||||
|     cp -r nvpl_lapack-linux-sbsa-0.2.3.1-archive/lib/* /opt/nvpl/lib/ | ||||
|     cp -r nvpl_lapack-linux-sbsa-0.2.3.1-archive/include/* /opt/nvpl/include/ | ||||
| } | ||||
|  | ||||
| install_nvpl | ||||
| @ -15,7 +15,7 @@ pip_install \ | ||||
|   flatbuffers==2.0 \ | ||||
|   mock==5.0.1 \ | ||||
|   ninja==1.10.2 \ | ||||
|   networkx==2.5 \ | ||||
|   networkx==2.0 \ | ||||
|   numpy==1.24.2 | ||||
|  | ||||
| # ONNXRuntime should be installed before installing | ||||
| @ -30,9 +30,10 @@ pip_install \ | ||||
|  | ||||
| pip_install coloredlogs packaging | ||||
|  | ||||
| pip_install onnxruntime==1.18.1 | ||||
| pip_install onnx==1.16.2 | ||||
| pip_install onnxscript==0.1.0.dev20240831 --no-deps | ||||
| pip_install onnxruntime==1.18 | ||||
| pip_install onnx==1.16.0 | ||||
| # pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps | ||||
| pip_install onnxscript==0.1.0.dev20240613 --no-deps | ||||
| # required by onnxscript | ||||
| pip_install ml_dtypes | ||||
|  | ||||
|  | ||||
| @ -12,7 +12,10 @@ conda_reinstall() { | ||||
|   as_jenkins conda install -q -n py_$ANACONDA_PYTHON_VERSION -y --force-reinstall $* | ||||
| } | ||||
|  | ||||
| if [ -n "${XPU_VERSION}" ]; then | ||||
| if [ -n "${ROCM_VERSION}" ]; then | ||||
|   TRITON_REPO="https://github.com/openai/triton" | ||||
|   TRITON_TEXT_FILE="triton-rocm" | ||||
| elif [ -n "${XPU_VERSION}" ]; then | ||||
|   TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" | ||||
|   TRITON_TEXT_FILE="triton-xpu" | ||||
| else | ||||
| @ -38,33 +41,19 @@ if [ -z "${MAX_JOBS}" ]; then | ||||
|     export MAX_JOBS=$(nproc) | ||||
| fi | ||||
|  | ||||
| # Git checkout triton | ||||
| mkdir /var/lib/jenkins/triton | ||||
| chown -R jenkins /var/lib/jenkins/triton | ||||
| chgrp -R jenkins /var/lib/jenkins/triton | ||||
| pushd /var/lib/jenkins/ | ||||
|  | ||||
| as_jenkins git clone ${TRITON_REPO} triton | ||||
| cd triton | ||||
| as_jenkins git checkout ${TRITON_PINNED_COMMIT} | ||||
| cd python | ||||
|  | ||||
| # TODO: remove patch setup.py once we have a proper fix for https://github.com/triton-lang/triton/issues/4527 | ||||
| as_jenkins sed -i -e 's/https:\/\/tritonlang.blob.core.windows.net\/llvm-builds/https:\/\/oaitriton.blob.core.windows.net\/public\/llvm-builds/g' setup.py | ||||
|  | ||||
| if [ -n "${UBUNTU_VERSION}" ] && [ -n "${GCC_VERSION}" ] && [[ "${GCC_VERSION}" == "7" ]]; then | ||||
|   # Triton needs at least gcc-9 to build | ||||
|   apt-get install -y g++-9 | ||||
|  | ||||
|   CXX=g++-9 pip_install -e . | ||||
|   CXX=g++-9 pip_install "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python" | ||||
| elif [ -n "${UBUNTU_VERSION}" ] && [ -n "${CLANG_VERSION}" ]; then | ||||
|   # Triton needs <filesystem> which surprisingly is not available with clang-9 toolchain | ||||
|   add-apt-repository -y ppa:ubuntu-toolchain-r/test | ||||
|   apt-get install -y g++-9 | ||||
|  | ||||
|   CXX=g++-9 pip_install -e . | ||||
|   CXX=g++-9 pip_install "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python" | ||||
| else | ||||
|   pip_install -e . | ||||
|   pip_install "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python" | ||||
| fi | ||||
|  | ||||
| if [ -n "${CONDA_CMAKE}" ]; then | ||||
|  | ||||
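
The `pip_install "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python"` form above replaces the manual clone-and-checkout path. A condensed sketch of the flow, with a hypothetical commit hash and plain `pip` standing in for the CI wrapper:

```bash
#!/bin/bash
TRITON_REPO="https://github.com/openai/triton"
TRITON_TEXT_FILE="triton-rocm"
# Each pin file under ci_commit_pins/ holds a single commit hash.
TRITON_PINNED_COMMIT=$(cat "ci_commit_pins/${TRITON_TEXT_FILE}.txt")
# pip clones the repo, checks out the pinned commit, and builds the
# python/ subdirectory in one step, with no manual git checkout.
pip install "git+${TRITON_REPO}@${TRITON_PINNED_COMMIT}#subdirectory=python"
```
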
| @ -16,11 +16,11 @@ function install_ubuntu() { | ||||
|  | ||||
|     apt-get update -y | ||||
|     apt-get install -y gpg-agent wget | ||||
|     # To add the online network package repository for the GPU Driver | ||||
|     # To add the online network package repository for the GPU Driver LTS releases | ||||
|     wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \ | ||||
|         | gpg --yes --dearmor --output /usr/share/keyrings/intel-graphics.gpg | ||||
|     echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \ | ||||
|         https://repositories.intel.com/gpu/ubuntu ${VERSION_CODENAME}${XPU_DRIVER_VERSION} unified" \ | ||||
|         https://repositories.intel.com/gpu/ubuntu ${VERSION_CODENAME}/lts/2350 unified" \ | ||||
|         | tee /etc/apt/sources.list.d/intel-gpu-${VERSION_CODENAME}.list | ||||
|     # To add the online network package repository for the Intel Support Packages | ||||
|     wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \ | ||||
| @ -45,9 +45,9 @@ function install_ubuntu() { | ||||
|     apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev | ||||
|     # Install Intel Support Packages | ||||
|     if [ -n "$XPU_VERSION" ]; then | ||||
|         apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION} intel-pti-dev | ||||
|         apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION} | ||||
|     else | ||||
|         apt-get install -y intel-for-pytorch-gpu-dev intel-pti-dev | ||||
|         apt-get install -y intel-for-pytorch-gpu-dev | ||||
|     fi | ||||
|  | ||||
|     # Cleanup | ||||
| @ -55,6 +55,52 @@ function install_ubuntu() { | ||||
|     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* | ||||
| } | ||||
|  | ||||
| function install_centos() { | ||||
|     dnf install -y 'dnf-command(config-manager)' | ||||
|     dnf config-manager --add-repo \ | ||||
|         https://repositories.intel.com/gpu/rhel/8.6/production/2328/unified/intel-gpu-8.6.repo | ||||
|     # To add the EPEL repository needed for DKMS | ||||
|     dnf -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm | ||||
|         # https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm | ||||
|  | ||||
|     # Create the YUM repository file in the /tmp directory as a normal user | ||||
|     tee > /tmp/oneAPI.repo << EOF | ||||
| [oneAPI] | ||||
| name=Intel® oneAPI repository | ||||
| baseurl=https://yum.repos.intel.com/oneapi | ||||
| enabled=1 | ||||
| gpgcheck=1 | ||||
| repo_gpgcheck=1 | ||||
| gpgkey=https://yum.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | ||||
| EOF | ||||
|  | ||||
|     # Move the newly created oneAPI.repo file to the YUM configuration directory /etc/yum.repos.d | ||||
|     mv /tmp/oneAPI.repo /etc/yum.repos.d | ||||
|  | ||||
|     # The xpu-smi packages | ||||
|     dnf install -y flex bison xpu-smi | ||||
|     # Compute and Media Runtimes | ||||
|     dnf install -y \ | ||||
|         intel-opencl intel-media intel-mediasdk libmfxgen1 libvpl2\ | ||||
|         level-zero intel-level-zero-gpu mesa-dri-drivers mesa-vulkan-drivers \ | ||||
|         mesa-vdpau-drivers libdrm mesa-libEGL mesa-libgbm mesa-libGL \ | ||||
|         mesa-libxatracker libvpl-tools intel-metrics-discovery \ | ||||
|         intel-metrics-library intel-igc-core intel-igc-cm \ | ||||
|         libva libva-utils intel-gmmlib libmetee intel-gsc intel-ocloc hwinfo clinfo | ||||
|     # Development packages | ||||
|     dnf install -y --refresh \ | ||||
|         intel-igc-opencl-devel level-zero-devel intel-gsc-devel libmetee-devel \ | ||||
|         level-zero-devel | ||||
|     # Install Intel® oneAPI Base Toolkit | ||||
|     dnf install intel-basekit -y | ||||
|  | ||||
|     # Cleanup | ||||
|     dnf clean all | ||||
|     rm -rf /var/cache/yum | ||||
|     rm -rf /var/lib/yum/yumdb | ||||
|     rm -rf /var/lib/yum/history | ||||
| } | ||||
|  | ||||
| function install_rhel() { | ||||
|     . /etc/os-release | ||||
|     if [[ "${ID}" == "rhel" ]]; then | ||||
| @ -68,9 +114,9 @@ function install_rhel() { | ||||
|     fi | ||||
|  | ||||
|     dnf install -y 'dnf-command(config-manager)' | ||||
|     # To add the online network package repository for the GPU Driver | ||||
|     # To add the online network package repository for the GPU Driver LTS releases | ||||
|     dnf config-manager --add-repo \ | ||||
|         https://repositories.intel.com/gpu/rhel/${VERSION_ID}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_ID}.repo | ||||
|         https://repositories.intel.com/gpu/rhel/${VERSION_ID}/lts/2350/unified/intel-gpu-${VERSION_ID}.repo | ||||
|     # To add the online network package repository for the Intel Support Packages | ||||
|     tee > /etc/yum.repos.d/intel-for-pytorch-gpu-dev.repo << EOF | ||||
| [intel-for-pytorch-gpu-dev] | ||||
| @ -85,7 +131,7 @@ EOF | ||||
|     # The xpu-smi packages | ||||
|     dnf install -y xpu-smi | ||||
|     # Compute and Media Runtimes | ||||
|     dnf install --skip-broken -y \ | ||||
|     dnf install -y \ | ||||
|         intel-opencl intel-media intel-mediasdk libmfxgen1 libvpl2\ | ||||
|         level-zero intel-level-zero-gpu mesa-dri-drivers mesa-vulkan-drivers \ | ||||
|         mesa-vdpau-drivers libdrm mesa-libEGL mesa-libgbm mesa-libGL \ | ||||
| @ -114,9 +160,9 @@ function install_sles() { | ||||
|         exit | ||||
|     fi | ||||
|  | ||||
|     # To add the online network package repository for the GPU Driver | ||||
|     # To add the online network package repository for the GPU Driver LTS releases | ||||
|     zypper addrepo -f -r \ | ||||
|         https://repositories.intel.com/gpu/sles/${VERSION_SP}${XPU_DRIVER_VERSION}/unified/intel-gpu-${VERSION_SP}.repo | ||||
|         https://repositories.intel.com/gpu/sles/${VERSION_SP}/lts/2350/unified/intel-gpu-${VERSION_SP}.repo | ||||
|     rpm --import https://repositories.intel.com/gpu/intel-graphics.key | ||||
|     # To add the online network package repository for the Intel Support Packages | ||||
|     zypper addrepo https://yum.repos.intel.com/intel-for-pytorch-gpu-dev intel-for-pytorch-gpu-dev | ||||
| @ -135,12 +181,6 @@ function install_sles() { | ||||
|  | ||||
| } | ||||
|  | ||||
| # Default use GPU driver LTS releases | ||||
| XPU_DRIVER_VERSION="/lts/2350" | ||||
| if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then | ||||
|     # Use GPU driver rolling releases | ||||
|     XPU_DRIVER_VERSION="" | ||||
| fi | ||||
|  | ||||
| # The installation depends on the base OS | ||||
| ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') | ||||
| @ -148,6 +188,9 @@ case "$ID" in | ||||
|     ubuntu) | ||||
|         install_ubuntu | ||||
|     ;; | ||||
|     centos) | ||||
|         install_centos | ||||
|     ;; | ||||
|     rhel|almalinux) | ||||
|         install_rhel | ||||
|     ;; | ||||
|  | ||||
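
The default block removed above keys off `"${XPU_DRIVER_TYPE,,}"`, bash 4's lowercasing expansion. A standalone sketch of that pattern:

```bash
#!/bin/bash
XPU_DRIVER_TYPE="Rolling"
XPU_DRIVER_VERSION="/lts/2350"     # default: LTS driver repo path
# "${VAR,,}" expands to the value lowercased, so ROLLING, Rolling and
# rolling all match without a case statement.
if [[ "${XPU_DRIVER_TYPE,,}" == "rolling" ]]; then
    XPU_DRIVER_VERSION=""          # rolling releases sit at the repo root
fi
echo "driver repo suffix: '${XPU_DRIVER_VERSION}'"
```
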
| @ -21,8 +21,9 @@ RUN sed -i s/^#.*baseurl=http/baseurl=http/g /etc/yum.repos.d/*.repo | ||||
| RUN sed -i s/^mirrorlist=http/#mirrorlist=http/g /etc/yum.repos.d/*.repo | ||||
| RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_VERSION}-gcc-c++ devtoolset-${DEVTOOLSET_VERSION}-gcc-gfortran devtoolset-${DEVTOOLSET_VERSION}-binutils | ||||
| # EPEL for cmake | ||||
| RUN yum --enablerepo=extras install -y epel-release | ||||
|  | ||||
| RUN wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm && \ | ||||
|     rpm -ivh epel-release-latest-7.noarch.rpm && \ | ||||
|     rm -f epel-release-latest-7.noarch.rpm | ||||
| # cmake | ||||
| RUN yum install -y cmake3 && \ | ||||
|     ln -s /usr/bin/cmake3 /usr/bin/cmake | ||||
|  | ||||
| @ -37,12 +37,6 @@ esac | ||||
|  | ||||
| ( | ||||
|   set -x | ||||
|   # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712 | ||||
|   # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023. | ||||
|   sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service | ||||
|   sudo systemctl daemon-reload | ||||
|   sudo systemctl restart docker | ||||
|  | ||||
|   docker build \ | ||||
|     --target final \ | ||||
|     --progress plain \ | ||||
|  | ||||
| @ -89,7 +89,7 @@ RUN bash ./install_rocm_magma.sh && rm install_rocm_magma.sh | ||||
|  | ||||
| # Install AOTriton | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ./aotriton_version.txt aotriton_version.txt | ||||
| COPY ./common/aotriton_version.txt aotriton_version.txt | ||||
| COPY ./common/install_aotriton.sh install_aotriton.sh | ||||
| RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt | ||||
| ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton | ||||
|  | ||||
| @ -29,7 +29,7 @@ RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/re | ||||
|  | ||||
| # Install cuda and cudnn | ||||
| ARG CUDA_VERSION | ||||
| COPY ./common/install_cuda.sh install_cuda.sh | ||||
| RUN wget -q https://raw.githubusercontent.com/pytorch/builder/main/common/install_cuda.sh -O install_cuda.sh | ||||
| RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh | ||||
| ENV DESIRED_CUDA ${CUDA_VERSION} | ||||
| ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH | ||||
|  | ||||
| @ -10,7 +10,6 @@ ENV LANG en_US.UTF-8 | ||||
| ENV LANGUAGE en_US.UTF-8 | ||||
|  | ||||
| ARG DEVTOOLSET_VERSION=9 | ||||
|  | ||||
| # Note: This patch is required since CentOS has reached EOL | ||||
| # otherwise any yum install step will fail | ||||
| RUN sed -i s/mirror.centos.org/vault.centos.org/g /etc/yum.repos.d/*.repo | ||||
| @ -30,7 +29,9 @@ RUN yum install -y devtoolset-${DEVTOOLSET_VERSION}-gcc devtoolset-${DEVTOOLSET_ | ||||
| ENV PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/bin:$PATH | ||||
| ENV LD_LIBRARY_PATH=/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/opt/rh/devtoolset-${DEVTOOLSET_VERSION}/root/usr/lib:$LD_LIBRARY_PATH | ||||
|  | ||||
| RUN yum --enablerepo=extras install -y epel-release | ||||
| RUN wget http://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm && \ | ||||
|     rpm -ivh epel-release-latest-7.noarch.rpm && \ | ||||
|     rm -f epel-release-latest-7.noarch.rpm | ||||
|  | ||||
| # cmake-3.18.4 from pip | ||||
| RUN yum install -y python3-pip && \ | ||||
| @ -116,8 +117,7 @@ RUN yum install -y \ | ||||
|         yasm | ||||
| RUN yum install -y \ | ||||
|     https://repo.ius.io/ius-release-el7.rpm \ | ||||
|     https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm | ||||
|  | ||||
|     https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm | ||||
| RUN yum swap -y git git236-core | ||||
| # git236+ would refuse to run git commands in repos owned by other users | ||||
| # Which causes version check to fail, as pytorch repo is bind-mounted into the image | ||||
| @ -197,7 +197,7 @@ RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh | ||||
|  | ||||
| # Install AOTriton | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ./aotriton_version.txt aotriton_version.txt | ||||
| COPY ./common/aotriton_version.txt aotriton_version.txt | ||||
| COPY ./common/install_aotriton.sh install_aotriton.sh | ||||
| RUN bash ./install_aotriton.sh /opt/rocm && rm install_aotriton.sh aotriton_version.txt | ||||
| ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton | ||||
|  | ||||
| @ -93,8 +93,7 @@ RUN yum install -y \ | ||||
|         yasm | ||||
| RUN yum install -y \ | ||||
|     https://repo.ius.io/ius-release-el7.rpm \ | ||||
|     https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm | ||||
|  | ||||
|     https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm | ||||
| RUN yum swap -y git git236-core | ||||
| # git236+ would refuse to run git commands in repos owned by other users | ||||
| # Which causes version check to fail, as pytorch repo is bind-mounted into the image | ||||
|  | ||||
| @ -87,10 +87,10 @@ RUN yum install -y \ | ||||
|         xz \ | ||||
|         gcc-toolset-${DEVTOOLSET_VERSION}-toolchain \ | ||||
|         glibc-langpack-en | ||||
|  | ||||
| RUN yum install -y \ | ||||
|     https://repo.ius.io/ius-release-el7.rpm \ | ||||
|     https://ossci-linux.s3.amazonaws.com/epel-release-7-14.noarch.rpm | ||||
|  | ||||
|     https://dl.fedoraproject.org/pub/epel/epel-release-latest-7.noarch.rpm | ||||
| RUN yum swap -y git git236-core | ||||
| # git236+ would refuse to run git commands in repos owned by other users | ||||
| # Which causes version check to fail, as pytorch repo is bind-mounted into the image | ||||
| @ -145,13 +145,9 @@ ADD ./common/install_miopen.sh install_miopen.sh | ||||
| RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh | ||||
|  | ||||
| FROM cpu_final as xpu_final | ||||
| # XPU CD use rolling driver | ||||
| ENV XPU_DRIVER_TYPE ROLLING | ||||
| # cmake-3.28.4 from pip | ||||
| RUN python3 -m pip install --upgrade pip && \ | ||||
|     python3 -mpip install cmake==3.28.4 | ||||
| # Install setuptools and wheel for python 3.13 | ||||
| RUN /opt/python/cp313-cp313/bin/python -m pip install setuptools wheel | ||||
| ADD ./common/install_xpu.sh install_xpu.sh | ||||
| RUN bash ./install_xpu.sh && rm install_xpu.sh | ||||
| RUN pushd /opt/_internal && tar -xJf static-libs-for-embedding-only.tar.xz && popd | ||||
|  | ||||
| @ -75,17 +75,17 @@ ARG BASE_CUDA_VERSION | ||||
| ADD ./common/install_magma.sh install_magma.sh | ||||
| RUN bash ./install_magma.sh ${BASE_CUDA_VERSION} && rm install_magma.sh | ||||
|  | ||||
| FROM base as nvpl | ||||
| # Install nvpl | ||||
| ADD ./common/install_nvpl.sh install_nvpl.sh | ||||
| RUN bash ./install_nvpl.sh && rm install_nvpl.sh | ||||
| FROM base as openblas | ||||
| # Install openblas | ||||
| ADD ./common/install_openblas.sh install_openblas.sh | ||||
| RUN bash ./install_openblas.sh && rm install_openblas.sh | ||||
|  | ||||
| FROM final as cuda_final | ||||
| ARG BASE_CUDA_VERSION | ||||
| RUN rm -rf /usr/local/cuda-${BASE_CUDA_VERSION} | ||||
| COPY --from=cuda     /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION} | ||||
| COPY --from=magma    /usr/local/cuda-${BASE_CUDA_VERSION}  /usr/local/cuda-${BASE_CUDA_VERSION} | ||||
| COPY --from=nvpl /opt/nvpl/lib/  /usr/local/lib/ | ||||
| COPY --from=nvpl /opt/nvpl/include/  /usr/local/include/ | ||||
| COPY --from=openblas     /opt/OpenBLAS/  /opt/OpenBLAS/ | ||||
| RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda | ||||
| ENV PATH=/usr/local/cuda/bin:$PATH | ||||
| ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH | ||||
|  | ||||
| @ -124,14 +124,7 @@ if [[ -n ${MANY_LINUX_VERSION} && -z ${DOCKERFILE_SUFFIX} ]]; then | ||||
| fi | ||||
| ( | ||||
|     set -x | ||||
|  | ||||
|     # TODO: Remove LimitNOFILE=1048576 patch once https://github.com/pytorch/test-infra/issues/5712 | ||||
|     # is resolved. This patch is required in order to fix timing out of Docker build on Amazon Linux 2023. | ||||
|     sudo sed -i s/LimitNOFILE=infinity/LimitNOFILE=1048576/ /usr/lib/systemd/system/docker.service | ||||
|     sudo systemctl daemon-reload | ||||
|     sudo systemctl restart docker | ||||
|  | ||||
|     DOCKER_BUILDKIT=1 docker build  \ | ||||
|     DOCKER_BUILDKIT=1 docker build \ | ||||
|         ${DOCKER_GPU_BUILD_ARG} \ | ||||
|         --build-arg "GPU_IMAGE=${GPU_IMAGE}" \ | ||||
|         --target "${TARGET}" \ | ||||
|  | ||||
| @ -30,14 +30,9 @@ dill==0.3.7 | ||||
| #Pinned versions: 0.3.7 | ||||
| #test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py | ||||
|  | ||||
| expecttest==0.2.1 | ||||
| expecttest==0.1.6 | ||||
| #Description: method for writing tests where test framework auto populates | ||||
| # the expected output based on previous runs | ||||
| #Pinned versions: 0.2.1 | ||||
| #test that import: | ||||
|  | ||||
| fbscribelogger==0.1.6 | ||||
| #Description: write to scribe from authenticated jobs on CI | ||||
| #Pinned versions: 0.1.6 | ||||
| #test that import: | ||||
|  | ||||
| @ -90,7 +85,7 @@ librosa>=0.6.2 ; python_version < "3.11" | ||||
| #Pinned versions: | ||||
| #test that import: | ||||
|  | ||||
| mypy==1.11.2 | ||||
| mypy==1.10.0 | ||||
| # Pin MyPy version because new errors are likely to appear with each release | ||||
| #Description: linter | ||||
| #Pinned versions: 1.10.0 | ||||
| @ -109,7 +104,7 @@ networkx==2.8.8 | ||||
| #test that import: run_test.py, test_cpp_extensions_aot.py,test_determination.py | ||||
|  | ||||
| numba==0.49.0 ; python_version < "3.9" | ||||
| numba==0.55.2 ; python_version == "3.9" | ||||
| numba==0.54.1 ; python_version == "3.9" | ||||
| numba==0.55.2 ; python_version == "3.10" | ||||
| #Description: Just-In-Time Compiler for Numerical Functions | ||||
| #Pinned versions: 0.54.1, 0.49.0, <=0.49.1 | ||||
| @ -223,7 +218,7 @@ pygments==2.15.0 | ||||
| #test that import: | ||||
|  | ||||
| scikit-image==0.19.3 ; python_version < "3.10" | ||||
| scikit-image==0.22.0 ; python_version >= "3.10" | ||||
| scikit-image==0.20.0 ; python_version >= "3.10" | ||||
| #Description: image processing routines | ||||
| #Pinned versions: | ||||
| #test that import: test_nn.py | ||||
| @ -274,10 +269,6 @@ lintrunner==0.12.5 | ||||
| #Pinned versions: 0.12.5 | ||||
| #test that import: | ||||
|  | ||||
| redis>=4.0.0 | ||||
| #Description: redis database | ||||
| #test that import: anything that tests OSS caching/mocking (inductor/test_codecache.py, inductor/test_max_autotune.py) | ||||
|  | ||||
| rockset==1.0.3 | ||||
| #Description: queries Rockset | ||||
| #Pinned versions: 1.0.3 | ||||
| @ -321,24 +312,3 @@ lxml==5.0.0 | ||||
| # Python-3.9 binaries | ||||
|  | ||||
| PyGithub==2.3.0 | ||||
|  | ||||
| sympy==1.12.1 ; python_version == "3.8" | ||||
| sympy==1.13.1 ; python_version >= "3.9" | ||||
| #Description: Required by coremltools, also pinned in .github/requirements/pip-requirements-macOS.txt | ||||
| #Pinned versions: | ||||
| #test that import: | ||||
|  | ||||
| onnx==1.16.1 | ||||
| #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal | ||||
| #Pinned versions: | ||||
| #test that import: | ||||
|  | ||||
| onnxscript==0.1.0.dev20240817 | ||||
| #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal | ||||
| #Pinned versions: | ||||
| #test that import: | ||||
|  | ||||
| parameterized==0.8.1 | ||||
| #Description: Parameterizes unittests, both the tests themselves and the entire testing class | ||||
| #Pinned versions: | ||||
| #test that import: | ||||
|  | ||||
| @ -1 +1 @@ | ||||
| 3.1.0 | ||||
| 3.0.0 | ||||
|  | ||||
| @ -156,12 +156,6 @@ COPY ./common/install_cusparselt.sh install_cusparselt.sh | ||||
| RUN bash install_cusparselt.sh | ||||
| RUN rm install_cusparselt.sh | ||||
|  | ||||
| # Install CUDSS | ||||
| ARG CUDA_VERSION | ||||
| COPY ./common/install_cudss.sh install_cudss.sh | ||||
| RUN bash install_cudss.sh | ||||
| RUN rm install_cudss.sh | ||||
|  | ||||
| # Delete /usr/local/cuda-11.X/cuda-11.X symlinks | ||||
| RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi | ||||
| RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi | ||||
|  | ||||
| @ -68,8 +68,6 @@ RUN rm install_rocm.sh | ||||
| COPY ./common/install_rocm_magma.sh install_rocm_magma.sh | ||||
| RUN bash ./install_rocm_magma.sh | ||||
| RUN rm install_rocm_magma.sh | ||||
| ADD ./common/install_miopen.sh install_miopen.sh | ||||
| RUN bash ./install_miopen.sh ${ROCM_VERSION} && rm install_miopen.sh | ||||
| ENV ROCM_PATH /opt/rocm | ||||
| ENV PATH /opt/rocm/bin:$PATH | ||||
| ENV PATH /opt/rocm/hcc/bin:$PATH | ||||
| @ -102,10 +100,10 @@ ARG TRITON | ||||
| # try to reach out to S3, which docker build runners don't have access | ||||
| COPY ./common/install_triton.sh install_triton.sh | ||||
| COPY ./common/common_utils.sh common_utils.sh | ||||
| COPY ci_commit_pins/triton.txt triton.txt | ||||
| COPY ci_commit_pins/triton-rocm.txt triton-rocm.txt | ||||
| COPY triton_version.txt triton_version.txt | ||||
| RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi | ||||
| RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt | ||||
| RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt | ||||
|  | ||||
| # Install AOTriton | ||||
| COPY ./aotriton_version.txt aotriton_version.txt | ||||
| @ -123,8 +121,5 @@ RUN bash ./install_cache.sh && rm install_cache.sh | ||||
| ARG BUILD_ENVIRONMENT | ||||
| ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT} | ||||
|  | ||||
| # Install LLVM dev version (Defined in the pytorch/builder github repository) | ||||
| COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm | ||||
|  | ||||
| USER jenkins | ||||
| CMD ["bash"] | ||||
|  | ||||
| @ -30,7 +30,6 @@ RUN bash ./install_docs_reqs.sh && rm install_docs_reqs.sh | ||||
| ARG ANACONDA_PYTHON_VERSION | ||||
| ARG CONDA_CMAKE | ||||
| ARG DOCS | ||||
| ARG BUILD_ENVIRONMENT | ||||
| ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION | ||||
| ENV PATH /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/bin:/opt/conda/bin:$PATH | ||||
| ENV DOCS=$DOCS | ||||
|  | ||||
| @ -50,7 +50,7 @@ RUN  bash ./install_lcov.sh && rm install_lcov.sh | ||||
|  | ||||
| # Install cuda and cudnn | ||||
| ARG CUDA_VERSION | ||||
| COPY ./common/install_cuda.sh install_cuda.sh | ||||
| RUN wget -q https://raw.githubusercontent.com/pytorch/builder/main/common/install_cuda.sh -O install_cuda.sh | ||||
| RUN bash ./install_cuda.sh ${CUDA_VERSION} && rm install_cuda.sh | ||||
| ENV DESIRED_CUDA ${CUDA_VERSION} | ||||
| ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH | ||||
|  | ||||
| @ -49,8 +49,13 @@ if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then | ||||
| fi | ||||
|  | ||||
| # Enable LLVM dependency for TensorExpr testing | ||||
| export USE_LLVM=/opt/llvm | ||||
| export LLVM_DIR=/opt/llvm/lib/cmake/llvm | ||||
| if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then | ||||
|   export USE_LLVM=/opt/rocm/llvm | ||||
|   export LLVM_DIR=/opt/rocm/llvm/lib/cmake/llvm | ||||
| else | ||||
|   export USE_LLVM=/opt/llvm | ||||
|   export LLVM_DIR=/opt/llvm/lib/cmake/llvm | ||||
| fi | ||||
|  | ||||
| if [[ "$BUILD_ENVIRONMENT" == *executorch* ]]; then | ||||
|   # To build test_edge_op_registration | ||||
| @ -171,8 +176,7 @@ fi | ||||
| if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then | ||||
|   # shellcheck disable=SC1091 | ||||
|   source /opt/intel/oneapi/compiler/latest/env/vars.sh | ||||
|   # XPU kineto feature dependencies are not fully ready, disable kineto build as temp WA | ||||
|   export USE_KINETO=0 | ||||
|   export USE_XPU=1 | ||||
| fi | ||||
|  | ||||
| # sccache will fail for CUDA builds if all cores are used for compiling | ||||
| @ -232,7 +236,7 @@ fi | ||||
|  | ||||
| # Do not change workspace permissions for ROCm CI jobs | ||||
| # as it can leave workspace with bad permissions for cancelled jobs | ||||
| if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* ]]; then | ||||
| if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then | ||||
|   # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96) | ||||
|   WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace") | ||||
|   cleanup_workspace() { | ||||
| @ -278,11 +282,11 @@ else | ||||
|     # set only when building other architectures | ||||
|     # or building non-XLA tests. | ||||
|     if [[ "$BUILD_ENVIRONMENT" != *rocm*  && | ||||
|           "$BUILD_ENVIRONMENT" != *s390x*   && | ||||
|           "$BUILD_ENVIRONMENT" != *xla* ]]; then | ||||
|       if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then | ||||
|         # Install numpy-2.0.2 for builds which are backward compatible with 1.X | ||||
|         python -mpip install --pre numpy==2.0.2 | ||||
|         # Install numpy-2.0 release candidate for builds | ||||
|         # Which should be backward compatible with Numpy-1.X | ||||
|         python -mpip install --pre numpy==2.0.0rc1 | ||||
|       fi | ||||
|  | ||||
|       WERROR=1 python setup.py clean | ||||
| @ -341,11 +345,11 @@ else | ||||
|     CUSTOM_OP_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/custom-op-build" | ||||
|     CUSTOM_OP_TEST="$PWD/test/custom_operator" | ||||
|     python --version | ||||
|     SITE_PACKAGES="$(python -c 'import site; print(";".join([x for x in site.getsitepackages()] + [x + "/torch" for x in site.getsitepackages()]))')" | ||||
|     SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" | ||||
|  | ||||
|     mkdir -p "$CUSTOM_OP_BUILD" | ||||
|     pushd "$CUSTOM_OP_BUILD" | ||||
|     cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \ | ||||
|     cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \ | ||||
|           -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM" | ||||
|     make VERBOSE=1 | ||||
|     popd | ||||
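
The two `SITE_PACKAGES` one-liners above are alternative ways to locate site-packages for `CMAKE_PREFIX_PATH`: one queries the long-deprecated `distutils.sysconfig`, the other assembles a semicolon-separated list from `site.getsitepackages()`. A sketch of the site-based form (output paths vary by interpreter):

```bash
#!/bin/bash
python - <<'PY'
import site

# CMake treats ";" as a list separator, so each site-packages directory
# and its torch/ subdirectory become separate CMAKE_PREFIX_PATH entries,
# letting find_package(Torch) locate TorchConfig.cmake.
paths = site.getsitepackages()
print(";".join(paths + [p + "/torch" for p in paths]))
PY
```
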
| @ -355,10 +359,10 @@ else | ||||
|     JIT_HOOK_BUILD="${CUSTOM_TEST_ARTIFACT_BUILD_DIR}/jit-hook-build" | ||||
|     JIT_HOOK_TEST="$PWD/test/jit_hooks" | ||||
|     python --version | ||||
|     SITE_PACKAGES="$(python -c 'import site; print(";".join([x for x in site.getsitepackages()] + [x + "/torch" for x in site.getsitepackages()]))')" | ||||
|     SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')" | ||||
|     mkdir -p "$JIT_HOOK_BUILD" | ||||
|     pushd "$JIT_HOOK_BUILD" | ||||
|     cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \ | ||||
|     cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \ | ||||
|           -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM" | ||||
|     make VERBOSE=1 | ||||
|     popd | ||||
| @ -370,7 +374,7 @@ else | ||||
|     python --version | ||||
|     mkdir -p "$CUSTOM_BACKEND_BUILD" | ||||
|     pushd "$CUSTOM_BACKEND_BUILD" | ||||
|     cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \ | ||||
|     cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \ | ||||
|           -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM" | ||||
|     make VERBOSE=1 | ||||
|     popd | ||||
| @ -403,6 +407,6 @@ fi | ||||
|  | ||||
| # snadampal: skipping it till sccache support added for aarch64 | ||||
| # https://github.com/pytorch/pytorch/issues/121559 | ||||
| if [[ "$BUILD_ENVIRONMENT" != *aarch64* &&  "$BUILD_ENVIRONMENT" != *s390x* ]]; then | ||||
| if [[ "$BUILD_ENVIRONMENT" != *aarch64* ]]; then | ||||
|   print_sccache_stats | ||||
| fi | ||||
|  | ||||
| @ -179,7 +179,7 @@ function install_torchvision() { | ||||
| } | ||||
|  | ||||
| function install_tlparse() { | ||||
|   pip_install --user "tlparse==0.3.25" | ||||
|   pip_install --user "tlparse==0.3.7" | ||||
|   PATH="$(python -m site --user-base)/bin:$PATH" | ||||
| } | ||||
|  | ||||
|  | ||||
| @ -1,4 +1,4 @@ | ||||
| from datetime import datetime, timedelta, timezone | ||||
| from datetime import datetime, timedelta | ||||
| from tempfile import mkdtemp | ||||
|  | ||||
| from cryptography import x509 | ||||
| @ -42,10 +42,10 @@ def create_cert(path, C, ST, L, O, key): | ||||
|         .issuer_name(issuer) | ||||
|         .public_key(key.public_key()) | ||||
|         .serial_number(x509.random_serial_number()) | ||||
|         .not_valid_before(datetime.now(timezone.utc)) | ||||
|         .not_valid_before(datetime.utcnow()) | ||||
|         .not_valid_after( | ||||
|             # Our certificate will be valid for 10 days | ||||
|             datetime.now(timezone.utc) | ||||
|             datetime.utcnow() | ||||
|             + timedelta(days=10) | ||||
|         ) | ||||
|         .add_extension( | ||||
| @ -88,10 +88,10 @@ def sign_certificate_request(path, csr_cert, ca_cert, private_ca_key): | ||||
|         .issuer_name(ca_cert.subject) | ||||
|         .public_key(csr_cert.public_key()) | ||||
|         .serial_number(x509.random_serial_number()) | ||||
|         .not_valid_before(datetime.now(timezone.utc)) | ||||
|         .not_valid_before(datetime.utcnow()) | ||||
|         .not_valid_after( | ||||
|             # Our certificate will be valid for 10 days | ||||
|             datetime.now(timezone.utc) | ||||
|             datetime.utcnow() | ||||
|             + timedelta(days=10) | ||||
|             # Sign our certificate with our private key | ||||
|         ) | ||||
|  | ||||
| @ -9,13 +9,15 @@ if [[ -n "$CONDA_ENV" ]]; then | ||||
|   export PATH="$CONDA_ENV/bin":$PATH | ||||
| fi | ||||
|  | ||||
| # Test that OpenMP is enabled | ||||
| pushd test | ||||
| if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available()))") == "1" ]]; then | ||||
|   echo "Build should have OpenMP enabled, but torch.backends.openmp.is_available() is False" | ||||
|   exit 1 | ||||
| # Test that OpenMP is enabled for non-arm64 builds | ||||
| if [[ ${BUILD_ENVIRONMENT} != *arm64* ]]; then | ||||
|   pushd test | ||||
|   if [[ ! $(python -c "import torch; print(int(torch.backends.openmp.is_available()))") == "1" ]]; then | ||||
|     echo "Build should have OpenMP enabled, but torch.backends.openmp.is_available() is False" | ||||
|     exit 1 | ||||
|   fi | ||||
|   popd | ||||
| fi | ||||
| popd | ||||
|  | ||||
| setup_test_python() { | ||||
|   # The CircleCI worker hostname doesn't resolve to an address. | ||||
| @ -25,9 +27,8 @@ setup_test_python() { | ||||
|   echo "Ninja version: $(ninja --version)" | ||||
|   echo "Python version: $(which python) ($(python --version))" | ||||
|  | ||||
|   # Set the limit on open file handles to 16384 | ||||
|   # might help with intermittent compiler test failures | ||||
|   ulimit -n 16384 | ||||
|   # Increase default limit on open file handles from 256 to 1024 | ||||
|   ulimit -n 1024 | ||||
| } | ||||
|  | ||||
| test_python_all() { | ||||
|  | ||||
| @ -44,15 +44,19 @@ time python test/run_test.py --verbose -i distributed/_tensor/test_dtensor_compi | ||||
| time python test/run_test.py --verbose -i distributed/test_device_mesh | ||||
|  | ||||
| # DTensor/TP tests | ||||
| time python test/run_test.py --verbose -i distributed/tensor/parallel/test_ddp_2d_parallel | ||||
| time python test/run_test.py --verbose -i distributed/tensor/parallel/test_fsdp_2d_parallel | ||||
| time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_examples | ||||
| time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state | ||||
|  | ||||
| # FSDP2 tests | ||||
| time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh | ||||
|  | ||||
| # Pipelining composability tests | ||||
| time python test/run_test.py --verbose -i distributed/pipelining/test_composability.py | ||||
|  | ||||
| # ND composability tests | ||||
| time python test/run_test.py --verbose -i distributed/_composable/test_composability/test_2d_composability | ||||
| time python test/run_test.py --verbose -i distributed/_composable/test_composability/test_pp_composability | ||||
|  | ||||
| # Other tests | ||||
| time python test/run_test.py --verbose -i test_cuda_primary_ctx | ||||
|  | ||||
| @ -6,9 +6,6 @@ | ||||
|  | ||||
| set -ex | ||||
|  | ||||
| # Suppress ANSI color escape sequences | ||||
| export TERM=vt100 | ||||
|  | ||||
| # shellcheck source=./common.sh | ||||
| source "$(dirname "${BASH_SOURCE[0]}")/common.sh" | ||||
|  | ||||
| @ -169,7 +166,7 @@ fi | ||||
|  | ||||
| if [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then | ||||
|   # Source the Intel oneAPI environment script to enable XPU runtime-related libraries | ||||
|   # refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-5.html | ||||
|   # refer to https://www.intel.com/content/www/us/en/docs/oneapi/programming-guide/2024-0/use-the-setvars-and-oneapi-vars-scripts-with-linux.html | ||||
|   # shellcheck disable=SC1091 | ||||
|   source /opt/intel/oneapi/compiler/latest/env/vars.sh | ||||
|   # Check XPU status before testing | ||||
| @ -319,7 +316,7 @@ test_inductor_distributed() { | ||||
|   python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose | ||||
|   python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose | ||||
|   python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose | ||||
|   python test/run_test.py -i distributed/tensor/parallel/test_micro_pipeline_tp.py --verbose | ||||
|   python test/run_test.py -i distributed/tensor/parallel/test_fsdp_2d_parallel.py --verbose | ||||
|   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose | ||||
|   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose | ||||
|   python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose | ||||
| @ -361,12 +358,10 @@ test_inductor_shard() { | ||||
| test_inductor_aoti() { | ||||
|   # docker build uses bdist_wheel which does not work with test_aot_inductor | ||||
|   # TODO: need a faster way to build | ||||
|   if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then | ||||
|     # We need to hipify before building again | ||||
|     python3 tools/amd_build/build_amd.py | ||||
|   if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then | ||||
|     BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop | ||||
|     CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference | ||||
|   fi | ||||
|   BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop | ||||
|   CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference | ||||
| } | ||||
|  | ||||
| test_inductor_cpp_wrapper_abi_compatible() { | ||||
| @ -375,8 +370,9 @@ test_inductor_cpp_wrapper_abi_compatible() { | ||||
|   mkdir -p "$TEST_REPORTS_DIR" | ||||
|  | ||||
|   echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1" | ||||
|   # CPU stack allocation causes a segfault and needs more investigation | ||||
|   PYTORCH_TESTING_DEVICE_ONLY_FOR="" python test/run_test.py --include inductor/test_cpu_cpp_wrapper | ||||
|   python test/run_test.py --include inductor/test_cuda_cpp_wrapper inductor/test_cpu_repro | ||||
|   python test/run_test.py --include inductor/test_cuda_cpp_wrapper | ||||
|  | ||||
|   TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \ | ||||
|     --training --inductor --disable-cudagraphs --only vit_base_patch16_224 \ | ||||
| @ -394,22 +390,7 @@ test_inductor_cpp_wrapper_abi_compatible() { | ||||
| # .github/workflows/inductor-perf-test-nightly.yml | ||||
| DYNAMO_BENCHMARK_FLAGS=() | ||||
|  | ||||
| pr_time_benchmarks() { | ||||
|  | ||||
|   pip_install --user "fbscribelogger" | ||||
|  | ||||
|   TEST_REPORTS_DIR=$(pwd)/test/test-reports | ||||
|   mkdir -p "$TEST_REPORTS_DIR" | ||||
|   PYTHONPATH=$(pwd)/benchmarks/dynamo/pr_time_benchmarks source benchmarks/dynamo/pr_time_benchmarks/benchmark_runner.sh "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" "benchmarks/dynamo/pr_time_benchmarks/benchmarks" | ||||
|   echo "benchmark results on current PR: " | ||||
|   cat  "$TEST_REPORTS_DIR/pr_time_benchmarks_results.csv" | ||||
|  | ||||
| } | ||||
|  | ||||
| if [[ "${TEST_CONFIG}" == *pr_time_benchmarks* ]]; then | ||||
|   pr_time_benchmarks | ||||
|   exit 0 | ||||
| elif [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then | ||||
| if [[ "${TEST_CONFIG}" == *dynamo_eager* ]]; then | ||||
|   DYNAMO_BENCHMARK_FLAGS+=(--backend eager) | ||||
| elif [[ "${TEST_CONFIG}" == *aot_eager* ]]; then | ||||
|   DYNAMO_BENCHMARK_FLAGS+=(--backend aot_eager) | ||||
| @ -505,12 +486,6 @@ test_perf_for_dashboard() { | ||||
|             --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_autotune_${suite}_${dtype}_${mode}_${device}_${target}.csv" | ||||
|       fi | ||||
|       if [[ "$DASHBOARD_TAG" == *aotinductor-true* ]] && [[ "$mode" == "inference" ]]; then | ||||
|         if [[ "$target" == "accuracy" ]]; then | ||||
|           # Also collect Export pass rate and display as a separate row | ||||
|           $TASKSET python "benchmarks/dynamo/$suite.py" \ | ||||
|               "${target_flag[@]}" --"$mode" --"$dtype" --export --disable-cudagraphs "$@" \ | ||||
|               --output "$TEST_REPORTS_DIR/${backend}_export_${suite}_${dtype}_${mode}_${device}_${target}.csv" | ||||
|         fi | ||||
|         TORCHINDUCTOR_ABI_COMPATIBLE=1 $TASKSET python "benchmarks/dynamo/$suite.py" \ | ||||
|             "${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \ | ||||
|             --output "$TEST_REPORTS_DIR/${backend}_aot_inductor_${suite}_${dtype}_${mode}_${device}_${target}.csv" | ||||
| @ -574,10 +549,7 @@ test_single_dynamo_benchmark() { | ||||
|     fi | ||||
|  | ||||
|     if [[ "${TEST_CONFIG}" == *_avx2* ]]; then | ||||
|       TEST_CONFIG=${TEST_CONFIG//_avx2/} | ||||
|     fi | ||||
|     if [[ "${TEST_CONFIG}" == *_avx512* ]]; then | ||||
|       TEST_CONFIG=${TEST_CONFIG//_avx512/} | ||||
|       TEST_CONFIG=${TEST_CONFIG::-5} | ||||
|     fi | ||||
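The change above replaces a fixed-width chop (${TEST_CONFIG::-5}, which only fits the 5-character _avx2 marker) with per-pattern deletion that also handles _avx512. A Python sketch with illustrative config names:

    for cfg in ("inductor_cpu_avx2", "inductor_cpu_avx512"):
        # New behavior: delete the marker wherever it appears.
        print(cfg.replace("_avx2", "").replace("_avx512", ""))  # inductor_cpu
        # Old behavior: drop the last five characters, correct only for _avx2.
        print(cfg[:-5])  # inductor_cpu, then inductor_cpu_a (wrong for _avx512)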
|     python "benchmarks/dynamo/$suite.py" \ | ||||
|       --ci --accuracy --timing --explain \ | ||||
| @ -595,9 +567,6 @@ test_single_dynamo_benchmark() { | ||||
|  | ||||
| test_inductor_micro_benchmark() { | ||||
|   TEST_REPORTS_DIR=$(pwd)/test/test-reports | ||||
|   if [[ "${TEST_CONFIG}" == *cpu* ]]; then | ||||
|     test_inductor_set_cpu_affinity | ||||
|   fi | ||||
|   python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv" | ||||
| } | ||||
|  | ||||
| @ -667,7 +636,8 @@ test_inductor_torchbench_smoketest_perf() { | ||||
|   # https://github.com/pytorch/pytorch/actions/runs/7158691360/job/19491437314, | ||||
|   # and thus we lower its threshold to reduce flakiness. If this continues to be a problem, | ||||
|   # we switch to use some other model. | ||||
|   python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.9 | ||||
|   # Lower the threshold from 4.9 to 4.7 for cu124; bump it back up after the CUDA 12.4.0 -> 12.4.1 update | ||||
|   python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.7 | ||||
|  | ||||
|   # Check memory compression ratio for a few models | ||||
|   for test in hf_Albert timm_vision_transformer; do | ||||
| @ -691,7 +661,7 @@ test_inductor_torchbench_smoketest_perf() { | ||||
| } | ||||
|  | ||||
| test_inductor_get_core_number() { | ||||
|   if [[ "${TEST_CONFIG}" == *aarch64* ]]; then | ||||
|   if [[ "${TEST_CONFIG}" == *aarch64 ]]; then | ||||
|     echo "$(($(lscpu | grep 'Cluster(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per cluster:' | awk '{print $4}')))" | ||||
|   else | ||||
|     echo "$(($(lscpu | grep 'Socket(s):' | awk '{print $2}') * $(lscpu | grep 'Core(s) per socket:' | awk '{print $4}')))" | ||||
| @ -701,16 +671,11 @@ test_inductor_get_core_number() { | ||||
| test_inductor_set_cpu_affinity(){ | ||||
|   # Set jemalloc | ||||
|   JEMALLOC_LIB="$(find /usr/lib -name libjemalloc.so.2)" | ||||
|   export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD" | ||||
|   IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so" | ||||
|   export LD_PRELOAD="$JEMALLOC_LIB":"$IOMP_LIB":"$LD_PRELOAD" | ||||
|   export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" | ||||
|  | ||||
|   if [[ "${TEST_CONFIG}" != *aarch64* ]]; then | ||||
|     # Use Intel OpenMP for x86 | ||||
|     IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so" | ||||
|     export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD" | ||||
|     export KMP_AFFINITY=granularity=fine,compact,1,0 | ||||
|     export KMP_BLOCKTIME=1 | ||||
|   fi | ||||
|   export KMP_AFFINITY=granularity=fine,compact,1,0 | ||||
|   export KMP_BLOCKTIME=1 | ||||
|   cores=$(test_inductor_get_core_number) | ||||
|   export OMP_NUM_THREADS=$cores | ||||
|   end_core=$((cores-1)) | ||||
| @ -728,7 +693,7 @@ test_inductor_torchbench_cpu_smoketest_perf(){ | ||||
|   do | ||||
|     local model_name=${model_cfg[0]} | ||||
|     local data_type=${model_cfg[2]} | ||||
|     local speedup_target=${model_cfg[5]} | ||||
|     # local speedup_target=${model_cfg[5]} | ||||
|     local backend=${model_cfg[1]} | ||||
|     if [[ ${model_cfg[4]} == "cpp" ]]; then | ||||
|       export TORCHINDUCTOR_CPP_WRAPPER=1 | ||||
| @ -748,7 +713,8 @@ test_inductor_torchbench_cpu_smoketest_perf(){ | ||||
|     fi | ||||
|     cat "$output_name" | ||||
|     # The threshold value needs to be actively maintained to make this check useful. | ||||
|     python benchmarks/dynamo/check_perf_csv.py -f "$output_name" -t "$speedup_target" | ||||
|     # TODO: re-enable this after https://github.com/pytorch/pytorch/pull/131812 lands | ||||
|     # python benchmarks/dynamo/check_perf_csv.py -f "$output_name" -t "$speedup_target" | ||||
|   done | ||||
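The commented-out line above defers the check_perf_csv.py gate; conceptually it asserts that every model's measured speedup clears the target. A rough sketch of that kind of gate (the "speedup" and "name" column names are assumptions for illustration, not the exact CSV schema):

    import csv
    import sys

    def check_speedups(csv_path: str, threshold: float) -> None:
        with open(csv_path) as f:
            for row in csv.DictReader(f):
                if float(row["speedup"]) < threshold:
                    sys.exit(f"{row.get('name', '?')}: speedup {row['speedup']} < {threshold}")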
|  | ||||
|   # Add a few ABI-compatible accuracy tests for CPU. These can be removed once we turn on ABI-compatible as default. | ||||
| @ -1074,113 +1040,11 @@ test_xla() { | ||||
|   assert_git_not_dirty | ||||
| } | ||||
|  | ||||
| function check_public_api_test_fails { | ||||
|     test_name=$1 | ||||
|     invalid_item_name=$2 | ||||
|     invalid_item_desc=$3 | ||||
|  | ||||
|     echo "Running public API test '${test_name}'..." | ||||
|     test_output=$(python test/test_public_bindings.py -k "${test_name}" 2>&1) && ret=$? || ret=$? | ||||
|  | ||||
|     # Ensure test fails correctly. | ||||
|     if [ "$ret" -eq 0 ]; then | ||||
|         cat << EOF | ||||
| Expected the public API test '${test_name}' to fail after introducing | ||||
| ${invalid_item_desc}, but it succeeded! Check test/test_public_bindings.py | ||||
| for any changes that may have broken the test. | ||||
| EOF | ||||
|         return 1 | ||||
|     fi | ||||
|  | ||||
|     # Ensure invalid item is in the test output. | ||||
|     echo "${test_output}" | grep -q "${invalid_item_name}" && ret=$? || ret=$? | ||||
|  | ||||
|     if [ $ret -ne 0 ]; then | ||||
|         cat << EOF | ||||
| Expected the public API test '${test_name}' to identify ${invalid_item_desc}, but | ||||
| it didn't! It's possible the test may not have run. Check test/test_public_bindings.py | ||||
| for any changes that may have broken the test. | ||||
| EOF | ||||
|         return 1 | ||||
|     fi | ||||
|  | ||||
|     echo "Success! '${test_name}' identified ${invalid_item_desc} ${invalid_item_name}." | ||||
|     return 0 | ||||
| } | ||||
|  | ||||
| # Do NOT run this test before any other tests, like test_python_shard, etc., | ||||
| # because this function uninstalls the torch built from the branch and installs | ||||
| # the torch built at its base commit. | ||||
| test_forward_backward_compatibility() { | ||||
|   set -x | ||||
|  | ||||
|   # First, validate public API tests in the torch built from branch. | ||||
|   # Step 1. Make sure the public API test "test_correct_module_names" fails when a new file | ||||
|   # introduces an invalid public API function. | ||||
|   new_filename=$(mktemp XXXXXXXX.py -p "${TORCH_INSTALL_DIR}") | ||||
|  | ||||
|   BAD_PUBLIC_FUNC=$( | ||||
|   cat << 'EOF' | ||||
| def new_public_func(): | ||||
|   pass | ||||
|  | ||||
| # valid public API functions have __module__ set correctly | ||||
| new_public_func.__module__ = None | ||||
| EOF | ||||
|   ) | ||||
|  | ||||
|   echo "${BAD_PUBLIC_FUNC}" >> "${new_filename}" | ||||
|   invalid_api="torch.$(basename -s '.py' "${new_filename}").new_public_func" | ||||
|   echo "Created an invalid public API function ${invalid_api}..." | ||||
|  | ||||
|   check_public_api_test_fails \ | ||||
|       "test_correct_module_names" \ | ||||
|       "${invalid_api}" \ | ||||
|       "an invalid public API function" && ret=$? || ret=$? | ||||
|  | ||||
|   rm -v "${new_filename}" | ||||
|  | ||||
|   if [ "$ret" -ne 0 ]; then | ||||
|       exit 1 | ||||
|   fi | ||||
|  | ||||
|   # Step 2. Make sure that the public API test "test_correct_module_names" fails when an existing | ||||
|   # file is modified to introduce an invalid public API function. | ||||
|   EXISTING_FILEPATH="${TORCH_INSTALL_DIR}/nn/parameter.py" | ||||
|   cp -v "${EXISTING_FILEPATH}" "${EXISTING_FILEPATH}.orig" | ||||
|   echo "${BAD_PUBLIC_FUNC}" >> "${EXISTING_FILEPATH}" | ||||
|   invalid_api="torch.nn.parameter.new_public_func" | ||||
|   echo "Appended an invalid public API function to existing file ${EXISTING_FILEPATH}..." | ||||
|  | ||||
|   check_public_api_test_fails \ | ||||
|       "test_correct_module_names" \ | ||||
|       "${invalid_api}" \ | ||||
|       "an invalid public API function" && ret=$? || ret=$? | ||||
|  | ||||
|   mv -v "${EXISTING_FILEPATH}.orig" "${EXISTING_FILEPATH}" | ||||
|  | ||||
|   if [ "$ret" -ne 0 ]; then | ||||
|       exit 1 | ||||
|   fi | ||||
|  | ||||
|   # Step 3. Make sure that the public API test "test_modules_can_be_imported" fails when a module | ||||
|   # cannot be imported. | ||||
|   new_module_dir=$(mktemp XXXXXXXX -d -p "${TORCH_INSTALL_DIR}") | ||||
|   echo "invalid syntax garbage" > "${new_module_dir}/__init__.py" | ||||
|   invalid_module_name="torch.$(basename "${new_module_dir}")" | ||||
|  | ||||
|   check_public_api_test_fails \ | ||||
|       "test_modules_can_be_imported" \ | ||||
|       "${invalid_module_name}" \ | ||||
|       "a non-importable module" && ret=$? || ret=$? | ||||
|  | ||||
|   rm -rv "${new_module_dir}" | ||||
|  | ||||
|   if [ "$ret" -ne 0 ]; then | ||||
|       exit 1 | ||||
|   fi | ||||
|  | ||||
|   # Next, build torch from the merge base. | ||||
|   REPO_DIR=$(pwd) | ||||
|   if [[ "${BASE_SHA}" == "${SHA1}" ]]; then | ||||
|     echo "On trunk, we should compare schemas with torch built from the parent commit" | ||||
| @ -1382,16 +1246,14 @@ test_executorch() { | ||||
|   assert_git_not_dirty | ||||
| } | ||||
|  | ||||
| test_linux_aarch64() { | ||||
| test_linux_aarch64(){ | ||||
|   python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ | ||||
|         test_transformers test_multiprocessing test_numpy_interop \ | ||||
|         --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose | ||||
|        test_transformers test_multiprocessing test_numpy_interop --verbose | ||||
|  | ||||
|   # Dynamo tests | ||||
|   python test/run_test.py --include dynamo/test_compile dynamo/test_backends dynamo/test_comptime dynamo/test_config \ | ||||
|        dynamo/test_functions dynamo/test_fx_passes_pre_grad dynamo/test_interop dynamo/test_model_output dynamo/test_modules \ | ||||
|        dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles \ | ||||
|        --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose | ||||
|        dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles --verbose | ||||
|  | ||||
|   # Inductor tests | ||||
|   python test/run_test.py --include inductor/test_torchinductor inductor/test_benchmark_fusion inductor/test_codecache \ | ||||
| @ -1401,8 +1263,7 @@ test_linux_aarch64() { | ||||
|        inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \ | ||||
|        inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \ | ||||
|        inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \ | ||||
|        inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes inductor/test_memory \ | ||||
|        --shard "$SHARD_NUMBER" "$NUM_TEST_SHARDS" --verbose | ||||
|        inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes --verbose | ||||
| } | ||||
|  | ||||
| if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then | ||||
| @ -1461,9 +1322,9 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then | ||||
|     checkout_install_torchbench hf_Bert hf_Albert nanogpt timm_vision_transformer | ||||
|     PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf | ||||
|   elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then | ||||
|     checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_edgecnn \ | ||||
|     checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_gcn \ | ||||
|       llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \ | ||||
|       functorch_maml_omniglot yolov3 mobilenet_v2 resnext50_32x4d densenet121 mnasnet1_0 | ||||
|       shufflenet_v2_x1_0 hf_GPT2 yolov3 mobilenet_v2 resnext50_32x4d hf_T5_base | ||||
|     PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf | ||||
|   elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then | ||||
|     checkout_install_torchbench | ||||
| @ -1484,7 +1345,9 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then | ||||
|   install_torchvision | ||||
|   test_inductor_shard "${SHARD_NUMBER}" | ||||
|   if [[ "${SHARD_NUMBER}" == 1 ]]; then | ||||
|     if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then | ||||
|     if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.8-gcc11-build ]]; then | ||||
|       # Temporarily skip test_inductor_aoti due to https://github.com/pytorch/pytorch/issues/130311 | ||||
|       test_inductor_aoti | ||||
|       test_inductor_distributed | ||||
|     fi | ||||
|   fi | ||||
|  | ||||
| @ -24,12 +24,6 @@ call %INSTALLER_DIR%\install_sccache.bat | ||||
| if errorlevel 1 goto fail | ||||
| if not errorlevel 0 goto fail | ||||
|  | ||||
| if "%USE_XPU%"=="1" ( | ||||
|   :: Install xpu support packages | ||||
|   call %INSTALLER_DIR%\install_xpu.bat | ||||
|   if errorlevel 1 exit /b 1 | ||||
| ) | ||||
|  | ||||
| :: Miniconda has been installed as part of the Windows AMI with all the dependencies. | ||||
| :: We just need to activate it here | ||||
| call %INSTALLER_DIR%\activate_miniconda3.bat | ||||
| @ -49,16 +43,6 @@ if "%VC_VERSION%" == "" ( | ||||
| ) | ||||
| if errorlevel 1 goto fail | ||||
| if not errorlevel 0 goto fail | ||||
|  | ||||
| if "%USE_XPU%"=="1" ( | ||||
|   :: Activate xpu environment - VS env is required for xpu | ||||
|   call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" | ||||
|   if errorlevel 1 exit /b 1 | ||||
|   :: Reduce build time; only the MTL self-hosted runner is available now | ||||
|   SET TORCH_XPU_ARCH_LIST=xe-lpg | ||||
|   SET USE_KINETO=0 | ||||
| ) | ||||
|  | ||||
| @echo on | ||||
| popd | ||||
|  | ||||
| @ -81,6 +65,13 @@ set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH% | ||||
| set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64 | ||||
| set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH% | ||||
| set CUDNN_ROOT_DIR=%CUDA_PATH% | ||||
| set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt | ||||
| set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH% | ||||
|  | ||||
| set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64 | ||||
| set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH% | ||||
| set CUDNN_ROOT_DIR=%CUDA_PATH% | ||||
| set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt | ||||
| set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH% | ||||
|  | ||||
| :cuda_build_end | ||||
|  | ||||
| @ -1,91 +0,0 @@ | ||||
| @echo on | ||||
| REM Description: Install Intel Support Packages on Windows | ||||
| REM BKM reference: https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpu/2-5.html | ||||
|  | ||||
| set XPU_INSTALL_MODE=%~1 | ||||
| if "%XPU_INSTALL_MODE%"=="" goto xpu_bundle_install_start | ||||
| if "%XPU_INSTALL_MODE%"=="bundle" goto xpu_bundle_install_start | ||||
| if "%XPU_INSTALL_MODE%"=="driver" goto xpu_driver_install_start | ||||
| if "%XPU_INSTALL_MODE%"=="all" goto xpu_driver_install_start | ||||
|  | ||||
| :arg_error | ||||
|  | ||||
| echo Illegal XPU installation mode. The value can be "bundle"/"driver"/"all" | ||||
| echo If the value is left empty, the default "bundle" mode will be used | ||||
| exit /b 1 | ||||
|  | ||||
| :xpu_driver_install_start | ||||
| :: TODO Need more testing for driver installation | ||||
| set XPU_DRIVER_LINK=https://downloadmirror.intel.com/830975/gfx_win_101.5972.exe | ||||
| curl -o xpu_driver.exe --retry 3 --retry-all-errors -k %XPU_DRIVER_LINK% | ||||
| echo "XPU Driver installing..." | ||||
| start /wait "Intel XPU Driver Installer" "xpu_driver.exe" | ||||
| if errorlevel 1 exit /b 1 | ||||
| del xpu_driver.exe | ||||
| if "%XPU_INSTALL_MODE%"=="driver" goto xpu_install_end | ||||
|  | ||||
| :xpu_bundle_install_start | ||||
|  | ||||
| set XPU_BUNDLE_PARENT_DIR=C:\Program Files (x86)\Intel\oneAPI | ||||
| set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d1a91e2-e8b8-40a5-8c7f-5db768a6a60c/w_intel-for-pytorch-gpu-dev_p_0.5.3.37_offline.exe | ||||
| set XPU_PTI_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d1a91e2-e8b8-40a5-8c7f-5db768a6a60c/w_intel-pti-dev_p_0.9.0.37_offline.exe | ||||
| set XPU_BUNDLE_VERSION=0.5.3+31 | ||||
| set XPU_PTI_VERSION=0.9.0+36 | ||||
| set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.intel-for-pytorch-gpu-dev.product | ||||
| set XPU_PTI_PRODUCT_NAME=intel.oneapi.win.intel-pti-dev.product | ||||
| set XPU_BUNDLE_INSTALLED=0 | ||||
| set XPU_PTI_INSTALLED=0 | ||||
| set XPU_BUNDLE_UNINSTALL=0 | ||||
| set XPU_PTI_UNINSTALL=0 | ||||
|  | ||||
| :: Check if XPU bundle is target version or already installed | ||||
| if exist "%XPU_BUNDLE_PARENT_DIR%\Installer\installer.exe" goto xpu_bundle_ver_check | ||||
| goto xpu_bundle_install | ||||
|  | ||||
| :xpu_bundle_ver_check | ||||
|  | ||||
| "%XPU_BUNDLE_PARENT_DIR%\Installer\installer.exe" --list-products > xpu_bundle_installed_ver.log | ||||
|  | ||||
| for /f "tokens=1,2" %%a in (xpu_bundle_installed_ver.log) do ( | ||||
|     if "%%a"=="%XPU_BUNDLE_PRODUCT_NAME%" ( | ||||
|         echo %%a Installed Version: %%b | ||||
|         set XPU_BUNDLE_INSTALLED=1 | ||||
|         if not "%XPU_BUNDLE_VERSION%"=="%%b" ( | ||||
|             start /wait "Installer Title" "%XPU_BUNDLE_PARENT_DIR%\Installer\installer.exe" --action=remove --eula=accept --silent --product-id %XPU_BUNDLE_PRODUCT_NAME% --product-ver %%b --log-dir uninstall_bundle | ||||
|             set XPU_BUNDLE_UNINSTALL=1 | ||||
|         ) | ||||
|     ) | ||||
|     if "%%a"=="%XPU_PTI_PRODUCT_NAME%" ( | ||||
|         echo %%a Installed Version: %%b | ||||
|         set XPU_PTI_INSTALLED=1 | ||||
|         if not "%XPU_PTI_VERSION%"=="%%b" ( | ||||
|             start /wait "Installer Title" "%XPU_BUNDLE_PARENT_DIR%\Installer\installer.exe" --action=remove --eula=accept --silent --product-id %XPU_PTI_PRODUCT_NAME% --product-ver %%b --log-dir uninstall_bundle | ||||
|             set XPU_PTI_UNINSTALL=1 | ||||
|         ) | ||||
|     ) | ||||
| ) | ||||
| if errorlevel 1 exit /b 1 | ||||
| if exist xpu_bundle_installed_ver.log del xpu_bundle_installed_ver.log | ||||
| if "%XPU_BUNDLE_INSTALLED%"=="0" goto xpu_bundle_install | ||||
| if "%XPU_BUNDLE_UNINSTALL%"=="1" goto xpu_bundle_install | ||||
| if "%XPU_PTI_INSTALLED%"=="0" goto xpu_pti_install | ||||
| if "%XPU_PTI_UNINSTALL%"=="1" goto xpu_pti_install | ||||
| goto xpu_install_end | ||||
|  | ||||
| :xpu_bundle_install | ||||
|  | ||||
| curl -o xpu_bundle.exe --retry 3 --retry-all-errors -k %XPU_BUNDLE_URL% | ||||
| echo "XPU Bundle installing..." | ||||
| start /wait "Intel Pytorch Bundle Installer" "xpu_bundle.exe" --action=install --eula=accept --silent --log-dir install_bundle | ||||
| if errorlevel 1 exit /b 1 | ||||
| del xpu_bundle.exe | ||||
|  | ||||
| :xpu_pti_install | ||||
|  | ||||
| curl -o xpu_pti.exe --retry 3 --retry-all-errors -k %XPU_PTI_URL% | ||||
| echo "XPU PTI installing..." | ||||
| start /wait "Intel PTI Installer" "xpu_pti.exe" --action=install --eula=accept --silent --log-dir install_bundle | ||||
| if errorlevel 1 exit /b 1 | ||||
| del xpu_pti.exe | ||||
|  | ||||
| :xpu_install_end | ||||
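A sketch of the version gate the removed batch script implements: parse `installer.exe --list-products` output into product/version pairs, then reinstall anything missing or off-pin (the sample output below is illustrative):

    PINS = {
        "intel.oneapi.win.intel-for-pytorch-gpu-dev.product": "0.5.3+31",
        "intel.oneapi.win.intel-pti-dev.product": "0.9.0+36",
    }

    list_products = """\
    intel.oneapi.win.intel-for-pytorch-gpu-dev.product 0.5.3+31
    intel.oneapi.win.intel-pti-dev.product 0.9.0+37
    """

    installed = {}
    for line in list_products.splitlines():
        parts = line.split()
        if len(parts) >= 2:
            installed[parts[0]] = parts[1]

    # A product is (re)installed when absent or pinned to a different version,
    # matching the uninstall-then-install branches in the batch script.
    needs_install = [p for p, v in PINS.items() if installed.get(p) != v]
    print(needs_install)  # the PTI product, for this sample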
| @ -40,6 +40,7 @@ set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH% | ||||
| set CUDNN_LIB_DIR=%CUDA_PATH%\lib\x64 | ||||
| set CUDA_TOOLKIT_ROOT_DIR=%CUDA_PATH% | ||||
| set CUDNN_ROOT_DIR=%CUDA_PATH% | ||||
| set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt | ||||
| set PATH=%CUDA_PATH%\bin;%CUDA_PATH%\libnvvp;%PATH% | ||||
| set NUMBAPRO_CUDALIB=%CUDA_PATH%\bin | ||||
| set NUMBAPRO_LIBDEVICE=%CUDA_PATH%\nvvm\libdevice | ||||
|  | ||||
| @ -31,6 +31,6 @@ if ERRORLEVEL 1 exit /b 1 | ||||
|  | ||||
| :: Run tests C++-side and load the exported script module. | ||||
| cd build | ||||
| set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH% | ||||
| set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH% | ||||
| test_custom_backend.exe model.pt | ||||
| if ERRORLEVEL 1 exit /b 1 | ||||
|  | ||||
| @ -31,6 +31,6 @@ if ERRORLEVEL 1 exit /b 1 | ||||
|  | ||||
| :: Run tests C++-side and load the exported script module. | ||||
| cd build | ||||
| set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH% | ||||
| set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH% | ||||
| test_custom_ops.exe model.pt | ||||
| if ERRORLEVEL 1 exit /b 1 | ||||
|  | ||||
| @ -5,7 +5,7 @@ if errorlevel 1 exit /b 1 | ||||
| set CWD=%cd% | ||||
|  | ||||
| set CPP_TESTS_DIR=%TMP_DIR_WIN%\build\torch\bin | ||||
| set PATH=%TMP_DIR_WIN%\build\torch\lib;%PATH% | ||||
| set PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64;%TMP_DIR_WIN%\build\torch\lib;%PATH% | ||||
|  | ||||
| set TORCH_CPP_TEST_MNIST_PATH=%CWD%\test\cpp\api\mnist | ||||
| python tools\download_mnist.py --quiet -d %TORCH_CPP_TEST_MNIST_PATH% | ||||
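These Windows test scripts prepend DLL directories (previously including NvToolsExt) to PATH so the C++ test binaries can resolve torch's libraries at load time. The Python-side analogue on Windows, with an illustrative directory:

    import os
    import sys

    if sys.platform == "win32":
        # Same intent as the PATH edits above; the path is illustrative.
        os.add_dll_directory(r"C:\tmp\build\torch\lib")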
|  | ||||
| @ -40,12 +40,6 @@ python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard== | ||||
| # Install Z3 optional dependency for Windows builds. | ||||
| python -m pip install z3-solver==4.12.2.0 | ||||
|  | ||||
| # Install tlparse for test\dynamo\test_structured_trace.py UTs. | ||||
| python -m pip install tlparse==0.3.25 | ||||
|  | ||||
| # Install parameterized | ||||
| python -m pip install parameterized==0.8.1 | ||||
|  | ||||
| run_tests() { | ||||
|     # Run nvidia-smi if available | ||||
|     for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do | ||||
|  | ||||
| @ -116,14 +116,15 @@ if [[ "$PACKAGE_TYPE" == libtorch ]]; then | ||||
|   cd /tmp/libtorch | ||||
| fi | ||||
|  | ||||
| if [[ "$GPU_ARCH_TYPE" == xpu ]]; then | ||||
|   # Workaround for __mkl_tmp_MOD unbound variable issue, refer to https://github.com/pytorch/pytorch/issues/130543 | ||||
|   set +u | ||||
|   source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh | ||||
| fi | ||||
|  | ||||
| # Test the package | ||||
| /builder/check_binary.sh | ||||
|  | ||||
| if [[ "\$GPU_ARCH_TYPE" != *s390x* && "\$GPU_ARCH_TYPE" != *xpu* && "\$GPU_ARCH_TYPE" != *rocm*  && "$PACKAGE_TYPE" != libtorch ]]; then | ||||
|   # Exclude s390, xpu, rocm and libtorch builds from smoke testing | ||||
|   python /builder/test/smoke_test/smoke_test.py --package=torchonly --torch-compile-check disabled | ||||
| fi | ||||
|  | ||||
| # Clean temp files | ||||
| cd /builder && git clean -ffdx | ||||
|  | ||||
|  | ||||
| @ -90,7 +90,7 @@ fi | ||||
| if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then | ||||
|     TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" | ||||
|     if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then | ||||
|         TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt) | ||||
|         TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt) | ||||
|         TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}" | ||||
|     fi | ||||
|     if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then | ||||
| @ -102,10 +102,10 @@ fi | ||||
|  | ||||
| # Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton xpu package | ||||
| if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* && $(uname) == "Linux" ]]; then | ||||
|     TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" | ||||
|     TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}" | ||||
|     if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then | ||||
|         TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-xpu.txt) | ||||
|         TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}" | ||||
|         TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+${TRITON_SHORTHASH}" | ||||
|     fi | ||||
|     if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then | ||||
|         export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}" | ||||
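In the dev-wheel branches above, a 10-character short hash from the commit pin file is appended to the Triton version (and the xpu variant now drops the platform constraint). A sketch of the string assembly, with illustrative values and assuming the pin file holds a full commit hash:

    TRITON_VERSION = "3.0.0"                           # illustrative
    TRITON_CONSTRAINT = 'platform_system == "Linux"'   # illustrative

    with open(".ci/docker/ci_commit_pins/triton-xpu.txt") as f:
        shorthash = f.read().strip()[:10]  # equivalent of cut -c1-10

    requirement = f"pytorch-triton-xpu=={TRITON_VERSION}+{shorthash}"
    requirement_with_constraint = f"{requirement}; {TRITON_CONSTRAINT}"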
|  | ||||
| @ -10,11 +10,6 @@ export SCCACHE_BUCKET=ossci-compiler-cache | ||||
| export SCCACHE_IGNORE_SERVER_IO_ERROR=1 | ||||
| export VC_YEAR=2019 | ||||
|  | ||||
| if [[ "$DESIRED_CUDA" == 'xpu' ]]; then | ||||
|     export VC_YEAR=2022 | ||||
|     export USE_SCCACHE=0 | ||||
| fi | ||||
|  | ||||
| echo "Free space on filesystem before build:" | ||||
| df -h | ||||
|  | ||||
|  | ||||
| @ -6,10 +6,6 @@ source "${BINARY_ENV_FILE:-/c/w/env}" | ||||
| export CUDA_VERSION="${DESIRED_CUDA/cu/}" | ||||
| export VC_YEAR=2019 | ||||
|  | ||||
| if [[ "$DESIRED_CUDA" == 'xpu' ]]; then | ||||
|     export VC_YEAR=2022 | ||||
| fi | ||||
|  | ||||
| pushd "$BUILDER_ROOT" | ||||
|  | ||||
| ./windows/internal/smoke_test.bat | ||||
|  | ||||
							
								
								
									
.flake8 (5 changed lines)
							| @ -7,7 +7,7 @@ max-line-length = 120 | ||||
| # C408 ignored because we like the dict keyword argument syntax | ||||
| # E501 is not flexible enough, we're using B950 instead | ||||
| ignore = | ||||
|     E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303, | ||||
|     E203,E305,E402,E501,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303, | ||||
|     # shebang has extra meaning in fbcode lints, so I think it's not worth trying | ||||
|     # to line this up with executable bit | ||||
|     EXE001, | ||||
| @ -55,9 +55,6 @@ per-file-ignores = | ||||
|     torch/distributed/_functional_collectives.py: TOR901 | ||||
|     torch/distributed/_spmd/data_parallel.py: TOR901 | ||||
|     torch/distributed/_tensor/_collective_utils.py: TOR901 | ||||
|     # This is a full package that happen to live within the test | ||||
|     # folder, so ok to skip | ||||
|     test/cpp_extensions/open_registration_extension/pytorch_openreg/_aten_impl.py: TOR901 | ||||
| optional-ascii-coding = True | ||||
| exclude = | ||||
|     ./.git, | ||||
|  | ||||
							
								
								
									
.github/actionlint.yaml (32 changed lines, vendored)
| @ -3,20 +3,18 @@ self-hosted-runner: | ||||
|     # GitHub hosted x86 Linux runners | ||||
|     - linux.20_04.4x | ||||
|     - linux.20_04.16x | ||||
|     # Repo-specific LF hosted ARC runners | ||||
|     - linux.large.arc | ||||
|     # Organization-wide AWS Linux Runners | ||||
|     - linux.large | ||||
|     - linux.2xlarge | ||||
|     - linux.4xlarge | ||||
|     - linux.9xlarge.ephemeral | ||||
|     - am2.linux.9xlarge.ephemeral | ||||
|     - linux.12xlarge | ||||
|     - linux.12xlarge.ephemeral | ||||
|     - linux.24xlarge | ||||
|     - linux.24xlarge.ephemeral | ||||
|     - linux.arm64.2xlarge | ||||
|     - linux.arm64.2xlarge.ephemeral | ||||
|     - linux.arm64.m7g.4xlarge | ||||
|     - linux.arm64.m7g.4xlarge.ephemeral | ||||
|     - linux.4xlarge.nvidia.gpu | ||||
|     - linux.8xlarge.nvidia.gpu | ||||
|     - linux.16xlarge.nvidia.gpu | ||||
| @ -32,12 +30,32 @@ self-hosted-runner: | ||||
|     - lf.linux.8xlarge.nvidia.gpu | ||||
|     - lf.linux.16xlarge.nvidia.gpu | ||||
|     - lf.linux.g5.4xlarge.nvidia.gpu | ||||
|     # Organization-wide AWS Linux Runners with new Amazon 2023 AMI | ||||
|     - amz2023.linux.large | ||||
|     - amz2023.linux.2xlarge | ||||
|     - amz2023.linux.4xlarge | ||||
|     - amz2023.linux.12xlarge | ||||
|     - amz2023.linux.24xlarge | ||||
|     - amz2023.linux.arm64.2xlarge | ||||
|     - amz2023.linux.arm64.m7g.4xlarge | ||||
|     - amz2023.linux.4xlarge.nvidia.gpu | ||||
|     - amz2023.linux.8xlarge.nvidia.gpu | ||||
|     - amz2023.linux.16xlarge.nvidia.gpu | ||||
|     - amz2023.linux.g5.4xlarge.nvidia.gpu | ||||
|     # Pytorch/pytorch AWS Linux Runners with the new Amazon 2023 AMI on Linux Foundation account | ||||
|     - amz2023.lf.linux.large | ||||
|     - amz2023.lf.linux.2xlarge | ||||
|     - amz2023.lf.linux.4xlarge | ||||
|     - amz2023.lf.linux.12xlarge | ||||
|     - amz2023.lf.linux.24xlarge | ||||
|     - amz2023.lf.linux.arm64.2xlarge | ||||
|     - amz2023.lf.linux.4xlarge.nvidia.gpu | ||||
|     - amz2023.lf.linux.8xlarge.nvidia.gpu | ||||
|     - amz2023.lf.linux.16xlarge.nvidia.gpu | ||||
|     - amz2023.lf.linux.g5.4xlarge.nvidia.gpu | ||||
|     # Repo-specific IBM hosted S390x runner | ||||
|     - linux.s390x | ||||
|     # Organization wide AWS Windows runners | ||||
|     - windows.g4dn.xlarge | ||||
|     - windows.g4dn.xlarge.nonephemeral | ||||
|     - windows.4xlarge | ||||
|     - windows.4xlarge.nonephemeral | ||||
|     - windows.8xlarge.nvidia.gpu | ||||
|     - windows.8xlarge.nvidia.gpu.nonephemeral | ||||
|  | ||||
| @ -57,7 +57,7 @@ outputs: | ||||
| runs: | ||||
|   using: composite | ||||
|   steps: | ||||
|     - uses: nick-fields/retry@v3.0.0 | ||||
|     - uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482 | ||||
|       name: Setup dependencies | ||||
|       env: | ||||
|         GITHUB_TOKEN: ${{ inputs.github-token }} | ||||
|  | ||||
| @ -17,7 +17,7 @@ inputs: | ||||
| runs: | ||||
|   using: composite | ||||
|   steps: | ||||
|     - uses: nick-fields/retry@v3.0.0 | ||||
|     - uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482 | ||||
|       name: Setup dependencies | ||||
|       with: | ||||
|         shell: bash | ||||
|  | ||||
| @ -24,7 +24,7 @@ inputs: | ||||
| runs: | ||||
|   using: composite | ||||
|   steps: | ||||
|     - uses: nick-fields/retry@v3.0.0 | ||||
|     - uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482 | ||||
|       name: Setup dependencies | ||||
|       with: | ||||
|         shell: bash | ||||
|  | ||||
							
								
								
									
.github/actions/setup-linux/action.yml (9 changed lines, vendored)
| @ -44,7 +44,7 @@ runs: | ||||
|         fi | ||||
|  | ||||
|     - name: Log in to ECR | ||||
|       uses: nick-fields/retry@v3.0.0 | ||||
|       uses: nick-fields/retry@3e91a01664abd3c5cd539100d10d33b9c5b68482 | ||||
|       env: | ||||
|         AWS_RETRY_MODE: standard | ||||
|         AWS_MAX_ATTEMPTS: "5" | ||||
| @ -59,13 +59,6 @@ runs: | ||||
|           aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ | ||||
|               --password-stdin "$AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" | ||||
|  | ||||
|           # For LF Runners we need to make sure we also login to Meta's ECR docker registry too. | ||||
|           META_AWS_ACCOUNT_ID=308535385114 | ||||
|           if [ "$AWS_ACCOUNT_ID" != "$META_AWS_ACCOUNT_ID" ] ; then | ||||
|               aws ecr get-login-password --region "$AWS_DEFAULT_REGION" | docker login --username AWS \ | ||||
|                   --password-stdin "$META_AWS_ACCOUNT_ID.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com" | ||||
|           fi | ||||
|  | ||||
|     - name: Preserve github env variables for use in docker | ||||
|       shell: bash | ||||
|       run: | | ||||
|  | ||||
							
								
								
									
.github/actions/teardown-win/action.yml (2 changed lines, vendored)
| @ -31,7 +31,7 @@ runs: | ||||
|     # retry this step several time similar to how checkout-pytorch GHA does | ||||
|     - name: Cleanup workspace | ||||
|       if: always() | ||||
|       uses: nick-fields/retry@v3.0.0 | ||||
|       uses: nick-fields/retry@v2.8.2 | ||||
|       env: | ||||
|         EXTRA_DELETE_DIR: ${{ inputs.extra-delete-dir }} | ||||
|       with: | ||||
|  | ||||
							
								
								
									
.github/ci_commit_pins/audio.txt (2 changed lines, vendored)
| @ -1 +1 @@ | ||||
| ba696ea3dfec4cbe693bf06a84c75dc196077f5b | ||||
| b3f6f511f2a1082bd56b13a3f6794e7fc3ba4862 | ||||
|  | ||||
							
								
								
									
.github/ci_commit_pins/xla.txt (2 changed lines, vendored)
| @ -1 +1 @@ | ||||
| 2eb4a60ed14a38260b85b0c765161f0ce45be6d1 | ||||
| 5ea4535f0699f366adb554183a65ebf7dc34a8be | ||||
|  | ||||
							
								
								
									
.github/label_to_label.yml (39 changed lines, vendored)
| @ -1,50 +1,13 @@ | ||||
| # Use this to auto-apply labels based on other labels. Applies to both PRs and | ||||
| # issues. Currently only supports "any" and "all". | ||||
| - any: | ||||
|   - "module: opcheck" | ||||
|   then: | ||||
|   - "module: custom-operators" | ||||
| - any: | ||||
|   - "module: custom-operators" | ||||
|   - "module: functionalization" | ||||
|   - "module: custom operators" | ||||
|   - "module: aotdispatch" | ||||
|   - "module: higher order operators" | ||||
|   - "module: fakeTensor" | ||||
|   - "module: ProxyTensor" | ||||
|   - "module: library" | ||||
|   - "module: reinplacing" | ||||
|   then: | ||||
|   - "module: pt2-dispatcher" | ||||
| - any: | ||||
|   - "module: vmap" | ||||
|   then: | ||||
|   - "module: functorch" | ||||
| - any: | ||||
|   - "module: reinplacing" | ||||
|   then: | ||||
|   - "module: inductor" | ||||
| - any: | ||||
|   - "module: pt2 optimizer" | ||||
|   then: | ||||
|   - "module: dynamo" | ||||
| - any: | ||||
|   - "module: flex attention" | ||||
|   then: | ||||
|   - "module: higher order operators" | ||||
| - any: | ||||
|   - "module: aotinductor" | ||||
|   then: | ||||
|   - "oncall: export" | ||||
| - any: | ||||
|   - "module: dynamo" | ||||
|   - "module: pt2-dispatcher" | ||||
|   - "module: inductor" | ||||
|   - "module: aotinductor" | ||||
|   - "module: cudagraphs" | ||||
|   - "oncall: export" | ||||
|   - "module: startup-tracing-compile" | ||||
|   - "module: compiled autograd" | ||||
|   - "module: flex attention" | ||||
|   - "module: dynamic shapes" | ||||
|   then: | ||||
|   - "oncall: pt2" | ||||
|  | ||||
							
								
								
									
.github/labeler.yml (1 changed line, vendored)
| @ -29,6 +29,7 @@ | ||||
| - torch/fx/experimental/recording.py | ||||
| - torch/fx/experimental/sym_node.py | ||||
| - torch/fx/experimental/validator.py | ||||
| - torch/fx/experimental/_sym_dispatch_mode.py | ||||
| - torch/fx/experimental/proxy_tensor.py | ||||
| - test/distributed/_tensor/test_dtensor_compile.py | ||||
| - test/distributed/tensor/parallel/test_fsdp_2d_parallel.py | ||||
|  | ||||
							
								
								
									
.github/lf-canary-scale-config.yml (232 changed lines, vendored)
| @ -7,14 +7,10 @@ | ||||
| #   runners. Runners listed here will be available as self hosted | ||||
| #   runners, configuration is directly pulled from the main branch. | ||||
| # | ||||
| # NOTE (Apr, 5, 2021): Linux runners are currently all on amazonlinux2 | ||||
| # | ||||
| # NOTES: | ||||
| #  - Linux runners are by default non-ephemeral to reduce the amount of CreateInstances calls | ||||
| #    to avoid RequestLimitExceeded issues | ||||
| #  - When updating this file, run the following command to validate the YAML and to generate | ||||
| #    corresponding versions of scale-config for the pytorch/pytorch repo and merge the | ||||
| #    pytorch/pytorch changes before merging these changes. | ||||
| #    `python .github/scripts/validate_scale_config.py --test-infra-repo-root [path_to_test-infra_root] --pytorch-repo-root [path_to_pytorch_root]` | ||||
| # NOTE (Jan 5, 2021): Linux runners are all non-ephemeral to reduce the amount of CreateInstances calls | ||||
| #                     to avoid RequestLimitExceeded issues | ||||
| # | ||||
| # TODO: Add some documentation on how the auto-scaling works | ||||
| # | ||||
| @ -35,190 +31,132 @@ runner_types: | ||||
|     is_ephemeral: false | ||||
|     max_available: 1000 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.10xlarge.avx2: | ||||
|     disk_size: 200 | ||||
|     instance_type: m4.10xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 450 | ||||
|     max_available: 60 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.24xl.spr-metal: | ||||
|     disk_size: 200 | ||||
|     instance_type: c7i.metal-24xl | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.16xlarge.spr: | ||||
|     disk_size: 200 | ||||
|     instance_type: c7i.16xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.9xlarge.ephemeral: | ||||
|     disk_size: 200 | ||||
|     instance_type: c5.9xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 50 | ||||
|     max_available: 20 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|     variants: | ||||
|       am2: | ||||
|         ami: amzn2-ami-hvm-2.0.20240306.2-x86_64-ebs | ||||
|   lf.c.linux.12xlarge.ephemeral: | ||||
|     disk_size: 200 | ||||
|     instance_type: c5.12xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 300 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.16xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g3.16xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.24xlarge: | ||||
|     disk_size: 150 | ||||
|     instance_type: c5.24xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 500 | ||||
|     max_available: 250 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.24xlarge.ephemeral: | ||||
|     disk_size: 150 | ||||
|     instance_type: c5.24xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.2xlarge: | ||||
|     disk_size: 150 | ||||
|     instance_type: c5.2xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 3120 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.4xlarge: | ||||
|     disk_size: 150 | ||||
|     instance_type: c5.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 1000 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.4xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g3.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 1000 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.8xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g3.8xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 400 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.g4dn.12xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g4dn.12xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 250 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.g4dn.metal.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g4dn.metal | ||||
|     is_ephemeral: false | ||||
|     max_available: 300 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.g5.48xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g5.48xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.g5.12xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g5.12xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.g5.4xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g5.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 2400 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.g6.4xlarge.experimental.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g6.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 50 | ||||
|     max_available: 30 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.large: | ||||
|     max_available: 1200 | ||||
|     disk_size: 15 | ||||
|     instance_type: c5.large | ||||
|     is_ephemeral: false | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.c.linux.arm64.2xlarge: | ||||
|     disk_size: 256 | ||||
|     instance_type: t4g.2xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-arm64 | ||||
|   lf.c.linux.arm64.m7g.4xlarge: | ||||
|     disk_size: 256 | ||||
|     instance_type: m7g.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-arm64 | ||||
|   lf.c.linux.arm64.2xlarge.ephemeral: | ||||
|     disk_size: 256 | ||||
|     instance_type: t4g.2xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-arm64 | ||||
|   lf.c.linux.arm64.m7g.4xlarge.ephemeral: | ||||
|     disk_size: 256 | ||||
|     instance_type: m7g.4xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-arm64 | ||||
|   lf.c.linux.arm64.m7g.metal: | ||||
|     disk_size: 256 | ||||
|     instance_type: m7g.metal | ||||
|     is_ephemeral: false | ||||
|     max_available: 100 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-arm64 | ||||
|   lf.c.windows.g4dn.xlarge: | ||||
|     disk_size: 256 | ||||
|     instance_type: g4dn.xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 100 | ||||
|     os: windows | ||||
|   lf.c.windows.g4dn.xlarge.nonephemeral: | ||||
|     disk_size: 256 | ||||
|     instance_type: g4dn.xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 100 | ||||
|     os: windows | ||||
|   lf.c.windows.4xlarge: | ||||
|     disk_size: 256 | ||||
|     instance_type: c5d.4xlarge | ||||
| @ -249,3 +187,159 @@ runner_types: | ||||
|     is_ephemeral: false | ||||
|     max_available: 250 | ||||
|     os: windows | ||||
|  | ||||
|   ### Setup runner types to test the Amazon Linux 2023 AMI | ||||
|   lf.c.amz2023.linux.12xlarge: | ||||
|     disk_size: 200 | ||||
|     instance_type: c5.12xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 1000 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.10xlarge.avx2: | ||||
|     disk_size: 200 | ||||
|     instance_type: m4.10xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 60 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.24xl.spr-metal: | ||||
|     disk_size: 200 | ||||
|     instance_type: c7i.metal-24xl | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.16xlarge.spr: | ||||
|     disk_size: 200 | ||||
|     instance_type: c7i.16xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.9xlarge.ephemeral: | ||||
|     disk_size: 200 | ||||
|     instance_type: c5.9xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 20 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.12xlarge.ephemeral: | ||||
|     disk_size: 200 | ||||
|     instance_type: c5.12xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 300 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.16xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g3.16xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.24xlarge: | ||||
|     disk_size: 150 | ||||
|     instance_type: c5.24xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 250 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.2xlarge: | ||||
|     disk_size: 150 | ||||
|     instance_type: c5.2xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 3120 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.4xlarge: | ||||
|     disk_size: 150 | ||||
|     instance_type: c5.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 1000 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.4xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g3.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 1000 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.8xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g3.8xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 400 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.g4dn.12xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g4dn.12xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 250 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.g4dn.metal.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g4dn.metal | ||||
|     is_ephemeral: false | ||||
|     max_available: 300 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.g5.48xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g5.48xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.g5.12xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g5.12xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.g5.4xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g5.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 2400 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.g6.4xlarge.experimental.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g6.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 30 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.large: | ||||
|     max_available: 1200 | ||||
|     disk_size: 15 | ||||
|     instance_type: c5.large | ||||
|     is_ephemeral: false | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.c.amz2023.linux.arm64.2xlarge: | ||||
|     disk_size: 256 | ||||
|     instance_type: t4g.2xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64 | ||||
|   lf.c.amz2023.linux.arm64.m7g.4xlarge: | ||||
|     disk_size: 256 | ||||
|     instance_type: m7g.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64 | ||||
|   lf.c.amz2023.linux.arm64.m7g.metal: | ||||
|     disk_size: 256 | ||||
|     instance_type: m7g.metal | ||||
|     is_ephemeral: false | ||||
|     max_available: 100 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64 | ||||
|  | ||||
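The lf.c.* keys above mirror the plain lf.* runner types that appear later in this compare view; a canary runner is selected by prepending a fleet prefix to a base label. A minimal sketch of that rewrite, assuming the prefix conventions used in runner_determinator.py further down ("" for Meta runners, "lf." for the Linux Foundation fleet, "lf.c." for its canary pool); the helper name is hypothetical:

    # Hypothetical helper: map a base runner label onto a fleet by
    # prepending the rollout prefix, matching the key naming above.
    def apply_runner_prefix(base_label: str, prefix: str) -> str:
        return f"{prefix}{base_label}"

    assert apply_runner_prefix("linux.arm64.2xlarge", "lf.c.") == "lf.c.linux.arm64.2xlarge"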
.github/lf-scale-config.yml (vendored): 232 changes
							| @ -7,14 +7,10 @@ | ||||
| #   runners. Runners listed here will be available as self hosted | ||||
| #   runners, configuration is directly pulled from the main branch. | ||||
| # | ||||
| # NOTE (Apr 5, 2021): Linux runners currently all run amazonlinux2 | ||||
| # | ||||
| # NOTES: | ||||
| #  - Linux runners are by default non-ephemeral to reduce the amount of CreateInstances calls | ||||
| #    to avoid RequestLimitExceeded issues | ||||
| #  - When updating this file, run the following command to validate the YAML and to generate | ||||
| #    corresponding versions of scale-config for the pytorch/pytorch repo and merge the | ||||
| #    pytorch/pytorch changes before merging these changes. | ||||
| #    `python .github/scripts/validate_scale_config.py --test-infra-repo-root [path_to_test-infra_root] --pytorch-repo-root [path_to_pytorch_root]` | ||||
| # NOTE (Jan 5, 2021): Linux runners are all non-ephemeral to reduce the amount of CreateInstances calls | ||||
| #                     to avoid RequestLimitExceeded issues | ||||
| # | ||||
| # TODO: Add some documentation on how the auto-scaling works | ||||
| # | ||||
| @ -35,190 +31,132 @@ runner_types: | ||||
|     is_ephemeral: false | ||||
|     max_available: 1000 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.10xlarge.avx2: | ||||
|     disk_size: 200 | ||||
|     instance_type: m4.10xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 450 | ||||
|     max_available: 60 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.24xl.spr-metal: | ||||
|     disk_size: 200 | ||||
|     instance_type: c7i.metal-24xl | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.16xlarge.spr: | ||||
|     disk_size: 200 | ||||
|     instance_type: c7i.16xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.9xlarge.ephemeral: | ||||
|     disk_size: 200 | ||||
|     instance_type: c5.9xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 50 | ||||
|     max_available: 20 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|     variants: | ||||
|       am2: | ||||
|         ami: amzn2-ami-hvm-2.0.20240306.2-x86_64-ebs | ||||
|   lf.linux.12xlarge.ephemeral: | ||||
|     disk_size: 200 | ||||
|     instance_type: c5.12xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 300 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.16xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g3.16xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.24xlarge: | ||||
|     disk_size: 150 | ||||
|     instance_type: c5.24xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 500 | ||||
|     max_available: 250 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.24xlarge.ephemeral: | ||||
|     disk_size: 150 | ||||
|     instance_type: c5.24xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.2xlarge: | ||||
|     disk_size: 150 | ||||
|     instance_type: c5.2xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 3120 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.4xlarge: | ||||
|     disk_size: 150 | ||||
|     instance_type: c5.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 1000 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.4xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g3.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 1000 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.8xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g3.8xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 400 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.g4dn.12xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g4dn.12xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 250 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.g4dn.metal.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g4dn.metal | ||||
|     is_ephemeral: false | ||||
|     max_available: 300 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.g5.48xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g5.48xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.g5.12xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g5.12xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.g5.4xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g5.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 2400 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.g6.4xlarge.experimental.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g6.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 50 | ||||
|     max_available: 30 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.large: | ||||
|     max_available: 1200 | ||||
|     disk_size: 15 | ||||
|     instance_type: c5.large | ||||
|     is_ephemeral: false | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-x86_64 | ||||
|   lf.linux.arm64.2xlarge: | ||||
|     disk_size: 256 | ||||
|     instance_type: t4g.2xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-arm64 | ||||
|   lf.linux.arm64.m7g.4xlarge: | ||||
|     disk_size: 256 | ||||
|     instance_type: m7g.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-arm64 | ||||
|   lf.linux.arm64.2xlarge.ephemeral: | ||||
|     disk_size: 256 | ||||
|     instance_type: t4g.2xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-arm64 | ||||
|   lf.linux.arm64.m7g.4xlarge.ephemeral: | ||||
|     disk_size: 256 | ||||
|     instance_type: m7g.4xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-arm64 | ||||
|   lf.linux.arm64.m7g.metal: | ||||
|     disk_size: 256 | ||||
|     instance_type: m7g.metal | ||||
|     is_ephemeral: false | ||||
|     max_available: 100 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.202*-kernel-6.1-arm64 | ||||
|   lf.windows.g4dn.xlarge: | ||||
|     disk_size: 256 | ||||
|     instance_type: g4dn.xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 100 | ||||
|     os: windows | ||||
|   lf.windows.g4dn.xlarge.nonephemeral: | ||||
|     disk_size: 256 | ||||
|     instance_type: g4dn.xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 100 | ||||
|     os: windows | ||||
|   lf.windows.4xlarge: | ||||
|     disk_size: 256 | ||||
|     instance_type: c5d.4xlarge | ||||
| @ -249,3 +187,159 @@ runner_types: | ||||
|     is_ephemeral: false | ||||
|     max_available: 250 | ||||
|     os: windows | ||||
|  | ||||
|   ### Setup runner types to test the Amazon Linux 2023 AMI | ||||
|   lf.amz2023.linux.12xlarge: | ||||
|     disk_size: 200 | ||||
|     instance_type: c5.12xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 1000 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.10xlarge.avx2: | ||||
|     disk_size: 200 | ||||
|     instance_type: m4.10xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 60 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.24xl.spr-metal: | ||||
|     disk_size: 200 | ||||
|     instance_type: c7i.metal-24xl | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.16xlarge.spr: | ||||
|     disk_size: 200 | ||||
|     instance_type: c7i.16xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.9xlarge.ephemeral: | ||||
|     disk_size: 200 | ||||
|     instance_type: c5.9xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 20 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.12xlarge.ephemeral: | ||||
|     disk_size: 200 | ||||
|     instance_type: c5.12xlarge | ||||
|     is_ephemeral: true | ||||
|     max_available: 300 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.16xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g3.16xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.24xlarge: | ||||
|     disk_size: 150 | ||||
|     instance_type: c5.24xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 250 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.2xlarge: | ||||
|     disk_size: 150 | ||||
|     instance_type: c5.2xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 3120 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.4xlarge: | ||||
|     disk_size: 150 | ||||
|     instance_type: c5.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 1000 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.4xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g3.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 1000 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.8xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g3.8xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 400 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.g4dn.12xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g4dn.12xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 250 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.g4dn.metal.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g4dn.metal | ||||
|     is_ephemeral: false | ||||
|     max_available: 300 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.g5.48xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g5.48xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.g5.12xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g5.12xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 150 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.g5.4xlarge.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g5.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 2400 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.g6.4xlarge.experimental.nvidia.gpu: | ||||
|     disk_size: 150 | ||||
|     instance_type: g6.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 30 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.large: | ||||
|     max_available: 1200 | ||||
|     disk_size: 15 | ||||
|     instance_type: c5.large | ||||
|     is_ephemeral: false | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64 | ||||
|   lf.amz2023.linux.arm64.2xlarge: | ||||
|     disk_size: 256 | ||||
|     instance_type: t4g.2xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64 | ||||
|   lf.amz2023.linux.arm64.m7g.4xlarge: | ||||
|     disk_size: 256 | ||||
|     instance_type: m7g.4xlarge | ||||
|     is_ephemeral: false | ||||
|     max_available: 200 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64 | ||||
|   lf.amz2023.linux.arm64.m7g.metal: | ||||
|     disk_size: 256 | ||||
|     instance_type: m7g.metal | ||||
|     is_ephemeral: false | ||||
|     max_available: 100 | ||||
|     os: linux | ||||
|     ami: al2023-ami-2023.5.20240701.0-kernel-6.1-arm64 | ||||
|  | ||||
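Every runner_types entry above repeats the same handful of fields, and the header comment names validate_scale_config.py as the canonical validator. As a rough illustration only (not that script), a sketch that loads the YAML and checks the fields visible in this file:

    import yaml

    # Fields every entry in this file carries; linux entries here
    # additionally pin an "ami". Illustrative check, not the real validator.
    REQUIRED = {"instance_type", "os", "is_ephemeral", "max_available", "disk_size"}

    def check_scale_config(path: str) -> None:
        with open(path) as f:
            config = yaml.safe_load(f)
        for name, spec in config["runner_types"].items():
            missing = REQUIRED - spec.keys()
            if missing:
                raise ValueError(f"{name} is missing fields: {sorted(missing)}")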
.github/merge_rules.yaml (vendored): 28 changes
							| @ -86,18 +86,6 @@ | ||||
|   - pull | ||||
|   - inductor | ||||
|  | ||||
| - name: OSS CI / pytorchbot / slow tests | ||||
|   patterns: | ||||
|   - test/slow_tests.json | ||||
|   approved_by: | ||||
|   - pytorchbot | ||||
|   ignore_flaky_failures: false | ||||
|   mandatory_checks_name: | ||||
|   - EasyCLA | ||||
|   - Lint | ||||
|   - pull | ||||
|   - slow | ||||
|  | ||||
| - name: OSS CI /pytorchbot / Executorch | ||||
|   patterns: | ||||
|   - .ci/docker/ci_commit_pins/executorch.txt | ||||
| @ -119,8 +107,8 @@ | ||||
|   mandatory_checks_name: | ||||
|   - EasyCLA | ||||
|   - Lint | ||||
|   - pull / linux-focal-py3_9-clang9-xla / build | ||||
|   - pull / linux-focal-py3_9-clang9-xla / test (xla, 1, 1, linux.12xlarge) | ||||
|   - pull / linux-focal-py3_8-clang9-xla / build | ||||
|   - pull / linux-focal-py3_8-clang9-xla / test (xla, 1, 1, linux.12xlarge) | ||||
|  | ||||
| - name: Documentation | ||||
|   patterns: | ||||
| @ -294,11 +282,9 @@ | ||||
|   - torch/_C/_distributed* | ||||
|   - torch/csrc/distributed/** | ||||
|   - torch/testing/_internal/distributed/** | ||||
|   - torch/multiprocessing/** | ||||
|   - test/distributed/** | ||||
|   - test/cpp/dist_autograd/** | ||||
|   - test/cpp/rpc/** | ||||
|   - test/*multiprocessing* | ||||
|   approved_by: | ||||
|   - wconstab | ||||
|   - mrshenli | ||||
| @ -537,14 +523,6 @@ | ||||
|   - Skylion007 | ||||
|   - ngimel | ||||
|   - peterbell10 | ||||
|   - eqy | ||||
|   - jansel | ||||
|   - jeffdaily | ||||
|   - eellison | ||||
|   - anijain2305 | ||||
|   - bdhirsh | ||||
|   - zou3519 | ||||
|   - isuruf | ||||
|   mandatory_checks_name: | ||||
|   - EasyCLA | ||||
|   - Lint | ||||
| @ -559,8 +537,6 @@ | ||||
|   - ezyang | ||||
|   - dzhulgakov | ||||
|   - malfet | ||||
|   - albanD | ||||
|   - ptrblck | ||||
|   mandatory_checks_name: | ||||
|   - EasyCLA | ||||
|   - Lint | ||||
|  | ||||
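Each rule above pairs file patterns with approved_by reviewers and mandatory_checks_name checks. A minimal sketch of matching a PR's changed files against such globs, assuming fnmatch-style semantics and an all-files-must-match policy (the real merge bot may differ):

    from fnmatch import fnmatch

    def rule_matches(changed_files: list[str], patterns: list[str]) -> bool:
        # In this sketch a rule applies only if every changed file
        # matches at least one of the rule's patterns.
        return all(any(fnmatch(f, p) for p in patterns) for f in changed_files)

    print(rule_matches(["test/distributed/test_store.py"], ["test/distributed/**"]))  # True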
.github/nitpicks.yml (vendored): 5 changes
							| @ -1,5 +0,0 @@ | ||||
| - markdown: | | ||||
|     ## Attention! native_functions.yaml was changed | ||||
|     If you are adding a new function or defaulted argument to native_functions.yaml, you cannot use it from pre-existing Python frontend code until our FC window passes (two weeks).  Split your PR into two PRs, one which adds the new C++ functionality, and one that makes use of it from Python, and land them two weeks apart.  See https://github.com/pytorch/pytorch/wiki/PyTorch's-Python-Frontend-Backward-and-Forward-Compatibility-Policy#forwards-compatibility-fc for more info. | ||||
|   pathFilter: | ||||
|     - 'aten/src/ATen/native/native_functions.yaml' | ||||
.github/pytorch-probot.yml (vendored): 1 change
							| @ -9,7 +9,6 @@ ciflow_push_tags: | ||||
| - ciflow/inductor-rocm | ||||
| - ciflow/inductor-perf-compare | ||||
| - ciflow/inductor-micro-benchmark | ||||
| - ciflow/inductor-micro-benchmark-cpu-x86 | ||||
| - ciflow/inductor-cu124 | ||||
| - ciflow/linux-aarch64 | ||||
| - ciflow/mps | ||||
|  | ||||
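These ciflow_push_tags gate which ciflow/<name> tags may trigger workflows; runner_determinator.py later in this diff expects bot-created tags to embed the PR number as ciflow/<name>/<pr-number>. A small sketch of recovering that number, under the same naming assumption:

    from typing import Optional

    def pr_number_from_ciflow_tag(ref_name: str) -> Optional[int]:
        # Expected tag shape: ciflow/<name>/<pr-number>
        parts = ref_name.split("/")
        if len(parts) == 3 and parts[0] == "ciflow" and parts[2].isdigit():
            return int(parts[2])
        return None

    assert pr_number_from_ciflow_tag("ciflow/trunk/12345") == 12345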
.github/requirements/conda-env-iOS.txt (vendored): 2 changes
							| @ -4,4 +4,4 @@ ninja=1.10.2 | ||||
| numpy=1.23.3 | ||||
| pyyaml=6.0 | ||||
| setuptools=68.2.2 | ||||
| typing-extensions=4.11.0 | ||||
| typing-extensions=4.9.0 | ||||
|  | ||||
| @ -1,7 +1,6 @@ | ||||
| boto3==1.19.12 | ||||
| hypothesis==6.56.4 | ||||
| expecttest==0.2.1 | ||||
| fbscribelogger==0.1.6 | ||||
| expecttest==0.1.6 | ||||
| librosa>=0.6.2 | ||||
| mpmath==1.3.0 | ||||
| networkx==2.8.7 | ||||
| @ -19,7 +18,7 @@ pytest-rerunfailures==10.3 | ||||
| pytest-flakefinder==1.1.0 | ||||
| scipy==1.10.1 | ||||
| sympy==1.12.1 ; python_version == "3.8" | ||||
| sympy==1.13.1 ; python_version >= "3.9" | ||||
| sympy>=1.13.0 ; python_version >= "3.9" | ||||
| unittest-xml-reporting<=3.2.0,>=2.0.0 | ||||
| xdoctest==1.1.0 | ||||
| filelock==3.6.0 | ||||
| @ -31,4 +30,3 @@ optree==0.12.1 | ||||
| # NB: test_hparams_* from test_tensorboard is failing with protobuf 5.26.0 in | ||||
| # which the stringify metadata is wrong when escaping double quote | ||||
| protobuf==3.20.2 | ||||
| parameterized==0.8.1 | ||||
|  | ||||
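The sympy pins above rely on PEP 508 environment markers (the text after ";") so a single requirements file can pin different versions per interpreter. A quick illustration with the packaging library, which implements marker evaluation:

    from packaging.markers import Marker

    marker = Marker('python_version >= "3.9"')
    print(marker.evaluate())                           # evaluated against the running interpreter
    print(marker.evaluate({"python_version": "3.8"}))  # False: environment overridden explicitly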
.github/scripts/build_triton_wheel.py (vendored): 26 changes
							| @ -15,7 +15,9 @@ REPO_DIR = SCRIPT_DIR.parent.parent | ||||
|  | ||||
| def read_triton_pin(device: str = "cuda") -> str: | ||||
|     triton_file = "triton.txt" | ||||
|     if device == "xpu": | ||||
|     if device == "rocm": | ||||
|         triton_file = "triton-rocm.txt" | ||||
|     elif device == "xpu": | ||||
|         triton_file = "triton-xpu.txt" | ||||
|     with open(REPO_DIR / ".ci" / "docker" / "ci_commit_pins" / triton_file) as f: | ||||
|         return f.read().strip() | ||||
| @ -48,25 +50,6 @@ def patch_init_py( | ||||
|         f.write(orig) | ||||
|  | ||||
|  | ||||
| # TODO: remove patch_setup_py() once we have a proper fix for https://github.com/triton-lang/triton/issues/4527 | ||||
| def patch_setup_py(path: Path) -> None: | ||||
|     with open(path) as f: | ||||
|         orig = f.read() | ||||
|     try: | ||||
|         orig = check_and_replace( | ||||
|             orig, | ||||
|             "https://tritonlang.blob.core.windows.net/llvm-builds/", | ||||
|             "https://oaitriton.blob.core.windows.net/public/llvm-builds/", | ||||
|         ) | ||||
|         with open(path, "w") as f: | ||||
|             f.write(orig) | ||||
|     except RuntimeError as e: | ||||
|         print( | ||||
|             f"Applying patch_setup_py() for llvm-build package failed: {e}.", | ||||
|             "If you are trying to build a newer version of Triton, you can ignore this.", | ||||
|         ) | ||||
|  | ||||
|  | ||||
| def build_triton( | ||||
|     *, | ||||
|     version: str, | ||||
| @ -108,9 +91,6 @@ def build_triton( | ||||
|         else: | ||||
|             check_call(["git", "checkout", commit_hash], cwd=triton_basedir) | ||||
|  | ||||
|         # TODO: remove this and patch_setup_py() once we have a proper fix for https://github.com/triton-lang/triton/issues/4527 | ||||
|         patch_setup_py(triton_pythondir / "setup.py") | ||||
|  | ||||
|         if build_conda: | ||||
|             with open(triton_basedir / "meta.yaml", "w") as meta: | ||||
|                 print( | ||||
|  | ||||
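patch_setup_py() above calls a check_and_replace() helper that this hunk does not show. A plausible reading, sketched under the assumption that it raises when the expected substring is absent (which would explain the except RuntimeError branch):

    def check_and_replace(text: str, old: str, new: str) -> str:
        # Assumed behavior: fail loudly when the target string is missing,
        # so callers notice once upstream has already changed.
        if old not in text:
            raise RuntimeError(f"can't find {old!r} in the given text")
        return text.replace(old, new)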
.github/scripts/check_labels.py (vendored): 11 changes
							| @ -27,12 +27,6 @@ def parse_args() -> Any: | ||||
|  | ||||
|     parser = ArgumentParser("Check PR labels") | ||||
|     parser.add_argument("pr_num", type=int) | ||||
|     # add a flag to return a non-zero exit code if the PR does not have the required labels | ||||
|     parser.add_argument( | ||||
|         "--exit-non-zero", | ||||
|         action="store_true", | ||||
|         help="Return a non-zero exit code if the PR does not have the required labels", | ||||
|     ) | ||||
|  | ||||
|     return parser.parse_args() | ||||
|  | ||||
| @ -47,13 +41,10 @@ def main() -> None: | ||||
|         if not has_required_labels(pr): | ||||
|             print(LABEL_ERR_MSG) | ||||
|             add_label_err_comment(pr) | ||||
|             if args.exit_non_zero: | ||||
|                 sys.exit(1) | ||||
|         else: | ||||
|             delete_all_label_err_comments(pr) | ||||
|     except Exception as e: | ||||
|         if args.exit_non_zero: | ||||
|             sys.exit(1) | ||||
|         pass | ||||
|  | ||||
|     sys.exit(0) | ||||
|  | ||||
|  | ||||
.github/scripts/cherry_pick.py (vendored): 3 changes
							| @ -169,8 +169,7 @@ def create_cherry_pick_branch( | ||||
|     repo.create_branch_and_checkout(branch=cherry_pick_branch) | ||||
|  | ||||
|     # We might want to support ghstack later | ||||
|     # We don't want to resolve conflicts here. | ||||
|     repo._run_git("cherry-pick", "-x", commit_sha) | ||||
|     repo._run_git("cherry-pick", "-x", "-X", "theirs", commit_sha) | ||||
|     repo.push(branch=cherry_pick_branch, dry_run=False) | ||||
|  | ||||
|     return cherry_pick_branch | ||||
|  | ||||
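The change above swaps a conflict-tolerant cherry-pick for a strict one: -X theirs resolves any conflict in favor of the picked commit, while dropping it makes conflicting picks fail so a human resolves them. A small illustrative wrapper around the two invocations:

    import subprocess

    def cherry_pick(commit_sha: str, auto_resolve: bool = False) -> None:
        cmd = ["git", "cherry-pick", "-x"]
        if auto_resolve:
            # -X theirs: on conflict, prefer the picked commit's side.
            cmd += ["-X", "theirs"]
        # With auto_resolve=False this raises CalledProcessError on conflict.
        subprocess.run(cmd + [commit_sha], check=True)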
.github/scripts/generate_binary_build_matrix.py (vendored): 85 changes
							| @ -18,13 +18,13 @@ from typing import Dict, List, Optional, Tuple | ||||
| CUDA_ARCHES = ["11.8", "12.1", "12.4"] | ||||
|  | ||||
|  | ||||
| CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.1"} | ||||
| CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.0"} | ||||
|  | ||||
|  | ||||
| CUDA_ARCHES_CUDNN_VERSION = {"11.8": "9", "12.1": "9", "12.4": "9"} | ||||
|  | ||||
|  | ||||
| ROCM_ARCHES = ["6.1", "6.2"] | ||||
| ROCM_ARCHES = ["6.0", "6.1"] | ||||
|  | ||||
| XPU_ARCHES = ["xpu"] | ||||
|  | ||||
| @ -68,18 +68,18 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = { | ||||
|         "nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'" | ||||
|     ), | ||||
|     "12.4": ( | ||||
|         "nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cuda-runtime-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cuda-cupti-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cublas-cu12==12.4.5.8; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cufft-cu12==11.2.1.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-curand-cu12==10.3.5.147; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cusolver-cu12==11.6.1.9; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cusparse-cu12==12.3.1.170; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nvtx-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nvjitlink-cu12==12.4.127; platform_system == 'Linux' and platform_machine == 'x86_64'" | ||||
|         "nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " | ||||
|         "nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'" | ||||
|     ), | ||||
| } | ||||
|  | ||||
| @ -215,7 +215,7 @@ LIBTORCH_CONTAINER_IMAGES: Dict[Tuple[str, str], str] = { | ||||
|     ("cpu", CXX11_ABI): f"pytorch/libtorch-cxx11-builder:cpu-{DEFAULT_TAG}", | ||||
| } | ||||
|  | ||||
| FULL_PYTHON_VERSIONS = ["3.9", "3.10", "3.11", "3.12"] | ||||
| FULL_PYTHON_VERSIONS = ["3.8", "3.9", "3.10", "3.11", "3.12"] | ||||
|  | ||||
|  | ||||
| def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str: | ||||
| @ -325,7 +325,6 @@ def generate_wheels_matrix( | ||||
|     os: str, | ||||
|     arches: Optional[List[str]] = None, | ||||
|     python_versions: Optional[List[str]] = None, | ||||
|     use_split_build: bool = False, | ||||
| ) -> List[Dict[str, str]]: | ||||
|     package_type = "wheel" | ||||
|     if os == "linux" or os == "linux-aarch64" or os == "linux-s390x": | ||||
| @ -341,7 +340,7 @@ def generate_wheels_matrix( | ||||
|         if os == "linux": | ||||
|             arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES | ||||
|         elif os == "windows": | ||||
|             arches += CUDA_ARCHES + XPU_ARCHES | ||||
|             arches += CUDA_ARCHES | ||||
|         elif os == "linux-aarch64": | ||||
|             # Only want the one arch as the CPU type is different and | ||||
|             # uses different build/test scripts | ||||
| @ -366,23 +365,13 @@ def generate_wheels_matrix( | ||||
|                 else arch_version | ||||
|             ) | ||||
|  | ||||
|             # TODO: Enable python 3.13 on rocm, aarch64, windows | ||||
|             # TODO: Enable python 3.13 on rocm, xpu, aarch64, windows | ||||
|             if ( | ||||
|                 gpu_arch_type == "rocm" or (os != "linux" and os != "linux-s390x") | ||||
|                 gpu_arch_type in ["rocm", "xpu"] or os != "linux" | ||||
|             ) and python_version == "3.13": | ||||
|                 continue | ||||
|  | ||||
|             if use_split_build and ( | ||||
|                 arch_version not in ["12.4", "12.1", "11.8", "cpu"] or os != "linux" | ||||
|             ): | ||||
|                 raise RuntimeError( | ||||
|                     "Split build is only supported on linux with cuda 12.4, 12.1, 11.8, and cpu.\n" | ||||
|                     f"Currently attempting to build on arch version {arch_version} and os {os}.\n" | ||||
|                     "Please modify the matrix generation to exclude this combination." | ||||
|                 ) | ||||
|  | ||||
|             # 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install | ||||
|  | ||||
|             if ( | ||||
|                 arch_version in ["12.4", "12.1", "11.8"] | ||||
|                 and os == "linux" | ||||
| @ -396,7 +385,6 @@ def generate_wheels_matrix( | ||||
|                         "desired_cuda": translate_desired_cuda( | ||||
|                             gpu_arch_type, gpu_arch_version | ||||
|                         ), | ||||
|                         "use_split_build": "True" if use_split_build else "False", | ||||
|                         "devtoolset": ( | ||||
|                             "cxx11-abi" if arch_version == "cuda-aarch64" else "" | ||||
|                         ), | ||||
| @ -412,8 +400,7 @@ def generate_wheels_matrix( | ||||
|                         ), | ||||
|                     } | ||||
|                 ) | ||||
|                 # Special build to use on Colab. Python 3.11 for 12.1 CUDA | ||||
|                 if python_version == "3.11" and arch_version == "12.1": | ||||
|                 if arch_version != "cuda-aarch64": | ||||
|                     ret.append( | ||||
|                         { | ||||
|                             "python_version": python_version, | ||||
| @ -422,16 +409,40 @@ def generate_wheels_matrix( | ||||
|                             "desired_cuda": translate_desired_cuda( | ||||
|                                 gpu_arch_type, gpu_arch_version | ||||
|                             ), | ||||
|                             "use_split_build": "True" if use_split_build else "False", | ||||
|                             "use_split_build": "True", | ||||
|                             "devtoolset": "", | ||||
|                             "container_image": WHEEL_CONTAINER_IMAGES[arch_version], | ||||
|                             "package_type": package_type, | ||||
|                             "pytorch_extra_install_requirements": "", | ||||
|                             "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace(  # noqa: B950 | ||||
|                             "pytorch_extra_install_requirements": ( | ||||
|                                 PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]  # fmt: skip | ||||
|                                 if os != "linux-aarch64" | ||||
|                                 else "" | ||||
|                             ), | ||||
|                             "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-split".replace(  # noqa: B950 | ||||
|                                 ".", "_" | ||||
|                             ), | ||||
|                         } | ||||
|                     ) | ||||
|                     # Special build to use on Colab. Python 3.10 for 12.1 CUDA | ||||
|                     if python_version == "3.10" and arch_version == "12.1": | ||||
|                         ret.append( | ||||
|                             { | ||||
|                                 "python_version": python_version, | ||||
|                                 "gpu_arch_type": gpu_arch_type, | ||||
|                                 "gpu_arch_version": gpu_arch_version, | ||||
|                                 "desired_cuda": translate_desired_cuda( | ||||
|                                     gpu_arch_type, gpu_arch_version | ||||
|                                 ), | ||||
|                                 "use_split_build": "False", | ||||
|                                 "devtoolset": "", | ||||
|                                 "container_image": WHEEL_CONTAINER_IMAGES[arch_version], | ||||
|                                 "package_type": package_type, | ||||
|                                 "pytorch_extra_install_requirements": "", | ||||
|                                 "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace(  # noqa: B950 | ||||
|                                     ".", "_" | ||||
|                                 ), | ||||
|                             } | ||||
|                         ) | ||||
|             else: | ||||
|                 ret.append( | ||||
|                     { | ||||
| @ -441,9 +452,10 @@ def generate_wheels_matrix( | ||||
|                         "desired_cuda": translate_desired_cuda( | ||||
|                             gpu_arch_type, gpu_arch_version | ||||
|                         ), | ||||
|                         "use_split_build": "True" if use_split_build else "False", | ||||
|                         "devtoolset": ( | ||||
|                             "cxx11-abi" if arch_version == "cpu-cxx11-abi" else "" | ||||
|                             "cxx11-abi" | ||||
|                             if arch_version in ["cpu-cxx11-abi", "xpu"] | ||||
|                             else "" | ||||
|                         ), | ||||
|                         "container_image": WHEEL_CONTAINER_IMAGES[arch_version], | ||||
|                         "package_type": package_type, | ||||
| @ -452,12 +464,11 @@ def generate_wheels_matrix( | ||||
|                         ), | ||||
|                         "pytorch_extra_install_requirements": ( | ||||
|                             PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"]  # fmt: skip | ||||
|                             if os != "linux" and gpu_arch_type != "xpu" | ||||
|                             if os != "linux" | ||||
|                             else "" | ||||
|                         ), | ||||
|                     } | ||||
|                 ) | ||||
|  | ||||
|     return ret | ||||
|  | ||||
|  | ||||
|  | ||||
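All the ret.append({...}) calls above emit the same flat dict shape, which the workflow templates consume. For orientation, one hand-written entry in that shape; the desired_cuda and container_image values are assumptions for the example, not copied from the script:

    example_entry = {
        "python_version": "3.10",
        "gpu_arch_type": "cuda",
        "gpu_arch_version": "12.1",
        "desired_cuda": "cu121",          # assumed translate_desired_cuda() output
        "use_split_build": "False",
        "devtoolset": "",
        "container_image": "pytorch/manylinux-builder:cuda12.1",  # illustrative tag
        "package_type": "manywheel",
        "build_name": "manywheel-py3_10-cuda12_1",  # "." replaced with "_", per the code above
        "pytorch_extra_install_requirements": "",
    }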
.github/scripts/generate_ci_workflows.py (vendored): 35 changes
							| @ -61,7 +61,6 @@ class BinaryBuildWorkflow: | ||||
|     # Mainly for macos | ||||
|     cross_compile_arm64: bool = False | ||||
|     macos_runner: str = "macos-14-xlarge" | ||||
|     use_split_build: bool = False | ||||
|  | ||||
|     def __post_init__(self) -> None: | ||||
|         if self.abi_version: | ||||
| @ -70,9 +69,6 @@ class BinaryBuildWorkflow: | ||||
|             ) | ||||
|         else: | ||||
|             self.build_environment = f"{self.os}-binary-{self.package_type}" | ||||
|         if self.use_split_build: | ||||
|             # added to distinguish concurrency groups | ||||
|             self.build_environment += "-split" | ||||
|  | ||||
|     def generate_workflow_file(self, workflow_template: jinja2.Template) -> None: | ||||
|         output_file_path = ( | ||||
| @ -114,20 +110,6 @@ LINUX_BINARY_BUILD_WORFKLOWS = [ | ||||
|             isolated_workflow=True, | ||||
|         ), | ||||
|     ), | ||||
|     BinaryBuildWorkflow( | ||||
|         os=OperatingSystem.LINUX, | ||||
|         package_type="manywheel", | ||||
|         build_configs=generate_binary_build_matrix.generate_wheels_matrix( | ||||
|             OperatingSystem.LINUX, | ||||
|             use_split_build=True, | ||||
|             arches=["11.8", "12.1", "12.4", "cpu"], | ||||
|         ), | ||||
|         ciflow_config=CIFlowConfig( | ||||
|             labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, | ||||
|             isolated_workflow=True, | ||||
|         ), | ||||
|         use_split_build=True, | ||||
|     ), | ||||
|     BinaryBuildWorkflow( | ||||
|         os=OperatingSystem.LINUX, | ||||
|         package_type="conda", | ||||
| @ -176,25 +158,10 @@ LINUX_BINARY_SMOKE_WORKFLOWS = [ | ||||
|         build_configs=generate_binary_build_matrix.generate_wheels_matrix( | ||||
|             OperatingSystem.LINUX, | ||||
|             arches=["11.8", "12.1", "12.4"], | ||||
|             python_versions=["3.9"], | ||||
|             python_versions=["3.8"], | ||||
|         ), | ||||
|         branches="main", | ||||
|     ), | ||||
|     BinaryBuildWorkflow( | ||||
|         os=OperatingSystem.LINUX, | ||||
|         package_type="manywheel", | ||||
|         build_configs=generate_binary_build_matrix.generate_wheels_matrix( | ||||
|             OperatingSystem.LINUX, | ||||
|             arches=["11.8", "12.1", "12.4"], | ||||
|             python_versions=["3.9"], | ||||
|             use_split_build=True, | ||||
|         ), | ||||
|         ciflow_config=CIFlowConfig( | ||||
|             labels={LABEL_CIFLOW_PERIODIC}, | ||||
|         ), | ||||
|         branches="main", | ||||
|         use_split_build=True, | ||||
|     ), | ||||
|     BinaryBuildWorkflow( | ||||
|         os=OperatingSystem.LINUX, | ||||
|         package_type="libtorch", | ||||
|  | ||||
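BinaryBuildWorkflow derives build_environment in __post_init__, and the removed branch appended a -split suffix to keep concurrency groups distinct. A minimal dataclass sketch of the same pattern (the abi_version branch body is not fully visible in this hunk, so its format here is assumed):

    from dataclasses import dataclass, field

    @dataclass
    class Workflow:
        os: str
        package_type: str
        abi_version: str = ""
        build_environment: str = field(init=False)

        def __post_init__(self) -> None:
            self.build_environment = f"{self.os}-binary-{self.package_type}"
            if self.abi_version:  # assumed: the ABI tag joins the name
                self.build_environment += f"-{self.abi_version}"

    print(Workflow("linux", "manywheel").build_environment)  # linux-binary-manywheel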
.github/scripts/github_utils.py (vendored): 22 changes
							| @ -46,24 +46,16 @@ def gh_fetch_url_and_headers( | ||||
|         with urlopen(Request(url, headers=headers, data=data_, method=method)) as conn: | ||||
|             return conn.headers, reader(conn) | ||||
|     except HTTPError as err: | ||||
|         if ( | ||||
|             err.code == 403 | ||||
|             and all( | ||||
|                 key in err.headers | ||||
|                 for key in ["X-RateLimit-Limit", "X-RateLimit-Remaining"] | ||||
|             ) | ||||
|             and int(err.headers["X-RateLimit-Remaining"]) == 0 | ||||
|         if err.code == 403 and all( | ||||
|             key in err.headers for key in ["X-RateLimit-Limit", "X-RateLimit-Used"] | ||||
|         ): | ||||
|             print( | ||||
|                 f"""{url} | ||||
|                 Rate limit exceeded: | ||||
|                 f"""Rate limit exceeded: | ||||
|                 Used: {err.headers['X-RateLimit-Used']} | ||||
|                 Limit: {err.headers['X-RateLimit-Limit']} | ||||
|                 Remaining: {err.headers['X-RateLimit-Remaining']} | ||||
|                 Resets at: {err.headers['x-RateLimit-Reset']}""" | ||||
|             ) | ||||
|         else: | ||||
|             print(f"Error fetching {url} {err}") | ||||
|         raise | ||||
|  | ||||
|  | ||||
| @ -168,14 +160,6 @@ def gh_post_commit_comment( | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def gh_close_pr(org: str, repo: str, pr_num: int, dry_run: bool = False) -> None: | ||||
|     url = f"{GITHUB_API_URL}/repos/{org}/{repo}/pulls/{pr_num}" | ||||
|     if dry_run: | ||||
|         print(f"Dry run closing PR {pr_num}") | ||||
|     else: | ||||
|         gh_fetch_url(url, method="PATCH", data={"state": "closed"}) | ||||
|  | ||||
|  | ||||
| def gh_delete_comment(org: str, repo: str, comment_id: int) -> None: | ||||
|     url = f"{GITHUB_API_URL}/repos/{org}/{repo}/issues/comments/{comment_id}" | ||||
|     gh_fetch_url(url, method="DELETE") | ||||
|  | ||||
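The handler above special-cases HTTP 403 responses that carry X-RateLimit-* headers so throttling is reported distinctly from other errors. A condensed sketch of the same check, using only the header names shown in this diff:

    from typing import Optional
    from urllib.error import HTTPError

    def describe_rate_limit(err: HTTPError) -> Optional[str]:
        # Only a 403 carrying the rate-limit headers is treated as throttling.
        h = err.headers
        if err.code == 403 and "X-RateLimit-Limit" in h and "X-RateLimit-Remaining" in h:
            return (f"Rate limit exceeded: used {h['X-RateLimit-Used']} "
                    f"of {h['X-RateLimit-Limit']}, resets at {h['X-RateLimit-Reset']}")
        return None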
.github/scripts/gitutils.py (vendored): 1 change
							| @ -445,6 +445,7 @@ def retries_decorator( | ||||
|                     print( | ||||
|                         f'Attempt {idx} of {num_retries} to call {f.__name__} failed with "{e}"' | ||||
|                     ) | ||||
|                     pass | ||||
|             return cast(T, rc) | ||||
|  | ||||
|         return wrapper | ||||
|  | ||||
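retries_decorator above logs each failed attempt and falls back to a default return value once retries are exhausted (the hunk only adds a redundant pass after the log call). A minimal decorator in the same spirit, with names assumed rather than copied:

    import functools
    from typing import Any, Callable, TypeVar

    T = TypeVar("T")

    def retries(num_retries: int = 3, rc: Any = None) -> Callable:
        def decorator(f: Callable[..., T]) -> Callable[..., T]:
            @functools.wraps(f)
            def wrapper(*args: Any, **kwargs: Any) -> T:
                for idx in range(num_retries):
                    try:
                        return f(*args, **kwargs)
                    except Exception as e:
                        print(f'Attempt {idx} of {num_retries} to call {f.__name__} failed with "{e}"')
                return rc  # fallback, mirroring the cast(T, rc) return above
            return wrapper
        return decorator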
.github/scripts/lintrunner.sh (vendored): 7 changes
							| @ -17,11 +17,6 @@ if [[ -d "${CACHE_DIRECTORY}" ]]; then | ||||
|     cp -r "${CACHE_DIRECTORY}" . || true | ||||
| fi | ||||
|  | ||||
| # if lintrunner is not installed, install it | ||||
| if ! command -v lintrunner &> /dev/null; then | ||||
|     python3 -m pip install lintrunner==0.12.5 | ||||
| fi | ||||
|  | ||||
| # This has already been cached in the docker image | ||||
| lintrunner init 2> /dev/null | ||||
|  | ||||
| @ -38,7 +33,7 @@ python3 torch/utils/data/datapipes/gen_pyi.py | ||||
|  | ||||
| RC=0 | ||||
| # Run lintrunner on all files | ||||
| if ! lintrunner --force-color --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then | ||||
| if ! lintrunner --force-color --all-files --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then | ||||
|     echo "" | ||||
|     echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner -m origin/main\`. (If you don't get the same results, run \'lintrunner init\' to update your local linter)\e[0m" | ||||
|     echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m" | ||||
|  | ||||
.github/scripts/runner_determinator.py (vendored): 323 changes
							| @ -1,96 +1,23 @@ | ||||
| # flake8: noqa: G004 | ||||
|  | ||||
| """ | ||||
| This runner determinator is used to determine which set of runners to run a | ||||
| GitHub job on. It uses the first comment of a GitHub issue (by default | ||||
| https://github.com/pytorch/test-infra/issues/5132) to define the configuration | ||||
| of which runners should be used to run which job. | ||||
|  | ||||
| The configuration has two parts, the settings and a list of opted-in users, | ||||
| separated by a line containing "---".  If the line is not present, the | ||||
| settings are considered to be empty with only the second part, the user | ||||
| list, defined. | ||||
|  | ||||
| The first part is a YAML block that defines the rollout settings. This can be | ||||
| used to define any settings that are needed to determine which runners to use. | ||||
| Its fields are defined by the RolloutSettings class below. | ||||
|  | ||||
| The second part is a list of users who are explicitly opted in to the LF fleet. | ||||
| The user list is also a comma separated list of additional features or | ||||
| experiments which the user could be opted in to. | ||||
|  | ||||
| The user list has the following rules: | ||||
|  | ||||
| - Users are GitHub usernames, which must start with the @ prefix | ||||
| - Each user is also a comma-separated list of features/experiments to enable | ||||
| - A "#" prefix opts the user out of all experiments | ||||
|  | ||||
| Example config: | ||||
|     # A list of experiments that can be opted into. | ||||
|     # This defines the behavior they'll induce when opted into. | ||||
|     # Expected syntax is: | ||||
|     #   [experiment_name]: # Name of the experiment. Also used for the label prefix. | ||||
|     #      rollout_perc: [int] # % of workflows to run with this experiment when users are not opted in. | ||||
|  | ||||
|     experiments: | ||||
|       lf: | ||||
|         rollout_percent: 25 | ||||
|  | ||||
|     --- | ||||
|  | ||||
|     # Opt-ins: | ||||
|     # Users can opt into the LF fleet by adding their GitHub username to this list | ||||
|     # and specifying experiments to enable in a comma-separated list. | ||||
|     # Experiments should be from the above list. | ||||
|  | ||||
|     @User1,lf,split_build | ||||
|     @User2,lf | ||||
|     @User3,split_build | ||||
| """ | ||||
|  | ||||
| import logging | ||||
| import os | ||||
| import random | ||||
| from argparse import ArgumentParser | ||||
| from logging import LogRecord | ||||
| from typing import Any, Dict, Iterable, List, NamedTuple, Tuple | ||||
| from typing import Any, Iterable | ||||
|  | ||||
| import yaml | ||||
| from github import Auth, Github | ||||
| from github.Issue import Issue | ||||
|  | ||||
|  | ||||
| DEFAULT_LABEL_PREFIX = ""  # use meta runners | ||||
| WORKFLOW_LABEL_META = ""  # use meta runners | ||||
| WORKFLOW_LABEL_LF = "lf."  # use runners from the linux foundation | ||||
| WORKFLOW_LABEL_LF_CANARY = "lf.c."  # use canary runners from the linux foundation | ||||
|  | ||||
| GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "") | ||||
| GH_OUTPUT_KEY_AMI = "runner-ami" | ||||
| GH_OUTPUT_KEY_LABEL_TYPE = "label-type" | ||||
|  | ||||
|  | ||||
| SETTING_EXPERIMENTS = "experiments" | ||||
|  | ||||
| LF_FLEET_EXPERIMENT = "lf" | ||||
| CANARY_FLEET_SUFFIX = ".c" | ||||
|  | ||||
|  | ||||
| class Experiment(NamedTuple): | ||||
|     rollout_perc: float = ( | ||||
|         0  # Percentage of workflows to experiment on when user is not opted-in. | ||||
|     ) | ||||
|  | ||||
|     # Add more fields as needed | ||||
|  | ||||
|  | ||||
| class Settings(NamedTuple): | ||||
|     """ | ||||
|     Settings for the experiments that can be opted into. | ||||
|     """ | ||||
|  | ||||
|     experiments: Dict[str, Experiment] = {} | ||||
|  | ||||
|  | ||||
| class ColorFormatter(logging.Formatter): | ||||
|     """Color codes the log messages based on the log level""" | ||||
|  | ||||
| @ -182,14 +109,11 @@ def get_issue(gh: Github, repo: str, issue_num: int) -> Issue: | ||||
|  | ||||
|  | ||||
| def get_potential_pr_author( | ||||
|     github_token: str, repo: str, username: str, ref_type: str, ref_name: str | ||||
|     gh: Github, repo: str, username: str, ref_type: str, ref_name: str | ||||
| ) -> str: | ||||
|     # If the trigger was a new tag added by a bot, this is a ciflow case | ||||
|     # Fetch the actual username from the original PR. The PR number is | ||||
|     # embedded in the tag name: ciflow/<name>/<pr-number> | ||||
|  | ||||
|     gh = get_gh_client(github_token) | ||||
|  | ||||
|     if username == "pytorch-bot[bot]" and ref_type == "tag": | ||||
|         split_tag = ref_name.split("/") | ||||
|         if ( | ||||
| @ -211,233 +135,80 @@ def get_potential_pr_author( | ||||
|  | ||||
|  | ||||
| def is_exception_branch(branch: str) -> bool: | ||||
|     """ | ||||
|     Branches that get opted out of all experiments and should always use Meta runners | ||||
|     """ | ||||
|     return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"} | ||||
|  | ||||
|  | ||||
| def load_yaml(yaml_text: str) -> Any: | ||||
| def get_workflow_type(issue: Issue, workflow_requestors: Iterable[str]) -> str: | ||||
|     try: | ||||
|         data = yaml.safe_load(yaml_text) | ||||
|         return data | ||||
|     except yaml.YAMLError as exc: | ||||
|         log.exception("Error loading YAML") | ||||
|         raise | ||||
|         first_comment = issue.get_comments()[0].body.strip("\n\t ") | ||||
|  | ||||
|  | ||||
| def extract_settings_user_opt_in_from_text(rollout_state: str) -> Tuple[str, str]: | ||||
|     """ | ||||
|     Extracts the text with settings, if any, and the opted in users from the rollout state. | ||||
|  | ||||
|     If the issue body contains "---" then the text above that is the settings | ||||
|     and the text below is the list of opted in users. | ||||
|  | ||||
|     If it doesn't contain "---" then the settings are empty and the rest is the users. | ||||
|     """ | ||||
|     rollout_state_parts = rollout_state.split("---") | ||||
|     if len(rollout_state_parts) >= 2: | ||||
|         return rollout_state_parts[0], rollout_state_parts[1] | ||||
|     else: | ||||
|         return "", rollout_state | ||||
|  | ||||
|  | ||||
| class UserOptins(Dict[str, List[str]]): | ||||
|     """ | ||||
|     Dictionary of users with a list of features they have opted into | ||||
|     """ | ||||
|  | ||||
|  | ||||
| def parse_user_opt_in_from_text(user_optin_text: str) -> UserOptins: | ||||
|     """ | ||||
|     Parse the user opt-in text into a key value pair of username and the list of features they have opted into | ||||
|  | ||||
|     Users are GitHub usernames with the @ prefix. Each user is also a comma-separated list of features/experiments to enable. | ||||
|         - Example line: "@User1,lf,split_build" | ||||
|         - A "#" prefix indicates the user is opted out of all experiments | ||||
|  | ||||
|  | ||||
|     """ | ||||
|     optins = UserOptins() | ||||
|     for user in user_optin_text.split("\n"): | ||||
|         user = user.strip("\r\n\t -") | ||||
|         if not user or not user.startswith("@"): | ||||
|             # Not a valid user. Skip | ||||
|             continue | ||||
|  | ||||
|         if user: | ||||
|             usr_name = user.split(",")[0].strip("@") | ||||
|             optins[usr_name] = [exp.strip(" ") for exp in user.split(",")[1:]] | ||||
|  | ||||
|     return optins | ||||
|  | ||||
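As a sketch, here is how a hypothetical opt-in block would be parsed; note that a leading `#` survives the strip, so commented-out users fail the `@` check and are skipped:

```python
# Hypothetical opt-in text; usernames are illustrative.
text = """
@User1,lf
#@User2,lf,otherExp
- @User3,split_build
"""
optins = parse_user_opt_in_from_text(text)
assert optins == {"User1": ["lf"], "User3": ["split_build"]}
```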
|  | ||||
| def parse_settings_from_text(settings_text: str) -> Settings: | ||||
|     """ | ||||
|     Parse the experiments from the issue body into a list of ExperimentSettings | ||||
|     """ | ||||
|     try: | ||||
|         if settings_text: | ||||
|             # Escape the backtick as well so that we can have the settings in a code block on the GH issue | ||||
|             # for easy reading | ||||
|             # Note: Using ascii for the backtick so that the cat step in _runner-determinator.yml doesn't choke on | ||||
|             #       the backtick character in shell commands. | ||||
|             backtick = chr(96)  # backtick character | ||||
|             settings_text = settings_text.strip(f"\r\n\t{backtick} ") | ||||
|             settings = load_yaml(settings_text) | ||||
|  | ||||
|             # For now we just load experiments. We can expand this if/when we add more settings | ||||
|             experiments = {} | ||||
|  | ||||
|             for exp_name, exp_settings in settings.get(SETTING_EXPERIMENTS).items(): | ||||
|                 valid_settings = {} | ||||
|                 for setting in exp_settings: | ||||
|                     if setting not in Experiment._fields: | ||||
|                         log.warning( | ||||
|                             f"Unexpected setting in experiment: {setting} = {exp_settings[setting]}" | ||||
|                         ) | ||||
|                     else: | ||||
|                         valid_settings[setting] = exp_settings[setting] | ||||
|  | ||||
|                 experiments[exp_name] = Experiment(**valid_settings) | ||||
|             return Settings(experiments) | ||||
|  | ||||
|     except Exception: | ||||
|         log.exception("Failed to parse settings") | ||||
|  | ||||
|     return Settings() | ||||
|  | ||||
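For reference, a hypothetical settings block of the shape this parser expects (the `experiments:` key and `rollout_perc` field match the unit tests deleted later in this diff):

```python
settings_text = """
experiments:
    lf:
        rollout_perc: 25
    otherExp:
        rollout_perc: 0
"""
settings = parse_settings_from_text(settings_text)
assert settings.experiments["lf"] == Experiment(rollout_perc=25)
assert settings.experiments["otherExp"] == Experiment(rollout_perc=0)
```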
|  | ||||
| def parse_settings(rollout_state: str) -> Settings: | ||||
|     """ | ||||
|     Parse settings, if any, from the rollout state. | ||||
|  | ||||
|     If the issue body contains "---" then the text above that is the settings | ||||
|     and the text below is the list of opted in users. | ||||
|  | ||||
|     If it doesn't contain "---" then the settings are empty and the default values are used. | ||||
|     """ | ||||
|     settings_text, _ = extract_settings_user_opt_in_from_text(rollout_state) | ||||
|     return parse_settings_from_text(settings_text) | ||||
|  | ||||
|  | ||||
| def parse_users(rollout_state: str) -> UserOptins: | ||||
|     """ | ||||
|     Parse users from the rollout state. | ||||
|  | ||||
|     """ | ||||
|     _, users_text = extract_settings_user_opt_in_from_text(rollout_state) | ||||
|     return parse_user_opt_in_from_text(users_text) | ||||
|  | ||||
|  | ||||
| def is_user_opted_in(user: str, user_optins: UserOptins, experiment_name: str) -> bool: | ||||
|     """ | ||||
|     Check if a user is opted into an experiment | ||||
|     """ | ||||
|     return experiment_name in user_optins.get(user, []) | ||||
|  | ||||
|  | ||||
| def get_runner_prefix( | ||||
|     rollout_state: str, workflow_requestors: Iterable[str], is_canary: bool = False | ||||
| ) -> str: | ||||
|     settings = parse_settings(rollout_state) | ||||
|     user_optins = parse_users(rollout_state) | ||||
|  | ||||
|     fleet_prefix = "" | ||||
|     prefixes = [] | ||||
|     for experiment_name, experiment_settings in settings.experiments.items(): | ||||
|         enabled = False | ||||
|  | ||||
|         # Is any workflow_requestor opted in to this experiment? | ||||
|         opted_in_users = [ | ||||
|             requestor | ||||
|             for requestor in workflow_requestors | ||||
|             if is_user_opted_in(requestor, user_optins, experiment_name) | ||||
|         ] | ||||
|  | ||||
|         if opted_in_users: | ||||
|             log.info( | ||||
|                 f"{', '.join(opted_in_users)} have opted into experiment {experiment_name}." | ||||
|             ) | ||||
|             enabled = True | ||||
|         elif experiment_settings.rollout_perc: | ||||
|             # If no user is opted in, then we randomly enable the experiment based on the rollout percentage | ||||
|             if random.uniform(0, 100) <= experiment_settings.rollout_perc: | ||||
|         if first_comment[0] == "!": | ||||
|             log.info("LF Workflows are disabled for everyone. Using meta runners.") | ||||
|             return WORKFLOW_LABEL_META | ||||
|         elif first_comment[0] == "*": | ||||
|             log.info("LF Workflows are enabled for everyone. Using LF runners.") | ||||
|             return WORKFLOW_LABEL_LF | ||||
|         else: | ||||
|             all_opted_in_users = { | ||||
|                 usr_raw.strip("\n\t@ ") for usr_raw in first_comment.split() | ||||
|             } | ||||
|             opted_in_requestors = { | ||||
|                 usr for usr in workflow_requestors if usr in all_opted_in_users | ||||
|             } | ||||
|             if opted_in_requestors: | ||||
|                 log.info( | ||||
|                     f"Based on rollout percentage of {experiment_settings.rollout_perc}%, enabling experiment {experiment_name}." | ||||
|                     f"LF Workflows are enabled for {', '.join(opted_in_requestors)}. Using LF runners." | ||||
|                 ) | ||||
|                 enabled = True | ||||
|  | ||||
|         if enabled: | ||||
|             label = experiment_name | ||||
|             if experiment_name == LF_FLEET_EXPERIMENT: | ||||
|                 # We give some special treatment to the "lf" experiment since it determines the fleet we use | ||||
|                 #  - If it's enabled, then we always list its prefix first | ||||
|                 #  - If we're in the canary branch, then we append ".c" to the lf prefix | ||||
|                 if is_canary: | ||||
|                     label += CANARY_FLEET_SUFFIX | ||||
|                 fleet_prefix = label | ||||
|                 return WORKFLOW_LABEL_LF | ||||
|             else: | ||||
|                 prefixes.append(label) | ||||
|                 log.info( | ||||
|                     f"LF Workflows are disabled for {', '.join(workflow_requestors)}. Using meta runners." | ||||
|                 ) | ||||
|                 return WORKFLOW_LABEL_META | ||||
|  | ||||
|     if len(prefixes) > 1: | ||||
|     except Exception as e: | ||||
|         log.error( | ||||
|             f"Only a fleet and one other experiment can be enabled for a job at any time. Enabling {prefixes[0]} and ignoring the rest, which are {', '.join(prefixes[1:])}" | ||||
|             f"Failed to determine workflow type. Falling back to meta runners. Exception: {e}" | ||||
|         ) | ||||
|         prefixes = prefixes[:1] | ||||
|  | ||||
|     # Fleet always comes first | ||||
|     if fleet_prefix: | ||||
|         prefixes.insert(0, fleet_prefix) | ||||
|  | ||||
|     return ".".join(prefixes) + "." if prefixes else "" | ||||
|  | ||||
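Putting it together, a hypothetical rollout state and the prefixes it yields (mirroring the unit tests that are deleted later in this diff):

```python
state = """experiments:
    lf:
        rollout_perc: 0
    otherExp:
        rollout_perc: 0
---
@User1,lf
@User2,lf,otherExp
"""
# User2 opted into both: the "lf" fleet prefix always comes first.
assert get_runner_prefix(state, ["User2"]) == "lf.otherExp."
# User1 opted into "lf" only.
assert get_runner_prefix(state, ["User1"]) == "lf."
```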
|  | ||||
| def get_rollout_state_from_issue(github_token: str, repo: str, issue_num: int) -> str: | ||||
|     """ | ||||
|     Gets the first comment of the issue, which contains the desired rollout state. | ||||
|  | ||||
|     The default issue we use - https://github.com/pytorch/test-infra/issues/5132 | ||||
|     """ | ||||
|     gh = get_gh_client(github_token) | ||||
|     issue = get_issue(gh, repo, issue_num) | ||||
|     return str(issue.get_comments()[0].body.strip("\n\t ")) | ||||
|         return WORKFLOW_LABEL_META | ||||
|  | ||||
|  | ||||
| def main() -> None: | ||||
|     args = parse_args() | ||||
|  | ||||
|     if args.github_ref_type == "branch" and is_exception_branch(args.github_branch): | ||||
|         log.info( | ||||
|             f"Exception branch: '{args.github_branch}', using Meta runners and no experiments." | ||||
|         ) | ||||
|         runner_label_prefix = DEFAULT_LABEL_PREFIX | ||||
|         log.info(f"Exception branch: '{args.github_branch}', using meta runners") | ||||
|         label_type = WORKFLOW_LABEL_META | ||||
|     else: | ||||
|         try: | ||||
|             rollout_state = get_rollout_state_from_issue( | ||||
|                 args.github_token, args.github_issue_repo, args.github_issue | ||||
|             ) | ||||
|  | ||||
|             gh = get_gh_client(args.github_token) | ||||
|             # The default issue we use - https://github.com/pytorch/test-infra/issues/5132 | ||||
|             issue = get_issue(gh, args.github_issue_repo, args.github_issue) | ||||
|             username = get_potential_pr_author( | ||||
|                 args.github_token, | ||||
|                 gh, | ||||
|                 args.github_repo, | ||||
|                 args.github_actor, | ||||
|                 args.github_ref_type, | ||||
|                 args.github_branch, | ||||
|             ) | ||||
|  | ||||
|             is_canary = args.github_repo == "pytorch/pytorch-canary" | ||||
|  | ||||
|             runner_label_prefix = get_runner_prefix( | ||||
|                 rollout_state, (args.github_issue_owner, username), is_canary | ||||
|             label_type = get_workflow_type( | ||||
|                 issue, | ||||
|                 ( | ||||
|                     args.github_issue_owner, | ||||
|                     username, | ||||
|                 ), | ||||
|             ) | ||||
|  | ||||
|         except Exception as e: | ||||
|             log.error( | ||||
|                 f"Failed to get issue. Defaulting to Meta runners and no experiments. Exception: {e}" | ||||
|                 f"Failed to get issue. Falling back to meta runners. Exception: {e}" | ||||
|             ) | ||||
|             label_type = WORKFLOW_LABEL_META | ||||
|  | ||||
|     set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, runner_label_prefix) | ||||
|     # For Canary builds use canary runners | ||||
|     if args.github_repo == "pytorch/pytorch-canary" and label_type == WORKFLOW_LABEL_LF: | ||||
|         label_type = WORKFLOW_LABEL_LF_CANARY | ||||
|  | ||||
|     set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, label_type) | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
							
								
								
									
.github/scripts/s390x-ci/README.md (vendored, 39 lines changed)
							| @ -3,7 +3,7 @@ | ||||
| ## Install prerequisites. | ||||
|  | ||||
| ``` | ||||
| $ sudo dnf install podman podman-docker jq | ||||
| $ sudo dnf install docker | ||||
| ``` | ||||
|  | ||||
| ## Add services. | ||||
| @ -27,48 +27,23 @@ $ sudo systemctl enable --now qemu-user-static | ||||
|  | ||||
| ## Rebuild the image | ||||
|  | ||||
| First build the s390x builder image `docker.io/pytorch/manylinuxs390x-builder`, | ||||
| using the following commands: | ||||
|  | ||||
| ``` | ||||
| $ cd ~ | ||||
| $ git clone https://github.com/pytorch/pytorch | ||||
| $ cd pytorch | ||||
| $ git submodule update --init --recursive | ||||
| $ GPU_ARCH_TYPE=cpu-s390x "$(pwd)/.ci/docker/manywheel/build.sh" manylinuxs390x-builder | ||||
| $ docker image tag localhost/pytorch/manylinuxs390x-builder docker.io/pytorch/manylinuxs390x-builder:cpu-s390x | ||||
| $ docker image save -o ~/manywheel-s390x.tar docker.io/pytorch/manylinuxs390x-builder:cpu-s390x | ||||
| ``` | ||||
|  | ||||
| Next step is to build `actions-runner` image using: | ||||
| In order to build or update the `iiilinuxibmcom/actions-runner` image, e.g. to get the | ||||
| latest OS security fixes, use the following commands: | ||||
|  | ||||
| ``` | ||||
| $ cd self-hosted-builder | ||||
| $ sudo docker build \ | ||||
|       --build-arg repo=<owner>/<name> \ | ||||
|       --build-arg token=<***> \ | ||||
|       --pull \ | ||||
|       -f actions-runner.Dockerfile \ | ||||
|       -t iiilinuxibmcom/actions-runner.<name> \ | ||||
|       -t iiilinuxibmcom/actions-runner \ | ||||
|       . | ||||
| ``` | ||||
|  | ||||
| If there are failures, ensure that selinux doesn't prevent it from working. | ||||
| If it fails, ensure that selinux doesn't prevent it from working. | ||||
| In the worst case, SELinux can be disabled with `setenforce 0`. | ||||
|  | ||||
| Now prepare all necessary files for runner registration: | ||||
|  | ||||
| ``` | ||||
| $ sudo mkdir -p /etc/actions-runner/<name> | ||||
| $ sudo chmod 700 /etc/actions-runner/<name> | ||||
| $ sudo /bin/cp <github_app_private_key_file> /etc/actions-runner/<name>/key_private.pem | ||||
| $ sudo echo <github_app_id> | sudo tee /etc/actions-runner/<name>/appid.env | ||||
| $ sudo echo <github_app_install_id> | sudo tee /etc/actions-runner/<name>/installid.env | ||||
| $ sudo echo NAME=<worker_name> | sudo tee    /etc/actions-runner/<name>/env | ||||
| $ sudo echo ORG=<github_org>   | sudo tee -a /etc/actions-runner/<name>/env | ||||
| $ cd self-hosted-builder | ||||
| $ sudo /bin/cp helpers/*.sh /usr/local/bin/ | ||||
| $ sudo chmod 755 /usr/local/bin/app_token.sh /usr/local/bin/gh_token_generator.sh | ||||
| ``` | ||||
|  | ||||
| ## Autostart the runner. | ||||
|  | ||||
| ``` | ||||
|  | ||||
| @ -1,12 +1,12 @@ | ||||
| # Self-Hosted IBM Z Github Actions Runner. | ||||
|  | ||||
| # Temporary image: amd64 dependencies. | ||||
| FROM docker.io/amd64/ubuntu:23.10 as ld-prefix | ||||
| FROM docker.io/amd64/ubuntu:22.04 as ld-prefix | ||||
| ENV DEBIAN_FRONTEND=noninteractive | ||||
| RUN apt-get update && apt-get -y install ca-certificates libicu72 libssl3 | ||||
| RUN apt-get update && apt-get -y install ca-certificates libicu70 libssl3 | ||||
|  | ||||
| # Main image. | ||||
| FROM docker.io/s390x/ubuntu:23.10 | ||||
| FROM docker.io/s390x/ubuntu:22.04 | ||||
|  | ||||
| # Packages for pytorch building and testing. | ||||
| ENV DEBIAN_FRONTEND=noninteractive | ||||
| @ -16,7 +16,6 @@ RUN apt-get update && apt-get -y install \ | ||||
|         gcc \ | ||||
|         git \ | ||||
|         jq \ | ||||
|         zip \ | ||||
|         libxml2-dev \ | ||||
|         libxslt-dev \ | ||||
|         ninja-build \ | ||||
| @ -44,28 +43,24 @@ COPY fs/ / | ||||
|  | ||||
| RUN chmod +x /usr/bin/actions-runner /usr/bin/entrypoint | ||||
|  | ||||
| # install podman | ||||
| RUN apt -y install podman podman-docker | ||||
|  | ||||
| # amd64 Github Actions Runner. | ||||
| RUN useradd -m actions-runner | ||||
| USER actions-runner | ||||
| WORKDIR /home/actions-runner | ||||
| RUN curl -L https://github.com/actions/runner/releases/download/v2.309.0/actions-runner-linux-x64-2.309.0.tar.gz | tar -xz | ||||
|  | ||||
| # set up python virtual environment which is later used by runner. | ||||
| # build workflows use "python -m pip install ...", | ||||
| # and it doesn't work for non-root user | ||||
| RUN virtualenv --system-site-packages venv | ||||
| # repository | ||||
| ARG repo | ||||
|  | ||||
| # copy prebuilt manywheel docker image for builds and tests | ||||
| # build command is: | ||||
| # GPU_ARCH_TYPE=cpu-s390x "$(pwd)/manywheel/build_docker.sh" | ||||
| # and save command is: | ||||
| # docker image save -o manywheel-s390x.tar pytorch/manylinuxs390x-builder:cpu-s390x | ||||
| # | ||||
| COPY --chown=actions-runner:actions-runner manywheel-s390x.tar /home/actions-runner/manywheel-s390x.tar | ||||
| # repository token | ||||
| ARG token | ||||
|  | ||||
| RUN curl -L https://github.com/actions/runner/releases/download/v2.317.0/actions-runner-linux-x64-2.317.0.tar.gz | tar -xz | ||||
| RUN ./config.sh \ | ||||
|         --unattended \ | ||||
|         --url "https://github.com/${repo}" \ | ||||
|         --token "${token}" \ | ||||
|         --no-default-labels \ | ||||
|         --labels self-hosted,linux.s390x | ||||
|  | ||||
| ENTRYPOINT ["/usr/bin/entrypoint"] | ||||
| CMD ["/usr/bin/actions-runner"] | ||||
|  | ||||
| @ -8,16 +8,12 @@ StartLimitIntervalSec=0 | ||||
| Type=simple | ||||
| Restart=always | ||||
| ExecStartPre=-/usr/bin/docker rm --force actions-runner.%i | ||||
| ExecStartPre=-/usr/local/bin/gh_token_generator.sh /etc/actions-runner/%i/appid.env /etc/actions-runner/%i/installid.env /etc/actions-runner/%i/key_private.pem /etc/actions-runner/%i/ghtoken.env | ||||
| ExecStart=/usr/bin/docker run \ | ||||
|               --env-file=/etc/actions-runner/%i/env \ | ||||
|               --env-file=/etc/actions-runner/%i/ghtoken.env \ | ||||
|               --init \ | ||||
|               --interactive \ | ||||
|               --name=actions-runner.%i \ | ||||
|               --rm \ | ||||
|               --privileged \ | ||||
|               iiilinuxibmcom/actions-runner.%i | ||||
|               iiilinuxibmcom/actions-runner | ||||
| ExecStop=/bin/sh -c "docker exec actions-runner.%i kill -INT -- -1" | ||||
| ExecStop=/bin/sh -c "docker wait actions-runner.%i" | ||||
| ExecStop=/bin/sh -c "docker rm actions-runner.%i" | ||||
|  | ||||
| @ -2,45 +2,5 @@ | ||||
|  | ||||
| set -e -u | ||||
|  | ||||
| # first import docker image | ||||
| if [ -f ./manywheel-s390x.tar ] ; then | ||||
|         docker image load --input manywheel-s390x.tar | ||||
|         docker image tag docker.io/pytorch/manylinuxs390x-builder:cpu-s390x docker.io/pytorch/manylinuxs390x-builder:cpu-s390x-main | ||||
|         rm -f manywheel-s390x.tar | ||||
| fi | ||||
|  | ||||
| token_file=registration-token.json | ||||
|  | ||||
| # Generate registration token | ||||
| curl \ | ||||
|         -X POST \ | ||||
|         -H "Accept: application/vnd.github.v3+json" \ | ||||
|         -H "Authorization: Bearer ${ACCESS_TOKEN}" \ | ||||
|         "https://api.github.com/orgs/${ORG}/actions/runners/registration-token" \ | ||||
|         -o "$token_file" | ||||
|  | ||||
| unset ACCESS_TOKEN | ||||
|  | ||||
| # register runner as ephemeral runner | ||||
| # it does one job, stops and unregisters | ||||
| registration_token=$(jq --raw-output .token "$token_file") | ||||
|  | ||||
| ./config.sh \ | ||||
|         --unattended \ | ||||
|         --ephemeral \ | ||||
|         --url "https://github.com/${ORG}" \ | ||||
|         --token "${registration_token}" \ | ||||
|         --name "${NAME}" \ | ||||
|         --no-default-labels \ | ||||
|         --labels self-hosted,linux.s390x | ||||
|  | ||||
| unset registration_token | ||||
| rm -f "$token_file" | ||||
|  | ||||
| # enter into python virtual environment. | ||||
| # build workflows use "python -m pip install ...", | ||||
| # and it doesn't work for non-root user | ||||
| source venv/bin/activate | ||||
|  | ||||
| # Run one job. | ||||
| ./run.sh | ||||
| ./run.sh --once | ||||
|  | ||||
| @ -1,84 +0,0 @@ | ||||
| #!/usr/bin/env bash | ||||
| # | ||||
| # Request an ACCESS_TOKEN to be used by a GitHub APP | ||||
| # Environment variable that need to be set up: | ||||
| # * APP_ID, the GitHub's app ID | ||||
| # * INSTALL_ID, the Github's app's installation ID | ||||
| # * APP_PRIVATE_KEY, the content of GitHub app's private key in PEM format. | ||||
| # | ||||
| # https://github.com/orgs/community/discussions/24743#discussioncomment-3245300 | ||||
| # | ||||
|  | ||||
| set -o pipefail | ||||
|  | ||||
| _GITHUB_HOST=${GITHUB_HOST:="github.com"} | ||||
|  | ||||
| # If URL is not github.com then use the enterprise api endpoint | ||||
| if [[ ${GITHUB_HOST} = "github.com" ]]; then | ||||
|   URI="https://api.${_GITHUB_HOST}" | ||||
| else | ||||
|   URI="https://${_GITHUB_HOST}/api/v3" | ||||
| fi | ||||
|  | ||||
| API_VERSION=v3 | ||||
| API_HEADER="Accept: application/vnd.github.${API_VERSION}+json" | ||||
| CONTENT_LENGTH_HEADER="Content-Length: 0" | ||||
| APP_INSTALLATIONS_URI="${URI}/app/installations" | ||||
|  | ||||
|  | ||||
| # JWT parameters based off | ||||
| # https://docs.github.com/en/developers/apps/building-github-apps/authenticating-with-github-apps#authenticating-as-a-github-app | ||||
| # | ||||
| # JWT token issuance and expiration parameters | ||||
| JWT_IAT_DRIFT=60 | ||||
| JWT_EXP_DELTA=600 | ||||
|  | ||||
| JWT_JOSE_HEADER='{ | ||||
|     "alg": "RS256", | ||||
|     "typ": "JWT" | ||||
| }' | ||||
|  | ||||
|  | ||||
| build_jwt_payload() { | ||||
|     now=$(date +%s) | ||||
|     iat=$((now - JWT_IAT_DRIFT)) | ||||
|     jq -c \ | ||||
|         --arg iat_str "${iat}" \ | ||||
|         --arg exp_delta_str "${JWT_EXP_DELTA}" \ | ||||
|         --arg app_id_str "${APP_ID}" \ | ||||
|     ' | ||||
|         ($iat_str | tonumber) as $iat | ||||
|         | ($exp_delta_str | tonumber) as $exp_delta | ||||
|         | ($app_id_str | tonumber) as $app_id | ||||
|         | .iat = $iat | ||||
|         | .exp = ($iat + $exp_delta) | ||||
|         | .iss = $app_id | ||||
|     ' <<< "{}" | tr -d '\n' | ||||
| } | ||||
|  | ||||
| base64url() { | ||||
|     base64 | tr '+/' '-_' | tr -d '=\n' | ||||
| } | ||||
|  | ||||
| rs256_sign() { | ||||
|     openssl dgst -binary -sha256 -sign <(echo "$1") | ||||
| } | ||||
|  | ||||
| request_access_token() { | ||||
|     jwt_payload=$(build_jwt_payload) | ||||
|     encoded_jwt_parts=$(base64url <<<"${JWT_JOSE_HEADER}").$(base64url <<<"${jwt_payload}") | ||||
|     encoded_mac=$(echo -n "$encoded_jwt_parts" | rs256_sign "${APP_PRIVATE_KEY}" | base64url) | ||||
|     generated_jwt="${encoded_jwt_parts}.${encoded_mac}" | ||||
|  | ||||
|     auth_header="Authorization: Bearer ${generated_jwt}" | ||||
|  | ||||
|     app_installations_response=$(curl -sX POST \ | ||||
|         -H "${auth_header}" \ | ||||
|         -H "${API_HEADER}" \ | ||||
|         --header "X-GitHub-Api-Version: 2022-11-28" \ | ||||
|         --url "https://api.github.com/app/installations/${INSTALL_ID}/access_tokens" \ | ||||
|     ) | ||||
|     echo "$app_installations_response" | jq --raw-output '.token' | ||||
| } | ||||
|  | ||||
| request_access_token | ||||
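For readers more comfortable in Python, a minimal sketch of the same App-token exchange this script performs, assuming the `PyJWT` and `requests` packages; the IDs and key path below are placeholders, not values from this diff:

```python
import time

import jwt  # PyJWT
import requests

APP_ID = 12345          # placeholder GitHub App ID
INSTALL_ID = 67890      # placeholder installation ID
with open("key_private.pem") as f:
    private_key = f.read()

now = int(time.time())
# Same iat drift (60s) and expiry delta (600s) as the shell script above.
claims = {"iat": now - 60, "exp": now + 600, "iss": APP_ID}
app_jwt = jwt.encode(claims, private_key, algorithm="RS256")

resp = requests.post(
    f"https://api.github.com/app/installations/{INSTALL_ID}/access_tokens",
    headers={
        "Authorization": f"Bearer {app_jwt}",
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    },
)
print(resp.json()["token"])
```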
| @ -1,10 +0,0 @@ | ||||
| #!/usr/bin/env bash | ||||
|  | ||||
| SCRIPT_DIR=$(dirname "$0") | ||||
| APP_ID=$1 | ||||
| INSTALL_ID=$2 | ||||
| APP_PRIVATE_KEY=$3 | ||||
| DST_FILE="$4" | ||||
|  | ||||
| ACCESS_TOKEN="$(APP_ID="$(<"${APP_ID}")" INSTALL_ID="$(<"${INSTALL_ID}")" APP_PRIVATE_KEY="$(<"${APP_PRIVATE_KEY}")" "${SCRIPT_DIR}/app_token.sh")" | ||||
| echo "ACCESS_TOKEN=${ACCESS_TOKEN}" > "${DST_FILE}" | ||||
							
								
								
									
.github/scripts/sync_distributed_folder_prototype.sh (vendored, executable file, 35 lines changed)
							| @ -0,0 +1,35 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| set -eoux pipefail | ||||
|  | ||||
| SYNC_BRANCH=pytorch-stable-prototype | ||||
|  | ||||
| git config user.email "fake@example.com" | ||||
| git config user.name  "PyTorch Stable Bot" | ||||
|  | ||||
| git fetch origin main | ||||
| git fetch origin "$SYNC_BRANCH" | ||||
| git checkout "$SYNC_BRANCH" | ||||
|  | ||||
| # Using a hardcoded SHA here is a massive speedup as we can skip the entire history of the pytorch GitHub repo. | ||||
| # This specific SHA was chosen as it was before the "branch point" of the stable branch | ||||
| for SHA in $(git log ba3b05fdf37ddbc3c301294d6a560a816335e717..origin/main --pretty="%h" -- torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed) | ||||
| do | ||||
|     # `git merge-base --is-ancestor` exits with code 0 if the given SHA is an ancestor, and non-0 otherwise | ||||
|     if git merge-base --is-ancestor $SHA HEAD || [[ $(git log --grep="(cherry picked from commit $SHA") ]] | ||||
|     then | ||||
|         echo "Skipping $SHA" | ||||
|         continue | ||||
|     fi | ||||
|     echo "Copying $SHA" | ||||
|     git cherry-pick -x "$SHA" -X theirs | ||||
|     git reset --soft HEAD~1 | ||||
|     git add torch/distributed torch/csrc/distributed test/distributed test/cpp/c10d benchmarks/distributed | ||||
|     git checkout . | ||||
|     git commit --reuse-message=HEAD@{1} | ||||
|     git clean -f | ||||
| done | ||||
|  | ||||
| if [[ "${WITH_PUSH}" == true ]]; then | ||||
|   git push | ||||
| fi | ||||
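The skip condition inside the loop above can be sketched in Python for clarity (a hypothetical helper using plain `subprocess`, not part of this diff):

```python
import subprocess

def already_synced(sha: str) -> bool:
    # Exit code 0 means the SHA is already an ancestor of HEAD.
    is_ancestor = subprocess.run(
        ["git", "merge-base", "--is-ancestor", sha, "HEAD"]
    ).returncode == 0
    # A non-empty log means it was previously cherry-picked with -x.
    picked = subprocess.run(
        ["git", "log", f"--grep=(cherry picked from commit {sha}"],
        capture_output=True,
        text=True,
    ).stdout.strip() != ""
    return is_ancestor or picked
```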
| @ -51,8 +51,6 @@ def main() -> None: | ||||
|  | ||||
|     for platform_image in platform_images:  # type: ignore[attr-defined] | ||||
|         for arch in platform_image.keys():  # type: ignore[attr-defined] | ||||
|             if arch == "cpu-s390x": | ||||
|                 continue | ||||
|             tag_image( | ||||
|                 platform_image[arch],  # type: ignore[index] | ||||
|                 default_tag, | ||||
|  | ||||
							
								
								
									
.github/scripts/test_check_labels.py (vendored, 1 line changed)
							| @ -18,7 +18,6 @@ def mock_parse_args() -> object: | ||||
|     class Object: | ||||
|         def __init__(self) -> None: | ||||
|             self.pr_num = 76123 | ||||
|             self.exit_non_zero = False | ||||
|  | ||||
|     return Object() | ||||
|  | ||||
|  | ||||
							
								
								
									
.github/scripts/test_runner_determinator.py (vendored, 237 lines changed)
							| @ -1,237 +0,0 @@ | ||||
| from unittest import main, TestCase | ||||
| from unittest.mock import Mock, patch | ||||
|  | ||||
| import runner_determinator as rd | ||||
|  | ||||
|  | ||||
| class TestRunnerDeterminatorIssueParser(TestCase): | ||||
|     def test_parse_settings(self) -> None: | ||||
|         settings_text = """ | ||||
|         experiments: | ||||
|             lf: | ||||
|                 rollout_perc: 25 | ||||
|             otherExp: | ||||
|                 rollout_perc: 0 | ||||
|         --- | ||||
|  | ||||
|         Users: | ||||
|         @User1,lf | ||||
|         @User2,lf,otherExp | ||||
|  | ||||
|         """ | ||||
|  | ||||
|         settings = rd.parse_settings(settings_text) | ||||
|  | ||||
|         self.assertTupleEqual( | ||||
|             rd.Experiment(rollout_perc=25), | ||||
|             settings.experiments["lf"], | ||||
|             "lf settings not parsed correctly", | ||||
|         ) | ||||
|         self.assertTupleEqual( | ||||
|             rd.Experiment(rollout_perc=0), | ||||
|             settings.experiments["otherExp"], | ||||
|             "otherExp settings not parsed correctly", | ||||
|         ) | ||||
|  | ||||
|     def test_parse_settings_in_code_block(self) -> None: | ||||
|         settings_text = """ | ||||
|  | ||||
|         ``` | ||||
|         experiments: | ||||
|             lf: | ||||
|                 rollout_perc: 25 | ||||
|             otherExp: | ||||
|                 rollout_perc: 0 | ||||
|  | ||||
|         ``` | ||||
|  | ||||
|         --- | ||||
|  | ||||
|         Users: | ||||
|         @User1,lf | ||||
|         @User2,lf,otherExp | ||||
|  | ||||
|         """ | ||||
|  | ||||
|         settings = rd.parse_settings(settings_text) | ||||
|  | ||||
|         self.assertTupleEqual( | ||||
|             rd.Experiment(rollout_perc=25), | ||||
|             settings.experiments["lf"], | ||||
|             "lf settings not parsed correctly", | ||||
|         ) | ||||
|         self.assertTupleEqual( | ||||
|             rd.Experiment(rollout_perc=0), | ||||
|             settings.experiments["otherExp"], | ||||
|             "otherExp settings not parsed correctly", | ||||
|         ) | ||||
|  | ||||
|     def test_parse_users(self) -> None: | ||||
|         settings_text = """ | ||||
|         experiments: | ||||
|             lf: | ||||
|                 rollout_perc: 0 | ||||
|             otherExp: | ||||
|                 rollout_perc: 0 | ||||
|         --- | ||||
|  | ||||
|         Users: | ||||
|         @User1,lf | ||||
|         @User2,lf,otherExp | ||||
|  | ||||
|         """ | ||||
|  | ||||
|         users = rd.parse_users(settings_text) | ||||
|         self.assertDictEqual( | ||||
|             {"User1": ["lf"], "User2": ["lf", "otherExp"]}, | ||||
|             users, | ||||
|             "Users not parsed correctly", | ||||
|         ) | ||||
|  | ||||
|     def test_parse_users_without_settings(self) -> None: | ||||
|         settings_text = """ | ||||
|  | ||||
|         @User1,lf | ||||
|         @User2,lf,otherExp | ||||
|  | ||||
|         """ | ||||
|  | ||||
|         users = rd.parse_users(settings_text) | ||||
|         self.assertDictEqual( | ||||
|             {"User1": ["lf"], "User2": ["lf", "otherExp"]}, | ||||
|             users, | ||||
|             "Users not parsed correctly", | ||||
|         ) | ||||
|  | ||||
|  | ||||
| class TestRunnerDeterminatorGetRunnerPrefix(TestCase): | ||||
|     def test_opted_in_user(self) -> None: | ||||
|         settings_text = """ | ||||
|         experiments: | ||||
|             lf: | ||||
|                 rollout_perc: 0 | ||||
|             otherExp: | ||||
|                 rollout_perc: 0 | ||||
|         --- | ||||
|  | ||||
|         Users: | ||||
|         @User1,lf | ||||
|         @User2,lf,otherExp | ||||
|  | ||||
|         """ | ||||
|         prefix = rd.get_runner_prefix(settings_text, ["User1"]) | ||||
|         self.assertEqual("lf.", prefix, "Runner prefix not correct for User1") | ||||
|  | ||||
|     def test_opted_in_user_two_experiments(self) -> None: | ||||
|         settings_text = """ | ||||
|         experiments: | ||||
|             lf: | ||||
|                 rollout_perc: 0 | ||||
|             otherExp: | ||||
|                 rollout_perc: 0 | ||||
|         --- | ||||
|  | ||||
|         Users: | ||||
|         @User1,lf | ||||
|         @User2,lf,otherExp | ||||
|  | ||||
|         """ | ||||
|         prefix = rd.get_runner_prefix(settings_text, ["User2"]) | ||||
|         self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for User2") | ||||
|  | ||||
|     @patch("random.uniform", return_value=50) | ||||
|     def test_opted_out_user(self, mock_uniform: Mock) -> None: | ||||
|         settings_text = """ | ||||
|         experiments: | ||||
|             lf: | ||||
|                 rollout_perc: 25 | ||||
|             otherExp: | ||||
|                 rollout_perc: 25 | ||||
|         --- | ||||
|  | ||||
|         Users: | ||||
|         @User1,lf | ||||
|         @User2,lf,otherExp | ||||
|  | ||||
|         """ | ||||
|         prefix = rd.get_runner_prefix(settings_text, ["User3"]) | ||||
|         self.assertEqual("", prefix, "Runner prefix not correct for user") | ||||
|  | ||||
|     @patch("random.uniform", return_value=10) | ||||
|     def test_opted_out_user_was_pulled_in_by_rollout(self, mock_uniform: Mock) -> None: | ||||
|         settings_text = """ | ||||
|         experiments: | ||||
|             lf: | ||||
|                 rollout_perc: 25 | ||||
|             otherExp: | ||||
|                 rollout_perc: 25 | ||||
|         --- | ||||
|  | ||||
|         Users: | ||||
|         @User1,lf | ||||
|         @User2,lf,otherExp | ||||
|  | ||||
|         """ | ||||
|  | ||||
|         # User3 is opted out, but is pulled into both experiments by the 10% rollout | ||||
|         prefix = rd.get_runner_prefix(settings_text, ["User3"]) | ||||
|         self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user") | ||||
|  | ||||
|     def test_lf_prefix_always_comes_first(self) -> None: | ||||
|         settings_text = """ | ||||
|         experiments: | ||||
|             otherExp: | ||||
|                 rollout_perc: 0 | ||||
|             lf: | ||||
|                 rollout_perc: 0 | ||||
|         --- | ||||
|  | ||||
|         Users: | ||||
|         @User1,lf | ||||
|         @User2,otherExp,lf | ||||
|  | ||||
|         """ | ||||
|  | ||||
|         prefix = rd.get_runner_prefix(settings_text, ["User2"]) | ||||
|         self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user") | ||||
|  | ||||
|     def test_ignores_commented_users(self) -> None: | ||||
|         settings_text = """ | ||||
|         experiments: | ||||
|             lf: | ||||
|                 rollout_perc: 0 | ||||
|             otherExp: | ||||
|                 rollout_perc: 0 | ||||
|         --- | ||||
|  | ||||
|         Users: | ||||
|         #@User1,lf | ||||
|         @User2,lf,otherExp | ||||
|  | ||||
|         """ | ||||
|  | ||||
|         prefix = rd.get_runner_prefix(settings_text, ["User1"]) | ||||
|         self.assertEqual("", prefix, "Runner prefix not correct for user") | ||||
|  | ||||
|     def test_ignores_extra_experiments(self) -> None: | ||||
|         settings_text = """ | ||||
|         experiments: | ||||
|             lf: | ||||
|                 rollout_perc: 0 | ||||
|             otherExp: | ||||
|                 rollout_perc: 0 | ||||
|             foo: | ||||
|                 rollout_perc: 0 | ||||
|         --- | ||||
|  | ||||
|         Users: | ||||
|         @User1,lf,otherExp,foo | ||||
|  | ||||
|         """ | ||||
|  | ||||
|         prefix = rd.get_runner_prefix(settings_text, ["User1"]) | ||||
|         self.assertEqual("lf.otherExp.", prefix, "Runner prefix not correct for user") | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     main() | ||||
							
								
								
									
.github/scripts/trymerge.py (vendored, 63 lines changed)
							| @ -36,7 +36,6 @@ from warnings import warn | ||||
|  | ||||
| import yaml | ||||
| from github_utils import ( | ||||
|     gh_close_pr, | ||||
|     gh_fetch_json_list, | ||||
|     gh_fetch_merge_base, | ||||
|     gh_fetch_url, | ||||
| @ -1117,20 +1116,15 @@ class GitHubPR: | ||||
|         msg = self.get_title() + f" (#{self.pr_num})\n\n" | ||||
|         msg += msg_body | ||||
|  | ||||
|         # Mention PR co-authors | ||||
|         for author_login, author_name in self.get_authors().items(): | ||||
|             if author_login != self.get_pr_creator_login(): | ||||
|                 msg += f"\nCo-authored-by: {author_name}" | ||||
|  | ||||
|         msg += f"\nPull Request resolved: {self.get_pr_url()}\n" | ||||
|         msg += f"Approved by: {approved_by_urls}\n" | ||||
|         if ghstack_deps: | ||||
|             msg += f"ghstack dependencies: {', '.join([f'#{pr.pr_num}' for pr in ghstack_deps])}\n" | ||||
|  | ||||
|         # Mention PR co-authors, which should be at the end of the message | ||||
|         # And separated from the body by two newlines | ||||
|         first_coauthor = True | ||||
|         for author_login, author_name in self.get_authors().items(): | ||||
|             if author_login != self.get_pr_creator_login(): | ||||
|                 if first_coauthor: | ||||
|                     msg, first_coauthor = (msg + "\n", False) | ||||
|                 msg += f"\nCo-authored-by: {author_name}" | ||||
|  | ||||
|         return msg | ||||
|  | ||||
|     def add_numbered_label(self, label_base: str, dry_run: bool) -> None: | ||||
| @ -1175,11 +1169,11 @@ class GitHubPR: | ||||
|             for pr in additional_merged_prs: | ||||
|                 pr.add_numbered_label(MERGE_COMPLETE_LABEL, dry_run) | ||||
|  | ||||
|         # When the merge process reaches this part, we can assume that the commit | ||||
|         # has been successfully pushed to trunk | ||||
|         merge_commit_sha = repo.rev_parse(name=self.default_branch()) | ||||
|  | ||||
|         if comment_id and self.pr_num: | ||||
|             # When the merge process reaches this part, we can assume that the commit | ||||
|             # has been successfully pushed to trunk | ||||
|             merge_commit_sha = repo.rev_parse(name=REMOTE_MAIN_BRANCH) | ||||
|  | ||||
|             # Finally, upload the record to Rockset. The list of pending and failed | ||||
|             # checks are at the time of the merge | ||||
|             save_merge_record( | ||||
| @ -1204,17 +1198,6 @@ class GitHubPR: | ||||
|         else: | ||||
|             print("Missing comment ID or PR number, couldn't upload to Rockset") | ||||
|  | ||||
|         # Usually Github will see that the commit has "resolves <pr_num>" in the | ||||
|         # commit message and close the PR, but sometimes it doesn't, leading to | ||||
|         # confusion.  When it doesn't, we close it manually. | ||||
|         time.sleep(60)  # Give Github some time to close the PR | ||||
|         manually_close_merged_pr( | ||||
|             pr=self, | ||||
|             additional_merged_prs=additional_merged_prs, | ||||
|             merge_commit_sha=merge_commit_sha, | ||||
|             dry_run=dry_run, | ||||
|         ) | ||||
|  | ||||
|     def merge_changes( | ||||
|         self, | ||||
|         repo: GitRepo, | ||||
| @ -1515,34 +1498,6 @@ def checks_to_markdown_bullets( | ||||
|     ] | ||||
|  | ||||
|  | ||||
| def manually_close_merged_pr( | ||||
|     pr: GitHubPR, | ||||
|     additional_merged_prs: List[GitHubPR], | ||||
|     merge_commit_sha: str, | ||||
|     dry_run: bool, | ||||
| ) -> None: | ||||
|     def _comment_and_close(pr: GitHubPR, comment: str) -> None: | ||||
|         pr = GitHubPR(pr.org, pr.project, pr.pr_num)  # Refresh the PR | ||||
|         if not pr.is_closed(): | ||||
|             gh_post_pr_comment(pr.org, pr.project, pr.pr_num, comment, dry_run) | ||||
|             gh_close_pr(pr.org, pr.project, pr.pr_num, dry_run) | ||||
|  | ||||
|     message = ( | ||||
|         f"This PR (#{pr.pr_num}) was merged in {merge_commit_sha} but it is still open, likely due to a Github bug, " | ||||
|         "so mergebot is closing it manually.  If you think this is a mistake, please feel free to reopen and contact Dev Infra." | ||||
|     ) | ||||
|     _comment_and_close(pr, message) | ||||
|     for additional_pr in additional_merged_prs: | ||||
|         message = ( | ||||
|             f"This PR (#{additional_pr.pr_num}) was merged as part of PR #{pr.pr_num} in the stack under {merge_commit_sha} " | ||||
|             "but it is still open, likely due to a Github bug, so mergebot is closing it manually. " | ||||
|             "If you think this is a mistake, please feel free to reopen and contact Dev Infra." | ||||
|         ) | ||||
|         _comment_and_close(additional_pr, message) | ||||
|  | ||||
|     print(f"PR {pr.pr_num} and all additional PRs in the stack have been closed.") | ||||
|  | ||||
|  | ||||
| @retries_decorator() | ||||
| def save_merge_record( | ||||
|     comment_id: int, | ||||
|  | ||||
							
								
								
									
.github/templates/common.yml.j2 (vendored, 4 lines changed)
							| @ -1,7 +1,7 @@ | ||||
| {%- set upload_artifact_s3_action = "seemethere/upload-artifact-s3@v5" -%} | ||||
| {%- set download_artifact_s3_action = "seemethere/download-artifact-s3@v4" -%} | ||||
| {%- set upload_artifact_action = "actions/upload-artifact@v4.4.0" -%} | ||||
| {%- set download_artifact_action = "actions/download-artifact@v4.1.7" -%} | ||||
| {%- set upload_artifact_action = "actions/upload-artifact@v3" -%} | ||||
| {%- set download_artifact_action = "actions/download-artifact@v3" -%} | ||||
|  | ||||
| {%- set timeout_minutes = 240 -%} | ||||
|  | ||||
|  | ||||
| @ -52,32 +52,19 @@ env: | ||||
| !{{ common.concurrency(build_environment) }} | ||||
|  | ||||
| jobs: | ||||
|   get-label-type: | ||||
|     name: get-label-type | ||||
|     uses: ./.github/workflows/_runner-determinator.yml | ||||
|     with: | ||||
|       triggering_actor: ${{ github.triggering_actor }} | ||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} | ||||
|       curr_ref_type: ${{ github.ref_type }} | ||||
|  | ||||
| {%- for config in build_configs %} | ||||
|   !{{ config["build_name"] }}-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     uses: ./.github/workflows/_binary-build-linux.yml | ||||
|     needs: get-label-type | ||||
|     with:!{{ upload.binary_env_as_input(config) }} | ||||
|       {%- if "aarch64" in build_environment %} | ||||
|       runs_on: linux.arm64.m7g.4xlarge.ephemeral | ||||
|       runs_on: linux.arm64.m7g.4xlarge | ||||
|       ALPINE_IMAGE: "arm64v8/alpine" | ||||
|       {%- elif "s390x" in build_environment %} | ||||
|       runs_on: linux.s390x | ||||
|       ALPINE_IMAGE: "docker.io/s390x/alpine" | ||||
|       {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %} | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       runs_on: linux.24xlarge.ephemeral | ||||
|       {%- else %} | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       runs_on: linux.24xlarge | ||||
|       {%- endif %} | ||||
|       build_name: !{{ config["build_name"] }} | ||||
|       build_environment: !{{ build_environment }} | ||||
| @ -93,9 +80,7 @@ jobs: | ||||
|   {%- if config["gpu_arch_type"] != "cuda-aarch64" %} | ||||
|   !{{ config["build_name"] }}-test:  # Testing | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: | ||||
|       - !{{ config["build_name"] }}-build | ||||
|       - get-label-type | ||||
|     needs: !{{ config["build_name"] }}-build | ||||
|     {%- if config["gpu_arch_type"] not in ["rocm", "xpu"] %} | ||||
|     uses: ./.github/workflows/_binary-test-linux.yml | ||||
|     with:!{{ upload.binary_env_as_input(config) }} | ||||
| @ -110,10 +95,8 @@ jobs: | ||||
|       {%- elif config["gpu_arch_type"] == "rocm" %} | ||||
|       runs_on: linux.rocm.gpu | ||||
|       {%- elif config["gpu_arch_type"] == "cuda" %} | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       runs_on: linux.4xlarge.nvidia.gpu | ||||
|       {%- else %} | ||||
|       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" | ||||
|       runs_on: linux.4xlarge | ||||
|       {%- endif %} | ||||
|     secrets: | ||||
|  | ||||
| @ -64,6 +64,9 @@ jobs: | ||||
|     {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0  %} | ||||
|       PYTORCH_EXTRA_INSTALL_REQUIREMENTS: !{{ config.pytorch_extra_install_requirements }} | ||||
|     {%- endif %} | ||||
|       # For sccache access (only on non-forked PRs) | ||||
|       AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} | ||||
|       AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} | ||||
|     steps: | ||||
|       !{{ set_runner_specific_vars() }} | ||||
|       - name: Install conda and dependencies | ||||
| @ -81,7 +84,7 @@ jobs: | ||||
|       !{{ common.checkout(deep_clone=False, directory="pytorch") }} | ||||
|       !{{ common.checkout(deep_clone=False, directory="builder", repository=common.builder_repo, branch=common.builder_branch) }} | ||||
|       - name: Install sccache (only for non-forked PRs, and pushes to trunk) | ||||
|         uses: nick-fields/retry@v3.0.0 | ||||
|         uses: nick-fields/retry@v2.8.2 | ||||
|         if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} | ||||
|         with: | ||||
|           timeout_minutes: 5 | ||||
| @ -101,7 +104,7 @@ jobs: | ||||
|           # shellcheck disable=SC1091 | ||||
|           source "${RUNNER_TEMP}/anaconda/bin/activate" | ||||
|           "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" | ||||
|       - uses: actions/upload-artifact@v4.4.0 | ||||
|       - uses: actions/upload-artifact@v3 | ||||
|         if: always() | ||||
|         with: | ||||
|           name: !{{ config["build_name"] }} | ||||
|  | ||||
							
								
								
									
.github/templates/upload.yml.j2 (vendored, 2 lines changed)
							| @ -45,7 +45,7 @@ | ||||
|   {%- if is_windows %} | ||||
|       # This is a dummy value for libtorch to work correctly with our batch scripts | ||||
|       # without this value pip does not get installed for some reason | ||||
|       DESIRED_PYTHON: "3.9" | ||||
|       DESIRED_PYTHON: "3.8" | ||||
|   {%- endif %} | ||||
|  | ||||
| {%- else %} | ||||
|  | ||||
| @ -53,24 +53,10 @@ env: | ||||
| !{{ common.concurrency(build_environment) }} | ||||
|  | ||||
| jobs: | ||||
|   get-label-type: | ||||
|     name: get-label-type | ||||
|     uses: ./.github/workflows/_runner-determinator.yml | ||||
|     with: | ||||
|       triggering_actor: ${{ github.triggering_actor }} | ||||
|       issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }} | ||||
|       curr_branch: ${{ github.head_ref || github.ref_name }} | ||||
|       curr_ref_type: ${{ github.ref_type }} | ||||
|  | ||||
| {%- for config in build_configs %} | ||||
|   !{{ config["build_name"] }}-build: | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: get-label-type | ||||
|     {%- if branches == "nightly" %} | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge" | ||||
|     {%- else %} | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" | ||||
|     {%- endif %} | ||||
|     runs-on: windows.4xlarge.nonephemeral | ||||
|     timeout-minutes: !{{ common.timeout_minutes }} | ||||
|     !{{ upload.binary_env(config, True) }} | ||||
|     {%- if config.pytorch_extra_install_requirements is defined and config.pytorch_extra_install_requirements|d('')|length > 0  %} | ||||
| @ -99,17 +85,15 @@ jobs: | ||||
|       !{{ common.wait_and_kill_ssh_windows('pytorch') }} | ||||
|   !{{ config["build_name"] }}-test:  # Testing | ||||
|     if: ${{ github.repository_owner == 'pytorch' }} | ||||
|     needs: | ||||
|       - !{{ config["build_name"] }}-build | ||||
|       - get-label-type | ||||
|     needs: !{{ config["build_name"] }}-build | ||||
| {%- if config["gpu_arch_type"] == "cuda" %} | ||||
| {%- if branches == "nightly" %} | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge" | ||||
|     runs-on: windows.8xlarge.nvidia.gpu | ||||
| {%- else %} | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge.nonephemeral" | ||||
|     runs-on: windows.8xlarge.nvidia.gpu.nonephemeral | ||||
| {%- endif %} | ||||
| {%- else %} | ||||
|     runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" | ||||
|     runs-on: windows.4xlarge.nonephemeral | ||||
| {%- endif %} | ||||
|     timeout-minutes: !{{ common.timeout_minutes }} | ||||
|     !{{ upload.binary_env(config, True) }} | ||||
|  | ||||
							
								
								
									
.github/workflows/_binary-build-linux.yml (vendored, 13 lines changed)
							| @ -11,16 +11,11 @@ on: | ||||
|         required: true | ||||
|         type: string | ||||
|         description: The build environment | ||||
|       runner_prefix: | ||||
|         required: false | ||||
|         default: "" | ||||
|         type: string | ||||
|         description: prefix for runner label | ||||
|       runs_on: | ||||
|         required: false | ||||
|         default: linux.12xlarge.ephemeral | ||||
|         default: linux.12xlarge | ||||
|         type: string | ||||
|         description: Hardware to run this "build" job on, linux.12xlarge or linux.arm64.2xlarge. | ||||
|         description: Hardware to run this "build"job on, linux.12xlarge or linux.arm64.2xlarge. | ||||
|       timeout-minutes: | ||||
|         required: false | ||||
|         default: 210 | ||||
| @ -94,7 +89,7 @@ on: | ||||
|  | ||||
| jobs: | ||||
|   build: | ||||
|     runs-on: ${{ inputs.runner_prefix}}${{ inputs.runs_on }} | ||||
|     runs-on: ${{ inputs.runs_on }} | ||||
|     timeout-minutes: ${{ inputs.timeout-minutes }} | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ inputs.PYTORCH_ROOT }} | ||||
| @ -283,7 +278,7 @@ jobs: | ||||
|           # Ensure the working directory gets chowned back to the current user | ||||
|           docker run --rm -v "${RUNNER_TEMP}/artifacts:/v" -w /v "${ALPINE_IMAGE}" chown -R "$(id -u):$(id -g)" . | ||||
|  | ||||
|       - uses: actions/upload-artifact@v4.4.0 | ||||
|       - uses: actions/upload-artifact@v3 | ||||
|         if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }} | ||||
|         with: | ||||
|           name: ${{ inputs.build_name }} | ||||
|  | ||||
							
								
								
									
.github/workflows/_binary-test-linux.yml (vendored, 9 lines changed)
							| @ -59,11 +59,6 @@ on: | ||||
|         required: false | ||||
|         type: string | ||||
|         description: Desired python version | ||||
|       runner_prefix: | ||||
|         required: false | ||||
|         default: "" | ||||
|         type: string | ||||
|         description: prefix for runner label | ||||
|       runs_on: | ||||
|         required: true | ||||
|         type: string | ||||
| @ -82,7 +77,7 @@ on: | ||||
|  | ||||
| jobs: | ||||
|   test: | ||||
|     runs-on: ${{ inputs.runner_prefix}}${{ inputs.runs_on }} | ||||
|     runs-on: ${{ inputs.runs_on }} | ||||
|     timeout-minutes: 240 | ||||
|     env: | ||||
|       PYTORCH_ROOT: ${{ inputs.PYTORCH_ROOT }} | ||||
| @ -210,7 +205,7 @@ jobs: | ||||
|  | ||||
|       - name: Download Build Artifacts | ||||
|         if: ${{ steps.filter.outputs.is-test-matrix-empty == 'False' }} | ||||
|         uses: actions/download-artifact@v4.1.7 | ||||
|         uses: actions/download-artifact@v3 | ||||
|         with: | ||||
|           name: ${{ inputs.build_name }} | ||||
|           path: "${{ runner.temp }}/artifacts/" | ||||
|  | ||||
							
								
								
									
.github/workflows/_binary-upload.yml (vendored, 2 lines changed)
							| @ -126,7 +126,7 @@ jobs: | ||||
|         # NB: When the previous build job is skipped, there won't be any artifacts and | ||||
|         # this step will fail. Binary build jobs can only be skipped on CI, not nightly | ||||
|         continue-on-error: true | ||||
|         uses: actions/download-artifact@v4.1.7 | ||||
|         uses: actions/download-artifact@v3 | ||||
|         with: | ||||
|           name: ${{ inputs.build_name }} | ||||
|           path: "${{ runner.temp }}/artifacts/" | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff.
	