Compare commits


25 Commits

Author SHA1 Message Date
4bb5cb51e6 Fix swap_tensors path in _apply for modules that inherit from RNNBase (RNN, GRU, LSTM) (#122800) (#123116)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/122800
Approved by: https://github.com/albanD

(cherry picked from commit cc12668053ad847ff4a430e99eeebf99c136f3cd)
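
For illustration, a minimal sketch of the path this fixes, assuming the `torch.__future__.set_swap_module_params_on_conversion` toggle from this line of work is available on the branch:

```
import torch
import torch.nn as nn

# Opt in to the swap_tensors conversion path (assumed toggle name).
torch.__future__.set_swap_module_params_on_conversion(True)

# RNNBase modules (RNN, GRU, LSTM) have a custom _apply that re-flattens
# weights; converting one exercises the fixed swap_tensors path.
lstm = nn.LSTM(input_size=8, hidden_size=16, num_layers=2)
lstm.to(torch.float64)
print(lstm.weight_ih_l0.dtype)  # torch.float64
```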
2024-04-02 16:16:37 -07:00
ef38d0572e nn.Module: use swap_tensors for Tensor subclasses (#122755) (#123106)
This fixes a bug when casting a module that has DTensor parameters. The old behavior swapped the `.data` field of the Tensor subclass, which is incorrect for tensor subclasses that may have multiple child tensors.

This change uses the `swap_tensors` method to swap the entire tensor objects, not just the `.data` field.
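
A minimal sketch of the difference, using plain tensors in place of DTensor for brevity (`torch.utils.swap_tensors` is the helper referenced above):

```
import torch
import torch.nn as nn

lin = nn.Linear(4, 4)
replacement = nn.Parameter(torch.randn(4, 4, dtype=torch.float64))

# Old behavior: only the .data field was swapped, which leaves any extra
# child tensors of a wrapper subclass (e.g. DTensor's local shards) behind.
# lin.weight.data = replacement.data

# New behavior: exchange the whole tensor objects in place.
torch.utils.swap_tensors(lin.weight, replacement)
print(lin.weight.dtype)  # torch.float64
```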

Test plan:

```
pytest test/distributed/_tensor/test_api.py -k 'test_distribute_module_casting'
python test/distributed/fsdp/test_wrap.py -k test_auto_wrap_smoke_test_cuda_init_mode1_cpu_offload0_use_device_id_True
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122755
Approved by: https://github.com/wanchaol, https://github.com/mikaylagawarecki

(cherry picked from commit e6ee8322d767ab241ce1651e7c178f539e8e3199)

Co-authored-by: Tristan Rice <rice@fn.lc>
2024-04-02 16:16:16 -07:00
5a53185e65 Remove cuda dependencies when building AOTriton (#122982) (#123179)
Downloading CUDA sometimes fails and breaks the build process, but
AOTriton does not need these packages. This commit comments out the
related download scripts.
2024-04-02 19:08:22 -04:00
bc9e23abb5 Fix performance regression and memory storage handling of Flash Attention on ROCM (#122857) (#122967)
This PR fixes two major issues that were discovered after the initial merge of PR #121561:
1. The Flash Attention support added by that PR has severe performance regressions on regular shapes (power-of-two head dimensions and sequence lengths) compared with PR #115981. Its performance is worse than the math backend, and it only has numerical stability advantages. This PR fixes that problem (a minimal invocation of the affected path is sketched below).
2. PR #121561 has a flaw in its memory storage handling: it does not copy the gradients back to the designated output tensor. This PR removes the deprecated `TensorStorageSanitizer` class, which is unnecessary thanks to the more flexible backward kernel shipped by PR #121561.
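
For reference, the affected code path is the flash backend of `scaled_dot_product_attention`; a rough invocation sketch (on a ROCm or CUDA device) looks like this:

```
import torch
import torch.nn.functional as F

q = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
k, v = torch.randn_like(q), torch.randn_like(q)

# Restrict SDPA to the flash backend so the kernels touched by this fix run.
with torch.backends.cuda.sdp_kernel(enable_flash=True,
                                    enable_math=False,
                                    enable_mem_efficient=False):
    out = F.scaled_dot_product_attention(q, k, v)
```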

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122857
Approved by: https://github.com/jeffdaily, https://github.com/drisspg
2024-04-02 18:53:19 -04:00
8194fae625 Pin protobuf to 3.20.2 on macOS (#123197)
The newer protobuf 5.26.0, released on March 13th, is causing failures in `test_hparams_*` from `test_tensorboard`, in which the stringified metadata is wrong when escaping double quotes. For example, 3bc2bb6781. This looks like an upstream issue in TensorBoard, which doesn't work with this brand-new protobuf version: https://github.com/tensorflow/tensorboard/blob/master/tensorboard/pip_package/requirements.txt#L29

The package has already been pinned in the Docker CI requirements (https://github.com/pytorch/pytorch/blob/main/.ci/docker/requirements-ci.txt#L155), so it should be pinned on macOS too. We eventually want to have just one requirements.txt file.

Fixes https://github.com/pytorch/pytorch/issues/122008
Fixes https://github.com/pytorch/pytorch/issues/121927
Fixes https://github.com/pytorch/pytorch/issues/121946
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121918
Approved by: https://github.com/kit1980
2024-04-02 15:08:09 -04:00
12acd4c9b3 [Cherrypick][DeviceMesh] Cache and reuse sliced result (#122975) (#123073)
Fixes #118849

Add a parent_to_child_mappings map in _mesh_resources so we can cache and reuse submesh slicing results, avoiding recreating the submesh and the underlying sub process group repeatedly, which could lead to funky behaviors.

We will follow up with reusing the process group from the parent_mesh during submesh creation.
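
Roughly what the cached slicing enables (a sketch; assumes a 4-rank job launched with torchrun and an initialized process group):

```
from torch.distributed.device_mesh import init_device_mesh

# 2 x 2 mesh with named dimensions (illustrative shape and names).
mesh_2d = init_device_mesh("cuda", (2, 2), mesh_dim_names=("dp", "tp"))

tp_mesh_a = mesh_2d["tp"]
tp_mesh_b = mesh_2d["tp"]

# With parent_to_child_mappings, repeated slicing reuses the cached submesh
# (and its underlying sub process group) instead of recreating them.
assert tp_mesh_a is tp_mesh_b
```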

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122975
Approved by: https://github.com/wanchaol
2024-04-02 15:05:07 -04:00
857797d148 [CherryPick] Inductor cpp wrapper: fix dtype of ShapeAsConstantBuffer (#122297) (#123064)
For `at::scalar_tensor` the default dtype will be `float` ([link to scalar_tensor](0d8e960f74/aten/src/ATen/native/TensorFactories.cpp (L856)), [link to default dtype](0d8e960f74/c10/core/TensorOptions.h (L551))) if we don't set the `dtype` value; however, the input scalar value is not necessarily a `float`. With `torch::tensor(x)`, the dtype of the tensor is decided according to the dtype of the scalar.
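
The Python-level analogue of the dtype mismatch described above (a sketch, since the fix itself lives in the C++ wrapper codegen):

```
import torch

x = 3  # an integer scalar, e.g. coming from a shape computation

print(torch.scalar_tensor(x).dtype)  # torch.float32 -- falls back to the default dtype
print(torch.tensor(x).dtype)         # torch.int64   -- dtype inferred from the scalar
```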

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122297
Approved by: https://github.com/jgong5, https://github.com/desertfire
2024-04-02 15:03:25 -04:00
233dfe4d6a Proper view support for jagged layout NestedTensor (#122854)
* Proper view support for jagged layout NestedTensor (#113279)

This PR:
* Introduces an ATen op for creating true jagged views from a dense values buffer
    * `_nested_view_from_jagged(values, offsets, lengths, ragged_idx, dummy)`
    * This op is implemented on the Python side using torch.library so we can return a subclass instance
    * `jagged_from_list()` now uses this instead of the old autograd.Function `NestedViewFromBuffer`
    * The latter op is used for non-contiguous JTs returned via `torch.nested.narrow()`
    * `dummy` is an awful hack to ensure that `NestedTensor.__torch_dispatch__()` is invoked for our view
* Introduces an ATen op for accessing the `values` component of an NT via a view
    * `_nested_get_values(nt)`
* **Removes** the autograd.Functions `ViewNestedFromBuffer` and `ViewBufferFromNested` in favor of `nested_from_values_offsets()` / `nested_from_values_offsets_lengths()` and `nt.values()`, respectively.
* Changes test code to prefer `as_nested_tensor()` over `jagged_from_list()` directly
    * Similarly, avoid `buffer_from_jagged()`, preferring `values()`
* Depends on general subclass view fake-ification on the PT2 side (handled solely in previous PRs in the stack)

With these changes, the semantics of jagged layout NTs are such that they are considered a true view of the underlying `values` buffer. This means views of jagged NTs are views of the underlying buffer as well, simplifying some handling.
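
A small sketch of what the view semantics mean through the public API (the op names above are internal):

```
import torch

a, b = torch.randn(3, 5), torch.randn(2, 5)
nt = torch.nested.nested_tensor([a, b], layout=torch.jagged)

vals = nt.values()   # dense (3 + 2, 5) buffer backing the nested tensor
vals.mul_(2)         # a true view: the change is visible through nt as well
print(torch.equal(nt.unbind()[0], a * 2))  # True
```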

Differential Revision: [D54269922](https://our.internmc.facebook.com/intern/diff/D54269922)
Co-authored-by: voznesenskym <voznesenskym@gmail.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/113279
Approved by: https://github.com/ezyang

(cherry picked from commit cd6bfc7965fc5ae20720bae0994e332e56f819c0)

* Update executorch.txt

* Update executorch.txt

* Fix linter error

---------

Co-authored-by: Joel Schlosser <jbschlosser@meta.com>
Co-authored-by: Guang Yang <42389959+guangy10@users.noreply.github.com>
2024-04-02 11:46:53 -07:00
e22b534b10 Upgrade submodule oneDNN to v3.3.6 for release/2.3 (#122164) (#122930)
As the title says. Includes issue fixes for aarch64:
- https://github.com/oneapi-src/oneDNN/pull/1831
- https://github.com/oneapi-src/oneDNN/pull/1834

---

## Validation results
(on Intel CPU + Linux)
**Static quantization with Inductor on CV models**

Quant method | Geomean throughput ratio (v3.3.6/baseline)
-- | --
ptq | 0.982937
ptq (cpp wrapper) | 0.978384
qat | 0.978828

**Torchbench cpu userbenchmark with Inductor**

Items | Perf Geomean Ratio (v3.3.6/baseline)
-- | --
eager_throughtput_bf16_infer | 1.00x
eager_throughtput_fp32_infer | 1.00x
jit_llga_throughtput_amp_bf16 | 1.01x
jit_llga_throughtput_fp32 | 1.00x
eager_throughtput_fx_int8 | 1.00x
eager_throughtput_bf16_train | 1.46x
eager_throughtput_fp32_train | 1.41x

**Dynamo benchmarks tests**
Precision | Shape | Wrapper | Thread | Eager old/new GEOMEAN | Inductor old/new GEOMEAN
-- | -- | -- | -- | -- | --
Float32 | Static | Default | Multiple | 1.003836812 | 1.003425
Float32 | Static | Default | Single | 1.000181451 | 0.999611
Float32 | Dynamic | Default | Multiple | 1.003980183 | 1.006563
Float32 | Dynamic | Default | Single | 1.000076939 | 0.999969
AMP | Static | Default | Multiple | 0.996824772 | 0.998715
AMP | Static | Default | Single | 0.996402574 | 1.001483
AMP | Dynamic | Default | Multiple | 0.994919866 | 1.000467
AMP | Dynamic | Default | Single | 0.9962054 | 1.000767

(on Aarch64)
https://github.com/pytorch/pytorch/pull/122164#issuecomment-2007912919

---

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122164
Approved by: https://github.com/snadampal, https://github.com/malfet, https://github.com/atalman
2024-04-02 12:57:11 -04:00
8602990e3f [CherryPick] Back out "[DeviceMesh] Add support for nD slicing (#119752)" (#121763) (#122495)
Summary:
Original commit changeset: e52b8809c8d8

Original Phabricator Diff: D54778906

We have to backout this diff.
D54778906 seems to be causing test failures for APF, blocking trunk health and hence the release. Just starting to look at the issue. T182209248

Test Plan: Sandcastle

Reviewed By: satgera

Differential Revision: D54825114

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121763
Approved by: https://github.com/osalpekar

(cherry picked from commit e99fa0042cd3dcd2eded24585d59c53f2da9d9f5)
2024-03-28 14:25:08 -07:00
685cc955df [ROCm] Update triton rocm branch to release/2.3.x (#122493)
* Update triton rocm branch to release/2.3.x

* Remove ROCM_TRITION_VERSION and update to 2.3.0

* Remove unnecessary ROCm conditionalisation

* Skip failing UT
2024-03-28 14:18:37 -07:00
b1c2430fbd remove torchao dependency (#122635)
* remove torchao dependency (#122524)

Test Plan:
CI

```
buck2 run mode/dev-nosan mode/inplace executorch/examples/models/llama2:export_llama -- -c ~/llama/ultra_new_checkpoint.pt -p ~/llama/params.json -kv -E 8,8 -d fp32 --pt2e_quantize "xnnpack_dynamic" -2
```

```
buck run //executorch/backends/xnnpack/test:test_xnnpack_ops -- executorch.backends.xnnpack.test.ops.linear.TestLinear.test_qd8_fp32_per_token_weight_per_channel_group_int4
```

Differential Revision: D55263008

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122524
Approved by: https://github.com/jerryzh168

(cherry picked from commit c677221798d8ce87c97aac1bd9ae34af0767c383)

* Update executorch.txt

* Update _decomposed.py

* Update executorch.txt

* Update executorch.txt

* Update executorch.txt

* Update executorch.txt

* Update executorch.txt

---------

Co-authored-by: Guang Yang <guangyang@meta.com>
Co-authored-by: Guang Yang <42389959+guangy10@users.noreply.github.com>
2024-03-28 12:25:12 -07:00
3002eb2556 [export] hack skip index_put_ in dce (#122683) (#122721)
Summary: Ideally we should do what's in the TODO. Just doing this for now to unblock llama capture

Test Plan: capturing llama and using pt2e to quantize it

Differential Revision: D55354487

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122683
Approved by: https://github.com/kimishpatel

(cherry picked from commit 41d24df08f72e059c4eebdde4315e63a9918406f)

Co-authored-by: Jacob Szwejbka <jakeszwe@meta.com>
2024-03-27 21:29:53 -07:00
e1a846d6b8 Fix auto_functionalize (#121990) (#122654)
Differential Revision: D54964130

When we re-export, the auto_functionalize HOP will be in the graph. Therefore, we need to implement a proper functionalization rule for it. Since the content inside auto_functionalize is guaranteed to be functional, it is OK to just fall through it.
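
The re-export scenario that hits this, roughly (a sketch; the graph only contains the `auto_functionalized` HOP when something like a mutating custom op is present, which is elided here):

```
import torch
from torch.export import export

class M(torch.nn.Module):
    def forward(self, x):
        # imagine a custom op that mutates x here; exporting it produces
        # torch.ops.higher_order.auto_functionalized in the graph
        return x.sin()

ep = export(M(), (torch.randn(3),))

# Re-exporting the already-exported module is what requires the
# functionalization rule for auto_functionalized to just fall through.
ep2 = export(ep.module(), (torch.randn(3),))
```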

Pull Request resolved: https://github.com/pytorch/pytorch/pull/121990
Approved by: https://github.com/ydwu4, https://github.com/zou3519

(cherry picked from commit 0d845f7b0781f091452a5fd31de14e1c2117f3d4)

Co-authored-by: Tugsbayasgalan (Tugsuu) Manlaibaatar <tmanlaibaatar@meta.com>
2024-03-27 21:28:56 -07:00
4a9a8c606d [export] add pass to remove auto functionalized hop (#122246) (#122655)
Summary: Adds a pass that blindly removes the auto_functionalized HOP without considering whether it is safe. Useful for ExecuTorch today and other use cases that have additional logic to reason about when this pass is safe to use

Test Plan: added unit test

Differential Revision: D55103867

Pull Request resolved: https://github.com/pytorch/pytorch/pull/122246
Approved by: https://github.com/angelayi

(cherry picked from commit c84f81b395fff969bbd2f784efad8ab1a8aa52de)

Co-authored-by: Jacob Szwejbka <jakeszwe@meta.com>
2024-03-27 21:05:15 -07:00
d3201f48b1 Revert "Revert "CI: Specify libc and libstdcxx versions in conda environments"" (#122523)
This reverts commit 74832f12fae2e1bc51bf1f9971dcd12c90a971f5.
2024-03-22 17:41:42 -04:00
74832f12fa Revert "CI: Specify libc and libstdcxx versions in conda environments" (#122497)
This reverts commit b4f90aae1b375bfe06d3c4a099240e06f93c81c4.
2024-03-22 11:27:50 -04:00
02cdb400d7 Use temporary name for triton package, fix lint (#122438)
* Use temporary name for triton package

* Fix lint
2024-03-21 17:30:38 -04:00
37257774c6 Triton wheel build using 2.3.x branch (#122403)
* Triton build 2.3.x

* Revert "[Release Only] Build triton using pinned version rather branch (#121765)"

This reverts commit d69c4219127e2cf5d9637b0daacc0a24e65f8133.

* Triton wheel change

* release
2024-03-21 12:52:21 -04:00
c4e5434423 necessary change to make torch2.3 work with triton2.2 (#122139) 2024-03-21 08:24:53 -04:00
b4f90aae1b CI: Specify libc and libstdcxx versions in conda environments (#121929)
Without this we get mismatches between the GLIBC and GLIBCXX ABIs used
by conda packages and by PyTorch.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/121556
Approved by: https://github.com/isuruf, https://github.com/malfet

(cherry picked from commit 7a53dedb07ed72b85d1e083ce38c43c7810fc5f1)

Co-authored-by: Peter Bell <peterbell10@live.co.uk>
2024-03-14 17:56:46 -04:00
94d6463255 [RELEASE ONLY CHANGES] Increase timeout for linux binary jobs, fix workflow lint (#121851)
* [release only] Increase timeout job for linux binary builds by 30min

* fix lint
2024-03-13 19:50:57 -04:00
6a89a753b1 [RELEASE ONLY CHANGES] Apply release only changes Release 2.3 (#121813)
* [Release only changes] Release only changes #2

* common+lint
2024-03-13 11:03:48 -04:00
d69c421912 [Release Only] Build triton using pinned version rather branch (#121765) 2024-03-12 19:05:23 -04:00
6725db07ae [RELEASE ONLY CHANGES] Apply release only changes Release 2.3 (#121726)
* Apply release only changes

* temp changes

* tweak

* fix

* Revert "tweak"

This reverts commit 38edcac21448829ac114c73423c84614628e2598.
2024-03-12 18:14:35 -04:00
8419 changed files with 653740 additions and 376783 deletions

View File

@ -1,4 +1,3 @@
# We do not use this library in our Bazel build. It contains an
# infinitely recursing symlink that makes Bazel very unhappy.
third_party/ittapi/
third_party/opentelemetry-cpp

View File

@ -1,5 +0,0 @@
0.6b
manylinux_2_17
rocm6.1
7f07e8a1cb1f99627eb6d77f5c0e9295c775f3c7
77c29fa3f3b614e187d7213d745e989a92708cee2bc6020419ab49019af399d1

View File

@ -84,30 +84,16 @@ fi
# CMake 3.18 is needed to support CUDA17 language variant
CMAKE_VERSION=3.18.5
_UCX_COMMIT=7bb2722ff2187a0cad557ae4a6afa090569f83fb
_UCC_COMMIT=20eae37090a4ce1b32bcce6144ccad0b49943e0b
_UCX_COMMIT=00bcc6bb18fc282eb160623b4c0d300147f579af
_UCC_COMMIT=7cb07a76ccedad7e56ceb136b865eb9319c258ea
# It's annoying to rename jobs every time you want to rewrite a
# configuration, so we hardcode everything here rather than do it
# from scratch
case "$image" in
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -119,24 +105,9 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks)
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -149,39 +120,9 @@ case "$image" in
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3.12-gcc9-inductor-benchmarks)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9)
CUDA_VERSION=11.8.0
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -193,37 +134,9 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9)
pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9)
CUDA_VERSION=12.1.1
CUDNN_VERSION=9
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
DB=yes
VISION=yes
KATEX=yes
UCX_COMMIT=${_UCX_COMMIT}
UCC_COMMIT=${_UCC_COMMIT}
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9)
CUDA_VERSION=12.4.0
CUDNN_VERSION=9
CUDNN_VERSION=8
ANACONDA_PYTHON_VERSION=3.10
GCC_VERSION=9
PROTOBUF=yes
@ -291,7 +204,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.0
ROCM_VERSION=5.7
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
@ -302,7 +215,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=6.1
ROCM_VERSION=6.0
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
@ -313,10 +226,9 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
XPU_VERSION=0.5
BASEKIT_VERSION=2024.0.0-49522
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
;;
pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks)
ANACONDA_PYTHON_VERSION=3.8
@ -330,10 +242,10 @@ case "$image" in
DOCS=yes
INDUCTOR_BENCHMARKS=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12)
pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12)
ANACONDA_PYTHON_VERSION=3.8
CUDA_VERSION=11.8
CUDNN_VERSION=9
CUDNN_VERSION=8
CLANG_VERSION=12
PROTOBUF=yes
DB=yes
@ -373,13 +285,6 @@ case "$image" in
CONDA_CMAKE=yes
EXECUTORCH=yes
;;
pytorch-linux-jammy-py3.12-halide)
CUDA_VERSION=12.4
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
CONDA_CMAKE=yes
HALIDE=yes
;;
pytorch-linux-focal-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
@ -387,7 +292,7 @@ case "$image" in
ANACONDA_PYTHON_VERSION=3.9
CONDA_CMAKE=yes
;;
pytorch-linux-jammy-cuda11.8-cudnn9-py3.9-linter)
pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter)
ANACONDA_PYTHON_VERSION=3.9
CUDA_VERSION=11.8
CONDA_CMAKE=yes
@ -400,12 +305,6 @@ case "$image" in
DB=yes
VISION=yes
CONDA_CMAKE=yes
# snadampal: skipping sccache due to the following issue
# https://github.com/pytorch/pytorch/issues/121559
SKIP_SCCACHE_INSTALL=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
SKIP_LLVM_SRC_BUILD_INSTALL=yes
;;
*)
# Catch-all for builds that are not hardcoded.
@ -454,13 +353,13 @@ tmp_tag=$(basename "$(mktemp -u)" | tr '[:upper:]' '[:lower:]')
#when using cudnn version 8 install it separately from cuda
if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
if [[ ${CUDNN_VERSION} == 9 ]]; then
if [[ ${CUDNN_VERSION} == 8 ]]; then
IMAGE_NAME="nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}"
fi
fi
# Build image
docker build \
DOCKER_BUILDKIT=1 docker build \
--no-cache \
--progress=plain \
--build-arg "BUILD_ENVIRONMENT=${image}" \
@ -497,17 +396,14 @@ docker build \
--build-arg "DOCS=${DOCS}" \
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
--build-arg "EXECUTORCH=${EXECUTORCH}" \
--build-arg "HALIDE=${HALIDE}" \
--build-arg "XPU_VERSION=${XPU_VERSION}" \
--build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \
--build-arg "ACL=${ACL:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
--build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
-f $(dirname ${DOCKERFILE})/Dockerfile \
-t "$tmp_tag" \
"$@" \
.
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
# for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
# find the correct image. As a result, here we have to replace the
# "$UBUNTU_VERSION" == "18.04-rc"

View File

@ -62,7 +62,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
# (optional) Install vision packages like OpenCV and ffmpeg
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -77,9 +77,6 @@ RUN rm install_rocm.sh
COPY ./common/install_rocm_magma.sh install_rocm_magma.sh
RUN bash ./install_rocm_magma.sh
RUN rm install_rocm_magma.sh
COPY ./common/install_amdsmi.sh install_amdsmi.sh
RUN bash ./install_amdsmi.sh
RUN rm install_amdsmi.sh
ENV PATH /opt/rocm/bin:$PATH
ENV PATH /opt/rocm/hcc/bin:$PATH
ENV PATH /opt/rocm/hip/bin:$PATH
@ -113,13 +110,6 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
# Install AOTriton (Early fail)
COPY ./aotriton_version.txt aotriton_version.txt
COPY ./common/common_utils.sh common_utils.sh
COPY ./common/install_aotriton.sh install_aotriton.sh
RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH

View File

@ -1 +1 @@
9d859653ae916d0a72f6b2b5c5925bed38832140
e2a8f9548aecb62a68e264607174a7d207ed2929

View File

@ -1 +0,0 @@
340136fec6d3ebc73e7a19eba1663e9b0ba8ab2d

View File

@ -1 +1 @@
21eae954efa5bf584da70324b640288c3ee7aede
d08e16b738ab550c3af51305df624d5c823dc445

View File

@ -1 +0,0 @@
1b2f15840e0d70eec50d84c7a0575cb835524def

View File

@ -1 +1 @@
dedb7bdf339a3546896d4820366ca562c586bfa0
79c6c9b209a5692b9a895398f4f3a033f8f80415

View File

@ -1,6 +1,6 @@
set -euo pipefail
readonly version=v24.04
readonly version=v23.08
readonly src_host=https://review.mlplatform.org/ml
readonly src_repo=ComputeLibrary

View File

@ -1,5 +0,0 @@
#!/bin/bash
set -ex
cd /opt/rocm/share/amd_smi && pip install .

View File

@ -1,23 +0,0 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
TARBALL='aotriton.tar.bz2'
# This read command alwasy returns with exit code 1
read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
ARCH=$(uname -m)
AOTRITON_INSTALL_PREFIX="$1"
AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.bz2"
cd "${AOTRITON_INSTALL_PREFIX}"
# Must use -L to follow redirects
curl -L --retry 3 -o "${TARBALL}" "${AOTRITON_URL}"
ACTUAL_SHA256=$(sha256sum "${TARBALL}" | cut -d " " -f 1)
if [ "${SHA256}" != "${ACTUAL_SHA256}" ]; then
echo -n "Error: The SHA256 of downloaded tarball is ${ACTUAL_SHA256},"
echo " which does not match the expected value ${SHA256}."
exit
fi
tar xf "${TARBALL}" && rm -rf "${TARBALL}"

View File

@ -3,7 +3,7 @@
set -ex
install_ubuntu() {
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn9-devel-ubuntu18.04-rc`,
# NVIDIA dockers for RC releases use tag names like `11.0-cudnn8-devel-ubuntu18.04-rc`,
# for this case we will set UBUNTU_VERSION to `18.04-rc` so that the Dockerfile could
# find the correct image. As a result, here we have to check for
# "$UBUNTU_VERSION" == "18.04"*
@ -113,6 +113,7 @@ install_centos() {
glibc-devel \
glibc-headers \
glog-devel \
hiredis-devel \
libstdc++-devel \
libsndfile-devel \
make \

View File

@ -85,7 +85,7 @@ fi
else
CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.13" ]; then
if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then
conda_install numpy=1.26.0 ${CONDA_COMMON_DEPS}
else
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}

View File

@ -1,18 +1,20 @@
#!/bin/bash
if [[ -n "${CUDNN_VERSION}" ]]; then
if [[ ${CUDNN_VERSION} == 8 ]]; then
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
mkdir tmp_cudnn
pushd tmp_cudnn
if [[ ${CUDA_VERSION:0:2} == "12" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda12-archive"
elif [[ ${CUDA_VERSION:0:2} == "11" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-9.1.0.70_cuda11-archive"
if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.9.2.26_cuda12-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
CUDNN_NAME="cudnn-linux-x86_64-8.7.0.84_cuda11-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/redist/cudnn/v8.7.0/local_installers/11.8/${CUDNN_NAME}.tar.xz
else
print "Unsupported CUDA version ${CUDA_VERSION}"
exit 1
fi
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${CUDNN_NAME}.tar.xz
tar xf ${CUDNN_NAME}.tar.xz
cp -a ${CUDNN_NAME}/include/* /usr/local/cuda/include/
cp -a ${CUDNN_NAME}/lib/* /usr/local/cuda/lib64/

View File

@ -5,14 +5,9 @@ set -ex
# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && cd tmp_cusparselt
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[1-4]$ ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
arch_path='x86_64'
fi
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.5.2.1-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
if [[ ${CUDA_VERSION:0:4} == "12.1" ]]; then
CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.5.2.1-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} == "11.8" ]]; then
CUSPARSELT_NAME="libcusparse_lt-linux-x86_64-0.4.0.7-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-x86_64/${CUSPARSELT_NAME}.tar.xz

View File

@ -4,6 +4,11 @@ set -ex
install_ubuntu() {
apt-get update
apt-get install -y --no-install-recommends \
libhiredis-dev \
libleveldb-dev \
liblmdb-dev \
libsnappy-dev
# Cleanup
apt-get autoclean && apt-get clean
@ -15,6 +20,12 @@ install_centos() {
# See http://fedoraproject.org/wiki/EPEL
yum --enablerepo=extras install -y epel-release
yum install -y \
hiredis-devel \
leveldb-devel \
lmdb-devel \
snappy-devel
# Cleanup
yum clean all
rm -rf /var/cache/yum

View File

@ -37,9 +37,6 @@ install_conda_dependencies() {
install_pip_dependencies() {
pushd executorch/.ci/docker
# Install PyTorch CPU build beforehand to avoid installing the much bigger CUDA
# binaries later, ExecuTorch only needs CPU
pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
# Install all Python dependencies
pip_install -r requirements-ci.txt
popd
@ -47,14 +44,13 @@ install_pip_dependencies() {
setup_executorch() {
pushd executorch
# Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
as_jenkins bash .ci/scripts/setup-vulkan-linux-deps.sh
source .ci/scripts/utils.sh
export PYTHON_EXECUTABLE=python
export EXECUTORCH_BUILD_PYBIND=ON
export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
install_flatc_from_source
pip_install .
as_jenkins .ci/scripts/setup-linux.sh cmake
# Make sure that all the newly generate files are owned by Jenkins
chown -R jenkins .
popd
}

View File

@ -1,46 +0,0 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
COMMIT=$(get_pinned_commit halide)
test -n "$COMMIT"
# activate conda to populate CONDA_PREFIX
test -n "$ANACONDA_PYTHON_VERSION"
eval "$(conda shell.bash hook)"
conda activate py_$ANACONDA_PYTHON_VERSION
if [ -n "${UBUNTU_VERSION}" ];then
apt update
apt-get install -y lld liblld-15-dev libpng-dev libjpeg-dev libgl-dev \
libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev
fi
conda_install numpy scipy imageio cmake ninja
git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git
cmake -DCMAKE_BUILD_TYPE=Release \
-DLLVM_ENABLE_PROJECTS="clang" \
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
-DLLVM_ENABLE_TERMINFO=OFF -DLLVM_ENABLE_ASSERTIONS=ON \
-DLLVM_ENABLE_EH=ON -DLLVM_ENABLE_RTTI=ON -DLLVM_BUILD_32_BITS=OFF \
-S llvm-project/llvm -B llvm-build -G Ninja
cmake --build llvm-build
cmake --install llvm-build --prefix llvm-install
export LLVM_ROOT=`pwd`/llvm-install
export LLVM_CONFIG=$LLVM_ROOT/bin/llvm-config
git clone https://github.com/halide/Halide.git
pushd Halide
git checkout ${COMMIT} && git submodule update --init --recursive
pip_install -r requirements.txt
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
cmake --build build
test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
cmake --install build --prefix ${CONDA_PREFIX}
chown -R jenkins ${CONDA_PREFIX}
popd
rm -rf Halide llvm-build llvm-project llvm-install
python -c "import halide" # check for errors

View File

@ -30,17 +30,15 @@ pip_install \
pip_install coloredlogs packaging
pip_install onnxruntime==1.18
pip_install onnx==1.16.0
pip_install onnxruntime==1.17.0
pip_install onnx==1.15.0
# pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps
pip_install onnxscript==0.1.0.dev20240613 --no-deps
# required by onnxscript
pip_install ml_dtypes
pip_install onnxscript==0.1.0.dev20240301 --no-deps
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
IMPORT_SCRIPT_FILENAME="/tmp/onnx_import_script.py"
as_jenkins echo 'import transformers; transformers.AutoModel.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3");' > "${IMPORT_SCRIPT_FILENAME}"
as_jenkins echo 'import transformers; transformers.AutoModel.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2");' > "${IMPORT_SCRIPT_FILENAME}"
# Need a PyTorch version for transformers to work
pip_install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu

View File

@ -11,8 +11,7 @@ mkdir -p $pb_dir
ln -s /usr/lib64 "$pb_dir/lib64"
curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3
tar -xvz --no-same-owner -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz
tar -xvz -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz
NPROC=$[$(nproc) - 2]
pushd "$pb_dir" && ./configure && make -j${NPROC} && make -j${NPROC} check && sudo make -j${NRPOC} install && sudo ldconfig
popd

View File

@ -6,6 +6,9 @@ ver() {
printf "%3d%03d%03d%03d" $(echo "$1" | tr '.' ' ');
}
# Map ROCm version to AMDGPU version
declare -A AMDGPU_VERSIONS=( ["5.0"]="21.50" ["5.1.1"]="22.10.1" ["5.2"]="22.20" )
install_ubuntu() {
apt-get update
if [[ $UBUNTU_VERSION == 18.04 ]]; then
@ -23,14 +26,31 @@ install_ubuntu() {
apt-get install -y libc++1
apt-get install -y libc++abi1
# Add amdgpu repository
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
echo "deb [arch=amd64] https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
# Add amdgpu repository
UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
local amdgpu_baseurl
if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
else
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/ubuntu"
fi
echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
fi
ROCM_REPO="ubuntu"
if [[ $(ver $ROCM_VERSION) -lt $(ver 4.2) ]]; then
ROCM_REPO="xenial"
fi
if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
ROCM_REPO="${UBUNTU_VERSION_NAME}"
fi
# Add rocm repository
wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
local rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
echo "deb [arch=amd64] ${rocm_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/rocm.list
echo "deb [arch=amd64] ${rocm_baseurl} ${ROCM_REPO} main" > /etc/apt/sources.list.d/rocm.list
apt-get update --allow-insecure-repositories
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
@ -39,28 +59,34 @@ install_ubuntu() {
rocm-libs \
rccl \
rocprofiler-dev \
roctracer-dev \
amd-smi-lib
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.1) ]]; then
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev
fi
roctracer-dev
# precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5
# search for all unversioned packages
# if search fails it will abort this script; use true to avoid case where search fails
MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
echo "miopen-hip-gfx package not available" && exit 1
if [[ $(ver $ROCM_VERSION) -ge $(ver 5.5) ]]; then
MIOPENHIPGFX=$(apt-cache search --names-only miopen-hip-gfx | awk '{print $1}' | grep -F -v . || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
echo "miopen-hip-gfx package not available" && exit 1
else
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
fi
else
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENHIPGFX}
MIOPENKERNELS=$(apt-cache search --names-only miopenkernels | awk '{print $1}' | grep -F -v . || true)
if [[ "x${MIOPENKERNELS}" = x ]]; then
echo "miopenkernels package not available" && exit 1
else
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ${MIOPENKERNELS}
fi
fi
# ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
for kdb in /opt/rocm/share/miopen/db/*.kdb
do
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then
for kdb in /opt/rocm/share/miopen/db/*.kdb
do
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
fi
# Cleanup
apt-get autoclean && apt-get clean
@ -77,19 +103,25 @@ install_centos() {
yum install -y epel-release
yum install -y dkms kernel-headers-`uname -r` kernel-devel-`uname -r`
# Add amdgpu repository
local amdgpu_baseurl
if [[ $OS_VERSION == 9 ]]; then
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/9.0/main/x86_64"
else
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
if [[ $(ver $ROCM_VERSION) -ge $(ver 4.5) ]]; then
# Add amdgpu repository
local amdgpu_baseurl
if [[ $OS_VERSION == 9 ]]; then
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/9.0/main/x86_64"
else
if [[ $(ver $ROCM_VERSION) -ge $(ver 5.3) ]]; then
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/rhel/7.9/main/x86_64"
else
amdgpu_baseurl="https://repo.radeon.com/amdgpu/${AMDGPU_VERSIONS[$ROCM_VERSION]}/rhel/7.9/main/x86_64"
fi
fi
echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo
echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo
echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo
echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo
fi
echo "[AMDGPU]" > /etc/yum.repos.d/amdgpu.repo
echo "name=AMDGPU" >> /etc/yum.repos.d/amdgpu.repo
echo "baseurl=${amdgpu_baseurl}" >> /etc/yum.repos.d/amdgpu.repo
echo "enabled=1" >> /etc/yum.repos.d/amdgpu.repo
echo "gpgcheck=1" >> /etc/yum.repos.d/amdgpu.repo
echo "gpgkey=http://repo.radeon.com/rocm/rocm.gpg.key" >> /etc/yum.repos.d/amdgpu.repo
local rocm_baseurl="http://repo.radeon.com/rocm/yum/${ROCM_VERSION}"
echo "[ROCm]" > /etc/yum.repos.d/rocm.repo
@ -107,23 +139,33 @@ install_centos() {
rocm-libs \
rccl \
rocprofiler-dev \
roctracer-dev \
amd-smi-lib
roctracer-dev
# precompiled miopen kernels; search for all unversioned packages
# if search fails it will abort this script; use true to avoid case where search fails
MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
echo "miopen-hip-gfx package not available" && exit 1
if [[ $(ver $ROCM_VERSION) -ge $(ver 5.5) ]]; then
MIOPENHIPGFX=$(yum -q search miopen-hip-gfx | grep miopen-hip-gfx | awk '{print $1}'| grep -F kdb. || true)
if [[ "x${MIOPENHIPGFX}" = x ]]; then
echo "miopen-hip-gfx package not available" && exit 1
else
yum install -y ${MIOPENHIPGFX}
fi
else
yum install -y ${MIOPENHIPGFX}
MIOPENKERNELS=$(yum -q search miopenkernels | grep miopenkernels- | awk '{print $1}'| grep -F kdb. || true)
if [[ "x${MIOPENKERNELS}" = x ]]; then
echo "miopenkernels package not available" && exit 1
else
yum install -y ${MIOPENKERNELS}
fi
fi
# ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
for kdb in /opt/rocm/share/miopen/db/*.kdb
do
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then
for kdb in /opt/rocm/share/miopen/db/*.kdb
do
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
fi
# Cleanup
yum clean all

View File

@ -13,11 +13,8 @@ conda_reinstall() {
}
if [ -n "${ROCM_VERSION}" ]; then
TRITON_REPO="https://github.com/openai/triton"
TRITON_REPO="https://github.com/ROCmSoftwarePlatform/triton"
TRITON_TEXT_FILE="triton-rocm"
elif [ -n "${XPU_VERSION}" ]; then
TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton"
TRITON_TEXT_FILE="triton-xpu"
else
TRITON_REPO="https://github.com/openai/triton"
TRITON_TEXT_FILE="triton"

View File

@ -5,7 +5,8 @@ set -ex
install_ubuntu() {
apt-get update
apt-get install -y --no-install-recommends \
libopencv-dev
libopencv-dev \
libavcodec-dev
# Cleanup
apt-get autoclean && apt-get clean
@ -18,7 +19,8 @@ install_centos() {
yum --enablerepo=extras install -y epel-release
yum install -y \
opencv-devel
opencv-devel \
ffmpeg-devel
# Cleanup
yum clean all

View File

@ -3,7 +3,10 @@ set -xe
# Intel® software for general purpose GPU capabilities.
# Refer to https://www.intel.com/content/www/us/en/developer/articles/tool/pytorch-prerequisites-for-intel-gpus.html
# Refer to https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html
# Intel® oneAPI Base Toolkit (version 2024.0.0) has been updated to include functional and security updates.
# Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html
# Users should update to the latest version as it becomes available
@ -14,16 +17,14 @@ function install_ubuntu() {
# Set up the repository. To do this, download the key to the system keyring
wget -qO - https://repositories.intel.com/gpu/intel-graphics.key \
| gpg --dearmor --output /usr/share/keyrings/intel-graphics.gpg
wget -qO - https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor --output /usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB \
| gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
# Add the signed entry to APT sources and configure the APT client to use the Intel repository
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] \
https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \
echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/production/2328 unified" \
| tee /etc/apt/sources.list.d/intel-gpu-jammy.list
echo "deb [signed-by=/usr/share/keyrings/intel-for-pytorch-gpu-dev-keyring.gpg] \
https://apt.repos.intel.com/intel-for-pytorch-gpu-dev all main" \
| tee /etc/apt/sources.list.d/intel-for-pytorch-gpu-dev.list
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \
| tee /etc/apt/sources.list.d/oneAPI.list
# Update the packages list and repository index
apt-get update
@ -39,11 +40,11 @@ function install_ubuntu() {
mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo
# Development Packages
apt-get install -y libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev level-zero-dev
# Install Intel Support Packages
if [ -n "$XPU_VERSION" ]; then
apt-get install -y intel-for-pytorch-gpu-dev-${XPU_VERSION}
# Install Intel® oneAPI Base Toolkit
if [ -n "$BASEKIT_VERSION" ]; then
apt-get install intel-basekit=$BASEKIT_VERSION -y
else
apt-get install -y intel-for-pytorch-gpu-dev
apt-get install intel-basekit -y
fi
# Cleanup

View File

@ -85,10 +85,10 @@ librosa>=0.6.2 ; python_version < "3.11"
#Pinned versions:
#test that import:
mypy==1.10.0
mypy==1.8.0
# Pin MyPy version because new errors are likely to appear with each release
#Description: linter
#Pinned versions: 1.10.0
#Pinned versions: 1.8.0
#test that import: test_typing.py, test_type_hints.py
networkx==2.8.8
@ -134,9 +134,9 @@ opt-einsum==3.3
#Pinned versions: 3.3
#test that import: test_linalg.py
optree==0.12.1
optree==0.9.1
#Description: A library for tree manipulation
#Pinned versions: 0.12.1
#Pinned versions: 0.9.1
#test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
#test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py,
#common_utils.py, test_eager_transforms.py, test_python_dispatch.py,
@ -147,9 +147,9 @@ optree==0.12.1
#test_pointwise_ops.py, test_dtensor_ops.py, test_torchinductor.py, test_fx.py,
#test_fake_tensor.py, test_mps.py
pillow==10.3.0
pillow==10.2.0
#Description: Python Imaging Library fork
#Pinned versions: 10.3.0
#Pinned versions: 10.2.0
#test that import:
protobuf==3.20.2
@ -228,11 +228,12 @@ scikit-image==0.20.0 ; python_version >= "3.10"
#Pinned versions: 0.20.3
#test that import:
scipy==1.10.1 ; python_version <= "3.11"
scipy==1.12.0 ; python_version == "3.12"
scipy==1.6.3 ; python_version < "3.10"
scipy==1.8.1 ; python_version == "3.10"
scipy==1.10.1 ; python_version == "3.11"
# Pin SciPy because of failing distribution tests (see #60347)
#Description: scientific python
#Pinned versions: 1.10.1
#Pinned versions: 1.6.3
#test that import: test_unary_ufuncs.py, test_torch.py,test_tensor_creation_ops.py
#test_spectral_ops.py, test_sparse_csr.py, test_reductions.py,test_nn.py
#test_linalg.py, test_binary_ufuncs.py
@ -263,10 +264,10 @@ unittest-xml-reporting<=3.2.0,>=2.0.0
#Pinned versions:
#test that import:
#lintrunner is supported on aarch64-linux only from 0.12.4 version
lintrunner==0.12.5
#wheel not found on aarch64, and source build requires rust
lintrunner==0.10.7 ; platform_machine == "x86_64"
#Description: all about linters!
#Pinned versions: 0.12.5
#Pinned versions: 0.10.7
#test that import:
rockset==1.0.3
@ -279,9 +280,9 @@ ghstack==0.8.0
#Pinned versions: 0.8.0
#test that import:
jinja2==3.1.4
jinja2==3.1.3
#Description: jinja2 template engine
#Pinned versions: 3.1.4
#Pinned versions: 3.1.3
#test that import:
pytest-cpp==2.3.0
@ -306,9 +307,7 @@ pywavelets==1.5.0 ; python_version >= "3.12"
#Pinned versions: 1.4.1
#test that import:
lxml==5.0.0
lxml==5.0.0.
#Description: This is a requirement of unittest-xml-reporting
# Python-3.9 binaries
PyGithub==2.3.0

View File

@ -1 +1 @@
3.0.0
2.3.0

View File

@ -56,7 +56,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
# (optional) Install vision packages like OpenCV and ffmpeg
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -103,14 +103,6 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
ARG HALIDE
# Build and install halide
COPY ./common/install_halide.sh install_halide.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/halide.txt halide.txt
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
RUN rm install_halide.sh common_utils.sh halide.txt
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
@ -147,7 +139,7 @@ COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
ARG CUDNN_VERSION
ARG CUDA_VERSION
COPY ./common/install_cudnn.sh install_cudnn.sh
RUN if [ -n "${CUDNN_VERSION}" ]; then bash install_cudnn.sh; fi
RUN if [ "${CUDNN_VERSION}" -eq 8 ]; then bash install_cudnn.sh; fi
RUN rm install_cudnn.sh
# Install CUSPARSELT
@ -160,7 +152,6 @@ RUN rm install_cusparselt.sh
RUN if [ -h /usr/local/cuda-11.6/cuda-11.6 ]; then rm /usr/local/cuda-11.6/cuda-11.6; fi
RUN if [ -h /usr/local/cuda-11.7/cuda-11.7 ]; then rm /usr/local/cuda-11.7/cuda-11.7; fi
RUN if [ -h /usr/local/cuda-12.1/cuda-12.1 ]; then rm /usr/local/cuda-12.1/cuda-12.1; fi
RUN if [ -h /usr/local/cuda-12.4/cuda-12.4 ]; then rm /usr/local/cuda-12.4/cuda-12.4; fi
USER jenkins
CMD ["bash"]

View File

@ -53,7 +53,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
# (optional) Install vision packages like OpenCV and ffmpeg
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -78,11 +78,6 @@ ENV MAGMA_HOME /opt/rocm/magma
ENV LANG C.UTF-8
ENV LC_ALL C.UTF-8
# Install amdsmi
COPY ./common/install_amdsmi.sh install_amdsmi.sh
RUN bash ./install_amdsmi.sh
RUN rm install_amdsmi.sh
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
COPY ./common/install_cmake.sh install_cmake.sh
@ -105,13 +100,6 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
# Install AOTriton
COPY ./aotriton_version.txt aotriton_version.txt
COPY ./common/common_utils.sh common_utils.sh
COPY ./common/install_aotriton.sh install_aotriton.sh
RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH

View File

@ -61,20 +61,15 @@ COPY ci_commit_pins/timm.txt timm.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
# Install XPU Dependencies
ARG XPU_VERSION
COPY ./common/install_xpu.sh install_xpu.sh
RUN bash ./install_xpu.sh && rm install_xpu.sh
ARG TRITON
# Install triton, this needs to be done before sccache because the latter will
# try to reach out to S3, which docker build runners don't have access
COPY ./common/install_triton.sh install_triton.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/triton-xpu.txt triton-xpu.txt
COPY triton_version.txt triton_version.txt
# TODO: will add triton xpu commit
COPY ci_commit_pins/triton.txt triton.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-xpu.txt triton_version.txt
RUN rm install_triton.sh common_utils.sh triton.txt
# (optional) Install database packages like LMDB and LevelDB
ARG DB
@ -83,13 +78,18 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
# (optional) Install vision packages like OpenCV and ffmpeg
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
RUN rm install_vision.sh cache_vision_models.sh common_utils.sh
ENV INSTALLED_VISION ${VISION}
# Install XPU Dependencies
ARG BASEKIT_VERSION
COPY ./common/install_xpu.sh install_xpu.sh
RUN bash ./install_xpu.sh && rm install_xpu.sh
# (optional) Install non-default CMake version
ARG CMAKE_VERSION
COPY ./common/install_cmake.sh install_cmake.sh

View File

@ -80,7 +80,7 @@ RUN if [ -n "${DB}" ]; then bash ./install_db.sh; fi
RUN rm install_db.sh
ENV INSTALLED_DB ${DB}
# (optional) Install vision packages like OpenCV
# (optional) Install vision packages like OpenCV and ffmpeg
ARG VISION
COPY ./common/install_vision.sh ./common/cache_vision_models.sh ./common/common_utils.sh ./
RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi
@ -155,14 +155,6 @@ COPY ci_commit_pins/executorch.txt executorch.txt
RUN if [ -n "${EXECUTORCH}" ]; then bash ./install_executorch.sh; fi
RUN rm install_executorch.sh common_utils.sh executorch.txt
ARG HALIDE
# Build and install halide
COPY ./common/install_halide.sh install_halide.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/halide.txt halide.txt
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
RUN rm install_halide.sh common_utils.sh halide.txt
ARG ONNX
# Install ONNX dependencies
COPY ./common/install_onnx.sh ./common/common_utils.sh ./
@ -177,11 +169,9 @@ RUN rm install_acl.sh
ENV INSTALLED_ACL ${ACL}
# Install ccache/sccache (do this last, so we get priority in PATH)
ARG SKIP_SCCACHE_INSTALL
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
RUN if [ -z "${SKIP_SCCACHE_INSTALL}" ]; then bash ./install_cache.sh; fi
RUN rm install_cache.sh
RUN bash ./install_cache.sh && rm install_cache.sh
# Add jni.h for java host build
COPY ./common/install_jni.sh install_jni.sh
@ -198,9 +188,7 @@ ARG BUILD_ENVIRONMENT
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}
# Install LLVM dev version (Defined in the pytorch/builder github repository)
ARG SKIP_LLVM_SRC_BUILD_INSTALL
COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm
RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi
# AWS specific CUDA build guidance
ENV TORCH_CUDA_ARCH_LIST Maxwell

View File

@ -1,9 +1,5 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/../pytorch/common_utils.sh"
LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd)
TEST_DIR="$ROOT_DIR/test"

View File

@ -3,20 +3,6 @@
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
cleanup_workspace() {
echo "sudo may print the following warning message that can be ignored. The chown command will still run."
echo " sudo: setrlimit(RLIMIT_STACK): Operation not permitted"
echo "For more details refer to https://github.com/sudo-project/sudo/issues/42"
sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace
}
# Disable shellcheck SC2064 as we want to parse the original owner immediately.
# shellcheck disable=SC2064
trap_add cleanup_workspace EXIT
sudo chown -R jenkins /var/lib/jenkins/workspace
git config --global --add safe.directory /var/lib/jenkins/workspace
if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then
# TODO: This can be removed later once vision is also part of the Docker image
pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)"

View File

@ -44,7 +44,15 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda11* ]]; then
fi
fi
if [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
if [[ ${BUILD_ENVIRONMENT} == *"caffe2"* ]]; then
echo "Caffe2 build is ON"
export BUILD_CAFFE2=ON
fi
if [[ ${BUILD_ENVIRONMENT} == *"paralleltbb"* ]]; then
export ATEN_THREADING=TBB
export USE_TBB=1
elif [[ ${BUILD_ENVIRONMENT} == *"parallelnative"* ]]; then
export ATEN_THREADING=NATIVE
fi
@ -73,22 +81,7 @@ if ! which conda; then
export USE_MKLDNN=0
fi
else
# CMAKE_PREFIX_PATH precedences
# 1. $CONDA_PREFIX, if defined. This follows the pytorch official build instructions.
# 2. /opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}, if ANACONDA_PYTHON_VERSION defined.
# This is for CI, which defines ANACONDA_PYTHON_VERSION but not CONDA_PREFIX.
# 3. $(conda info --base). The fallback value of pytorch official build
# instructions actually refers to this.
# Commonly this is /opt/conda/
if [[ -v CONDA_PREFIX ]]; then
export CMAKE_PREFIX_PATH=${CONDA_PREFIX}
elif [[ -v ANACONDA_PYTHON_VERSION ]]; then
export CMAKE_PREFIX_PATH="/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}"
else
# already checked by `! which conda`
CMAKE_PREFIX_PATH="$(conda info --base)"
export CMAKE_PREFIX_PATH
fi
export CMAKE_PREFIX_PATH=/opt/conda
# Workaround required for MKL library linkage
# https://github.com/pytorch/pytorch/issues/119557
@ -230,28 +223,6 @@ if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]]
export BUILD_STATIC_RUNTIME_BENCHMARK=ON
fi
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
export CMAKE_BUILD_TYPE=RelWithAssert
fi
# Do not change workspace permissions for ROCm CI jobs
# as it can leave workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
cleanup_workspace() {
echo "sudo may print the following warning message that can be ignored. The chown command will still run."
echo " sudo: setrlimit(RLIMIT_STACK): Operation not permitted"
echo "For more details refer to https://github.com/sudo-project/sudo/issues/42"
sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace
}
# Disable shellcheck SC2064 as we want to parse the original owner immediately.
# shellcheck disable=SC2064
trap_add cleanup_workspace EXIT
sudo chown -R jenkins /var/lib/jenkins/workspace
git config --global --add safe.directory /var/lib/jenkins/workspace
fi
if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then
set -e
@ -277,37 +248,16 @@ else
( ! get_exit_code python setup.py clean bad_argument )
if [[ "$BUILD_ENVIRONMENT" != *libtorch* ]]; then
# rocm builds fail when WERROR=1
# XLA test build fails when WERROR=1
# set only when building other architectures
# or building non-XLA tests.
if [[ "$BUILD_ENVIRONMENT" != *rocm* &&
"$BUILD_ENVIRONMENT" != *xla* ]]; then
if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then
# Install numpy-2.0 release candidate for builds
# Which should be backward compatible with Numpy-1.X
python -mpip install --pre numpy==2.0.0rc1
fi
WERROR=1 python setup.py clean
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel
BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 python setup.py bdist_wheel --cmake
else
WERROR=1 python setup.py bdist_wheel
fi
WERROR=1 python setup.py bdist_wheel
else
python setup.py clean
if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
source .ci/pytorch/install_cache_xla.sh
fi
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "USE_SPLIT_BUILD cannot be used with xla or rocm"
exit 1
else
python setup.py bdist_wheel
fi
python setup.py bdist_wheel
fi
pip_install_whl "$(echo dist/*.whl)"
@ -346,10 +296,9 @@ else
CUSTOM_OP_TEST="$PWD/test/custom_operator"
python --version
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir -p "$CUSTOM_OP_BUILD"
pushd "$CUSTOM_OP_BUILD"
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@ -362,7 +311,7 @@ else
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir -p "$JIT_HOOK_BUILD"
pushd "$JIT_HOOK_BUILD"
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@ -374,7 +323,7 @@ else
python --version
mkdir -p "$CUSTOM_BACKEND_BUILD"
pushd "$CUSTOM_BACKEND_BUILD"
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPYTHON_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@ -405,8 +354,4 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]];
python tools/stats/export_test_times.py
fi
# snadampal: skipping it till sccache support added for aarch64
# https://github.com/pytorch/pytorch/issues/121559
if [[ "$BUILD_ENVIRONMENT" != *aarch64* ]]; then
print_sccache_stats
fi
print_sccache_stats

View File

@ -56,29 +56,9 @@ function assert_git_not_dirty() {
function pip_install_whl() {
# This is used to install PyTorch and other build artifacts wheel locally
# without using any network connection
# Convert the input arguments into an array
local args=("$@")
# Check if the first argument contains multiple paths separated by spaces
if [[ "${args[0]}" == *" "* ]]; then
# Split the string by spaces into an array
IFS=' ' read -r -a paths <<< "${args[0]}"
# Loop through each path and install individually
for path in "${paths[@]}"; do
echo "Installing $path"
python3 -mpip install --no-index --no-deps "$path"
done
else
# Loop through each argument and install individually
for path in "${args[@]}"; do
echo "Installing $path"
python3 -mpip install --no-index --no-deps "$path"
done
fi
python3 -mpip install --no-index --no-deps "$@"
}
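For illustration, a hedged sketch of the two calling conventions the longer variant above accepts (the wheel file names are hypothetical); elsewhere in these scripts the helper is invoked as `pip_install_whl "$(echo dist/*.whl)"`, which yields exactly such a single space-separated argument when more than one wheel is present:

```bash
# Hypothetical wheel paths, for illustration only.
# One argument holding space-separated paths (the form "$(echo dist/*.whl)" produces):
pip_install_whl "dist/torch-2.3.0-cp310-cp310-linux_x86_64.whl dist/torchvision-0.18.0-cp310-cp310-linux_x86_64.whl"
# Or one wheel per argument; either way each wheel is installed offline with --no-index --no-deps:
pip_install_whl dist/torch-2.3.0-cp310-cp310-linux_x86_64.whl dist/torchvision-0.18.0-cp310-cp310-linux_x86_64.whl
```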
function pip_install() {
# retry 3 times
# old versions of pip don't have the "--progress-bar" flag
@ -179,7 +159,7 @@ function install_torchvision() {
}
function install_tlparse() {
pip_install --user "tlparse==0.3.7"
pip_install --user "tlparse==0.3.5"
PATH="$(python -m site --user-base)/bin:$PATH"
}
@ -198,7 +178,7 @@ function install_torchrec_and_fbgemm() {
function clone_pytorch_xla() {
if [[ ! -d ./xla ]]; then
git clone --recursive --quiet https://github.com/pytorch/xla.git
git clone --recursive -b r2.3 https://github.com/pytorch/xla.git
pushd xla
# pin the xla hash so that we don't get broken by changes to xla
git checkout "$(cat ../.github/ci_commit_pins/xla.txt)"
@ -208,6 +188,28 @@ function clone_pytorch_xla() {
fi
}
function checkout_install_torchdeploy() {
local commit
commit=$(get_pinned_commit multipy)
pushd ..
git clone --recurse-submodules https://github.com/pytorch/multipy.git
pushd multipy
git checkout "${commit}"
python multipy/runtime/example/generate_examples.py
BUILD_CUDA_TESTS=1 pip install -e .
popd
popd
}
function test_torch_deploy(){
pushd ..
pushd multipy
./multipy/runtime/build/test_deploy
./multipy/runtime/build/test_deploy_gpu
popd
popd
}
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
@ -222,8 +224,6 @@ function checkout_install_torchbench() {
# to install and test other models
python install.py --continue_on_fail
fi
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}

View File

@ -6,7 +6,6 @@ from cryptography.hazmat.primitives import hashes, serialization
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.x509.oid import NameOID
temp_dir = mkdtemp()
print(temp_dir)

View File

@ -6,4 +6,4 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
echo "Testing pytorch docs"
cd docs
TERM=vt100 make doctest
make doctest

View File

@ -1,37 +0,0 @@
#!/bin/bash
# Script for installing sccache on the xla build job, which uses xla's docker
# image and doesn't have sccache installed on it. This is mostly copied from
# .ci/docker/install_cache.sh. Changes are: removing checks that will always
# return the same thing (e.g. checks for rocm and CUDA), changing the path
# where sccache is installed, and not changing /etc/environment.
set -ex
install_binary() {
echo "Downloading sccache binary from S3 repo"
curl --retry 3 https://s3.amazonaws.com/ossci-linux/sccache -o /tmp/cache/bin/sccache
}
mkdir -p /tmp/cache/bin
mkdir -p /tmp/cache/lib
export PATH="/tmp/cache/bin:$PATH"
install_binary
chmod a+x /tmp/cache/bin/sccache
function write_sccache_stub() {
# Unset LD_PRELOAD for ps because of asan + ps issues
# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90589
# shellcheck disable=SC2086
# shellcheck disable=SC2059
printf "#!/bin/sh\nif [ \$(env -u LD_PRELOAD ps -p \$PPID -o comm=) != sccache ]; then\n exec sccache $(which $1) \"\$@\"\nelse\n exec $(which $1) \"\$@\"\nfi" > "/tmp/cache/bin/$1"
chmod a+x "/tmp/cache/bin/$1"
}
write_sccache_stub cc
write_sccache_stub c++
write_sccache_stub gcc
write_sccache_stub g++
write_sccache_stub clang
write_sccache_stub clang++
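The printf above is compact but hard to read; unrolled, the stub it writes for, say, gcc would look roughly like this (a sketch assuming `which gcc` resolves to /usr/bin/gcc):

```bash
#!/bin/sh
# /tmp/cache/bin/gcc -- what write_sccache_stub gcc generates
# env -u LD_PRELOAD works around the asan + ps issue noted above.
if [ $(env -u LD_PRELOAD ps -p $PPID -o comm=) != sccache ]; then
  # Normal call: wrap the real compiler with sccache.
  exec sccache /usr/bin/gcc "$@"
else
  # Re-invoked by sccache itself: call the real compiler directly to avoid recursion.
  exec /usr/bin/gcc "$@"
fi
```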

View File

@ -18,9 +18,7 @@ time python test/run_test.py --verbose -i distributed/test_c10d_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_nccl
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
time python test/run_test.py --verbose -i distributed/test_compute_comm_reordering
time python test/run_test.py --verbose -i distributed/test_store
time python test/run_test.py --verbose -i distributed/test_symmetric_memory
time python test/run_test.py --verbose -i distributed/test_pg_wrapper
time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent
# FSDP tests
@ -47,13 +45,6 @@ time python test/run_test.py --verbose -i distributed/test_device_mesh
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_ddp_2d_parallel
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_fsdp_2d_parallel
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_examples
time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state
# FSDP2 tests
time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh
# Pipelining composability tests
time python test/run_test.py --verbose -i distributed/pipelining/test_composability.py
# Other tests
time python test/run_test.py --verbose -i test_cuda_primary_ctx

View File

@ -3,7 +3,6 @@ import json
import math
import sys
parser = argparse.ArgumentParser()
parser.add_argument(
"--test-name", dest="test_name", action="store", required=True, help="test name"
@ -60,16 +59,16 @@ print("sample mean: ", sample_mean)
print("sample sigma: ", sample_sigma)
if math.isnan(sample_mean):
raise Exception("""Error: sample mean is NaN""") # noqa: TRY002
raise Exception("""Error: sample mean is NaN""")
elif math.isnan(sample_sigma):
raise Exception("""Error: sample sigma is NaN""") # noqa: TRY002
raise Exception("""Error: sample sigma is NaN""")
z_value = (sample_mean - mean) / sigma
print("z-value: ", z_value)
if z_value >= 3:
raise Exception( # noqa: TRY002
raise Exception(
f"""\n
z-value >= 3, there is high chance of perf regression.\n
To reproduce this regression, run

View File

@ -3,7 +3,6 @@ import sys
import numpy
sample_data_list = sys.argv[1:]
sample_data_list = [float(v.strip()) for v in sample_data_list]

View File

@ -1,7 +1,6 @@
import json
import sys
data_file_path = sys.argv[1]
commit_hash = sys.argv[2]

View File

@ -1,6 +1,5 @@
import sys
log_file_path = sys.argv[1]
with open(log_file_path) as f:

View File

@ -26,8 +26,8 @@ echo "error: python_doc_push_script.sh: version (arg2) not specified"
fi
# Argument 1: Where to copy the built documentation to
# (pytorch_docs/$install_path)
install_path="${1:-${DOCS_INSTALL_PATH:-${DOCS_VERSION}}}"
# (pytorch.github.io/$install_path)
install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}"
if [ -z "$install_path" ]; then
echo "error: python_doc_push_script.sh: install_path (arg1) not specified"
exit 1
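The single-line default chain packs three fallbacks into one nested parameter expansion: an explicit first argument wins, then DOCS_INSTALL_PATH, then a default derived from DOCS_VERSION. A minimal sketch of how the docs/-prefixed variant resolves, using hypothetical values:

```bash
resolve_install_path() {
  # mirrors: install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}"
  echo "${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}"
}

DOCS_VERSION=2.3
unset DOCS_INSTALL_PATH
resolve_install_path            # -> docs/2.3 (falls through to the DOCS_VERSION default)
DOCS_INSTALL_PATH=main
resolve_install_path            # -> main     (the environment override wins over the default)
resolve_install_path nightly    # -> nightly  (an explicit argument wins over everything)
```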
@ -68,8 +68,8 @@ build_docs () {
}
git clone https://github.com/pytorch/docs pytorch_docs -b "$branch" --depth 1
pushd pytorch_docs
git clone https://github.com/pytorch/pytorch.github.io -b "$branch" --depth 1
pushd pytorch.github.io
export LC_ALL=C
export PATH=/opt/conda/bin:$PATH
@ -105,7 +105,6 @@ if [ "$is_main_doc" = true ]; then
echo undocumented objects found:
cat build/coverage/python.txt
echo "Make sure you've updated relevant .rsts in docs/source!"
echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
exit 1
fi
else

View File

@ -6,27 +6,6 @@
set -ex
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
# Do not change workspace permissions for ROCm CI jobs
# as it can leave workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
cleanup_workspace() {
echo "sudo may print the following warning message that can be ignored. The chown command will still run."
echo " sudo: setrlimit(RLIMIT_STACK): Operation not permitted"
echo "For more details refer to https://github.com/sudo-project/sudo/issues/42"
sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace
}
# Disable shellcheck SC2064 as we want to parse the original owner immediately.
# shellcheck disable=SC2064
trap_add cleanup_workspace EXIT
sudo chown -R jenkins /var/lib/jenkins/workspace
git config --global --add safe.directory /var/lib/jenkins/workspace
fi
echo "Environment variables:"
env
@ -111,6 +90,9 @@ if [[ -n $TESTS_TO_INCLUDE ]]; then
INCLUDE_CLAUSE="--include $TESTS_TO_INCLUDE"
fi
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
echo "Environment variables"
env
@ -181,11 +163,6 @@ if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then
export PATH="$HOME/.local/bin:$PATH"
fi
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
# TODO: revisit this once the CI is stabilized on aarch64 linux
export VALGRIND=OFF
fi
install_tlparse
# DANGER WILL ROBINSON. The LD_PRELOAD here could cause you problems
@ -234,6 +211,8 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then
export LD_PRELOAD=/usr/lib/llvm-15/lib/clang/15.0.7/lib/linux/libclang_rt.asan-x86_64.so
# Disable valgrind for asan
export VALGRIND=OFF
# Increase stack size, because ASAN red zones use more stack
ulimit -s 81920
(cd test && python -c "import torch; print(torch.__version__, torch.version.git_version)")
echo "The next four invocations are expected to crash; if they don't that means ASAN/UBSAN is misconfigured"
@ -249,7 +228,9 @@ fi
# This tests that the debug asserts are working correctly.
if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
echo "We are in debug mode: $BUILD_ENVIRONMENT. Expect the python assertion to fail"
(cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_debug_asserts_fail(424242)")
# TODO: Enable the check after we set up the build to run debug asserts without having
# to do a full (and slow) debug build
# (cd test && ! get_exit_code python -c "import torch; torch._C._crash_if_debug_asserts_fail(424242)")
elif [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
# Noop when debug is disabled. Skip bazel jobs because torch isn't available there yet.
echo "We are not in debug mode: $BUILD_ENVIRONMENT. Expect the assertion to pass"
@ -275,9 +256,6 @@ test_python_shard() {
# Bare --include flag is not supported and quoting for lint ends up with flag not being interpreted correctly
# shellcheck disable=SC2086
# modify LD_LIBRARY_PATH to ensure it has the conda env.
# This set of tests has been shown to be buggy without it for the split-build
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION
assert_git_not_dirty
@ -311,25 +289,19 @@ test_dynamo_shard() {
test_inductor_distributed() {
# Smuggle a few multi-gpu tests here so that we don't have to request another large node
echo "Testing multi_gpu tests in test_torchinductor"
python test/run_test.py -i inductor/test_torchinductor.py -k test_multi_gpu --verbose
python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_cuda_device --verbose
python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose
python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose
python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose
python test/run_test.py -i distributed/tensor/parallel/test_fsdp_2d_parallel.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_state_dict.py -k test_dp_state_dict_save_load --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose
python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_clip_grad_norm_.py -k test_clip_grad_norm_2d --verbose
python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose
pytest test/inductor/test_torchinductor.py -k test_multi_gpu
pytest test/inductor/test_aot_inductor.py -k test_non_default_cuda_device
pytest test/inductor/test_aot_inductor.py -k test_replicate_on_devices
pytest test/distributed/test_c10d_functional_native.py
pytest test/distributed/_tensor/test_dtensor_compile.py
pytest test/distributed/tensor/parallel/test_fsdp_2d_parallel.py
pytest test/distributed/_composable/fsdp/test_fully_shard_comm.py
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp
pytest test/distributed/_composable/fsdp/test_fully_shard_frozen.py
pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype
pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype
# this runs on both single-gpu and multi-gpu instances. It should be smart about skipping tests that aren't supported
# if the required number of GPUs isn't available
@ -337,50 +309,26 @@ test_inductor_distributed() {
assert_git_not_dirty
}
test_inductor_shard() {
if [[ -z "$NUM_TEST_SHARDS" ]]; then
echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
exit 1
fi
test_inductor() {
python tools/dynamo/verify_dynamo.py
python test/run_test.py --inductor \
--include test_modules test_ops test_ops_gradients test_torch \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
python test/run_test.py --inductor --include test_modules test_ops test_ops_gradients test_torch --verbose
# Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state
python test/run_test.py \
--include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
}
python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo --verbose
test_inductor_aoti() {
# docker build uses bdist_wheel which does not work with test_aot_inductor
# TODO: need a faster way to build
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aot_inductor
fi
}
test_inductor_cpp_wrapper_abi_compatible() {
export TORCHINDUCTOR_ABI_COMPATIBLE=1
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1"
# cpu stack allocation causes segfault and needs more investigation
PYTORCH_TESTING_DEVICE_ONLY_FOR="" python test/run_test.py --include inductor/test_cpu_cpp_wrapper
TORCHINDUCTOR_STACK_ALLOCATION=0 python test/run_test.py --include inductor/test_cpu_cpp_wrapper
python test/run_test.py --include inductor/test_cuda_cpp_wrapper
TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
--training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
--output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_timm_training.csv"
}
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@ -405,7 +353,7 @@ if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--dynamic-shapes --dynamic-batch-only)
fi
if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
else
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
@ -484,17 +432,6 @@ test_perf_for_dashboard() {
"${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \
--output "$TEST_REPORTS_DIR/${backend}_max_autotune_${suite}_${dtype}_${mode}_cuda_${target}.csv"
fi
if [[ "$DASHBOARD_TAG" == *cudagraphs_low_precision-true* ]] && [[ "$mode" == "inference" ]]; then
# TODO: This has a new dtype called quant and the benchmarks script needs to be updated to support this.
# The tentative command is as follows. It doesn't work now, but it's ok because we only need mock data
# to fill the dashboard.
python "benchmarks/dynamo/$suite.py" \
"${target_flag[@]}" --"$mode" --quant --backend "$backend" "$@" \
--output "$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_cuda_${target}.csv" || true
# Copy cudagraph results as mock data, easiest choice?
cp "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_cuda_${target}.csv" \
"$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_cuda_${target}.csv"
fi
done
done
}
@ -530,10 +467,9 @@ test_single_dynamo_benchmark() {
test_perf_for_dashboard "$suite" \
"${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}"
else
if [[ "${TEST_CONFIG}" == *aot_inductor* && "${TEST_CONFIG}" != *cpu_aot_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
# Test AOTInductor with the ABI-compatible mode on CI
# This can be removed once the ABI-compatible mode becomes default.
# For CPU devices, we prefer the non ABI-compatible mode on CI when testing AOTInductor.
export TORCHINDUCTOR_ABI_COMPATIBLE=1
fi
python "benchmarks/dynamo/$suite.py" \
@ -550,16 +486,6 @@ test_single_dynamo_benchmark() {
fi
}
test_inductor_micro_benchmark() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
}
test_inductor_halide() {
python test/run_test.py --include inductor/test_halide.py --verbose
assert_git_not_dirty
}
test_dynamo_benchmark() {
# Usage: test_dynamo_benchmark huggingface 0
TEST_REPORTS_DIR=$(pwd)/test/test-reports
@ -574,16 +500,8 @@ test_dynamo_benchmark() {
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
local dt="float32"
if [[ "${TEST_CONFIG}" == *amp* ]]; then
dt="amp"
fi
if [[ "${TEST_CONFIG}" == *freezing* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "$@"
fi
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
else
@ -597,16 +515,12 @@ test_inductor_torchbench_smoketest_perf() {
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
# Test some models in the cpp wrapper mode
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
TORCHINDUCTOR_ABI_COMPATIBLE=1 TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
--bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
# smoke test the cpp_wrapper mode
TORCHINDUCTOR_CPP_WRAPPER=1 python benchmarks/dynamo/torchbench.py --device cuda --accuracy --bfloat16 \
--inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_smoketest.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_smoketest.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
@ -621,8 +535,7 @@ test_inductor_torchbench_smoketest_perf() {
# https://github.com/pytorch/pytorch/actions/runs/7158691360/job/19491437314,
# and thus we lower its threshold to reduce flakiness. If this continues to be a problem,
# we switch to use some other model.
# lowering threshold from 4.9 to 4.7 for cu124. Will bump it up after cuda 12.4.0->12.4.1 update
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.7
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.9
# Check memory compression ratio for a few models
for test in hf_Albert timm_vision_transformer; do
@ -634,15 +547,6 @@ test_inductor_torchbench_smoketest_perf() {
"$TEST_REPORTS_DIR/inductor_training_smoketest_$test.csv" \
--expected benchmarks/dynamo/expected_ci_perf_inductor_torchbench.csv
done
# Perform some "warm-start" runs for a few huggingface models.
for test in AlbertForQuestionAnswering AllenaiLongformerBase DistilBertForMaskedLM DistillGPT2 GoogleFnet YituTechConvBert; do
python benchmarks/dynamo/huggingface.py --accuracy --training --amp --inductor --device cuda --warm-start-latency \
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv"
done
}
test_inductor_torchbench_cpu_smoketest_perf(){
@ -689,12 +593,6 @@ test_inductor_torchbench_cpu_smoketest_perf(){
done
}
test_torchbench_gcp_smoketest(){
pushd "${TORCHBENCHPATH}"
python test.py -v
popd
}
test_python_gloo_with_tls() {
source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh"
assert_git_not_dirty
@ -726,6 +624,7 @@ test_aten() {
${SUDO} ln -sf "$TORCH_LIB_DIR"/libmkldnn* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libnccl* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libtorch* "$TEST_BASE_DIR"
${SUDO} ln -sf "$TORCH_LIB_DIR"/libtbb* "$TEST_BASE_DIR"
ls "$TEST_BASE_DIR"
aten/tools/run_tests.sh "$TEST_BASE_DIR"
@ -750,6 +649,21 @@ test_without_numpy() {
popd
}
# PyTorch extensions require including torch/extension.h, which includes all.h,
# which includes utils.h, which includes Parallel.h.
# So you can call, for instance, parallel_for() from your extension,
# but the compilation will fail because Parallel.h only has declarations;
# the definitions are conditionally included (see the last lines of Parallel.h).
# I tried to solve this in #39612 and #39881 by including Config.h into Parallel.h,
# but if PyTorch is built with TBB it provides a Config.h
# that has AT_PARALLEL_NATIVE_TBB=1 (see #3961 or #39881), which means that if you include
# torch/extension.h, which transitively includes Parallel.h,
# it transitively includes tbb.h, which is not available!
if [[ "${BUILD_ENVIRONMENT}" == *tbb* ]]; then
sudo mkdir -p /usr/include/tbb
sudo cp -r "$PWD"/third_party/tbb/include/tbb/* /usr/include/tbb
fi
test_libtorch() {
local SHARD="$1"
@ -763,6 +677,7 @@ test_libtorch() {
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libshm* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libnvfuser* "$TORCH_BIN_DIR"
export CPP_TESTS_DIR="${TORCH_BIN_DIR}"
@ -899,6 +814,7 @@ test_rpc() {
# test reporting process to function as expected.
ln -sf "$TORCH_LIB_DIR"/libtorch* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libc10* "$TORCH_BIN_DIR"
ln -sf "$TORCH_LIB_DIR"/libtbb* "$TORCH_BIN_DIR"
CPP_TESTS_DIR="${TORCH_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_cpp_rpc
}
@ -1178,21 +1094,15 @@ test_executorch() {
pushd /executorch
export PYTHON_EXECUTABLE=python
export EXECUTORCH_BUILD_PYBIND=ON
export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
# from the PR
# NB: We need to build ExecuTorch runner here and not inside the Docker image
# because it depends on PyTorch
# shellcheck disable=SC1091
source .ci/scripts/setup-linux.sh cmake
echo "Run ExecuTorch unit tests"
pytest -v -n auto
# shellcheck disable=SC1091
LLVM_PROFDATA=llvm-profdata-12 LLVM_COV=llvm-cov-12 bash test/run_oss_cpp_tests.sh
source .ci/scripts/utils.sh
build_executorch_runner "cmake"
echo "Run ExecuTorch regression tests for some models"
# NB: This is a sample model, more can be added here
export PYTHON_EXECUTABLE=python
# TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
# shellcheck disable=SC1091
source .ci/scripts/test.sh mv3 cmake xnnpack-quantization-delegation ''
@ -1206,33 +1116,11 @@ test_executorch() {
assert_git_not_dirty
}
test_linux_aarch64(){
python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \
test_transformers test_multiprocessing test_numpy_interop --verbose
# Dynamo tests
python test/run_test.py --include dynamo/test_compile dynamo/test_backends dynamo/test_comptime dynamo/test_config \
dynamo/test_functions dynamo/test_fx_passes_pre_grad dynamo/test_interop dynamo/test_model_output dynamo/test_modules \
dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles --verbose
# Inductor tests
python test/run_test.py --include inductor/test_torchinductor inductor/test_benchmark_fusion inductor/test_codecache \
inductor/test_config inductor/test_control_flow inductor/test_coordinate_descent_tuner inductor/test_fx_fusion \
inductor/test_group_batch_fusion inductor/test_inductor_freezing inductor/test_inductor_utils \
inductor/test_inplacing_pass inductor/test_kernel_benchmark inductor/test_layout_optim \
inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \
inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \
inductor/test_split_cat_fx_passes inductor/test_standalone_compile inductor/test_torchinductor \
inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes --verbose
}
if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
(cd test && python -c "import torch; print(torch.__config__.show())")
(cd test && python -c "import torch; print(torch.__config__.parallel_info())")
fi
if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then
test_linux_aarch64
elif [[ "${TEST_CONFIG}" == *backward* ]]; then
if [[ "${TEST_CONFIG}" == *backward* ]]; then
test_forward_backward_compatibility
# Do NOT add tests after bc check tests, see its comment.
elif [[ "${TEST_CONFIG}" == *xla* ]]; then
@ -1252,12 +1140,11 @@ elif [[ "$TEST_CONFIG" == distributed ]]; then
if [[ "${SHARD_NUMBER}" == 1 ]]; then
test_rpc
fi
elif [[ "$TEST_CONFIG" == deploy ]]; then
checkout_install_torchdeploy
test_torch_deploy
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
test_inductor_halide
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
install_torchvision
id=$((SHARD_NUMBER-1))
@ -1267,14 +1154,13 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
id=$((SHARD_NUMBER-1))
test_dynamo_benchmark timm_models "$id"
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
install_torchaudio cpu
else
install_torchaudio cuda
fi
install_torchtext
install_torchvision
TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install git+https://github.com/pytorch/ao.git
id=$((SHARD_NUMBER-1))
# https://github.com/opencv/opencv-python/issues/885
pip_install opencv-python==4.8.0.74
@ -1286,14 +1172,11 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
shufflenet_v2_x1_0 hf_GPT2
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then
checkout_install_torchbench
TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest
else
checkout_install_torchbench
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu_inductor* && "${TEST_CONFIG}" != *cpu_aot_inductor* ]]; then
if [[ "${TEST_CONFIG}" != *cpu_inductor* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
@ -1301,23 +1184,17 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper_abi_compatible* ]]; then
install_torchvision
test_inductor_cpp_wrapper_abi_compatible
elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
install_torchvision
test_inductor_shard "${SHARD_NUMBER}"
if [[ "${SHARD_NUMBER}" == 1 ]]; then
test_inductor_aoti
test_inductor_distributed
fi
elif [[ "${TEST_CONFIG}" == *dynamo* ]]; then
test_inductor
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
test_dynamo_shard 1
test_aten
elif [[ "${TEST_CONFIG}" == *dynamo* && $SHARD_NUMBER -gt 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
test_dynamo_shard "${SHARD_NUMBER}"
if [[ "${SHARD_NUMBER}" == 1 ]]; then
test_aten
fi
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
install_torchvision
test_python_shard "$SHARD_NUMBER"
test_aten
elif [[ "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
test_without_numpy
install_torchvision
@ -1347,6 +1224,10 @@ elif [[ "${BUILD_ENVIRONMENT}" == *-mobile-lightweight-dispatch* ]]; then
test_libtorch
elif [[ "${TEST_CONFIG}" = docs_test ]]; then
test_docs_test
elif [[ "${BUILD_ENVIRONMENT}" == *rocm* && -n "$TESTS_TO_INCLUDE" ]]; then
install_torchvision
test_python
test_aten
elif [[ "${BUILD_ENVIRONMENT}" == *xpu* ]]; then
install_torchvision
test_python

View File

@ -17,22 +17,22 @@ set PATH=C:\Program Files\CMake\bin;C:\Program Files\7-Zip;C:\ProgramData\chocol
set INSTALLER_DIR=%SCRIPT_HELPERS_DIR%\installation-helpers
call %INSTALLER_DIR%\install_magma.bat
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
call %INSTALLER_DIR%\install_sccache.bat
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
:: Miniconda has been installed as part of the Windows AMI with all the dependencies.
:: We just need to activate it here
call %INSTALLER_DIR%\activate_miniconda3.bat
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
:: Override VS env here
pushd .
@ -41,8 +41,8 @@ if "%VC_VERSION%" == "" (
) else (
call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64 -vcvars_ver=%VC_VERSION%
)
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
@echo on
popd
@ -52,12 +52,12 @@ set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION%
if x%CUDA_VERSION:.=%==x%CUDA_VERSION% (
echo CUDA version %CUDA_VERSION% format isn't correct, which doesn't contain '.'
goto fail
exit /b 1
)
rem version transformer, for example 10.1 to 10_1.
if x%CUDA_VERSION:.=%==x%CUDA_VERSION% (
echo CUDA version %CUDA_VERSION% format isn't correct, which doesn't contain '.'
goto fail
exit /b 1
)
set VERSION_SUFFIX=%CUDA_VERSION:.=_%
set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH%
@ -101,8 +101,8 @@ if "%USE_CUDA%"=="1" (
:: CMake requires a single command as CUDA_NVCC_EXECUTABLE, so we push the wrappers
:: randomtemp.exe and sccache.exe into a batch file which CMake invokes.
curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.4/randomtemp.exe --output %TMP_DIR_WIN%\bin\randomtemp.exe
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
echo @"%TMP_DIR_WIN%\bin\randomtemp.exe" "%TMP_DIR_WIN%\bin\sccache.exe" "%CUDA_PATH%\bin\nvcc.exe" %%* > "%TMP_DIR%/bin/nvcc.bat"
cat %TMP_DIR%/bin/nvcc.bat
set CUDA_NVCC_EXECUTABLE=%TMP_DIR%/bin/nvcc.bat
@ -114,8 +114,8 @@ if "%USE_CUDA%"=="1" (
set
python setup.py bdist_wheel
if errorlevel 1 goto fail
if not errorlevel 0 goto fail
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
sccache --show-stats
python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
(
@ -135,8 +135,3 @@ python -c "import os, glob; os.system('python -mpip install --no-index --no-deps
sccache --show-stats --stats-format json | jq .stats > sccache-stats-%BUILD_ENVIRONMENT%-%OUR_GITHUB_JOB_ID%.json
sccache --stop-server
exit /b 0
:fail
exit /b 1

View File

@ -4,7 +4,6 @@ import os
import subprocess
import sys
COMMON_TESTS = [
(
"Checking that torch is available",

View File

@ -5,7 +5,6 @@ import sys
import yaml
# Need to import modules that lie on an upward-relative path
sys.path.append(os.path.join(sys.path[0], ".."))

View File

@ -46,18 +46,13 @@ if [[ "\$python_nodot" = *310* ]]; then
PROTOBUF_PACKAGE="protobuf>=3.19.0"
fi
if [[ "\$python_nodot" = *39* ]]; then
if [[ "\$python_nodot" = *39* ]]; then
# There's an issue with conda channel priority where it'll randomly pick 1.19 over 1.20
# we set a lower boundary here just to be safe
NUMPY_PIN=">=1.20"
fi
if [[ "\$python_nodot" = *38* ]]; then
# sympy 1.12.1 is the last version that supports Python 3.8
SYMPY_PIN="==1.12.1"
else
SYMPY_PIN=">=1.13.0"
fi
# Move debug wheels out of the package dir so they don't get installed
mkdir -p /tmp/debug_final_pkgs
@ -88,7 +83,7 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
"numpy\${NUMPY_PIN}" \
mkl>=2018 \
ninja \
"sympy\${SYMPY_PIN}" \
sympy \
typing-extensions \
${PROTOBUF_PACKAGE}
if [[ "$DESIRED_CUDA" == 'cpu' ]]; then
@ -101,21 +96,8 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline
)
elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)"
pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)"
# todo: after folder is populated use the pypi_pkg channel instead
pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg"
retry pip install -q numpy protobuf typing-extensions
else
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
fi
else
pip install "\$pkg"
retry pip install -q numpy protobuf typing-extensions
fi
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
fi
if [[ "$PACKAGE_TYPE" == libtorch ]]; then
pkg="\$(ls /final_pkgs/*-latest.zip)"
@ -123,18 +105,9 @@ if [[ "$PACKAGE_TYPE" == libtorch ]]; then
cd /tmp/libtorch
fi
if [[ "$GPU_ARCH_TYPE" == xpu ]]; then
# Workaround for __mkl_tmp_MOD unbound variable issue, refer https://github.com/pytorch/pytorch/issues/130543
set +u
source /opt/intel/oneapi/pytorch-gpu-dev-0.5/oneapi-vars.sh
fi
# Test the package
/builder/check_binary.sh
# Clean temp files
cd /builder && git clean -ffdx
# =================== The above code will be executed inside Docker container ===================
EOL
echo

View File

@ -33,9 +33,9 @@ if [[ -z "$DOCKER_IMAGE" ]]; then
if [[ "$PACKAGE_TYPE" == conda ]]; then
export DOCKER_IMAGE="pytorch/conda-cuda"
elif [[ "$DESIRED_CUDA" == cpu ]]; then
export DOCKER_IMAGE="pytorch/manylinux:cpu"
export DOCKER_IMAGE="pytorch/manylinux-cpu"
else
export DOCKER_IMAGE="pytorch/manylinux-builder:${DESIRED_CUDA:2}"
export DOCKER_IMAGE="pytorch/manylinux-cuda${DESIRED_CUDA:2}"
fi
fi
@ -75,10 +75,10 @@ export PYTORCH_BUILD_NUMBER=1
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for all the wheel builds, hence append TRITON_CONSTRAINT
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
# Triton wheels are only supported on Linux for Python < 3.13
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
# Triton wheels are only supported on Linux for Python < 3.12
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.12'"
TRITON_REQUIREMENT="pytorch-triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
TRITON_REQUIREMENT="pytorch-triton==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
@ -87,11 +87,11 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:
fi
# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt)
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}"
fi
if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
@ -100,18 +100,30 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
fi
fi
# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton xpu package
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*xpu.* && $(uname) == "Linux" ]]; then
TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-xpu.txt)
TRITON_REQUIREMENT="pytorch-triton-xpu==${TRITON_VERSION}+${TRITON_SHORTHASH}"
fi
if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
else
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}"
JAVA_HOME=
BUILD_JNI=OFF
if [[ "$PACKAGE_TYPE" == libtorch ]]; then
POSSIBLE_JAVA_HOMES=()
POSSIBLE_JAVA_HOMES+=(/usr/local)
POSSIBLE_JAVA_HOMES+=(/usr/lib/jvm/java-8-openjdk-amd64)
POSSIBLE_JAVA_HOMES+=(/Library/Java/JavaVirtualMachines/*.jdk/Contents/Home)
# Add the Windows-specific JNI path
POSSIBLE_JAVA_HOMES+=("$PWD/pytorch/.circleci/windows-jni/")
for JH in "${POSSIBLE_JAVA_HOMES[@]}" ; do
if [[ -e "$JH/include/jni.h" ]] ; then
# Skip if we're not on Windows but haven't found a JAVA_HOME
if [[ "$JH" == "$PWD/pytorch/.circleci/windows-jni/" && "$OSTYPE" != "msys" ]] ; then
break
fi
echo "Found jni.h under $JH"
JAVA_HOME="$JH"
BUILD_JNI=ON
break
fi
done
if [ -z "$JAVA_HOME" ]; then
echo "Did not find jni.h"
fi
fi
cat >"$envfile" <<EOL
@ -124,7 +136,6 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}"
export DESIRED_CUDA="$DESIRED_CUDA"
export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}"
if [[ "${OSTYPE}" == "msys" ]]; then
export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then
@ -148,6 +159,8 @@ export TORCH_CONDA_BUILD_FOLDER='pytorch-nightly'
export ANACONDA_USER='pytorch'
export USE_FBGEMM=1
export JAVA_HOME=$JAVA_HOME
export BUILD_JNI=$BUILD_JNI
export PIP_UPLOAD_FOLDER="$PIP_UPLOAD_FOLDER"
export DOCKER_IMAGE="$DOCKER_IMAGE"

View File

@ -25,15 +25,6 @@ if [[ "${DRY_RUN}" = "disabled" ]]; then
AWS_S3_CP="aws s3 cp"
fi
if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg"
fi
# this is a special build with all dependencies packaged
if [[ ${BUILD_NAME} == *-full* ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_full"
fi
# Sleep 5 minutes between retries for conda upload
retry () {
"$@" || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@")

View File

@ -8,7 +8,6 @@ import time
import requests
AZURE_PIPELINE_BASE_URL = "https://aiinfra.visualstudio.com/PyTorch/"
AZURE_DEVOPS_PAT_BASE64 = os.environ.get("AZURE_DEVOPS_PAT_BASE64_SECRET", "")
PIPELINE_ID = "911"

View File

@ -36,7 +36,6 @@ hicpp-exception-baseclass,
hicpp-avoid-goto,
misc-*,
-misc-const-correctness,
-misc-include-cleaner,
-misc-use-anonymous-namespace,
-misc-unused-parameters,
-misc-no-recursion,
@ -61,7 +60,6 @@ readability-simplify-subscript-expr,
readability-string-compare,
'
HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
AnalyzeTemporaryDtors: false
WarningsAsErrors: '*'
CheckOptions:
misc-header-include-cycle.IgnoredFilesList: 'format.h;ivalue.h;custom_class.h;Dict.h;List.h'
...

View File

@ -2,7 +2,7 @@
# NOTE: **Mirror any changes** to this file in the [tool.ruff] config in pyproject.toml
# before we can fully move to use ruff
enable-extensions = G
select = B,C,E,F,G,P,SIM1,SIM911,T4,W,B9,TOR0,TOR1,TOR2,TOR9
select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2,TOR9
max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
@ -54,7 +54,6 @@ per-file-ignores =
torch/ao/quantization/fx/_decomposed.py: TOR901
torch/distributed/_functional_collectives.py: TOR901
torch/distributed/_spmd/data_parallel.py: TOR901
torch/distributed/_tensor/_collective_utils.py: TOR901
optional-ascii-coding = True
exclude =
./.git,

View File

@ -40,7 +40,3 @@ e6ec0efaf87703c5f889cfc20b29be455885d58d
a53cda1ddc15336dc1ff0ce1eff2a49cdc5f882e
# 2024-01-02 clangformat: fused adam #116583
9dc68d1aa9e554d09344a10fff69f7b50b2d23a0
# 2024-06-28 enable UFMT in `torch/storage.py`
d80939e5e9337e8078f11489afefec59fd42f93b
# 2024-06-28 enable UFMT in `torch.utils.data`
7cf0b90e49689d45be91aa539fdf54cf2ea8a9a3

1
.gitattributes vendored
View File

@ -4,4 +4,3 @@
.github/generated-* linguist-generated=true
.github/scripts/gql_mocks.json linguist-generated=true
third_party/LICENSES_BUNDLED.txt linguist-generated=true
tools/build/bazel/requirements.txt linguist-generated=true

View File

@ -8,18 +8,7 @@ body:
value: >
#### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the
existing and past issues](https://github.com/pytorch/pytorch/issues)
It's likely that your bug will be resolved by checking our FAQ or troubleshooting guide [documentation](https://pytorch.org/docs/main/dynamo/index.html)
Note: if you're submitting an issue that you generated from a fuzzer, please do the following:
- Ensure rtol/atol are at default tolerances
- Don't compare indices of max/min etc., because that avoids the above requirement
- If comparing eager and torch.compile at fp16/bf16, you should use fp32 as baseline
If the above requirements are met, add the label "topic: fuzzer" to your issue.
It's likely that your bug will be resolved by checking our FAQ or troubleshooting guide [documentation](https://pytorch.org/docs/master/dynamo/index.html)
- type: textarea
attributes:
label: 🐛 Describe the bug
@ -44,7 +33,7 @@ body:
label: Minified repro
description: |
Please run the minifier on your example and paste the minified code below
Learn more here https://pytorch.org/docs/main/torch.compiler_troubleshooting.html
Learn more here https://pytorch.org/docs/master/compile/troubleshooting.html
placeholder: |
env TORCHDYNAMO_REPRO_AFTER="aot" python your_model.py
or

View File

@ -1,12 +1,9 @@
self-hosted-runner:
labels:
# GitHub hosted x86 Linux runners
- linux.20_04.4x
- linux.20_04.16x
# Repo-specific LF hosted ARC runners
- linux.large.arc
# Organization-wide AWS Linux Runners
- linux.large
- linux.large.arc
- linux.2xlarge
- linux.4xlarge
- linux.12xlarge
@ -16,36 +13,16 @@ self-hosted-runner:
- linux.8xlarge.nvidia.gpu
- linux.16xlarge.nvidia.gpu
- linux.g5.4xlarge.nvidia.gpu
# Organization-wide AWS Linux Runners on Linux Foundation account
- lf.linux.large
- lf.linux.2xlarge
- lf.linux.4xlarge
- lf.linux.12xlarge
- lf.linux.24xlarge
- lf.linux.arm64.2xlarge
- lf.linux.4xlarge.nvidia.gpu
- lf.linux.8xlarge.nvidia.gpu
- lf.linux.16xlarge.nvidia.gpu
- lf.linux.g5.4xlarge.nvidia.gpu
# Repo-specific IBM hosted S390x runner
- linux.s390x
# Organization wide AWS Windows runners
- windows.4xlarge.nonephemeral
- windows.8xlarge.nvidia.gpu
- windows.8xlarge.nvidia.gpu.nonephemeral
- windows.g5.4xlarge.nvidia.gpu
# Organization-wide AMD hosted MI300 runners
- bm-runner
- linux.rocm.gpu
# Repo-specific Apple hosted runners
- macos-m1-ultra
- macos-m2-14
# Organization-wide AWS `mac2.metal` runners (2020 Mac mini hardware powered by Apple silicon M1 processors)
- macos-m1-stable
- macos-m1-13
- macos-m1-14
# GitHub-hosted MacOS runners
- macos-12-xl
- macos-12
- macos12.3-m1
- macos-latest-xlarge
- macos-13-xlarge
- macos-14-xlarge
# Organization-wide Intel hosted XPU runners
- linux.idc.xpu

View File

@ -14,14 +14,12 @@ runs:
- name: Cleans up diskspace
shell: bash
run: |
set -ex
diskspace_cutoff=${{ inputs.diskspace-cutoff }}
docker_root_dir=$(docker info -f '{{.DockerRootDir}}')
diskspace=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
diskspace=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
if [[ "$diskspace" -ge "$diskspace_cutoff" ]] ; then
docker system prune -af
diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
diskspace_new=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
echo "$msg"

View File

@ -9,10 +9,6 @@ inputs:
use-gha:
description: If set to any value, use GHA to download the artifact. Otherwise use s3.
required: false
s3-bucket:
description: S3 bucket to download builds
required: false
default: "gha-artifacts"
runs:
using: composite
@ -22,10 +18,9 @@ runs:
uses: seemethere/download-artifact-s3@v4
with:
name: ${{ inputs.name }}
s3-bucket: ${{ inputs.s3-bucket }}
- name: Download PyTorch Build Artifacts from GHA
if: ${{ inputs.use-gha }}
if: inputs.use-gha
uses: actions/download-artifact@v3
with:
name: ${{ inputs.name }}
@ -34,10 +29,6 @@ runs:
shell: bash
run: unzip -o artifacts.zip
- name: Remove artifacts.zip
shell: bash
run: rm artifacts.zip
- name: Output disk space left
shell: bash
run: df -H

View File

@ -13,13 +13,6 @@ inputs:
required: true
type: string
description: JSON description of what test configs to run.
selected-test-configs:
required: false
type: string
description: |
A comma-separated list of test configurations from the test matrix to keep.
An empty list means every configuration is kept by default.
default: ""
job-name:
type: string
required: false
@ -47,9 +40,6 @@ outputs:
ci-no-td:
description: True if ci-no-td label was on PR or [ci-no-td] in PR body.
value: ${{ steps.filter.outputs.ci-no-td }}
ci-td-distributed:
description: True if ci-td-distributed label was on PR or [ci-td-distributed] in PR body.
value: ${{ steps.filter.outputs.ci-td-distributed }}
runs:
using: composite
@ -66,8 +56,7 @@ runs:
command: |
set -eux
# PyYAML 6.0 doesn't work with MacOS x86 anymore
# This must run on Python-3.7 (AmazonLinux2) so can't use request=3.32.2
python3 -m pip install requests==2.27.1 pyyaml==6.0.1
python3 -m pip install requests==2.26.0 pyyaml==6.0.1
- name: Parse ref
id: parse-ref
@ -134,7 +123,6 @@ runs:
--workflow "${GITHUB_WORKFLOW}" \
--job-name "${JOB_NAME}" \
--test-matrix "${{ inputs.test-matrix }}" \
--selected-test-configs "${{ inputs.selected-test-configs }}" \
--pr-number "${PR_NUMBER}" \
--tag "${TAG}" \
--event-name "${EVENT_NAME}" \

View File

@ -1,226 +0,0 @@
name: linux-build
inputs:
build-environment:
required: true
description: Top-level label for what's being built/tested.
docker-image-name:
required: true
description: Name of the base docker image to build with.
build-generates-artifacts:
required: false
default: "true"
description: If set, upload generated build artifacts.
build-with-debug:
required: false
default: "false"
description: If set, build in debug mode.
sync-tag:
required: false
default: ""
description: |
If this is set, our linter will use this to make sure that every other
job with the same `sync-tag` is identical.
cuda-arch-list:
required: false
default: "5.2"
description: |
List of CUDA architectures CI build should target.
runner:
required: false
default: "linux.2xlarge"
description: Runner label to select worker type
test-matrix:
required: false
type: string
description: |
An optional JSON description of what test configs to run later on. This
is moved here from the Linux test workflow so that we can apply filter
logic using test-config labels earlier and skip unnecessary builds
s3-bucket:
description: S3 bucket to download artifact
required: false
default: "gha-artifacts"
aws-role-to-assume:
description: role to assume for downloading artifacts
required: false
default: ""
GITHUB_TOKEN:
description: GitHub token
required: true
HUGGING_FACE_HUB_TOKEN:
description: Hugging Face Hub token
required: false
default: ""
use_split_build:
description: |
[Experimental] Build a libtorch-only wheel first, then build pytorch such
that the remaining components are built from the libtorch wheel.
required: false
type: boolean
default: false
outputs:
docker-image:
value: ${{ steps.calculate-docker-image.outputs.docker-image }}
description: The docker image containing the built PyTorch.
test-matrix:
value: ${{ steps.filter.outputs.test-matrix }}
description: An optional JSON description of what test configs to run later on.
runs:
using: composite
steps:
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: configure aws credentials
uses: aws-actions/configure-aws-credentials@v3
if: ${{ inputs.aws-role-to-assume != '' }}
with:
role-to-assume: ${{ inputs.aws-role-to-assume }}
role-session-name: gha-linux-build
role-duration-seconds: 10800
aws-region: us-east-1
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: ${{ inputs.docker-image-name }}
- name: Use following to pull public copy of the image
id: print-ghcr-mirror
env:
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
run: |
tag=${ECR_DOCKER_IMAGE##*/}
echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Parse ref
id: parse-ref
shell: bash
run: .github/scripts/parse_ref.py
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ inputs.GITHUB_TOKEN }}
# Apply the filter logic to the build step too if the test-config label is already there
- name: Select all requested test configurations (if the test matrix is available)
id: filter
uses: ./.github/actions/filter-test-configs
with:
github-token: ${{ inputs.GITHUB_TOKEN }}
test-matrix: ${{ inputs.test-matrix }}
job-name: ${{ steps.get-job-id.outputs.job-name }}
- name: Download pytest cache
uses: ./.github/actions/pytest-cache-download
continue-on-error: true
with:
cache_dir: .pytest_cache
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
s3_bucket: ${{ inputs.s3-bucket }}
- name: Build
if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == ''
id: build
env:
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
# TODO duplicated
AWS_DEFAULT_REGION: us-east-1
PR_NUMBER: ${{ github.event.pull_request.number }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }}
TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
DEBUG: ${{ inputs.build-with-debug == 'true' && '1' || '0' }}
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
shell: bash
run: |
# detached container should get cleaned up by teardown_ec2_linux
container_name=$(docker run \
-e BUILD_ENVIRONMENT \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e AWS_DEFAULT_REGION \
-e PR_NUMBER \
-e SHA1 \
-e BRANCH \
-e SCCACHE_BUCKET \
-e SCCACHE_S3_KEY_PREFIX \
-e XLA_CUDA \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e TORCH_CUDA_ARCH_LIST \
-e PR_LABELS \
-e OUR_GITHUB_JOB_ID \
-e HUGGING_FACE_HUB_TOKEN \
-e USE_SPLIT_BUILD \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--tty \
--detach \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh'
- name: Archive artifacts into zip
if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped'
shell: bash
run: |
zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files
- name: Store PyTorch Build Artifacts on S3
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' && inputs.use_split_build != 'true'
with:
name: ${{ inputs.build-environment }}
retention-days: 14
if-no-files-found: error
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}
- name: Store PyTorch Build Artifacts on S3 for split build
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' && inputs.use_split_build == 'true'
with:
name: ${{ inputs.build-environment }}-experimental-split-build
retention-days: 14
if-no-files-found: error
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}
- name: Upload sccache stats
if: steps.build.outcome != 'skipped'
uses: seemethere/upload-artifact-s3@v5
with:
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 365
if-no-files-found: warn
path: sccache-stats-*.json
s3-bucket: ${{ inputs.s3-bucket }}
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
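
For orientation, a minimal sketch of how a calling workflow might invoke this composite build action. Only the input and output names come from the action definition above; the action path, job layout, and the build-environment / docker-image-name values are illustrative assumptions, not taken from this diff.

jobs:
  build:
    runs-on: linux.2xlarge
    outputs:
      docker-image: ${{ steps.linux-build.outputs.docker-image }}
      test-matrix: ${{ steps.linux-build.outputs.test-matrix }}
    steps:
      # assumed checkout so the repo-local composite action is available
      - uses: actions/checkout@v4
      - name: Build
        id: linux-build
        uses: ./.github/actions/linux-build  # assumed path to the action defined above
        with:
          build-environment: linux-jammy-py3.8-gcc11  # illustrative value
          docker-image-name: pytorch-linux-jammy-py3.8-gcc11  # illustrative value
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}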


@ -1,384 +0,0 @@
name: linux-test
inputs:
build-environment:
required: true
type: string
description: Top-level label for what's being built/tested.
test-matrix:
required: true
type: string
description: JSON description of what test configs to run.
docker-image:
required: true
type: string
description: Docker image to run in.
sync-tag:
required: false
type: string
default: ""
description: |
If this is set, our linter will use this to make sure that every other
job with the same `sync-tag` is identical.
use-gha:
required: false
type: string
default: ""
description: If set to any value, upload to GHA. Otherwise upload to S3.
dashboard-tag:
required: false
type: string
default: ""
s3-bucket:
description: S3 bucket to download artifact
required: false
type: string
default: "gha-artifacts"
aws-role-to-assume:
description: role to assume for downloading artifacts
required: false
type: string
default: ""
HUGGING_FACE_HUB_TOKEN:
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
required: false
default: ""
GITHUB_TOKEN:
description: GitHub token
required: true
#env:
# GIT_DEFAULT_BRANCH: ${{ inputs.default_branch }}
runs:
using: composite
steps:
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: configure aws credentials
if: ${{ inputs.aws-role-to-assume != '' }}
uses: aws-actions/configure-aws-credentials@v3
with:
role-to-assume: ${{ inputs.aws-role-to-assume }}
role-session-name: gha-linux-test
aws-region: us-east-1
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: ${{ inputs.docker-image }}
- name: Use the following to pull a public copy of the image
id: print-ghcr-mirror
env:
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
run: |
tag=${ECR_DOCKER_IMAGE##*/}
echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Check if in an ARC runner
shell: bash
id: check_arc_runner
run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT"
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
- name: Lock NVIDIA A100 40GB Frequency
shell: bash
run: |
sudo nvidia-smi -pm 1
sudo nvidia-smi -ac 1215,1410
nvidia-smi
if: contains(matrix.runner, 'a100')
- name: Start monitoring script
id: monitor-script
shell: bash
continue-on-error: true
run: |
python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84
python3 -m tools.stats.monitor > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
- name: Download build artifacts
uses: ./.github/actions/download-build-artifacts
with:
name: ${{ inputs.build-environment }}
s3-bucket: ${{ inputs.s3-bucket }}
- name: Download TD artifacts
continue-on-error: true
uses: ./.github/actions/download-td-artifacts
- name: Parse ref
id: parse-ref
shell: bash
run: .github/scripts/parse_ref.py
- name: Get workflow job id
id: get-job-id
uses: ./.github/actions/get-workflow-job-id
if: always()
with:
github-token: ${{ inputs.GITHUB_TOKEN }}
- name: Check for keep-going label and re-enabled test issues
# This uses the filter-test-configs action because it conveniently
# checks for labels and re-enabled test issues. It does not actually do
# any filtering. All filtering is done in the build step.
id: keep-going
uses: ./.github/actions/filter-test-configs
with:
github-token: ${{ inputs.GITHUB_TOKEN }}
test-matrix: ${{ inputs.test-matrix }}
job-name: ${{ steps.get-job-id.outputs.job-name }}
- name: Test
id: test
env:
BUILD_ENVIRONMENT: ${{ inputs.build-environment }}
PR_NUMBER: ${{ github.event.pull_request.number }}
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_WORKFLOW: ${{ github.workflow }}
GITHUB_JOB: ${{ github.job }}
GITHUB_RUN_ID: ${{ github.run_id }}
GITHUB_RUN_NUMBER: ${{ github.run_number }}
GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
JOB_NAME: ${{ steps.get-job-id.outputs.job-name }}
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
DOCKER_IMAGE: ${{ inputs.docker-image }}
XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
shell: bash
run: |
set -x
if [[ $TEST_CONFIG == 'multigpu' ]]; then
TEST_COMMAND=.ci/pytorch/multigpu-test.sh
elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then
TEST_COMMAND=.ci/onnx/test.sh
else
TEST_COMMAND=.ci/pytorch/test.sh
fi
# detached container should get cleaned up by teardown_ec2_linux
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG since that doesn't play nice
# shellcheck disable=SC2086,SC2090
container_name=$(docker run \
${GPU_FLAG:-} \
-e BUILD_ENVIRONMENT \
-e PR_NUMBER \
-e GITHUB_ACTIONS \
-e GITHUB_REPOSITORY \
-e GITHUB_WORKFLOW \
-e GITHUB_JOB \
-e GITHUB_RUN_ID \
-e GITHUB_RUN_NUMBER \
-e GITHUB_RUN_ATTEMPT \
-e JOB_ID \
-e JOB_NAME \
-e BASE_SHA \
-e BRANCH \
-e SHA1 \
-e AWS_DEFAULT_REGION \
-e IN_WHEEL_TEST \
-e SHARD_NUMBER \
-e TEST_CONFIG \
-e NUM_TEST_SHARDS \
-e REENABLED_ISSUES \
-e CONTINUE_THROUGH_ERROR \
-e VERBOSE_TEST_LOGS \
-e NO_TEST_TIMEOUT \
-e NO_TD \
-e TD_DISTRIBUTED \
-e PR_LABELS \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e SCCACHE_S3_KEY_PREFIX \
-e XLA_CUDA \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \
-e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e SKIP_SCCACHE_INITIALIZATION=1 \
-e HUGGING_FACE_HUB_TOKEN \
-e DASHBOARD_TAG \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--ipc=host \
--shm-size="${SHM_SIZE}" \
--tty \
--detach \
--name="${container_name}" \
--user jenkins \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
# Propagate download.pytorch.org IP to container
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
- name: Upload pytest cache if tests failed
uses: ./.github/actions/pytest-cache-upload
continue-on-error: true
if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure'
with:
cache_dir: .pytest_cache
shard: ${{ matrix.shard }}
sha: ${{ github.event.pull_request.head.sha || github.sha }}
test_config: ${{ matrix.config }}
job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}
- name: Print remaining test logs
shell: bash
if: always() && steps.test.conclusion
run: |
cat test/**/*_toprint.log || true
- name: Stop monitoring script
if: always() && steps.monitor-script.outputs.monitor-script-pid
shell: bash
continue-on-error: true
env:
MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }}
run: |
kill "$MONITOR_SCRIPT_PID"
- name: Upload test artifacts
uses: ./.github/actions/upload-test-artifacts
if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped'
with:
file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
use-gha: ${{ inputs.use-gha }}
s3-bucket: ${{ inputs.s3-bucket }}
- name: Collect backtraces from coredumps (if any)
if: always()
shell: bash
run: |
# shellcheck disable=SC2156
find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \;
- name: Store Core dumps on S3
uses: seemethere/upload-artifact-s3@v5
if: failure()
with:
name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}
retention-days: 14
if-no-files-found: ignore
path: ./**/core.[1-9]*
- name: Teardown Linux
uses: pytorch/test-infra/.github/actions/teardown-linux@main
if: always()
# NB: We are currently having an intermittent GPU-related issue on G5 runners with
# A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does
# not seem to help. Here are some symptoms:
# * Calling nvidia-smi times out after 60 seconds
# * nvidia-smi fails with an "unable to determine the device handle for GPU:
# unknown error" message
# * Tests fail with a missing CUDA GPU error when initializing CUDA in PyTorch
# * Running docker --gpus all fails with an error response from the daemon
#
# As both the root cause and recovery path are unclear, let's take the runner out of
# service so that it doesn't get any more jobs
- name: Check NVIDIA driver installation step
if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped'
shell: bash
env:
RUNNER_WORKSPACE: ${{ runner.workspace }}
run: |
set +e
set -x
nvidia-smi
# NB: Surprisingly, nvidia-smi command returns successfully with return code 0 even in
# the case where the driver has already crashed as it still can get the driver version
# and some basic information like the bus ID. However, the rest of the information
# would be missing (ERR!), for example:
#
# +-----------------------------------------------------------------------------+
# | NVIDIA-SMI 525.89.02 Driver Version: 525.89.02 CUDA Version: 12.0 |
# |-------------------------------+----------------------+----------------------+
# | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
# | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
# | | | MIG M. |
# |===============================+======================+======================|
# | 0 ERR! Off | 00000000:00:1E.0 Off | ERR! |
# |ERR! ERR! ERR! ERR! / ERR! | 4184MiB / 23028MiB | ERR! Default |
# | | | ERR! |
# +-------------------------------+----------------------+----------------------+
#
# +-----------------------------------------------------------------------------+
# | Processes: |
# | GPU GI CI PID Type Process name GPU Memory |
# | ID ID Usage |
# |=============================================================================|
# +-----------------------------------------------------------------------------+
#
# This should be reported as a failure instead as it is guaranteed to fail when
# Docker tries to run with --gpus all
#
# So, the correct check here is to query one of the missing pieces of info, such as the
# GPU name, so that the command fails accordingly
nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0
NVIDIA_SMI_STATUS=$?
# These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action
if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
echo "NVIDIA driver installation has failed, shutting down the runner..."
.github/scripts/stop_runner_service.sh
fi
# For runners with multiple GPUs, we also want to confirm that the number of GPUs is a
# power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issues when one GPU fails
# https://github.com/pytorch/test-infra/issues/4000
GPU_COUNT=$(nvidia-smi --list-gpus | wc -l)
NVIDIA_SMI_STATUS=$?
# These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action
if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then
echo "NVIDIA driver installation has failed, shutting down the runner..."
.github/scripts/stop_runner_service.sh
fi
# Check the GPU count to be a power of 2
if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then
echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
.github/scripts/stop_runner_service.sh
fi
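
In the same hedged spirit, a sketch of a test job that consumes this action using the build job's filtered test matrix; the wiring and values are assumptions for illustration. The matrix entries are expected to carry the config, shard, num_shards, and runner fields referenced by the Test step above.

jobs:
  test:
    needs: build
    strategy:
      fail-fast: false
      matrix: ${{ fromJSON(needs.build.outputs.test-matrix) }}
    runs-on: ${{ matrix.runner }}
    steps:
      - uses: actions/checkout@v4  # assumed checkout so the repo-local action is available
      - name: Test
        uses: ./.github/actions/linux-test  # assumed path to the action defined above
        with:
          build-environment: linux-jammy-py3.8-gcc11  # illustrative value
          docker-image: ${{ needs.build.outputs.docker-image }}
          test-matrix: ${{ needs.build.outputs.test-matrix }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}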


@ -9,10 +9,6 @@ inputs:
job_identifier:
description: Text that uniquely identifies a given job type within a workflow. All shards of a job should share the same job identifier.
required: true
s3_bucket:
description: S3 bucket to download PyTest cache
required: false
default: "gha-artifacts"
runs:
using: composite
@ -34,7 +30,6 @@ runs:
CACHE_DIR: ${{ inputs.cache_dir }}
JOB_IDENTIFIER: ${{ inputs.job_identifier }}
REPO: ${{ github.repository }}
BUCKET: ${{ inputs.s3_bucket }}
run: |
python3 .github/scripts/pytest_cache.py \
--download \
@ -43,4 +38,3 @@ runs:
--job_identifier $JOB_IDENTIFIER \
--temp_dir $RUNNER_TEMP \
--repo $REPO \
--bucket $BUCKET \


@ -15,12 +15,10 @@ runs:
category=$1
# If it is GCP runner (runner name contains gcp), do not run this
runner_name_str=${{ runner.name }}
if [[ -f /.inarc ]]; then
echo "ARC Runner, no info on ec2 metadata"
elif [[ $runner_name_str == *"gcp"* ]]; then
echo "Runner is from Google Cloud Platform, No info on ec2 metadata"
else
if [[ $runner_name_str != *"gcp"* ]]; then
curl -fsSL "http://169.254.169.254/latest/meta-data/${category}"
else
echo "Runner is from Google Cloud Platform, No info on ec2 metadata"
fi
}
echo "ami-id: $(get_ec2_metadata ami-id)"
@ -28,14 +26,8 @@ runs:
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: Check if in an ARC runner
shell: bash
id: check_arc_runner
run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> $GITHUB_OUTPUT
- name: Start docker if docker daemon is not running
shell: bash
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
run: |
if systemctl is-active --quiet docker; then
echo "Docker daemon is running...";
@ -66,7 +58,6 @@ runs:
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
- name: Kill any existing containers, clean up images
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }}
shell: bash
run: |
# ignore expansion of "docker ps -q" since it could be empty
@ -105,28 +96,3 @@ runs:
echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts
cat /etc/hosts
- name: Check that the docker daemon is running
shell: bash
continue-on-error: true
if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'true' }}
run: |
set +x
max_attempts=30
delay=10
attempt=1
for attempt in $(seq 1 $max_attempts); do
echo "Attempt $attempt of $max_attempts: Checking if Docker daemon is running..."
if docker info > /dev/null 2>&1; then
echo "Docker is running. Proceeding with the next steps"
exit 0
else
echo "Docker is not running yet."
echo "Retrying in $delay seconds..."
sleep $delay
fi
done
echo "Reached maximum attempts to connect to Docker. Exiting."
exit 1


@ -26,7 +26,6 @@ runs:
-e PYTORCH_FINAL_PACKAGE_DIR \
-e PYTORCH_ROOT \
-e SKIP_ALL_TESTS \
-e USE_SPLIT_BUILD \
--tty \
--detach \
-v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
@ -36,8 +35,7 @@ runs:
"${DOCKER_IMAGE}"
)
echo "CONTAINER_NAME=${container_name}" >> "$GITHUB_ENV"
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" && "${BUILD_ENVIRONMENT}" != "linux-s390x-binary-manywheel" && "${GPU_ARCH_TYPE}" != "xpu" ]]; then
if [[ "${GPU_ARCH_TYPE}" != "rocm" && "${BUILD_ENVIRONMENT}" != "linux-aarch64-binary-manywheel" ]]; then
# Propagate download.pytorch.org IP to container. This is only needed on Linux non aarch64 runner
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" bash -c "/bin/cat >> /etc/hosts"
fi
@ -46,11 +44,3 @@ runs:
# Generate test script
docker exec -t -w "${PYTORCH_ROOT}" -e OUTPUT_SCRIPT="/run.sh" "${container_name}" bash -c "bash .circleci/scripts/binary_linux_test.sh"
docker exec -t "${container_name}" bash -c "source ${BINARY_ENV_FILE} && bash -x /run.sh"
- name: Cleanup docker
if: always() && (env.BUILD_ENVIRONMENT == 'linux-s390x-binary-manywheel' || env.GPU_ARCH_TYPE == 'xpu')
shell: bash
run: |
# on s390x or xpu stop the container for clean worker stop
# shellcheck disable=SC2046
docker stop "${{ env.CONTAINER_NAME }}" || true


@ -11,10 +11,6 @@ inputs:
Suffix to add to the filename of the artifacts. This should include the
workflow job id, see [Job id in artifacts].
required: true
s3-bucket:
description: S3 bucket to download builds
required: false
default: "gha-artifacts"
runs:
using: composite
@ -46,7 +42,7 @@ runs:
env:
FILE_SUFFIX: ${{ inputs.file-suffix }}
run: |
# Remove any previous usage logs if they exist
# Remove any previous test reports if they exist
rm -f logs-*.zip
# this workflow is also run in bazel build test, but we don't generate usage reports for it
# so check to see if the file exists first
@ -57,18 +53,6 @@ runs:
zip -r "logs-${FILE_SUFFIX}.zip" test -i '*.log'
fi
- name: Zip debugging artifacts for upload
if: runner.os != 'Windows' && !inputs.use-gha
shell: bash
env:
FILE_SUFFIX: ${{ inputs.file-suffix }}
run: |
# Remove any previous debugging artifacts if they exist
rm -f debug-*.zip
if [ -d 'test/debug' ]; then
zip -r "debug-${FILE_SUFFIX}.zip" test/debug
fi
# Windows zip
- name: Zip JSONs for upload
if: runner.os == 'Windows' && !inputs.use-gha
@ -103,7 +87,6 @@ runs:
uses: seemethere/upload-artifact-s3@v5
if: ${{ !inputs.use-gha }}
with:
s3-bucket: ${{ inputs.s3-bucket }}
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 14
@ -114,7 +97,6 @@ runs:
uses: seemethere/upload-artifact-s3@v5
if: ${{ !inputs.use-gha }}
with:
s3-bucket: ${{ inputs.s3-bucket }}
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 14
@ -126,25 +108,12 @@ runs:
if: ${{ !inputs.use-gha }}
continue-on-error: true
with:
s3-bucket: ${{ inputs.s3-bucket }}
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 14
if-no-files-found: ignore
path: logs-*.zip
- name: Store Debug Artifacts on S3
uses: seemethere/upload-artifact-s3@v5
if: ${{ !inputs.use-gha }}
continue-on-error: true
with:
s3-bucket: ${{ inputs.s3-bucket }}
s3-prefix: |
${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact
retention-days: 14
if-no-files-found: ignore
path: debug-*.zip
# GHA upload
- name: Store Test Downloaded JSONs on Github
uses: actions/upload-artifact@v3


@ -1 +1 @@
69b2a0adc2ec03ab99990d7e8be3d4510438c148
87aeb554d3e2f7855b7abe5120c282f59648ed7a


@ -1 +1 @@
23512dbebd44a11eb84afbf53c3c071dd105297e
d6015d42d9a1834bc7595c4bd6852562fb80b30b


@ -1 +1 @@
d23a6e1664d20707c11781299611436e1f0c104f
2c127da8b5e2e8f44b50994c6cb931bcca267cfe


@ -1 +1 @@
5ea4535f0699f366adb554183a65ebf7dc34a8be
r2.3


@ -1,13 +0,0 @@
# Use this to auto apply labels based on other labels. Applies to both PRs and
# issues. Currently only supports any and all
- any:
- "module: custom operators"
- "module: aotdispatch"
then:
- "module: pt2-dispatcher"
- any:
- "module: dynamo"
- "module: pt2-dispatcher"
- "module: inductor"
then:
- "oncall: pt2"

.github/labeler.yml

@ -35,9 +35,6 @@
- test/distributed/tensor/parallel/test_fsdp_2d_parallel.py
- torch/distributed/_tensor/**
- torch/distributed/fsdp/**
- torch/csrc/inductor/**
- test/cpp/aoti_abi_check/**
- test/cpp/aoti_inference/**
"module: cpu":
- aten/src/ATen/cpu/**
@ -58,17 +55,6 @@
- third_party/mkl-dnn.BUILD
- torch/csrc/jit/codegen/onednn/**
- test/test_jit_llga_fuser.py
- test/test_mkldnn.py
"ciflow/linux-aarch64":
- third_party/ideep
- caffe2/ideep/**
- caffe2/python/ideep/**
- cmake/Modules/FindMKLDNN.cmake
- third_party/mkl-dnn.BUILD
- torch/csrc/jit/codegen/onednn/**
- test/test_jit_llga_fuser.py
- test/test_mkldnn.py
"module: amp (automated mixed precision)":
- torch/amp/**


@ -1,281 +0,0 @@
# Defines runner types that will be provisioned by LF Self-hosted
# runners for pytorch/pytorch-canary and their labels.
#
# Runners listed here will be available as self hosted runners.
# Configuration is directly pulled from the main branch.
#
# Default values:
#
# runner_types:
# runner_label: # label to specify in the Github Actions workflow
# instance_type: m4.large
# os: linux
# max_available: 20
# disk_size: 50
# is_ephemeral: true
runner_types:
lf.c.linux.12xlarge:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: false
max_available: 1000
os: linux
lf.c.linux.24xl.spr-metal:
disk_size: 200
instance_type: c7i.metal-24xl
is_ephemeral: false
max_available: 30
os: linux
lf.c.linux.16xlarge.spr:
disk_size: 200
instance_type: c7i.16xlarge
is_ephemeral: false
max_available: 30
os: linux
lf.c.linux.12xlarge.ephemeral:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: true
max_available: 300
os: linux
lf.c.linux.16xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.16xlarge
is_ephemeral: false
max_available: 30
os: linux
lf.c.linux.24xlarge:
disk_size: 150
instance_type: c5.24xlarge
is_ephemeral: false
max_available: 250
os: linux
lf.c.linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: false
max_available: 3120
os: linux
lf.c.linux.4xlarge:
disk_size: 150
instance_type: c5.4xlarge
is_ephemeral: false
max_available: 1000
os: linux
lf.c.linux.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.4xlarge
is_ephemeral: false
max_available: 520
os: linux
lf.c.linux.8xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.8xlarge
is_ephemeral: false
max_available: 400
os: linux
lf.c.linux.g4dn.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g4dn.12xlarge
is_ephemeral: false
max_available: 50
os: linux
lf.c.linux.g4dn.metal.nvidia.gpu:
disk_size: 150
instance_type: g4dn.metal
is_ephemeral: false
max_available: 30
os: linux
lf.c.linux.g5.48xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.48xlarge
is_ephemeral: false
max_available: 20
os: linux
lf.c.linux.g5.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.12xlarge
is_ephemeral: false
max_available: 150
os: linux
lf.c.linux.g5.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 1200
os: linux
lf.c.linux.large:
disk_size: 15
instance_type: c5.large
is_ephemeral: false
os: linux
lf.c.linux.arm64.2xlarge:
disk_size: 256
instance_type: t4g.2xlarge
is_ephemeral: false
max_available: 200
os: linux
lf.c.linux.arm64.m7g.2xlarge:
disk_size: 256
instance_type: m7g.2xlarge
is_ephemeral: false
max_available: 20
os: linux
lf.c.windows.4xlarge:
disk_size: 256
instance_type: c5d.4xlarge
is_ephemeral: true
max_available: 420
os: windows
lf.c.windows.4xlarge.nonephemeral:
disk_size: 256
instance_type: c5d.4xlarge
is_ephemeral: false
max_available: 420
os: windows
lf.c.windows.8xlarge.nvidia.gpu:
disk_size: 256
instance_type: p3.2xlarge
is_ephemeral: true
max_available: 150
os: windows
lf.c.windows.8xlarge.nvidia.gpu.nonephemeral:
disk_size: 256
instance_type: p3.2xlarge
is_ephemeral: false
max_available: 150
os: windows
lf.c.windows.g5.4xlarge.nvidia.gpu:
disk_size: 256
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 250
os: windows
### Setup runner types to test the Amazon Linux 2023 AMI
lf.c.amz2023.linux.12xlarge:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: false
max_available: 1000
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.24xl.spr-metal:
disk_size: 200
instance_type: c7i.metal-24xl
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.16xlarge.spr:
disk_size: 200
instance_type: c7i.16xlarge
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.12xlarge.ephemeral:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: true
max_available: 300
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.16xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.16xlarge
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.24xlarge:
disk_size: 150
instance_type: c5.24xlarge
is_ephemeral: false
max_available: 250
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: false
max_available: 3120
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.4xlarge:
disk_size: 150
instance_type: c5.4xlarge
is_ephemeral: false
max_available: 1000
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.4xlarge
is_ephemeral: false
max_available: 520
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.8xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.8xlarge
is_ephemeral: false
max_available: 400
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.g4dn.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g4dn.12xlarge
is_ephemeral: false
max_available: 50
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.g4dn.metal.nvidia.gpu:
disk_size: 150
instance_type: g4dn.metal
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.g5.48xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.48xlarge
is_ephemeral: false
max_available: 20
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.g5.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.12xlarge
is_ephemeral: false
max_available: 150
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.g5.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 1200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.large:
disk_size: 15
instance_type: c5.large
is_ephemeral: false
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.arm64.2xlarge:
disk_size: 256
instance_type: t4g.2xlarge
is_ephemeral: false
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.c.amz2023.linux.arm64.m7g.2xlarge:
disk_size: 256
instance_type: m7g.2xlarge
is_ephemeral: false
max_available: 20
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64


@ -1,281 +0,0 @@
# Defines runner types that will be provisioned by LF Self-hosted
# runners for pytorch/pytorch and their labels.
#
# Runners listed here will be available as self hosted runners.
# Configuration is directly pulled from the main branch.
#
# Default values:
#
# runner_types:
# runner_label: # label to specify in the Github Actions workflow
# instance_type: m4.large
# os: linux
# max_available: 20
# disk_size: 50
# is_ephemeral: true
runner_types:
lf.linux.12xlarge:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: false
max_available: 1000
os: linux
lf.linux.24xl.spr-metal:
disk_size: 200
instance_type: c7i.metal-24xl
is_ephemeral: false
max_available: 30
os: linux
lf.linux.16xlarge.spr:
disk_size: 200
instance_type: c7i.16xlarge
is_ephemeral: false
max_available: 30
os: linux
lf.linux.12xlarge.ephemeral:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: true
max_available: 300
os: linux
lf.linux.16xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.16xlarge
is_ephemeral: false
max_available: 30
os: linux
lf.linux.24xlarge:
disk_size: 150
instance_type: c5.24xlarge
is_ephemeral: false
max_available: 250
os: linux
lf.linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: false
max_available: 3120
os: linux
lf.linux.4xlarge:
disk_size: 150
instance_type: c5.4xlarge
is_ephemeral: false
max_available: 1000
os: linux
lf.linux.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.4xlarge
is_ephemeral: false
max_available: 520
os: linux
lf.linux.8xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.8xlarge
is_ephemeral: false
max_available: 400
os: linux
lf.linux.g4dn.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g4dn.12xlarge
is_ephemeral: false
max_available: 50
os: linux
lf.linux.g4dn.metal.nvidia.gpu:
disk_size: 150
instance_type: g4dn.metal
is_ephemeral: false
max_available: 30
os: linux
lf.linux.g5.48xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.48xlarge
is_ephemeral: false
max_available: 20
os: linux
lf.linux.g5.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.12xlarge
is_ephemeral: false
max_available: 150
os: linux
lf.linux.g5.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 1200
os: linux
lf.linux.large:
disk_size: 15
instance_type: c5.large
is_ephemeral: false
os: linux
lf.linux.arm64.2xlarge:
disk_size: 256
instance_type: t4g.2xlarge
is_ephemeral: false
max_available: 200
os: linux
lf.linux.arm64.m7g.2xlarge:
disk_size: 256
instance_type: m7g.2xlarge
is_ephemeral: false
max_available: 20
os: linux
lf.windows.4xlarge:
disk_size: 256
instance_type: c5d.4xlarge
is_ephemeral: true
max_available: 420
os: windows
lf.windows.4xlarge.nonephemeral:
disk_size: 256
instance_type: c5d.4xlarge
is_ephemeral: false
max_available: 420
os: windows
lf.windows.8xlarge.nvidia.gpu:
disk_size: 256
instance_type: p3.2xlarge
is_ephemeral: true
max_available: 150
os: windows
lf.windows.8xlarge.nvidia.gpu.nonephemeral:
disk_size: 256
instance_type: p3.2xlarge
is_ephemeral: false
max_available: 150
os: windows
lf.windows.g5.4xlarge.nvidia.gpu:
disk_size: 256
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 250
os: windows
### Setup runner types to test the Amazon Linux 2023 AMI
lf.amz2023.linux.12xlarge:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: false
max_available: 1000
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.24xl.spr-metal:
disk_size: 200
instance_type: c7i.metal-24xl
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.16xlarge.spr:
disk_size: 200
instance_type: c7i.16xlarge
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.12xlarge.ephemeral:
disk_size: 200
instance_type: c5.12xlarge
is_ephemeral: true
max_available: 300
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.16xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.16xlarge
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.24xlarge:
disk_size: 150
instance_type: c5.24xlarge
is_ephemeral: false
max_available: 250
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.2xlarge:
disk_size: 150
instance_type: c5.2xlarge
is_ephemeral: false
max_available: 3120
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.4xlarge:
disk_size: 150
instance_type: c5.4xlarge
is_ephemeral: false
max_available: 1000
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.4xlarge
is_ephemeral: false
max_available: 520
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.8xlarge.nvidia.gpu:
disk_size: 150
instance_type: g3.8xlarge
is_ephemeral: false
max_available: 400
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.g4dn.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g4dn.12xlarge
is_ephemeral: false
max_available: 50
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.g4dn.metal.nvidia.gpu:
disk_size: 150
instance_type: g4dn.metal
is_ephemeral: false
max_available: 30
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.g5.48xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.48xlarge
is_ephemeral: false
max_available: 20
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.g5.12xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.12xlarge
is_ephemeral: false
max_available: 150
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.g5.4xlarge.nvidia.gpu:
disk_size: 150
instance_type: g5.4xlarge
is_ephemeral: false
max_available: 1200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.large:
disk_size: 15
instance_type: c5.large
is_ephemeral: false
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.arm64.2xlarge:
disk_size: 256
instance_type: t4g.2xlarge
is_ephemeral: false
max_available: 200
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
lf.amz2023.linux.arm64.m7g.2xlarge:
disk_size: 256
instance_type: m7g.2xlarge
is_ephemeral: false
max_available: 20
os: linux
ami: al2023-ami-2023.5.20240701.0-kernel-6.1-x86_64
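
A workflow opts into one of these runner types by using its label as the job's runs-on value; a minimal, assumed example:

jobs:
  build:
    runs-on: lf.linux.4xlarge  # one of the runner labels defined above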


@ -27,12 +27,13 @@
- third_party/onnx
- caffe2/python/onnx/**
approved_by:
- BowenBao
- abock
- justinchuby
- liqunfu
- shubhambhokare1
- thiagocrepaldi
- titaiwangms
- wschin
- xadupre
mandatory_checks_name:
- EasyCLA
- Lint
@ -235,25 +236,6 @@
- Lint
- pull
- name: XPU ATen
patterns:
- aten/src/ATen/xpu/**
- c10/xpu/**
- torch/csrc/xpu/**
- torch/xpu/**
- test/xpu/**
- test/test_xpu.py
- third_party/xpu.txt
- .ci/docker/ci_commit_pins/triton-xpu.txt
approved_by:
- EikanWang
- jgong5
- gujinghui
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: Distributions
patterns:
- torch/distributions/**
@ -286,7 +268,6 @@
- test/cpp/dist_autograd/**
- test/cpp/rpc/**
approved_by:
- wconstab
- mrshenli
- pritamdamania87
- zhaojuanmao
@ -313,25 +294,6 @@
- Lint
- pull
- name: DCP
patterns:
- torch/distributed/checkpoint/**
approved_by:
- LucasLLC
- fegin
- wz337
- saumishr
- daulet-askarov
- pradeepdfb
- kirtiteja
- mhorowitz
- saiteja64
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: IDEEP
patterns:
- third_party/ideep
@ -395,22 +357,12 @@
- name: CPU inductor
patterns:
- torch/_inductor/mkldnn_ir.py
- torch/_inductor/mkldnn_lowerings.py
- torch/_inductor/fx_passes/mkldnn_fusion.py
- torch/_inductor/fx_passes/quantization.py
- torch/_inductor/codegen/cpp_prefix.h
- torch/_inductor/codegen/cpp.py
- torch/_inductor/codegen/cpp_utils.py
- torch/_inductor/codegen/cpp_micro_gemm.py
- torch/_inductor/codegen/cpp_template_kernel.py
- torch/_inductor/codegen/cpp_template.py
- torch/_inductor/codegen/cpp_gemm_template.py
- test/inductor/test_mkldnn_pattern_matcher.py
- test/inductor/test_cpu_repro.py
- test/inductor/test_cpu_repo.py
- test/inductor/test_cpu_cpp_wrapper.py
- test/inductor/test_cpu_select_algorithm.py
- aten/src/ATen/cpu/**
- aten/src/ATen/native/quantized/cpu/**
- test/quantization/core/test_quantized_op.py
- torch/ao/quantization/quantizer/x86_inductor_quantizer.py


@ -7,9 +7,6 @@ ciflow_push_tags:
- ciflow/binaries_wheel
- ciflow/inductor
- ciflow/inductor-perf-compare
- ciflow/inductor-micro-benchmark
- ciflow/inductor-cu124
- ciflow/linux-aarch64
- ciflow/mps
- ciflow/nightly
- ciflow/periodic
@ -18,12 +15,9 @@ ciflow_push_tags:
- ciflow/trunk
- ciflow/unstable
- ciflow/xpu
- ciflow/torchbench
retryable_workflows:
- lint
- pull
- trunk
- linux-binary
- windows-binary
labeler_config: labeler.yml
label_to_label_config: label_to_label.yml
mergebot: True


@ -5,11 +5,11 @@
# functorch/docs/requirements.txt
# .ci/docker/requirements-ci.txt
boto3==1.19.12
jinja2==3.1.4
jinja2==3.0.1
lintrunner==0.10.7
ninja==1.10.0.post1
nvidia-ml-py==11.525.84
pyyaml==6.0
requests==2.32.2
requests==2.31.0
rich==10.9.0
rockset==1.0.3


@ -4,5 +4,6 @@ mkl-include=2022.1.0
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0
requests=2.31.0
setuptools=68.2.2
typing-extensions=4.9.0
typing-extensions=4.3.0


@ -3,5 +3,6 @@ cmake=3.22.1
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0
requests=2.31.0
setuptools=68.2.2
typing-extensions=4.9.0
typing-extensions=4.3.0


@ -2,7 +2,7 @@ numpy=1.22.3
pyyaml=6.0
setuptools=61.2.0
cmake=3.22.*
typing-extensions=4.9.0
typing-extensions=4.3.0
dataclasses=0.8
pip=22.2.2
pillow=10.0.1


@ -4,7 +4,7 @@ numpy=1.21.2
pyyaml=5.3
setuptools=46.0.0
cmake=3.22.*
typing-extensions=4.9.0
typing-extensions=4.3.0
dataclasses=0.8
pip=22.2.2
pillow=10.0.1


@ -1,4 +1,4 @@
# iOS simulator requirements
coremltools==5.0b5
protobuf==3.20.2
optree==0.12.1
optree==0.9.1


@ -17,16 +17,16 @@ pytest-xdist==3.3.1
pytest-rerunfailures==10.3
pytest-flakefinder==1.1.0
scipy==1.10.1
sympy==1.12.1 ; python_version == "3.8"
sympy>=1.13.0 ; python_version >= "3.9"
sympy==1.11.1
unittest-xml-reporting<=3.2.0,>=2.0.0
xdoctest==1.1.0
filelock==3.6.0
sympy==1.11.1
pytest-cpp==2.3.0
rockset==1.0.3
z3-solver==4.12.2.0
tensorboard==2.13.0
optree==0.12.1
optree==0.9.1
# NB: test_hparams_* from test_tensorboard is failing with protobuf 5.26.0 in
# which the stringify metadata is wrong when escaping double quote
protobuf==3.20.2


@ -1,101 +0,0 @@
set -ex
# If ROCM_HOME isn't set, use ROCM_PATH if set, or fall back to /opt/rocm
ROCM_HOME="${ROCM_HOME:-${ROCM_PATH:-/opt/rocm}}"
# Find the rocm_version.h header file to extract the ROCm version
rocm_version_h="${ROCM_HOME}/include/rocm-core/rocm_version.h"
if [ ! -f "$rocm_version_h" ]; then
rocm_version_h="${ROCM_HOME}/include/rocm_version.h"
fi
# Error out if rocm_version.h not found
if [ ! -f "$rocm_version_h" ]; then
echo "Error: rocm_version.h not found in expected locations." >&2
exit 1
fi
# Extract major, minor and patch ROCm version numbers
MAJOR_VERSION=$(grep 'ROCM_VERSION_MAJOR' "$rocm_version_h" | awk '{print $3}')
MINOR_VERSION=$(grep 'ROCM_VERSION_MINOR' "$rocm_version_h" | awk '{print $3}')
PATCH_VERSION=$(grep 'ROCM_VERSION_PATCH' "$rocm_version_h" | awk '{print $3}')
ROCM_INT=$(($MAJOR_VERSION * 10000 + $MINOR_VERSION * 100 + $PATCH_VERSION))
echo "ROCm version: $ROCM_INT"
# Check TRITON_ROCM_DIR is set
if [[ -z "${TRITON_ROCM_DIR}" ]]; then
export TRITON_ROCM_DIR=third_party/amd/backend
fi
# Remove packaged libs and headers
rm -rf $TRITON_ROCM_DIR/include/*
LIBTINFO_PATH="/usr/lib64/libtinfo.so.5"
LIBNUMA_PATH="/usr/lib64/libnuma.so.1"
LIBELF_PATH="/usr/lib64/libelf.so.1"
OS_SO_PATHS=(
$LIBELF_PATH
$LIBNUMA_PATH
$LIBTINFO_PATH
)
for lib in "${OS_SO_PATHS[@]}"
do
cp $lib $TRITON_ROCM_DIR/lib/
done
# Required ROCm libraries
if [[ "${MAJOR_VERSION}" == "6" ]]; then
libamdhip="libamdhip64.so.6"
else
libamdhip="libamdhip64.so.5"
fi
# Required ROCm libraries - ROCm 6.0
ROCM_SO=(
"${libamdhip}"
"libhsa-runtime64.so.1"
"libamd_comgr.so.2"
"libdrm.so.2"
"libdrm_amdgpu.so.1"
)
if [[ $ROCM_INT -ge 60100 ]]; then
ROCM_SO+=("librocprofiler-register.so.0")
fi
for lib in "${ROCM_SO[@]}"
do
file_path=($(find $ROCM_HOME/lib/ -name "$lib")) # First search in lib
if [[ -z $file_path ]]; then
if [ -d "$ROCM_HOME/lib64/" ]; then
file_path=($(find $ROCM_HOME/lib64/ -name "$lib")) # Then search in lib64
fi
fi
if [[ -z $file_path ]]; then
file_path=($(find $ROCM_HOME/ -name "$lib")) # Then search in ROCM_HOME
fi
if [[ -z $file_path ]]; then
file_path=($(find /opt/ -name "$lib")) # Then search in /opt
fi
if [[ -z $file_path ]]; then
echo "Error: Library file $lib is not found." >&2
exit 1
fi
cp $file_path $TRITON_ROCM_DIR/lib
# When running locally and not building a wheel, we need to satisfy shared object requests that don't look for versioned names
LINKNAME=$(echo $lib | sed -e 's/\.so.*/.so/g')
ln -sf $lib $TRITON_ROCM_DIR/lib/$LINKNAME
done
# Copy Include Files
cp -r $ROCM_HOME/include/hip $TRITON_ROCM_DIR/include
cp -r $ROCM_HOME/include/roctracer $TRITON_ROCM_DIR/include
cp -r $ROCM_HOME/include/hsa $TRITON_ROCM_DIR/include
# Copy linker
mkdir -p $TRITON_ROCM_DIR/llvm/bin
cp $ROCM_HOME/llvm/bin/ld.lld $TRITON_ROCM_DIR/llvm/bin/


@ -1,103 +0,0 @@
#!/bin/bash
set -x
if [ -z "$1" ]; then
echo "Need wheel location argument" && exit 1
fi
WHEELHOUSE_DIR=$1
PATCHELF_BIN=patchelf
ROCM_LIB=backends/amd/lib
ROCM_LD=backends/amd/llvm/bin
PREFIX=triton
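# Strips the trailing version suffix from a shared object name,
# e.g. (illustrative) libhsa-runtime64.so.1 -> libhsa-runtime64.so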
fname_without_so_number() {
LINKNAME=$(echo $1 | sed -e 's/\.so.*/.so/g')
echo "$LINKNAME"
}
replace_needed_sofiles() {
find $1 -name '*.so*' -o -name 'ld.lld' | while read sofile; do
origname=$2
patchedname=$3
if [[ "$origname" != "$patchedname" ]]; then
set +e
origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*")
ERRCODE=$?
set -e
if [ "$ERRCODE" -eq "0" ]; then
echo "patching $sofile entry $origname to $patchedname"
$PATCHELF_BIN --replace-needed $origname $patchedname $sofile
fi
fi
done
}
mkdir -p "/tmp_dir"
pushd /tmp_dir
for pkg in /$WHEELHOUSE_DIR/*triton*.whl; do
echo "Modifying $pkg"
rm -rf tmp
mkdir -p tmp
cd tmp
cp $pkg .
unzip -q $(basename $pkg)
rm -f $(basename $pkg)
$PATCHELF_BIN --set-rpath ${LD_SO_RPATH:-'$ORIGIN:$ORIGIN/../../lib'} $PREFIX/$ROCM_LD/ld.lld
$PATCHELF_BIN --print-rpath $PREFIX/$ROCM_LD/ld.lld
# Modify libtriton.so as it sits in _C directory apart from its dependencies
find $PREFIX/_C -type f -name "*.so*" | while read sofile; do
echo "Setting rpath of $sofile"
$PATCHELF_BIN --set-rpath ${C_SO_RPATH:-'$ORIGIN:$ORIGIN/'../$ROCM_LIB} ${FORCE_RPATH:-} $sofile
$PATCHELF_BIN --print-rpath $sofile
done
# All bundled dependencies are placed in a single lib directory
deps=()
deps_soname=()
while read sofile; do
echo "Setting rpath of $sofile to ${LIB_SO_RPATH:-'$ORIGIN'}"
$PATCHELF_BIN --set-rpath ${LIB_SO_RPATH:-'$ORIGIN'} ${FORCE_RPATH:-} $sofile
$PATCHELF_BIN --print-rpath $sofile
deps+=("$sofile")
deps_soname+=("$(basename $sofile)")
done < <(find $PREFIX/$ROCM_LIB -type f -name "*.so*")
patched=()
for filepath in "${deps[@]}"; do
filename=$(basename $filepath)
destpath=$PREFIX/$ROCM_LIB/$filename
if [[ "$filepath" != "$destpath" ]]; then
cp $filepath $destpath
fi
patchedpath=$(fname_without_so_number $destpath)
patchedname=$(basename $patchedpath)
if [[ "$destpath" != "$patchedpath" ]]; then
mv $destpath $patchedpath
fi
patched+=("$patchedname")
echo "Copied $filepath to $patchedpath"
done
# Go through all required shared objects and see if any of our other objects are dependents. If so, replace so.ver with so
for ((i=0;i<${#deps[@]};++i)); do
echo "replacing "${deps_soname[i]} ${patched[i]}
replace_needed_sofiles $PREFIX/$ROCM_LIB ${deps_soname[i]} ${patched[i]}
replace_needed_sofiles $PREFIX/_C ${deps_soname[i]} ${patched[i]}
replace_needed_sofiles $PREFIX/$ROCM_LD ${deps_soname[i]} ${patched[i]}
done
# Re-bundle whl with so adjustments
zip -rqy $(basename $pkg) *
if [[ -z "${MANYLINUX_VERSION}" ]]; then
newpkg=$pkg
else
newpkg=$(echo $pkg | sed -e "s/\linux_x86_64/${MANYLINUX_VERSION}/g")
fi
# Remove original whl
rm -f $pkg
# Move rebuilt whl to original location with new name.
mv $(basename $pkg) $newpkg
done


@ -1,5 +1,4 @@
#!/usr/bin/env python3
import os
import shutil
import sys
@ -8,17 +7,12 @@ from subprocess import check_call
from tempfile import TemporaryDirectory
from typing import Optional
SCRIPT_DIR = Path(__file__).parent
REPO_DIR = SCRIPT_DIR.parent.parent
def read_triton_pin(device: str = "cuda") -> str:
triton_file = "triton.txt"
if device == "rocm":
triton_file = "triton-rocm.txt"
elif device == "xpu":
triton_file = "triton-xpu.txt"
def read_triton_pin(rocm_hash: bool = False) -> str:
triton_file = "triton.txt" if not rocm_hash else "triton-rocm.txt"
with open(REPO_DIR / ".ci" / "docker" / "ci_commit_pins" / triton_file) as f:
return f.read().strip()
@ -35,6 +29,27 @@ def check_and_replace(inp: str, src: str, dst: str) -> str:
return inp.replace(src, dst)
def patch_setup_py(
path: Path,
*,
version: str,
name: str = "triton",
expected_version: Optional[str] = None,
) -> None:
with open(path) as f:
orig = f.read()
# Replace name
orig = check_and_replace(orig, 'name="triton",', f'name="{name}",')
# Replace version
if not expected_version:
expected_version = read_triton_version()
orig = check_and_replace(
orig, f'version="{expected_version}",', f'version="{version}",'
)
with open(path, "w") as f:
f.write(orig)
def patch_init_py(
path: Path, *, version: str, expected_version: Optional[str] = None
) -> None:
@ -55,7 +70,7 @@ def build_triton(
version: str,
commit_hash: str,
build_conda: bool = False,
device: str = "cuda",
build_rocm: bool = False,
py_version: Optional[str] = None,
release: bool = False,
) -> Path:
@ -74,15 +89,13 @@ def build_triton(
with TemporaryDirectory() as tmpdir:
triton_basedir = Path(tmpdir) / "triton"
triton_pythondir = triton_basedir / "python"
triton_repo = "https://github.com/openai/triton"
if device == "rocm":
if build_rocm:
triton_repo = "https://github.com/ROCmSoftwarePlatform/triton"
triton_pkg_name = "pytorch-triton-rocm"
elif device == "xpu":
triton_pkg_name = "pytorch-triton-xpu"
triton_repo = "https://github.com/intel/intel-xpu-backend-for-triton"
else:
triton_repo = "https://github.com/openai/triton"
triton_pkg_name = "pytorch-triton"
check_call(["git", "clone", triton_repo, "triton"], cwd=tmpdir)
check_call(["git", "clone", triton_repo], cwd=tmpdir)
if release:
ver, rev, patch = version.split(".")
check_call(
@ -100,7 +113,7 @@ def build_triton(
print("source:\n path: .\n", file=meta)
print(
"build:\n string: py{{py}}\n number: 1\n script: cd python; "
"python setup.py install --record=record.txt\n",
"python setup.py install --single-version-externally-managed --record=record.txt\n",
" script_env:\n - MAX_JOBS\n",
file=meta,
)
@ -149,12 +162,15 @@ def build_triton(
expected_version=None,
)
if device == "rocm":
check_call(
[f"{SCRIPT_DIR}/amd/package_triton_wheel.sh"],
cwd=triton_basedir,
shell=True,
if build_rocm:
# TODO: Remove me when ROCM triton is updated
patch_setup_py(
triton_pythondir / "setup.py",
name=triton_pkg_name,
version=f"{version}",
expected_version=None,
)
check_call("scripts/amd/setup_rocm_libs.sh", cwd=triton_basedir, shell=True)
print("ROCm libraries setup for triton installation...")
check_call(
@ -164,11 +180,8 @@ def build_triton(
whl_path = next(iter((triton_pythondir / "dist").glob("*.whl")))
shutil.copy(whl_path, Path.cwd())
if device == "rocm":
check_call(
[f"{SCRIPT_DIR}/amd/patch_triton_wheel.sh", Path.cwd()],
cwd=triton_basedir,
)
if build_rocm:
check_call("scripts/amd/fix_so.sh", cwd=triton_basedir, shell=True)
return Path.cwd() / whl_path.name
@ -179,19 +192,17 @@ def main() -> None:
parser = ArgumentParser("Build Triton binaries")
parser.add_argument("--release", action="store_true")
parser.add_argument("--build-conda", action="store_true")
parser.add_argument(
"--device", type=str, default="cuda", choices=["cuda", "rocm", "xpu"]
)
parser.add_argument("--build-rocm", action="store_true")
parser.add_argument("--py-version", type=str)
parser.add_argument("--commit-hash", type=str)
parser.add_argument("--triton-version", type=str, default=read_triton_version())
args = parser.parse_args()
build_triton(
device=args.device,
build_rocm=args.build_rocm,
commit_hash=args.commit_hash
if args.commit_hash
else read_triton_pin(args.device),
else read_triton_pin(args.build_rocm),
version=args.triton_version,
build_conda=args.build_conda,
py_version=args.py_version,


@ -5,6 +5,7 @@ import sys
from typing import Any
from github_utils import gh_delete_comment, gh_post_pr_comment
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from label_utils import has_required_labels, is_label_err_comment, LABEL_ERR_MSG
from trymerge import GitHubPR


@ -3,10 +3,12 @@
import json
import os
import re
from typing import Any, cast, Dict, List, Optional
from typing import Any, Optional
from urllib.error import HTTPError
from github_utils import gh_fetch_url, gh_post_pr_comment, gh_query_issues_by_labels
from github_utils import gh_fetch_url, gh_post_pr_comment
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from trymerge import get_pr_commit_sha, GitHubPR
@ -17,7 +19,6 @@ REQUIRES_ISSUE = {
"critical",
"fixnewfeature",
}
RELEASE_BRANCH_REGEX = re.compile(r"release/(?P<version>.+)")
def parse_args() -> Any:
@ -28,7 +29,7 @@ def parse_args() -> Any:
"--onto-branch", type=str, required=True, help="the target release branch"
)
parser.add_argument(
"--github-actor", type=str, required=True, help="all the world's a stage"
"--github-actor", type=str, required=True, help="all the worlds a stage"
)
parser.add_argument(
"--classification",
@ -57,33 +58,6 @@ def get_merge_commit_sha(repo: GitRepo, pr: GitHubPR) -> Optional[str]:
return commit_sha if pr.is_closed() else None
def get_release_version(onto_branch: str) -> Optional[str]:
"""
Return the release version if the target branch is a release branch
"""
m = re.match(RELEASE_BRANCH_REGEX, onto_branch)
return m.group("version") if m else ""
def get_tracker_issues(
org: str, project: str, onto_branch: str
) -> List[Dict[str, Any]]:
"""
Find the tracker issue from the repo. The tracker issue needs to have the title
like [VERSION] Release Tracker following the convention on PyTorch
"""
version = get_release_version(onto_branch)
if not version:
return []
tracker_issues = gh_query_issues_by_labels(org, project, labels=["release tracker"])
if not tracker_issues:
return []
# Figure out the tracker issue from the list by looking at the title
return [issue for issue in tracker_issues if version in issue.get("title", "")]
def cherry_pick(
github_actor: str,
repo: GitRepo,
@ -103,49 +77,17 @@ def cherry_pick(
)
try:
org, project = repo.gh_owner_and_name()
cherry_pick_pr = ""
if not dry_run:
org, project = repo.gh_owner_and_name()
cherry_pick_pr = submit_pr(repo, pr, cherry_pick_branch, onto_branch)
tracker_issues_comments = []
tracker_issues = get_tracker_issues(org, project, onto_branch)
for issue in tracker_issues:
issue_number = int(str(issue.get("number", "0")))
if not issue_number:
continue
msg = f"The cherry pick PR is at {cherry_pick_pr}"
if fixes:
msg += f" and it is linked with issue {fixes}"
elif classification in REQUIRES_ISSUE:
msg += f" and it is recommended to link a {classification} cherry pick PR with an issue"
res = cast(
Dict[str, Any],
post_tracker_issue_comment(
org,
project,
issue_number,
pr.pr_num,
cherry_pick_pr,
classification,
fixes,
dry_run,
),
)
comment_url = res.get("html_url", "")
if comment_url:
tracker_issues_comments.append(comment_url)
msg = f"The cherry pick PR is at {cherry_pick_pr}"
if fixes:
msg += f" and it is linked with issue {fixes}."
elif classification in REQUIRES_ISSUE:
msg += f" and it is recommended to link a {classification} cherry pick PR with an issue."
if tracker_issues_comments:
msg += " The following tracker issues are updated:\n"
for tracker_issues_comment in tracker_issues_comments:
msg += f"* {tracker_issues_comment}\n"
post_pr_comment(org, project, pr.pr_num, msg, dry_run)
post_comment(org, project, pr.pr_num, msg)
finally:
if current_branch:
@ -217,9 +159,7 @@ def submit_pr(
raise RuntimeError(msg) from error
def post_pr_comment(
org: str, project: str, pr_num: int, msg: str, dry_run: bool = False
) -> List[Dict[str, Any]]:
def post_comment(org: str, project: str, pr_num: int, msg: str) -> None:
"""
Post a comment on the PR itself to point to the cherry-picking PR on success,
or print the error on failure
@ -242,35 +182,7 @@ def post_pr_comment(
comment = "\n".join(
(f"### Cherry picking #{pr_num}", f"{msg}", "", f"{internal_debugging}")
)
return gh_post_pr_comment(org, project, pr_num, comment, dry_run)
def post_tracker_issue_comment(
org: str,
project: str,
issue_num: int,
pr_num: int,
cherry_pick_pr: str,
classification: str,
fixes: str,
dry_run: bool = False,
) -> List[Dict[str, Any]]:
"""
Post a comment on the tracker issue (if any) to record the cherry pick
"""
comment = "\n".join(
(
"Link to landed trunk PR (if applicable):",
f"* https://github.com/{org}/{project}/pull/{pr_num}",
"",
"Link to release branch PR:",
f"* {cherry_pick_pr}",
"",
"Criteria Category:",
" - ".join((classification.capitalize(), fixes.capitalize())),
)
)
return gh_post_pr_comment(org, project, issue_num, comment, dry_run)
gh_post_pr_comment(org, project, pr_num, comment)
def main() -> None:
@ -302,7 +214,7 @@ def main() -> None:
except RuntimeError as error:
if not args.dry_run:
post_pr_comment(org, project, pr_num, str(error))
post_comment(org, project, pr_num, str(error))
else:
raise error


@ -10,7 +10,6 @@ import requests
import rockset # type: ignore[import]
from gitutils import retries_decorator
LOGS_QUERY = """
with
shas as (


@ -1,12 +1,10 @@
#!/usr/bin/env python3
import sys
from pathlib import Path
from typing import Any, cast, Dict, List, Set
import yaml
GITHUB_DIR = Path(__file__).parent.parent


@ -23,10 +23,8 @@ def main() -> None:
job_link = f"[job]({run_url})" if run_url is not None else "job"
msg = (
f"The {args.action} {job_link} was canceled or timed out. This most often happen if two merge requests were issued"
+ " for the same PR, or if merge job was waiting for more than 6 hours for tests to finish."
+ " In later case, please do not hesitate to reissue the merge command\n"
+ f" For more information see [pytorch-bot wiki]({BOT_COMMANDS_WIKI})."
f"The {args.action} {job_link} was canceled. If you believe this is a mistake,"
+ f" then you can re trigger it through [pytorch-bot]({BOT_COMMANDS_WIKI})."
)
gh_post_pr_comment(org, project, args.pr_num, msg)

View File

@ -1,6 +1,7 @@
import json
import subprocess
import sys
from enum import Enum
from pathlib import Path
from typing import NamedTuple, Optional

View File

@ -2,14 +2,12 @@
import os
import re
from datetime import datetime
from functools import lru_cache
from pathlib import Path
from typing import Any, Callable, Dict, List, Set
from github_utils import gh_fetch_json_dict, gh_graphql
from gitutils import GitRepo
SEC_IN_DAY = 24 * 60 * 60
CLOSED_PR_RETENTION = 30 * SEC_IN_DAY
NO_PR_RETENTION = 1.5 * 365 * SEC_IN_DAY
@ -20,7 +18,7 @@ ESTIMATED_TOKENS = [0]
TOKEN = os.environ["GITHUB_TOKEN"]
if not TOKEN:
raise Exception("GITHUB_TOKEN is not set") # noqa: TRY002
raise Exception("GITHUB_TOKEN is not set")
REPO_ROOT = Path(__file__).parent.parent.parent
@ -189,17 +187,6 @@ def get_recent_prs() -> Dict[str, Any]:
return prs_by_branch_base
@lru_cache(maxsize=1)
def get_open_prs() -> List[Dict[str, Any]]:
return paginate_graphql(
GRAPHQL_OPEN_PRS,
{"owner": "pytorch", "repo": "pytorch"},
lambda data: False,
lambda res: res["data"]["repository"]["pullRequests"]["nodes"],
lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"],
)
def get_branches_with_magic_label_or_open_pr() -> Set[str]:
pr_infos: List[Dict[str, Any]] = paginate_graphql(
GRAPHQL_NO_DELETE_BRANCH_LABEL,
@ -209,7 +196,15 @@ def get_branches_with_magic_label_or_open_pr() -> Set[str]:
lambda res: res["data"]["repository"]["label"]["pullRequests"]["pageInfo"],
)
pr_infos.extend(get_open_prs())
pr_infos.extend(
paginate_graphql(
GRAPHQL_OPEN_PRS,
{"owner": "pytorch", "repo": "pytorch"},
lambda data: False,
lambda res: res["data"]["repository"]["pullRequests"]["nodes"],
lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"],
)
)
# Get the most recent PR for each branch base (group gh together)
branch_bases = set()
@ -275,41 +270,5 @@ def delete_branches() -> None:
delete_branch(git_repo, branch)
def delete_old_ciflow_tags() -> None:
# Deletes ciflow tags if they are associated with a closed PR or a specific
# commit. Lightweight tags don't have information about the date they were
# created, so we can't check how old they are. The script just assumes that
# ciflow tags should be deleted regardless of creation date.
git_repo = GitRepo(str(REPO_ROOT), "origin", debug=True)
def delete_tag(tag: str) -> None:
print(f"Deleting tag {tag}")
ESTIMATED_TOKENS[0] += 1
delete_branch(git_repo, f"refs/tags/{tag}")
tags = git_repo._run_git("tag").splitlines()
open_pr_numbers = [x["number"] for x in get_open_prs()]
for tag in tags:
try:
if ESTIMATED_TOKENS[0] > 400:
print("Estimated tokens exceeded, exiting")
break
if not tag.startswith("ciflow/"):
continue
re_match_pr = re.match(r"^ciflow\/.*\/(\d{5,6})$", tag)
re_match_sha = re.match(r"^ciflow\/.*\/([0-9a-f]{40})$", tag)
if re_match_pr:
pr_number = int(re_match_pr.group(1))
if pr_number in open_pr_numbers:
continue
delete_tag(tag)
elif re_match_sha:
delete_tag(tag)
except Exception as e:
print(f"Failed to check tag {tag}: {e}")
if __name__ == "__main__":
delete_branches()
delete_old_ciflow_tags()
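
The two regular expressions above separate PR-numbered ciflow tags from SHA-pinned ones; a minimal standalone sketch with made-up tag names and a made-up open-PR list:

import re

# Made-up tags showing the two ciflow shapes handled above: one ending in a
# PR number, one ending in a 40-character commit SHA.
tags = ["ciflow/trunk/123456", "ciflow/periodic/" + "a" * 40, "v2.3.0"]
open_pr_numbers = [123456]  # hypothetical list of currently open PR numbers

for tag in tags:
    re_match_pr = re.match(r"^ciflow\/.*\/(\d{5,6})$", tag)
    re_match_sha = re.match(r"^ciflow\/.*\/([0-9a-f]{40})$", tag)
    if re_match_pr:
        pr_number = int(re_match_pr.group(1))
        print(f"{tag}: kept (open PR)" if pr_number in open_pr_numbers else f"{tag}: deleted")
    elif re_match_sha:
        print(f"{tag}: deleted (pinned to a commit)")
    else:
        print(f"{tag}: not a ciflow tag, skipped")
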

View File

@ -1,52 +0,0 @@
import os
import re
import sys
from github import Github
def main() -> None:
token = os.environ.get("GITHUB_TOKEN")
repo_owner = "pytorch"
repo_name = "pytorch"
pull_request_number = int(sys.argv[1])
g = Github(token)
repo = g.get_repo(f"{repo_owner}/{repo_name}")
pull_request = repo.get_pull(pull_request_number)
pull_request_body = pull_request.body
# PR without description
if pull_request_body is None:
return
# get issue number from the PR body
if not re.search(r"#\d{1,6}", pull_request_body):
print("The pull request does not mention an issue.")
return
issue_number = int(re.findall(r"#(\d{1,6})", pull_request_body)[0])
issue = repo.get_issue(issue_number)
issue_labels = issue.labels
docathon_label_present = any(
label.name == "docathon-h1-2024" for label in issue_labels
)
# if the issue has a docathon label, add all labels from the issue to the PR.
if not docathon_label_present:
print("The 'docathon-h1-2024' label is not present in the issue.")
return
pull_request_labels = pull_request.get_labels()
pull_request_label_names = [label.name for label in pull_request_labels]
issue_label_names = [label.name for label in issue_labels]
labels_to_add = [
label for label in issue_label_names if label not in pull_request_label_names
]
if not labels_to_add:
print("The pull request already has the same labels.")
return
pull_request.add_to_labels(*labels_to_add)
print("Labels added to the pull request!")
if __name__ == "__main__":
main()

Binary file not shown.

View File

@ -1,6 +1,7 @@
#!/usr/bin/env python3
import sys
from pathlib import Path
import yaml

View File

@ -14,6 +14,7 @@ import json
from typing import Any
import boto3 # type: ignore[import]
from label_utils import gh_get_labels

View File

@ -1,7 +1,6 @@
#!/usr/bin/env python3
import json
import logging
import os
import re
import subprocess
@ -9,18 +8,42 @@ import sys
import warnings
from enum import Enum
from functools import lru_cache
from logging import info
from typing import Any, Callable, Dict, List, Optional, Set
from urllib.request import Request, urlopen
import yaml
REENABLE_TEST_REGEX = "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) (#|https://github.com/pytorch/pytorch/issues/)([0-9]+)"
PREFIX = "test-config/"
logging.basicConfig(level=logging.INFO)
# Same as shard names
VALID_TEST_CONFIG_LABELS = {
f"{PREFIX}{label}"
for label in {
"backwards_compat",
"crossref",
"default",
"deploy",
"distributed",
"docs_tests",
"dynamo",
"force_on_cpu",
"functorch",
"inductor",
"inductor_distributed",
"inductor_huggingface",
"inductor_timm",
"inductor_torchbench",
"jit_legacy",
"multigpu",
"nogpu_AVX512",
"nogpu_NO_AVX2",
"slow",
"tsan",
"xla",
}
}
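
REENABLE_TEST_REGEX defined above can be tried in isolation; a minimal sketch with a made-up PR body (only the regex is taken from the file, everything else is illustrative):

import re

REENABLE_TEST_REGEX = "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) (#|https://github.com/pytorch/pytorch/issues/)([0-9]+)"

pr_body = "Fixes #123456 by re-enabling the previously skipped test."  # hypothetical PR body
match = re.search(REENABLE_TEST_REGEX, pr_body)
if match:
    # The issue number is whatever the last capture group matched.
    print(f"Would re-enable tests tracked in issue #{match.groups()[-1]}")
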
def is_cuda_or_rocm_job(job_name: Optional[str]) -> bool:
@ -39,9 +62,9 @@ SUPPORTED_PERIODICAL_MODES: Dict[str, Callable[[Optional[str]], bool]] = {
}
# The link to the published list of disabled jobs
DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json"
DISABLED_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/disabled-jobs.json?versionId=qO7aEr.Og33PtLXfNq0j0yj.bbLC7SzR"
# and unstable jobs
UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json"
UNSTABLE_JOBS_URL = "https://ossci-metrics.s3.amazonaws.com/unstable-jobs.json?versionId=7NhgpqKTtGXVUnL1C79KboTW_5qQx8y5"
# Some constants used to handle disabled and unstable jobs
JOB_NAME_SEP = "/"
@ -67,12 +90,6 @@ def parse_args() -> Any:
parser.add_argument(
"--test-matrix", type=str, required=True, help="the original test matrix"
)
parser.add_argument(
"--selected-test-configs",
type=str,
default="",
help="a comma-separated list of test configurations from the test matrix to keep",
)
parser.add_argument(
"--workflow", type=str, help="the name of the current workflow, i.e. pull"
)
@ -138,25 +155,19 @@ def get_labels(pr_number: int) -> Set[str]:
}
def filter_labels(labels: Set[str], label_regex: Any) -> Set[str]:
"""
Return the list of matching labels
"""
return {l for l in labels if re.match(label_regex, l)}
def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, List[Any]]:
"""
Select the list of test configs to run from the test matrix. The logic works
as follows:
If the PR has one or more test-config labels as specified, only these test configs
will be selected. This also works with ciflow labels, for example, if a PR has both
ciflow/trunk and test-config/functorch, only trunk functorch builds and tests will
be run.
If the PR has one or more labels as specified in the VALID_TEST_CONFIG_LABELS set, only
these test configs will be selected. This also works with ciflow labels, for example,
if a PR has both ciflow/trunk and test-config/functorch, only trunk functorch builds
and tests will be run
If the PR has none of the test-config labels, all tests are run as usual.
"""
filtered_test_matrix: Dict[str, List[Any]] = {"include": []}
for entry in test_matrix.get("include", []):
@ -166,46 +177,23 @@ def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, Lis
label = f"{PREFIX}{config_name.strip()}"
if label in labels:
msg = f"Select {config_name} because label {label} is present in the pull request by the time the test starts"
info(msg)
print(
f"Select {config_name} because label {label} is presented in the pull request by the time the test starts"
)
filtered_test_matrix["include"].append(entry)
test_config_labels = filter_labels(labels, re.compile(f"{PREFIX}.+"))
if not filtered_test_matrix["include"] and not test_config_labels:
info("Found no test-config label on the PR, so all test configs are included")
# Found no test-config label and the filtered test matrix is empty, return the same
valid_test_config_labels = labels.intersection(VALID_TEST_CONFIG_LABELS)
if not filtered_test_matrix["include"] and not valid_test_config_labels:
# Found no valid label and the filtered test matrix is empty, return the same
# test matrix as before so that all tests can be run normally
return test_matrix
else:
msg = f"Found {test_config_labels} on the PR so only these test configs are run"
info(msg)
# When the filter test matrix contain matches or if a valid test config label
# is found in the PR, return the filtered test matrix
return filtered_test_matrix
def filter_selected_test_configs(
test_matrix: Dict[str, List[Any]], selected_test_configs: Set[str]
) -> Dict[str, List[Any]]:
"""
Keep only the selected configs if the list is not empty. Otherwise, keep all test configs.
This filter is used when the workflow is dispatched manually.
"""
if not selected_test_configs:
return test_matrix
filtered_test_matrix: Dict[str, List[Any]] = {"include": []}
for entry in test_matrix.get("include", []):
config_name = entry.get("config", "")
if not config_name:
continue
if config_name in selected_test_configs:
filtered_test_matrix["include"].append(entry)
return filtered_test_matrix
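
As an illustration of the label-based selection that filter implements above, here is a self-contained sketch with a made-up test matrix and label set; it restates the behavior, it is not the function itself:

# Hypothetical test matrix and PR label set, for illustration only.
PREFIX = "test-config/"
test_matrix = {"include": [{"config": "default"}, {"config": "functorch"}, {"config": "slow"}]}
labels = {"ciflow/trunk", "test-config/functorch"}

filtered = {
    "include": [e for e in test_matrix["include"] if f"{PREFIX}{e['config']}" in labels]
}
test_config_labels = {label for label in labels if label.startswith(PREFIX)}
if not filtered["include"] and not test_config_labels:
    # No test-config label on the PR: keep the full matrix so all tests run.
    filtered = test_matrix
print(filtered)  # {'include': [{'config': 'functorch'}]}
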
def set_periodic_modes(
test_matrix: Dict[str, List[Any]], job_name: Optional[str]
) -> Dict[str, List[Any]]:
@ -386,33 +374,30 @@ def process_jobs(
# - If the target record has the job (config) name, only that test config
# will be skipped or marked as unstable
if not target_job_cfg:
msg = (
print(
f"Issue {target_url} created by {author} has {issue_type.value} "
+ f"all CI jobs for {workflow} / {job_name}"
)
info(msg)
return _filter_jobs(
test_matrix=test_matrix,
issue_type=issue_type,
)
if target_job_cfg == BUILD_JOB_NAME:
msg = (
print(
f"Issue {target_url} created by {author} has {issue_type.value} "
+ f"the build job for {workflow} / {job_name}"
)
info(msg)
return _filter_jobs(
test_matrix=test_matrix,
issue_type=issue_type,
)
if target_job_cfg in (TEST_JOB_NAME, BUILD_AND_TEST_JOB_NAME):
msg = (
print(
f"Issue {target_url} created by {author} has {issue_type.value} "
+ f"all the test jobs for {workflow} / {job_name}"
)
info(msg)
return _filter_jobs(
test_matrix=test_matrix,
issue_type=issue_type,
@ -478,7 +463,7 @@ def parse_reenabled_issues(s: Optional[str]) -> List[str]:
def get_reenabled_issues(pr_body: str = "") -> List[str]:
default_branch = f"origin/{os.environ.get('GIT_DEFAULT_BRANCH', 'main')}"
default_branch = os.getenv("GIT_DEFAULT_BRANCH", "main")
try:
commit_messages = subprocess.check_output(
f"git cherry -v {default_branch}".split(" ")
@ -509,15 +494,10 @@ def perform_misc_tasks(
"ci-no-test-timeout", check_for_setting(labels, pr_body, "ci-no-test-timeout")
)
set_output("ci-no-td", check_for_setting(labels, pr_body, "ci-no-td"))
# Only relevant for the one linux distributed cuda job, delete this when TD
# is rolled out completely
set_output(
"ci-td-distributed", check_for_setting(labels, pr_body, "ci-td-distributed")
)
# Obviously, if the job name includes unstable, then this is an unstable job
is_unstable = job_name and IssueType.UNSTABLE.value in job_name
if not is_unstable and test_matrix and test_matrix.get("include"):
if not is_unstable and test_matrix:
# Even when the job name doesn't mention unstable, we will also mark it as
# unstable when the test matrix only includes unstable jobs. Basically, this
# logic allows build or build-and-test jobs to be marked as unstable too.
@ -587,16 +567,6 @@ def main() -> None:
# No PR number, no tag, we can just return the test matrix as it is
filtered_test_matrix = test_matrix
if args.selected_test_configs:
selected_test_configs = {
v.strip().lower()
for v in args.selected_test_configs.split(",")
if v.strip()
}
filtered_test_matrix = filter_selected_test_configs(
filtered_test_matrix, selected_test_configs
)
if args.event_name == "schedule" and args.schedule == "29 8 * * *":
# we don't want to run the mem leak check or disabled tests on normal
# periodically scheduled jobs, only the ones at this time
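
The --selected-test-configs handling shown above splits a plain comma-separated string; a short sketch of that normalization, with a made-up input value:

# Hypothetical --selected-test-configs value, showing the comma-splitting and
# lower-casing performed in main() above.
selected = "Inductor, dynamo ,default"
selected_test_configs = {v.strip().lower() for v in selected.split(",") if v.strip()}
print(selected_test_configs)  # e.g. {'inductor', 'dynamo', 'default'}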

View File

@ -8,25 +8,22 @@ architectures:
* CPU
* Latest CUDA
* Latest ROCM
* Latest XPU
"""
import os
from typing import Dict, List, Optional, Tuple
CUDA_ARCHES = ["11.8", "12.1", "12.4"]
CUDA_ARCHES = ["11.8", "12.1"]
CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.0"}
CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1"}
CUDA_ARCHES_CUDNN_VERSION = {"11.8": "9", "12.1": "9", "12.4": "9"}
CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8"}
ROCM_ARCHES = ["6.0", "6.1"]
ROCM_ARCHES = ["5.7", "6.0"]
XPU_ARCHES = ["xpu"]
CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
@ -34,53 +31,33 @@ CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"]
CPU_AARCH64_ARCH = ["cpu-aarch64"]
CPU_S390X_ARCH = ["cpu-s390x"]
CUDA_AARCH64_ARCH = ["cuda-aarch64"]
PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"11.8": (
"nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
"nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"12.1": (
"nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | " # noqa: B950
"nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"12.4": (
"nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
}
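
Each value in PYTORCH_EXTRA_INSTALL_REQUIREMENTS above is a single string of requirement specifiers joined with " | "; assuming the consumer simply splits on that separator, a short sketch of recovering the individual specifiers (the two packages are copied, shortened, from the 12.1 entry):

# Splitting on "|" is an assumption about how downstream tooling consumes this string.
extra = (
    "nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64'"
)
for spec in (part.strip() for part in extra.split("|")):
    print(spec)
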
@ -135,16 +112,10 @@ def arch_type(arch_version: str) -> str:
return "cuda"
elif arch_version in ROCM_ARCHES:
return "rocm"
elif arch_version in XPU_ARCHES:
return "xpu"
elif arch_version in CPU_CXX11_ABI_ARCH:
return "cpu-cxx11-abi"
elif arch_version in CPU_AARCH64_ARCH:
return "cpu-aarch64"
elif arch_version in CPU_S390X_ARCH:
return "cpu-s390x"
elif arch_version in CUDA_AARCH64_ARCH:
return "cuda-aarch64"
else: # arch_version should always be "cpu" in this case
return "cpu"
@ -161,12 +132,9 @@ WHEEL_CONTAINER_IMAGES = {
gpu_arch: f"pytorch/manylinux-builder:rocm{gpu_arch}-{DEFAULT_TAG}"
for gpu_arch in ROCM_ARCHES
},
"xpu": f"pytorch/manylinux2_28-builder:xpu-{DEFAULT_TAG}",
"cpu": f"pytorch/manylinux-builder:cpu-{DEFAULT_TAG}",
"cpu-cxx11-abi": f"pytorch/manylinuxcxx11-abi-builder:cpu-cxx11-abi-{DEFAULT_TAG}",
"cpu-aarch64": f"pytorch/manylinuxaarch64-builder:cpu-aarch64-{DEFAULT_TAG}",
"cpu-s390x": f"pytorch/manylinuxs390x-builder:cpu-s390x-{DEFAULT_TAG}",
"cuda-aarch64": f"pytorch/manylinuxaarch64-builder:cuda12.4-{DEFAULT_TAG}",
}
CONDA_CONTAINER_IMAGES = {
@ -223,11 +191,8 @@ def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
"cpu": "cpu",
"cpu-aarch64": "cpu",
"cpu-cxx11-abi": "cpu-cxx11-abi",
"cpu-s390x": "cpu",
"cuda": f"cu{gpu_arch_version.replace('.', '')}",
"cuda-aarch64": "cu124",
"rocm": f"rocm{gpu_arch_version}",
"xpu": "xpu",
}.get(gpu_arch_type, gpu_arch_version)
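
A standalone restatement of the translate_desired_cuda mapping above, runnable on its own purely for illustration:

def translate_desired_cuda(gpu_arch_type: str, gpu_arch_version: str) -> str:
    # Restates the dictionary lookup above; not the original function.
    return {
        "cpu": "cpu",
        "cpu-aarch64": "cpu",
        "cpu-cxx11-abi": "cpu-cxx11-abi",
        "cuda": f"cu{gpu_arch_version.replace('.', '')}",
        "rocm": f"rocm{gpu_arch_version}",
    }.get(gpu_arch_type, gpu_arch_version)

print(translate_desired_cuda("cuda", "12.1"))  # cu121
print(translate_desired_cuda("rocm", "6.0"))   # rocm6.0
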
@ -307,11 +272,11 @@ def generate_libtorch_matrix(
"libtorch_variant": libtorch_variant,
"libtorch_config": abi_version if os == "windows" else "",
"devtoolset": abi_version if os != "windows" else "",
"container_image": (
LIBTORCH_CONTAINER_IMAGES[(arch_version, abi_version)]
if os != "windows"
else ""
),
"container_image": LIBTORCH_CONTAINER_IMAGES[
(arch_version, abi_version)
]
if os != "windows"
else "",
"package_type": "libtorch",
"build_name": f"libtorch-{gpu_arch_type}{gpu_arch_version}-{libtorch_variant}-{abi_version}".replace(
".", "_"
@ -327,28 +292,24 @@ def generate_wheels_matrix(
python_versions: Optional[List[str]] = None,
) -> List[Dict[str, str]]:
package_type = "wheel"
if os == "linux" or os == "linux-aarch64" or os == "linux-s390x":
# NOTE: We only build manywheel packages for x86_64 and aarch64 and s390x linux
if os == "linux" or os == "linux-aarch64":
# NOTE: We only build manywheel packages for x86_64 and aarch64 linux
package_type = "manywheel"
if python_versions is None:
python_versions = FULL_PYTHON_VERSIONS + ["3.13"]
python_versions = FULL_PYTHON_VERSIONS
if arches is None:
# Define default compute architectures
arches = ["cpu"]
if os == "linux":
arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
arches += CPU_CXX11_ABI_ARCH + CUDA_ARCHES + ROCM_ARCHES
elif os == "windows":
arches += CUDA_ARCHES
elif os == "linux-aarch64":
# Only want the one arch as the CPU type is different and
# uses different build/test scripts
arches = ["cpu-aarch64", "cuda-aarch64"]
elif os == "linux-s390x":
# Only want the one arch as the CPU type is different and
# uses different build/test scripts
arches = ["cpu-s390x"]
arches = ["cpu-aarch64"]
ret: List[Dict[str, str]] = []
for python_version in python_versions:
@ -359,24 +320,11 @@ def generate_wheels_matrix(
if arch_version == "cpu"
or arch_version == "cpu-cxx11-abi"
or arch_version == "cpu-aarch64"
or arch_version == "cpu-s390x"
or arch_version == "cuda-aarch64"
or arch_version == "xpu"
else arch_version
)
# TODO: Enable python 3.13 on rocm, xpu, aarch64, windows
if (
gpu_arch_type in ["rocm", "xpu"] or os != "linux"
) and python_version == "3.13":
continue
# 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install
if (
arch_version in ["12.4", "12.1", "11.8"]
and os == "linux"
or arch_version == "cuda-aarch64"
):
if arch_version in ["12.1", "11.8"] and os == "linux":
ret.append(
{
"python_version": python_version,
@ -385,64 +333,15 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"devtoolset": (
"cxx11-abi" if arch_version == "cuda-aarch64" else ""
),
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": (
PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version] # fmt: skip
if os != "linux-aarch64"
else ""
),
"pytorch_extra_install_requirements": PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version], # fmt: skip
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace( # noqa: B950
".", "_"
),
}
)
if arch_version != "cuda-aarch64":
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True",
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": (
PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version] # fmt: skip
if os != "linux-aarch64"
else ""
),
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-split".replace( # noqa: B950
".", "_"
),
}
)
# Special build to use on Colab: Python 3.10 for CUDA 12.1
if python_version == "3.10" and arch_version == "12.1":
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "False",
"devtoolset": "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": "",
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-full".replace( # noqa: B950
".", "_"
),
}
)
else:
ret.append(
{
@ -452,26 +351,21 @@ def generate_wheels_matrix(
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"devtoolset": (
"cxx11-abi"
if arch_version in ["cpu-cxx11-abi", "xpu"]
else ""
),
"devtoolset": "cxx11-abi"
if arch_version == "cpu-cxx11-abi"
else "",
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}".replace(
".", "_"
),
"pytorch_extra_install_requirements": (
PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"] # fmt: skip
if os != "linux"
else ""
),
"pytorch_extra_install_requirements":
PYTORCH_EXTRA_INSTALL_REQUIREMENTS["12.1"] # fmt: skip
if os != "linux" else "",
}
)
return ret
validate_nccl_dep_consistency("12.4")
validate_nccl_dep_consistency("12.1")
validate_nccl_dep_consistency("11.8")

Some files were not shown because too many files have changed in this diff Show More