Update base for Update on "[WIP] Add a dedicated registration API to support torch.compile-based aten implementation"

This PR is a follow-up of RFC https://github.com/pytorch/pytorch/issues/115545.

In this PR, we provide a registration mode that lets a single aten operation be implemented on top of `torch.compile` and then registered back to aten.

Today, a Python-based aten kernel implementation assumes hermetic Python objects. A `torch.compile`-based aten kernel implementation breaks that assumption, because:

> While HermeticPyObject was enabled, we attempted to create a tensor subclass with __torch_dispatch__.  This violates the invariant that operations in HermeticPyObject have equivalent C++ implementations.
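For illustration, here is a minimal sketch of the kind of registration this PR targets, written against today's `torch.library` API rather than the new dedicated one; the choice of `aten::sin` and the cosine-based identity are hypothetical:

```python
import torch

# Hypothetical sketch: implement an aten op in Python, wrap it with
# torch.compile, and route the aten CPU kernel to the compiled function.
@torch.compile
def sin_via_cos(x):
    # sin(x) == cos(x - pi/2); avoids calling aten::sin and recursing into
    # the very kernel being registered.
    return torch.cos(x - torch.pi / 2)

lib = torch.library.Library("aten", "IMPL")  # open the existing aten namespace
lib.impl("sin", sin_via_cos, "CPU")          # override aten::sin for CPU tensors

print(torch.sin(torch.tensor([0.0, 1.0])))  # now dispatches to sin_via_cos
```

A registration like this is exactly where the hermetic-PyObject invariant above can be violated: the compiled region may create tensor subclasses with `__torch_dispatch__`, which is what the dedicated registration mode in this PR is meant to handle.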




cc voznesenskym penguinwu jgong5 Guobing-Chen XiaobingSuper zhuhaozhe blzheng wenzhe-nrv jiayisunx peterbell10 ipiszy yf225 chenyang78 kadeng muchulee8 aakhundov ColinPeppler

[ghstack-poisoned]
Wang, Eikan
2024-03-06 06:04:37 +00:00
7946 changed files with 82979 additions and 43546 deletions

View File

@ -204,7 +204,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=5.6
ROCM_VERSION=5.7
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
@ -215,7 +215,7 @@ case "$image" in
PROTOBUF=yes
DB=yes
VISION=yes
ROCM_VERSION=5.7
ROCM_VERSION=6.0
NINJA_VERSION=1.9.0
CONDA_CMAKE=yes
TRITON=yes
@ -277,6 +277,7 @@ case "$image" in
CONDA_CMAKE=yes
TRITON=yes
DOCS=yes
UNINSTALL_DILL=yes
;;
pytorch-linux-jammy-py3-clang12-executorch)
ANACONDA_PYTHON_VERSION=3.10
@ -349,7 +350,7 @@ if [[ "$image" == *cuda* && ${OS} == "ubuntu" ]]; then
fi
# Build image
docker build \
DOCKER_BUILDKIT=1 docker build \
--no-cache \
--progress=plain \
--build-arg "BUILD_ENVIRONMENT=${image}" \

View File

@ -1 +1 @@
58a82f5e72a9ec0263a59d5f5d36a6769d12e230
566528fd7bf00badb72d2d9966ba6e301674217d

View File

@ -1 +1 @@
6c26faa159b79a42d7fa46cb66e2d21523351987
243e186efbf7fb93328dd6b34927a4e8c8f24395

View File

@ -1 +1 @@
dafe1459823b9549417ed95e9720f1b594fab329
0a22a91d04c2b4a029a69a198eac390089c3e891

View File

@ -1 +1 @@
e28a256d71f3cf2bcc7b69d6bda73a9b855e385e
901819d2b67bcb4543aa2645b729a9ff8ec32661

View File

@ -153,7 +153,7 @@ wget https://ossci-linux.s3.amazonaws.com/valgrind-${VALGRIND_VERSION}.tar.bz2
tar -xjf valgrind-${VALGRIND_VERSION}.tar.bz2
cd valgrind-${VALGRIND_VERSION}
./configure --prefix=/usr/local
make -j6
make -j$[$(nproc) - 2]
sudo make install
cd ../../
rm -rf valgrind_build

View File

@ -52,9 +52,10 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
# Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README
CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools"
if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ]; then
conda_install numpy=1.23.5 ${CONDA_COMMON_DEPS}
if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then
conda_install numpy=1.26.0 ${CONDA_COMMON_DEPS}
else
conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS}
fi

View File

@ -29,10 +29,11 @@ pip_install \
transformers==4.36.2
pip_install coloredlogs packaging
retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.17.0.dev20231005006
pip_install -i https://test.pypi.org/simple/ onnx==1.15.0rc2
pip_install onnxscript==0.1.0.dev20240117 --no-deps
pip_install onnxruntime==1.17.0
pip_install onnx==1.15.0
# pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@1d6362db06706c13447e590ecf5ac3238efc1880" --no-deps
pip_install onnxscript==0.1.0.dev20240216 --no-deps
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/

View File

@ -9,7 +9,8 @@ tar xf "${OPENSSL}.tar.gz"
cd "${OPENSSL}"
./config --prefix=/opt/openssl -d '-Wl,--enable-new-dtags,-rpath,$(LIBRPATH)'
# NOTE: openssl install errors out when built with the -j option
make -j6; make install_sw
NPROC=$[$(nproc) - 2]
make -j${NPROC}; make install_sw
# Link the ssl libraries to the /usr/lib folder.
sudo ln -s /opt/openssl/lib/lib* /usr/lib
cd ..

View File

@ -2,55 +2,17 @@
set -ex
# This function installs protobuf 3.17
install_protobuf_317() {
pb_dir="/usr/temp_pb_install_dir"
mkdir -p $pb_dir
pb_dir="/usr/temp_pb_install_dir"
mkdir -p $pb_dir
# On the nvidia/cuda:9-cudnn7-devel-centos7 image we need this symlink or
# else it will fail with
# g++: error: ./../lib64/crti.o: No such file or directory
ln -s /usr/lib64 "$pb_dir/lib64"
# On the nvidia/cuda:9-cudnn7-devel-centos7 image we need this symlink or
# else it will fail with
# g++: error: ./../lib64/crti.o: No such file or directory
ln -s /usr/lib64 "$pb_dir/lib64"
curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3
tar -xvz -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz
# -j6 to balance memory usage and speed.
# naked `-j` seems to use too much memory.
pushd "$pb_dir" && ./configure && make -j6 && make -j6 check && sudo make -j6 install && sudo ldconfig
popd
rm -rf $pb_dir
}
install_ubuntu() {
# Ubuntu 14.04 has cmake 2.8.12 as the default option, so we will
# install cmake3 here and use cmake3.
apt-get update
if [[ "$UBUNTU_VERSION" == 14.04 ]]; then
apt-get install -y --no-install-recommends cmake3
fi
# Cleanup
apt-get autoclean && apt-get clean
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
install_protobuf_317
}
install_centos() {
install_protobuf_317
}
# Install base packages depending on the base OS
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
ubuntu)
install_ubuntu
;;
centos)
install_centos
;;
*)
echo "Unable to determine OS..."
exit 1
;;
esac
curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3
tar -xvz -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz
NPROC=$[$(nproc) - 2]
pushd "$pb_dir" && ./configure && make -j${NPROC} && make -j${NPROC} check && sudo make -j${NRPOC} install && sudo ldconfig
popd
rm -rf $pb_dir

View File

@ -80,6 +80,14 @@ install_ubuntu() {
fi
fi
# ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then
for kdb in /opt/rocm/share/miopen/db/*.kdb
do
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
fi
# Cleanup
apt-get autoclean && apt-get clean
rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
@ -151,6 +159,14 @@ install_centos() {
fi
fi
# ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime
if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then
for kdb in /opt/rocm/share/miopen/db/*.kdb
do
sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;"
done
fi
# Cleanup
yum clean all
rm -rf /var/cache/yum

View File

@ -7,7 +7,7 @@ git clone https://bitbucket.org/icl/magma.git
pushd magma
# Version 2.7.2 + ROCm related updates
git checkout 823531632140d0edcb7e77c3edc0e837421471c5
git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6
cp make.inc-examples/make.inc.hip-gcc-mkl make.inc
echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc

View File

@ -64,5 +64,6 @@ if [ -n "${CONDA_CMAKE}" ]; then
# latest numpy version, which fails ASAN tests with the following import error: Numba
# needs NumPy 1.20 or less.
conda_reinstall cmake="${CMAKE_VERSION}"
conda_reinstall numpy="${NUMPY_VERSION}"
# Note that we install numpy with pip as conda might not have the version we want
pip_install --force-reinstall numpy=="${NUMPY_VERSION}"
fi

View File

@ -36,7 +36,12 @@ function install_ucc() {
git submodule update --init --recursive
./autogen.sh
./configure --prefix=$UCC_HOME --with-ucx=$UCX_HOME --with-cuda=$with_cuda
# We only run distributed tests on Tesla M60 and A10G
NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86"
./configure --prefix=$UCC_HOME \
--with-ucx=$UCX_HOME \
--with-cuda=$with_cuda \
--with-nvcc-gencode="${NVCC_GENCODE}"
time make -j
sudo make install

View File

@ -15,7 +15,7 @@ click
#Pinned versions:
#test that import:
coremltools==5.0b5
coremltools==5.0b5 ; python_version < "3.12"
#Description: Apple framework for ML integration
#Pinned versions: 5.0b5
#test that import:
@ -25,6 +25,11 @@ coremltools==5.0b5
#Pinned versions:
#test that import:
dill==0.3.7
#Description: dill extends pickle with serializing and de-serializing for most built-ins
#Pinned versions: 0.3.7
#test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py
expecttest==0.1.6
#Description: method for writing tests where test framework auto populates
# the expected output based on previous runs
@ -47,6 +52,11 @@ junitparser==2.1.1
#Pinned versions: 2.1.1
#test that import:
lark==0.12.0
#Description: parser
#Pinned versions: 0.12.0
#test that import:
librosa>=0.6.2 ; python_version < "3.11"
#Description: A python package for music and audio analysis
#Pinned versions: >=0.6.2
@ -66,7 +76,7 @@ librosa>=0.6.2 ; python_version < "3.11"
#Description: A testing library that allows you to replace parts of your
#system under test with mock objects
#Pinned versions:
#test that import: test_module_init.py, test_modules.py, test_nn.py,
#test that import: test_modules.py, test_nn.py,
#test_testing.py
#MonkeyType # breaks pytorch-xla-linux-bionic-py3.7-clang8
@ -75,10 +85,10 @@ librosa>=0.6.2 ; python_version < "3.11"
#Pinned versions:
#test that import:
mypy==1.7.0
mypy==1.8.0
# Pin MyPy version because new errors are likely to appear with each release
#Description: linter
#Pinned versions: 1.7.0
#Pinned versions: 1.8.0
#test that import: test_typing.py, test_type_hints.py
networkx==2.8.8
@ -137,9 +147,9 @@ optree==0.9.1
#test_pointwise_ops.py, test_dtensor_ops.py, test_torchinductor.py, test_fx.py,
#test_fake_tensor.py, test_mps.py
pillow==10.0.1
pillow==10.2.0
#Description: Python Imaging Library fork
#Pinned versions: 10.0.1
#Pinned versions: 10.2.0
#test that import:
protobuf==3.20.2
@ -162,11 +172,6 @@ pytest-xdist==3.3.1
#Pinned versions:
#test that import:
pytest-shard==0.1.2
#Description: plugin splitting up tests in pytest
#Pinned versions:
#test that import:
pytest-flakefinder==1.1.0
#Description: plugin for rerunning tests a fixed number of times in pytest
#Pinned versions: 1.1.0
@ -268,14 +273,14 @@ rockset==1.0.3
#Pinned versions: 1.0.3
#test that import:
ghstack==0.7.1
ghstack==0.8.0
#Description: ghstack tool
#Pinned versions: 0.7.1
#Pinned versions: 0.8.0
#test that import:
jinja2==3.1.2
jinja2==3.1.3
#Description: jinja2 template engine
#Pinned versions: 3.1.2
#Pinned versions: 3.1.3
#test that import:
pytest-cpp==2.3.0
@ -293,7 +298,8 @@ tensorboard==2.13.0
#Pinned versions:
#test that import: test_tensorboard
pywavelets==1.4.1
pywavelets==1.4.1 ; python_version < "3.12"
pywavelets==1.5.0 ; python_version >= "3.12"
#Description: This is a requirement of scikit-image, we need to pin
# it here because 1.5.0 conflicts with numpy 1.21.2 used in CI
#Pinned versions: 1.4.1

View File

@ -1 +1 @@
2.2.0
3.0.0

View File

@ -37,6 +37,7 @@ COPY requirements-ci.txt requirements-docs.txt /opt/conda/
COPY ./common/install_conda.sh install_conda.sh
COPY ./common/common_utils.sh common_utils.sh
RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt
RUN if [ -n "${UNINSTALL_DILL}" ]; then pip uninstall -y dill; fi
# Install gcc
ARG GCC_VERSION

View File

@ -82,6 +82,13 @@ if ! which conda; then
fi
else
export CMAKE_PREFIX_PATH=/opt/conda
# Workaround required for MKL library linkage
# https://github.com/pytorch/pytorch/issues/119557
if [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then
export CMAKE_LIBRARY_PATH="/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/lib/"
export CMAKE_INCLUDE_PATH="/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/include/"
fi
fi
if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then

View File

@ -9,7 +9,7 @@ sysctl -a | grep machdep.cpu
# These are required for both the build job and the test job.
# In the latter to test cpp extensions.
export MACOSX_DEPLOYMENT_TARGET=11.0
export MACOSX_DEPLOYMENT_TARGET=11.1
export CXX=clang++
export CC=clang

View File

@ -34,7 +34,6 @@ time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/test
# functional collective tests
time python test/run_test.py --verbose -i distributed/test_functional_api
# DTensor tests
time python test/run_test.py --verbose -i distributed/_tensor/test_random_ops
time python test/run_test.py --verbose -i distributed/_tensor/test_dtensor_compile
@ -49,6 +48,7 @@ time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_ex
# Other tests
time python test/run_test.py --verbose -i test_cuda_primary_ctx
time python test/run_test.py --verbose -i test_optim -- -k optimizers_with_varying_tensors
time python test/run_test.py --verbose -i test_optim -- -k test_forloop_goes_right_direction_multigpu
time python test/run_test.py --verbose -i test_optim -- -k test_mixed_device_dtype
time python test/run_test.py --verbose -i test_foreach -- -k test_tensors_grouping
assert_git_not_dirty

View File

@ -130,6 +130,8 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then
export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda"
elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu"
# setting PYTHON_TEST_EXTRA_OPTION
export PYTHON_TEST_EXTRA_OPTION="--xpu"
fi
if [[ "$TEST_CONFIG" == *crossref* ]]; then
@ -137,6 +139,8 @@ if [[ "$TEST_CONFIG" == *crossref* ]]; then
fi
if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
# regression in ROCm 6.0 on MI50 CI runners due to hipblaslt; remove in 6.1
export VALGRIND=OFF
# Print GPU info
rocminfo
rocminfo | grep -E 'Name:.*\sgfx|Marketing'
@ -250,14 +254,14 @@ test_python_shard() {
# Bare --include flag is not supported and quoting for lint ends up with flag not being interpreted correctly
# shellcheck disable=SC2086
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION
assert_git_not_dirty
}
test_python() {
# shellcheck disable=SC2086
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION
assert_git_not_dirty
}
@ -274,6 +278,7 @@ test_dynamo_shard() {
--exclude-inductor-tests \
--exclude-jit-executor \
--exclude-distributed-tests \
--exclude-torch-export-tests \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
assert_git_not_dirty
@ -285,8 +290,16 @@ test_inductor_distributed() {
pytest test/inductor/test_torchinductor.py -k test_multi_gpu
pytest test/inductor/test_aot_inductor.py -k test_non_default_cuda_device
pytest test/inductor/test_aot_inductor.py -k test_replicate_on_devices
pytest test/distributed/test_c10d_functional_native.py
pytest test/distributed/_tensor/test_dtensor_compile.py
pytest test/distributed/tensor/parallel/test_fsdp_2d_parallel.py
pytest test/distributed/_composable/fsdp/test_fully_shard_comm.py
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing
pytest test/distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp
pytest test/distributed/_composable/fsdp/test_fully_shard_frozen.py
pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype
pytest test/distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype
# this runs on both single-gpu and multi-gpu instance. It should be smart about skipping tests that aren't supported
# with if required # gpus aren't available
@ -400,7 +413,7 @@ test_perf_for_dashboard() {
--output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_autotune_${suite}_${dtype}_${mode}_cuda_${target}.csv"
fi
if [[ "$DASHBOARD_TAG" == *aotinductor-true* ]] && [[ "$mode" == "inference" ]]; then
python "benchmarks/dynamo/$suite.py" \
TORCHINDUCTOR_ABI_COMPATIBLE=1 python "benchmarks/dynamo/$suite.py" \
"${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \
--output "$TEST_REPORTS_DIR/${backend}_aot_inductor_${suite}_${dtype}_${mode}_cuda_${target}.csv"
fi
@ -444,6 +457,11 @@ test_single_dynamo_benchmark() {
test_perf_for_dashboard "$suite" \
"${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}"
else
if [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
# Test AOTInductor with the ABI-compatible mode on CI
# This can be removed once the ABI-compatible mode becomes default.
export TORCHINDUCTOR_ABI_COMPATIBLE=1
fi
python "benchmarks/dynamo/$suite.py" \
--ci --accuracy --timing --explain \
"${DYNAMO_BENCHMARK_FLAGS[@]}" \
@ -500,7 +518,7 @@ test_inductor_torchbench_smoketest_perf() {
# The threshold value needs to be actively maintained to make this check useful
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4
python benchmarks/dynamo/torchbench.py --device cuda --performance --bfloat16 --inference \
TORCHINDUCTOR_ABI_COMPATIBLE=1 python benchmarks/dynamo/torchbench.py --device cuda --performance --bfloat16 --inference \
--export-aot-inductor --only nanogpt --output "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv"
# The threshold value needs to be actively maintained to make this check useful
# The perf number of nanogpt seems not very stable, e.g.
@ -521,6 +539,50 @@ test_inductor_torchbench_smoketest_perf() {
done
}
test_inductor_torchbench_cpu_smoketest_perf(){
TEST_REPORTS_DIR=$(pwd)/test/test-reports
mkdir -p "$TEST_REPORTS_DIR"
#set jemalloc
JEMALLOC_LIB="/usr/lib/x86_64-linux-gnu/libjemalloc.so.2"
IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
export LD_PRELOAD="$JEMALLOC_LIB":"$IOMP_LIB":"$LD_PRELOAD"
export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
export KMP_AFFINITY=granularity=fine,compact,1,0
export KMP_BLOCKTIME=1
CORES=$(lscpu | grep Core | awk '{print $4}')
export OMP_NUM_THREADS=$CORES
end_core=$(( CORES-1 ))
MODELS_SPEEDUP_TARGET=benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv
grep -v '^ *#' < "$MODELS_SPEEDUP_TARGET" | while IFS=',' read -r -a model_cfg
do
local model_name=${model_cfg[0]}
local data_type=${model_cfg[1]}
local speedup_target=${model_cfg[4]}
if [[ ${model_cfg[3]} == "cpp" ]]; then
export TORCHINDUCTOR_CPP_WRAPPER=1
else
unset TORCHINDUCTOR_CPP_WRAPPER
fi
local output_name="$TEST_REPORTS_DIR/inductor_inference_${model_cfg[0]}_${model_cfg[1]}_${model_cfg[2]}_${model_cfg[3]}_cpu_smoketest.csv"
if [[ ${model_cfg[2]} == "dynamic" ]]; then
taskset -c 0-"$end_core" python benchmarks/dynamo/torchbench.py \
--inference --performance --"$data_type" -dcpu -n50 --only "$model_name" --dynamic-shapes \
--dynamic-batch-only --freezing --timeout 9000 --backend=inductor --output "$output_name"
else
taskset -c 0-"$end_core" python benchmarks/dynamo/torchbench.py \
--inference --performance --"$data_type" -dcpu -n50 --only "$model_name" \
--freezing --timeout 9000 --backend=inductor --output "$output_name"
fi
cat "$output_name"
# The threshold value needs to be actively maintained to make this check useful.
python benchmarks/dynamo/check_perf_csv.py -f "$output_name" -t "$speedup_target"
done
}
test_python_gloo_with_tls() {
source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh"
assert_git_not_dirty
@ -920,7 +982,8 @@ test_bazel() {
tools/bazel test --config=cpu-only --test_timeout=480 --test_output=all --test_tag_filters=-gpu-required --test_filter=-*CUDA :all_tests
else
tools/bazel test --test_output=errors \
# Increase the test timeout to 480 like CPU tests because modules_test frequently timeout
tools/bazel test --test_timeout=480 --test_output=errors \
//:any_test \
//:autograd_test \
//:dataloader_test \
@ -1094,6 +1157,11 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then
checkout_install_torchbench hf_Bert hf_Albert nanogpt timm_vision_transformer
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf
elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then
checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_gcn \
llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \
shufflenet_v2_x1_0 hf_GPT2
PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf
else
checkout_install_torchbench
# Do this after checkout_install_torchbench to ensure we clobber any

View File

@ -16,11 +16,6 @@ set PATH=C:\Program Files\CMake\bin;C:\Program Files\7-Zip;C:\ProgramData\chocol
set INSTALLER_DIR=%SCRIPT_HELPERS_DIR%\installation-helpers
call %INSTALLER_DIR%\install_mkl.bat
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
call %INSTALLER_DIR%\install_magma.bat
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
@ -35,6 +30,10 @@ call %INSTALLER_DIR%\activate_miniconda3.bat
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
:: Override VS env here
pushd .
if "%VC_VERSION%" == "" (
@ -89,8 +88,8 @@ set SCCACHE_IGNORE_SERVER_IO_ERROR=1
sccache --stop-server
sccache --start-server
sccache --zero-stats
set CC=sccache-cl
set CXX=sccache-cl
set CMAKE_C_COMPILER_LAUNCHER=sccache
set CMAKE_CXX_COMPILER_LAUNCHER=sccache
set CMAKE_GENERATOR=Ninja

View File

@ -1,14 +0,0 @@
if "%REBUILD%"=="" (
if "%BUILD_ENVIRONMENT%"=="" (
curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z --output %TMP_DIR_WIN%\mkl.7z
) else (
aws s3 cp s3://ossci-windows/mkl_2020.2.254.7z %TMP_DIR_WIN%\mkl.7z --quiet
)
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
7z x -aoa %TMP_DIR_WIN%\mkl.7z -o%TMP_DIR_WIN%\mkl
if errorlevel 1 exit /b
if not errorlevel 0 exit /b
)
set CMAKE_INCLUDE_PATH=%TMP_DIR_WIN%\mkl\include
set LIB=%TMP_DIR_WIN%\mkl\lib;%LIB%

View File

@ -1,18 +1,13 @@
mkdir %TMP_DIR_WIN%\bin
if "%REBUILD%"=="" (
:check_sccache
%TMP_DIR_WIN%\bin\sccache.exe --show-stats || (
IF EXIST %TMP_DIR_WIN%\bin\sccache.exe (
taskkill /im sccache.exe /f /t || ver > nul
del %TMP_DIR_WIN%\bin\sccache.exe || ver > nul
del %TMP_DIR_WIN%\bin\sccache-cl.exe || ver > nul
if "%BUILD_ENVIRONMENT%"=="" (
curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output %TMP_DIR_WIN%\bin\sccache.exe
curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache-cl.exe --output %TMP_DIR_WIN%\bin\sccache-cl.exe
) else (
aws s3 cp s3://ossci-windows/sccache.exe %TMP_DIR_WIN%\bin\sccache.exe
aws s3 cp s3://ossci-windows/sccache-cl.exe %TMP_DIR_WIN%\bin\sccache-cl.exe
)
goto :check_sccache
)
)
if "%BUILD_ENVIRONMENT%"=="" (
curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache-v0.7.4.exe --output %TMP_DIR_WIN%\bin\sccache.exe
) else (
aws s3 cp s3://ossci-windows/sccache-v0.7.4.exe %TMP_DIR_WIN%\bin\sccache.exe
)
)

View File

@ -1,468 +1,4 @@
Warning
=======
Contents may be out of date. Our CircleCI workflows are gradually being migrated to Github actions.
Structure of CI
===============
setup job:
1. Does a git checkout
2. Persists CircleCI scripts (everything in `.circleci`) into a workspace. Why?
We don't always do a Git checkout on all subjobs, but we usually
still want to be able to call scripts one way or another in a subjob.
Persisting files this way lets us have access to them without doing a
checkout. This workspace is conventionally mounted on `~/workspace`
(this is distinguished from `~/project`, which is the conventional
working directory that CircleCI will default to starting your jobs
in.)
3. Write out the commit message to `.circleci/COMMIT_MSG`. This is so
we can determine in subjobs if we should actually run the jobs or
not, even if there isn't a Git checkout.
CircleCI configuration generator
================================
One may no longer make changes to the `.circleci/config.yml` file directly.
Instead, one must edit these Python scripts or files in the `verbatim-sources/` directory.
Usage
----------
1. Make changes to these scripts.
2. Run the `regenerate.sh` script in this directory and commit the script changes and the resulting change to `config.yml`.
You'll see a build failure on GitHub if the scripts don't agree with the checked-in version.
Motivation
----------
These scripts establish a single, authoritative source of documentation for the CircleCI configuration matrix.
The documentation, in the form of diagrams, is automatically generated and cannot drift out of sync with the YAML content.
Furthermore, consistency is enforced within the YAML config itself, by using a single source of data to generate
multiple parts of the file.
* Facilitates one-off culling/enabling of CI configs for testing PRs on special targets
Also see https://github.com/pytorch/pytorch/issues/17038
Future direction
----------------
### Declaring sparse config subsets
See comment [here](https://github.com/pytorch/pytorch/pull/17323#pullrequestreview-206945747):
In contrast with a full recursive tree traversal of configuration dimensions,
> in the future I think we actually want to decrease our matrix somewhat and have only a few mostly-orthogonal builds that taste as many different features as possible on PRs, plus a more complete suite on every PR and maybe an almost full suite nightly/weekly (we don't have this yet). Specifying PR jobs in the future might be easier to read with an explicit list when we come to this.
----------------
----------------
# How do the binaries / nightlies / releases work?
### What is a binary?
A binary or package (used interchangeably) is a pre-built collection of c++ libraries, header files, python bits, and other files. We build these and distribute them so that users do not need to install from source.
A **binary configuration** is a collection of
* release or nightly
* releases are stable, nightlies are beta and built every night
* python version
* linux: 3.7m (mu is wide unicode or something like that. It usually doesn't matter but you should know that it exists)
* macos: 3.7, 3.8
* windows: 3.7, 3.8
* cpu version
* cpu, cuda 9.0, cuda 10.0
* The supported cuda versions occasionally change
* operating system
* Linux - these are all built on CentOS. There haven't been any problems in the past building on CentOS and using on Ubuntu
* MacOS
* Windows - these are built on Azure pipelines
* devtoolset version (gcc compiler version)
* This only matters on Linux because only Linux uses gcc. The tl;dr is that gcc made a backwards incompatible change from gcc 4.8 to gcc 5, because it had to change how it implemented std::vector and std::string
### Where are the binaries?
The binaries are built in CircleCI. There are nightly binaries built every night at 9pm PST (midnight EST) and release binaries corresponding to Pytorch releases, usually every few months.
We have 3 types of binary packages
* pip packages - nightlies are stored on s3 (pip install -f \<a s3 url\>). releases are stored in a pip repo (pip install torch) (ask Soumith about this)
* conda packages - nightlies and releases are both stored in a conda repo. Nightly packages have a '_nightly' suffix
* libtorch packages - these are zips of all the c++ libraries, header files, and sometimes dependencies. These are c++ only
* shared with dependencies (the only supported option for Windows)
* static with dependencies
* shared without dependencies
* static without dependencies
All binaries are built in CircleCI workflows except Windows. There are checked-in workflows (committed into the .circleci/config.yml) to build the nightlies every night. Releases are built by manually pushing a PR that builds the suite of release binaries (overwrite the config.yml to build the release)
# CircleCI structure of the binaries
Some quick vocab:
* A \**workflow** is a CircleCI concept; it is a DAG of '**jobs**'. ctrl-f 'workflows' on https://github.com/pytorch/pytorch/blob/main/.circleci/config.yml to see the workflows.
* **jobs** are a sequence of '**steps**'
* **steps** are usually just a bash script or a builtin CircleCI command. *All steps run in new environments, environment variables declared in one script DO NOT persist to following steps*
* CircleCI has a **workspace**, which is essentially a cache between steps of the *same job* in which you can store artifacts between steps.
## How are the workflows structured?
The nightly binaries have 3 workflows. We have one job (actually 3 jobs: build, test, and upload) per binary configuration
1. binary_builds
1. every day midnight EST
2. linux: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/linux-binary-build-defaults.yml
3. macos: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/macos-binary-build-defaults.yml
4. For each binary configuration, e.g. linux_conda_3.7_cpu there is a
1. binary_linux_conda_3.7_cpu_build
1. Builds the build. On linux jobs this uses the 'docker executor'.
2. Persists the package to the workspace
2. binary_linux_conda_3.7_cpu_test
1. Loads the package to the workspace
2. Spins up a docker image (on Linux), mapping the package and code repos into the docker
3. Runs some smoke tests in the docker
4. (Actually, for macos this is a step rather than a separate job)
3. binary_linux_conda_3.7_cpu_upload
1. Logs in to aws/conda
2. Uploads the package
2. update_s3_htmls
1. every day 5am EST
2. https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/binary_update_htmls.yml
3. See below for what these are for and why they're needed
4. Three jobs that each examine the current contents of aws and the conda repo and update some html files in s3
3. binarysmoketests
1. every day
2. https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml
3. For each binary configuration, e.g. linux_conda_3.7_cpu there is a
1. smoke_linux_conda_3.7_cpu
1. Downloads the package from the cloud, e.g. using the official pip or conda instructions
2. Runs the smoke tests
## How are the jobs structured?
The jobs are in https://github.com/pytorch/pytorch/tree/main/.circleci/verbatim-sources. Jobs are made of multiple steps. There are some shared steps used by all the binaries/smokes. Steps of these jobs are all delegated to scripts in https://github.com/pytorch/pytorch/tree/main/.circleci/scripts .
* Linux jobs: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/linux-binary-build-defaults.yml
* binary_linux_build.sh
* binary_linux_test.sh
* binary_linux_upload.sh
* MacOS jobs: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/macos-binary-build-defaults.yml
* binary_macos_build.sh
* binary_macos_test.sh
* binary_macos_upload.sh
* Update html jobs: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/binary_update_htmls.yml
* These delegate from the pytorch/builder repo
* https://github.com/pytorch/builder/blob/main/cron/update_s3_htmls.sh
* https://github.com/pytorch/builder/blob/main/cron/upload_binary_sizes.sh
* Smoke jobs (both linux and macos): https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml
* These delegate from the pytorch/builder repo
* https://github.com/pytorch/builder/blob/main/run_tests.sh
* https://github.com/pytorch/builder/blob/main/smoke_test.sh
* https://github.com/pytorch/builder/blob/main/check_binary.sh
* Common shared code (shared across linux and macos): https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/nightly-binary-build-defaults.yml
* binary_checkout.sh - checks out pytorch/builder repo. Right now this also checks out pytorch/pytorch, but it shouldn't. pytorch/pytorch should just be shared through the workspace. This can handle being run before binary_populate_env.sh
* binary_populate_env.sh - parses BUILD_ENVIRONMENT into the separate env variables that make up a binary configuration. Also sets lots of default values, the date, the version strings, the location of folders in s3, all sorts of things. This generally has to be run before other steps.
* binary_install_miniconda.sh - Installs miniconda, cross platform. Also hacks this for the update_binary_sizes job that doesn't have the right env variables
* binary_run_in_docker.sh - Takes a bash script file (the actual test code) from a hardcoded location, spins up a docker image, and runs the script inside the docker image
### **Why do the steps all refer to scripts?**
CircleCI creates a final yaml file by inlining every <<* segment, so if we were to keep all the code in the config.yml itself then the config size would go over 4 MB and cause infra problems.
### **What is binary_run_in_docker for?**
So, CircleCI has several executor types: macos, machine, and docker are the ones we use. The 'machine' executor gives you two cores on some linux vm. The 'docker' executor gives you considerably more cores (nproc was 32 instead of 2 back when I tried in February). Since the dockers are faster, we try to run everything that we can in dockers. Thus
* linux build jobs use the docker executor. Running them on the docker executor was at least 2x faster than running them on the machine executor
* linux test jobs use the machine executor in order for them to properly interface with GPUs since docker executors cannot execute with attached GPUs
* linux upload jobs use the machine executor. The upload jobs are so short that it doesn't really matter what they use
* linux smoke test jobs use the machine executor for the same reason as the linux test jobs
binary_run_in_docker.sh is a way to share the docker start-up code between the binary test jobs and the binary smoke test jobs
### **Why does binary_checkout also checkout pytorch? Why shouldn't it?**
We want all the nightly binary jobs to run on the exact same git commit, so we wrote our own checkout logic to ensure that the same commit was always picked. Later circleci changed that to use a single pytorch checkout and persist it through the workspace (they did this because our config file was too big, so they wanted to take a lot of the setup code into scripts, but the scripts needed the code repo to exist to be called, so they added a prereq step called 'setup' to checkout the code and persist the needed scripts to the workspace). The changes to the binary jobs were not properly tested, so they all broke from missing pytorch code no longer existing. We hotfixed the problem by adding the pytorch checkout back to binary_checkout, so now there's two checkouts of pytorch on the binary jobs. This problem still needs to be fixed, but it takes careful tracing of which code is being called where.
# Code structure of the binaries (circleci agnostic)
## Overview
The code that runs the binaries lives in two places, in the normal [github.com/pytorch/pytorch](http://github.com/pytorch/pytorch), but also in [github.com/pytorch/builder](http://github.com/pytorch/builder), which is a repo that defines how all the binaries are built. The relevant code is
```
# All code needed to set-up environments for build code to run in,
# but only code that is specific to the current CI system
pytorch/pytorch
- .circleci/ # Folder that holds all circleci related stuff
- config.yml # GENERATED file that actually controls all circleci behavior
- verbatim-sources # Used to generate job/workflow sections in ^
- scripts/ # Code needed to prepare circleci environments for binary build scripts
- setup.py # Builds pytorch. This is wrapped in pytorch/builder
- cmake files # used in normal building of pytorch
# All code needed to prepare a binary build, given an environment
# with all the right variables/packages/paths.
pytorch/builder
# Given an installed binary and a proper python env, runs some checks
# to make sure the binary was built the proper way. Checks things like
# the library dependencies, symbols present, etc.
- check_binary.sh
# Given an installed binary, runs python tests to make sure everything
# is in order. These should be de-duped. Right now they both run smoke
# tests, but are called from different places. Usually just call some
# import statements, but also has overlap with check_binary.sh above
- run_tests.sh
- smoke_test.sh
# Folders that govern how packages are built. See paragraphs below
- conda/
- build_pytorch.sh # Entrypoint. Delegates to proper conda build folder
- switch_cuda_version.sh # Switches activate CUDA installation in Docker
- pytorch-nightly/ # Build-folder
- manywheel/
- build_cpu.sh # Entrypoint for cpu builds
- build.sh # Entrypoint for CUDA builds
- build_common.sh # Actual build script that ^^ call into
- wheel/
- build_wheel.sh # Entrypoint for wheel builds
- windows/
- build_pytorch.bat # Entrypoint for wheel builds on Windows
```
Every type of package has an entrypoint build script that handles all the important logic.
## Conda
Linux, MacOS and Windows use the same code flow for the conda builds.
Conda packages are built with conda-build, see https://conda.io/projects/conda-build/en/latest/resources/commands/conda-build.html
Basically, you pass `conda build` a build folder (pytorch-nightly/ above) that contains a build script and a meta.yaml. The meta.yaml specifies what python environment to build the package in and what dependencies the resulting package should have, and the build script gets called in the env to build the thing.
tl;dr on conda-build is
1. Creates a brand new conda environment, based off of deps in the meta.yaml
1. Note that environment variables do not get passed into this build env unless they are specified in the meta.yaml
2. If the build fails this environment will stick around. You can activate it for much easier debugging. The “General Python” section below explains what exactly a python “environment” is.
2. Calls build.sh in the environment
3. Copies the finished package to a new conda env, also specified by the meta.yaml
4. Runs some simple import tests (if specified in the meta.yaml)
5. Saves the finished package as a tarball
The build.sh we use is essentially a wrapper around `python setup.py build`, but it also manually copies in some of our dependent libraries into the resulting tarball and messes with some rpaths.
The entrypoint file `builder/conda/build_conda.sh` is complicated because
* It works for Linux, MacOS and Windows
* The mac builds used to create their own environments, since they all used to be on the same machine. There's now a lot of extra logic to handle conda envs. This extra machinery could be removed
* It used to handle testing too, which adds more logic messing with python environments too. This extra machinery could be removed.
## Manywheels (linux pip and libtorch packages)
Manywheels are pip packages for linux distros. Note that these manywheels are not actually manylinux compliant.
`builder/manywheel/build_cpu.sh` and `builder/manywheel/build.sh` (for CUDA builds) just set different env vars and then call into `builder/manywheel/build_common.sh`
The entrypoint file `builder/manywheel/build_common.sh` is really really complicated because
* This used to handle building for several different python versions at the same time. The loops have been removed, but there's still unnecessary folders and movements here and there.
* The script is never used this way anymore. This extra machinery could be removed.
* This used to handle testing the pip packages too. This is why there's testing code at the end that messes with python installations and stuff
* The script is never used this way anymore. This extra machinery could be removed.
* This also builds libtorch packages
* This should really be separate. libtorch packages are c++ only and have no python. They should not share infra with all the python specific stuff in this file.
* There is a lot of messing with rpaths. This is necessary, but could be made much much simpler if the above issues were fixed.
## Wheels (MacOS pip and libtorch packages)
The entrypoint file `builder/wheel/build_wheel.sh` is complicated because
* The mac builds used to all run on one machine (we didn't have autoscaling mac machines till circleci). So this script handled siloing itself by setting up and tearing down its build env and siloing itself into its own build directory.
* The script is never used this way anymore. This extra machinery could be removed.
* This also builds libtorch packages
* Ditto the comment above. This should definitely be separated out.
Note that the MacOS Python wheels are still built in conda environments. Some of the dependencies present during build also come from conda.
## Windows Wheels (Windows pip and libtorch packages)
The entrypoint file `builder/windows/build_pytorch.bat` is complicated because
* This used to handle building for several different python versions at the same time. This is why there are loops everywhere
* The script is never used this way anymore. This extra machinery could be removed.
* This used to handle testing the pip packages too. This is why there's testing code at the end that messes with python installations and stuff
* The script is never used this way anymore. This extra machinery could be removed.
* This also builds libtorch packages
* This should really be separate. libtorch packages are c++ only and have no python. They should not share infra with all the python specific stuff in this file.
Note that the Windows Python wheels are still built in conda environments. Some of the dependencies present during build also come from conda.
## General notes
### Note on run_tests.sh, smoke_test.sh, and check_binary.sh
* These should all be consolidated
* These must run on all OS types: MacOS, Linux, and Windows
* These all run smoke tests at the moment. They inspect the packages some, maybe run a few import statements. They DO NOT run the python tests nor the cpp tests. The idea is that python tests on main and PR merges will catch all breakages. All these tests have to do is make sure the special binary machinery didn't mess anything up.
* There are separate run_tests.sh and smoke_test.sh because one used to be called by the smoke jobs and one used to be called by the binary test jobs (see circleci structure section above). This is still true actually, but these could be united into a single script that runs these checks, given an installed pytorch package.
### Note on libtorch
Libtorch packages are built in the wheel build scripts: manywheel/build_*.sh for linux and build_wheel.sh for mac. There are several things wrong with this
* It's confusing. Most of those scripts deal with python specifics.
* The extra conditionals everywhere severely complicate the wheel build scripts
* The process for building libtorch is different from the official instructions (a plain call to cmake, or a call to a script)
### Note on docker images / Dockerfiles
All linux builds occur in docker images. The docker images are
* pytorch/conda-cuda
* Has ALL CUDA versions installed. The script pytorch/builder/conda/switch_cuda_version.sh sets /usr/local/cuda to a symlink to e.g. /usr/local/cuda-10.0 to enable different CUDA builds
* Also used for cpu builds
* pytorch/manylinux-cuda90
* pytorch/manylinux-cuda100
* Also used for cpu builds
The Dockerfiles are available in pytorch/builder, but there is no circleci job or script to build these docker images, and they cannot be run locally (unless you have the correct local packages/paths). Only Soumith can build them right now.
### General Python
* This is still a good explanation of python installations https://caffe2.ai/docs/faq.html#why-do-i-get-import-errors-in-python-when-i-try-to-use-caffe2
# How to manually rebuild the binaries
tl;dr make a PR that looks like https://github.com/pytorch/pytorch/pull/21159
Sometimes we want to push a change to main and then rebuild all of today's binaries after that change. As of May 30, 2019 there isn't a way to manually run a workflow in the UI. You can manually re-run a workflow, but it will use the exact same git commits as the first run and will not include any changes. So we have to make a PR and then force circleci to run the binary workflow instead of the normal tests. The above PR is an example of how to do this; essentially you copy-paste the binarybuilds workflow steps into the default workflow steps. If you need to point the builder repo to a different commit then you'd need to change https://github.com/pytorch/pytorch/blob/main/.circleci/scripts/binary_checkout.sh#L42-L45 to checkout what you want.
## How to test changes to the binaries via .circleci
Writing PRs that test the binaries is annoying, since the default circleci jobs that run on PRs are not the jobs that you want to run. Likely, changes to the binaries will touch something under .circleci/ and require that .circleci/config.yml be regenerated (.circleci/config.yml controls all .circleci behavior, and is generated using `.circleci/regenerate.sh` in python 3.7). But you also need to manually hardcode the binary jobs that you want to test into the .circleci/config.yml workflow, so you should actually make at least two commits, one for your changes and one to temporarily hardcode jobs. See https://github.com/pytorch/pytorch/pull/22928 as an example of how to do this.
```sh
# Make your changes
touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml
# Regenerate the yaml, has to be in python 3.7
.circleci/regenerate.sh
# Make a commit
git add .circleci *
git commit -m "My real changes"
git push origin my_branch
# Now hardcode the jobs that you want in the .circleci/config.yml workflows section
# Also eliminate ensure-consistency and should_run_job checks
# e.g. https://github.com/pytorch/pytorch/commit/2b3344bfed8772fe86e5210cc4ee915dee42b32d
# Make a commit you won't keep
git add .circleci
git commit -m "[DO NOT LAND] testing binaries for above changes"
git push origin my_branch
# Now you need to make some changes to the first commit.
git rebase -i HEAD~2 # mark the first commit as 'edit'
# Make the changes
touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml
.circleci/regenerate.sh
# Amend the commit and continue the rebase
git add .circleci
git commit --amend
git rebase --continue
# Update the PR, need to force since the commits are different now
git push origin my_branch --force
```
The advantage of this flow is that you can make new changes to the base commit and regenerate the .circleci without having to re-write which binary jobs you want to test on. The downside is that all updates will be force pushes.
## How to build a binary locally
### Linux
You can build Linux binaries locally easily using docker.
```sh
# Run the docker
# Use the correct docker image, pytorch/conda-cuda used here as an example
#
# -v path/to/foo:path/to/bar makes path/to/foo on your local machine (the
# machine that you're running the command on) accessible to the docker
# container at path/to/bar. So if you then run `touch path/to/bar/baz`
# in the docker container then you will see path/to/foo/baz on your local
# machine. You could also clone the pytorch and builder repos in the docker.
#
# If you know how, add ccache as a volume too and speed up everything
docker run \
-v your/pytorch/repo:/pytorch \
-v your/builder/repo:/builder \
-v where/you/want/packages/to/appear:/final_pkgs \
-it pytorch/conda-cuda /bin/bash
# Export whatever variables are important to you. All variables that you'd
# possibly need are in .circleci/scripts/binary_populate_env.sh
# You should probably always export at least these 3 variables
export PACKAGE_TYPE=conda
export DESIRED_PYTHON=3.7
export DESIRED_CUDA=cpu
# Call the entrypoint
# `|& tee foo.log` just copies all stdout and stderr output to foo.log
# The builds generate lots of output so you probably need this when
# building locally.
/builder/conda/build_pytorch.sh |& tee build_output.log
```
**Building CUDA binaries on docker**
You can build CUDA binaries on CPU-only machines, but you can only run CUDA binaries on CUDA machines. This means that you can build a CUDA binary in a docker container on your laptop if you so choose (though it's gonna take a long time).
For Facebook employees, ask about beefy machines that have docker support and use those instead of your laptop; it will be 5x as fast.
### MacOS
There's no easy way to generate reproducible hermetic MacOS environments. If you have a Mac laptop then you can try emulating the .circleci environments as much as possible, but you probably have packages in /usr/local/, possibly installed by brew, that will probably interfere with the build. If you're trying to repro an error on a Mac build in .circleci and you can't seem to repro locally, then my best advice is actually to iterate on .circleci :/
But if you want to try, then I'd recommend
```sh
# Create a new terminal
# Clear your LD_LIBRARY_PATH and trim as much out of your PATH as you
# know how to do
# Install a new miniconda
# First remove any other python or conda installation from your PATH
# Always install miniconda 3, even if building for Python <3
new_conda="~/my_new_conda"
conda_sh="$new_conda/install_miniconda.sh"
curl -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh
chmod +x "$conda_sh"
"$conda_sh" -b -p "$MINICONDA_ROOT"
rm -f "$conda_sh"
export PATH="~/my_new_conda/bin:$PATH"
# Create a clean python env
# All MacOS builds use conda to manage the python env and dependencies
# that are built with, even the pip packages
conda create -yn binary python=2.7
conda activate binary
# Export whatever variables are important to you. All variables that you'd
# possibly need are in .circleci/scripts/binary_populate_env.sh
# You should probably always export at least these 3 variables
export PACKAGE_TYPE=conda
export DESIRED_PYTHON=3.7
export DESIRED_CUDA=cpu
# Call the entrypoint you want
path/to/builder/wheel/build_wheel.sh
```
N.B. installing a brand new miniconda is important. This has to do with how conda installations work. See the “General Python” section above, but the tl;dr is that
1. You make the conda command accessible by prepending `path/to/conda_root/bin` to your PATH.
2. You make a new env and activate it, which then also gets prepended to your PATH. Now you have `path/to/conda_root/envs/new_env/bin:path/to/conda_root/bin:$PATH`
3. Now say you (or some code that you ran) call python executable `foo`
1. if you installed `foo` in `new_env`, then `path/to/conda_root/envs/new_env/bin/foo` will get called, as expected.
2. But if you forgot to install `foo` in `new_env` but happened to have previously installed it in your root conda env (called base), then unix/linux will still find `path/to/conda_root/bin/foo` . This is dangerous, since `foo` can be a different version than you want; `foo` can even be for an incompatible python version!
Newer conda versions and proper python hygiene can prevent this, but just install a new miniconda to be safe.
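The lookup order above can be sketched concretely (hypothetical paths and a hypothetical executable name `foo`):

```python
import os
import shutil

# Hypothetical layout: new_env's bin dir shadows the root conda bin dir.
os.environ["PATH"] = (
    "path/to/conda_root/envs/new_env/bin:path/to/conda_root/bin:"
    + os.environ["PATH"]
)

# Resolves to new_env's foo if installed there; otherwise the search silently
# falls through to path/to/conda_root/bin/foo -- the dangerous fallback above.
print(shutil.which("foo"))
```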
### Windows
TODO: fill in
The PyTorch migration from CircleCI to GitHub Actions has been completed. All continuous integration & deployment workflows are defined in the `.github/workflows` folder.

View File

@ -42,7 +42,6 @@ misc-*,
-misc-non-private-member-variables-in-classes,
-misc-confusable-identifiers,
modernize-*,
-modernize-concat-nested-namespaces,
-modernize-macro-to-enum,
-modernize-return-braced-init-list,
-modernize-use-auto,

.flake8
View File

@ -2,7 +2,7 @@
# NOTE: **Mirror any changes** to this file the [tool.ruff] config in pyproject.toml
# before we can fully move to use ruff
enable-extensions = G
select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2
select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2,TOR9
max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead
@ -27,6 +27,9 @@ ignore =
# TODO(kit1980): fix all TOR102 issues
# `torch.load` without `weights_only` parameter is unsafe
TOR102,
# TODO(kit1980): resolve all TOR003 issues
# pass `use_reentrant` explicitly to `checkpoint`.
TOR003
per-file-ignores =
__init__.py: F401
test/**: F821
@ -34,6 +37,23 @@ per-file-ignores =
torch/utils/cpp_extension.py: B950
torchgen/api/types/__init__.py: F401,F403
torchgen/executorch/api/types/__init__.py: F401,F403
test/dynamo/test_higher_order_ops.py: B950
torch/testing/_internal/dynamo_test_failures.py: B950
# TOR901 is only for test, we want to ignore it for everything else.
# It's not easy to configure this without affecting other per-file-ignores,
# so we explicitly list every file where it's violated outside of test.
torch/__init__.py: F401,TOR901
torch/_custom_op/impl.py: TOR901
torch/_export/serde/upgrade.py: TOR901
torch/_functorch/vmap.py: TOR901
torch/_inductor/test_operators.py: TOR901
torch/_library/abstract_impl.py: TOR901
torch/_meta_registrations.py: TOR901
torch/_prims/__init__.py: F401,TOR901
torch/_prims/rng_prims.py: TOR901
torch/ao/quantization/fx/_decomposed.py: TOR901
torch/distributed/_functional_collectives.py: TOR901
torch/distributed/_spmd/data_parallel.py: TOR901
optional-ascii-coding = True
exclude =
./.git,

View File

@ -20,6 +20,7 @@ self-hosted-runner:
- bm-runner
- linux.rocm.gpu
- macos-m1-12
- macos-m1-stable
- macos-m1-13
- macos-12-xl
- macos-12

View File

@ -26,11 +26,20 @@ outputs:
description: True if the filtered test configs matrix is empty. False otherwise.
value: ${{ steps.filter.outputs.is-test-matrix-empty }}
keep-going:
description: True if keep-going label was on PR.
description: True if keep-going label was on PR or [keep-going] in PR body.
value: ${{ steps.filter.outputs.keep-going }}
reenabled-issues:
description: Comma separated list of issue numbers that should correspond to disable test issues that the PR fixes
value: ${{ steps.filter.outputs.reenabled-issues }}
ci-verbose-test-logs:
description: True if ci-verbose-test-logs label was on PR or [ci-verbose-test-logs] in PR body.
value: ${{ steps.filter.outputs.ci-verbose-test-logs }}
ci-no-test-timeout:
description: True if ci-no-test-timeout label was on PR or [ci-no-test-timeout] in PR body.
value: ${{ steps.filter.outputs.ci-no-test-timeout }}
ci-no-td:
description: True if ci-no-td label was on PR or [ci-no-td] in PR body.
value: ${{ steps.filter.outputs.ci-no-td }}
runs:
using: composite

View File

@ -6,7 +6,6 @@ reviewers:
- albanD
- miladm
- bdhirsh
- voznesenskym
per_author:
symbolic-shapes:

View File

@ -1 +1 @@
b2d9c3e315405f2b5cfdfa5b93f849d5b27a4109
5286f9f60d8647fb4a490cdf22eac39a54e63a80

View File

@ -1 +1 @@
a00a72b1ee41483407717379fb5cafe992de2f82
a52607ece94aedbe41107617ace22a8da91efc25

View File

@ -1 +1 @@
2990cb38c17e06d0dbe25437674ca40130d76a8f
fba464b199559f61faa720de8bf64cf955cfdce7

View File

@ -4,6 +4,6 @@ mkl-include=2022.1.0
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0
requests=2.28.1
setuptools=65.5.0
requests=2.31.0
setuptools=68.2.2
typing-extensions=4.3.0

View File

@ -3,6 +3,6 @@ cmake=3.22.1
ninja=1.10.2
numpy=1.23.3
pyyaml=6.0
requests=2.28.1
setuptools=63.4.1
requests=2.31.0
setuptools=68.2.2
typing-extensions=4.3.0

View File

@ -16,7 +16,6 @@ pytest==7.3.2
pytest-xdist==3.3.1
pytest-rerunfailures==10.3
pytest-flakefinder==1.1.0
pytest-shard==0.1.2
scipy==1.10.1
sympy==1.11.1
unittest-xml-reporting<=3.2.0,>=2.0.0

View File

@ -109,7 +109,7 @@ def build_triton(
print("source:\n path: .\n", file=meta)
print(
"build:\n string: py{{py}}\n number: 1\n script: cd python; "
"python setup.py install --single-version-externally-managed --record=record.txt\n",
"python setup.py install --record=record.txt\n",
" script_env:\n - MAX_JOBS\n",
file=meta,
)

.github/scripts/cherry_pick.py vendored Executable file
View File

@ -0,0 +1,223 @@
#!/usr/bin/env python3
import json
import os
import re
from typing import Any, Optional
from urllib.error import HTTPError
from github_utils import gh_fetch_url, gh_post_pr_comment
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from trymerge import get_pr_commit_sha, GitHubPR
# This is only a suggestion for now, not a strict requirement
REQUIRES_ISSUE = {
"regression",
"critical",
"fixnewfeature",
}
def parse_args() -> Any:
from argparse import ArgumentParser
parser = ArgumentParser("cherry pick a landed PR onto a release branch")
parser.add_argument(
"--onto-branch", type=str, required=True, help="the target release branch"
)
parser.add_argument(
"--github-actor", type=str, required=True, help="all the worlds a stage"
)
parser.add_argument(
"--classification",
choices=["regression", "critical", "fixnewfeature", "docs", "release"],
required=True,
help="the cherry pick category",
)
parser.add_argument("pr_num", type=int)
parser.add_argument(
"--fixes",
type=str,
default="",
help="the GitHub issue that the cherry pick fixes",
)
parser.add_argument("--dry-run", action="store_true")
return parser.parse_args()
def get_merge_commit_sha(repo: GitRepo, pr: GitHubPR) -> Optional[str]:
"""
Return the merge commit SHA iff the PR has been merged. For simplicity, we
will only cherry pick PRs that have been merged into main
"""
commit_sha = get_pr_commit_sha(repo, pr)
return commit_sha if pr.is_closed() else None
def cherry_pick(
github_actor: str,
repo: GitRepo,
pr: GitHubPR,
commit_sha: str,
onto_branch: str,
classification: str,
fixes: str,
dry_run: bool = False,
) -> None:
"""
Create a local branch to cherry pick the commit and submit it as a pull request
"""
current_branch = repo.current_branch()
cherry_pick_branch = create_cherry_pick_branch(
github_actor, repo, pr, commit_sha, onto_branch
)
try:
if not dry_run:
org, project = repo.gh_owner_and_name()
cherry_pick_pr = submit_pr(repo, pr, cherry_pick_branch, onto_branch)
msg = f"The cherry pick PR is at {cherry_pick_pr}"
if fixes:
msg += f" and it is linked with issue {fixes}"
elif classification in REQUIRES_ISSUE:
msg += f" and it is recommended to link a {classification} cherry pick PR with an issue"
post_comment(org, project, pr.pr_num, msg)
finally:
if current_branch:
repo.checkout(branch=current_branch)
def create_cherry_pick_branch(
github_actor: str, repo: GitRepo, pr: GitHubPR, commit_sha: str, onto_branch: str
) -> str:
"""
Create a local branch and cherry pick the commit. Return the name of the local
cherry picking branch.
"""
repo.checkout(branch=onto_branch)
repo._run_git("submodule", "update", "--init", "--recursive")
# Remove all special characters if we want to include the actor in the branch name
github_actor = re.sub("[^0-9a-zA-Z]+", "_", github_actor)
cherry_pick_branch = f"cherry-pick-{pr.pr_num}-by-{github_actor}"
repo.create_branch_and_checkout(branch=cherry_pick_branch)
# We might want to support ghstack later
repo._run_git("cherry-pick", "-x", "-X", "theirs", commit_sha)
repo.push(branch=cherry_pick_branch, dry_run=False)
return cherry_pick_branch
def submit_pr(
repo: GitRepo,
pr: GitHubPR,
cherry_pick_branch: str,
onto_branch: str,
) -> str:
"""
Submit the cherry pick PR and return the link to the PR
"""
org, project = repo.gh_owner_and_name()
default_msg = f"Cherry pick #{pr.pr_num} onto {onto_branch} branch"
title = pr.info.get("title", default_msg)
body = pr.info.get("body", default_msg)
try:
response = gh_fetch_url(
f"https://api.github.com/repos/{org}/{project}/pulls",
method="POST",
data={
"title": title,
"body": body,
"head": cherry_pick_branch,
"base": onto_branch,
},
headers={"Accept": "application/vnd.github.v3+json"},
reader=json.load,
)
cherry_pick_pr = response.get("html_url", "")
if not cherry_pick_pr:
raise RuntimeError(
f"Fail to find the cherry pick PR: {json.dumps(response)}"
)
return str(cherry_pick_pr)
except HTTPError as error:
msg = f"Fail to submit the cherry pick PR: {error}"
raise RuntimeError(msg) from error
def post_comment(org: str, project: str, pr_num: int, msg: str) -> None:
"""
Post a comment on the original PR pointing to the cherry-pick PR on success,
or surface the error on failure
"""
internal_debugging = ""
run_url = os.getenv("GH_RUN_URL")
# Post a comment to tell folks that the PR is being cherry picked
if run_url is not None:
internal_debugging = "\n".join(
line
for line in (
"<details><summary>Details for Dev Infra team</summary>",
f'Raised by <a href="{run_url}">workflow job</a>\n',
"</details>",
)
if line
)
comment = "\n".join(
(f"### Cherry picking #{pr_num}", f"{msg}", "", f"{internal_debugging}")
)
gh_post_pr_comment(org, project, pr_num, comment)
def main() -> None:
args = parse_args()
pr_num = args.pr_num
repo = GitRepo(get_git_repo_dir(), get_git_remote_name())
org, project = repo.gh_owner_and_name()
pr = GitHubPR(org, project, pr_num)
try:
commit_sha = get_merge_commit_sha(repo, pr)
if not commit_sha:
raise RuntimeError(
f"Refuse to cherry pick #{pr_num} because it hasn't been merged yet"
)
cherry_pick(
args.github_actor,
repo,
pr,
commit_sha,
args.onto_branch,
args.classification,
args.fixes,
args.dry_run,
)
except RuntimeError as error:
if not args.dry_run:
post_comment(org, project, pr_num, str(error))
else:
raise error
if __name__ == "__main__":
main()
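# Example invocation (the PR number and release branch below are hypothetical):
#   python3 .github/scripts/cherry_pick.py --onto-branch release/2.2 \
#       --classification regression --github-actor "$GITHUB_ACTOR" 12345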

274
.github/scripts/delete_old_branches.py vendored Normal file
View File

@ -0,0 +1,274 @@
# Delete old branches
import os
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Dict, List, Set
from github_utils import gh_fetch_json_dict, gh_graphql
from gitutils import GitRepo
SEC_IN_DAY = 24 * 60 * 60
CLOSED_PR_RETENTION = 30 * SEC_IN_DAY
NO_PR_RETENTION = 1.5 * 365 * SEC_IN_DAY
PR_WINDOW = 90 * SEC_IN_DAY # Set to None to look at all PRs (may take a lot of tokens)
REPO_OWNER = "pytorch"
REPO_NAME = "pytorch"
ESTIMATED_TOKENS = [0]
TOKEN = os.environ["GITHUB_TOKEN"]
if not TOKEN:
raise Exception("GITHUB_TOKEN is not set")
REPO_ROOT = Path(__file__).parent.parent.parent
# Query for all PRs instead of just closed/merged because it's faster
GRAPHQL_ALL_PRS_BY_UPDATED_AT = """
query ($owner: String!, $repo: String!, $cursor: String) {
repository(owner: $owner, name: $repo) {
pullRequests(
first: 100
after: $cursor
orderBy: {field: UPDATED_AT, direction: DESC}
) {
totalCount
pageInfo {
hasNextPage
endCursor
}
nodes {
headRefName
number
updatedAt
state
}
}
}
}
"""
GRAPHQL_OPEN_PRS = """
query ($owner: String!, $repo: String!, $cursor: String) {
repository(owner: $owner, name: $repo) {
pullRequests(
first: 100
after: $cursor
states: [OPEN]
) {
totalCount
pageInfo {
hasNextPage
endCursor
}
nodes {
headRefName
number
updatedAt
state
}
}
}
}
"""
GRAPHQL_NO_DELETE_BRANCH_LABEL = """
query ($owner: String!, $repo: String!, $cursor: String) {
repository(owner: $owner, name: $repo) {
label(name: "no-delete-branch") {
pullRequests(first: 100, after: $cursor) {
totalCount
pageInfo {
hasNextPage
endCursor
}
nodes {
headRefName
number
updatedAt
state
}
}
}
}
}
"""
def is_protected(branch: str) -> bool:
try:
ESTIMATED_TOKENS[0] += 1
res = gh_fetch_json_dict(
f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/branches/{branch}"
)
return bool(res["protected"])
except Exception as e:
print(f"[{branch}] Failed to fetch branch protections: {e}")
return True
def convert_gh_timestamp(date: str) -> float:
return datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ").timestamp()
def get_branches(repo: GitRepo) -> Dict[str, Any]:
# Query locally for branches, group by branch base name (e.g. gh/blah/base -> gh/blah), and get the most recent branch
git_response = repo._run_git(
"for-each-ref",
"--sort=creatordate",
"--format=%(refname) %(committerdate:iso-strict)",
"refs/remotes/origin",
)
branches_by_base_name: Dict[str, Any] = {}
for line in git_response.splitlines():
branch, date = line.split(" ")
re_branch = re.match(r"refs/remotes/origin/(.*)", branch)
assert re_branch
branch = branch_base_name = re_branch.group(1)
if x := re.match(r"(gh\/.+)\/(head|base|orig)", branch):
branch_base_name = x.group(1)
date = datetime.fromisoformat(date).timestamp()
if branch_base_name not in branches_by_base_name:
branches_by_base_name[branch_base_name] = [date, [branch]]
else:
branches_by_base_name[branch_base_name][1].append(branch)
if date > branches_by_base_name[branch_base_name][0]:
branches_by_base_name[branch_base_name][0] = date
return branches_by_base_name
def paginate_graphql(
query: str,
kwargs: Dict[str, Any],
termination_func: Callable[[List[Dict[str, Any]]], bool],
get_data: Callable[[Dict[str, Any]], List[Dict[str, Any]]],
get_page_info: Callable[[Dict[str, Any]], Dict[str, Any]],
) -> List[Any]:
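# Follow GraphQL cursor-based pagination: keep requesting pages until the API
# reports no next page or termination_func decides enough data has been fetched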
hasNextPage = True
endCursor = None
data: List[Dict[str, Any]] = []
while hasNextPage:
ESTIMATED_TOKENS[0] += 1
res = gh_graphql(query, cursor=endCursor, **kwargs)
data.extend(get_data(res))
hasNextPage = get_page_info(res)["hasNextPage"]
endCursor = get_page_info(res)["endCursor"]
if termination_func(data):
break
return data
def get_recent_prs() -> Dict[str, Any]:
now = datetime.now().timestamp()
# Grab all PRs updated within the last PR_WINDOW days
pr_infos: List[Dict[str, Any]] = paginate_graphql(
GRAPHQL_ALL_PRS_BY_UPDATED_AT,
{"owner": "pytorch", "repo": "pytorch"},
lambda data: (
PR_WINDOW is not None
and (now - convert_gh_timestamp(data[-1]["updatedAt"]) > PR_WINDOW)
),
lambda res: res["data"]["repository"]["pullRequests"]["nodes"],
lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"],
)
# Get the most recent PR for each branch base (group gh together)
prs_by_branch_base = {}
for pr in pr_infos:
pr["updatedAt"] = convert_gh_timestamp(pr["updatedAt"])
branch_base_name = pr["headRefName"]
if x := re.match(r"(gh\/.+)\/(head|base|orig)", branch_base_name):
branch_base_name = x.group(1)
if branch_base_name not in prs_by_branch_base:
prs_by_branch_base[branch_base_name] = pr
else:
if pr["updatedAt"] > prs_by_branch_base[branch_base_name]["updatedAt"]:
prs_by_branch_base[branch_base_name] = pr
return prs_by_branch_base
def get_branches_with_magic_label_or_open_pr() -> Set[str]:
pr_infos: List[Dict[str, Any]] = paginate_graphql(
GRAPHQL_NO_DELETE_BRANCH_LABEL,
{"owner": "pytorch", "repo": "pytorch"},
lambda data: False,
lambda res: res["data"]["repository"]["label"]["pullRequests"]["nodes"],
lambda res: res["data"]["repository"]["label"]["pullRequests"]["pageInfo"],
)
pr_infos.extend(
paginate_graphql(
GRAPHQL_OPEN_PRS,
{"owner": "pytorch", "repo": "pytorch"},
lambda data: False,
lambda res: res["data"]["repository"]["pullRequests"]["nodes"],
lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"],
)
)
# Collect the branch base of every matching PR (group ghstack branches together)
branch_bases = set()
for pr in pr_infos:
branch_base_name = pr["headRefName"]
if x := re.match(r"(gh\/.+)\/(head|base|orig)", branch_base_name):
branch_base_name = x.group(1)
branch_bases.add(branch_base_name)
return branch_bases
def delete_branch(repo: GitRepo, branch: str) -> None:
repo._run_git("push", "origin", "-d", branch)
def delete_branches() -> None:
now = datetime.now().timestamp()
git_repo = GitRepo(str(REPO_ROOT), "origin", debug=True)
branches = get_branches(git_repo)
prs_by_branch = get_recent_prs()
keep_branches = get_branches_with_magic_label_or_open_pr()
delete = []
# Do not delete if:
# * associated PR is open, closed but updated recently, or has the no-delete-branch label
# * no associated PR and branch was updated in last 1.5 years
# * is protected
# Setting different values of PR_WINDOW will change how branches with closed
# PRs are treated depending on how old the branch is. The default value of
# 90 will allow branches with closed PRs to be deleted if the PR hasn't been
# updated in 90 days and the branch hasn't been updated in 1.5 years
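# Example: a closed PR last updated 45 days ago makes its branch deletable
# (45 > 30-day retention, provided the branch itself is also 30+ days old),
# while one last updated 120 days ago falls outside PR_WINDOW, is treated as
# having no PR, and keeps its branch until the branch is 1.5 years old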
for base_branch, (date, sub_branches) in branches.items():
print(f"[{base_branch}] Updated {(now - date) / SEC_IN_DAY} days ago")
if base_branch in keep_branches:
print(f"[{base_branch}] Has magic label or open PR, skipping")
continue
pr = prs_by_branch.get(base_branch)
if pr:
print(
f"[{base_branch}] Has PR {pr['number']}: {pr['state']}, updated {(now - pr['updatedAt']) / SEC_IN_DAY} days ago"
)
if (
now - pr["updatedAt"] < CLOSED_PR_RETENTION
or (now - date) < CLOSED_PR_RETENTION
):
continue
elif now - date < NO_PR_RETENTION:
continue
print(f"[{base_branch}] Checking for branch protections")
if any(is_protected(sub_branch) for sub_branch in sub_branches):
print(f"[{base_branch}] Is protected")
continue
for sub_branch in sub_branches:
print(f"[{base_branch}] Deleting {sub_branch}")
delete.append(sub_branch)
if ESTIMATED_TOKENS[0] > 400:
print("Estimated tokens exceeded, exiting")
break
print(f"To delete ({len(delete)}):")
for branch in delete:
print(f"About to delete branch {branch}")
delete_branch(git_repo, branch)
if __name__ == "__main__":
delete_branches()

View File

@ -1,139 +0,0 @@
import os
import re
import sys
from typing import Any, cast, Dict, List, NamedTuple, Tuple
import rockset # type: ignore[import]
from gitutils import _check_output
def eprint(msg: str) -> None:
print(msg, file=sys.stderr)
class WorkflowCheck(NamedTuple):
workflowName: str
name: str
jobName: str
conclusion: str
def get_latest_commits() -> List[str]:
latest_viable_commit = _check_output(
[
"git",
"log",
"-n",
"1",
"--pretty=format:%H",
"origin/viable/strict",
],
encoding="ascii",
)
commits = _check_output(
[
"git",
"rev-list",
f"{latest_viable_commit}^..HEAD",
"--remotes=*origin/main",
],
encoding="ascii",
).splitlines()
return commits
def query_commits(commits: List[str]) -> List[Dict[str, Any]]:
rs = rockset.RocksetClient(
host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
)
params = [{"name": "shas", "type": "string", "value": ",".join(commits)}]
res = rs.QueryLambdas.execute_query_lambda(
# https://console.rockset.com/lambdas/details/commons.commit_jobs_batch_query
query_lambda="commit_jobs_batch_query",
version="19c74e10819104f9",
workspace="commons",
parameters=params,
)
return cast(List[Dict[str, Any]], res.results)
def print_commit_status(commit: str, results: Dict[str, Any]) -> None:
print(commit)
for check in results["results"]:
if check["sha"] == commit:
print(f"\t{check['conclusion']:>10}: {check['name']}")
def get_commit_results(
commit: str, results: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
workflow_checks = []
for check in results:
if check["sha"] == commit:
workflow_checks.append(
WorkflowCheck(
workflowName=check["workflowName"],
name=check["name"],
jobName=check["jobName"],
conclusion=check["conclusion"],
)._asdict()
)
return workflow_checks
def isGreen(commit: str, results: List[Dict[str, Any]]) -> Tuple[bool, str]:
workflow_checks = get_commit_results(commit, results)
regex = {
"pull": False,
"trunk": False,
"lint": False,
"linux-binary": False,
}
for check in workflow_checks:
jobName = check["jobName"]
# Ignore result from unstable job, be it success or failure
if "unstable" in jobName:
continue
workflowName = check["workflowName"]
conclusion = check["conclusion"]
for required_check in regex:
if re.match(required_check, workflowName, flags=re.IGNORECASE):
if conclusion not in ["success", "skipped"]:
return (False, workflowName + " checks were not successful")
else:
regex[required_check] = True
missing_workflows = [x for x in regex.keys() if not regex[x]]
if len(missing_workflows) > 0:
return (False, "missing required workflows: " + ", ".join(missing_workflows))
return (True, "")
def get_latest_green_commit(commits: List[str], results: List[Dict[str, Any]]) -> Any:
for commit in commits:
eprint(f"Checking {commit}")
is_green, msg = isGreen(commit, results)
if is_green:
eprint("GREEN")
return commit
else:
eprint("RED: " + msg)
return None
def main() -> None:
commits = get_latest_commits()
results = query_commits(commits)
latest_viable_commit = get_latest_green_commit(commits, results)
print(latest_viable_commit)
if __name__ == "__main__":
main()

View File

@ -474,6 +474,10 @@ def get_reenabled_issues(pr_body: str = "") -> List[str]:
return parse_reenabled_issues(pr_body) + parse_reenabled_issues(commit_messages)
def check_for_setting(labels: Set[str], body: str, setting: str) -> bool:
return setting in labels or f"[{setting}]" in body
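# For example, check_for_setting({"keep-going"}, "", "keep-going") is True, and
# a PR body containing the literal text "[keep-going]" also enables the setting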
def perform_misc_tasks(
labels: Set[str], test_matrix: Dict[str, List[Any]], job_name: str, pr_body: str
) -> None:
@ -481,7 +485,15 @@ def perform_misc_tasks(
In addition to applying the filter logic, the script also does the following
misc tasks to set keep-going and is-unstable variables
"""
set_output("keep-going", "keep-going" in labels)
set_output("keep-going", check_for_setting(labels, pr_body, "keep-going"))
set_output(
"ci-verbose-test-logs",
check_for_setting(labels, pr_body, "ci-verbose-test-logs"),
)
set_output(
"ci-no-test-timeout", check_for_setting(labels, pr_body, "ci-no-test-timeout")
)
set_output("ci-no-td", check_for_setting(labels, pr_body, "ci-no-td"))
# Obviously, if the job name includes unstable, then this is an unstable job
is_unstable = job_name and IssueType.UNSTABLE.value in job_name
@ -577,7 +589,7 @@ def main() -> None:
labels=labels,
test_matrix=filtered_test_matrix,
job_name=args.job_name,
pr_body=pr_body,
pr_body=pr_body if pr_body else "",
)
# Set the filtered test matrix as the output

View File

@ -4,7 +4,7 @@
Will output a condensed version of the matrix. Will include the following:
* CUDA version short
* CUDA full verison
* CUDA full version
* CUDNN version short
* Image type either runtime or devel
* Platform linux/arm64,linux/amd64

View File

@ -119,6 +119,19 @@ def gh_fetch_json_dict(
return cast(Dict[str, Any], _gh_fetch_json_any(url, params, data))
def gh_graphql(query: str, **kwargs: Any) -> Dict[str, Any]:
rc = gh_fetch_url(
"https://api.github.com/graphql",
data={"query": query, "variables": kwargs},
reader=json.load,
)
if "errors" in rc:
raise RuntimeError(
f"GraphQL query {query}, args {kwargs} failed: {rc['errors']}"
)
return cast(Dict[str, Any], rc)
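# For example, gh_graphql(GRAPHQL_OPEN_PRS, owner="pytorch", repo="pytorch")
# sends kwargs as GraphQL variables and raises RuntimeError on GraphQL errors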
def _gh_post_comment(
url: str, comment: str, dry_run: bool = False
) -> List[Dict[str, Any]]:

View File

@ -155,12 +155,19 @@ class GitRepo:
)
return [x.strip() for x in rc.split("\n") if x.strip()] if len(rc) > 0 else []
def current_branch(self) -> str:
return self._run_git("symbolic-ref", "--short", "HEAD").strip()
def current_branch(self) -> Optional[str]:
try:
return self._run_git("symbolic-ref", "--short", "HEAD").strip()
except RuntimeError:
# we are in detached HEAD state
return None
def checkout(self, branch: str) -> None:
self._run_git("checkout", branch)
def create_branch_and_checkout(self, branch: str) -> None:
self._run_git("checkout", "-b", branch)
def fetch(self, ref: Optional[str] = None, branch: Optional[str] = None) -> None:
if branch is None and ref is None:
self._run_git("fetch", self.remote)
@ -273,6 +280,7 @@ class GitRepo:
def cherry_pick_commits(self, from_branch: str, to_branch: str) -> None:
orig_branch = self.current_branch()
assert orig_branch is not None, "Must be on a branch"
self.checkout(to_branch)
from_commits, to_commits = self.compute_branch_diffs(from_branch, to_branch)
if len(from_commits) == 0:

Binary file not shown.

View File

@ -74,15 +74,23 @@ def gh_get_labels(org: str, repo: str) -> List[str]:
def gh_add_labels(
org: str, repo: str, pr_num: int, labels: Union[str, List[str]]
org: str, repo: str, pr_num: int, labels: Union[str, List[str]], dry_run: bool
) -> None:
if dry_run:
print(f"Dryrun: Adding labels {labels} to PR {pr_num}")
return
gh_fetch_url_and_headers(
url=f"https://api.github.com/repos/{org}/{repo}/issues/{pr_num}/labels",
data={"labels": labels},
)
def gh_remove_label(org: str, repo: str, pr_num: int, label: str) -> None:
def gh_remove_label(
org: str, repo: str, pr_num: int, label: str, dry_run: bool
) -> None:
if dry_run:
print(f"Dryrun: Removing {label} from PR {pr_num}")
return
gh_fetch_url_and_headers(
url=f"https://api.github.com/repos/{org}/{repo}/issues/{pr_num}/labels/{label}",
method="DELETE",

43
.github/scripts/lintrunner.sh vendored Executable file
View File

@ -0,0 +1,43 @@
#!/usr/bin/env bash
set -x
# The generic Linux job chooses to use base env, not the one set up by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
CACHE_DIRECTORY="/tmp/.lintbin"
# Try to recover the cached binaries
if [[ -d "${CACHE_DIRECTORY}" ]]; then
# It's ok to fail this as lintrunner init would download these binaries
# again if they do not exist
cp -r "${CACHE_DIRECTORY}" . || true
fi
# This has already been cached in the docker image
lintrunner init 2> /dev/null
# Do build steps necessary for linters
if [[ "${CLANG}" == "1" ]]; then
python3 -m tools.linter.clang_tidy.generate_build_files
fi
python3 -m tools.generate_torch_version --is_debug=false
python3 -m tools.pyi.gen_pyi \
--native-functions-path aten/src/ATen/native/native_functions.yaml \
--tags-path aten/src/ATen/native/tags.yaml \
--deprecated-functions-path "tools/autograd/deprecated.yaml"
RC=0
# Run lintrunner on all files
if ! lintrunner --force-color --all-files --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then
echo ""
echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner -m origin/main\`. (If you don't get the same results, run \'lintrunner init\' to update your local linter)\e[0m"
echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"
RC=1
fi
# Use jq to massage the JSON lint output into GitHub Actions workflow commands.
jq --raw-output \
'"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \
lint.json || true
exit $RC

51
.github/scripts/s390x-ci/README.md vendored Normal file
View File

@ -0,0 +1,51 @@
# Configuring the builder.
## Install prerequisites.
```
$ sudo dnf install docker
```
## Add services.
```
$ sudo cp self-hosted-builder/*.service /etc/systemd/system/
$ sudo systemctl daemon-reload
```
## Download qemu-user-static image
```
$ sudo docker pull docker.io/iiilinuxibmcom/qemu-user-static:6.1.0-1
```
## Autostart the x86_64 emulation support.
```
$ sudo systemctl enable --now qemu-user-static
```
## Rebuild the image
In order to build or update the `iiilinuxibmcom/actions-runner` image, e.g. to get the
latest OS security fixes, use the following commands:
```
$ cd self-hosted-builder
$ sudo docker build \
--build-arg repo=<owner>/<name> \
--build-arg token=<***> \
--pull \
-f actions-runner.Dockerfile \
-t iiilinuxibmcom/actions-runner \
.
```
If it fails, ensure that SELinux doesn't prevent it from working.
In the worst case, SELinux enforcement can be disabled with `setenforce 0`.
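For example, to check the current SELinux mode and relax enforcement until the next reboot:
```
$ getenforce
$ sudo setenforce 0
```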
## Autostart the runner.
```
$ sudo systemctl enable --now actions-runner@$NAME
```
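Here `$NAME` is a label of your choice for the runner instance; for a hypothetical runner named `runner1`:
```
$ sudo systemctl enable --now actions-runner@runner1
```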

View File

@ -0,0 +1,66 @@
# Self-Hosted IBM Z Github Actions Runner.
# Temporary image: amd64 dependencies.
FROM docker.io/amd64/ubuntu:22.04 as ld-prefix
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get -y install ca-certificates libicu70 libssl3
# Main image.
FROM docker.io/s390x/ubuntu:22.04
# Packages for pytorch building and testing.
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get -y install \
cmake \
curl \
gcc \
git \
jq \
libxml2-dev \
libxslt-dev \
ninja-build \
python-is-python3 \
python3 \
python3-dev \
python3-pip \
pybind11-dev \
python3-numpy \
libopenblas-dev \
liblapack-dev \
libgloo-dev \
python3-yaml \
python3-scipy \
virtualenv
# amd64 dependencies.
COPY --from=ld-prefix / /usr/x86_64-linux-gnu/
RUN ln -fs ../lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 /usr/x86_64-linux-gnu/lib64/
RUN ln -fs /etc/resolv.conf /usr/x86_64-linux-gnu/etc/
ENV QEMU_LD_PREFIX=/usr/x86_64-linux-gnu
# Scripts.
COPY fs/ /
RUN chmod +x /usr/bin/actions-runner /usr/bin/entrypoint
# amd64 Github Actions Runner.
RUN useradd -m actions-runner
USER actions-runner
WORKDIR /home/actions-runner
RUN curl -L https://github.com/actions/runner/releases/download/v2.309.0/actions-runner-linux-x64-2.309.0.tar.gz | tar -xz
# repository
ARG repo
# repository token
ARG token
RUN ./config.sh \
--unattended \
--url "https://github.com/${repo}" \
--token "${token}" \
--no-default-labels \
--labels self-hosted,linux.s390x
ENTRYPOINT ["/usr/bin/entrypoint"]
CMD ["/usr/bin/actions-runner"]

View File

@ -0,0 +1,22 @@
[Unit]
Description=Self-Hosted IBM Z Github Actions Runner
Wants=qemu-user-static
After=qemu-user-static
StartLimitIntervalSec=0
[Service]
Type=simple
Restart=always
ExecStartPre=-/usr/bin/docker rm --force actions-runner.%i
ExecStart=/usr/bin/docker run \
--init \
--interactive \
--name=actions-runner.%i \
--rm \
iiilinuxibmcom/actions-runner
ExecStop=/bin/sh -c "docker exec actions-runner.%i kill -INT -- -1"
ExecStop=/bin/sh -c "docker wait actions-runner.%i"
ExecStop=/bin/sh -c "docker rm actions-runner.%i"
[Install]
WantedBy=multi-user.target

View File

@ -0,0 +1,6 @@
#!/usr/bin/env bash
set -e -u
# Run one job.
./run.sh --once

View File

@ -0,0 +1,30 @@
#!/usr/bin/env bash
#
# Container entrypoint that waits for all spawned processes.
#
set -e -u
# Create a FIFO and start reading from its read end.
tempdir=$(mktemp -d "/tmp/done.XXXXXXXXXX")
trap 'rm -r "$tempdir"' EXIT
done="$tempdir/pipe"
mkfifo "$done"
cat "$done" & waiter=$!
# Start the workload. Its descendants will inherit the FIFO's write end.
status=0
if [ "$#" -eq 0 ]; then
bash 9>"$done" || status=$?
else
"$@" 9>"$done" || status=$?
fi
# When the workload and all of its descendants exit, the FIFO's write end will
# be closed and `cat "$done"` will exit. Wait until it happens. This is needed
# in order to handle SelfUpdater, which the workload may start in background
# before exiting.
wait "$waiter"
exit "$status"

View File

@ -0,0 +1,11 @@
[Unit]
Description=Support for transparent execution of non-native binaries with QEMU user emulation
[Service]
Type=oneshot
# The source code for iiilinuxibmcom/qemu-user-static is at https://github.com/iii-i/qemu-user-static/tree/v6.1.0-1
# TODO: replace it with multiarch/qemu-user-static once version >6.1 is available
ExecStart=/usr/bin/docker run --rm --interactive --privileged docker.io/iiilinuxibmcom/qemu-user-static:6.1.0-1 --reset -p yes
[Install]
WantedBy=multi-user.target

View File

@ -1,148 +0,0 @@
from typing import Any, Dict, List
from unittest import main, mock, TestCase
from fetch_latest_green_commit import isGreen, WorkflowCheck
workflowNames = [
"pull",
"trunk",
"Lint",
"linux-binary-libtorch-pre-cxx11",
"android-tests",
"windows-binary-wheel",
"periodic",
"docker-release-builds",
"nightly",
"pr-labels",
"Close stale pull requests",
"Update S3 HTML indices for download.pytorch.org",
"Create Release",
]
def set_workflow_job_status(
workflow: List[Dict[str, Any]], name: str, status: str
) -> List[Dict[str, Any]]:
for check in workflow:
if check["workflowName"] == name:
check["conclusion"] = status
return workflow
class TestChecks:
def make_test_checks(self) -> List[Dict[str, Any]]:
workflow_checks = []
for i in range(len(workflowNames)):
workflow_checks.append(
WorkflowCheck(
workflowName=workflowNames[i],
name="test/job",
jobName="job",
conclusion="success",
)._asdict()
)
return workflow_checks
class TestPrintCommits(TestCase):
@mock.patch(
"fetch_latest_green_commit.get_commit_results",
return_value=TestChecks().make_test_checks(),
)
def test_all_successful(self, mock_get_commit_results: Any) -> None:
"Test with workflows are successful"
workflow_checks = mock_get_commit_results()
self.assertTrue(isGreen("sha", workflow_checks)[0])
@mock.patch(
"fetch_latest_green_commit.get_commit_results",
return_value=TestChecks().make_test_checks(),
)
def test_necessary_successful(self, mock_get_commit_results: Any) -> None:
"Test with necessary workflows are successful"
workflow_checks = mock_get_commit_results()
workflow_checks = set_workflow_job_status(
workflow_checks, workflowNames[8], "failed"
)
workflow_checks = set_workflow_job_status(
workflow_checks, workflowNames[9], "failed"
)
workflow_checks = set_workflow_job_status(
workflow_checks, workflowNames[10], "failed"
)
workflow_checks = set_workflow_job_status(
workflow_checks, workflowNames[11], "failed"
)
workflow_checks = set_workflow_job_status(
workflow_checks, workflowNames[12], "failed"
)
self.assertTrue(isGreen("sha", workflow_checks)[0])
@mock.patch(
"fetch_latest_green_commit.get_commit_results",
return_value=TestChecks().make_test_checks(),
)
def test_necessary_skipped(self, mock_get_commit_results: Any) -> None:
"Test with necessary job (ex: pull) skipped"
workflow_checks = mock_get_commit_results()
workflow_checks = set_workflow_job_status(workflow_checks, "pull", "skipped")
result = isGreen("sha", workflow_checks)
self.assertTrue(result[0])
@mock.patch(
"fetch_latest_green_commit.get_commit_results",
return_value=TestChecks().make_test_checks(),
)
def test_skippable_skipped(self, mock_get_commit_results: Any) -> None:
"Test with skippable jobs (periodic and docker-release-builds skipped"
workflow_checks = mock_get_commit_results()
workflow_checks = set_workflow_job_status(
workflow_checks, "periodic", "skipped"
)
workflow_checks = set_workflow_job_status(
workflow_checks, "docker-release-builds", "skipped"
)
self.assertTrue(isGreen("sha", workflow_checks))
@mock.patch(
"fetch_latest_green_commit.get_commit_results",
return_value=TestChecks().make_test_checks(),
)
def test_necessary_failed(self, mock_get_commit_results: Any) -> None:
"Test with necessary job (ex: Lint) failed"
workflow_checks = mock_get_commit_results()
workflow_checks = set_workflow_job_status(workflow_checks, "Lint", "failed")
result = isGreen("sha", workflow_checks)
self.assertFalse(result[0])
self.assertEqual(result[1], "Lint checks were not successful")
@mock.patch(
"fetch_latest_green_commit.get_commit_results",
return_value=TestChecks().make_test_checks(),
)
def test_skippable_failed(self, mock_get_commit_results: Any) -> None:
"Test with failing skippable jobs (ex: docker-release-builds) should pass"
workflow_checks = mock_get_commit_results()
workflow_checks = set_workflow_job_status(
workflow_checks, "periodic", "skipped"
)
workflow_checks = set_workflow_job_status(
workflow_checks, "docker-release-builds", "failed"
)
result = isGreen("sha", workflow_checks)
self.assertTrue(result[0])
@mock.patch("fetch_latest_green_commit.get_commit_results", return_value={})
def test_no_workflows(self, mock_get_commit_results: Any) -> None:
"Test with missing workflows"
workflow_checks = mock_get_commit_results()
result = isGreen("sha", workflow_checks)
self.assertFalse(result[0])
self.assertEqual(
result[1],
"missing required workflows: pull, trunk, lint, linux-binary",
)
if __name__ == "__main__":
main()

View File

@ -636,55 +636,108 @@ class TestConfigFilter(TestCase):
@mock.patch("subprocess.check_output")
def test_perform_misc_tasks(self, mocked_subprocess: Any) -> None:
def _gen_expected_string(
keep_going: bool = False,
ci_verbose_test_logs: bool = False,
ci_no_test_timeout: bool = False,
ci_no_td: bool = False,
is_unstable: bool = False,
reenabled_issues: str = "",
) -> str:
return (
f"keep-going={keep_going}\n"
f"ci-verbose-test-logs={ci_verbose_test_logs}\n"
f"ci-no-test-timeout={ci_no_test_timeout}\n"
f"ci-no-td={ci_no_td}\n"
f"is-unstable={is_unstable}\n"
f"reenabled-issues={reenabled_issues}\n"
)
mocked_subprocess.return_value = b""
testcases: List[Dict[str, Any]] = [
{
"labels": {},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "A job name",
"expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n",
"expected": _gen_expected_string(),
"description": "No keep-going, no is-unstable",
},
{
"labels": {"keep-going"},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "A job name",
"expected": "keep-going=True\nis-unstable=False\nreenabled-issues=\n",
"expected": _gen_expected_string(keep_going=True),
"description": "Has keep-going, no is-unstable",
},
{
"labels": {},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "A job name",
"pr_body": "[keep-going]",
"expected": _gen_expected_string(keep_going=True),
"description": "Keep-going in PR body",
},
{
"labels": {"ci-verbose-test-logs"},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "A job name",
"pr_body": "[ci-no-test-timeout]",
"expected": _gen_expected_string(
ci_verbose_test_logs=True, ci_no_test_timeout=True
),
"description": "No pipe logs label and no test timeout in PR body",
},
{
"labels": {"ci-no-test-timeout"},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "A job name",
"pr_body": "[ci-verbose-test-logs]",
"expected": _gen_expected_string(
ci_verbose_test_logs=True, ci_no_test_timeout=True
),
"description": "No pipe logs in PR body and no test timeout in label (same as the above but swapped)",
},
{
"labels": {"ci-no-td"},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "A job name",
"pr_body": "",
"expected": _gen_expected_string(ci_no_td=True),
"description": "No pipe logs in PR body and no test timeout in label (same as the above but swapped)",
},
{
"labels": {},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": None,
"expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n",
"expected": _gen_expected_string(),
"description": "No job name",
},
{
"labels": {},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-12, unstable)",
"expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n",
"job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-stable, unstable)",
"expected": _gen_expected_string(is_unstable=True),
"description": "Unstable job",
},
{
"labels": {},
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-12, unstable)",
"expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n",
"job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-stable, unstable)",
"expected": _gen_expected_string(is_unstable=True),
"description": "Unstable job",
},
{
"labels": {},
"test_matrix": '{include: [{config: "1", unstable: "unstable"}, {config: "2", unstable: "unstable"}]}',
"job_name": "macos-12-py3-arm64 / build",
"expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n",
"expected": _gen_expected_string(is_unstable=True),
"description": "All configs are unstable",
},
{
"labels": {},
"test_matrix": '{include: [{config: "1", unstable: "unstable"}, {config: "2"}]}',
"job_name": "macos-12-py3-arm64 / build",
"expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n",
"expected": _gen_expected_string(is_unstable=False),
"description": "Only mark some configs as unstable",
},
{
@ -692,7 +745,7 @@ class TestConfigFilter(TestCase):
"test_matrix": '{include: [{config: "default"}]}',
"job_name": "A job name",
"pr_body": "resolves #123 fixes #234",
"expected": "keep-going=False\nis-unstable=False\nreenabled-issues=123,234\n",
"expected": _gen_expected_string(reenabled_issues="123,234"),
"description": "Reenable some issues",
},
]

View File

@ -16,6 +16,8 @@ from typing import Any, Dict, List, Optional
from unittest import main, mock, skip, TestCase
from urllib.error import HTTPError
from github_utils import gh_graphql
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from trymerge import (
@ -26,7 +28,6 @@ from trymerge import (
get_drci_classifications,
get_rockset_results,
gh_get_team_members,
gh_graphql,
GitHubPR,
JobCheckState,
main as trymerge_main,
@ -140,11 +141,14 @@ def mock_parse_args(revert: bool = False, force: bool = False) -> Any:
self.comment_id = 0
self.reason = "this is for testing"
self.ignore_current = False
self.check_mergeability = False
return Object()
def mock_remove_label(org: str, repo: str, pr_num: str, label: str) -> None:
def mock_remove_label(
org: str, repo: str, pr_num: str, label: str, dry_run: bool
) -> None:
pass
@ -431,6 +435,13 @@ class TestTryMerge(TestCase):
assert pr._reviews is not None # to pacify mypy
self.assertGreater(len(pr._reviews), 100)
def test_get_co_authors(self, *args: Any) -> None:
"""Tests that co-authors are recognized"""
pr = GitHubPR("pytorch", "pytorch", 118347)
authors = pr.get_authors()
self.assertIn("kit1980", authors)
self.assertIn("Co-authored-by:", pr.gen_commit_message())
def test_get_checkruns_many_runs(self, *args: Any) -> None:
"""Tests that all checkruns can be fetched"""
pr = GitHubPR("pytorch", "pytorch", 105260)

View File

@ -39,6 +39,7 @@ from github_utils import (
gh_fetch_json_list,
gh_fetch_merge_base,
gh_fetch_url,
gh_graphql,
gh_post_commit_comment,
gh_post_pr_comment,
gh_update_pr_state,
@ -152,12 +153,14 @@ GH_COMMIT_AUTHORS_FRAGMENT = """
fragment CommitAuthors on PullRequestCommitConnection {
nodes {
commit {
author {
user {
login
authors(first: 2) {
nodes {
user {
login
}
email
name
}
email
name
}
oid
}
@ -458,19 +461,6 @@ HAS_NO_CONNECTED_DIFF_TITLE = (
IGNORABLE_FAILED_CHECKS_THESHOLD = 10
def gh_graphql(query: str, **kwargs: Any) -> Dict[str, Any]:
rc = gh_fetch_url(
"https://api.github.com/graphql",
data={"query": query, "variables": kwargs},
reader=json.load,
)
if "errors" in rc:
raise RuntimeError(
f"GraphQL query {query}, args {kwargs} failed: {rc['errors']}"
)
return cast(Dict[str, Any], rc)
def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any:
rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no)
return rc["data"]["repository"]["pullRequest"]
@ -608,6 +598,7 @@ def parse_args() -> Any:
parser.add_argument("--revert", action="store_true")
parser.add_argument("--force", action="store_true")
parser.add_argument("--ignore-current", action="store_true")
parser.add_argument("--check-mergeability", action="store_true")
parser.add_argument("--comment-id", type=int)
parser.add_argument("--reason", type=str)
parser.add_argument("pr_num", type=int)
@ -745,7 +736,7 @@ class GitHubPR:
# work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base,
# so let's just use main instead
self.merge_base = gh_fetch_merge_base(
self.org, self.project, last_commit_oid, "main"
self.org, self.project, last_commit_oid, self.default_branch()
)
# Fallback to baseRefOid if the API call fails, i.e. rate limit. Note that baseRefOid
@ -845,14 +836,14 @@ class GitHubPR:
def add_authors(info: Dict[str, Any]) -> None:
for node in info["commits_with_authors"]["nodes"]:
author_node = node["commit"]["author"]
user_node = author_node["user"]
author = f"{author_node['name']} <{author_node['email']}>"
if user_node is None:
# If author is not github user, user node will be null
authors.append(("", author))
else:
authors.append((cast(str, user_node["login"]), author))
for author_node in node["commit"]["authors"]["nodes"]:
user_node = author_node["user"]
author = f"{author_node['name']} <{author_node['email']}>"
if user_node is None:
# If author is not github user, user node will be null
authors.append(("", author))
else:
authors.append((cast(str, user_node["login"]), author))
info = self.info
for _ in range(100):
@ -948,11 +939,6 @@ class GitHubPR:
def get_authors(self) -> Dict[str, str]:
rc = {}
# TODO: replace with `self.get_commit_count()` when GraphQL pagination can be used
# to fetch all commits, see https://gist.github.com/malfet/4f35321b0c9315bcd7116c7b54d83372
# and https://support.github.com/ticket/enterprise/1642/1659119
if self.get_commit_count() <= 250:
assert len(self._fetch_authors()) == self.get_commit_count()
for idx in range(len(self._fetch_authors())):
rc[self.get_committer_login(idx)] = self.get_committer_author(idx)
@ -1068,6 +1054,7 @@ class GitHubPR:
repo: GitRepo,
skip_mandatory_checks: bool,
comment_id: Optional[int] = None,
skip_all_rule_checks: bool = False,
) -> List["GitHubPR"]:
assert self.is_ghstack_pr()
ghstack_prs = get_ghstack_prs(
@ -1082,7 +1069,7 @@ class GitHubPR:
commit_msg = pr.gen_commit_message(
filter_ghstack=True, ghstack_deps=pr_dependencies
)
if pr.pr_num != self.pr_num:
if pr.pr_num != self.pr_num and not skip_all_rule_checks:
# Raises exception if matching rule is not found
find_matching_merge_rule(
pr,
@ -1113,13 +1100,19 @@ class GitHubPR:
msg_body = re.sub(RE_GHSTACK_DESC, "", msg_body)
msg = self.get_title() + f" (#{self.pr_num})\n\n"
msg += msg_body
# Mention PR co-authors
for author_login, author_name in self.get_authors().items():
if author_login != self.get_pr_creator_login():
msg += f"\nCo-authored-by: {author_name}"
msg += f"\nPull Request resolved: {self.get_pr_url()}\n"
msg += f"Approved by: {approved_by_urls}\n"
if ghstack_deps:
msg += f"ghstack dependencies: {', '.join([f'#{pr.pr_num}' for pr in ghstack_deps])}\n"
return msg
def add_numbered_label(self, label_base: str) -> None:
def add_numbered_label(self, label_base: str, dry_run: bool) -> None:
labels = self.get_labels() if self.labels is not None else []
full_label = label_base
count = 0
@ -1127,7 +1120,7 @@ class GitHubPR:
if label_base in label:
count += 1
full_label = f"{label_base}X{count}"
gh_add_labels(self.org, self.project, self.pr_num, [full_label])
gh_add_labels(self.org, self.project, self.pr_num, [full_label], dry_run)
def merge_into(
self,
@ -1157,9 +1150,9 @@ class GitHubPR:
repo.push(self.default_branch(), dry_run)
if not dry_run:
self.add_numbered_label(MERGE_COMPLETE_LABEL)
self.add_numbered_label(MERGE_COMPLETE_LABEL, dry_run)
for pr in additional_merged_prs:
pr.add_numbered_label(MERGE_COMPLETE_LABEL)
pr.add_numbered_label(MERGE_COMPLETE_LABEL, dry_run)
if comment_id and self.pr_num:
# When the merge process reaches this part, we can assume that the commit
@ -1199,7 +1192,11 @@ class GitHubPR:
skip_mandatory_checks: bool = False,
comment_id: Optional[int] = None,
branch: Optional[str] = None,
skip_all_rule_checks: bool = False,
) -> List["GitHubPR"]:
"""
:param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally
"""
branch_to_merge_into = self.default_branch() if branch is None else branch
if repo.current_branch() != branch_to_merge_into:
repo.checkout(branch_to_merge_into)
@ -1215,6 +1212,7 @@ class GitHubPR:
repo,
skip_mandatory_checks,
comment_id=comment_id,
skip_all_rule_checks=skip_all_rule_checks,
)
@ -1669,7 +1667,19 @@ def get_classifications(
# going forward. It's preferable to try calling Dr.CI API directly first
# to get the latest results as well as update Dr.CI PR comment
drci_classifications = get_drci_classifications(pr_num=pr_num, project=project)
print(f"From Dr.CI API: {json.dumps(drci_classifications)}")
def get_readable_drci_results(drci_classifications: Any) -> str:
try:
s = f"From Dr.CI API ({pr_num}):\n"
for classification, jobs in drci_classifications.items():
s += f" {classification}: \n"
for job in jobs:
s += f" {job['id']} {job['name']}\n"
return s
except Exception:
return f"From Dr.CI API: {json.dumps(drci_classifications)}"
print(get_readable_drci_results(drci_classifications))
# NB: if the latest results from Dr.CI is not available, i.e. when calling from
# SandCastle, we fallback to any results we can find on Dr.CI check run summary
@ -1882,8 +1892,8 @@ def do_revert_prs(
pr.org, pr.project, pr.pr_num, revert_message, dry_run=dry_run
)
pr.add_numbered_label("reverted", dry_run)
if not dry_run:
pr.add_numbered_label("reverted")
gh_post_commit_comment(pr.org, pr.project, commit_sha, revert_msg)
gh_update_pr_state(pr.org, pr.project, pr.pr_num)
@ -2053,7 +2063,7 @@ def merge(
print(f"Attempting merge of {initial_commit_sha} ({pr_link})")
if MERGE_IN_PROGRESS_LABEL not in pr.get_labels():
gh_add_labels(pr.org, pr.project, pr.pr_num, [MERGE_IN_PROGRESS_LABEL])
gh_add_labels(pr.org, pr.project, pr.pr_num, [MERGE_IN_PROGRESS_LABEL], dry_run)
explainer = TryMergeExplainer(
skip_mandatory_checks,
@ -2073,8 +2083,7 @@ def merge(
check_for_sev(pr.org, pr.project, skip_mandatory_checks)
if skip_mandatory_checks or can_skip_internal_checks(pr, comment_id):
# do not wait for any pending signals if PR is closed as part of co-development process
if skip_mandatory_checks:
gh_post_pr_comment(
pr.org,
pr.project,
@ -2201,8 +2210,7 @@ def merge(
# Finally report timeout back
msg = f"Merged timed out after {timeout_minutes} minutes. Please contact the pytorch_dev_infra team."
msg += f"The last exception was: {last_exception}"
if not dry_run:
gh_add_labels(pr.org, pr.project, pr.pr_num, ["land-failed"])
gh_add_labels(pr.org, pr.project, pr.pr_num, ["land-failed"], dry_run)
raise RuntimeError(msg)
@ -2281,6 +2289,16 @@ def main() -> None:
)
return
if args.check_mergeability:
if pr.is_ghstack_pr():
get_ghstack_prs(repo, pr) # raises error if out of sync
pr.merge_changes(
repo,
skip_mandatory_checks=True,
skip_all_rule_checks=True,
)
return
if not args.force and pr.has_invalid_submodule_updates():
message = (
f"This PR updates submodules {', '.join(pr.get_changed_submodules())}\n"
@ -2329,7 +2347,10 @@ def main() -> None:
else:
print("Missing comment ID or PR number, couldn't upload to Rockset")
finally:
gh_remove_label(org, project, args.pr_num, MERGE_IN_PROGRESS_LABEL)
if not args.check_mergeability:
gh_remove_label(
org, project, args.pr_num, MERGE_IN_PROGRESS_LABEL, args.dry_run
)
if __name__ == "__main__":

View File

@ -169,6 +169,9 @@ jobs:
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
@ -218,6 +221,9 @@ jobs:
-e NUM_TEST_SHARDS \
-e REENABLED_ISSUES \
-e CONTINUE_THROUGH_ERROR \
-e VERBOSE_TEST_LOGS \
-e NO_TEST_TIMEOUT \
-e NO_TD \
-e PR_LABELS \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \

View File

@ -34,12 +34,14 @@ jobs:
test-matrix: ${{ steps.filter.outputs.test-matrix }}
is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
keep-going: ${{ steps.filter.outputs.keep-going }}
ci-verbose-test-logs: ${{ steps.filter.outputs.ci-verbose-test-logs }}
ci-no-test-timeout: ${{ steps.filter.outputs.ci-no-test-timeout }}
ci-no-td: ${{ steps.filter.outputs.ci-no-td }}
reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
fetch-depth: 1
submodules: false
- name: Select all requested test configurations
@ -95,6 +97,9 @@ jobs:
PY_VERS: 3.9
PR_BODY: ${{ github.event.pull_request.body }}
CONTINUE_THROUGH_ERROR: ${{ needs.filter.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ needs.filter.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ needs.filter.outputs.ci-no-test-timeout }}
NO_TD: ${{ needs.filter.outputs.ci-no-td }}
PIP_REQUIREMENTS_FILE: .github/requirements/pip-requirements-${{ runner.os }}.txt
REENABLED_ISSUES: ${{ needs.filter.outputs.reenabled-issues }}
run: |

View File

@ -148,6 +148,9 @@ jobs:
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
PIP_REQUIREMENTS_FILE: .github/requirements/pip-requirements-${{ runner.os }}.txt
GITHUB_REPOSITORY: ${{ github.repository }}
GITHUB_WORKFLOW: ${{ github.workflow }}

View File

@ -71,6 +71,7 @@ jobs:
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Login to Amazon ECR
id: login-ecr
@ -148,6 +149,9 @@ jobs:
BRANCH: ${{ steps.parse-ref.outputs.branch }}
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
@ -196,6 +200,9 @@ jobs:
-e NUM_TEST_SHARDS \
-e REENABLED_ISSUES \
-e CONTINUE_THROUGH_ERROR \
-e VERBOSE_TEST_LOGS \
-e NO_TEST_TIMEOUT \
-e NO_TD \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \

View File

@ -128,6 +128,7 @@ jobs:
PYTHON_VERSION: "3.8"
SCCACHE_BUCKET: "ossci-compiler-cache"
SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
SCCACHE_REGION: us-east-1
VC_PRODUCT: "BuildTools"
VC_VERSION: ""
VC_YEAR: "2019"

View File

@ -25,7 +25,7 @@ on:
timeout-minutes:
required: false
type: number
default: 300
default: 240
description: |
Set the maximum time (in minutes) the workflow should take to finish
@ -132,14 +132,26 @@ jobs:
test-matrix: ${{ inputs.test-matrix }}
job-name: ${{ steps.get-job-id.outputs.job-name }}
- name: Set Test step time
id: test-timeout
shell: bash
env:
JOB_TIMEOUT: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
run: |
echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"
- name: Test
id: test
shell: bash
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
env:
USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
INSTALL_WINDOWS_SDK: 1
PYTHON_VERSION: 3.8
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
VC_PRODUCT: "BuildTools"
VC_VERSION: ""
VS_VERSION: "16.8.6"

View File

@ -143,6 +143,9 @@ jobs:
PYTORCH_RETRY_TEST_CASES: 1
PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1
CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }}
NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
TEST_CONFIG: ${{ matrix.config }}
SHARD_NUMBER: ${{ matrix.shard }}
NUM_TEST_SHARDS: ${{ matrix.num_shards }}
@ -185,6 +188,9 @@ jobs:
-e PYTORCH_RETRY_TEST_CASES \
-e PYTORCH_OVERRIDE_FLAKY_SIGNAL \
-e CONTINUE_THROUGH_ERROR \
-e VERBOSE_TEST_LOGS \
-e NO_TEST_TIMEOUT \
-e NO_TD \
-e MAX_JOBS="$(nproc --ignore=2)" \
-e SCCACHE_BUCKET \
-e XLA_CLANG_CACHE_S3_BUCKET_NAME \

View File

@ -1,29 +1,84 @@
name: Check mergeability and dependencies for ghstack prs
name: Check mergeability of ghstack PR
on:
pull_request:
types: [opened, synchronize, reopened, edited]
types: [opened, synchronize, reopened]
branches: [gh/**/base]
jobs:
check-regex:
ghstack-mergeability-check:
runs-on: ubuntu-latest
outputs:
regex-match: ${{ steps.regex-match.outputs.match }}
steps:
- uses: actions/checkout@v4
- id: regex-match
uses: actions-ecosystem/action-regex-match@d50fd2e7a37d0e617aea3d7ada663bd56862b9cc
with:
text: ${{ github.head_ref }}
regex: '^(gh/[^/]+/[0-9]+/)head$'
fetch-depth: 0
- name: Setup git
shell: bash
run: |
git config --global user.email "pytorchmergebot@users.noreply.github.com"
git config --global user.name "PyTorch MergeBot"
git fetch origin main:main
- name: Wait for orig branch
shell: bash
run: |
BRANCH="${{ github.base_ref }}"
echo "$BRANCH"
BRANCH="${BRANCH%/base}/orig"
echo "$BRANCH"
WAIT_SECONDS=300
END_WAIT=$((SECONDS+WAIT_SECONDS))
BRANCH_EXISTS=0
while [ $SECONDS -lt $END_WAIT ]; do
git fetch --prune origin "${BRANCH}" || true
if git rev-parse --verify "origin/${BRANCH}"; then
BRANCH_EXISTS=1
break
fi
echo "Waiting for branch ${BRANCH} to exist..."
sleep 30 # Wait for 30 seconds before retrying
done
if [ $BRANCH_EXISTS -eq 0 ]; then
echo "Branch ${BRANCH} not found after ${WAIT_SECONDS} seconds."
echo "Mergeability check failed for infrastructure reasons."
exit 1
fi
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: '3.8'
cache: pip
architecture: x64
- run: pip install pyyaml==6.0 rockset==1.0.3
shell: bash
- name: Verify mergeability
shell: bash
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_NUM: ${{ github.event.pull_request.number }}
run: |
set -ex
python3 .github/scripts/trymerge.py --check-mergeability "${PR_NUM}"
- name: Print debug info
if: failure()
shell: bash
env:
PR_NUM: ${{ github.event.pull_request.number }}
run: |
{
echo "# PR $PR_NUM is not mergeable into main"
echo "To debug, run the diagnostic workflow:"
echo "https://github.com/pytorch/test-infra/actions/workflows/pr-dependencies-check.yml"
} >> "$GITHUB_STEP_SUMMARY"
pr-dependencies-check:
needs: check-regex
if: ${{ needs.check-regex.outputs.regex-match != '' }}
uses: pytorch/test-infra/.github/workflows/pr-dependencies-check.yml@main
with:
pr_number: ${{ github.event.pull_request.number }}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}

57
.github/workflows/cherry-pick.yml vendored Normal file
View File

@ -0,0 +1,57 @@
name: Create a cherry pick from a PR
on:
repository_dispatch:
types: [try-cherry-pick]
jobs:
cherry-pick:
name: cherry-pick-pr-${{ github.event.client_payload.pr_num }}
runs-on: ubuntu-latest
environment: cherry-pick-bot
env:
GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
steps:
- name: Checkout repo
id: checkout
uses: actions/checkout@v3
with:
fetch-depth: 0
token: ${{ secrets.GH_PYTORCHBOT_CHERRY_PICK_TOKEN }}
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: '3.11'
cache: pip
# Not direct dependencies of this script, but the trymerge module it imports needs them
- run: pip install pyyaml==6.0 rockset==1.0.3
- name: Setup committer id
run: |
git config --global user.name "PyTorch Bot"
git config --global user.email "pytorchbot@users.noreply.github.com"
- name: Cherry pick the PR
shell: bash
env:
PR_NUM: ${{ github.event.client_payload.pr_num }}
BRANCH: ${{ github.event.client_payload.branch }}
CLASSIFICATION: ${{ github.event.client_payload.classification }}
FIXES: ${{ github.event.client_payload.fixes || '' }}
ACTOR: ${{ github.actor }}
GITHUB_TOKEN: ${{ secrets.GH_PYTORCHBOT_CHERRY_PICK_TOKEN }}
run: |
set -ex
python .github/scripts/cherry_pick.py \
--onto-branch "${BRANCH}" \
--classification "${CLASSIFICATION}" \
--fixes "${FIXES}" \
--github-actor "${ACTOR}" \
"${PR_NUM}"
concurrency:
group: cherry-pick-pr-${{ github.event.client_payload.pr_num }}
cancel-in-progress: true

View File

@ -15,6 +15,9 @@ jobs:
if: ${{ github.repository == 'pytorch/pytorch' }}
name: Create Release
runs-on: ubuntu-latest
# https://github.com/softprops/action-gh-release?tab=readme-ov-file#permissions
permissions:
contents: write
steps:
- uses: malfet/checkout@silent-checkout
with:

View File

@ -0,0 +1,39 @@
# A workflow that deletes branches of closed PRs
name: Delete old branches
on:
schedule:
# Run daily.
- cron: 30 1 * * *
workflow_dispatch:
concurrency:
group: delete-old-branches
cancel-in-progress: true
permissions:
contents: write
jobs:
delete:
if: ${{ github.repository == 'pytorch/pytorch' }}
runs-on: ubuntu-latest
steps:
- name: Checkout repo
uses: actions/checkout@v3
with:
fetch-depth: 0
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: '3.8'
architecture: x64
check-latest: false
- name: Delete old branches
run: python .github/scripts/delete_old_branches.py
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@ -31,7 +31,7 @@ permissions: read-all
jobs:
docker-build:
runs-on: [self-hosted, linux.2xlarge]
runs-on: [self-hosted, linux.12xlarge]
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
timeout-minutes: 240
strategy:
@ -43,6 +43,7 @@ jobs:
- docker-image-name: pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9
- docker-image-name: pytorch-linux-focal-py3.8-clang10
- docker-image-name: pytorch-linux-focal-py3.11-clang10
- docker-image-name: pytorch-linux-focal-py3.12-clang10
- docker-image-name: pytorch-linux-focal-rocm-n-1-py3
- docker-image-name: pytorch-linux-focal-rocm-n-py3
- docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12

View File

@ -16,28 +16,28 @@ concurrency:
permissions: read-all
jobs:
linux-focal-rocm5_7-py3_8-inductor-build:
name: rocm5.7-py3.8-inductor
linux-focal-rocm6_0-py3_8-inductor-build:
name: rocm6.0-py3.8-inductor
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-rocm5.7-py3.8
build-environment: linux-focal-rocm6.0-py3.8
docker-image-name: pytorch-linux-focal-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.2" },
]}
linux-focal-rocm5_7-py3_8-inductor-test:
linux-focal-rocm6_0-py3_8-inductor-test:
permissions:
id-token: write
contents: read
name: rocm5.7-py3.8-inductor
name: rocm6.0-py3.8-inductor
uses: ./.github/workflows/_rocm-test.yml
needs: linux-focal-rocm5_7-py3_8-inductor-build
needs: linux-focal-rocm6_0-py3_8-inductor-build
with:
build-environment: linux-focal-rocm5.7-py3.8
docker-image: ${{ needs.linux-focal-rocm5_7-py3_8-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm5_7-py3_8-inductor-build.outputs.test-matrix }}
build-environment: linux-focal-rocm6.0-py3.8
docker-image: ${{ needs.linux-focal-rocm6_0-py3_8-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_0-py3_8-inductor-build.outputs.test-matrix }}
linux-focal-cuda12_1-py3_10-gcc9-inductor-build:
name: cuda12.1-py3.10-gcc9-sm86
@ -124,6 +124,7 @@ jobs:
{ config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}


@ -6,8 +6,6 @@ on:
- opened
- synchronize
- reopened
- labeled
- unlabeled
branches-ignore:
- nightly
workflow_dispatch:


@ -15,53 +15,36 @@ permissions: read-all
# The names of steps that actually test the code should be suffixed with `(nonretryable)`.
# When any other step fails, its job will be retried once by retryBot.
jobs:
lintrunner:
lintrunner-clang:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
timeout: 120
runner: linux.2xlarge
docker-image: pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0
submodules: true
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
# The generic Linux job chooses to use base env, not the one set up by the image
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
conda activate "${CONDA_ENV}"
export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT"
export CLANG=1
.github/scripts/lintrunner.sh
CACHE_DIRECTORY="/tmp/.lintbin"
# Try to recover the cached binaries
if [[ -d "${CACHE_DIRECTORY}" ]]; then
# It's ok to fail this as lintrunner init would download these binaries
# again if they do not exist
cp -r "${CACHE_DIRECTORY}" . || true
fi
# This has already been cached in the docker image
lintrunner init 2> /dev/null
# Do build steps necessary for linters
python3 -m tools.linter.clang_tidy.generate_build_files
python3 -m tools.generate_torch_version --is_debug=false
python3 -m tools.pyi.gen_pyi \
--native-functions-path aten/src/ATen/native/native_functions.yaml \
--tags-path aten/src/ATen/native/tags.yaml \
--deprecated-functions-path "tools/autograd/deprecated.yaml"
RC=0
# Run lintrunner on all files
if ! lintrunner --force-color --all-files --tee-json=lint.json 2> /dev/null; then
echo ""
echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`. (If you don't get the same results, run \'lintrunner init\' to update your local linter)\e[0m"
echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m"
RC=1
fi
# Use jq to massage the JSON lint output into GitHub Actions workflow commands.
jq --raw-output \
'"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \
lint.json || true
exit $RC
lintrunner-noclang:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
with:
timeout: 120
runner: linux.2xlarge
docker-image: pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter
# NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
# to run git rev-parse HEAD~:.ci/docker when a new image is needed
fetch-depth: 0
submodules: true
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: |
export ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT"
.github/scripts/lintrunner.sh
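The inline script removed here now lives in `.github/scripts/lintrunner.sh`. For reference, the jq step above that turns lint messages into GitHub Actions annotations corresponds to roughly this Python (an illustrative equivalent, not the actual script):

```python
import json

# lintrunner --tee-json writes one JSON message per line.
SEVERITY_MAP = {"advice": "warning", "disabled": "warning"}

with open("lint.json") as f:
    for line in f:
        if not line.strip():
            continue
        msg = json.loads(line)
        severity = SEVERITY_MAP.get(msg["severity"], msg["severity"])
        description = msg["description"].replace("\n", "%0A")
        print(
            f"::{severity} file={msg['path']},line={msg['line']},"
            f"col={msg['char']},title={msg['code']} {msg['name']}::{description}"
        )
```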
quick-checks:
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
@ -225,7 +208,7 @@ jobs:
cache: pip
- name: Install dependencies
run: |
pip install pytest-rerunfailures==11.1.* pytest-shard==0.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.1.* numpy==1.24.*
pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.1.* numpy==1.24.*
pip install torch --pre --index-url https://download.pytorch.org/whl/nightly/cpu/
- name: Run run_test.py (nonretryable)
run: |


@ -19,7 +19,7 @@ jobs:
with:
sync-tag: macos-12-py3-arm64-build
build-environment: macos-12-py3-arm64
runner-type: macos-m1-12
runner-type: macos-m1-stable
build-generates-artifacts: true
# To match the one pre-installed in the m1 runners
python-version: 3.9.12


@ -196,11 +196,11 @@ jobs:
docker-image: ${{ needs.linux-vulkan-focal-py3_11-clang10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-vulkan-focal-py3_11-clang10-build.outputs.test-matrix }}
linux-focal-rocm5_7-py3_8-build:
name: linux-focal-rocm5.7-py3.8
linux-focal-rocm6_0-py3_8-build:
name: linux-focal-rocm6.0-py3.8
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-rocm5.7-py3.8
build-environment: linux-focal-rocm6.0-py3.8
docker-image-name: pytorch-linux-focal-rocm-n-py3
test-matrix: |
{ include: [
@ -208,14 +208,14 @@ jobs:
{ config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
]}
linux-focal-rocm5_7-py3_8-test:
linux-focal-rocm6_0-py3_8-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm5.7-py3.8
name: linux-focal-rocm6.0-py3.8
uses: ./.github/workflows/_rocm-test.yml
needs: linux-focal-rocm5_7-py3_8-build
needs: linux-focal-rocm6_0-py3_8-build
with:
build-environment: linux-focal-rocm5.7-py3.8
docker-image: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.test-matrix }}
build-environment: linux-focal-rocm6.0-py3.8
docker-image: ${{ needs.linux-focal-rocm6_0-py3_8-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_0-py3_8-build.outputs.test-matrix }}


@ -179,6 +179,29 @@ jobs:
docker-image: ${{ needs.linux-focal-py3_11-clang10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-py3_11-clang10-build.outputs.test-matrix }}
linux-focal-py3_12-clang10-build:
name: linux-focal-py3.12-clang10
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-py3.12-clang10
docker-image-name: pytorch-linux-focal-py3.12-clang10
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
]}
linux-focal-py3_12-clang10-test:
name: linux-focal-py3.12-clang10
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-py3_12-clang10-build
with:
build-environment: linux-focal-py3.12-clang10
docker-image: ${{ needs.linux-focal-py3_12-clang10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-py3_12-clang10-build.outputs.test-matrix }}
timeout-minutes: 600
linux-focal-cuda11_8-py3_10-gcc9-build:
name: linux-focal-cuda11.8-py3.10-gcc9
uses: ./.github/workflows/_linux-build.yml
@ -233,7 +256,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3-clang12-mobile-build
docker-image-name: pytorch-linux-jammy-py3-clang12-asan
docker-image-name: pytorch-linux-jammy-py3-clang15-asan
build-generates-artifacts: false
test-matrix: |
{ include: [
@ -357,13 +380,13 @@ jobs:
{ config: "default", shard: 1, num_shards: 1 },
]}
linux-focal-rocm5_7-py3_8-build:
linux-focal-rocm6_0-py3_8-build:
# don't run build twice on main
if: github.event_name == 'pull_request'
name: linux-focal-rocm5.7-py3.8
name: linux-focal-rocm6.0-py3.8
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-rocm5.7-py3.8
build-environment: linux-focal-rocm6.0-py3.8
docker-image-name: pytorch-linux-focal-rocm-n-py3
sync-tag: rocm-build
test-matrix: |


@ -18,11 +18,11 @@ concurrency:
permissions: read-all
jobs:
linux-focal-rocm5_7-py3_8-build:
name: linux-focal-rocm5.7-py3.8
linux-focal-rocm6_0-py3_8-build:
name: linux-focal-rocm6.0-py3.8
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-rocm5.7-py3.8
build-environment: linux-focal-rocm6.0-py3.8
docker-image-name: pytorch-linux-focal-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
@ -35,14 +35,14 @@ jobs:
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" },
]}
linux-focal-rocm5_7-py3_8-test:
linux-focal-rocm6_0-py3_8-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm5.7-py3.8
name: linux-focal-rocm6.0-py3.8
uses: ./.github/workflows/_rocm-test.yml
needs: linux-focal-rocm5_7-py3_8-build
needs: linux-focal-rocm6_0-py3_8-build
with:
build-environment: linux-focal-rocm5.7-py3.8
docker-image: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.test-matrix }}
build-environment: linux-focal-rocm6.0-py3.8
docker-image: ${{ needs.linux-focal-rocm6_0-py3_8-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_0-py3_8-build.outputs.test-matrix }}


@ -88,28 +88,28 @@ jobs:
docker-image: ${{ needs.linux-focal-py3_8-clang10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-py3_8-clang10-build.outputs.test-matrix }}
linux-focal-rocm5_6-py3_8-build:
name: linux-focal-rocm5.6-py3.8
linux-focal-rocm6_0-py3_8-build:
name: linux-focal-rocm6.0-py3.8
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-rocm5.6-py3.8
build-environment: linux-focal-rocm6.0-py3.8
docker-image-name: pytorch-linux-focal-rocm-n-py3
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
]}
linux-focal-rocm5_6-py3_8-test:
linux-focal-rocm6_0-py3_8-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm5.6-py3.8
name: linux-focal-rocm6.0-py3.8
uses: ./.github/workflows/_rocm-test.yml
needs: linux-focal-rocm5_6-py3_8-build
needs: linux-focal-rocm6_0-py3_8-build
with:
build-environment: linux-focal-rocm5.6-py3.8
docker-image: ${{ needs.linux-focal-rocm5_6-py3_8-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm5_6-py3_8-build.outputs.test-matrix }}
build-environment: linux-focal-rocm6.0-py3.8
docker-image: ${{ needs.linux-focal-rocm6_0-py3_8-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_0-py3_8-build.outputs.test-matrix }}
linux-jammy-py3_10-clang15-asan-build:
name: linux-jammy-py3.10-clang15-asan


@ -0,0 +1,152 @@
name: Index PyTorch Tests for Target Determination
on:
workflow_dispatch:
# TODO: Trigger every few hours
permissions:
id-token: write
contents: read
jobs:
index:
runs-on: linux.g5.4xlarge.nvidia.gpu # 1 GPU A10G 24GB each
environment: target-determinator-env
steps:
- name: Setup Linux
uses: ./.github/actions/setup-linux
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9
- name: Use following to pull public copy of the image
id: print-ghcr-mirror
env:
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
run: |
tag=${ECR_DOCKER_IMAGE##*/}
echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}"
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
id: install-nvidia-driver
uses: pytorch/test-infra/.github/actions/setup-nvidia@main
- name: Clone PyTorch
uses: actions/checkout@v3
with:
path: pytorch
- name: Clone CodeLlama
uses: actions/checkout@v3
with:
repository: osalpekar/codellama
ref: main
path: codellama
- name: Clone Target Determination Code
uses: actions/checkout@v3
with:
repository: osalpekar/llm-target-determinator
ref: v0.0.1
path: llm-target-determinator
- name: Install Requirements
shell: bash -l {0}
run: |
set -euxo pipefail
conda create \
--yes \
--quiet \
--name "tdenv" \
"python=3.9"
conda activate tdenv
cd "${GITHUB_WORKSPACE}"
pwd
cd llm-target-determinator
pip install -r requirements.txt
cd ../codellama
pip install -e .
- name: Configure AWS credentials
uses: aws-actions/configure-aws-credentials@v3
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_target_determinator_s3_read_write
aws-region: us-east-1
- name: Fetch CodeLlama Checkpoint
shell: bash -l {0}
run: |
set -euxo pipefail
conda activate tdenv
pip install awscli==1.32.18
cd codellama/
mkdir "CodeLlama-7b-Python"
aws s3 cp \
"s3://target-determinator-assets/CodeLlama-7b-Python" \
"CodeLlama-7b-Python" \
--recursive
- name: Run Indexer
id: indexer
shell: bash -l {0}
run: |
set -euxo pipefail
conda activate tdenv
cd "${GITHUB_WORKSPACE}"/llm-target-determinator
python create_filelist.py
torchrun \
--standalone \
--nnodes=1 \
--nproc-per-node=1 \
indexer.py \
--experiment-name indexer-files
- name: Upload Index to S3
shell: bash -l {0}
if: ${{ steps.indexer.outcome == 'success' }}
run: |
set -euxo pipefail
conda activate tdenv
cd "${GITHUB_WORKSPACE}"/llm-target-determinator/assets
TIMESTAMP=$(date -Iseconds)
ZIP_NAME="indexer-files-${TIMESTAMP}.zip"
# Create a zipfile with all the generated indices
zip -r "${ZIP_NAME}" indexer-files
# Move the old index into the archived/ folder
aws s3 mv \
"s3://target-determinator-assets/indexes/latest/" \
"s3://target-determinator-assets/indexes/archived/" \
--recursive
# Move the new index into the latest/ folder
aws s3 cp \
"${ZIP_NAME}" \
"s3://target-determinator-assets/indexes/latest/${ZIP_NAME}"
# Note that because the above 2 operations are not atomic, there will
# be a period of a few seconds between these where there is no index
# present in the latest/ folder. To account for this, the retriever
# should have some retry logic with backoff to ensure fetching the
# index doesn't fail.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
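Because the two copies above are not atomic, a retriever following this advice might fetch the index along these lines (an illustrative sketch under the comment's assumptions; the retriever is not part of this diff):

```python
import random
import time

import boto3

BUCKET = "target-determinator-assets"


def fetch_latest_index(dest: str, attempts: int = 5) -> None:
    # latest/ can be briefly empty while a new index is swapped in,
    # so retry with exponential backoff plus jitter.
    s3 = boto3.client("s3")
    for attempt in range(attempts):
        objs = s3.list_objects_v2(Bucket=BUCKET, Prefix="indexes/latest/")
        contents = objs.get("Contents", [])
        if contents:
            key = contents[0]["Key"]
            s3.download_file(BUCKET, key, dest)
            return
        time.sleep(2 ** attempt + random.random())
    raise RuntimeError("No index found in indexes/latest/ after retries")
```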


@ -95,7 +95,7 @@ jobs:
with:
sync-tag: macos-12-py3-arm64-build
build-environment: macos-12-py3-arm64
runner-type: macos-m1-12
runner-type: macos-m1-stable
build-generates-artifacts: true
# To match the one pre-installed in the m1 runners
python-version: 3.9.12
@ -177,11 +177,11 @@ jobs:
{ config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge.nonephemeral" },
]}
linux-focal-rocm5_7-py3_8-build:
name: linux-focal-rocm5.7-py3.8
linux-focal-rocm6_0-py3_8-build:
name: linux-focal-rocm6.0-py3.8
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-rocm5.7-py3.8
build-environment: linux-focal-rocm6.0-py3.8
docker-image-name: pytorch-linux-focal-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
@ -189,15 +189,15 @@ jobs:
{ config: "default", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
]}
linux-focal-rocm5_7-py3_8-test:
linux-focal-rocm6_0-py3_8-test:
permissions:
id-token: write
contents: read
name: linux-focal-rocm5.7-py3.8
name: linux-focal-rocm6.0-py3.8
uses: ./.github/workflows/_rocm-test.yml
needs: linux-focal-rocm5_7-py3_8-build
needs: linux-focal-rocm6_0-py3_8-build
with:
build-environment: linux-focal-rocm5.7-py3.8
docker-image: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.test-matrix }}
build-environment: linux-focal-rocm6.0-py3.8
docker-image: ${{ needs.linux-focal-rocm6_0-py3_8-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_0-py3_8-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"


@ -13,46 +13,13 @@ jobs:
do_update_viablestrict:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: ubuntu-20.04
environment: mergebot
environment: ${{ (github.event_name == 'schedule') && 'mergebot' || '' }}
steps:
- name: Checkout repo
uses: actions/checkout@v3
- name: Update viable/strict
uses: pytorch/test-infra/.github/actions/update-viablestrict@main
with:
fetch-depth: 0
token: ${{ secrets.MERGEBOT_TOKEN }}
- name: Setup Python
uses: actions/setup-python@v4
with:
python-version: '3.8'
architecture: x64
check-latest: false
cache: pip
cache-dependency-path: |
**/.ci/docker/requirements-ci.txt
**/.github/requirements-gha-cache.txt
- name: Install Python Packages
run: |
pip3 install rockset==1.0.3
pip3 install boto3==1.19.12
- name: Get latest viable commit
env:
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
run: |
output=$(python3 .github/scripts/fetch_latest_green_commit.py)
echo "latest_viable_sha=$output" >> "${GITHUB_OUTPUT}"
id: get-latest-commit
- name: Push SHA to viable/strict branch
if: steps.get-latest-commit.outputs.latest_viable_sha != 'None'
env:
GITHUB_TOKEN: ${{ secrets.MERGEBOT_TOKEN }}
run: |
git config --global user.email "pytorchmergebot@users.noreply.github.com"
git config --global user.name "PyTorch MergeBot"
echo "Set the latest sha variable to be ${{ steps.get-latest-commit.outputs.latest_viable_sha }}"
# Pushing an older green commit here will fail because it's non-fast-forward, which is ok
# to ignore because we already have the later green commit in viable/strict
git push origin "${{ steps.get-latest-commit.outputs.latest_viable_sha }}":viable/strict || true
repository: pytorch/pytorch
stable-branch: viable/strict
requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]'
secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }}
rockset-api-key: ${{ secrets.ROCKSET_API_KEY }}

.gitignore

@ -126,6 +126,7 @@ env
.circleci/scripts/COMMIT_MSG
scripts/release_notes/*.json
sccache-stats*.json
lint.json
# These files get copied over on invoking setup.py
torchgen/packaged/*


@ -1,5 +1,3 @@
merge_base_with = "origin/main"
[[linter]]
code = 'FLAKE8'
include_patterns = ['**/*.py']
@ -48,7 +46,7 @@ init_command = [
'mccabe==0.7.0',
'pycodestyle==2.11.1',
'pyflakes==3.1.0',
'torchfix==0.2.0',
'torchfix==0.4.0',
]
@ -66,6 +64,8 @@ include_patterns = [
'aten/src/ATen/native/**/Foreach*.*',
'aten/src/ATen/native/cuda/fused*.*',
'aten/src/ATen/native/cuda/Fused*.cu',
'aten/src/ATen/native/cudnn/*.h',
'aten/src/ATen/native/cudnn/*.cpp',
'c10/**/*.h',
'c10/**/*.cpp',
'torch/csrc/**/*.h',
@ -120,39 +120,6 @@ include_patterns = [
]
exclude_patterns = [
'**/fb/**',
'torch/include/**',
'torch/csrc/**',
'torch/_dynamo/**/*.py',
'torch/_inductor/**/*.py',
'torch/_numpy/**/*.py',
'torch/_functorch/aot_autograd.py',
'torch/_functorch/benchmark_utils.py',
'torch/_functorch/compile_utils.py',
'torch/_functorch/compilers.py',
'torch/_functorch/eager_transforms.py',
'torch/_functorch/fx_minifier.py',
'torch/_functorch/partitioners.py',
'torch/_functorch/top_operators_github_usage.py',
'torch/_functorch/vmap.py',
'torch/_subclasses/schema_check_mode.py',
'torch/distributed/elastic/agent/server/api.py',
'torch/testing/_internal/**',
'torch/distributed/fsdp/fully_sharded_data_parallel.py',
# TODO(suo): these exclusions were added just to get lint clean on master.
# Follow up to do more target suppressions and remove them.
'torch/ao/quantization/fx/convert.py',
'torch/ao/quantization/_dbr/function_fusion.py',
'test/test_datapipe.py',
'caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py',
'test/test_numpy_interop.py',
'torch/torch_version.py',
'torch/fx/proxy.py',
'torch/fx/passes/shape_prop.py',
'torch/fx/node.py',
'torch/fx/experimental/symbolic_shapes.py',
'torch/fx/experimental/proxy_tensor.py',
'torch/_subclasses/fake_utils.py',
'torch/_subclasses/fake_tensor.py',
]
command = [
'python3',
@ -168,47 +135,22 @@ init_command = [
'numpy==1.24.3 ; python_version == "3.8"',
'numpy==1.26.0 ; python_version >= "3.9"',
'expecttest==0.1.6',
'mypy==1.7.0',
'mypy==1.8.0',
'sympy==1.11.1',
'types-requests==2.27.25',
'types-PyYAML==6.0.7',
'types-tabulate==0.8.8',
'types-protobuf==3.19.18',
'types-pkg-resources==0.1.3',
'types-Jinja2==2.11.9',
'types-colorama==0.4.6',
'filelock==3.13.1',
'junitparser==2.1.1',
'rich==10.9.0',
'pyyaml==6.0',
'optree==0.10.0',
]
[[linter]]
code = 'MYPYINDUCTOR'
include_patterns = [
'torch/_dynamo/**/*.py',
'torch/_inductor/**/*.py',
]
exclude_patterns = [
'**/fb/**',
'torch/_dynamo/backends/**/*.py',
'torch/_dynamo/variables/**/*.py',
'torch/_dynamo/polyfill.py',
'torch/_inductor/fx_passes/serialized_patterns/**',
]
command = [
'python3',
'tools/linter/adapters/mypy_linter.py',
'--config=mypy-inductor.ini',
'--code=MYPYINDUCTOR',
'--',
'@{{PATHSFILE}}'
]
init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'types-colorama==0.4.6',
]
[[linter]]
code = 'MYPYSTRICT'
include_patterns = [
@ -249,7 +191,8 @@ include_patterns = [
'c10/**/*.h',
# Enable coverage of headers in torch/csrc and excluding sub-directories for now.
'torch/csrc/*.h',
'torch/csrc/autograd/**/*.h',
'torch/csrc/*.cpp',
'torch/csrc/**/*.h',
'torch/csrc/**/*.cpp',
]
exclude_patterns = [
@ -258,6 +201,8 @@ exclude_patterns = [
# CUDA files are also excluded.
'**/fb/**',
'**/*pb.h',
'c10/**/cuda/*pp',
'aten/**/cuda/*pp',
'**/cuda/*pp',
'**/*XPU*',
'**/xpu/*pp',
@ -277,8 +222,6 @@ exclude_patterns = [
'third_party/**/*',
'torch/csrc/api/**',
'torch/csrc/autograd/generated/**',
'torch/csrc/autograd/profiler_legacy.cpp',
'torch/csrc/cuda/**',
'torch/csrc/dynamo/*',
'torch/csrc/distributed/**/*',
'torch/csrc/inductor/**/*',
@ -329,6 +272,26 @@ command = [
'@{{PATHSFILE}}'
]
[[linter]]
code = 'TYPENOSKIP'
include_patterns = ['mypy.ini']
command = [
'python3',
'tools/linter/adapters/grep_linter.py',
'--pattern=follow_imports\s*=\s*skip',
'--linter-name=TYPENOSKIP',
'--error-name=use of follow_imports = skip',
"""--error-description=\
follow_imports = skip is forbidden from mypy.ini configuration as it \
is extremely easy to accidentally turn off type checking unintentionally. If \
you need to suppress type errors, use a top level # mypy: ignore-errors. \
Do not rely on automatic Any substitution; instead, manually # type: ignore \
at use sites or define a pyi type stub with more relaxed types. \
""",
'--',
'@{{PATHSFILE}}'
]
[[linter]]
code = 'NOQA'
include_patterns = ['**/*.py', '**/*.pyi']
@ -1392,6 +1355,7 @@ exclude_patterns = [
'test/nn/test_embedding.py',
'test/nn/test_init.py',
'test/nn/test_lazy_modules.py',
'test/nn/test_load_state_dict.py',
'test/nn/test_module_hooks.py',
'test/nn/test_multihead_attention.py',
'test/nn/test_packed_sequence.py',
@ -1586,7 +1550,6 @@ exclude_patterns = [
'test/test_mkldnn_verbose.py',
'test/test_mobile_optimizer.py',
'test/test_model_dump.py',
'test/test_module_init.py',
'test/test_modules.py',
'test/test_monitor.py',
'test/test_mps.py',
@ -2689,7 +2652,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'ruff==0.1.11',
'ruff==0.2.2',
]
is_formatter = true


@ -228,6 +228,7 @@ filegroup(
[
"aten/src/ATen/cuda/*.cpp",
"aten/src/ATen/cuda/detail/*.cpp",
"aten/src/ATen/cuda/tunable/*.cpp",
"aten/src/ATen/cudnn/*.cpp",
"aten/src/ATen/native/cuda/*.cpp",
"aten/src/ATen/native/cuda/linalg/*.cpp",


@ -43,11 +43,15 @@ set(CMAKE_C_STANDARD 11 CACHE STRING "The C standard whose features are reques
# ---[ Utils
include(cmake/public/utils.cmake)
# --- [ Check that minimal gcc version is 9.4+
if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.4)
message(FATAL "GCC-9.4 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}")
# --- [ Check that minimal gcc version is 9.3+
if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.3)
message(FATAL_ERROR "GCC-9.3 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}")
endif()
# This define is needed to preserve behavior given anticipated changes to cccl/thrust
# https://nvidia.github.io/libcudacxx/standard_api/numerics_library/complex.html
string(APPEND CMAKE_CUDA_FLAGS " -DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS")
if(LINUX)
include(cmake/CheckAbi.cmake)
string(APPEND CMAKE_CXX_FLAGS " -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}")
@ -347,6 +351,8 @@ cmake_dependent_option(
"NOT INTERN_BUILD_MOBILE" OFF)
cmake_dependent_option(
BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF)
cmake_dependent_option(
BUILD_BUNDLE_PTXAS "Bundle PTXAS into torch/bin folder" OFF "USE_CUDA" OFF)
option(USE_MIMALLOC "Use mimalloc" OFF)
# Enable third party mimalloc library to improve memory allocation performance on Windows.
@ -1237,3 +1243,12 @@ if(DEFINED USE_CUSTOM_DEBINFO)
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -g")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -g")
endif()
# Bundle PTXAS if needed
if(BUILD_BUNDLE_PTXAS AND USE_CUDA)
if(NOT EXISTS "${PROJECT_SOURCE_DIR}/build/bin/ptxas")
message(STATUS "Copying PTXAS into the bin folder")
file(COPY "${CUDAToolkit_BIN_DIR}/ptxas" DESTINATION "${PROJECT_BINARY_DIR}")
endif()
install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" DESTINATION "${CMAKE_INSTALL_BINDIR}")
endif()


@ -97,9 +97,9 @@ test/functorch/test_ops.py @zou3519 @chillee @kshitij12345
test/functorch/test_vmap.py @zou3519 @chillee @kshitij12345
# torch MPS
test/test_mps.py @kulinseth
aten/src/ATen/mps/ @kulinseth
aten/src/ATen/native/mps/ @kulinseth
test/test_mps.py @kulinseth @malfet
aten/src/ATen/mps/ @kulinseth @malfet
aten/src/ATen/native/mps/ @kulinseth @malfet
# Profiler
torch/csrc/autograd/profiler* @aaronenyeshi
@ -130,3 +130,12 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd
# torch.export
/torch/export/ @avikchaudhuri @gmagogsfm @tugsbayasgalan @zhxchen17
/torch/_export/ @avikchaudhuri @gmagogsfm @tugsbayasgalan @zhxchen17
# serialization-related files
/aten/src/ATen/MapAllocator* @mikaylagawarecki
/caffe2/serialize/ @mikaylagawarecki
/torch/serialization.py @mikaylagawarecki
/torch/storage.py @mikaylagawarecki
/torch/csrc/Storage* @mikaylagawarecki
# subscribing for PyTorchFileWriter/PyTorchFileReader changes
/torch/csrc/jit/python/init.cpp @mikaylagawarecki


@ -158,13 +158,13 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv)
#### Prerequisites
If you are installing from source, you will need:
- Python 3.8 or later (for Linux, Python 3.8.1+ is needed)
- A compiler that fully supports C++17, such as clang or gcc (especially for aarch64, gcc 9.4.0 or newer is required)
- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required)
We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro.
If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following:
- [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads)
- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v7 or above
- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above
- [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA
Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/pdf/cuDNN-Support-Matrix.pdf) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware


@ -49,7 +49,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:
| PyTorch version | Python | Stable CUDA | Experimental CUDA |
| --- | --- | --- | --- |
| 2.2 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 |
| 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 |
| 2.1 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 |
| 2.0 | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 |
| 1.13 | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 |


@ -125,6 +125,15 @@ class Test(torch.jit.ScriptModule):
r = r.contiguous()
return r
@torch.jit.script_method
def conv3d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor:
r = torch.nn.functional.conv3d(x, w)
if toChannelsLast:
r = r.contiguous(memory_format=torch.channels_last_3d)
else:
r = r.contiguous()
return r
@torch.jit.script_method
def contiguous(self, x: Tensor) -> Tensor:
return x.contiguous()


@ -348,15 +348,32 @@ public abstract class PytorchTestBase {
@Test
public void testChannelsLastConv2d() throws IOException {
long[] inputShape = new long[] {1, 3, 2, 2};
long[] dataNCHW = new long[] {1, 2, 3, 4, 11, 12, 13, 14, 101, 102, 103, 104};
Tensor inputNCHW = Tensor.fromBlob(dataNCHW, inputShape, MemoryFormat.CONTIGUOUS);
long[] dataNHWC = new long[] {1, 11, 101, 2, 12, 102, 3, 13, 103, 4, 14, 104};
Tensor inputNHWC = Tensor.fromBlob(dataNHWC, inputShape, MemoryFormat.CHANNELS_LAST);
long[] dataNCHW = new long[] {
111, 112,
121, 122,
211, 212,
221, 222,
311, 312,
321, 322};
Tensor inputNCHW = Tensor.fromBlob(dataNCHW, inputShape, MemoryFormat.CONTIGUOUS);
long[] dataNHWC = new long[] {
111, 211, 311, 112, 212, 312,
121, 221, 321, 122, 222, 322};
Tensor inputNHWC = Tensor.fromBlob(dataNHWC, inputShape, MemoryFormat.CHANNELS_LAST);
long[] weightShape = new long[] {3, 3, 1, 1};
long[] dataWeightOIHW = new long[] {2, 0, 0, 0, 1, 0, 0, 0, -1};
long[] dataWeightOIHW = new long[] {
2, 0, 0,
0, 1, 0,
0, 0, -1};
Tensor wNCHW = Tensor.fromBlob(dataWeightOIHW, weightShape, MemoryFormat.CONTIGUOUS);
long[] dataWeightOHWI = new long[] {2, 0, 0, 0, 1, 0, 0, 0, -1};
long[] dataWeightOHWI = new long[] {
2, 0, 0,
0, 1, 0,
0, 0, -1};
Tensor wNHWC = Tensor.fromBlob(dataWeightOHWI, weightShape, MemoryFormat.CHANNELS_LAST);
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
@ -367,7 +384,15 @@ public abstract class PytorchTestBase {
outputNCHW,
MemoryFormat.CONTIGUOUS,
new long[] {1, 3, 2, 2},
new long[] {2, 4, 6, 8, 11, 12, 13, 14, -101, -102, -103, -104});
new long[] {
2*111, 2*112,
2*121, 2*122,
211, 212,
221, 222,
-311, -312,
-321, -322});
final IValue outputNHWC =
module.runMethod("conv2d", IValue.from(inputNHWC), IValue.from(wNHWC), IValue.from(true));
@ -375,7 +400,89 @@ public abstract class PytorchTestBase {
outputNHWC,
MemoryFormat.CHANNELS_LAST,
new long[] {1, 3, 2, 2},
new long[] {2, 11, -101, 4, 12, -102, 6, 13, -103, 8, 14, -104});
new long[] {
2*111, 211, -311, 2*112, 212, -312,
2*121, 221, -321, 2*122, 222, -322});
}
@Test
public void testChannelsLastConv3d() throws IOException {
long[] inputShape = new long[] {1, 3, 2, 2, 2};
long[] dataNCDHW = new long[] {
1111, 1112,
1121, 1122,
1211, 1212,
1221, 1222,
2111, 2112,
2121, 2122,
2211, 2212,
2221, 2222,
3111, 3112,
3121, 3122,
3211, 3212,
3221, 3222};
Tensor inputNCDHW = Tensor.fromBlob(dataNCDHW, inputShape, MemoryFormat.CONTIGUOUS);
long[] dataNDHWC = new long[] {
1111, 2111, 3111,
1112, 2112, 3112,
1121, 2121, 3121,
1122, 2122, 3122,
1211, 2211, 3211,
1212, 2212, 3212,
1221, 2221, 3221,
1222, 2222, 3222};
Tensor inputNDHWC = Tensor.fromBlob(dataNDHWC, inputShape, MemoryFormat.CHANNELS_LAST_3D);
long[] weightShape = new long[] {3, 3, 1, 1, 1};
long[] dataWeightOIDHW = new long[] {
2, 0, 0,
0, 1, 0,
0, 0, -1,
};
Tensor wNCDHW = Tensor.fromBlob(dataWeightOIDHW, weightShape, MemoryFormat.CONTIGUOUS);
long[] dataWeightODHWI = new long[] {
2, 0, 0,
0, 1, 0,
0, 0, -1,
};
Tensor wNDHWC = Tensor.fromBlob(dataWeightODHWI, weightShape, MemoryFormat.CHANNELS_LAST_3D);
final Module module = loadModel(TEST_MODULE_ASSET_NAME);
final IValue outputNCDHW =
module.runMethod("conv3d", IValue.from(inputNCDHW), IValue.from(wNCDHW), IValue.from(false));
assertIValueTensor(
outputNCDHW,
MemoryFormat.CONTIGUOUS,
new long[] {1, 3, 2, 2, 2},
new long[] {
2*1111, 2*1112, 2*1121, 2*1122,
2*1211, 2*1212, 2*1221, 2*1222,
2111, 2112, 2121, 2122,
2211, 2212, 2221, 2222,
-3111, -3112, -3121, -3122,
-3211, -3212, -3221, -3222});
final IValue outputNDHWC =
module.runMethod("conv3d", IValue.from(inputNDHWC), IValue.from(wNDHWC), IValue.from(true));
assertIValueTensor(
outputNDHWC,
MemoryFormat.CHANNELS_LAST_3D,
new long[] {1, 3, 2, 2, 2},
new long[] {
2*1111, 2111, -3111, 2*1112, 2112, -3112,
2*1121, 2121, -3121, 2*1122, 2122, -3122,
2*1211, 2211, -3211, 2*1212, 2212, -3212,
2*1221, 2221, -3221, 2*1222, 2222, -3222});
}
@Test

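The NHWC arrays in these tests are simply the NCHW data in channels-last physical order; the correspondence is easy to check from Python:

```python
import torch

# Shape (1, 3, 2, 2), same values as dataNCHW in the test above.
x = torch.tensor([111, 112, 121, 122,
                  211, 212, 221, 222,
                  311, 312, 321, 322]).reshape(1, 3, 2, 2)

# A channels_last tensor stores its values physically in NHWC order, so
# permuting to NHWC and flattening reproduces the dataNHWC array.
print(x.permute(0, 2, 3, 1).flatten().tolist())
# [111, 211, 311, 112, 212, 312, 121, 221, 321, 122, 222, 322]

# contiguous(memory_format=...) changes only the physical layout; the
# logical values (and hence the conv result) are unchanged.
x_cl = x.contiguous(memory_format=torch.channels_last)
assert torch.equal(x_cl, x)
```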

@ -84,6 +84,15 @@ def conv2d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor:
r = r.contiguous()
return r
def conv3d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor:
r = torch.conv3d(x, w)
if (toChannelsLast):
# memory_format=torch.channels_last_3d (== 3 in the MemoryFormat enum;
# 2 is channels_last, which only applies to 4D tensors)
r = r.contiguous(memory_format=3)
else:
r = r.contiguous()
return r
def contiguous(self, x: Tensor) -> Tensor:
return x.contiguous()


@ -4,7 +4,9 @@
#include <c10/core/ScalarType.h>
#include <c10/util/BFloat16.h>
#include <c10/util/Float8_e4m3fn.h>
#include <c10/util/Float8_e4m3fnuz.h>
#include <c10/util/Float8_e5m2.h>
#include <c10/util/Float8_e5m2fnuz.h>
#include <c10/util/Half.h>
// Defines the accumulation type for a scalar type.
@ -87,6 +89,8 @@ MPS_ACC_TYPE(BFloat16, float);
MPS_ACC_TYPE(Half, float);
MPS_ACC_TYPE(Float8_e5m2, float);
MPS_ACC_TYPE(Float8_e4m3fn, float);
MPS_ACC_TYPE(Float8_e5m2fnuz, float);
MPS_ACC_TYPE(Float8_e4m3fnuz, float);
MPS_ACC_TYPE(float, float);
MPS_ACC_TYPE(double, float);
MPS_ACC_TYPE(int8_t, int64_t);
@ -107,6 +111,8 @@ CUDA_ACC_TYPE(BFloat16, float);
CUDA_ACC_TYPE(Half, float);
CUDA_ACC_TYPE(Float8_e5m2, float);
CUDA_ACC_TYPE(Float8_e4m3fn, float);
CUDA_ACC_TYPE(Float8_e5m2fnuz, float);
CUDA_ACC_TYPE(Float8_e4m3fnuz, float);
CUDA_ACC_TYPE(float, float);
CUDA_ACC_TYPE(double, double);
CUDA_ACC_TYPE(int8_t, int64_t);
@ -123,8 +129,8 @@ CUDA_ACC_TYPE(c10::complex<double>, c10::complex<double>);
CPU_ACC_TYPE(BFloat16, float);
CPU_ACC_TYPE(Half, float);
CPU_ACC_TYPE(Float8_e5m2, float);
CPU_ACC_TYPE(Float8_e5m2fnuz, float);
CPU_ACC_TYPE(Float8_e4m3fn, float);
CPU_ACC_TYPE(Float8_e5m2fnuz, float);
CPU_ACC_TYPE(Float8_e4m3fnuz, float);
CPU_ACC_TYPE(float, double);
CPU_ACC_TYPE(double, double);


@ -60,11 +60,11 @@ endif()
file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/vec256/vsx/*.h" "cpu/vec/vec256/zarch/*.h" "cpu/vec/*.h" "quantized/*.h" "functorch/*.h")
file(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp" "functorch/*.cpp")
file(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh")
file(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp")
file(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh" "cuda/tunable/*.cuh" "cuda/tunable/*.h")
file(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp" "cuda/tunable/*.cpp")
file(GLOB cuda_nvrtc_stub_h "cuda/nvrtc_stub/*.h")
file(GLOB cuda_nvrtc_stub_cpp "cuda/nvrtc_stub/*.cpp")
file(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu")
file(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu" "cuda/tunable/*.cu")
file(GLOB cudnn_h "cudnn/*.h" "cudnn/*.cuh")
file(GLOB cudnn_cpp "cudnn/*.cpp")
file(GLOB ops_h "ops/*.h")
@ -72,10 +72,10 @@ file(GLOB ops_h "ops/*.h")
file(GLOB xpu_h "xpu/*.h" "xpu/detail/*.h")
file(GLOB xpu_cpp "xpu/*.cpp" "xpu/detail/*.cpp")
file(GLOB hip_h "hip/*.h" "hip/detail/*.h" "hip/*.cuh" "hip/detail/*.cuh" "hip/impl/*.h")
file(GLOB hip_cpp "hip/*.cpp" "hip/detail/*.cpp" "hip/impl/*.cpp")
file(GLOB hip_h "hip/*.h" "hip/detail/*.h" "hip/*.cuh" "hip/detail/*.cuh" "hip/impl/*.h" "hip/tunable/*.cuh" "hip/tunable/*.h")
file(GLOB hip_cpp "hip/*.cpp" "hip/detail/*.cpp" "hip/impl/*.cpp" "hip/tunable/*.cpp")
list(REMOVE_ITEM hip_cpp "${CMAKE_CURRENT_SOURCE_DIR}/hip/detail/LazyNVRTC.cpp")
file(GLOB hip_hip "hip/*.hip" "hip/detail/*.hip" "hip/impl/*.hip")
file(GLOB hip_hip "hip/*.hip" "hip/detail/*.hip" "hip/impl/*.hip" "hip/tunable/*.hip")
file(GLOB hip_nvrtc_stub_h "hip/nvrtc_stub/*.h")
file(GLOB hip_nvrtc_stub_cpp "hip/nvrtc_stub/*.cpp")
file(GLOB miopen_h "miopen/*.h")
@ -141,6 +141,7 @@ file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp")
file(GLOB native_quantized_cuda_cu "native/quantized/cuda/*.cu")
file(GLOB native_quantized_cuda_cpp "native/quantized/cuda/*.cpp")
file(GLOB native_quantized_cudnn_cpp "native/quantized/cudnn/*.cpp")
file(GLOB native_nested_h "native/nested/*.h")
file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu")
file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp")
@ -449,19 +450,7 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE)
endif()
endif()
if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND
CMAKE_C_COMPILER_VERSION VERSION_GREATER 6.9 AND CMAKE_C_COMPILER_VERSION VERSION_LESS 8)
set(GCC_7 True)
else()
set(GCC_7 False)
endif()
if(GCC_7)
set(CMAKE_BUILD_TYPE Release) # Always build Sleef as a Release build to work around a gcc-7 bug
endif()
add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/sleef" ${CMAKE_BINARY_DIR}/sleef)
if(GCC_7)
set(CMAKE_BUILD_TYPE ${OLD_CMAKE_BUILD_TYPE})
endif()
set_property(TARGET sleef PROPERTY FOLDER "dependencies")
list(APPEND ATen_THIRD_PARTY_INCLUDE ${CMAKE_BINARY_DIR}/include)
link_directories(${CMAKE_BINARY_DIR}/sleef/lib)
@ -585,7 +574,7 @@ configure_file(ATenConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/AT
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake"
DESTINATION "${AT_INSTALL_SHARE_DIR}/cmake/ATen")
set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS})
set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS} ${native_nested_h})
if(NOT INTERN_BUILD_MOBILE)
list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${xpu_h} ${mps_h} ${native_mps_h} ${native_utils_h} ${miopen_h})
# Metal


@ -133,6 +133,15 @@ void Context::setSDPUseMath(bool e) {
enabled_mathSDP = e;
}
bool Context::userEnabledCuDNNSDP() const {
return enabled_cudnnSDP;
}
void Context::setSDPUseCuDNN(bool e) {
enabled_cudnnSDP = e;
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
static const char cublas_config_var_name[] = "CUBLAS_WORKSPACE_CONFIG";
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)


@ -1,11 +1,13 @@
#pragma once
#include <ATen/CPUGeneratorImpl.h>
#include <ATen/DeviceAccelerator.h>
#include <ATen/LinalgBackend.h>
#include <ATen/core/ATenGeneral.h>
#include <ATen/core/DeprecatedTypeProperties.h>
#include <ATen/core/Generator.h>
#include <ATen/core/LegacyTypeDispatch.h>
#include <ATen/detail/AcceleratorHooksInterface.h>
#include <ATen/detail/CUDAHooksInterface.h>
#include <ATen/detail/HIPHooksInterface.h>
#include <ATen/detail/IPUHooksInterface.h>
@ -56,9 +58,26 @@ class TORCH_API Context {
AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled.");
}
}
const AcceleratorHooksInterface& getAcceleratorHooksInterface(
c10::optional<c10::DeviceType> opt_device_type = c10::nullopt) {
c10::DeviceType device_type = opt_device_type.has_value()
? opt_device_type.value()
: at::getAccelerator(true).value();
if (device_type == at::kCUDA) {
return at::detail::getCUDAHooks();
} else if (device_type == at::kMPS) {
return at::detail::getMPSHooks();
} else if (device_type == at::kPrivateUse1) {
return at::detail::getPrivateUse1Hooks();
} else {
AT_ERROR(
c10::DeviceTypeName(device_type), " device type not an accelerator.");
}
}
Device getDeviceFromPtr(void* data, c10::DeviceType device_type) {
initCUDAIfNeeded(device_type);
initHIPIfNeeded(device_type);
initXPUIfNeeded(device_type);
if (device_type == at::kCPU) {
return c10::DeviceType::CPU;
} else if (device_type == at::kCUDA) {
@ -131,6 +150,9 @@ class TORCH_API Context {
void lazyInitHIP() {
c10::call_once(thh_init, [&] { detail::getHIPHooks().initHIP(); });
}
void lazyInitXPU() {
c10::call_once(thx_init, [&] { detail::getXPUHooks().initXPU(); });
}
void lazyInitPrivateUse1() {
c10::call_once(thp_init, [&] {
if (isPrivateUse1HooksRegistered()) {
@ -181,6 +203,9 @@ class TORCH_API Context {
void setSDPUseMath(bool);
bool userEnabledMathSDP() const;
void setSDPUseCuDNN(bool);
bool userEnabledCuDNNSDP() const;
at::LinalgBackend linalgPreferredBackend() const;
void setLinalgPreferredBackend(at::LinalgBackend);
@ -307,9 +332,15 @@ class TORCH_API Context {
lazyInitHIP();
}
}
void initXPUIfNeeded(c10::DeviceType p) {
if (p == c10::DeviceType::XPU) {
lazyInitXPU();
}
}
static bool checkCuBLASConfigDeterministic();
c10::once_flag thc_init;
c10::once_flag thh_init;
c10::once_flag thx_init;
c10::once_flag thp_init;
bool enabled_cudnn = true;
bool deterministic_cudnn = false;
@ -319,6 +350,7 @@ class TORCH_API Context {
bool enabled_flashSDP = true;
bool enabled_mem_efficientSDP = true;
bool enabled_mathSDP = true;
bool enabled_cudnnSDP = false;
#ifdef USE_ROCM
bool benchmark_cudnn = true;
#else

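These flags back the SDPA backend toggles exposed to Python. By analogy with the three existing ones, usage looks roughly like this (a sketch; how the new cuDNN flag is surfaced to Python is not shown in this diff):

```python
import torch
import torch.nn.functional as F

q = k = v = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)

# enable_flash / enable_math / enable_mem_efficient map to enabled_flashSDP,
# enabled_mathSDP and enabled_mem_efficientSDP above; enabled_cudnnSDP would
# presumably get a fourth, similarly named knob.
with torch.backends.cuda.sdp_kernel(
    enable_flash=True, enable_math=True, enable_mem_efficient=True
):
    out = F.scaled_dot_product_attention(q, k, v)
```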

@ -0,0 +1,31 @@
#include <ATen/DeviceAccelerator.h>
#include <ATen/Context.h>
namespace at {
C10_API std::optional<DeviceType> getAccelerator(bool checked) {
#define CHECK_NO_CUDA \
TORCH_CHECK(!at::hasCUDA(), "Cannot have both CUDA and PrivateUse1");
#define CHECK_NO_PU1 \
TORCH_CHECK(!is_privateuse1_backend_registered(), "Cannot have both CUDA and PrivateUse1");
if (is_privateuse1_backend_registered()) {
// We explicitly allow PrivateUse1 and another device at the same time
// as we use this for testing.
// Whenever a PrivateUse1 device is registered, use it first.
return kPrivateUse1;
} else if (at::hasCUDA()) {
CHECK_NO_PU1
return kCUDA;
} else {
TORCH_CHECK(!checked, "Cannot access accelerator device when none is available.")
return std::nullopt;
}
#undef CHECK_NO_CUDA
#undef CHECK_NO_PU1
}
} // namespace at


@ -0,0 +1,27 @@
#pragma once
#include <c10/core/DeviceType.h>
#include <c10/macros/Macros.h>
#include <ATen/detail/MTIAHooksInterface.h>
#include <optional>
// This file defines the top level Accelerator concept for PyTorch.
// A device is an accelerator per the definition here if:
// - It is mutually exclusive with all other accelerators
// - It performs asynchronous compute via a Stream/Event system
// - It provides a set of common APIs as defined by AcceleratorHooksInterface
//
// As of today, accelerator devices are (in no particular order):
// CUDA, MTIA, PrivateUse1
// We want to add once all the proper APIs are supported and tested:
// HIP, MPS, XPU
namespace at {
// Ensures that only one accelerator is available (at
// compile time if possible) and return it.
// When checked is true, the returned optional always has a value.
TORCH_API std::optional<c10::DeviceType> getAccelerator(bool checked = false);
} // namespace at
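Restating the selection logic above in Python terms (a hypothetical sketch, not a real PyTorch API; the PrivateUse1 check is a heuristic stand-in for `is_privateuse1_backend_registered()`):

```python
from typing import Optional

import torch


def get_accelerator(checked: bool = False) -> Optional[str]:
    # PrivateUse1 wins whenever a backend is registered under it, since it is
    # explicitly allowed to coexist with other devices for testing.
    renamed = torch._C._get_privateuse1_backend_name()
    if renamed != "privateuseone":  # heuristic: the backend renamed itself
        return "privateuse1"
    if torch.cuda.is_available():
        return "cuda"
    if checked:
        raise RuntimeError("Cannot access accelerator device when none is available.")
    return None
```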


@ -145,7 +145,7 @@ SymInt computeStorageNbytes(
// of the last element according to stride
SymInt size = 1;
for (const auto i : c10::irange(sizes.size())) {
if (sizes[i] == 0) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[i].sym_eq(0))) {
return 0;
}
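The Python side has the analogous helper `guard_size_oblivious`; a small sketch of the same pattern on plain ints or SymInts:

```python
from torch.fx.experimental.symbolic_shapes import guard_size_oblivious


def storage_numel(sizes) -> int:
    # Mirrors the C++ change: instead of guarding on sizes[i] == 0 (which would
    # specialize an unbacked SymInt), ask the size-oblivious question, which
    # assumes an unbacked size is not 0 without inserting a guard.
    numel = 1
    for s in sizes:
        if guard_size_oblivious(s == 0):
            return 0
        numel *= s
    return numel
```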


@ -352,6 +352,41 @@ const char* FunctionalTensorWrapper::tensorimpl_type_name() const {
return "FunctionalTensorWrapper";
}
void FunctionalTensorWrapper::copy_tensor_metadata(
const FunctionalTensorWrapper* src_impl,
FunctionalTensorWrapper* dest_impl,
const c10::VariableVersion& version_counter,
bool allow_tensor_metadata_change) {
TensorImpl::copy_tensor_metadata(
src_impl,
dest_impl,
version_counter,
allow_tensor_metadata_change);
// FunctionalTensorWrapper-specific fields.
dest_impl->value_ = src_impl->value_;
dest_impl->level_ = src_impl->level_;
dest_impl->mutation_counter_ = src_impl->mutation_counter_;
dest_impl->mutation_hidden_from_autograd_counter_ = src_impl->mutation_hidden_from_autograd_counter_;
dest_impl->mutation_during_no_grad_or_inference_mode_ = src_impl->mutation_during_no_grad_or_inference_mode_;
dest_impl->has_metadata_mutation_ = src_impl->has_metadata_mutation_;
dest_impl->is_multi_output_view_ = src_impl->is_multi_output_view_;
dest_impl->was_storage_changed_ = src_impl->was_storage_changed_;
dest_impl->generation_ = src_impl->generation_;
dest_impl->view_metas_ = src_impl->view_metas_;
}
void FunctionalTensorWrapper::copy_tensor_metadata_and_refresh(
const FunctionalTensorWrapper* src_impl,
FunctionalTensorWrapper* dest_impl,
const c10::VariableVersion& version_counter,
bool allow_tensor_metadata_change) const {
copy_tensor_metadata(src_impl, dest_impl, version_counter, allow_tensor_metadata_change);
dest_impl->refresh_numel();
dest_impl->refresh_contiguous();
}
template <typename VariableVersion>
c10::intrusive_ptr<TensorImpl> FunctionalTensorWrapper::shallow_copy_and_detach_core(
VariableVersion&& version_counter,
@ -367,16 +402,11 @@ c10::intrusive_ptr<TensorImpl> FunctionalTensorWrapper::shallow_copy_and_detach_
}
auto impl = c10::make_intrusive<FunctionalTensorWrapper>(value_);
copy_tensor_metadata(
copy_tensor_metadata_and_refresh(
/*src_impl=*/this,
/*dest_impl=*/impl.get(),
/*version_counter=*/std::forward<VariableVersion>(version_counter),
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change);
impl->level_ = level_;
impl->generation_ = generation_;
impl->view_metas_ = view_metas_;
impl->refresh_numel();
impl->refresh_contiguous();
return impl;
}
@ -394,6 +424,18 @@ c10::intrusive_ptr<TensorImpl> FunctionalTensorWrapper::shallow_copy_and_detach(
std::move(version_counter), allow_tensor_metadata_change);
}
void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptr<TensorImpl>& impl) {
AT_ASSERT(has_compatible_shallow_copy_type(impl->key_set()));
auto functional_impl =
static_cast<FunctionalTensorWrapper*>(impl.get());
copy_tensor_metadata_and_refresh(
/*src_impl=*/functional_impl,
/*dest_impl=*/this,
/*version_counter=*/version_counter(),
/*allow_tensor_metadata_change=*/allow_tensor_metadata_change());
}
c10::Device FunctionalTensorWrapper::device_custom() const {
return value_.unsafeGetTensorImpl()->device();
}


@ -211,6 +211,13 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
VariableVersion&& version_counter,
bool allow_tensor_metadata_change) const;
void shallow_copy_from(const c10::intrusive_ptr<TensorImpl>& impl) override;
void copy_tensor_metadata_and_refresh(
const FunctionalTensorWrapper* src_impl,
FunctionalTensorWrapper* dest_impl,
const c10::VariableVersion& version_counter,
bool allow_tensor_metadata_change) const;
// Note that value is not taken by reference: internally, the wrapper will
// change the value tensor that it points to over time.
Tensor value_;
@ -230,6 +237,13 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
size_t generation_ = 0;
std::vector<at::functionalization::ViewMeta> view_metas_;
protected:
static void copy_tensor_metadata(
const FunctionalTensorWrapper* src_impl,
FunctionalTensorWrapper* dest_impl,
const c10::VariableVersion& version_counter,
bool allow_tensor_metadata_change);
};
// Utility functions for the functionalization pass.

Some files were not shown because too many files have changed in this diff.