Compare commits


1 Commit

SHA1: 73c49ee963
Message: Speed up fx graph iteration by implementing it in C++
ghstack-source-id: af7493f6f73baf00e30a6d5790a601729bd9c900
Pull Request resolved: https://github.com/pytorch/pytorch/pull/128288
Date: 2024-06-08 17:12:47 -07:00
2579 changed files with 55839 additions and 82362 deletions

View File

@ -1,5 +0,0 @@
0.6b
manylinux_2_17
rocm6.1
7f07e8a1cb1f99627eb6d77f5c0e9295c775f3c7
77c29fa3f3b614e187d7213d745e989a92708cee2bc6020419ab49019af399d1

View File

@ -373,13 +373,6 @@ case "$image" in
CONDA_CMAKE=yes
EXECUTORCH=yes
;;
pytorch-linux-jammy-py3.12-halide)
CUDA_VERSION=12.4
ANACONDA_PYTHON_VERSION=3.12
GCC_VERSION=11
CONDA_CMAKE=yes
HALIDE=yes
;;
pytorch-linux-focal-linter)
# TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
# We will need to update mypy version eventually, but that's for another day. The task
@ -497,7 +490,6 @@ docker build \
--build-arg "DOCS=${DOCS}" \
--build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
--build-arg "EXECUTORCH=${EXECUTORCH}" \
--build-arg "HALIDE=${HALIDE}" \
--build-arg "XPU_VERSION=${XPU_VERSION}" \
--build-arg "ACL=${ACL:-}" \
--build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \

View File

@ -113,18 +113,18 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
# Install AOTriton (Early fail)
COPY ./aotriton_version.txt aotriton_version.txt
COPY ./common/common_utils.sh common_utils.sh
COPY ./common/install_aotriton.sh install_aotriton.sh
RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
RUN bash ./install_cache.sh && rm install_cache.sh
# Install AOTriton
COPY ci_commit_pins/aotriton.txt aotriton.txt
COPY ./common/common_utils.sh common_utils.sh
COPY ./common/install_aotriton.sh install_aotriton.sh
RUN bash ./install_aotriton.sh /opt/rocm/aotriton && rm -rf install_aotriton.sh aotriton aotriton.txt common_utils.sh
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
# Include BUILD_ENVIRONMENT environment variable in image
ARG BUILD_ENVIRONMENT
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}

View File

@ -0,0 +1 @@
24a3fe9cb57e5cda3c923df29743f9767194cc27

View File

@ -1 +1 @@
c572f9e509b5ec5d56f4d218271e36269bba244f
d4b3e5cc607e97afdba79dc90f8ef968142f347c

View File

@ -1 +0,0 @@
340136fec6d3ebc73e7a19eba1663e9b0ba8ab2d

View File

@ -1 +1 @@
21eae954efa5bf584da70324b640288c3ee7aede
01cbe5045a6898c9a925f01435c8277b2fe6afcc

View File

@ -1 +1 @@
1b2f15840e0d70eec50d84c7a0575cb835524def
b8c64f64c18d8cac598b3adb355c21e7439c21de

View File

@ -1 +1 @@
dedb7bdf339a3546896d4820366ca562c586bfa0
45fff310c891f5a92d55445adf8cc9d29df5841e

.ci/docker/common/install_aotriton.sh Executable file → Normal file
View File

@ -4,20 +4,21 @@ set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
TARBALL='aotriton.tar.bz2'
# This read command always returns with exit code 1 because read stops at EOF, hence the || true
read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
ARCH=$(uname -m)
AOTRITON_DIR="aotriton"
AOTRITON_PINNED_NAME="aotriton" # No .txt extension
AOTRITON_PINNED_COMMIT=$(get_pinned_commit ${AOTRITON_PINNED_NAME})
AOTRITON_INSTALL_PREFIX="$1"
AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.bz2"
cd "${AOTRITON_INSTALL_PREFIX}"
# Must use -L to follow redirects
curl -L --retry 3 -o "${TARBALL}" "${AOTRITON_URL}"
ACTUAL_SHA256=$(sha256sum "${TARBALL}" | cut -d " " -f 1)
if [ "${SHA256}" != "${ACTUAL_SHA256}" ]; then
echo -n "Error: The SHA256 of downloaded tarball is ${ACTUAL_SHA256},"
echo " which does not match the expected value ${SHA256}."
exit 1
fi
tar xf "${TARBALL}" && rm -rf "${TARBALL}"
git clone https://github.com/ROCm/aotriton.git "${AOTRITON_DIR}"
cd "${AOTRITON_DIR}"
git checkout "${AOTRITON_PINNED_COMMIT}"
git submodule sync --recursive
git submodule update --init --recursive --force --depth 1
mkdir build
cd build
cmake .. -G Ninja -DCMAKE_INSTALL_PREFIX=./install_dir -DCMAKE_BUILD_TYPE=Release -DAOTRITON_COMPRESS_KERNEL=OFF -DAOTRITON_NO_PYTHON=ON -DAOTRITON_NO_SHARED=ON
ninja install
mkdir -p "${AOTRITON_INSTALL_PREFIX}"
cp -r install_dir/* "${AOTRITON_INSTALL_PREFIX}"
find /tmp/ -mindepth 1 -delete
rm -rf ~/.triton
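
For context, the tarball variant of this script validates the download against the SHA256 pinned in aotriton_version.txt; a minimal Python sketch of an equivalent digest check (hypothetical paths, not part of the CI code):

    import hashlib

    def sha256_of(path: str) -> str:
        # Stream the file in 1 MiB chunks so large tarballs stay out of memory.
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest()

    # Expected digest, as pinned in aotriton_version.txt above.
    EXPECTED = "77c29fa3f3b614e187d7213d745e989a92708cee2bc6020419ab49019af399d1"
    if sha256_of("aotriton.tar.bz2") != EXPECTED:
        raise SystemExit("SHA256 mismatch, refusing to install the tarball")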

View File

@ -37,9 +37,6 @@ install_conda_dependencies() {
install_pip_dependencies() {
pushd executorch/.ci/docker
# Install PyTorch CPU build beforehand to avoid installing the much bigger CUDA
# binaries later; ExecuTorch only needs CPU
pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
# Install all Python dependencies
pip_install -r requirements-ci.txt
popd
@ -47,14 +44,13 @@ install_pip_dependencies() {
setup_executorch() {
pushd executorch
# Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
as_jenkins bash .ci/scripts/setup-vulkan-linux-deps.sh
source .ci/scripts/utils.sh
export PYTHON_EXECUTABLE=python
export EXECUTORCH_BUILD_PYBIND=ON
export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
install_flatc_from_source
pip_install .
as_jenkins .ci/scripts/setup-linux.sh cmake
# Make sure that all the newly generated files are owned by Jenkins
chown -R jenkins .
popd
}

View File

@ -1,46 +0,0 @@
#!/bin/bash
set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
COMMIT=$(get_pinned_commit halide)
test -n "$COMMIT"
# activate conda to populate CONDA_PREFIX
test -n "$ANACONDA_PYTHON_VERSION"
eval "$(conda shell.bash hook)"
conda activate py_$ANACONDA_PYTHON_VERSION
if [ -n "${UBUNTU_VERSION}" ];then
apt update
apt-get install -y lld liblld-15-dev libpng-dev libjpeg-dev libgl-dev \
libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev
fi
conda_install numpy scipy imageio cmake ninja
git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git
cmake -DCMAKE_BUILD_TYPE=Release \
-DLLVM_ENABLE_PROJECTS="clang" \
-DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
-DLLVM_ENABLE_TERMINFO=OFF -DLLVM_ENABLE_ASSERTIONS=ON \
-DLLVM_ENABLE_EH=ON -DLLVM_ENABLE_RTTI=ON -DLLVM_BUILD_32_BITS=OFF \
-S llvm-project/llvm -B llvm-build -G Ninja
cmake --build llvm-build
cmake --install llvm-build --prefix llvm-install
export LLVM_ROOT=`pwd`/llvm-install
export LLVM_CONFIG=$LLVM_ROOT/bin/llvm-config
git clone https://github.com/halide/Halide.git
pushd Halide
git checkout ${COMMIT} && git submodule update --init --recursive
pip_install -r requirements.txt
cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
cmake --build build
test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
cmake --install build --prefix ${CONDA_PREFIX}
chown -R jenkins ${CONDA_PREFIX}
popd
rm -rf Halide llvm-build llvm-project llvm-install
python -c "import halide" # check for errors

View File

@ -33,9 +33,7 @@ pip_install coloredlogs packaging
pip_install onnxruntime==1.18
pip_install onnx==1.16.0
# pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps
pip_install onnxscript==0.1.0.dev20240613 --no-deps
# required by onnxscript
pip_install ml_dtypes
pip_install onnxscript==0.1.0.dev20240523 --no-deps
# Cache the transformers model to be used later by ONNX tests. We need to run the transformers
# package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/

View File

@ -85,10 +85,10 @@ librosa>=0.6.2 ; python_version < "3.11"
#Pinned versions:
#test that import:
mypy==1.10.0
mypy==1.9.0
# Pin MyPy version because new errors are likely to appear with each release
#Description: linter
#Pinned versions: 1.10.0
#Pinned versions: 1.9.0
#test that import: test_typing.py, test_type_hints.py
networkx==2.8.8
@ -306,7 +306,7 @@ pywavelets==1.5.0 ; python_version >= "3.12"
#Pinned versions: 1.4.1
#test that import:
lxml==5.0.0
#Description: This is a requirement of unittest-xml-reporting
# Python-3.9 binaries

View File

@ -103,14 +103,6 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt
ARG HALIDE
# Build and install halide
COPY ./common/install_halide.sh install_halide.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/halide.txt halide.txt
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
RUN rm install_halide.sh common_utils.sh halide.txt
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH

View File

@ -105,18 +105,18 @@ COPY triton_version.txt triton_version.txt
RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
RUN rm install_triton.sh common_utils.sh triton-rocm.txt triton_version.txt
# Install AOTriton
COPY ./aotriton_version.txt aotriton_version.txt
COPY ./common/common_utils.sh common_utils.sh
COPY ./common/install_aotriton.sh install_aotriton.sh
RUN ["/bin/bash", "-c", "./install_aotriton.sh /opt/rocm && rm -rf install_aotriton.sh aotriton_version.txt common_utils.sh"]
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
# Install ccache/sccache (do this last, so we get priority in PATH)
COPY ./common/install_cache.sh install_cache.sh
ENV PATH /opt/cache/bin:$PATH
RUN bash ./install_cache.sh && rm install_cache.sh
# Install AOTriton
COPY ci_commit_pins/aotriton.txt aotriton.txt
COPY ./common/common_utils.sh common_utils.sh
COPY ./common/install_aotriton.sh install_aotriton.sh
RUN bash ./install_aotriton.sh /opt/rocm/aotriton && rm -rf install_aotriton.sh aotriton aotriton.txt common_utils.sh
ENV AOTRITON_INSTALLED_PREFIX /opt/rocm/aotriton
# Include BUILD_ENVIRONMENT environment variable in image
ARG BUILD_ENVIRONMENT
ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT}

View File

@ -155,14 +155,6 @@ COPY ci_commit_pins/executorch.txt executorch.txt
RUN if [ -n "${EXECUTORCH}" ]; then bash ./install_executorch.sh; fi
RUN rm install_executorch.sh common_utils.sh executorch.txt
ARG HALIDE
# Build and install halide
COPY ./common/install_halide.sh install_halide.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/halide.txt halide.txt
RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
RUN rm install_halide.sh common_utils.sh halide.txt
ARG ONNX
# Install ONNX dependencies
COPY ./common/install_onnx.sh ./common/common_utils.sh ./

View File

@ -284,26 +284,12 @@ else
# Which should be backward compatible with Numpy-1.X
python -mpip install --pre numpy==2.0.0rc1
fi
WERROR=1 python setup.py clean
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel
BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 python setup.py bdist_wheel --cmake
else
WERROR=1 python setup.py bdist_wheel
fi
WERROR=1 python setup.py bdist_wheel
else
python setup.py clean
if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
source .ci/pytorch/install_cache_xla.sh
fi
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
echo "USE_SPLIT_BUILD cannot be used with xla or rocm"
exit 1
else
python setup.py bdist_wheel
fi
python setup.py bdist_wheel
fi
pip_install_whl "$(echo dist/*.whl)"
@ -342,10 +328,9 @@ else
CUSTOM_OP_TEST="$PWD/test/custom_operator"
python --version
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir -p "$CUSTOM_OP_BUILD"
pushd "$CUSTOM_OP_BUILD"
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@ -358,7 +343,7 @@ else
SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
mkdir -p "$JIT_HOOK_BUILD"
pushd "$JIT_HOOK_BUILD"
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd
@ -370,7 +355,7 @@ else
python --version
mkdir -p "$CUSTOM_BACKEND_BUILD"
pushd "$CUSTOM_BACKEND_BUILD"
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
-DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
make VERBOSE=1
popd

View File

@ -56,29 +56,9 @@ function assert_git_not_dirty() {
function pip_install_whl() {
# This is used to install PyTorch and other build-artifact wheels locally
# without using any network connection
# Convert the input arguments into an array
local args=("$@")
# Check if the first argument contains multiple paths separated by spaces
if [[ "${args[0]}" == *" "* ]]; then
# Split the string by spaces into an array
IFS=' ' read -r -a paths <<< "${args[0]}"
# Loop through each path and install individually
for path in "${paths[@]}"; do
echo "Installing $path"
python3 -mpip install --no-index --no-deps "$path"
done
else
# Loop through each argument and install individually
for path in "${args[@]}"; do
echo "Installing $path"
python3 -mpip install --no-index --no-deps "$path"
done
fi
python3 -mpip install --no-index --no-deps "$@"
}
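
A rough Python equivalent of the simplified helper (illustrative only; the real helper is the bash function above):

    import subprocess
    import sys

    def pip_install_whl(*wheel_paths: str) -> None:
        # Install local wheels offline: no index lookups, no dependency resolution.
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--no-index", "--no-deps", *wheel_paths]
        )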
function pip_install() {
# retry 3 times
# old versions of pip don't have the "--progress-bar" flag
@ -208,6 +188,28 @@ function clone_pytorch_xla() {
fi
}
function checkout_install_torchdeploy() {
local commit
commit=$(get_pinned_commit multipy)
pushd ..
git clone --recurse-submodules https://github.com/pytorch/multipy.git
pushd multipy
git checkout "${commit}"
python multipy/runtime/example/generate_examples.py
BUILD_CUDA_TESTS=1 pip install -e .
popd
popd
}
function test_torch_deploy(){
pushd ..
pushd multipy
./multipy/runtime/build/test_deploy
./multipy/runtime/build/test_deploy_gpu
popd
popd
}
function checkout_install_torchbench() {
local commit
commit=$(get_pinned_commit torchbench)
@ -222,8 +224,6 @@ function checkout_install_torchbench() {
# to install and test other models
python install.py --continue_on_fail
fi
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}

View File

@ -18,8 +18,8 @@ time python test/run_test.py --verbose -i distributed/test_c10d_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_nccl
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
time python test/run_test.py --verbose -i distributed/test_cuda_p2p
time python test/run_test.py --verbose -i distributed/test_store
time python test/run_test.py --verbose -i distributed/test_symmetric_memory
time python test/run_test.py --verbose -i distributed/test_pg_wrapper
time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent
# FSDP tests

View File

@ -264,6 +264,18 @@ elif [[ $TEST_CONFIG == 'nogpu_AVX512' ]]; then
export ATEN_CPU_CAPABILITY=avx2
fi
# temp workarounds for https://github.com/pytorch/pytorch/issues/126692, remove when fixed
if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
pushd test
CUDA_VERSION=$(python -c "import torch; print(torch.version.cuda)")
if [ "$CUDA_VERSION" == "12.4" ]; then
ISCUDA124="cu124"
else
ISCUDA124=""
fi
popd
fi
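
The probe above shells out to Python to read the CUDA version; the same selection sketched directly in Python (assuming a CUDA-enabled torch build is installed):

    import torch

    # torch.version.cuda is a string like "12.4", or None on CPU-only builds.
    cuda_version = torch.version.cuda
    ISCUDA124 = "cu124" if cuda_version == "12.4" else ""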
test_python_legacy_jit() {
time python test/run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose
assert_git_not_dirty
@ -277,9 +289,6 @@ test_python_shard() {
# A bare --include flag is not supported, and quoting it for lint ends up with the flag not being interpreted correctly
# shellcheck disable=SC2086
# modify LD_LIBRARY_PATH to ensure it has the conda env.
# This set of tests has been shown to be buggy without it for the split-build
time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION
assert_git_not_dirty
@ -338,31 +347,17 @@ test_inductor_distributed() {
assert_git_not_dirty
}
test_inductor_shard() {
if [[ -z "$NUM_TEST_SHARDS" ]]; then
echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
exit 1
fi
test_inductor() {
python tools/dynamo/verify_dynamo.py
python test/run_test.py --inductor \
--include test_modules test_ops test_ops_gradients test_torch \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
python test/run_test.py --inductor --include test_modules test_ops test_ops_gradients test_torch --verbose
# Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state
python test/run_test.py \
--include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor \
--shard "$1" "$NUM_TEST_SHARDS" \
--verbose
}
python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor --verbose
test_inductor_aoti() {
# docker build uses bdist_wheel which does not work with test_aot_inductor
# TODO: need a faster way to build
if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
fi
}
@ -381,7 +376,7 @@ test_inductor_cpp_wrapper_abi_compatible() {
--output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_timm_training.csv"
--expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_timm_training.csv"
}
# "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@ -406,7 +401,7 @@ if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--dynamic-shapes --dynamic-batch-only)
fi
if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
else
DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
@ -531,10 +526,9 @@ test_single_dynamo_benchmark() {
test_perf_for_dashboard "$suite" \
"${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}"
else
if [[ "${TEST_CONFIG}" == *aot_inductor* && "${TEST_CONFIG}" != *cpu_aot_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
# Test AOTInductor with the ABI-compatible mode on CI
# This can be removed once the ABI-compatible mode becomes default.
# For the CPU device, we prefer non-ABI-compatible mode on CI when testing AOTInductor.
export TORCHINDUCTOR_ABI_COMPATIBLE=1
fi
python "benchmarks/dynamo/$suite.py" \
@ -544,10 +538,10 @@ test_single_dynamo_benchmark() {
--output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
--expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/${TEST_CONFIG}_${name}.csv"
python benchmarks/dynamo/check_graph_breaks.py \
--actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
--expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/${TEST_CONFIG}_${name}.csv"
fi
}
@ -556,11 +550,6 @@ test_inductor_micro_benchmark() {
python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
}
test_inductor_halide() {
python test/run_test.py --include inductor/test_halide.py --verbose
assert_git_not_dirty
}
test_dynamo_benchmark() {
# Usage: test_dynamo_benchmark huggingface 0
TEST_REPORTS_DIR=$(pwd)/test/test-reports
@ -575,15 +564,11 @@ test_dynamo_benchmark() {
elif [[ "${TEST_CONFIG}" == *perf* ]]; then
test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
else
if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
local dt="float32"
if [[ "${TEST_CONFIG}" == *amp* ]]; then
dt="amp"
fi
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *freezing* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 --freezing "$@"
else
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "$@"
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
fi
elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
@ -607,7 +592,7 @@ test_inductor_torchbench_smoketest_perf() {
--bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"
--expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_torchbench_inference.csv"
python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
--batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
@ -622,8 +607,13 @@ test_inductor_torchbench_smoketest_perf() {
# https://github.com/pytorch/pytorch/actions/runs/7158691360/job/19491437314,
# and thus we lower its threshold to reduce flakiness. If this continues to be a problem,
# we will switch to some other model.
# lowering threshold from 4.9 to 4.7 for cu124. Will bump it up after cuda 12.4.0->12.4.1 update
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.7
# Use 4.7 for cuda 12.4, change back to 4.9 after fixing https://github.com/pytorch/pytorch/issues/126692
if [ "$CUDA_VERSION" == "12.4" ]; then
THRESHOLD=4.7
else
THRESHOLD=4.9
fi
python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t $THRESHOLD
# Check memory compression ratio for a few models
for test in hf_Albert timm_vision_transformer; do
@ -642,7 +632,7 @@ test_inductor_torchbench_smoketest_perf() {
--only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
python benchmarks/dynamo/check_accuracy.py \
--actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \
--expected "benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv"
--expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_huggingface_training.csv"
done
}
@ -1179,21 +1169,15 @@ test_executorch() {
pushd /executorch
export PYTHON_EXECUTABLE=python
export EXECUTORCH_BUILD_PYBIND=ON
export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
# NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
# from the PR
# NB: We need to build ExecuTorch runner here and not inside the Docker image
# because it depends on PyTorch
# shellcheck disable=SC1091
source .ci/scripts/setup-linux.sh cmake
echo "Run ExecuTorch unit tests"
pytest -v -n auto
# shellcheck disable=SC1091
LLVM_PROFDATA=llvm-profdata-12 LLVM_COV=llvm-cov-12 bash test/run_oss_cpp_tests.sh
source .ci/scripts/utils.sh
build_executorch_runner "cmake"
echo "Run ExecuTorch regression tests for some models"
# NB: This is a sample model, more can be added here
export PYTHON_EXECUTABLE=python
# TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
# shellcheck disable=SC1091
source .ci/scripts/test.sh mv3 cmake xnnpack-quantization-delegation ''
@ -1253,10 +1237,11 @@ elif [[ "$TEST_CONFIG" == distributed ]]; then
if [[ "${SHARD_NUMBER}" == 1 ]]; then
test_rpc
fi
elif [[ "$TEST_CONFIG" == deploy ]]; then
checkout_install_torchdeploy
test_torch_deploy
elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
test_inductor_halide
elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
test_inductor_micro_benchmark
elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
@ -1268,14 +1253,13 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
id=$((SHARD_NUMBER-1))
test_dynamo_benchmark timm_models "$id"
elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
install_torchaudio cpu
else
install_torchaudio cuda
fi
install_torchtext
install_torchvision
TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install git+https://github.com/pytorch/ao.git
id=$((SHARD_NUMBER-1))
# https://github.com/opencv/opencv-python/issues/885
pip_install opencv-python==4.8.0.74
@ -1294,7 +1278,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
checkout_install_torchbench
# Do this after checkout_install_torchbench to ensure we clobber any
# nightlies that torchbench may pull in
if [[ "${TEST_CONFIG}" != *cpu_inductor* && "${TEST_CONFIG}" != *cpu_aot_inductor* ]]; then
if [[ "${TEST_CONFIG}" != *cpu_inductor* ]]; then
install_torchrec_and_fbgemm
fi
PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
@ -1302,14 +1286,10 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper_abi_compatible* ]]; then
install_torchvision
test_inductor_cpp_wrapper_abi_compatible
elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
install_torchvision
test_inductor_shard 1
test_inductor_aoti
test_inductor
test_inductor_distributed
elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" -gt 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
test_inductor_shard "${SHARD_NUMBER}"
elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
install_torchvision
test_dynamo_shard 1

View File

@ -97,16 +97,8 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
)
elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)"
pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)"
# todo: after folder is populated use the pypi_pkg channel instead
pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg"
retry pip install -q numpy protobuf typing-extensions
else
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
fi
pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
retry pip install -q numpy protobuf typing-extensions
else
pip install "\$pkg"
retry pip install -q numpy protobuf typing-extensions

View File

@ -33,9 +33,9 @@ if [[ -z "$DOCKER_IMAGE" ]]; then
if [[ "$PACKAGE_TYPE" == conda ]]; then
export DOCKER_IMAGE="pytorch/conda-cuda"
elif [[ "$DESIRED_CUDA" == cpu ]]; then
export DOCKER_IMAGE="pytorch/manylinux:cpu"
export DOCKER_IMAGE="pytorch/manylinux-cpu"
else
export DOCKER_IMAGE="pytorch/manylinux-builder:${DESIRED_CUDA:2}"
export DOCKER_IMAGE="pytorch/manylinux-cuda${DESIRED_CUDA:2}"
fi
fi
@ -75,9 +75,9 @@ export PYTORCH_BUILD_NUMBER=1
TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for all the wheel builds, hence append TRITON_CONSTRAINT
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
# Triton wheels are only supported on Linux with Python < 3.13
TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
@ -87,11 +87,11 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:
fi
# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}"
if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt)
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}"
fi
if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
@ -100,6 +100,32 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
fi
fi
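
The TRITON_CONSTRAINT strings above are standard PEP 508 environment markers; a small sketch of how such a marker evaluates, using the third-party packaging library (illustrative, not part of these scripts):

    from packaging.markers import Marker

    # The same constraint the wheel builds attach to the triton requirement.
    marker = Marker(
        "platform_system == 'Linux' and platform_machine == 'x86_64' "
        "and python_version < '3.13'"
    )
    print(marker.evaluate())  # True only on a matching platform and Python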
JAVA_HOME=
BUILD_JNI=OFF
if [[ "$PACKAGE_TYPE" == libtorch ]]; then
POSSIBLE_JAVA_HOMES=()
POSSIBLE_JAVA_HOMES+=(/usr/local)
POSSIBLE_JAVA_HOMES+=(/usr/lib/jvm/java-8-openjdk-amd64)
POSSIBLE_JAVA_HOMES+=(/Library/Java/JavaVirtualMachines/*.jdk/Contents/Home)
# Add the Windows-specific JNI path
POSSIBLE_JAVA_HOMES+=("$PWD/pytorch/.circleci/windows-jni/")
for JH in "${POSSIBLE_JAVA_HOMES[@]}" ; do
if [[ -e "$JH/include/jni.h" ]] ; then
# Skip if we're not on Windows but haven't found a JAVA_HOME
if [[ "$JH" == "$PWD/pytorch/.circleci/windows-jni/" && "$OSTYPE" != "msys" ]] ; then
break
fi
echo "Found jni.h under $JH"
JAVA_HOME="$JH"
BUILD_JNI=ON
break
fi
done
if [ -z "$JAVA_HOME" ]; then
echo "Did not find jni.h"
fi
fi
cat >"$envfile" <<EOL
# =================== The following code will be executed inside Docker container ===================
export TZ=UTC
@ -110,7 +136,6 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}"
export DESIRED_CUDA="$DESIRED_CUDA"
export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}"
if [[ "${OSTYPE}" == "msys" ]]; then
export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then
@ -134,6 +159,8 @@ export TORCH_CONDA_BUILD_FOLDER='pytorch-nightly'
export ANACONDA_USER='pytorch'
export USE_FBGEMM=1
export JAVA_HOME=$JAVA_HOME
export BUILD_JNI=$BUILD_JNI
export PIP_UPLOAD_FOLDER="$PIP_UPLOAD_FOLDER"
export DOCKER_IMAGE="$DOCKER_IMAGE"

View File

@ -25,10 +25,6 @@ if [[ "${DRY_RUN}" = "disabled" ]]; then
AWS_S3_CP="aws s3 cp"
fi
if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then
UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg"
fi
# Sleep 5 minutes between retries for conda upload
retry () {
"$@" || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@")

View File

@ -62,6 +62,4 @@ readability-string-compare,
'
HeaderFilterRegex: '^(aten/|c10/|torch/).*$'
WarningsAsErrors: '*'
CheckOptions:
misc-header-include-cycle.IgnoredFilesList: 'format.h;ivalue.h;custom_class.h;Dict.h;List.h'
...

View File

@ -14,14 +14,12 @@ runs:
- name: Cleans up diskspace
shell: bash
run: |
set -ex
diskspace_cutoff=${{ inputs.diskspace-cutoff }}
docker_root_dir=$(docker info -f '{{.DockerRootDir}}')
diskspace=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
diskspace=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
if [[ "$diskspace" -ge "$diskspace_cutoff" ]] ; then
docker system prune -af
diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
diskspace_new=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
echo "$msg"

View File

@ -52,13 +52,6 @@ inputs:
description: Hugging Face Hub token
required: false
default: ""
use_split_build:
description: |
[Experimental] Build a libtorch-only wheel, and build the PyTorch wheel
such that its binaries are built from the libtorch wheel.
required: false
type: boolean
default: false
outputs:
docker-image:
value: ${{ steps.calculate-docker-image.outputs.docker-image }}
@ -151,7 +144,6 @@ runs:
DEBUG: ${{ inputs.build-with-debug == 'true' && '1' || '0' }}
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
shell: bash
run: |
# detached container should get cleaned up by teardown_ec2_linux
@ -171,7 +163,6 @@ runs:
-e PR_LABELS \
-e OUR_GITHUB_JOB_ID \
-e HUGGING_FACE_HUB_TOKEN \
-e USE_SPLIT_BUILD \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
@ -192,7 +183,7 @@ runs:
- name: Store PyTorch Build Artifacts on S3
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' && inputs.use_split_build != 'true'
if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped'
with:
name: ${{ inputs.build-environment }}
retention-days: 14
@ -200,16 +191,6 @@ runs:
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}
- name: Store PyTorch Build Artifacts on S3 for split build
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' && inputs.use_split_build == 'true'
with:
name: ${{ inputs.build-environment }}-experimental-split-build
retention-days: 14
if-no-files-found: error
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}
- name: Upload sccache stats
if: steps.build.outcome != 'skipped'
uses: seemethere/upload-artifact-s3@v5

View File

@ -26,7 +26,6 @@ runs:
-e PYTORCH_FINAL_PACKAGE_DIR \
-e PYTORCH_ROOT \
-e SKIP_ALL_TESTS \
-e USE_SPLIT_BUILD \
--tty \
--detach \
-v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \

View File

@ -1 +1 @@
b829e936f7cc61b48149f5f957a451a38bf2a178
1980f8af5bcd0bb2ce51965cf79d8d4c25dad8a0

View File

@ -1 +1 @@
23512dbebd44a11eb84afbf53c3c071dd105297e
d6015d42d9a1834bc7595c4bd6852562fb80b30b

View File

@ -27,9 +27,11 @@
- third_party/onnx
- caffe2/python/onnx/**
approved_by:
- BowenBao
- justinchuby
- liqunfu
- shubhambhokare1
- thiagocrepaldi
- titaiwangms
- wschin
- xadupre
@ -242,7 +244,6 @@
- torch/csrc/xpu/**
- torch/xpu/**
- test/xpu/**
- test/test_xpu.py
- third_party/xpu.txt
- .ci/docker/ci_commit_pins/triton-xpu.txt
approved_by:
@ -375,21 +376,13 @@
- name: CPU inductor
patterns:
- torch/_inductor/mkldnn_ir.py
- torch/_inductor/mkldnn_lowerings.py
- torch/_inductor/fx_passes/mkldnn_fusion.py
- torch/_inductor/fx_passes/quantization.py
- torch/_inductor/codegen/cpp_prefix.h
- torch/_inductor/codegen/cpp.py
- torch/_inductor/codegen/cpp_utils.py
- torch/_inductor/codegen/cpp_micro_gemm.py
- torch/_inductor/codegen/cpp_template_kernel.py
- torch/_inductor/codegen/cpp_template.py
- torch/_inductor/codegen/cpp_gemm_template.py
- test/inductor/test_mkldnn_pattern_matcher.py
- test/inductor/test_cpu_repo.py
- test/inductor/test_cpu_cpp_wrapper.py
- test/inductor/test_cpu_select_algorithm.py
- aten/src/ATen/cpu/**
- aten/src/ATen/native/quantized/cpu/**
- test/quantization/core/test_quantized_op.py

View File

@ -26,4 +26,3 @@ retryable_workflows:
- windows-binary
labeler_config: labeler.yml
label_to_label_config: label_to_label.yml
mergebot: True

View File

@ -93,8 +93,6 @@ done
# Copy Include Files
cp -r $ROCM_HOME/include/hip $TRITON_ROCM_DIR/include
cp -r $ROCM_HOME/include/roctracer $TRITON_ROCM_DIR/include
cp -r $ROCM_HOME/include/hsa $TRITON_ROCM_DIR/include
# Copy linker
mkdir -p $TRITON_ROCM_DIR/llvm/bin

View File

@ -3,11 +3,11 @@
import json
import os
import re
from typing import Any, cast, Dict, List, Optional
from typing import Any, Optional
from urllib.error import HTTPError
from github_utils import gh_fetch_url, gh_post_pr_comment, gh_query_issues_by_labels
from github_utils import gh_fetch_url, gh_post_pr_comment
from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
from trymerge import get_pr_commit_sha, GitHubPR
@ -19,7 +19,6 @@ REQUIRES_ISSUE = {
"critical",
"fixnewfeature",
}
RELEASE_BRANCH_REGEX = re.compile(r"release/(?P<version>.+)")
def parse_args() -> Any:
@ -59,33 +58,6 @@ def get_merge_commit_sha(repo: GitRepo, pr: GitHubPR) -> Optional[str]:
return commit_sha if pr.is_closed() else None
def get_release_version(onto_branch: str) -> Optional[str]:
"""
Return the release version if the target branch is a release branch
"""
m = re.match(RELEASE_BRANCH_REGEX, onto_branch)
return m.group("version") if m else ""
def get_tracker_issues(
org: str, project: str, onto_branch: str
) -> List[Dict[str, Any]]:
"""
Find the tracker issue in the repo. The tracker issue needs to have a title
like "[VERSION] Release Tracker", following the PyTorch convention
"""
version = get_release_version(onto_branch)
if not version:
return []
tracker_issues = gh_query_issues_by_labels(org, project, labels=["release tracker"])
if not tracker_issues:
return []
# Figure out the tracker issue from the list by looking at the title
return [issue for issue in tracker_issues if version in issue.get("title", "")]
def cherry_pick(
github_actor: str,
repo: GitRepo,
@ -105,49 +77,17 @@ def cherry_pick(
)
try:
org, project = repo.gh_owner_and_name()
cherry_pick_pr = ""
if not dry_run:
org, project = repo.gh_owner_and_name()
cherry_pick_pr = submit_pr(repo, pr, cherry_pick_branch, onto_branch)
tracker_issues_comments = []
tracker_issues = get_tracker_issues(org, project, onto_branch)
for issue in tracker_issues:
issue_number = int(str(issue.get("number", "0")))
if not issue_number:
continue
msg = f"The cherry pick PR is at {cherry_pick_pr}"
if fixes:
msg += f" and it is linked with issue {fixes}"
elif classification in REQUIRES_ISSUE:
msg += f" and it is recommended to link a {classification} cherry pick PR with an issue"
res = cast(
Dict[str, Any],
post_tracker_issue_comment(
org,
project,
issue_number,
pr.pr_num,
cherry_pick_pr,
classification,
fixes,
dry_run,
),
)
comment_url = res.get("html_url", "")
if comment_url:
tracker_issues_comments.append(comment_url)
msg = f"The cherry pick PR is at {cherry_pick_pr}"
if fixes:
msg += f" and it is linked with issue {fixes}."
elif classification in REQUIRES_ISSUE:
msg += f" and it is recommended to link a {classification} cherry pick PR with an issue."
if tracker_issues_comments:
msg += " The following tracker issues are updated:\n"
for tracker_issues_comment in tracker_issues_comments:
msg += f"* {tracker_issues_comment}\n"
post_pr_comment(org, project, pr.pr_num, msg, dry_run)
post_comment(org, project, pr.pr_num, msg)
finally:
if current_branch:
@ -219,9 +159,7 @@ def submit_pr(
raise RuntimeError(msg) from error
def post_pr_comment(
org: str, project: str, pr_num: int, msg: str, dry_run: bool = False
) -> List[Dict[str, Any]]:
def post_comment(org: str, project: str, pr_num: int, msg: str) -> None:
"""
Post a comment on the PR itself to point to the cherry-pick PR on success,
or print the error on failure
@ -244,35 +182,7 @@ def post_pr_comment(
comment = "\n".join(
(f"### Cherry picking #{pr_num}", f"{msg}", "", f"{internal_debugging}")
)
return gh_post_pr_comment(org, project, pr_num, comment, dry_run)
def post_tracker_issue_comment(
org: str,
project: str,
issue_num: int,
pr_num: int,
cherry_pick_pr: str,
classification: str,
fixes: str,
dry_run: bool = False,
) -> List[Dict[str, Any]]:
"""
Post a comment on the tracker issue (if any) to record the cherry pick
"""
comment = "\n".join(
(
"Link to landed trunk PR (if applicable):",
f"* https://github.com/{org}/{project}/pull/{pr_num}",
"",
"Link to release branch PR:",
f"* {cherry_pick_pr}",
"",
"Criteria Category:",
" - ".join((classification.capitalize(), fixes.capitalize())),
)
)
return gh_post_pr_comment(org, project, issue_num, comment, dry_run)
gh_post_pr_comment(org, project, pr_num, comment)
def main() -> None:
@ -304,7 +214,7 @@ def main() -> None:
except RuntimeError as error:
if not args.dry_run:
post_pr_comment(org, project, pr_num, str(error))
post_comment(org, project, pr_num, str(error))
else:
raise error

Binary file not shown.

View File

@ -48,7 +48,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"12.1": (
@ -61,7 +61,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
"12.4": (
@ -74,7 +74,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'"
),
@ -347,6 +347,10 @@ def generate_wheels_matrix(
for python_version in python_versions:
for arch_version in arches:
gpu_arch_type = arch_type(arch_version)
# Disable py3.12 builds for ROCm because of triton dependency
# on llnl-hatchet, which doesn't have py3.12 wheels available
if gpu_arch_type == "rocm" and python_version == "3.12":
continue
gpu_arch_version = (
""
if arch_version == "cpu"
@ -386,31 +390,6 @@ def generate_wheels_matrix(
),
}
)
if arch_version != "cuda-aarch64":
ret.append(
{
"python_version": python_version,
"gpu_arch_type": gpu_arch_type,
"gpu_arch_version": gpu_arch_version,
"desired_cuda": translate_desired_cuda(
gpu_arch_type, gpu_arch_version
),
"use_split_build": "True",
"devtoolset": (
"cxx11-abi" if arch_version == "cuda-aarch64" else ""
),
"container_image": WHEEL_CONTAINER_IMAGES[arch_version],
"package_type": package_type,
"pytorch_extra_install_requirements": (
PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version] # fmt: skip
if os != "linux-aarch64"
else ""
),
"build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-split".replace( # noqa: B950
".", "_"
),
}
)
else:
ret.append(
{

.github/scripts/get_workflow_type.py vendored Normal file
View File

@ -0,0 +1,99 @@
import json
from argparse import ArgumentParser
from typing import Any
from github import Auth, Github
from github.Issue import Issue
WORKFLOW_TYPE_LABEL = "label"
WORKFLOW_TYPE_RG = "rg"
WORKFLOW_TYPE_BOTH = "both"
def parse_args() -> Any:
parser = ArgumentParser("Get dynamic rollout settings")
parser.add_argument("--github-token", type=str, required=True, help="GitHub token")
parser.add_argument(
"--github-repo",
type=str,
required=False,
default="pytorch/test-infra",
help="GitHub repo to get the issue",
)
parser.add_argument(
"--github-issue", type=int, required=True, help="GitHub issue umber"
)
parser.add_argument(
"--github-user", type=str, required=True, help="GitHub username"
)
parser.add_argument(
"--github-branch", type=str, required=True, help="Current GitHub branch"
)
return parser.parse_args()
def get_gh_client(github_token: str) -> Github:
auth = Auth.Token(github_token)
return Github(auth=auth)
def get_issue(gh: Github, repo: str, issue_num: int) -> Issue:
repo = gh.get_repo(repo)
return repo.get_issue(number=issue_num)
def is_exception_branch(branch: str) -> bool:
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
def get_workflow_type(issue: Issue, username: str) -> str:
user_list = issue.get_comments()[0].body.split("\r\n")
try:
run_option = issue.get_comments()[1].body.split("\r\n")[0]
except Exception as e:
run_option = "single"
if user_list[0] == "!":
# Use old runners for everyone
return WORKFLOW_TYPE_LABEL
elif user_list[1] == "*":
if run_option == WORKFLOW_TYPE_BOTH:
# Use ARC runners and old runners for everyone
return WORKFLOW_TYPE_BOTH
else:
# Use only ARC runners for everyone
return WORKFLOW_TYPE_RG
elif username in user_list:
if run_option == WORKFLOW_TYPE_BOTH:
# Use ARC runners and old runners for a specific user
return WORKFLOW_TYPE_BOTH
else:
# Use only ARC runners for a specific user
return WORKFLOW_TYPE_RG
else:
# Use old runners by default
return WORKFLOW_TYPE_LABEL
def main() -> None:
args = parse_args()
if is_exception_branch(args.github_branch):
output = {"workflow_type": WORKFLOW_TYPE_LABEL}
else:
try:
gh = get_gh_client(args.github_token)
issue = get_issue(gh, args.github_repo, args.github_issue)
output = {"workflow_type": get_workflow_type(issue, args.github_user)}
except Exception as e:
output = {"workflow_type": WORKFLOW_TYPE_LABEL}
json_output = json.dumps(output)
print(json_output)
if __name__ == "__main__":
main()

View File

@ -202,12 +202,3 @@ def gh_update_pr_state(org: str, repo: str, pr_num: int, state: str = "open") ->
)
else:
raise
def gh_query_issues_by_labels(
org: str, repo: str, labels: List[str], state: str = "open"
) -> List[Dict[str, Any]]:
url = f"{GITHUB_API_URL}/repos/{org}/{repo}/issues"
return gh_fetch_json(
url, method="GET", params={"labels": ",".join(labels), "state": state}
)

Binary file not shown.

View File

@ -29,7 +29,6 @@ python3 -m tools.pyi.gen_pyi \
--native-functions-path aten/src/ATen/native/native_functions.yaml \
--tags-path aten/src/ATen/native/tags.yaml \
--deprecated-functions-path "tools/autograd/deprecated.yaml"
python3 torch/utils/data/datapipes/gen_pyi.py
RC=0
# Run lintrunner on all files

View File

@ -1,210 +0,0 @@
# flake8: noqa: G004
import logging
import os
from argparse import ArgumentParser
from logging import LogRecord
from typing import Any, Iterable
from github import Auth, Github
from github.Issue import Issue
WORKFLOW_LABEL_META = "" # use meta runners
WORKFLOW_LABEL_LF = "lf." # use runners from the linux foundation
GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
class ColorFormatter(logging.Formatter):
"""Color codes the log messages based on the log level"""
COLORS = {
"WARNING": "\033[33m", # Yellow
"ERROR": "\033[31m", # Red
"CRITICAL": "\033[31m", # Red
"INFO": "\033[0m", # Reset
"DEBUG": "\033[0m", # Reset
}
def format(self, record: LogRecord) -> str:
log_color = self.COLORS.get(record.levelname, "\033[0m") # Default to reset
record.msg = f"{log_color}{record.msg}\033[0m"
return super().format(record)
handler = logging.StreamHandler()
handler.setFormatter(ColorFormatter(fmt="%(levelname)-8s: %(message)s"))
log = logging.getLogger(os.path.basename(__file__))
log.addHandler(handler)
log.setLevel(logging.INFO)
def set_github_output(key: str, value: str) -> None:
"""
Defines outputs of the github action that invokes this script
"""
if not GITHUB_OUTPUT:
# See https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ for deprecation notice
log.warning(
"No env var found for GITHUB_OUTPUT, you must be running this code locally. Falling back to the deprecated print method."
)
print(f"::set-output name={key}::{value}")
return
with open(GITHUB_OUTPUT, "a") as f:
log.info(f"Setting output: {key}='{value}'")
f.write(f"{key}={value}\n")
def parse_args() -> Any:
parser = ArgumentParser("Get dynamic rollout settings")
parser.add_argument("--github-token", type=str, required=True, help="GitHub token")
parser.add_argument(
"--github-issue-repo",
type=str,
required=False,
default="pytorch/test-infra",
help="GitHub repo to get the issue",
)
parser.add_argument(
"--github-repo",
type=str,
required=True,
help="GitHub repo where CI is running",
)
parser.add_argument(
"--github-issue", type=int, required=True, help="GitHub issue number"
)
parser.add_argument(
"--github-actor", type=str, required=True, help="GitHub triggering_actor"
)
parser.add_argument(
"--github-issue-owner", type=str, required=True, help="GitHub issue owner"
)
parser.add_argument(
"--github-branch", type=str, required=True, help="Current GitHub branch or tag"
)
parser.add_argument(
"--github-ref-type",
type=str,
required=True,
help="Current GitHub ref type, branch or tag",
)
return parser.parse_args()
def get_gh_client(github_token: str) -> Github:
auth = Auth.Token(github_token)
return Github(auth=auth)
def get_issue(gh: Github, repo: str, issue_num: int) -> Issue:
repo = gh.get_repo(repo)
return repo.get_issue(number=issue_num)
def get_potential_pr_author(
gh: Github, repo: str, username: str, ref_type: str, ref_name: str
) -> str:
# If the trigger was a new tag added by a bot, this is a ciflow case
# Fetch the actual username from the original PR. The PR number is
# embedded in the tag name: ciflow/<name>/<pr-number>
if username == "pytorch-bot[bot]" and ref_type == "tag":
split_tag = ref_name.split("/")
if (
len(split_tag) == 3
and split_tag[0] == "ciflow"
and split_tag[2].isnumeric()
):
pr_number = split_tag[2]
try:
repository = gh.get_repo(repo)
pull = repository.get_pull(number=int(pr_number))
except Exception as e:
raise Exception( # noqa: TRY002
f"issue with pull request {pr_number} from repo {repository}"
) from e
return pull.user.login
# In all other cases, return the original input username
return username
def is_exception_branch(branch: str) -> bool:
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
def get_workflow_type(issue: Issue, workflow_requestors: Iterable[str]) -> str:
try:
first_comment = issue.get_comments()[0].body.strip("\n\t ")
if first_comment[0] == "!":
log.info("LF Workflows are disabled for everyone. Using meta runners.")
return WORKFLOW_LABEL_META
elif first_comment[0] == "*":
log.info("LF Workflows are enabled for everyone. Using LF runners.")
return WORKFLOW_LABEL_LF
else:
all_opted_in_users = {
usr_raw.strip("\n\t@ ") for usr_raw in first_comment.split()
}
opted_in_requestors = {
usr for usr in workflow_requestors if usr in all_opted_in_users
}
if opted_in_requestors:
log.info(
f"LF Workflows are enabled for {', '.join(opted_in_requestors)}. Using LF runners."
)
return WORKFLOW_LABEL_LF
else:
log.info(
f"LF Workflows are disabled for {', '.join(workflow_requestors)}. Using meta runners."
)
return WORKFLOW_LABEL_META
except Exception as e:
log.error(
f"Failed to get determine workflow type. Falling back to meta runners. Exception: {e}"
)
return WORKFLOW_LABEL_META
def main() -> None:
args = parse_args()
if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
log.info(f"Exception branch: '{args.github_branch}', using meta runners")
label_type = WORKFLOW_LABEL_META
else:
try:
gh = get_gh_client(args.github_token)
# The default issue we use - https://github.com/pytorch/test-infra/issues/5132
issue = get_issue(gh, args.github_issue_repo, args.github_issue)
username = get_potential_pr_author(
gh,
args.github_repo,
args.github_actor,
args.github_ref_type,
args.github_branch,
)
label_type = get_workflow_type(
issue,
(
args.github_issue_owner,
username,
),
)
except Exception as e:
log.error(
f"Failed to get issue. Falling back to meta runners. Exception: {e}"
)
label_type = WORKFLOW_LABEL_META
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, label_type)
if __name__ == "__main__":
main()

View File

@ -180,9 +180,6 @@ def mock_gh_get_info() -> Any:
return {
"closed": False,
"isCrossRepository": False,
"headRefName": "foo",
"baseRefName": "bar",
"baseRepository": {"defaultBranchRef": {"name": "bar"}},
"files": {"nodes": [], "pageInfo": {"hasNextPage": False}},
"changedFiles": 0,
}
@ -397,7 +394,6 @@ class TestTryMerge(TestCase):
# self.assertGreater(len(pr.get_checkrun_conclusions()), 3)
self.assertGreater(pr.get_commit_count(), 60)
@skip("GitHub doesn't keep this data anymore")
def test_gql_retrieve_checksuites(self, *args: Any) -> None:
"Fetch comments and conclusions for PR with 60 commits"
pr = GitHubPR("pytorch", "pytorch", 94787)
@ -895,24 +891,6 @@ class TestBypassFailures(TestCase):
self.assertTrue(len(ignorable["FLAKY"]) == 1)
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 0)
def test_ignore_failures_older_run_same_workflow(self, *args: Any) -> None:
pr = GitHubPR("pytorch", "pytorch", 129013)
checks = pr.get_checkrun_conclusions()
checks = get_classifications(
pr.pr_num,
pr.project,
checks,
[],
)
pending, failed, ignorable = categorize_checks(
checks,
list(checks.keys()),
)
self.assertTrue(len(pending) == 0)
self.assertTrue(len(failed) == 0)
self.assertTrue(len(ignorable["FLAKY"]) == 2)
self.assertTrue(len(ignorable["UNSTABLE"]) == 13)
@mock.patch("trymerge.read_merge_rules", side_effect=xla_merge_rules)
def test_dont_ignore_flaky_failures(self, *args: Any) -> None:
"""
@ -1041,7 +1019,7 @@ class TestGitHubPRGhstackDependencies(TestCase):
)
@skip(
reason="This test is run against a mutable PR that has changed, so it no longer works. The test should be changed"
reason="This test is run against a mutalbe PR that has changed, so it no longer works. The test should be changed"
)
@mock.patch("trymerge.read_merge_rules")
@mock.patch("trymerge.GitRepo")

View File

@ -81,10 +81,9 @@ JobNameToStateDict = Dict[str, JobCheckState]
class WorkflowCheckState:
def __init__(self, name: str, url: str, run_id: int, status: Optional[str]):
def __init__(self, name: str, url: str, status: Optional[str]):
self.name: str = name
self.url: str = url
self.run_id: int = run_id
self.status: Optional[str] = status
self.jobs: JobNameToStateDict = {}
@ -123,7 +122,6 @@ fragment PRCheckSuites on CheckSuiteConnection {
workflowRun {
workflow {
name
databaseId
}
databaseId
url
@ -514,7 +512,7 @@ def add_workflow_conclusions(
workflows: Dict[str, WorkflowCheckState] = {}
# for the jobs that don't have a workflow
no_workflow_obj: WorkflowCheckState = WorkflowCheckState("", "", 0, None)
no_workflow_obj: WorkflowCheckState = WorkflowCheckState("", "", None)
def add_conclusions(edges: Any) -> None:
for edge_idx, edge in enumerate(edges):
@ -525,30 +523,18 @@ def add_workflow_conclusions(
workflow_obj: WorkflowCheckState = no_workflow_obj
if workflow_run is not None:
# This is the usual workflow run ID we see on GitHub
workflow_run_id = workflow_run["databaseId"]
# While this is the metadata name and ID of the workflow itself
workflow_name = workflow_run["workflow"]["name"]
workflow_id = workflow_run["workflow"]["databaseId"]
workflow_conclusion = node["conclusion"]
# Do not override existing status with cancelled
if workflow_conclusion == "CANCELLED" and workflow_name in workflows:
continue
# Only keep the latest workflow run for each workflow, heuristically,
# it's the run with largest run ID
if (
workflow_id not in workflows
or workflows[workflow_id].run_id < workflow_run_id
):
workflows[workflow_id] = WorkflowCheckState(
if workflow_name not in workflows:
workflows[workflow_name] = WorkflowCheckState(
name=workflow_name,
status=workflow_conclusion,
url=workflow_run["url"],
run_id=workflow_run_id,
)
workflow_obj = workflows[workflow_id]
workflow_obj = workflows[workflow_name]
while checkruns is not None:
for checkrun_node in checkruns["nodes"]:
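The branch removed above implements a "keep the newest run per workflow" heuristic, keyed by the workflow's databaseId. A standalone sketch of that dedup rule, assuming run IDs grow monotonically with recency (IDs and conclusions below are made up):

from dataclasses import dataclass
from typing import Dict, Optional

@dataclass
class Run:
    workflow_id: int   # workflow.databaseId in the GraphQL payload
    run_id: int        # workflowRun.databaseId
    conclusion: Optional[str]

def latest_runs(runs) -> Dict[int, Run]:
    latest: Dict[int, Run] = {}
    for run in runs:
        # Never let a cancelled rerun clobber an already-recorded conclusion.
        if run.conclusion == "CANCELLED" and run.workflow_id in latest:
            continue
        kept = latest.get(run.workflow_id)
        if kept is None or kept.run_id < run.run_id:
            latest[run.workflow_id] = run  # newer run wins
    return latest

runs = [Run(11, 100, "SUCCESS"), Run(11, 104, "FAILURE")]
assert latest_runs(runs)[11].run_id == 104  # the retried (newer) run is kept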
@ -586,12 +572,12 @@ def add_workflow_conclusions(
# the jobs in but don't put the workflow in. We care more about the jobs in
# the workflow that ran than the container workflow.
res: JobNameToStateDict = {}
for workflow in workflows.values():
for workflow_name, workflow in workflows.items():
if len(workflow.jobs) > 0:
for job_name, job in workflow.jobs.items():
res[job_name] = job
else:
res[workflow.name] = JobCheckState(
res[workflow_name] = JobCheckState(
workflow.name,
workflow.url,
workflow.status,
@ -1177,6 +1163,7 @@ class GitHubPR:
# Finally, upload the record to Rockset. The lists of pending and failed
# checks reflect the state at the time of the merge
save_merge_record(
collection=ROCKSET_MERGES_COLLECTION,
comment_id=comment_id,
pr_num=self.pr_num,
owner=self.org,
@ -1192,8 +1179,10 @@ class GitHubPR:
merge_base_sha=self.get_merge_base(),
merge_commit_sha=merge_commit_sha,
is_failed=False,
dry_run=dry_run,
skip_mandatory_checks=skip_mandatory_checks,
ignore_current=bool(ignore_current_checks),
workspace=ROCKSET_MERGES_WORKSPACE,
)
else:
print("Missing comment ID or PR number, couldn't upload to Rockset")
@ -1500,6 +1489,7 @@ def checks_to_markdown_bullets(
@retries_decorator()
def save_merge_record(
collection: str,
comment_id: int,
pr_num: int,
owner: str,
@ -1515,44 +1505,59 @@ def save_merge_record(
merge_base_sha: str,
merge_commit_sha: str = "",
is_failed: bool = False,
dry_run: bool = False,
skip_mandatory_checks: bool = False,
ignore_current: bool = False,
error: str = "",
workspace: str = "commons",
) -> None:
"""
This saves the merge records as a json, which can later be uploaded to s3
This saves the merge records into Rockset, so we can query them (for fun and profit)
"""
if dry_run:
# Decide not to save the record to Rockset if dry-run is set to not pollute
# the collection
return
# Prepare the record to be written into Rockset
data = [
{
"comment_id": comment_id,
"pr_num": pr_num,
"owner": owner,
"project": project,
"author": author,
"pending_checks": pending_checks,
"failed_checks": failed_checks,
"ignore_current_checks": ignore_current_checks,
"broken_trunk_checks": broken_trunk_checks,
"flaky_checks": flaky_checks,
"unstable_checks": unstable_checks,
"last_commit_sha": last_commit_sha,
"merge_base_sha": merge_base_sha,
"merge_commit_sha": merge_commit_sha,
"is_failed": is_failed,
"skip_mandatory_checks": skip_mandatory_checks,
"ignore_current": ignore_current,
"error": error,
# This is a unique identifier for the record for deduping purposes
# in rockset. Any unique string would work
"_id": f"{project}-{pr_num}-{comment_id}-{os.environ.get('GITHUB_RUN_ID')}",
}
]
repo_root = Path(__file__).resolve().parent.parent.parent
try:
import rockset # type: ignore[import]
with open(repo_root / "merge_record.json", "w") as f:
json.dump(data, f)
# Prepare the record to be written into Rockset
data = [
{
"comment_id": comment_id,
"pr_num": pr_num,
"owner": owner,
"project": project,
"author": author,
"pending_checks": pending_checks,
"failed_checks": failed_checks,
"ignore_current_checks": ignore_current_checks,
"broken_trunk_checks": broken_trunk_checks,
"flaky_checks": flaky_checks,
"unstable_checks": unstable_checks,
"last_commit_sha": last_commit_sha,
"merge_base_sha": merge_base_sha,
"merge_commit_sha": merge_commit_sha,
"is_failed": is_failed,
"skip_mandatory_checks": skip_mandatory_checks,
"ignore_current": ignore_current,
"error": error,
}
]
client = rockset.RocksetClient(
host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
)
client.Documents.add_documents(
collection=collection,
data=data,
workspace=workspace,
)
except ModuleNotFoundError:
print("Rockset is missing, no record will be saved")
return
@retries_decorator(rc=[])
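The hunk above is the heart of this change: the removed version dumps the record to merge_record.json for a later upload step, while the restored version writes straight to Rockset. A minimal sketch of the file-based variant, reusing the dedup-key convention shown above (field values are illustrative):

import json
import os
from pathlib import Path

def write_merge_record(path: Path, record: dict) -> None:
    # A stable _id lets the downstream store dedupe re-uploads of the same merge.
    record["_id"] = (
        f"{record['project']}-{record['pr_num']}-"
        f"{record['comment_id']}-{os.environ.get('GITHUB_RUN_ID')}"
    )
    with open(path, "w") as f:
        json.dump([record], f)  # a single-element list, matching the diff above

write_merge_record(
    Path("merge_record.json"),
    {"project": "pytorch", "pr_num": 12345, "comment_id": 67890, "is_failed": False},
)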
@ -2325,15 +2330,6 @@ def main() -> None:
dry_run=args.dry_run,
)
return
if not pr.is_ghstack_pr() and pr.base_ref() != pr.default_branch():
gh_post_pr_comment(
org,
project,
args.pr_num,
f"PR targets {pr.base_ref()} rather than {pr.default_branch()}, refusing merge request",
dry_run=args.dry_run,
)
return
if args.check_mergeability:
if pr.is_ghstack_pr():
@ -2369,6 +2365,7 @@ def main() -> None:
# list of pending and failed checks here, but they are not really
# needed at the moment
save_merge_record(
collection=ROCKSET_MERGES_COLLECTION,
comment_id=args.comment_id,
pr_num=args.pr_num,
owner=org,
@ -2383,9 +2380,11 @@ def main() -> None:
last_commit_sha=pr.last_commit().get("oid", ""),
merge_base_sha=pr.get_merge_base(),
is_failed=True,
dry_run=args.dry_run,
skip_mandatory_checks=args.force,
ignore_current=args.ignore_current,
error=str(e),
workspace=ROCKSET_MERGES_WORKSPACE,
)
else:
print("Missing comment ID or PR number, couldn't upload to Rockset")

View File

@ -30,9 +30,6 @@
{%- if config["devtoolset"] %}
DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }}
{%- endif %}
{%- if config.use_split_build is defined %}
use_split_build: !{{ config["use_split_build"] }}
{%- endif %}
{%- endif %}
{%- if config["package_type"] == "libtorch" %}
{%- if config["libtorch_config"] %}
@ -47,7 +44,6 @@
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.8"
{%- endif %}
{%- else %}
DESIRED_PYTHON: "!{{ config["python_version"] }}"
{%- endif %}

View File

@ -27,11 +27,6 @@ on:
type: string
description: |
A JSON description of what configs to run later on.
runner:
required: false
type: string
default: "linux.large"
description: Runner type
env:
GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -39,7 +34,7 @@ env:
jobs:
filter:
if: github.repository_owner == 'pytorch'
runs-on: ${{ inputs.runner }}
runs-on: [self-hosted, linux.large]
outputs:
test-matrix: ${{ steps.filter.outputs.test-matrix }}
is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}

View File

@ -21,13 +21,6 @@ on:
default: 210
type: number
description: timeout for the job
use_split_build:
description: |
[Experimental] Build a libtorch-only wheel, then build pytorch such that
its binaries are built from the libtorch wheel.
required: false
type: boolean
default: false
ALPINE_IMAGE:
required: false
type: string
@ -117,7 +110,6 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Make the env permanent during this workflow (but not the secrets)
shell: bash
@ -145,7 +137,6 @@ jobs:
echo "PR_NUMBER=${{ env.PR_NUMBER }}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
echo "SHA1=${{ env.SHA1 }}"
echo "USE_SPLIT_BUILD=${{ env.use_split_build }}"
} >> "${GITHUB_ENV} }}"
- name: List the env
@ -255,7 +246,6 @@ jobs:
-e PYTORCH_ROOT \
-e SKIP_ALL_TESTS \
-e PYTORCH_EXTRA_INSTALL_REQUIREMENTS \
-e USE_SPLIT_BUILD \
--tty \
--detach \
-v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \

View File

@ -63,13 +63,6 @@ on:
required: true
type: string
description: Hardware to run this job on. Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu
use_split_build:
description: |
[Experimental] Build a libtorch-only wheel, then build pytorch such that
its binaries are built from the libtorch wheel.
required: false
type: boolean
default: false
secrets:
github-token:
required: true
@ -104,7 +97,6 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Make the env permanent during this workflow (but not the secrets)
shell: bash
@ -132,7 +124,6 @@ jobs:
echo "PR_NUMBER=${{ env.PR_NUMBER }}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
echo "SHA1=${{ env.SHA1 }}"
echo "USE_SPLIT_BUILD=${{ env.USE_SPLIT_BUILD }}"
} >> "${GITHUB_ENV} }}"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"

View File

@ -55,13 +55,6 @@ on:
required: false
type: string
description: Desired python version
use_split_build:
description: |
[Experimental] Build a libtorch-only wheel, then build pytorch such that
its binaries are built from the libtorch wheel.
required: false
type: boolean
default: false
secrets:
github-token:
required: true
@ -100,7 +93,6 @@ jobs:
PR_NUMBER: ${{ github.event.pull_request.number }}
PYTORCH_FINAL_PACKAGE_DIR: /artifacts
SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
steps:
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main

View File

@ -56,13 +56,6 @@ on:
required: false
type: string
default: ""
use_split_build:
description: |
[Experimental] Build a libtorch-only wheel, then build pytorch such that
its binaries are built from the libtorch wheel.
required: false
type: boolean
default: false
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
@ -114,4 +107,3 @@ jobs:
aws-role-to-assume: ${{ inputs.aws-role-to-assume }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
use_split_build: ${{ inputs.use_split_build }}

View File

@ -64,14 +64,6 @@ on:
required: false
type: string
default: ""
use_split_build:
description: |
[Experimental] Build a libtorch-only wheel, then build pytorch such that
its binaries are built from the libtorch wheel.
required: false
type: boolean
default: false
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
@ -189,7 +181,6 @@ jobs:
DEBUG: ${{ inputs.build-with-debug && '1' || '0' }}
OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
run: |
# detached container should get cleaned up by teardown_ec2_linux
container_name=$(docker run \
@ -208,7 +199,6 @@ jobs:
-e PR_LABELS \
-e OUR_GITHUB_JOB_ID \
-e HUGGING_FACE_HUB_TOKEN \
-e USE_SPLIT_BUILD \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
@ -228,7 +218,7 @@ jobs:
- name: Store PyTorch Build Artifacts on S3
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build != 'true'
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
with:
name: ${{ inputs.build-environment }}
retention-days: 14
@ -236,16 +226,6 @@ jobs:
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}
- name: Store PyTorch Build Artifacts on S3
uses: seemethere/upload-artifact-s3@v5
if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build == 'true'
with:
name: ${{ inputs.build-environment }}-experimental-split-build
retention-days: 14
if-no-files-found: error
path: artifacts.zip
s3-bucket: ${{ inputs.s3-bucket }}
- name: Upload sccache stats
if: steps.build.outcome != 'skipped'
uses: seemethere/upload-artifact-s3@v5

View File

@ -3,272 +3,39 @@ name: Check whether the workflow owner can use ARC runners
on:
workflow_call:
inputs:
triggering_actor:
user_name:
required: true
type: string
description: The triggering_actor for the workflow. Use github.triggering_actor
issue_owner:
required: true
type: string
description: The owner of the issue. Use github.event.pull_request.user.login || github.event.issue.user.login
description: The name of the workflow owner.
curr_branch:
required: true
type: string
description: Current branch or tag.
curr_ref_type:
required: false
type: string
default: branch
description: The value of "github.ref_type", "branch" or "tag"
description: Current branch.
issue_number:
required: false
type: string
default: "5132"
description: |
Fetches the GitHub issue from pytorch/test-infra
Example: https://github.com/pytorch/test-infra/issues/5132
outputs:
label-type:
workflow-type:
description: Type of runners to use
value: ${{ jobs.runner-determinator.outputs.label-type }}
value: ${{ jobs.runner-determinator.outputs.workflow-type }}
jobs:
runner-determinator:
runs-on: ubuntu-latest
runs-on: linux.4xlarge
outputs:
label-type: ${{ steps.set-condition.outputs.label-type }}
workflow-type: ${{ steps.set-condition.outputs.workflow-type }}
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
ISSUE_NUMBER: ${{ inputs.issue_number }}
TRIGGERING_ACTOR: ${{ inputs.triggering_actor }}
ISSUE_OWNER: ${{ inputs.issue_owner }}
USERNAME: ${{ inputs.user_name }}
steps:
# - name: Checkout PyTorch
# uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
# with:
# fetch-depth: 1
# submodules: true
# TODO: Remove the hardcoded step below
# Hardcoding below is temporary for testing ALI runners
# This file below should match the script found in .github/scripts/runner_determinator.py
- name: Hardcode runner-determinator script
run: |
cat <<EOF > runner_determinator.py
# flake8: noqa: G004
import logging
import os
from argparse import ArgumentParser
from logging import LogRecord
from typing import Any, Iterable
from github import Auth, Github
from github.Issue import Issue
WORKFLOW_LABEL_META = "" # use meta runners
WORKFLOW_LABEL_LF = "lf." # use runners from the linux foundation
GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
class ColorFormatter(logging.Formatter):
"""Color codes the log messages based on the log level"""
COLORS = {
"WARNING": "\033[33m", # Yellow
"ERROR": "\033[31m", # Red
"CRITICAL": "\033[31m", # Red
"INFO": "\033[0m", # Reset
"DEBUG": "\033[0m", # Reset
}
def format(self, record: LogRecord) -> str:
log_color = self.COLORS.get(record.levelname, "\033[0m") # Default to reset
record.msg = f"{log_color}{record.msg}\033[0m"
return super().format(record)
handler = logging.StreamHandler()
handler.setFormatter(ColorFormatter(fmt="%(levelname)-8s: %(message)s"))
log = logging.getLogger(os.path.basename(__file__))
log.addHandler(handler)
log.setLevel(logging.INFO)
def set_github_output(key: str, value: str) -> None:
"""
Defines outputs of the github action that invokes this script
"""
if not GITHUB_OUTPUT:
# See https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ for deprecation notice
log.warning(
"No env var found for GITHUB_OUTPUT, you must be running this code locally. Falling back to the deprecated print method."
)
print(f"::set-output name={key}::{value}")
return
with open(GITHUB_OUTPUT, "a") as f:
log.info(f"Setting output: {key}='{value}'")
f.write(f"{key}={value}\n")
def parse_args() -> Any:
parser = ArgumentParser("Get dynamic rollout settings")
parser.add_argument("--github-token", type=str, required=True, help="GitHub token")
parser.add_argument(
"--github-issue-repo",
type=str,
required=False,
default="pytorch/test-infra",
help="GitHub repo to get the issue",
)
parser.add_argument(
"--github-repo",
type=str,
required=True,
help="GitHub repo where CI is running",
)
parser.add_argument(
"--github-issue", type=int, required=True, help="GitHub issue number"
)
parser.add_argument(
"--github-actor", type=str, required=True, help="GitHub triggering_actor"
)
parser.add_argument(
"--github-issue-owner", type=str, required=True, help="GitHub issue owner"
)
parser.add_argument(
"--github-branch", type=str, required=True, help="Current GitHub branch or tag"
)
parser.add_argument(
"--github-ref-type",
type=str,
required=True,
help="Current GitHub ref type, branch or tag",
)
return parser.parse_args()
def get_gh_client(github_token: str) -> Github:
auth = Auth.Token(github_token)
return Github(auth=auth)
def get_issue(gh: Github, repo: str, issue_num: int) -> Issue:
repo = gh.get_repo(repo)
return repo.get_issue(number=issue_num)
def get_potential_pr_author(
gh: Github, repo: str, username: str, ref_type: str, ref_name: str
) -> str:
# If the trigger was a new tag added by a bot, this is a ciflow case
# Fetch the actual username from the original PR. The PR number is
# embedded in the tag name: ciflow/<name>/<pr-number>
if username == "pytorch-bot[bot]" and ref_type == "tag":
split_tag = ref_name.split("/")
if (
len(split_tag) == 3
and split_tag[0] == "ciflow"
and split_tag[2].isnumeric()
):
pr_number = split_tag[2]
try:
repository = gh.get_repo(repo)
pull = repository.get_pull(number=int(pr_number))
except Exception as e:
raise Exception( # noqa: TRY002
f"issue with pull request {pr_number} from repo {repository}"
) from e
return pull.user.login
# In all other cases, return the original input username
return username
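# Worked example of the tag convention handled above (hypothetical values):
# a bot-pushed tag "ciflow/trunk/12345" splits into ["ciflow", "trunk", "12345"],
# so split_tag[2] yields the PR number and the real author is read from that PR.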
def is_exception_branch(branch: str) -> bool:
return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
def get_workflow_type(issue: Issue, workflow_requestors: Iterable[str]) -> str:
try:
first_comment = issue.get_comments()[0].body.strip("\n\t ")
if first_comment[0] == "!":
log.info("LF Workflows are disabled for everyone. Using meta runners.")
return WORKFLOW_LABEL_META
elif first_comment[0] == "*":
log.info("LF Workflows are enabled for everyone. Using LF runners.")
return WORKFLOW_LABEL_LF
else:
all_opted_in_users = {
usr_raw.strip("\n\t@ ") for usr_raw in first_comment.split()
}
opted_in_requestors = {
usr for usr in workflow_requestors if usr in all_opted_in_users
}
if opted_in_requestors:
log.info(
f"LF Workflows are enabled for {', '.join(opted_in_requestors)}. Using LF runners."
)
return WORKFLOW_LABEL_LF
else:
log.info(
f"LF Workflows are disabled for {', '.join(workflow_requestors)}. Using meta runners."
)
return WORKFLOW_LABEL_META
except Exception as e:
log.error(
f"Failed to get determine workflow type. Falling back to meta runners. Exception: {e}"
)
return WORKFLOW_LABEL_META
def main() -> None:
args = parse_args()
if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
log.info(f"Exception branch: '{args.github_branch}', using meta runners")
label_type = WORKFLOW_LABEL_META
else:
try:
gh = get_gh_client(args.github_token)
# The default issue we use - https://github.com/pytorch/test-infra/issues/5132
issue = get_issue(gh, args.github_issue_repo, args.github_issue)
username = get_potential_pr_author(
gh,
args.github_repo,
args.github_actor,
args.github_ref_type,
args.github_branch,
)
label_type = get_workflow_type(
issue,
(
args.github_issue_owner,
username,
),
)
except Exception as e:
log.error(
f"Failed to get issue. Falling back to meta runners. Exception: {e}"
)
label_type = WORKFLOW_LABEL_META
set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, label_type)
if __name__ == "__main__":
main()
EOF
cat runner_determinator.py
- name: Checkout PyTorch
uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
with:
fetch-depth: 1
submodules: true
- name: Install dependencies
run: python3 -m pip install urllib3==1.26.18 PyGithub==2.3.0
@ -277,14 +44,15 @@ jobs:
id: set-condition
run: |
curr_branch="${{ inputs.curr_branch }}"
curr_ref_type="${{ inputs.curr_ref_type }}"
echo "Current branch is '$curr_branch'"
python3 runner_determinator.py \
output="$(python3 .github/scripts/get_workflow_type.py \
--github-token "$GITHUB_TOKEN" \
--github-issue "$ISSUE_NUMBER" \
--github-branch "$curr_branch" \
--github-actor "$TRIGGERING_ACTOR" \
--github-issue-owner "$ISSUE_OWNER" \
--github-ref-type "$curr_ref_type" \
--github-repo "$GITHUB_REPOSITORY"
--github-user "$USERNAME")"
echo "Output: '${output}'"
WORKFLOW_TYPE=$(echo "${output}" | jq -r '.workflow_type')
echo "workflow-type=$WORKFLOW_TYPE" >> "$GITHUB_OUTPUT"

View File

@ -47,9 +47,6 @@ jobs:
timeout-minutes: 240
outputs:
test-matrix: ${{ steps.filter.outputs.test-matrix }}
defaults:
run:
shell: bash
steps:
# Duplicated in win-test because this MUST go before a checkout
- name: Enable git symlinks on Windows and disable fsmonitor daemon
@ -92,7 +89,6 @@ jobs:
- name: Parse ref
id: parse-ref
shell: bash
run: python3 .github/scripts/parse_ref.py
- name: Get workflow job id

View File

@ -41,9 +41,6 @@ jobs:
fail-fast: false
runs-on: ${{ matrix.runner }}
timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
defaults:
run:
shell: bash
steps:
# Duplicated in win-build because this MUST go before a checkout
- name: Enable git symlinks on Windows and disable fsmonitor daemon
@ -227,7 +224,6 @@ jobs:
- name: Parse ref
id: parse-ref
shell: bash
run: python3 .github/scripts/parse_ref.py
- name: Uninstall PyTorch

View File

@ -5,11 +5,6 @@ on:
branches:
- main
- release/*
tags:
# Final Release tags look like: v1.11.0
- v[0-9]+.[0-9]+.[0-9]+
# Release candidate tags look like: v1.11.0-rc1
- v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
release:
types: [published]
pull_request:
@ -23,8 +18,6 @@ jobs:
# https://github.com/softprops/action-gh-release?tab=readme-ov-file#permissions
permissions:
contents: write
outputs:
pt_release_name: ${{ steps.release_name.outputs.pt_release_name }}
steps:
- uses: malfet/checkout@silent-checkout
with:
@ -56,44 +49,11 @@ jobs:
# Create archive
tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
- name: Upload source distribution for release
- name: Upload source distribution
if: ${{ github.event_name == 'release' }}
uses: softprops/action-gh-release@v1
with:
files: ${{env.PT_RELEASE_FILE}}
- name: Upload source distribution to GHA artifacts for release tags
if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
uses: actions/upload-artifact@v2
with:
name: ${{ env.PT_RELEASE_FILE }}
path: ${{ env.PT_RELEASE_FILE }}
- name: Set output
id: release_name
run: echo "::set-output name=pt_release_name::${{ env.PT_RELEASE_NAME }}.tar.gz"
upload_source_code_to_s3:
if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
runs-on: linux.2xlarge
environment: sourcecode-upload
name: Upload source code to S3 for release tags
permissions:
id-token: write
needs: release
steps:
- uses: actions/download-artifact@v2
with:
name: ${{ needs.release.outputs.pt_release_name }}
- name: Configure AWS credentials(PyTorch account)
uses: aws-actions/configure-aws-credentials@v3
with:
role-to-assume: arn:aws:iam::749337293305:role/gha_pytorch_source_code_upload_role
aws-region: us-east-1
- uses: seemethere/upload-artifact-s3@v5
with:
s3-bucket: pytorch
s3-prefix: source_code/test
if-no-files-found: warn
path: ${{ needs.release.outputs.pt_release_name }}
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}

View File

@ -54,7 +54,6 @@ jobs:
pytorch-linux-focal-py3-clang9-android-ndk-r21e,
pytorch-linux-jammy-py3.8-gcc11,
pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks,
pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-xpu-2024.0-py3,
pytorch-linux-jammy-py3-clang15-asan,
pytorch-linux-focal-py3-clang10-onnx,

View File

@ -54,7 +54,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_8-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cpu-aarch64-test: # Testing
@ -162,7 +162,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-aarch64-test: # Testing
@ -270,7 +270,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cpu-aarch64-test: # Testing
@ -378,7 +378,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cpu-aarch64-test: # Testing
@ -486,7 +486,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cpu-aarch64-test: # Testing
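The only delta in the hunks above is the NCCL pin inside PYTORCH_EXTRA_INSTALL_REQUIREMENTS (nvidia-nccl-cu12 2.21.5 vs 2.20.5). Each entry in that variable is a PEP 508 requirement with an environment marker, joined by " | ". A sketch of parsing one such string, assuming the third-party packaging library is available:

from packaging.requirements import Requirement

extra = (
    "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64'"
    " | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
)

for spec in extra.split(" | "):
    req = Requirement(spec)
    # req.marker gates installation to Linux x86_64 hosts at pip-install time.
    print(req.name, str(req.specifier), req.marker.evaluate())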

View File

@ -48,7 +48,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda11_8-test: # Testing
@ -72,48 +72,6 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda11_8-split-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: True
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_8-split
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda11_8-split-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_8-cuda11_8-split-build
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu118
GPU_ARCH_VERSION: 11.8
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
use_split_build: True
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda11_8-split
build_environment: linux-binary-manywheel
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_1-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -130,7 +88,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_1
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_1-test: # Testing
@ -154,48 +112,6 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_1-split-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_1-split
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_1-split-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_8-cuda12_1-split-build
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu121
GPU_ARCH_VERSION: 12.1
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
use_split_build: True
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_1-split
build_environment: linux-binary-manywheel
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_4-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -212,7 +128,7 @@ jobs:
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_4
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_4-test: # Testing
@ -235,45 +151,3 @@ jobs:
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_4-split-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: True
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_4-split
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cuda12_4-split-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs: manywheel-py3_8-cuda12_4-split-build
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
BUILDER_ROOT: /builder
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu124
GPU_ARCH_VERSION: 12.4
GPU_ARCH_TYPE: cuda
DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
use_split_build: True
DESIRED_PYTHON: "3.8"
build_name: manywheel-py3_8-cuda12_4-split
build_environment: linux-binary-manywheel
runs_on: linux.4xlarge.nvidia.gpu
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

File diff suppressed because it is too large

View File

@ -54,7 +54,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_8-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_8-cpu-s390x-test: # Testing
@ -117,7 +117,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_9-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cpu-s390x-test: # Testing
@ -180,7 +180,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_10-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cpu-s390x-test: # Testing
@@ -243,7 +243,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_11-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cpu-s390x-test: # Testing
@@ -306,7 +306,7 @@ jobs:
ALPINE_IMAGE: "docker.io/s390x/alpine"
build_name: manywheel-py3_12-cpu-s390x
build_environment: linux-s390x-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cpu-s390x-test: # Testing
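Every hunk in this diff changes a single token: the nvidia-nccl-cu12 pin moves from 2.21.5 to 2.20.5, while the rest of the PYTORCH_EXTRA_INSTALL_REQUIREMENTS string is unchanged. Each entry carries a PEP 508 environment marker, which is why the same string is safe to set on the s390x builders above: pip evaluates the marker to False off Linux x86_64 and skips the pinned CUDA wheel. A minimal sketch of that evaluation, using the third-party packaging library (which pip vendors); the marker string is copied from the lines above:

from packaging.markers import Marker

marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")

# True on a Linux x86_64 host: the pinned CUDA wheel is selected.
print(marker.evaluate())

# Overriding the environment shows the s390x case: the marker evaluates
# to False and pip drops the entry instead of failing the install.
print(marker.evaluate({"platform_system": "Linux", "platform_machine": "s390x"}))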


@@ -46,7 +46,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -165,7 +165,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -284,7 +284,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -403,7 +403,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@@ -522,7 +522,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
# For sccache access (only on non-forked PRs)
AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}


@@ -46,7 +46,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -290,7 +290,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -536,7 +536,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -782,7 +782,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.8"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -1027,7 +1027,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -1271,7 +1271,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -1517,7 +1517,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -1763,7 +1763,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.9"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -2008,7 +2008,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -2252,7 +2252,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -2498,7 +2498,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -2744,7 +2744,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.10"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -2989,7 +2989,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -3233,7 +3233,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -3479,7 +3479,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -3725,7 +3725,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.11"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -3970,7 +3970,7 @@ jobs:
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@@ -4214,7 +4214,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4460,7 +4460,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash
@ -4706,7 +4706,7 @@ jobs:
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.12"
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
steps:
- name: Display EC2 information
shell: bash

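The hunks above pin nvidia-nccl-cu12 back from 2.21.5 to 2.20.5 across these wheel-build jobs; the other CUDA 12.1 pins are unchanged. PYTORCH_EXTRA_INSTALL_REQUIREMENTS is a single string of PEP 508 requirements joined with " | ". A minimal sketch of a step that lists the pins one per line, assuming the build tooling splits on the pipe character (the step itself is illustrative and not part of this diff):

      - name: List pinned NVIDIA wheel dependencies (illustrative)
        shell: bash
        run: |
          # Split the pipe-separated PEP 508 list into one requirement per line
          echo "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS}" | tr '|' '\n' | sed 's/^ *//'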
View File

@ -28,8 +28,7 @@ jobs:
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -96,8 +95,7 @@ jobs:
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_4-py3_12-gcc9-inductor-test:

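The shard-count change collapses the two-shard "inductor" config into a single shard on the same runner type. Each matrix entry feeds a reusable test workflow; a minimal sketch of how such a workflow can fan out over the JSON matrix, assuming the build job publishes it through an output named test-matrix (job names here are illustrative):

    jobs:
      test:
        needs: build
        strategy:
          fail-fast: false
          matrix: ${{ fromJSON(needs.build.outputs.test-matrix) }}
        runs-on: ${{ matrix.runner }}
        steps:
          - name: Echo shard assignment
            run: echo "config=${{ matrix.config }} shard=${{ matrix.shard }}/${{ matrix.num_shards }}"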
View File

@ -56,29 +56,3 @@ jobs:
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_1-py3_10-gcc9-inductor-test-gcp:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp.outputs.test-matrix }}
use-gha: anything-non-empty-to-use-gha
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

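The A100 (sm80) smoke-test build/test pair removed here reappears in the next file's diff, so this is a move rather than a deletion. The pair relies on GitHub's reusable-workflow mechanism, passing inputs via with: and the Hugging Face token via secrets:. A minimal sketch of what the callee side declares, assuming it accepts the inputs visible in the caller (the file name _example.yml is hypothetical):

    # .github/workflows/_example.yml (hypothetical callee)
    on:
      workflow_call:
        inputs:
          build-environment:
            required: true
            type: string
          docker-image-name:
            required: true
            type: string
          cuda-arch-list:
            required: false
            type: string
          test-matrix:
            required: true
            type: string
        secrets:
          HUGGING_FACE_HUB_TOKEN:
            required: false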
View File

@ -24,8 +24,7 @@ jobs:
docker-image-name: pytorch-linux-focal-rocm-n-py3
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" },
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.2" },
]}
linux-focal-rocm6_1-py3_8-inductor-test:
@ -49,8 +48,7 @@ jobs:
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
{ config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -83,6 +81,32 @@ jobs:
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
cuda-arch-list: '8.0'
test-matrix: |
{ include: [
{ config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
]}
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_1-py3_10-gcc9-inductor-test-gcp:
name: cuda12.1-py3.10-gcc9-sm80
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp.outputs.test-matrix }}
use-gha: anything-non-empty-to-use-gha
secrets:
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
linux-focal-cuda12_1-py3_12-gcc9-inductor-build:
name: cuda12.1-py3.12-gcc9-sm86
uses: ./.github/workflows/_linux-build.yml
@ -92,8 +116,7 @@ jobs:
cuda-arch-list: '8.6'
test-matrix: |
{ include: [
{ config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_12-gcc9-inductor-test:
@ -105,26 +128,6 @@ jobs:
docker-image: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.test-matrix }}
linux-jammy-cpu-py3_12-inductor-halide-build:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image-name: pytorch-linux-jammy-py3.12-halide
test-matrix: |
{ include: [
{ config: "inductor-halide", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
linux-jammy-cpu-py3_12-inductor-halide-test:
name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
uses: ./.github/workflows/_linux-test.yml
needs: linux-jammy-cpu-py3_12-inductor-halide-build
with:
build-environment: linux-jammy-py3.12-gcc11
docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }}
linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
# Should be synced with the one in inductor-periodic.yml, but this one only runs inductor_timm
name: cuda12.4-py3.10-gcc9-sm86
@ -172,21 +175,11 @@ jobs:
{ config: "cpu_inductor_timm_freezing", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "cpu_inductor_torchbench_freezing", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
{ config: "cpu_inductor_torchbench_freezing", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "cpu_inductor_huggingface_amp_freezing", shard: 1, num_shards: 1, runner: "linux.16xlarge.spr" },
{ config: "cpu_inductor_timm_amp_freezing", shard: 1, num_shards: 2, runner: "linux.16xlarge.spr" },
{ config: "cpu_inductor_timm_amp_freezing", shard: 2, num_shards: 2, runner: "linux.16xlarge.spr" },
{ config: "cpu_inductor_torchbench_amp_freezing", shard: 1, num_shards: 2, runner: "linux.16xlarge.spr" },
{ config: "cpu_inductor_torchbench_amp_freezing", shard: 2, num_shards: 2, runner: "linux.16xlarge.spr" },
{ config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "cpu_aot_inductor_huggingface_freezing", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
{ config: "cpu_aot_inductor_timm_freezing", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
{ config: "cpu_aot_inductor_timm_freezing", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "cpu_aot_inductor_torchbench_freezing", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
{ config: "cpu_aot_inductor_torchbench_freezing", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
{ config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
]}
secrets:

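Besides dropping the Halide build/test pair and the amp_freezing and cpu_aot_inductor configs, this diff halves several two-shard configs to one. shard and num_shards are plain matrix fields that the test harness uses to pick its slice of the test list; a minimal sketch of one deterministic way to do that, assuming a round-robin over a sorted file glob (illustrative, not PyTorch's actual sharding logic):

      - name: Run my shard (illustrative)
        shell: bash
        env:
          SHARD: ${{ matrix.shard }}
          NUM_SHARDS: ${{ matrix.num_shards }}
        run: |
          # Round-robin partition of the test files across shards
          i=0
          for t in test/test_*.py; do
            if [ $(( i % NUM_SHARDS )) -eq $(( SHARD - 1 )) ]; then
              echo "would run: $t"
            fi
            i=$(( i + 1 ))
          done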
View File

@ -36,24 +36,33 @@ jobs:
ref: v0.0.2
path: llm-target-determinator
- name: Setup miniconda
uses: pytorch/test-infra/.github/actions/setup-miniconda@main
- name: Setup Conda
uses: conda-incubator/setup-miniconda@v2.1.1
with:
python-version: "3.9"
miniconda-version: "py39_4.12.0"
python-version: 3.9
- name: Install requirements
- name: Install Requirements
shell: bash -l {0}
run: |
set -euxo pipefail
${CONDA_RUN} pip install -r llm-target-determinator/requirements.txt
cd "${GITHUB_WORKSPACE}/codellama"
${CONDA_RUN} pip install -e .
conda create \
--yes \
--quiet \
--name "tdenv" \
"python=3.9"
conda activate tdenv
cd "${GITHUB_WORKSPACE}/llm-target-determinator"
pip install -r requirements.txt
cd ../codellama
pip install -e .
- name: Fetch CodeLlama Checkpoint
shell: bash -l {0}
run: |
set -euxo pipefail
cd "${GITHUB_WORKSPACE}/codellama"
conda activate tdenv
cd codellama/
mkdir "CodeLlama-7b-Python"
aws s3 cp "s3://target-determinator-assets/CodeLlama-7b-Python" "CodeLlama-7b-Python" --recursive --no-progress
@ -66,7 +75,7 @@ jobs:
shell: bash
command: |
set -euxo pipefail
${CONDA_RUN} python -m pip install awscli==1.29.40
python3 -m pip install awscli==1.29.40
cd "${GITHUB_WORKSPACE}"/llm-target-determinator/assets
aws s3 cp "s3://target-determinator-assets/indexes/latest" . --recursive
@ -79,8 +88,9 @@ jobs:
shell: bash -l {0}
run: |
set -euxo pipefail
conda activate tdenv
cd "${GITHUB_WORKSPACE}"/llm-target-determinator
${CONDA_RUN} torchrun \
torchrun \
--standalone \
--nnodes=1 \
--nproc-per-node=1 \

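The replacement steps create and activate a named environment (tdenv) instead of prefixing commands with ${CONDA_RUN}. conda activate only works once conda's shell hooks have been sourced, which is why every such step runs under a login shell (bash -l {0}). A minimal sketch of the pattern, assuming setup-miniconda has already initialized bash (the python invocation is illustrative):

      - name: Use the tdenv environment (illustrative)
        shell: bash -l {0}   # login shell so conda's profile hooks are sourced
        run: |
          set -euxo pipefail
          conda activate tdenv
          python -c "import sys; print(sys.executable)"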
View File

@ -73,6 +73,7 @@ jobs:
{ config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
@ -294,53 +295,3 @@ jobs:
build-environment: linux-focal-rocm6.1-py3.8
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build:
name: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
uses: ./.github/workflows/_linux-build-label.yml
with:
use_split_build: true
build-environment: linux-focal-cuda12.1-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build-test:
name: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build
- target-determination
with:
build-environment: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
linux-focal-cuda11_8-py3_9-gcc9-experimental-split-build:
name: linux-focal-cuda11.8-py3.9-gcc9-experimental-split-build
uses: ./.github/workflows/_linux-build-label.yml
with:
use_split_build: true
build-environment: linux-focal-cuda11.8-py3.9-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "multigpu", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
]}
build-with-debug: false
linux-focal-cuda11_8-py3_9-gcc9-experimental-split-build-test:
name: linux-focal-cuda11.8-py3.9-gcc9-experimental-split-build-test
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-focal-cuda11_8-py3_9-gcc9-experimental-split-build
- target-determination
with:
build-environment: linux-focal-cuda11.8-py3.9-gcc9-experimental-split-build
docker-image: ${{ needs.linux-focal-cuda11_8-py3_9-gcc9-experimental-split-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda11_8-py3_9-gcc9-experimental-split-build.outputs.test-matrix }}

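Alongside adding the "deploy" config, this diff removes the experimental split-build job pairs. The surviving jobs keep the two-stage gating pattern in which a test job needs both its build job and target-determination before reading the build's outputs. A condensed sketch of that wiring, with hypothetical job names:

    jobs:
      example-build:                               # hypothetical name
        uses: ./.github/workflows/_linux-build-label.yml
        with:
          build-environment: example-env           # hypothetical
          docker-image-name: example-image         # hypothetical
      example-test:
        uses: ./.github/workflows/_linux-test.yml
        needs:
          - example-build
          - target-determination
        with:
          build-environment: example-env
          docker-image: ${{ needs.example-build.outputs.docker-image }}
          test-matrix: ${{ needs.example-build.outputs.test-matrix }}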
View File

@ -35,33 +35,22 @@ jobs:
id-token: write
contents: read
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
linux-jammy-py3_8-gcc11-build:
name: linux-jammy-py3.8-gcc11
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-py3.8-gcc11
docker-image-name: pytorch-linux-jammy-py3.8-gcc11
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "docs_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "backwards_compat", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
{ config: "docs_test", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "backwards_compat", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "distributed", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
{ config: "distributed", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
]}
linux-jammy-py3_8-gcc11-test:
@ -86,9 +75,7 @@ jobs:
linux-jammy-py3_8-gcc11-no-ops:
name: linux-jammy-py3.8-gcc11-no-ops
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-py3.8-gcc11-no-ops
docker-image-name: pytorch-linux-jammy-py3.8-gcc11
test-matrix: |
@ -99,9 +86,7 @@ jobs:
linux-jammy-py3_8-gcc11-pch:
name: linux-jammy-py3.8-gcc11-pch
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-py3.8-gcc11-pch
docker-image-name: pytorch-linux-jammy-py3.8-gcc11
test-matrix: |
@ -113,19 +98,17 @@ jobs:
linux-jammy-py3_10-clang15-asan-build:
name: linux-jammy-py3.10-clang15-asan
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-py3.10-clang15-asan
docker-image-name: pytorch-linux-jammy-py3-clang15-asan
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 1, num_shards: 6, runner: "linux.4xlarge" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.4xlarge" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.4xlarge" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.4xlarge" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.4xlarge" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.4xlarge" },
]}
sync-tag: asan-build
@ -145,15 +128,13 @@ jobs:
linux-focal-py3_8-clang10-onnx-build:
name: linux-focal-py3.8-clang10-onnx
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-focal-py3.8-clang10-onnx
docker-image-name: pytorch-linux-focal-py3-clang10-onnx
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
]}
linux-focal-py3_8-clang10-onnx-test:
@ -170,22 +151,19 @@ jobs:
linux-focal-py3_8-clang10-build:
name: linux-focal-py3.8-clang10
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-focal-py3.8-clang10
docker-image-name: pytorch-linux-focal-py3.8-clang10
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
{ config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
{ config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
{ config: "dynamo", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
{ config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
{ config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
]}
linux-focal-py3_8-clang10-test:
name: linux-focal-py3.8-clang10
@ -201,24 +179,22 @@ jobs:
linux-focal-py3_11-clang10-build:
name: linux-focal-py3.11-clang10
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-focal-py3.11-clang10
docker-image-name: pytorch-linux-focal-py3.11-clang10
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
{ config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
{ config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
{ config: "dynamo", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
{ config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
{ config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
]}
linux-focal-py3_11-clang10-test:
name: linux-focal-py3.11-clang10
uses: ./.github/workflows/_linux-test.yml
@ -233,20 +209,17 @@ jobs:
linux-focal-py3_12-clang10-build:
name: linux-focal-py3.12-clang10
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-focal-py3.12-clang10
docker-image-name: pytorch-linux-focal-py3.12-clang10
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "dynamo", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
{ config: "dynamo", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
{ config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
{ config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
]}
linux-focal-py3_12-clang10-test:
@ -262,16 +235,14 @@ jobs:
linux-focal-cuda11_8-py3_10-gcc9-build:
name: linux-focal-cuda11.8-py3.10-gcc9
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-focal-cuda11.8-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.nvidia.gpu" },
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
]}
linux-focal-cuda11_8-py3_10-gcc9-test:
@ -289,18 +260,17 @@ jobs:
linux-focal-cuda12_1-py3_10-gcc9-build:
name: linux-focal-cuda12.1-py3.10-gcc9
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-focal-cuda12.1-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_10-gcc9-test:
@ -318,9 +288,7 @@ jobs:
linux-jammy-py3-clang12-mobile-build:
name: linux-jammy-py3-clang12-mobile-build
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-py3-clang12-mobile-build
docker-image-name: pytorch-linux-jammy-py3-clang15-asan
build-generates-artifacts: false
@ -332,9 +300,7 @@ jobs:
linux-jammy-cuda-11_8-cudnn9-py3_8-clang12-build:
name: linux-jammy-cuda11.8-cudnn9-py3.8-clang12
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-cuda11.8-cudnn9-py3.8-clang12
docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12
test-matrix: |
@ -345,9 +311,7 @@ jobs:
linux-focal-py3-clang9-mobile-custom-build-static:
name: linux-focal-py3-clang9-mobile-custom-build-static
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-focal-py3-clang9-mobile-custom-build-static
docker-image-name: pytorch-linux-focal-py3-clang9-android-ndk-r21e
build-generates-artifacts: false
@ -359,14 +323,12 @@ jobs:
linux-focal-py3_8-clang9-xla-build:
name: linux-focal-py3_8-clang9-xla
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-focal-py3.8-clang9-xla
docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.1-lite
test-matrix: |
{ include: [
{ config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
{ config: "xla", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]}
linux-focal-py3_8-clang9-xla-test:
@ -397,43 +359,37 @@ jobs:
linux-focal-cpu-py3_10-gcc9-bazel-test:
name: linux-focal-cpu-py3.10-gcc9-bazel-test
uses: ./.github/workflows/_bazel-build-test.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.large"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-bazel-test
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
cuda-version: cpu
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "default", shard: 1, num_shards: 1, runner: "linux.4xlarge" },
]}
linux-focal-cuda12_1-py3_10-gcc9-bazel-test:
name: linux-focal-cuda12.1-py3.10-gcc9-bazel-test
uses: ./.github/workflows/_bazel-build-test.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.large"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-bazel-test
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
cuda-version: "12.1"
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_4-py3_10-gcc9-bazel-test:
name: linux-focal-cuda12.4-py3.10-gcc9-bazel-test
uses: ./.github/workflows/_bazel-build-test.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.large"
build-environment: linux-focal-cuda12.4-py3.10-gcc9-bazel-test
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
cuda-version: "12.4"
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
]}
linux-focal-py3-clang9-android-ndk-r21e-gradle-custom-build-single:
@ -461,9 +417,7 @@ jobs:
linux-jammy-py3_8-gcc11-mobile-lightweight-dispatch-build:
name: linux-jammy-py3.8-gcc11-mobile-lightweight-dispatch-build
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-py3.8-gcc111-mobile-lightweight-dispatch-build
docker-image-name: pytorch-linux-jammy-py3.8-gcc11
build-generates-artifacts: false
@ -477,9 +431,7 @@ jobs:
if: github.event_name == 'pull_request'
name: linux-focal-rocm6.1-py3.8
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-focal-rocm6.1-py3.8
docker-image-name: pytorch-linux-focal-rocm-n-py3
sync-tag: rocm-build
@ -493,19 +445,17 @@ jobs:
linux-focal-cuda12_1-py3_10-gcc9-sm86-build:
name: linux-focal-cuda12.1-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
cuda-arch-list: 8.6
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_10-gcc9-sm86-test:
@ -522,14 +472,12 @@ jobs:
linux-jammy-py3-clang12-executorch-build:
name: linux-jammy-py3-clang12-executorch
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-py3-clang12-executorch
docker-image-name: pytorch-linux-jammy-py3-clang12-executorch
test-matrix: |
{ include: [
{ config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "executorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
]}
linux-jammy-py3-clang12-executorch-test:
@ -540,59 +488,3 @@ jobs:
build-environment: linux-jammy-py3-clang12-executorch
docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build:
name: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
use_split_build: true
build-environment: linux-focal-cuda12.1-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build-test:
name: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build
- target-determination
with:
timeout-minutes: 360
build-environment: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
linux-focal-py3_12-clang10-experimental-split-build:
name: linux-focal-py3.12-clang10-experimental-split-build
uses: ./.github/workflows/_linux-build-label.yml
with:
use_split_build: True
build-environment: linux-focal-py3.12-clang10
docker-image-name: pytorch-linux-focal-py3.12-clang10
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
{ config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
{ config: "dynamo", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
{ config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
{ config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
]}
linux-focal-py3_12-clang10-experimental-split-build-test:
name: linux-focal-py3.12-clang10-experimental-split-build
uses: ./.github/workflows/_linux-test.yml
needs: linux-focal-py3_12-clang10-experimental-split-build
with:
build-environment: linux-focal-py3.12-clang10-experimental-split-build
docker-image: ${{ needs.linux-focal-py3_12-clang10-experimental-split-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-py3_12-clang10-experimental-split-build.outputs.test-matrix }}
timeout-minutes: 600

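Most hunks in this workflow back out the runner-determinator indirection: each job loses its needs: get-label-type edge and the computed prefix on its runner labels, reverting to fixed labels such as linux.2xlarge. For reference, a condensed sketch of the pattern being removed, using the names that appear in the diff (the label-type output is assumed to be either empty or a prefix such as "lf."):

    jobs:
      get-label-type:
        uses: ./.github/workflows/_runner-determinator.yml
        with:
          triggering_actor: ${{ github.triggering_actor }}
          issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
          curr_branch: ${{ github.head_ref || github.ref_name }}
      some-build:                                  # hypothetical name
        uses: ./.github/workflows/_linux-build-label.yml
        needs: get-label-type
        with:
          # label-type is concatenated in front of the base runner label
          runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"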
View File

@ -36,15 +36,6 @@ jobs:
id-token: write
contents: read
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-build:
name: linux-focal-cuda12.1-py3-gcc9-slow-gradcheck
uses: ./.github/workflows/_linux-build.yml
@ -106,8 +97,7 @@ jobs:
docker-image-name: pytorch-linux-focal-py3.8-clang10
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
{ config: "slow", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
]}
linux-focal-py3_8-clang10-test:
@ -129,8 +119,7 @@ jobs:
docker-image-name: pytorch-linux-focal-rocm-n-py3
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
{ config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
{ config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
]}
linux-focal-rocm6_1-py3_8-test:
@ -150,16 +139,14 @@ jobs:
linux-jammy-py3_10-clang15-asan-build:
name: linux-jammy-py3.10-clang15-asan
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-py3.10-clang15-asan
docker-image-name: pytorch-linux-jammy-py3-clang15-asan
test-matrix: |
{ include: [
{ config: "slow", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "slow", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "slow", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
{ config: "slow", shard: 1, num_shards: 3, runner: "linux.4xlarge" },
{ config: "slow", shard: 2, num_shards: 3, runner: "linux.4xlarge" },
{ config: "slow", shard: 3, num_shards: 3, runner: "linux.4xlarge" },
]}
sync-tag: asan-build

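This asan build and the one earlier in the diff both keep sync-tag: asan-build even as their runner labels change. The tag appears to mark jobs that are meant to stay configured in lockstep across workflow files so a consistency check can compare them; a minimal sketch under that assumption:

    # Same stanza repeated in two workflow files (sketch)
    linux-jammy-py3_10-clang15-asan-build:
      uses: ./.github/workflows/_linux-build-label.yml
      with:
        build-environment: linux-jammy-py3.10-clang15-asan
        docker-image-name: pytorch-linux-jammy-py3-clang15-asan
        sync-tag: asan-build   # jobs sharing this tag are expected to match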
View File

@ -34,15 +34,6 @@ jobs:
id-token: write
contents: read
get-label-type:
name: get-label-type
uses: ./.github/workflows/_runner-determinator.yml
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-focal-cuda12_4-py3_10-gcc9-sm86-build:
name: linux-focal-cuda12.4-py3.10-gcc9-sm86
uses: ./.github/workflows/_linux-build-label.yml
@ -222,9 +213,7 @@ jobs:
linux-focal-rocm6_1-py3_8-build:
name: linux-focal-rocm6.1-py3.8
uses: ./.github/workflows/_linux-build-label.yml
needs: get-label-type
with:
runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-focal-rocm6.1-py3.8
docker-image-name: pytorch-linux-focal-rocm-n-py3
sync-tag: rocm-build
@ -249,59 +238,3 @@ jobs:
docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build:
name: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build
uses: ./.github/workflows/_linux-build-label.yml
with:
use_split_build: true
build-environment: linux-focal-cuda12.4-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 2, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
{ config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
]}
linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build-test:
name: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build-test
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build
- target-determination
with:
build-environment: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build
docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build:
name: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build
uses: ./.github/workflows/_linux-build-label.yml
with:
use_split_build: true
build-environment: linux-focal-cuda11.8-py3.10-gcc9
docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
]}
linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build-test:
name: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build-test
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build
- target-determination
with:
timeout-minutes: 360
build-environment: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build
docker-image: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}

View File

@ -9,8 +9,6 @@ jobs:
name: try_merge_pr_${{ github.event.client_payload.pr_num }}
runs-on: linux.20_04.4x
environment: mergebot
permissions:
id-token: write
env:
GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
steps:
@ -45,7 +43,6 @@ jobs:
IGNORE_CURRENT: ${{ github.event.client_payload.ignore_current }}
ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
DRCI_BOT_KEY: ${{ secrets.DRCI_BOT_KEY }}
GITHUB_RUN_ID: ${{ github.run_id }}
run: |
set -x
if [ -n "${REBASE}" ]; then
@ -87,22 +84,6 @@ jobs:
set -x
python3 .github/scripts/comment_on_pr.py "${PR_NUM}" "merge"
- name: configure aws credentials
uses: aws-actions/configure-aws-credentials@v3
continue-on-error: true
with:
role-to-assume: arn:aws:iam::308535385114:role/upload_to_ossci_raw_job_status
aws-region: us-east-1
- name: Upload merge record to s3
if: always()
continue-on-error: true
uses: seemethere/upload-artifact-s3@v5
with:
s3-bucket: ossci-raw-job-status
s3-prefix: merges/${{ github.repository }}/${{ github.event.client_payload.pr_num }}/${{ github.event.client_payload.comment_id }}/${{ github.run_id }}
path: merge_record.json
# We want newer merge commands to supersede old ones
concurrency:
group: try-merge-${{ github.event.client_payload.pr_num }}

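The concurrency group keyed on the PR number serializes merge attempts per pull request, which is what the comment above it means by newer commands superseding old ones. A minimal sketch of the full stanza, assuming cancel-in-progress is enabled (that line sits outside this hunk):

    concurrency:
      group: try-merge-${{ github.event.client_payload.pr_num }}
      cancel-in-progress: true   # assumption: a newer merge command cancels a running one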
View File

@ -25,7 +25,10 @@ jobs:
upload-test-stats:
needs: get_workflow_conclusion
if: github.repository_owner == 'pytorch'
if:
github.repository_owner == 'pytorch' &&
(github.event.workflow_run.conclusion == 'success' || github.event.workflow_run.conclusion == 'failure' ||
needs.get_workflow_conclusion.outputs.conclusion == 'success' || needs.get_workflow_conclusion.outputs.conclusion == 'failure')
runs-on: ubuntu-22.04
environment: upload-stats
name: Upload test stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }}

.gitignore
View File

@ -129,7 +129,6 @@ env
scripts/release_notes/*.json
sccache-stats*.json
lint.json
merge_record.json
# These files get copied over on invoking setup.py
torchgen/packaged/*

View File

@ -68,8 +68,6 @@ include_patterns = [
'aten/src/ATen/native/cudnn/*.cpp',
'c10/**/*.h',
'c10/**/*.cpp',
'distributed/c10d/*DMAConnectivity.*',
'distributed/c10d/*SymmetricMemory.*',
'torch/csrc/**/*.h',
'torch/csrc/**/*.hpp',
'torch/csrc/**/*.cpp',
@ -138,7 +136,7 @@ init_command = [
'numpy==1.24.3 ; python_version == "3.8"',
'numpy==1.26.0 ; python_version >= "3.9"',
'expecttest==0.1.6',
'mypy==1.10.0',
'mypy==1.9.0',
'sympy==1.11.1',
'types-requests==2.27.25',
'types-PyYAML==6.0.7',
@ -204,8 +202,6 @@ include_patterns = [
'torch/csrc/*.cpp',
'torch/csrc/**/*.h',
'torch/csrc/**/*.cpp',
'torch/csrc/jit/serialization/*.h',
'torch/csrc/jit/serialization/*.cpp',
]
exclude_patterns = [
# The negative filters below are to exclude files that include onnx_pb.h or
@ -220,6 +216,7 @@ exclude_patterns = [
'c10/util/complex_math.h',
'c10/util/complex_utils.h',
'c10/util/flat_hash_map.h',
'c10/util/Float8*.h',
'c10/util/logging*.h',
'c10/util/hash.h',
'c10/util/strong_type.h',
@ -227,6 +224,7 @@ exclude_patterns = [
'c10/util/win32-headers.h',
'c10/util/*inl.h',
'c10/test/**/*.h',
'aten/src/ATen/core/TensorImpl_test.cpp',
'third_party/**/*',
'torch/csrc/api/**',
'torch/csrc/autograd/generated/**',
@ -234,8 +232,10 @@ exclude_patterns = [
'torch/csrc/dynamo/eval_frame.h',
'torch/csrc/inductor/**/*',
'torch/csrc/jit/**/*',
'torch/csrc/jit/serialization/mobile_bytecode_generated.h',
'torch/csrc/jit/serialization/import_legacy.cpp',
'torch/csrc/jit/serialization/export.cpp',
'torch/csrc/lazy/**/*',
'torch/csrc/mps/**/*',
]
init_command = [
'python3',
@ -999,6 +999,7 @@ command = [
]
exclude_patterns = [
'tools/gen_vulkan_spv.py',
'torch/__init__.py', # Skip formatting this file because it's part of the public API
# We don't care too much about files in this directory, don't enforce
# formatting on them
'caffe2/**/*.py',
@ -1098,12 +1099,14 @@ exclude_patterns = [
'test/test_namedtuple_return_api.py',
'test/test_native_functions.py',
'test/test_native_mha.py',
'test/test_nestedtensor.py',
'test/test_nn.py',
'test/test_out_dtype_op.py',
'test/test_overrides.py',
'test/test_prims.py',
'test/test_proxy_tensor.py',
'test/test_pruning_op.py',
'test/test_public_bindings.py',
'test/test_quantization.py',
'test/test_reductions.py',
'test/test_scatter_gather_ops.py',
@ -1129,6 +1132,8 @@ exclude_patterns = [
'test/test_type_promotion.py',
'test/test_unary_ufuncs.py',
'test/test_vulkan.py',
'test/test_xnnpack_integration.py',
'test/torch_np/numpy_test/**/*.py',
'torch/_awaits/__init__.py',
'torch/_custom_op/__init__.py',
'torch/_custom_op/autograd.py',
@ -1189,6 +1194,9 @@ exclude_patterns = [
'torch/_export/serde/upgrade.py',
'torch/_export/trace.py',
'torch/_export/verifier.py',
'torch/_higher_order_ops/__init__.py',
'torch/_higher_order_ops/out_dtype.py',
'torch/_higher_order_ops/wrap.py',
'torch/_vendor/**',
'torch/ao/__init__.py',
'torch/ao/nn/__init__.py',
@ -1385,8 +1393,172 @@ exclude_patterns = [
'torch/contrib/_tensorboard_vis.py',
"torch/cuda/_gpu_trace.py",
'torch/cuda/_memory_viz.py', # mypy: Value of type "object" is not indexable
'torch/distributed/__init__.py',
'torch/distributed/_composable_state.py',
'torch/distributed/_shard/__init__.py',
'torch/distributed/_shard/_utils.py',
'torch/distributed/_shard/api.py',
'torch/distributed/_shard/checkpoint/__init__.py',
'torch/distributed/_shard/common_op_utils.py',
'torch/distributed/_shard/metadata.py',
'torch/distributed/_shard/op_registry_utils.py',
'torch/distributed/_shard/sharded_optim/__init__.py',
'torch/distributed/_shard/sharded_optim/api.py',
'torch/distributed/_shard/sharded_tensor/__init__.py',
'torch/distributed/_shard/sharded_tensor/_ops/__init__.py',
'torch/distributed/_shard/sharded_tensor/_ops/_common.py',
'torch/distributed/_shard/sharded_tensor/_ops/binary_cmp.py',
'torch/distributed/_shard/sharded_tensor/_ops/init.py',
'torch/distributed/_shard/sharded_tensor/_ops/misc_ops.py',
'torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py',
'torch/distributed/_shard/sharded_tensor/api.py',
'torch/distributed/_shard/sharded_tensor/logger.py',
'torch/distributed/_shard/sharded_tensor/logging_handlers.py',
'torch/distributed/_shard/sharded_tensor/metadata.py',
'torch/distributed/_shard/sharded_tensor/reshard.py',
'torch/distributed/_shard/sharded_tensor/shard.py',
'torch/distributed/_shard/sharded_tensor/utils.py',
'torch/distributed/_shard/sharder.py',
'torch/distributed/_shard/sharding_plan/__init__.py',
'torch/distributed/_shard/sharding_plan/api.py',
'torch/distributed/_shard/sharding_spec/__init__.py',
'torch/distributed/_shard/sharding_spec/_internals.py',
'torch/distributed/_shard/sharding_spec/api.py',
'torch/distributed/_shard/sharding_spec/chunk_sharding_spec.py',
'torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__init__.py',
'torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/_common.py',
'torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding.py',
'torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding_bag.py',
'torch/distributed/_sharded_tensor/__init__.py',
'torch/distributed/_sharding_spec/__init__.py',
'torch/distributed/_tools/__init__.py',
'torch/distributed/_tools/memory_tracker.py',
'torch/distributed/algorithms/__init__.py',
'torch/distributed/algorithms/_checkpoint/__init__.py',
'torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py',
'torch/distributed/algorithms/_comm_hooks/__init__.py',
'torch/distributed/algorithms/_comm_hooks/default_hooks.py',
'torch/distributed/algorithms/_optimizer_overlap/__init__.py',
'torch/distributed/algorithms/_optimizer_overlap/optimizer_overlap.py',
'torch/distributed/algorithms/_quantization/__init__.py',
'torch/distributed/algorithms/_quantization/quantization.py',
'torch/distributed/algorithms/ddp_comm_hooks/__init__.py',
'torch/distributed/algorithms/ddp_comm_hooks/ddp_zero_hook.py',
'torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py',
'torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py',
'torch/distributed/algorithms/ddp_comm_hooks/mixed_precision_hooks.py',
'torch/distributed/algorithms/ddp_comm_hooks/optimizer_overlap_hooks.py',
'torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py',
'torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py',
'torch/distributed/algorithms/ddp_comm_hooks/quantization_hooks.py',
'torch/distributed/algorithms/join.py',
'torch/distributed/algorithms/model_averaging/__init__.py',
'torch/distributed/algorithms/model_averaging/averagers.py',
'torch/distributed/algorithms/model_averaging/hierarchical_model_averager.py',
'torch/distributed/algorithms/model_averaging/utils.py',
'torch/distributed/argparse_util.py',
'torch/distributed/autograd/__init__.py',
'torch/distributed/benchmarks/benchmark_ddp_rpc.py',
'torch/distributed/c10d_logger.py',
'torch/distributed/collective_utils.py',
'torch/distributed/constants.py',
'torch/distributed/distributed_c10d.py',
'torch/distributed/elastic/__init__.py',
'torch/distributed/elastic/agent/__init__.py',
'torch/distributed/elastic/agent/server/__init__.py',
'torch/distributed/elastic/agent/server/api.py',
'torch/distributed/elastic/agent/server/local_elastic_agent.py',
'torch/distributed/elastic/events/__init__.py',
'torch/distributed/elastic/events/api.py',
'torch/distributed/elastic/events/handlers.py',
'torch/distributed/elastic/metrics/__init__.py',
'torch/distributed/elastic/metrics/api.py',
'torch/distributed/elastic/multiprocessing/__init__.py',
'torch/distributed/elastic/multiprocessing/api.py',
'torch/distributed/elastic/multiprocessing/errors/__init__.py',
'torch/distributed/elastic/multiprocessing/errors/error_handler.py',
'torch/distributed/elastic/multiprocessing/errors/handlers.py',
'torch/distributed/elastic/multiprocessing/redirects.py',
'torch/distributed/elastic/multiprocessing/tail_log.py',
'torch/distributed/elastic/rendezvous/__init__.py',
'torch/distributed/elastic/rendezvous/api.py',
'torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py',
'torch/distributed/elastic/rendezvous/dynamic_rendezvous.py',
'torch/distributed/elastic/rendezvous/etcd_rendezvous.py',
'torch/distributed/elastic/rendezvous/etcd_rendezvous_backend.py',
'torch/distributed/elastic/rendezvous/etcd_server.py',
'torch/distributed/elastic/rendezvous/etcd_store.py',
'torch/distributed/elastic/rendezvous/registry.py',
'torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py',
'torch/distributed/elastic/rendezvous/utils.py',
'torch/distributed/elastic/timer/__init__.py',
'torch/distributed/elastic/timer/api.py',
'torch/distributed/elastic/timer/file_based_local_timer.py',
'torch/distributed/elastic/timer/local_timer.py',
'torch/distributed/elastic/utils/__init__.py',
'torch/distributed/elastic/utils/api.py',
'torch/distributed/elastic/utils/data/__init__.py',
'torch/distributed/elastic/utils/data/cycling_iterator.py',
'torch/distributed/elastic/utils/data/elastic_distributed_sampler.py',
'torch/distributed/elastic/utils/distributed.py',
'torch/distributed/elastic/utils/log_level.py',
'torch/distributed/elastic/utils/logging.py',
'torch/distributed/elastic/utils/store.py',
'torch/distributed/examples/memory_tracker_example.py',
'torch/distributed/launch.py',
'torch/distributed/launcher/__init__.py',
'torch/distributed/launcher/api.py',
'torch/distributed/logging_handlers.py',
'torch/distributed/nn/__init__.py',
'torch/distributed/nn/api/__init__.py',
'torch/distributed/nn/api/remote_module.py',
'torch/distributed/nn/functional.py',
'torch/distributed/nn/jit/__init__.py',
'torch/distributed/nn/jit/instantiator.py',
'torch/distributed/nn/jit/templates/__init__.py',
'torch/distributed/nn/jit/templates/remote_module_template.py',
'torch/distributed/optim/__init__.py',
'torch/distributed/optim/apply_optimizer_in_backward.py',
'torch/distributed/optim/functional_adadelta.py',
'torch/distributed/optim/functional_adagrad.py',
'torch/distributed/optim/functional_adam.py',
'torch/distributed/optim/functional_adamax.py',
'torch/distributed/optim/functional_adamw.py',
'torch/distributed/optim/functional_rmsprop.py',
'torch/distributed/optim/functional_rprop.py',
'torch/distributed/optim/functional_sgd.py',
'torch/distributed/optim/named_optimizer.py',
'torch/distributed/optim/optimizer.py',
'torch/distributed/optim/post_localSGD_optimizer.py',
'torch/distributed/optim/utils.py',
'torch/distributed/optim/zero_redundancy_optimizer.py',
'torch/distributed/remote_device.py',
'torch/distributed/rendezvous.py',
'torch/distributed/rpc/__init__.py',
'torch/distributed/rpc/_testing/__init__.py',
'torch/distributed/rpc/_testing/faulty_agent_backend_registry.py',
'torch/distributed/rpc/_utils.py',
'torch/distributed/rpc/api.py',
'torch/distributed/rpc/backend_registry.py',
'torch/distributed/rpc/constants.py',
'torch/distributed/rpc/functions.py',
'torch/distributed/rpc/internal.py',
'torch/distributed/rpc/options.py',
'torch/distributed/rpc/rref_proxy.py',
'torch/distributed/rpc/server_process_global_profiler.py',
'torch/distributed/run.py',
'torch/distributed/tensor/__init__.py',
'torch/distributed/tensor/parallel/__init__.py',
'torch/distributed/tensor/parallel/_utils.py',
'torch/distributed/tensor/parallel/_view_with_dim_change.py',
'torch/distributed/tensor/parallel/api.py',
'torch/distributed/tensor/parallel/fsdp.py',
'torch/distributed/tensor/parallel/input_reshard.py',
'torch/distributed/tensor/parallel/multihead_attention_tp.py',
'torch/distributed/tensor/parallel/style.py',
'torch/fft/__init__.py',
'torch/func/__init__.py',
'torch/functional.py',
'torch/futures/__init__.py',
'torch/fx/__init__.py',
'torch/fx/_compatibility.py',
@ -1472,9 +1644,20 @@ exclude_patterns = [
'torch/fx/subgraph_rewriter.py',
'torch/fx/tensor_type.py',
'torch/fx/traceback.py',
'torch/hub.py',
'torch/library.py',
'torch/linalg/__init__.py',
'torch/monitor/__init__.py',
'torch/nested/__init__.py',
'torch/nn/__init__.py',
'torch/nn/_reduction.py',
'torch/nn/backends/__init__.py',
'torch/nn/backends/thnn.py',
'torch/nn/common_types.py',
'torch/nn/cpp.py',
'torch/nn/functional.py',
'torch/nn/grad.py',
'torch/nn/init.py',
'torch/nn/intrinsic/__init__.py',
'torch/nn/intrinsic/modules/__init__.py',
'torch/nn/intrinsic/modules/fused.py',
@ -1491,6 +1674,40 @@ exclude_patterns = [
'torch/nn/intrinsic/quantized/modules/bn_relu.py',
'torch/nn/intrinsic/quantized/modules/conv_relu.py',
'torch/nn/intrinsic/quantized/modules/linear_relu.py',
'torch/nn/modules/__init__.py',
'torch/nn/modules/_functions.py',
'torch/nn/modules/activation.py',
'torch/nn/modules/adaptive.py',
'torch/nn/modules/batchnorm.py',
'torch/nn/modules/channelshuffle.py',
'torch/nn/modules/container.py',
'torch/nn/modules/conv.py',
'torch/nn/modules/distance.py',
'torch/nn/modules/dropout.py',
'torch/nn/modules/flatten.py',
'torch/nn/modules/fold.py',
'torch/nn/modules/instancenorm.py',
'torch/nn/modules/lazy.py',
'torch/nn/modules/linear.py',
'torch/nn/modules/loss.py',
'torch/nn/modules/module.py',
'torch/nn/modules/normalization.py',
'torch/nn/modules/padding.py',
'torch/nn/modules/pixelshuffle.py',
'torch/nn/modules/pooling.py',
'torch/nn/modules/rnn.py',
'torch/nn/modules/sparse.py',
'torch/nn/modules/transformer.py',
'torch/nn/modules/upsampling.py',
'torch/nn/modules/utils.py',
'torch/nn/parallel/__init__.py',
'torch/nn/parallel/_functions.py',
'torch/nn/parallel/comm.py',
'torch/nn/parallel/data_parallel.py',
'torch/nn/parallel/parallel_apply.py',
'torch/nn/parallel/replicate.py',
'torch/nn/parallel/scatter_gather.py',
'torch/nn/parameter.py',
'torch/nn/qat/__init__.py',
'torch/nn/qat/dynamic/__init__.py',
'torch/nn/qat/dynamic/modules/__init__.py',
@ -1528,6 +1745,35 @@ exclude_patterns = [
'torch/nn/quantized/modules/normalization.py',
'torch/nn/quantized/modules/rnn.py',
'torch/nn/quantized/modules/utils.py',
'torch/nn/utils/__init__.py',
'torch/nn/utils/_deprecation_utils.py',
'torch/nn/utils/_expanded_weights/__init__.py',
'torch/nn/utils/_expanded_weights/conv_expanded_weights.py',
'torch/nn/utils/_expanded_weights/conv_utils.py',
'torch/nn/utils/_expanded_weights/embedding_expanded_weights.py',
'torch/nn/utils/_expanded_weights/expanded_weights_impl.py',
'torch/nn/utils/_expanded_weights/expanded_weights_utils.py',
'torch/nn/utils/_expanded_weights/group_norm_expanded_weights.py',
'torch/nn/utils/_expanded_weights/instance_norm_expanded_weights.py',
'torch/nn/utils/_expanded_weights/layer_norm_expanded_weights.py',
'torch/nn/utils/_expanded_weights/linear_expanded_weights.py',
'torch/nn/utils/_per_sample_grad.py',
'torch/nn/utils/clip_grad.py',
'torch/nn/utils/convert_parameters.py',
'torch/nn/utils/fusion.py',
'torch/nn/utils/init.py',
'torch/nn/utils/memory_format.py',
'torch/nn/utils/parametrizations.py',
'torch/nn/utils/parametrize.py',
'torch/nn/utils/prune.py',
'torch/nn/utils/rnn.py',
'torch/nn/utils/spectral_norm.py',
'torch/nn/utils/weight_norm.py',
'torch/overrides.py',
'torch/quasirandom.py',
'torch/random.py',
'torch/return_types.py',
'torch/serialization.py',
'torch/signal/__init__.py',
'torch/signal/windows/__init__.py',
'torch/signal/windows/windows.py',
@ -1536,6 +1782,7 @@ exclude_patterns = [
'torch/sparse/_triton_ops.py',
'torch/sparse/semi_structured.py',
'torch/special/__init__.py',
'torch/storage.py',
'torch/testing/_internal/__init__.py',
'torch/testing/_internal/autocast_test_lists.py',
'torch/testing/_internal/autograd_function_db.py',
@ -1543,7 +1790,9 @@ exclude_patterns = [
'torch/testing/_internal/codegen/__init__.py',
'torch/testing/_internal/codegen/random_topo_test.py',
'torch/testing/_internal/common_cuda.py',
'torch/testing/_internal/common_device_type.py',
'torch/testing/_internal/common_distributed.py',
'torch/testing/_internal/common_dtype.py',
'torch/testing/_internal/common_jit.py',
'torch/testing/_internal/common_methods_invocations.py',
'torch/testing/_internal/common_modules.py',
@ -1608,6 +1857,7 @@ exclude_patterns = [
'torch/testing/_internal/test_module/__init__.py',
'torch/testing/_internal/test_module/future_div.py',
'torch/testing/_internal/test_module/no_future_div.py',
'torch/utils/__init__.py',
'torch/utils/_contextlib.py',
'torch/utils/_cpp_extension_versioner.py',
'torch/utils/_crash_handler.py',
@ -1658,6 +1908,53 @@ exclude_patterns = [
'torch/utils/collect_env.py',
'torch/utils/cpp_backtrace.py',
'torch/utils/cpp_extension.py',
'torch/utils/data/__init__.py',
'torch/utils/data/_utils/__init__.py',
'torch/utils/data/_utils/collate.py',
'torch/utils/data/_utils/fetch.py',
'torch/utils/data/_utils/pin_memory.py',
'torch/utils/data/_utils/serialization.py',
'torch/utils/data/_utils/signal_handling.py',
'torch/utils/data/_utils/worker.py',
'torch/utils/data/backward_compatibility.py',
'torch/utils/data/dataloader.py',
'torch/utils/data/datapipes/__init__.py',
'torch/utils/data/datapipes/_decorator.py',
'torch/utils/data/datapipes/_hook_iterator.py',
'torch/utils/data/datapipes/_typing.py',
'torch/utils/data/datapipes/dataframe/__init__.py',
'torch/utils/data/datapipes/dataframe/dataframe_wrapper.py',
'torch/utils/data/datapipes/dataframe/dataframes.py',
'torch/utils/data/datapipes/dataframe/datapipes.py',
'torch/utils/data/datapipes/dataframe/structures.py',
'torch/utils/data/datapipes/datapipe.py',
'torch/utils/data/datapipes/gen_pyi.py',
'torch/utils/data/datapipes/iter/__init__.py',
'torch/utils/data/datapipes/iter/callable.py',
'torch/utils/data/datapipes/iter/combinatorics.py',
'torch/utils/data/datapipes/iter/combining.py',
'torch/utils/data/datapipes/iter/filelister.py',
'torch/utils/data/datapipes/iter/fileopener.py',
'torch/utils/data/datapipes/iter/grouping.py',
'torch/utils/data/datapipes/iter/routeddecoder.py',
'torch/utils/data/datapipes/iter/selecting.py',
'torch/utils/data/datapipes/iter/sharding.py',
'torch/utils/data/datapipes/iter/streamreader.py',
'torch/utils/data/datapipes/iter/utils.py',
'torch/utils/data/datapipes/map/__init__.py',
'torch/utils/data/datapipes/map/callable.py',
'torch/utils/data/datapipes/map/combinatorics.py',
'torch/utils/data/datapipes/map/combining.py',
'torch/utils/data/datapipes/map/grouping.py',
'torch/utils/data/datapipes/map/utils.py',
'torch/utils/data/datapipes/utils/__init__.py',
'torch/utils/data/datapipes/utils/common.py',
'torch/utils/data/datapipes/utils/decoder.py',
'torch/utils/data/datapipes/utils/snapshot.py',
'torch/utils/data/distributed.py',
'torch/utils/data/graph.py',
'torch/utils/data/graph_settings.py',
'torch/utils/data/sampler.py',
'torch/utils/dlpack.py',
'torch/utils/file_baton.py',
'torch/utils/flop_counter.py',
@ -1697,9 +1994,8 @@ init_command = [
'--dry-run={{DRYRUN}}',
'--no-black-binary',
'black==23.12.1',
'ufmt==2.7.0',
'usort==1.0.8.post1',
'isort==5.13.2',
'ufmt==2.1.0',
'usort==1.0.6',
]
is_formatter = true
@ -1783,7 +2079,7 @@ init_command = [
'python3',
'tools/linter/adapters/pip_init.py',
'--dry-run={{DRYRUN}}',
'ruff==0.5.0',
'ruff==0.4.8',
]
is_formatter = true

View File

@ -461,7 +461,15 @@ filegroup(
filegroup(
name = "caffe2_perfkernels_srcs",
srcs = [
"caffe2/perfkernels/adagrad.cc",
"caffe2/perfkernels/embedding_lookup.cc",
"caffe2/perfkernels/embedding_lookup_idx.cc",
"caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.cc",
"caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup_idx.cc",
"caffe2/perfkernels/fused_nbit_rowwise_conversion.cc",
"caffe2/perfkernels/lstm_unit_cpu_common.cc",
"caffe2/perfkernels/math_cpu_base.cc",
"caffe2/perfkernels/typed_axpy.cc",
],
)
@ -498,6 +506,7 @@ cc_library(
hdrs = [
"caffe2/core/common.h",
"caffe2/perfkernels/common.h",
"caffe2/perfkernels/embedding_lookup.h",
"caffe2/perfkernels/embedding_lookup_idx.h",
"caffe2/utils/fixed_divisor.h",
] + glob([
@ -744,7 +753,6 @@ cc_library(
"torch/csrc/cuda/python_nccl.cpp",
"torch/csrc/cuda/nccl.cpp",
"torch/csrc/distributed/c10d/intra_node_comm.cu",
"torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
"torch/csrc/distributed/c10d/Utils.cu",
"torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
],
@ -762,7 +770,6 @@ cc_library(
":torch_headers",
"@kineto",
"@cpp-httplib",
"@nlohmann",
] + if_cuda([
"@cuda//:nvToolsExt",
"@cutlass",

View File

@ -865,13 +865,12 @@ cmake_dependent_option(
# Suspect users building from source will need this
add_definitions(-DFLASHATTENTION_DISABLE_ALIBI)
# CAVEAT: Again, Flash Attention2 will error while building for sm52 while Mem
# Eff Attention won't
# CAVEAT: Again, do not check USE_ROCM here Flash Attention2 will error while
# building for sm52 while Mem Eff Attention won't
cmake_dependent_option(
USE_MEM_EFF_ATTENTION
"Enable memory-efficient attention for scaled dot product attention.\
Will be disabled if not supported by the platform" ON
"USE_CUDA OR USE_ROCM" OFF)
Will be disabled if not supported by the platform" ON "USE_CUDA" OFF)
if(DEBUG_CUDA)
string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -lineinfo")
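
The hunk above gates memory-efficient attention at build time (USE_CUDA/USE_ROCM); the corresponding per-process switch is exposed through torch.backends.cuda. A minimal sketch, assuming a CUDA-enabled build — these runtime toggles are the Python-side counterpart, not part of this diff:

    import torch
    from torch.nn import functional as F

    # Query whether the mem-efficient SDP backend is enabled in this process.
    print(torch.backends.cuda.mem_efficient_sdp_enabled())

    # Steer scaled_dot_product_attention away from the mem-efficient kernel.
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    q = k = v = torch.randn(1, 8, 16, 64, device="cuda", dtype=torch.float16)
    out = F.scaled_dot_product_attention(q, k, v)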
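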

View File

@ -43,12 +43,12 @@ nn/qat/ @jerryzh168
/torch/csrc/distributed/rpc/tensorpipe_agent.h @jiayisuse @osalpekar @lw
# ONNX Export
/torch/_dynamo/backends/onnxrt.py @wschin @xadupre
/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1 @xadupre
/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1 @xadupre
/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1 @xadupre
/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin @xadupre
/test/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin @xadupre
/torch/_dynamo/backends/onnxrt.py @bowenbao @thiagocrepaldi @wschin
/torch/csrc/jit/passes/onnx.h @bowenbao @thiagocrepaldi
/torch/csrc/jit/passes/onnx.cpp @bowenbao @thiagocrepaldi
/torch/csrc/jit/passes/onnx/ @bowenbao @thiagocrepaldi
/torch/onnx/ @bowenbao @thiagocrepaldi @wschin
/test/onnx/ @bowenbao @thiagocrepaldi @wschin
# CI
/.ci @pytorch/pytorch-dev-infra
@ -57,7 +57,6 @@ nn/qat/ @jerryzh168
/.ci/docker/ @jeffdaily
/.ci/docker/ci_commit_pins/triton.txt @desertfire @Chillee @eellison @shunting314 @bertmaher @jeffdaily @jataylo @jithunnair-amd @pruthvistony
/.ci/docker/ci_commit_pins/triton-rocm.txt @jeffdaily @jataylo @jithunnair-amd @pruthvistony
/.ci/docker/ci_commit_pins/triton-xpu.txt @EikanWang @gujinghui
# Github Actions
# This list is for people wanting to be notified every time there's a change
@ -108,10 +107,10 @@ aten/src/ATen/detail/MTIAHooksInterface.h @egienvalue
torch/csrc/mtia/ @egienvalue
# Profiler
torch/csrc/autograd/profiler* @aaronenyeshi @sraikund16
torch/autograd/profiler* @aaronenyeshi @sraikund16
torch/csrc/profiler/ @aaronenyeshi @sraikund16
torch/profiler/ @aaronenyeshi @sraikund16
torch/csrc/autograd/profiler* @aaronenyeshi
torch/autograd/profiler* @aaronenyeshi
torch/csrc/profiler/ @aaronenyeshi
torch/profiler/ @aaronenyeshi
# AOTDispatch tests
test/functorch/test_aotdispatch.py @ezyang @Chillee
@ -133,15 +132,6 @@ caffe2/operators/hip @jeffdaily @jithunnair-amd
caffe2/operators/rnn/hip @jeffdaily @jithunnair-amd
caffe2/utils/hip @jeffdaily @jithunnair-amd
# XPU-specific files
/aten/src/ATen/xpu/ @EikanWang @gujinghui
/c10/xpu/ @EikanWang @gujinghui
/torch/csrc/xpu/ @EikanWang @gujinghui
/torch/xpu/ @EikanWang @gujinghui
/test/xpu/ @EikanWang @gujinghui
/test/test_xpu.py @EikanWang @gujinghui
/third_party/xpu.txt @EikanWang @gujinghui
# torch.export
/torch/export/ @avikchaudhuri @gmagogsfm @tugsbayasgalan @zhxchen17
/torch/_export/ @avikchaudhuri @gmagogsfm @tugsbayasgalan @zhxchen17

View File

@ -77,11 +77,6 @@ RUN case ${TARGETPLATFORM} in \
esac && \
/opt/conda/bin/conda clean -ya
RUN /opt/conda/bin/pip install torchelastic
RUN IS_CUDA=$(python -c 'import torch ; print(torch.cuda._is_compiled())'); \
echo "Is torch compiled with cuda: ${IS_CUDA}"; \
if test "${IS_CUDA}" != "True" -a ! -z "${CUDA_VERSION}"; then \
exit 1; \
fi
FROM ${BASE_IMAGE} as official
ARG PYTORCH_VERSION
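
The removed RUN step fails the image build whenever CUDA_VERSION is set but torch reports a CPU-only build. Roughly the same sanity check, runnable locally (torch.cuda._is_compiled is a private helper, as in the Dockerfile):

    import sys
    import torch

    # _is_compiled() is True if the wheel was built with CUDA support,
    # regardless of whether a GPU is visible right now (unlike is_available()).
    if not torch.cuda._is_compiled():
        sys.exit("torch was not compiled with CUDA support")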

View File

@ -290,7 +290,7 @@ After the final RC is created. The following tasks should be performed :
* Create validation issue for the release, see for example [Validations for 2.1.2 release](https://github.com/pytorch/pytorch/issues/114904) and perform required validations.
* Run performance tests in [benchmark repository](https://github.com/pytorch/benchmark). Make sure there are no performance regressions.
* Run performance tests in [benchmark repository](https://github.com/pytorch/benchmark). Make sure there are no prerformance regressions.
* Prepare and stage PyPI binaries for promotion. This is done with this script:
[`pytorch/builder:release/pypi/promote_pypi_to_staging.sh`](https://github.com/pytorch/builder/blob/main/release/pypi/promote_pypi_to_staging.sh)
@ -429,12 +429,12 @@ need to support these particular versions of software.
## Operating Systems
Supported OS flavors are summarized in the table below:
| Operating System family | Architecture | Notes |
| Operating System family | Architectrue | Notes |
| --- | --- | --- |
| Linux | aarch64, x86_64 | Wheels are manylinux2014 compatible, i.e. they should be runnable on any Linux system with glibc-2.17 or above. |
| MacOS | arm64 | Builds should be compatible with MacOS 11 (Big Sur) or newer, but are actively tested against MacOS 14 (Sonoma). |
| MacOS | x86_64 | Requires MacOS Catalina or above, not supported after 2.2, see https://github.com/pytorch/pytorch/issues/114602 |
| Windows | x86_64 | Builds are compatible with Windows-10 or newer. |
| Windows | x86_64 | Buils are compatible with Windows-10 or newer. |
# Submitting Tutorials

View File

@ -6,7 +6,7 @@
- [Untrusted inputs](#untrusted-inputs)
- [Data privacy](#data-privacy)
- [Using distributed features](#using-distributed-features)
- [**CI/CD security principles**](#cicd-security-principles)
## Reporting Security Issues
Beware that none of the topics under [Using Pytorch Securely](#using-pytorch-securely) are considered vulnerabilities of Pytorch.
@ -40,7 +40,7 @@ Important Note: The trustworthiness of a model is not binary. You must always de
### Untrusted inputs during training and prediction
If you plan to open your model to untrusted inputs, be aware that inputs can also be used as vectors by malicious agents. To minimize risks, make sure to give your model only the permissions strictly required, and keep your libraries updated with the latest security patches.
If you plan to open your model to untrusted inputs, be aware that inputs can also be used as vectors by malicious agents. To minimize risks, make sure to give your model only the permisisons strictly required, and keep your libraries updated with the lates security patches.
If applicable, prepare your model against bad inputs and prompt injections. Some recommendations:
- Pre-analysis: check how the model performs by default when exposed to prompt injection (e.g. using fuzzing for prompt injection).
@ -61,27 +61,3 @@ If applicable, prepare your model against bad inputs and prompt injections. Some
PyTorch can be used for distributed computing, and as such there is a `torch.distributed` package. PyTorch Distributed features are intended for internal communication only. They are not built for use in untrusted environments or networks.
For performance reasons, none of the PyTorch Distributed primitives (including c10d, RPC, and TCPStore) include any authorization protocol and will send messages unencrypted. They accept connections from anywhere, and execute the workload sent without performing any checks. Therefore, if you run a PyTorch Distributed program on your network, anybody with access to the network can execute arbitrary code with the privileges of the user running PyTorch.
## CI/CD security principles
_Audience_: Contributors and reviewers, especially if modifying the workflow files/build system.
PyTorch's CI/CD security philosophy is based on finding a balance between open and transparent CI pipelines and keeping the environment efficient and safe.
PyTorch testing requirements are complex, and a large part of the code base can only be tested on specialized, powerful hardware such as GPUs, making it a lucrative target for resource misuse. To prevent this, we require workflow run approval for PRs from non-member contributors. To keep the volume of those approvals relatively low, we readily extend write permissions to the repository to regular contributors.
More widespread write access to the repo presents challenges when it comes to reviewing changes, merging code into trunk, and creating releases. [Protected branches](https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/managing-protected-branches/about-protected-branches) are used to restrict the ability to merge to the trunk/release branches only to the repository administrators and merge bot. The merge bot is responsible for mechanistically merging the change and validating reviews against the path-based rules defined in [merge_rules.yml](https://github.com/pytorch/pytorch/blob/main/.github/merge_rules.yaml). Once a PR has been reviewed by person(s) mentioned in these rules, leaving a `@pytorchbot merge` comment on the PR will initiate the merge process. To protect merge bot credentials from leaking, merge actions must be executed only on ephemeral runners (see definition below) using a specialized deployment environment.
To speed up the CI system, build steps of the workflow rely on the distributed caching mechanism backed by [sccache](https://github.com/mozilla/sccache), making them susceptible to cache corruption compromises. For that reason, binary artifacts generated during CI should not be executed in an environment that has access to any sensitive or non-public information, and should not be published for use by a general audience. One should not have any expectation about the lifetime of those artifacts, although in practice they likely remain accessible for about two weeks after the PR has been closed.
To speed up CI system setup, PyTorch relies heavily on Docker to pre-build and pre-install the dependencies. To prevent a potentially malicious PR from altering images that were published in the past, ECR has been configured to use immutable tags.
To improve runner availability and make resource utilization more efficient, some of the CI runners are non-ephemeral, i.e., workflow steps from completely unrelated PRs could be scheduled sequentially on the same runner, making them susceptible to reverse shell attacks. For that reason, PyTorch does not rely on the repository secrets mechanism, as these can easily be compromised in such attacks.
### Release pipelines security
To ensure safe binary releases, PyTorch release pipelines are built on the following principles:
- All binary builds/upload jobs must be run on ephemeral runners, i.e., on a machine that is allocated from the cloud to do the build and released back to the cloud after the build is finished. This protects those builds from interference from external actors, who potentially can get reverse shell access to a non-ephemeral runner and wait there for a binary build.
- All binary builds are cold-start builds, i.e., distributed caching/incremental builds are not permitted. This renders builds much slower than incremental CI builds but isolates them from potential compromises of the intermediate artifacts caching systems.
- All upload jobs are executed in [deployment environments](https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment) that are restricted to protected branches.
- Security credentials needed to upload binaries to PyPI/conda or stable indexes `download.pytorch.org/whl` are never uploaded to repo secrets storage/environment. This requires an extra manual step to publish the release but ensures that access to those would not be compromised by deliberate/accidental leaks of secrets stored in the cloud.
- No binary artifacts should be published to GitHub releases pages, as these are overwritable by anyone with write permission to the repo.

View File

@ -174,12 +174,6 @@ new_local_repository(
path = "third_party/cpp-httplib",
)
new_local_repository(
name = "nlohmann",
build_file = "//third_party:nlohmann.BUILD",
path = "third_party/nlohmann",
)
new_local_repository(
name = "tensorpipe",
build_file = "//third_party:tensorpipe.BUILD",

View File

@ -53,6 +53,11 @@ if(NOT BUILD_LITE_INTERPRETER)
file(GLOB_RECURSE ATen_CORE_TEST_SRCS "core/*_test.cpp")
endif()
EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS})
# Exclude TensorImpl_test.cpp if compiling without Caffe2
if(NOT BUILD_LITE_INTERPRETER)
file(GLOB_RECURSE ATen_CORE_EXCLUDED_TEST_SRCS "core/TensorImpl_test.cpp")
EXCLUDE(ATen_CORE_TEST_SRCS "${ATen_CORE_TEST_SRCS}" ${ATen_CORE_EXCLUDED_TEST_SRCS})
endif()
file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/vec256/vsx/*.h" "cpu/vec/vec256/zarch/*.h" "cpu/vec/*.h" "quantized/*.h" "functorch/*.h")
file(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp" "functorch/*.cpp")
@ -468,7 +473,6 @@ endif()
if(USE_CUDA AND NOT USE_ROCM)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include)
list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include)
if($ENV{ATEN_STATIC_CUDA})
list(APPEND ATen_CUDA_DEPENDENCY_LIBS
${CUDA_LIBRARIES}

View File

@ -56,14 +56,6 @@ void Context::setDeterministicCuDNN(bool b) {
deterministic_cudnn = b;
}
bool Context::deterministicMkldnn() const {
return deterministic_mkldnn;
}
void Context::setDeterministicMkldnn(bool b) {
deterministic_mkldnn = b;
}
bool Context::deterministicAlgorithms() const {
return _deterministic_algorithms;
}
@ -153,13 +145,6 @@ void Context::setSDPUseCuDNN(bool e) {
enabled_cudnnSDP = e;
}
void Context::setSDPUseOverrideable(bool e) {
enabled_overrideable = e;
}
bool Context::userEnabledOverrideableSDP() const {
return enabled_overrideable;
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
static const char cublas_config_var_name[] = "CUBLAS_WORKSPACE_CONFIG";
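
The cublas_config_var_name constant above backs the documented requirement that deterministic mode on CUDA needs a cuBLAS workspace setting. A minimal sketch of the documented recipe:

    import os
    import torch

    # Per the torch.use_deterministic_algorithms docs, CUDA >= 10.2 requires
    # this workspace setting before any cuBLAS call for deterministic results.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
    torch.use_deterministic_algorithms(True)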

View File

@ -65,8 +65,6 @@ class TORCH_API Context {
: at::getAccelerator(true).value();
if (device_type == at::kCUDA) {
return at::detail::getCUDAHooks();
} else if (device_type == at::kXPU) {
return at::detail::getXPUHooks();
} else if (device_type == at::kMPS) {
return at::detail::getMPSHooks();
} else if (device_type == at::kPrivateUse1) {
@ -190,8 +188,6 @@ class TORCH_API Context {
void setBenchmarkLimitCuDNN(int);
bool deterministicCuDNN() const;
void setDeterministicCuDNN(bool);
bool deterministicMkldnn() const;
void setDeterministicMkldnn(bool);
bool userEnabledNNPACK() const;
void setUserEnabledNNPACK(bool e);
@ -218,9 +214,6 @@ class TORCH_API Context {
void setSDPUseCuDNN(bool);
bool userEnabledCuDNNSDP() const;
void setSDPUseOverrideable(bool);
bool userEnabledOverrideableSDP() const;
at::LinalgBackend linalgPreferredBackend() const;
void setLinalgPreferredBackend(at::LinalgBackend);
@ -365,15 +358,13 @@ class TORCH_API Context {
c10::once_flag thp_init;
bool enabled_cudnn = true;
bool deterministic_cudnn = false;
bool deterministic_mkldnn = false;
bool _deterministic_algorithms = false;
bool _deterministic_algorithms_warn_only = false;
bool _deterministic_fill_uninitialized_memory = true;
bool enabled_flashSDP = true;
bool enabled_mem_efficientSDP = true;
bool enabled_mathSDP = true;
bool enabled_cudnnSDP = true;
bool enabled_overrideable = true;
bool enabled_cudnnSDP = false;
#ifdef USE_ROCM
bool benchmark_cudnn = true;
#else
@ -394,11 +385,8 @@ class TORCH_API Context {
? at::LinalgBackend::Cusolver
: at::LinalgBackend::Default;
at::BlasBackend blas_preferred_backend =
#ifdef USE_ROCM
(c10::utils::check_env("TORCH_BLAS_PREFER_HIPBLASLT") != false)
#else
(c10::utils::check_env("TORCH_BLAS_PREFER_CUBLASLT") == true)
#endif
(c10::utils::check_env("TORCH_BLAS_PREFER_CUBLASLT") == true ||
c10::utils::check_env("TORCH_BLAS_PREFER_HIPBLASLT") == true)
? at::BlasBackend::Cublaslt
: at::BlasBackend::Cublas;
#ifdef C10_MOBILE
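
The hunk changes how TORCH_BLAS_PREFER_CUBLASLT / TORCH_BLAS_PREFER_HIPBLASLT select the default BLAS backend. The same preference is queryable and settable from Python in recent CUDA builds — a hedged sketch, not part of this diff:

    import torch

    # Query the current default; pass "cublas" or "cublaslt" to override it.
    # This mirrors the environment variables checked in the hunk above.
    print(torch.backends.cuda.preferred_blas_library())
    torch.backends.cuda.preferred_blas_library("cublaslt")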

View File

@ -143,7 +143,7 @@ static Device getATenDevice(const DLDevice& ctx, void* data) {
return at::detail::getXPUHooks().getDeviceFromPtr(data);
default:
TORCH_CHECK(
false, "Unsupported device_type: ", std::to_string(ctx.device_type));
false, "Unsupported device_type: " + c10::to_string(ctx.device_type));
}
}
@ -167,7 +167,7 @@ ScalarType toScalarType(const DLDataType& dtype) {
break;
default:
TORCH_CHECK(
false, "Unsupported kUInt bits ", std::to_string(dtype.bits));
false, "Unsupported kUInt bits " + c10::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLInt:
@ -186,7 +186,7 @@ ScalarType toScalarType(const DLDataType& dtype) {
break;
default:
TORCH_CHECK(
false, "Unsupported kInt bits ", std::to_string(dtype.bits));
false, "Unsupported kInt bits " + c10::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLFloat:
@ -202,7 +202,7 @@ ScalarType toScalarType(const DLDataType& dtype) {
break;
default:
TORCH_CHECK(
false, "Unsupported kFloat bits ", std::to_string(dtype.bits));
false, "Unsupported kFloat bits " + c10::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLBfloat:
@ -212,7 +212,7 @@ ScalarType toScalarType(const DLDataType& dtype) {
break;
default:
TORCH_CHECK(
false, "Unsupported kFloat bits ", std::to_string(dtype.bits));
false, "Unsupported kFloat bits " + c10::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLComplex:
@ -228,7 +228,7 @@ ScalarType toScalarType(const DLDataType& dtype) {
break;
default:
TORCH_CHECK(
false, "Unsupported kFloat bits ", std::to_string(dtype.bits));
false, "Unsupported kFloat bits " + c10::to_string(dtype.bits));
}
break;
case DLDataTypeCode::kDLBool:
@ -238,11 +238,11 @@ ScalarType toScalarType(const DLDataType& dtype) {
break;
default:
TORCH_CHECK(
false, "Unsupported kDLBool bits ", std::to_string(dtype.bits));
false, "Unsupported kDLBool bits " + c10::to_string(dtype.bits));
}
break;
default:
TORCH_CHECK(false, "Unsupported code ", std::to_string(dtype.code));
TORCH_CHECK(false, "Unsupported code " + c10::to_string(dtype.code));
}
return stype;
}
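
toScalarType and fromDLPack implement the C++ half of DLPack interop; from Python the round trip goes through the standard torch.utils.dlpack API, which ends up in this code path:

    import torch
    from torch.utils.dlpack import from_dlpack, to_dlpack

    t = torch.arange(4.0)
    capsule = to_dlpack(t)                # wraps the tensor in a DLManagedTensor
    u = from_dlpack(capsule)              # dispatches into at::fromDLPack
    assert u.data_ptr() == t.data_ptr()   # zero-copy: both views share storage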
@ -298,7 +298,9 @@ Tensor fromDLPack(DLManagedTensor* src) {
return fromDLPack(src, std::move(deleter));
}
Tensor fromDLPack(DLManagedTensor* src, std::function<void(void*)> deleter) {
Tensor fromDLPack(
DLManagedTensor* src,
std::function<void(void*)> deleter) {
Device device = getATenDevice(src->dl_tensor.device, src->dl_tensor.data);
ScalarType stype = toScalarType(src->dl_tensor.dtype);
if (!src->dl_tensor.strides) {

View File

@ -1,37 +1,39 @@
#include <ATen/Context.h>
#include <ATen/DeviceAccelerator.h>
#include <ATen/Context.h>
namespace at {
C10_API std::optional<DeviceType> getAccelerator(bool checked) {
#define DETECT_AND_ASSIGN_ACCELERATOR(device_name) \
if (at::has##device_name()) { \
device_type = k##device_name; \
TORCH_CHECK( \
!is_accelerator_detected, \
"Cannot have ", \
device_type.value(), \
" with other accelerators."); \
is_accelerator_detected = true; \
}
#define CHECK_NO_CUDA \
TORCH_CHECK(!at::hasCUDA(), "Cannot have both CUDA and PrivateUse1");
if (is_privateuse1_backend_registered()) {
// We explicitly allow PrivateUse1 and another device at the same time as we
// use this for testing. Whenever a PrivateUse1 device is registered, use it
// first.
return kPrivateUse1;
}
std::optional<DeviceType> device_type = std::nullopt;
bool is_accelerator_detected = false;
DETECT_AND_ASSIGN_ACCELERATOR(CUDA)
DETECT_AND_ASSIGN_ACCELERATOR(MTIA)
DETECT_AND_ASSIGN_ACCELERATOR(XPU)
if (checked) {
TORCH_CHECK(
device_type, "Cannot access accelerator device when none is available.")
}
return device_type;
#define CHECK_NO_PU1 \
TORCH_CHECK(!is_privateuse1_backend_registered(), "Cannot have both CUDA and PrivateUse1");
#undef DETECT_AND_ASSIGN_ACCELERATOR
#define CHECK_NO_MTIA \
TORCH_CHECK(!at::hasMTIA(), "Cannot have MTIA with other devices");
if (is_privateuse1_backend_registered()) {
// We explicitly allow PrivateUse1 and another device at the same time
// as we use this for testing.
// Whenever a PrivateUse1 device is registered, use it first.
return kPrivateUse1;
} else if (at::hasCUDA()) {
CHECK_NO_PU1
CHECK_NO_MTIA
return kCUDA;
} else if (at::hasMTIA()) {
CHECK_NO_CUDA
CHECK_NO_PU1
return kMTIA;
} else {
TORCH_CHECK(!checked, "Cannot access accelerator device when none is available.")
return std::nullopt;
}
#undef CHECK_NO_CUDA
#undef CHECK_NO_PU1
}
} // namespace at
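
Both sides of this hunk enforce the same invariant: at most one accelerator backend may be present, with PrivateUse1 taking priority when registered. A minimal Python sketch of that rule — has_cuda/has_mtia/has_xpu and privateuse1_registered are hypothetical stand-ins for the C++ predicates:

    # Sketch of the detection rule; the boolean arguments stand in for
    # at::hasCUDA(), at::hasMTIA(), at::hasXPU() and
    # is_privateuse1_backend_registered().
    def get_accelerator(checked, has_cuda, has_mtia, has_xpu, privateuse1_registered):
        if privateuse1_registered:
            return "PrivateUse1"  # explicitly allowed alongside others, used first
        detected = [name for name, present in
                    (("CUDA", has_cuda), ("MTIA", has_mtia), ("XPU", has_xpu))
                    if present]
        if len(detected) > 1:
            raise RuntimeError(f"Cannot have {detected[0]} with other accelerators.")
        if not detected:
            if checked:
                raise RuntimeError("Cannot access accelerator device when none is available.")
            return None
        return detected[0]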

View File

@ -13,9 +13,9 @@
// - It provides a set of common APIs as defined by AcceleratorHooksInterface
//
// As of today, accelerator devices are (in no particular order):
// CUDA, MTIA, XPU, PrivateUse1
// CUDA, MTIA, PrivateUse1
// We want to add once all the proper APIs are supported and tested:
// HIP, MPS
// HIP, MPS, XPU
namespace at {

View File

@ -29,7 +29,6 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) {
return c10::GetCPUAllocator();
}
#ifndef C10_MOBILE
constexpr uint64_t storage_max() {
// int64_t and size_t are used somewhat inconsistently throughout ATen.
// To be safe, storage size calculations must fit in both types.
@ -39,7 +38,6 @@ constexpr uint64_t storage_max() {
std::numeric_limits<size_t>::max());
return std::min(int64_max, size_max);
}
#endif
inline void raise_warning_for_complex_half(ScalarType dtype) {
if (dtype == kComplexHalf) {

View File

@ -462,7 +462,7 @@ inline Tensor _sum_to(
reduce_dims.push_back(i);
}
for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(shape[i - leading_dims], 1)) &&
if (shape[i - leading_dims] == 1 &&
TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(sizes[i], 1))) {
reduce_dims.push_back(i);
}
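
_sum_to reduces a broadcasted result back to a target shape by summing out the leading dimensions plus any dimension that was broadcast up from size 1 — the classic gradient-of-broadcast reduction. A Python sketch under that reading (sum_to is a hypothetical helper mirroring at::_sum_to):

    import torch

    def sum_to(t, shape):
        # Sum out the extra leading dims, then any dim that the target has as 1.
        leading = t.dim() - len(shape)
        dims = list(range(leading))
        dims += [i + leading for i, s in enumerate(shape)
                 if s == 1 and t.size(i + leading) != 1]
        out = t.sum(dim=dims, keepdim=True) if dims else t
        return out.reshape(shape)

    g = torch.ones(2, 3, 4)
    print(sum_to(g, (3, 1)).shape)  # torch.Size([3, 1]) — gradient of a broadcast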

View File

@ -303,7 +303,7 @@ Tensor FunctionalInverses::_nested_view_from_buffer_inverse(const Tensor& base,
return Tensor();
}
Tensor FunctionalInverses::_nested_view_from_jagged_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, const Tensor& offsets, const Tensor& dummy, const std::optional<Tensor>& lengths, int64_t ragged_idx, const c10::optional<Tensor>& min_seqlen, const c10::optional<Tensor>& max_seqlen) {
Tensor FunctionalInverses::_nested_view_from_jagged_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, const Tensor& offsets, const Tensor& dummy, const std::optional<Tensor>& lengths, int64_t ragged_idx) {
auto values = at::_nested_get_values(mutated_view);
if (inverse_return_mode != InverseReturnMode::NeverView) {
return values;
@ -317,12 +317,7 @@ Tensor FunctionalInverses::_nested_get_values_inverse(const Tensor& base, const
auto lengths = at::_nested_get_lengths(base);
auto ragged_idx = at::_nested_get_ragged_idx(base);
auto dummy = at::_nested_get_jagged_dummy(base);
auto min_seqlen = at::_nested_get_min_seqlen(base);
auto max_seqlen = at::_nested_get_max_seqlen(base);
auto nt = at::_nested_view_from_jagged(
mutated_view, offsets, dummy, lengths, ragged_idx,
(min_seqlen.defined() ? c10::optional<Tensor>(min_seqlen) : c10::nullopt),
(max_seqlen.defined() ? c10::optional<Tensor>(max_seqlen) : c10::nullopt));
auto nt = at::_nested_view_from_jagged(mutated_view, offsets, dummy, lengths, ragged_idx);
if (inverse_return_mode != InverseReturnMode::NeverView) {
return nt;
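
The extra arguments thread cached min/max sequence lengths through the jagged-view inverse. Creating such a jagged nested tensor from Python goes through torch.nested with layout=torch.jagged; the seqlen caching itself is internal, so this only shows the public surface:

    import torch

    nt = torch.nested.nested_tensor(
        [torch.randn(2, 8), torch.randn(5, 8)],  # ragged dim: lengths 2 vs 5
        layout=torch.jagged,
    )
    values, offsets = nt.values(), nt.offsets()  # flat buffer + ragged offsets
    print(values.shape, offsets)                 # torch.Size([7, 8]) tensor([0, 2, 7])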

View File

@ -514,9 +514,6 @@ c10::SymInt FunctionalTensorWrapper::sym_size_custom(int64_t d) const {
c10::SymInt FunctionalTensorWrapper::sym_storage_offset_custom() const {
return value_.unsafeGetTensorImpl()->sym_storage_offset();
}
c10::Layout FunctionalTensorWrapper::layout_impl() const {
return value_.unsafeGetTensorImpl()->layout();
}
namespace functionalization {
namespace impl {

View File

@ -222,7 +222,6 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
c10::SymIntArrayRef sym_strides_custom() const override;
c10::SymInt sym_storage_offset_custom() const override;
c10::Device device_custom() const override;
c10::Layout layout_impl() const override;
private:
const char* tensorimpl_type_name() const override;

View File

@ -139,7 +139,7 @@ static void batchedTensorInplaceForLoopFallback(const c10::OperatorHandle& op, t
if (self_vmap_levels != (self_vmap_levels | other_vmap_levels)) {
// Find one vmap level to complain about
auto additional_bdims = (self_vmap_levels | other_vmap_levels) ^ self_vmap_levels;
[[maybe_unused]] auto offending_level = llvm::findLastSet(additional_bdims.to_ulong());
auto offending_level = llvm::findLastSet(additional_bdims.to_ulong());
// The following prints out "vmap: aten::add_(tensor, ...) is not possible",
// but it would be better to print out "tensor.add_(...) is not possible".
// Afaict there's no official way to get the add_ and there is no way to
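
The error path above fires when an in-place op writes into a tensor carrying fewer vmap levels than its argument. torch.func.vmap surfaces an analogous failure — hedged, since this exercises the functorch batching rules rather than this exact legacy fallback:

    import torch
    from torch.func import vmap

    acc = torch.zeros(4)
    xs = torch.randn(3, 4)
    try:
        # Writing a batched value into an unbatched tensor cannot be expressed
        # in-place, so vmap raises instead of silently looping.
        vmap(lambda x: acc.add_(x))(xs)
    except RuntimeError as e:
        print(e)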

View File

@ -55,10 +55,6 @@ class TORCH_API MapAllocator {
return base_ptr_;
}
int flags() const {
return flags_;
}
static MapAllocator* fromDataPtr(const at::DataPtr&);
static at::DataPtr makeDataPtr(
c10::string_view filename,

View File

@ -19,13 +19,7 @@ MemOverlap has_internal_overlap(TensorImpl* t) {
auto strides = t->sym_strides();
auto sizes = t->sym_sizes();
for (const auto i : c10::irange(strides.size())) {
// NB: The size oblivious test is written very carefully here. When
// unbacked SymInts are involved, we should try to conservatively report
// if memory overlap /could/ happen under some setting of unbacked
// SymInts. Thus, if I have u0 size, we should assume that this has > 1
// elements (first expression), but if I have a u0 stride, I should NOT
// assume that it is not zero (second expression)
if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[i].sym_gt(1)) && strides[i] == 0) {
if (strides[i] == 0 && sizes[i] > 1) {
return MemOverlap::Yes;
}
}
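
The size-oblivious rewrite keeps the classic detection rule: a zero stride paired with a size greater than 1 means the tensor self-overlaps in memory. Observable from Python via a private debug helper (an internal API, shown here only for illustration):

    import torch

    x = torch.randn(1, 3).expand(4, 3)  # stride (0, 1), size (4, 3)
    # Private helper returning 0 (No), 1 (Yes) or 2 (TooHard).
    print(torch._debug_has_internal_overlap(x))  # 1: all rows alias the same memory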

View File

@ -8,14 +8,12 @@
namespace at {
#ifndef STRIP_ERROR_MESSAGES
// Returns "Tensor['N', 'C', 'H', 'W']" for a tensor with names ('N', 'C', 'H', 'W').
static std::string toDimnameRepr(const Tensor& tensor) {
std::ostringstream os;
os << "Tensor" << tensor.names();
return os.str();
}
#endif
int64_t dimname_to_position(const Tensor& tensor, Dimname dim) {
TORCH_CHECK(dim.type() != NameType::WILDCARD,

View File

@ -29,7 +29,6 @@ const char* get_env_var(
return value ? value : def_value;
}
#ifndef C10_MOBILE
size_t get_env_num_threads(const char* var_name, size_t def_value = 0) {
try {
if (auto* value = std::getenv(var_name)) {
@ -44,7 +43,6 @@ size_t get_env_num_threads(const char* var_name, size_t def_value = 0) {
}
return def_value;
}
#endif
} // namespace

View File

@ -35,12 +35,6 @@ void SavedTensorDefaultHooks::enable() {
tls.disabled_error_message = c10::nullopt;
}
/* static */ bool SavedTensorDefaultHooks::set_tracing(bool is_tracing) {
bool prior = tls.is_tracing;
tls.is_tracing = is_tracing;
return prior;
}
const std::optional<std::string>& SavedTensorDefaultHooks::get_disabled_error_message() {
return tls.disabled_error_message;
}
@ -65,20 +59,25 @@ void SavedTensorDefaultHooks::push_hooks(PyObject* pack_hook, PyObject* unpack_h
tls.stack.emplace(pack_hook, unpack_hook);
}
std::pair<PyObject*, PyObject*> SavedTensorDefaultHooks::pop_hooks() {
void SavedTensorDefaultHooks::pop_hooks() {
// Reference counting is handled by the caller of `pop_hooks`
TORCH_INTERNAL_ASSERT(is_initialized && !tls.stack.empty());
std::pair<PyObject*, PyObject*> hooks = tls.stack.top();
tls.stack.pop();
return hooks;
}
std::pair<PyObject*, PyObject*> SavedTensorDefaultHooks::get_hooks() {
// For tls.is_tracing, see NOTE: [Deferring tensor pack/unpack hooks until runtime]
if (!is_initialized || tls.stack.empty() || tls.is_tracing) {
if (!is_initialized || tls.stack.empty()) {
return std::make_pair(nullptr, nullptr);
}
return tls.stack.top();
}
std::stack<std::pair<PyObject*, PyObject*>> SavedTensorDefaultHooks::get_stack() {
return tls.stack;
}
void SavedTensorDefaultHooks::set_stack(std::stack<std::pair<PyObject*, PyObject*>> stack_) {
tls.stack = std::move(stack_);
}
}

View File

@ -22,18 +22,17 @@ struct TORCH_API SavedTensorDefaultHooksTLS {
// We did this for efficiency (so we didn't have to keep a separate bool
// around)
std::optional<std::string> disabled_error_message;
// See NOTE: [Deferring tensor pack/unpack hooks until runtime]
bool is_tracing = false;
};
} // namespace impl
struct TORCH_API SavedTensorDefaultHooks {
static void push_hooks(PyObject* pack_hook, PyObject* unpack_hook);
static std::pair<PyObject*, PyObject*> pop_hooks();
static void pop_hooks();
static std::pair<PyObject*, PyObject*> get_hooks();
static void lazy_initialize();
static std::stack<std::pair<PyObject*, PyObject*>> get_stack();
static void set_stack(std::stack<std::pair<PyObject*, PyObject*>>);
static const impl::SavedTensorDefaultHooksTLS& get_tls_state();
static void set_tls_state(const impl::SavedTensorDefaultHooksTLS& tls);
@ -43,20 +42,11 @@ struct TORCH_API SavedTensorDefaultHooks {
// hooks, especially if their feature does not work with it. If they are
// disabled, then the following will raise an error:
// - Attempting to push_hooks
// - calling disable(message) with a non-zero stack (hooks) size
// - calling disable(message) with a non-zero stack (from get_stack) size
static void disable(const std::string& error_message);
static void enable();
static bool is_enabled();
static const std::optional<std::string>& get_disabled_error_message();
// NOTE: [Deferring tensor pack/unpack hooks until runtime]
// To preserve eager semantics of pack/unpack hooks firing only once per saved
// variable, Dynamo/AOTAutograd need to defer hook firing until runtime. Using
// disable() would raise a loud error at trace time, and pushing a no-op hook would
// fail when the traced code is wrapped in a disable_saved_tensors_hooks ctx.
// To do so, we disable these hooks during tracing. See
// https://github.com/pytorch/pytorch/issues/113263.
static bool set_tracing(bool is_tracing);
};
} // namespace at
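
The pack/unpack stack managed here is what the public context manager pushes and pops; the tracing deferral described in the NOTE is Dynamo-internal. Eager-mode usage of the public API:

    import torch

    def pack(t):    # runs once per saved tensor, at save time
        return t.to("cpu")

    def unpack(t):  # runs when backward actually needs the value
        return t

    x = torch.randn(3, requires_grad=True)
    with torch.autograd.graph.saved_tensors_hooks(pack, unpack):
        y = (x * x).sum()
    y.backward()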

View File

@ -140,7 +140,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
"), but got ",
size.size());
if (nnz() > 0) {
[[maybe_unused]] auto constexpr alt_options_msg =
auto alt_options_msg =
"You could try the following options:\n\
1. If you need an empty sparse tensor of this size, call `x = torch.sparse_coo_tensor(size)`.\n\
2. If you need to resize this tensor, you have the following options:\n\

View File

@ -197,7 +197,7 @@ TORCH_API std::ostream& operator<<(
const std::vector<TensorIndex>& tensor_indices);
namespace impl {
inline Tensor applySlice(
static inline Tensor applySlice(
const Tensor& self,
int64_t dim,
c10::SymInt start,
@ -227,7 +227,7 @@ inline Tensor applySlice(
dim, std::move(start), std::move(stop), std::move(step));
}
inline Tensor applySelect(
static inline Tensor applySelect(
const Tensor& self,
int64_t dim,
SymInt index,
@ -266,7 +266,9 @@ inline Tensor applySelect(
return self.select_symint(dim, std::move(index));
}
inline Tensor boolToIndexingTensorCPUOrCUDA(const Tensor& self, bool value) {
static inline Tensor boolToIndexingTensorCPUOrCUDA(
const Tensor& self,
bool value) {
// booleans add a dimension of size 1. true indexes this dimension as if 0:,
// false as empty.
if (value) {
@ -276,7 +278,7 @@ inline Tensor boolToIndexingTensorCPUOrCUDA(const Tensor& self, bool value) {
}
}
inline Tensor boolToIndexingTensorNonNativeDeviceType(
static inline Tensor boolToIndexingTensorNonNativeDeviceType(
const Tensor& self,
bool value) {
// booleans add a dimension of size 1. true indexes this dimension as if 0:,
@ -288,7 +290,7 @@ inline Tensor boolToIndexingTensorNonNativeDeviceType(
}
}
inline Tensor boolToIndexingTensor(
static inline Tensor boolToIndexingTensor(
const Tensor& self,
bool value,
const at::Device& self_device) {
@ -299,13 +301,13 @@ inline Tensor boolToIndexingTensor(
}
}
inline Tensor scalarToTensorNonNativeDeviceType(
static inline Tensor scalarToTensorNonNativeDeviceType(
const Scalar& v,
const TensorOptions& options) {
return at::scalar_tensor(v, options);
}
inline void recordTensorIndex(
static inline void recordTensorIndex(
const Tensor& tensor,
std::vector<Tensor>& outIndices,
int64_t* dim_ptr) {
@ -315,7 +317,7 @@ inline void recordTensorIndex(
(*dim_ptr)++;
};
inline c10::List<::std::optional<Tensor>> typeConvertIndices(
static inline c10::List<::std::optional<Tensor>> typeConvertIndices(
const Tensor& /*self*/,
std::vector<Tensor>&& indices) {
c10::List<::std::optional<Tensor>> converted_inds;
@ -336,7 +338,7 @@ inline c10::List<::std::optional<Tensor>> typeConvertIndices(
// construct a `std::vector` container to be consumed by the C++
// `count_specified_dimensions` function, which adds 100s of nanoseconds
// overhead and is undesirable.
inline int64_t count_specified_dimensions(
static inline int64_t count_specified_dimensions(
const ArrayRef<TensorIndex>& indices) {
// Count the number of indexed dimensions (everything but ellipsis and None)
int64_t count = 0;
@ -370,7 +372,7 @@ inline int64_t count_specified_dimensions(
//
// The rest of the functions are in `at::indexing::impl` namespace, signifying
// that they shouldn't be used from Python indexing implementation.
inline Tensor scalarToTensor(
static inline Tensor scalarToTensor(
const Scalar& v,
const TensorOptions& options,
const at::Device& self_device) {
@ -385,7 +387,7 @@ inline Tensor scalarToTensor(
// To match numpy semantics:
// As a special case for backwards compatibility,
// strip away unit dimensions from the left of 'src'
inline SymIntArrayRef slicePrefix1sSize(const SymIntArrayRef& sizes) {
static inline SymIntArrayRef slicePrefix1sSize(const SymIntArrayRef& sizes) {
size_t first_non1_src = sizes.size();
for (const auto i : c10::irange(sizes.size())) {
// Unbacked SymInt has different behavior, but this is sound because
@ -400,7 +402,7 @@ inline SymIntArrayRef slicePrefix1sSize(const SymIntArrayRef& sizes) {
return sizes.slice(first_non1_src);
}
inline void copy_to(const Tensor& dst, const Tensor& src) {
static inline void copy_to(const Tensor& dst, const Tensor& src) {
if (dst.sym_sizes().equals(src.sym_sizes())) {
// A shortcut to avoid generating hard-coded constant sizes during tracing.
// This is not a perfect solution: when src & dst have different shapes,
@ -419,7 +421,7 @@ inline void copy_to(const Tensor& dst, const Tensor& src) {
// See NOTE [ Setting `disable_slice_optimization` when calling C++ tensor
// indexing functions from Python ]
inline Tensor handleDimInMultiDimIndexing(
static inline Tensor handleDimInMultiDimIndexing(
const Tensor& prev_dim_result,
const Tensor& original_tensor,
const TensorIndex& index,
@ -507,7 +509,7 @@ inline Tensor handleDimInMultiDimIndexing(
namespace impl {
// This mirrors `applySlicing` in
// torch/csrc/autograd/python_variable_indexing.cpp
inline Tensor applySlicing(
static inline Tensor applySlicing(
const Tensor& self,
const ArrayRef<TensorIndex>& indices,
std::vector<Tensor>& outIndices,
@ -548,13 +550,13 @@ inline Tensor applySlicing(
}
} // namespace impl
inline Tensor dispatch_index(
static inline Tensor dispatch_index(
const Tensor& self,
std::vector<Tensor>&& indices) {
return self.index(impl::typeConvertIndices(self, std::move(indices)));
}
inline Tensor dispatch_index_put_(
static inline Tensor dispatch_index_put_(
Tensor& self,
std::vector<Tensor>&& indices,
const Tensor& value) {
@ -596,7 +598,7 @@ inline Tensor dispatch_index_put_(
// torch/csrc/autograd/python_variable_indexing.cpp See NOTE [ Setting
// `disable_slice_optimization` when calling C++ tensor indexing functions from
// Python ]
inline Tensor get_item(
static inline Tensor get_item(
const Tensor& self,
const ArrayRef<TensorIndex>& indices,
bool disable_slice_optimization = false) {
@ -662,7 +664,7 @@ inline Tensor get_item(
// torch/csrc/autograd/python_variable_indexing.cpp for "the assigned value is a
// Tensor" case See NOTE [ Setting `disable_slice_optimization` when calling C++
// tensor indexing functions from Python ]
inline void set_item(
static inline void set_item(
const Tensor& self,
const ArrayRef<TensorIndex>& indices,
const Tensor& value,

Some files were not shown because too many files have changed in this diff.