[dynamo] Graph break on random_ op

Fixes https://github.com/pytorch/pytorch/issues/121621 ghstack-source-id: 098b44305ae2aaab334e6973f6e94f937e61f9a0 Pull Request resolved: https://github.com/pytorch/pytorch/pull/130222
[export] Update example inputs format for DB. (#129982)
2025-10-23 14:59:34 +08:00 · 2024-07-07 15:29:48 -07:00 · 2024-07-03 17:53:15 +00:00 · 2024-07-03 17:24:08 +00:00 · 2024-07-03 17:20:19 +00:00 · 2024-07-03 17:08:45 +00:00
1775 changed files with 75198 additions and 50697 deletions
--- a/.ci/docker/aotriton_version.txt
+++ b/.ci/docker/aotriton_version.txt
@ -1,5 +1,5 @@
 0.6b
 manylinux_2_17
-rocm6
-04b5df8c8123f90cba3ede7e971e6fbc6040d506
-3db6ecbc915893ff967abd6e1b43bd5f54949868873be60dc802086c3863e648
+rocm6.1
+7f07e8a1cb1f99627eb6d77f5c0e9295c775f3c7
+77c29fa3f3b614e187d7213d745e989a92708cee2bc6020419ab49019af399d1
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -373,6 +373,13 @@ case "$image" in
    CONDA_CMAKE=yes
    EXECUTORCH=yes
    ;;
+  pytorch-linux-jammy-py3.12-halide)
+    CUDA_VERSION=12.4
+    ANACONDA_PYTHON_VERSION=3.12
+    GCC_VERSION=11
+    CONDA_CMAKE=yes
+    HALIDE=yes
+    ;;
  pytorch-linux-focal-linter)
    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
    # We will need to update mypy version eventually, but that's for another day. The task
@ -490,6 +497,7 @@ docker build \
       --build-arg "DOCS=${DOCS}" \
       --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \
       --build-arg "EXECUTORCH=${EXECUTORCH}" \
+       --build-arg "HALIDE=${HALIDE}" \
       --build-arg "XPU_VERSION=${XPU_VERSION}" \
       --build-arg "ACL=${ACL:-}" \
       --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
--- a/.ci/docker/ci_commit_pins/executorch.txt
+++ b/.ci/docker/ci_commit_pins/executorch.txt
@ -1 +1 @@
-d4b3e5cc607e97afdba79dc90f8ef968142f347c
+c572f9e509b5ec5d56f4d218271e36269bba244f
--- a/.ci/docker/ci_commit_pins/halide.txt
+++ b/.ci/docker/ci_commit_pins/halide.txt
@ -0,0 +1 @@
+340136fec6d3ebc73e7a19eba1663e9b0ba8ab2d
--- a/.ci/docker/ci_commit_pins/triton-rocm.txt
+++ b/.ci/docker/ci_commit_pins/triton-rocm.txt
@ -1 +1 @@
-01cbe5045a6898c9a925f01435c8277b2fe6afcc
+21eae954efa5bf584da70324b640288c3ee7aede
--- a/.ci/docker/ci_commit_pins/triton-xpu.txt
+++ b/.ci/docker/ci_commit_pins/triton-xpu.txt
@ -1 +1 @@
-b8c64f64c18d8cac598b3adb355c21e7439c21de
+1b2f15840e0d70eec50d84c7a0575cb835524def
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-45fff310c891f5a92d55445adf8cc9d29df5841e
+dedb7bdf339a3546896d4820366ca562c586bfa0
--- a/.ci/docker/common/install_aotriton.sh
+++ b/.ci/docker/common/install_aotriton.sh
@ -9,7 +9,7 @@ TARBALL='aotriton.tar.bz2'
 read -d "\n" VER MANYLINUX ROCMBASE PINNED_COMMIT SHA256 < aotriton_version.txt || true
 ARCH=$(uname -m)
 AOTRITON_INSTALL_PREFIX="$1"
-AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}.tar.bz2"
+AOTRITON_URL="https://github.com/ROCm/aotriton/releases/download/${VER}/aotriton-${VER}-${MANYLINUX}_${ARCH}-${ROCMBASE}-shared.tar.bz2"

 cd "${AOTRITON_INSTALL_PREFIX}"
 # Must use -L to follow redirects
--- a/.ci/docker/common/install_executorch.sh
+++ b/.ci/docker/common/install_executorch.sh
@ -37,6 +37,9 @@ install_conda_dependencies() {

 install_pip_dependencies() {
  pushd executorch/.ci/docker
+  # Install PyTorch CPU build beforehand to avoid installing the much bigger CUDA
+  # binaries later, ExecuTorch only needs CPU
+  pip_install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
  # Install all Python dependencies
  pip_install -r requirements-ci.txt
  popd
@ -44,13 +47,14 @@ install_pip_dependencies() {

 setup_executorch() {
  pushd executorch
-  source .ci/scripts/utils.sh
+  # Setup swiftshader and Vulkan SDK which are required to build the Vulkan delegate
+  as_jenkins bash .ci/scripts/setup-vulkan-linux-deps.sh

-  install_flatc_from_source
-  pip_install .
+  export PYTHON_EXECUTABLE=python
+  export EXECUTORCH_BUILD_PYBIND=ON
+  export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"

-  # Make sure that all the newly generate files are owned by Jenkins
-  chown -R jenkins .
+  as_jenkins .ci/scripts/setup-linux.sh cmake
  popd
 }

--- a/.ci/docker/common/install_halide.sh
+++ b/.ci/docker/common/install_halide.sh
@ -0,0 +1,46 @@
+#!/bin/bash
+set -ex
+
+source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
+
+COMMIT=$(get_pinned_commit halide)
+test -n "$COMMIT"
+
+# activate conda to populate CONDA_PREFIX
+test -n "$ANACONDA_PYTHON_VERSION"
+eval "$(conda shell.bash hook)"
+conda activate py_$ANACONDA_PYTHON_VERSION
+
+if [ -n "${UBUNTU_VERSION}" ];then
+    apt update
+    apt-get install -y lld liblld-15-dev libpng-dev libjpeg-dev libgl-dev \
+                  libopenblas-dev libeigen3-dev libatlas-base-dev libzstd-dev
+fi
+
+conda_install numpy scipy imageio cmake ninja
+
+git clone --depth 1 --branch release/16.x --recursive https://github.com/llvm/llvm-project.git
+cmake -DCMAKE_BUILD_TYPE=Release \
+        -DLLVM_ENABLE_PROJECTS="clang" \
+        -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" \
+        -DLLVM_ENABLE_TERMINFO=OFF -DLLVM_ENABLE_ASSERTIONS=ON \
+        -DLLVM_ENABLE_EH=ON -DLLVM_ENABLE_RTTI=ON -DLLVM_BUILD_32_BITS=OFF \
+        -S llvm-project/llvm -B llvm-build -G Ninja
+cmake --build llvm-build
+cmake --install llvm-build --prefix llvm-install
+export LLVM_ROOT=`pwd`/llvm-install
+export LLVM_CONFIG=$LLVM_ROOT/bin/llvm-config
+
+git clone https://github.com/halide/Halide.git
+pushd Halide
+git checkout ${COMMIT} && git submodule update --init --recursive
+pip_install -r requirements.txt
+cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -S . -B build
+cmake --build build
+test -e ${CONDA_PREFIX}/lib/python3 || ln -s python${ANACONDA_PYTHON_VERSION} ${CONDA_PREFIX}/lib/python3
+cmake --install build --prefix ${CONDA_PREFIX}
+chown -R jenkins ${CONDA_PREFIX}
+popd
+rm -rf Halide llvm-build llvm-project llvm-install
+
+python -c "import halide"  # check for errors
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -33,7 +33,9 @@ pip_install coloredlogs packaging
 pip_install onnxruntime==1.18
 pip_install onnx==1.16.0
 # pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps
-pip_install onnxscript==0.1.0.dev20240523 --no-deps
+pip_install onnxscript==0.1.0.dev20240613 --no-deps
+# required by onnxscript
+pip_install ml_dtypes

 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
 # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -85,10 +85,10 @@ librosa>=0.6.2 ; python_version < "3.11"
 #Pinned versions:
 #test that import:

-mypy==1.9.0
+mypy==1.10.0
 # Pin MyPy version because new errors are likely to appear with each release
 #Description: linter
-#Pinned versions: 1.9.0
+#Pinned versions: 1.10.0
 #test that import: test_typing.py, test_type_hints.py

 networkx==2.8.8
@ -306,7 +306,7 @@ pywavelets==1.5.0 ; python_version >= "3.12"
 #Pinned versions: 1.4.1
 #test that import:

-lxml==5.0.0.
+lxml==5.0.0
 #Description: This is a requirement of unittest-xml-reporting

 # Python-3.9 binaries
--- a/.ci/docker/ubuntu-cuda/Dockerfile
+++ b/.ci/docker/ubuntu-cuda/Dockerfile
@ -103,6 +103,14 @@ COPY triton_version.txt triton_version.txt
 RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi
 RUN rm install_triton.sh common_utils.sh triton.txt triton_version.txt

+ARG HALIDE
+# Build and install halide
+COPY ./common/install_halide.sh install_halide.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/halide.txt halide.txt
+RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
+RUN rm install_halide.sh common_utils.sh halide.txt
+
 # Install ccache/sccache (do this last, so we get priority in PATH)
 COPY ./common/install_cache.sh install_cache.sh
 ENV PATH /opt/cache/bin:$PATH
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -155,6 +155,14 @@ COPY ci_commit_pins/executorch.txt executorch.txt
 RUN if [ -n "${EXECUTORCH}" ]; then bash ./install_executorch.sh; fi
 RUN rm install_executorch.sh common_utils.sh executorch.txt

+ARG HALIDE
+# Build and install halide
+COPY ./common/install_halide.sh install_halide.sh
+COPY ./common/common_utils.sh common_utils.sh
+COPY ci_commit_pins/halide.txt halide.txt
+RUN if [ -n "${HALIDE}" ]; then bash ./install_halide.sh; fi
+RUN rm install_halide.sh common_utils.sh halide.txt
+
 ARG ONNX
 # Install ONNX dependencies
 COPY ./common/install_onnx.sh ./common/common_utils.sh ./
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -284,12 +284,26 @@ else
        # Which should be backward compatible with Numpy-1.X
        python -mpip install --pre numpy==2.0.0rc1
      fi
-      WERROR=1 python setup.py bdist_wheel
+
+      WERROR=1 python setup.py clean
+
+      if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+        BUILD_LIBTORCH_WHL=1 BUILD_PYTHON_ONLY=0 python setup.py bdist_wheel
+        BUILD_LIBTORCH_WHL=0 BUILD_PYTHON_ONLY=1 python setup.py bdist_wheel --cmake
+      else
+        WERROR=1 python setup.py bdist_wheel
+      fi
    else
+      python setup.py clean
      if [[ "$BUILD_ENVIRONMENT" == *xla* ]]; then
        source .ci/pytorch/install_cache_xla.sh
      fi
-      python setup.py bdist_wheel
+      if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+        echo "USE_SPLIT_BUILD cannot be used with xla or rocm"
+        exit 1
+      else
+        python setup.py bdist_wheel
+      fi
    fi
    pip_install_whl "$(echo dist/*.whl)"

@ -328,9 +342,10 @@ else
    CUSTOM_OP_TEST="$PWD/test/custom_operator"
    python --version
    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
+
    mkdir -p "$CUSTOM_OP_BUILD"
    pushd "$CUSTOM_OP_BUILD"
-    cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
+    cmake "$CUSTOM_OP_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
@ -343,7 +358,7 @@ else
    SITE_PACKAGES="$(python -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
    mkdir -p "$JIT_HOOK_BUILD"
    pushd "$JIT_HOOK_BUILD"
-    cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
+    cmake "$JIT_HOOK_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
@ -355,7 +370,7 @@ else
    python --version
    mkdir -p "$CUSTOM_BACKEND_BUILD"
    pushd "$CUSTOM_BACKEND_BUILD"
-    cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch" -DPython_EXECUTABLE="$(which python)" \
+    cmake "$CUSTOM_BACKEND_TEST" -DCMAKE_PREFIX_PATH="$SITE_PACKAGES/torch;$SITE_PACKAGES" -DPython_EXECUTABLE="$(which python)" \
          -DCMAKE_MODULE_PATH="$CUSTOM_TEST_MODULE_PATH" -DUSE_ROCM="$CUSTOM_TEST_USE_ROCM"
    make VERBOSE=1
    popd
--- a/.ci/pytorch/common_utils.sh
+++ b/.ci/pytorch/common_utils.sh
@ -56,9 +56,29 @@ function assert_git_not_dirty() {
 function pip_install_whl() {
  # This is used to install PyTorch and other build artifacts wheel locally
  # without using any network connection
-  python3 -mpip install --no-index --no-deps "$@"
+
+  # Convert the input arguments into an array
+  local args=("$@")
+
+  # Check if the first argument contains multiple paths separated by spaces
+  if [[ "${args[0]}" == *" "* ]]; then
+    # Split the string by spaces into an array
+    IFS=' ' read -r -a paths <<< "${args[0]}"
+    # Loop through each path and install individually
+    for path in "${paths[@]}"; do
+      echo "Installing $path"
+      python3 -mpip install --no-index --no-deps "$path"
+    done
+  else
+    # Loop through each argument and install individually
+    for path in "${args[@]}"; do
+      echo "Installing $path"
+      python3 -mpip install --no-index --no-deps "$path"
+    done
+  fi
 }

+
 function pip_install() {
  # retry 3 times
  # old versions of pip don't have the "--progress-bar" flag
@ -188,28 +208,6 @@ function clone_pytorch_xla() {
  fi
 }

-function checkout_install_torchdeploy() {
-  local commit
-  commit=$(get_pinned_commit multipy)
-  pushd ..
-  git clone --recurse-submodules https://github.com/pytorch/multipy.git
-  pushd multipy
-  git checkout "${commit}"
-  python multipy/runtime/example/generate_examples.py
-  BUILD_CUDA_TESTS=1 pip install -e .
-  popd
-  popd
-}
-
-function test_torch_deploy(){
- pushd ..
- pushd multipy
- ./multipy/runtime/build/test_deploy
- ./multipy/runtime/build/test_deploy_gpu
- popd
- popd
-}
-
 function checkout_install_torchbench() {
  local commit
  commit=$(get_pinned_commit torchbench)
@ -224,6 +222,8 @@ function checkout_install_torchbench() {
    # to install and test other models
    python install.py --continue_on_fail
  fi
+  echo "Print all dependencies after TorchBench is installed"
+  python -mpip freeze
  popd
 }

--- a/.ci/pytorch/multigpu-test.sh
+++ b/.ci/pytorch/multigpu-test.sh
@ -18,8 +18,8 @@ time python test/run_test.py --verbose -i distributed/test_c10d_gloo
 time python test/run_test.py --verbose -i distributed/test_c10d_nccl
 time python test/run_test.py --verbose -i distributed/test_c10d_spawn_gloo
 time python test/run_test.py --verbose -i distributed/test_c10d_spawn_nccl
-time python test/run_test.py --verbose -i distributed/test_cuda_p2p
 time python test/run_test.py --verbose -i distributed/test_store
+time python test/run_test.py --verbose -i distributed/test_symmetric_memory
 time python test/run_test.py --verbose -i distributed/test_pg_wrapper
 time python test/run_test.py --verbose -i distributed/rpc/cuda/test_tensorpipe_agent
 # FSDP tests
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -264,18 +264,6 @@ elif [[ $TEST_CONFIG == 'nogpu_AVX512' ]]; then
  export ATEN_CPU_CAPABILITY=avx2
 fi

-# temp workarounds for https://github.com/pytorch/pytorch/issues/126692, remove when fixed
-if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then
-  pushd test
-  CUDA_VERSION=$(python -c "import torch; print(torch.version.cuda)")
-  if [ "$CUDA_VERSION" == "12.4" ]; then
-    ISCUDA124="cu124"
-  else
-    ISCUDA124=""
-  fi
-  popd
-fi
-
 test_python_legacy_jit() {
  time python test/run_test.py --include test_jit_legacy test_jit_fuser_legacy --verbose
  assert_git_not_dirty
@ -289,6 +277,9 @@ test_python_shard() {

  # Bare --include flag is not supported and quoting for lint ends up with flag not being interpreted correctly
  # shellcheck disable=SC2086
+
+  # modify LD_LIBRARY_PATH to ensure it has the conda env.
+  # This set of tests has been shown to be buggy without it for the split-build
  time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION

  assert_git_not_dirty
@ -347,17 +338,31 @@ test_inductor_distributed() {
  assert_git_not_dirty
 }

-test_inductor() {
-  python tools/dynamo/verify_dynamo.py
-  python test/run_test.py --inductor --include test_modules test_ops test_ops_gradients test_torch --verbose
-  # Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state
-  python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor --verbose
+test_inductor_shard() {
+  if [[ -z "$NUM_TEST_SHARDS" ]]; then
+    echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
+    exit 1
+  fi

+  python tools/dynamo/verify_dynamo.py
+  python test/run_test.py --inductor \
+    --include test_modules test_ops test_ops_gradients test_torch \
+    --shard "$1" "$NUM_TEST_SHARDS" \
+    --verbose
+
+  # Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state
+  python test/run_test.py \
+    --include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor \
+    --shard "$1" "$NUM_TEST_SHARDS" \
+    --verbose
+}
+
+test_inductor_aoti() {
  # docker build uses bdist_wheel which does not work with test_aot_inductor
  # TODO: need a faster way to build
  if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then
-      BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
-      CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
+    BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
+    CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference
  fi
 }

@ -376,7 +381,7 @@ test_inductor_cpp_wrapper_abi_compatible() {
    --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
  python benchmarks/dynamo/check_accuracy.py \
    --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
-    --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_timm_training.csv"
+    --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_timm_training.csv"
 }

 # "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@ -401,7 +406,7 @@ if [[ "${TEST_CONFIG}" == *dynamic* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--dynamic-shapes --dynamic-batch-only)
 fi

-if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
+if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
  DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
 else
  DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
@ -526,9 +531,10 @@ test_single_dynamo_benchmark() {
    test_perf_for_dashboard "$suite" \
      "${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}"
  else
-    if [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
+    if [[ "${TEST_CONFIG}" == *aot_inductor* && "${TEST_CONFIG}" != *cpu_aot_inductor* ]]; then
      # Test AOTInductor with the ABI-compatible mode on CI
      # This can be removed once the ABI-compatible mode becomes default.
+      # For CPU device, we prefer non ABI-compatible mode on CI when testing AOTInductor.
      export TORCHINDUCTOR_ABI_COMPATIBLE=1
    fi
    python "benchmarks/dynamo/$suite.py" \
@ -538,10 +544,10 @@ test_single_dynamo_benchmark() {
      --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
    python benchmarks/dynamo/check_accuracy.py \
      --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/${TEST_CONFIG}_${name}.csv"
+      --expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
    python benchmarks/dynamo/check_graph_breaks.py \
      --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/${TEST_CONFIG}_${name}.csv"
+      --expected "benchmarks/dynamo/ci_expected_accuracy/${TEST_CONFIG}_${name}.csv"
  fi
 }

@ -550,6 +556,11 @@ test_inductor_micro_benchmark() {
  python benchmarks/gpt_fast/benchmark.py --output "${TEST_REPORTS_DIR}/gpt_fast_benchmark.csv"
 }

+test_inductor_halide() {
+  python test/run_test.py --include inductor/test_halide.py --verbose
+  assert_git_not_dirty
+}
+
 test_dynamo_benchmark() {
  # Usage: test_dynamo_benchmark huggingface 0
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
@ -564,11 +575,15 @@ test_dynamo_benchmark() {
  elif [[ "${TEST_CONFIG}" == *perf* ]]; then
    test_single_dynamo_benchmark "dashboard" "$suite" "$shard_id" "$@"
  else
-    if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
+    if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
+      local dt="float32"
+      if [[ "${TEST_CONFIG}" == *amp* ]]; then
+        dt="amp"
+      fi
      if [[ "${TEST_CONFIG}" == *freezing* ]]; then
-        test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 --freezing "$@"
+        test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" --freezing "$@"
      else
-        test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --float32 "$@"
+        test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --"$dt" "$@"
      fi
    elif [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then
      test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --inference --bfloat16 "$@"
@ -592,7 +607,7 @@ test_inductor_torchbench_smoketest_perf() {
    --bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
  python benchmarks/dynamo/check_accuracy.py \
    --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
-    --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_torchbench_inference.csv"
+    --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_torchbench_inference.csv"

  python benchmarks/dynamo/torchbench.py --device cuda --performance --backend inductor --float16 --training \
    --batch-size-file "$(realpath benchmarks/dynamo/torchbench_models_list.txt)" --only hf_Bert \
@ -607,13 +622,8 @@ test_inductor_torchbench_smoketest_perf() {
  # https://github.com/pytorch/pytorch/actions/runs/7158691360/job/19491437314,
  # and thus we lower its threshold to reduce flakiness. If this continues to be a problem,
  # we switch to use some other model.
-  # Use 4.7 for cuda 12.4, change back to 4.9 after fixing https://github.com/pytorch/pytorch/issues/126692
-  if [ "$CUDA_VERSION" == "12.4" ]; then
-    THRESHOLD=4.7
-  else
-    THRESHOLD=4.9
-  fi
-  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t $THRESHOLD
+  # lowering threshold from 4.9 to 4.7 for cu124. Will bump it up after cuda 12.4.0->12.4.1 update
+  python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" -t 4.7

  # Check memory compression ratio for a few models
  for test in hf_Albert timm_vision_transformer; do
@ -632,7 +642,7 @@ test_inductor_torchbench_smoketest_perf() {
      --only $test --output "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv"
    python benchmarks/dynamo/check_accuracy.py \
      --actual "$TEST_REPORTS_DIR/inductor_warm_start_smoketest_$test.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/${ISCUDA124}/inductor_huggingface_training.csv"
+      --expected "benchmarks/dynamo/ci_expected_accuracy/inductor_huggingface_training.csv"
  done
 }

@ -1169,15 +1179,21 @@ test_executorch() {

  pushd /executorch

-  # NB: We need to build ExecuTorch runner here and not inside the Docker image
-  # because it depends on PyTorch
+  export PYTHON_EXECUTABLE=python
+  export EXECUTORCH_BUILD_PYBIND=ON
+  export CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON"
+
+  # NB: We need to rebuild ExecuTorch runner here because it depends on PyTorch
+  # from the PR
  # shellcheck disable=SC1091
-  source .ci/scripts/utils.sh
-  build_executorch_runner "cmake"
+  source .ci/scripts/setup-linux.sh cmake
+
+  echo "Run ExecuTorch unit tests"
+  pytest -v -n auto
+  # shellcheck disable=SC1091
+  LLVM_PROFDATA=llvm-profdata-12 LLVM_COV=llvm-cov-12 bash test/run_oss_cpp_tests.sh

  echo "Run ExecuTorch regression tests for some models"
-  # NB: This is a sample model, more can be added here
-  export PYTHON_EXECUTABLE=python
  # TODO(huydhn): Add more coverage here using ExecuTorch's gather models script
  # shellcheck disable=SC1091
  source .ci/scripts/test.sh mv3 cmake xnnpack-quantization-delegation ''
@ -1237,11 +1253,10 @@ elif [[ "$TEST_CONFIG" == distributed ]]; then
  if [[ "${SHARD_NUMBER}" == 1 ]]; then
    test_rpc
  fi
-elif [[ "$TEST_CONFIG" == deploy ]]; then
-  checkout_install_torchdeploy
-  test_torch_deploy
 elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
+elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
+  test_inductor_halide
 elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
  test_inductor_micro_benchmark
 elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
@ -1253,13 +1268,14 @@ elif [[ "${TEST_CONFIG}" == *timm* ]]; then
  id=$((SHARD_NUMBER-1))
  test_dynamo_benchmark timm_models "$id"
 elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
-  if [[ "${TEST_CONFIG}" == *cpu_inductor* ]]; then
+  if [[ "${TEST_CONFIG}" == *cpu_inductor* || "${TEST_CONFIG}" == *cpu_aot_inductor* ]]; then
    install_torchaudio cpu
  else
    install_torchaudio cuda
  fi
  install_torchtext
  install_torchvision
+  TORCH_CUDA_ARCH_LIST="8.0;8.6" pip_install git+https://github.com/pytorch/ao.git
  id=$((SHARD_NUMBER-1))
  # https://github.com/opencv/opencv-python/issues/885
  pip_install opencv-python==4.8.0.74
@ -1278,7 +1294,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
    checkout_install_torchbench
    # Do this after checkout_install_torchbench to ensure we clobber any
    # nightlies that torchbench may pull in
-    if [[ "${TEST_CONFIG}" != *cpu_inductor* ]]; then
+    if [[ "${TEST_CONFIG}" != *cpu_inductor* && "${TEST_CONFIG}" != *cpu_aot_inductor* ]]; then
      install_torchrec_and_fbgemm
    fi
    PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
@ -1286,10 +1302,14 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
 elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper_abi_compatible* ]]; then
  install_torchvision
  test_inductor_cpp_wrapper_abi_compatible
-elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then
+elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
  install_torchvision
-  test_inductor
+  test_inductor_shard 1
+  test_inductor_aoti
  test_inductor_distributed
+elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" -gt 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
+  install_torchvision
+  test_inductor_shard "${SHARD_NUMBER}"
 elif [[ "${TEST_CONFIG}" == *dynamo* && "${SHARD_NUMBER}" == 1 && $NUM_TEST_SHARDS -gt 1 ]]; then
  install_torchvision
  test_dynamo_shard 1
--- a/.circleci/scripts/binary_linux_test.sh
+++ b/.circleci/scripts/binary_linux_test.sh
@ -97,8 +97,16 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then
  )
 elif [[ "$PACKAGE_TYPE" != libtorch ]]; then
  if [[ "\$BUILD_ENVIRONMENT" != *s390x* ]]; then
-    pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
-    retry pip install -q numpy protobuf typing-extensions
+    if [[ "$USE_SPLIT_BUILD" == "true" ]]; then
+      pkg_no_python="$(ls -1 /final_pkgs/torch_no_python* | sort |tail -1)"
+      pkg_torch="$(ls -1 /final_pkgs/torch-* | sort |tail -1)"
+      # todo: after folder is populated use the pypi_pkg channel instead
+      pip install "\$pkg_no_python" "\$pkg_torch" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}_pypi_pkg"
+      retry pip install -q numpy protobuf typing-extensions
+    else
+      pip install "\$pkg" --index-url "https://download.pytorch.org/whl/\${CHANNEL}/${DESIRED_CUDA}"
+      retry pip install -q numpy protobuf typing-extensions
+    fi
  else
    pip install "\$pkg"
    retry pip install -q numpy protobuf typing-extensions
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@ -33,9 +33,9 @@ if [[ -z "$DOCKER_IMAGE" ]]; then
  if [[ "$PACKAGE_TYPE" == conda ]]; then
    export DOCKER_IMAGE="pytorch/conda-cuda"
  elif [[ "$DESIRED_CUDA" == cpu ]]; then
-    export DOCKER_IMAGE="pytorch/manylinux-cpu"
+    export DOCKER_IMAGE="pytorch/manylinux:cpu"
  else
-    export DOCKER_IMAGE="pytorch/manylinux-cuda${DESIRED_CUDA:2}"
+    export DOCKER_IMAGE="pytorch/manylinux-builder:${DESIRED_CUDA:2}"
  fi
 fi

@ -75,9 +75,9 @@ export PYTORCH_BUILD_NUMBER=1
 TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)

 # Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
+TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
 if [[ "$PACKAGE_TYPE" =~ .*wheel.* &&  -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
  # Only linux Python < 3.13 are supported wheels for triton
-  TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.13'"
  TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
  if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
      TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt)
@ -87,11 +87,11 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* &&  -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:
 fi

 # Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package
-if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then
-    TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}"
+if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" ]]; then
+    TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
    if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then
        TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt)
-        TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}"
+        TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}"
    fi
    if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then
        export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}"
@ -100,32 +100,6 @@ if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_B
    fi
 fi

-JAVA_HOME=
-BUILD_JNI=OFF
-if [[ "$PACKAGE_TYPE" == libtorch ]]; then
-  POSSIBLE_JAVA_HOMES=()
-  POSSIBLE_JAVA_HOMES+=(/usr/local)
-  POSSIBLE_JAVA_HOMES+=(/usr/lib/jvm/java-8-openjdk-amd64)
-  POSSIBLE_JAVA_HOMES+=(/Library/Java/JavaVirtualMachines/*.jdk/Contents/Home)
-  # Add the Windows-specific JNI path
-  POSSIBLE_JAVA_HOMES+=("$PWD/pytorch/.circleci/windows-jni/")
-  for JH in "${POSSIBLE_JAVA_HOMES[@]}" ; do
-    if [[ -e "$JH/include/jni.h" ]] ; then
-      # Skip if we're not on Windows but haven't found a JAVA_HOME
-      if [[ "$JH" == "$PWD/pytorch/.circleci/windows-jni/" && "$OSTYPE" != "msys" ]] ; then
-        break
-      fi
-      echo "Found jni.h under $JH"
-      JAVA_HOME="$JH"
-      BUILD_JNI=ON
-      break
-    fi
-  done
-  if [ -z "$JAVA_HOME" ]; then
-    echo "Did not find jni.h"
-  fi
-fi
-
 cat >"$envfile" <<EOL
 # =================== The following code will be executed inside Docker container ===================
 export TZ=UTC
@ -136,6 +110,7 @@ export DESIRED_PYTHON="${DESIRED_PYTHON:-}"
 export DESIRED_CUDA="$DESIRED_CUDA"
 export LIBTORCH_VARIANT="${LIBTORCH_VARIANT:-}"
 export BUILD_PYTHONLESS="${BUILD_PYTHONLESS:-}"
+export USE_SPLIT_BUILD="${USE_SPLIT_BUILD:-}"
 if [[ "${OSTYPE}" == "msys" ]]; then
  export LIBTORCH_CONFIG="${LIBTORCH_CONFIG:-}"
  if [[ "${LIBTORCH_CONFIG:-}" == 'debug' ]]; then
@ -159,8 +134,6 @@ export TORCH_CONDA_BUILD_FOLDER='pytorch-nightly'
 export ANACONDA_USER='pytorch'

 export USE_FBGEMM=1
-export JAVA_HOME=$JAVA_HOME
-export BUILD_JNI=$BUILD_JNI
 export PIP_UPLOAD_FOLDER="$PIP_UPLOAD_FOLDER"
 export DOCKER_IMAGE="$DOCKER_IMAGE"

--- a/.circleci/scripts/binary_upload.sh
+++ b/.circleci/scripts/binary_upload.sh
@ -25,6 +25,10 @@ if [[ "${DRY_RUN}" = "disabled" ]]; then
  AWS_S3_CP="aws s3 cp"
 fi

+if [[ "${USE_SPLIT_BUILD:-false}" == "true" ]]; then
+  UPLOAD_SUBFOLDER="${UPLOAD_SUBFOLDER}_pypi_pkg"
+fi
+
 # Sleep 5 minutes between retries for conda upload
 retry () {
  "$@"  || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@") || (sleep 5m && "$@")
--- a/.github/actions/diskspace-cleanup/action.yml
+++ b/.github/actions/diskspace-cleanup/action.yml
@ -14,12 +14,14 @@ runs:
    - name: Cleans up diskspace
      shell: bash
      run: |
+        set -ex
        diskspace_cutoff=${{ inputs.diskspace-cutoff }}
-        diskspace=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
+        docker_root_dir=$(docker info -f '{{.DockerRootDir}}')
+        diskspace=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
        msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
        if [[ "$diskspace" -ge "$diskspace_cutoff" ]] ; then
            docker system prune -af
-            diskspace_new=$(df -H / --output=pcent | sed -n 2p | sed 's/%//' | sed 's/ //')
+            diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
            if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
                echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
                echo "$msg"
--- a/.github/actions/linux-build/action.yml
+++ b/.github/actions/linux-build/action.yml
@ -52,6 +52,13 @@ inputs:
    description: Hugging Face Hub token
    required: false
    default: ""
+  use_split_build:
+    description: |
+      [Experimental] Build a libtorch-only wheel, then build pytorch
+      from that libtorch wheel.
+    required: false
+    type: boolean
+    default: false
 outputs:
  docker-image:
    value: ${{ steps.calculate-docker-image.outputs.docker-image }}
@ -144,6 +151,7 @@ runs:
        DEBUG: ${{ inputs.build-with-debug == 'true' && '1' || '0' }}
        OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
        HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }}
+        USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
      shell: bash
      run: |
        # detached container should get cleaned up by teardown_ec2_linux
@ -163,6 +171,7 @@ runs:
          -e PR_LABELS \
          -e OUR_GITHUB_JOB_ID \
          -e HUGGING_FACE_HUB_TOKEN \
+          -e USE_SPLIT_BUILD \
          --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
          --security-opt seccomp=unconfined \
          --cap-add=SYS_PTRACE \
@ -183,7 +192,7 @@ runs:

    - name: Store PyTorch Build Artifacts on S3
      uses: seemethere/upload-artifact-s3@v5
-      if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped'
+      if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' && inputs.use_split_build != 'true'
      with:
        name: ${{ inputs.build-environment }}
        retention-days: 14
@ -191,6 +200,16 @@ runs:
        path: artifacts.zip
        s3-bucket: ${{ inputs.s3-bucket }}

+    - name: Store PyTorch Build Artifacts on S3 for split build
+      uses: seemethere/upload-artifact-s3@v5
+      if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' && inputs.use_split_build == 'true'
+      with:
+        name: ${{ inputs.build-environment }}-experimental-split-build
+        retention-days: 14
+        if-no-files-found: error
+        path: artifacts.zip
+        s3-bucket: ${{ inputs.s3-bucket }}
+
    - name: Upload sccache stats
      if: steps.build.outcome != 'skipped'
      uses: seemethere/upload-artifact-s3@v5
--- a/.github/actions/test-pytorch-binary/action.yml
+++ b/.github/actions/test-pytorch-binary/action.yml
@ -26,6 +26,7 @@ runs:
          -e PYTORCH_FINAL_PACKAGE_DIR \
          -e PYTORCH_ROOT \
          -e SKIP_ALL_TESTS \
+          -e USE_SPLIT_BUILD \
          --tty \
          --detach \
          -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
--- a/.github/ci_commit_pins/torchbench.txt
+++ b/.github/ci_commit_pins/torchbench.txt
@ -1 +1 @@
-d6015d42d9a1834bc7595c4bd6852562fb80b30b
+23512dbebd44a11eb84afbf53c3c071dd105297e
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@ -27,11 +27,9 @@
  - third_party/onnx
  - caffe2/python/onnx/**
  approved_by:
-  - BowenBao
  - justinchuby
  - liqunfu
  - shubhambhokare1
-  - thiagocrepaldi
  - titaiwangms
  - wschin
  - xadupre
@ -244,6 +242,7 @@
  - torch/csrc/xpu/**
  - torch/xpu/**
  - test/xpu/**
+  - test/test_xpu.py
  - third_party/xpu.txt
  - .ci/docker/ci_commit_pins/triton-xpu.txt
  approved_by:
@ -376,13 +375,21 @@

 - name: CPU inductor
  patterns:
+  - torch/_inductor/mkldnn_ir.py
  - torch/_inductor/mkldnn_lowerings.py
  - torch/_inductor/fx_passes/mkldnn_fusion.py
  - torch/_inductor/fx_passes/quantization.py
+  - torch/_inductor/codegen/cpp_prefix.h
  - torch/_inductor/codegen/cpp.py
+  - torch/_inductor/codegen/cpp_utils.py
+  - torch/_inductor/codegen/cpp_micro_gemm.py
+  - torch/_inductor/codegen/cpp_template_kernel.py
+  - torch/_inductor/codegen/cpp_template.py
+  - torch/_inductor/codegen/cpp_gemm_template.py
  - test/inductor/test_mkldnn_pattern_matcher.py
  - test/inductor/test_cpu_repo.py
  - test/inductor/test_cpu_cpp_wrapper.py
+  - test/inductor/test_cpu_select_algorithm.py
  - aten/src/ATen/cpu/**
  - aten/src/ATen/native/quantized/cpu/**
  - test/quantization/core/test_quantized_op.py
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -26,3 +26,4 @@ retryable_workflows:
 - windows-binary
 labeler_config: labeler.yml
 label_to_label_config: label_to_label.yml
+mergebot: True
--- a/.github/scripts/amd/package_triton_wheel.sh
+++ b/.github/scripts/amd/package_triton_wheel.sh
@ -93,6 +93,8 @@ done

 # Copy Include Files
 cp -r $ROCM_HOME/include/hip $TRITON_ROCM_DIR/include
+cp -r $ROCM_HOME/include/roctracer $TRITON_ROCM_DIR/include
+cp -r $ROCM_HOME/include/hsa $TRITON_ROCM_DIR/include

 # Copy linker
 mkdir -p $TRITON_ROCM_DIR/llvm/bin
--- a/.github/scripts/cherry_pick.py
+++ b/.github/scripts/cherry_pick.py
@ -3,11 +3,11 @@
 import json
 import os
 import re
-from typing import Any, Optional
+from typing import Any, cast, Dict, List, Optional

 from urllib.error import HTTPError

-from github_utils import gh_fetch_url, gh_post_pr_comment
+from github_utils import gh_fetch_url, gh_post_pr_comment, gh_query_issues_by_labels

 from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo
 from trymerge import get_pr_commit_sha, GitHubPR
@ -19,6 +19,7 @@ REQUIRES_ISSUE = {
    "critical",
    "fixnewfeature",
 }
+RELEASE_BRANCH_REGEX = re.compile(r"release/(?P<version>.+)")


 def parse_args() -> Any:
@ -58,6 +59,33 @@ def get_merge_commit_sha(repo: GitRepo, pr: GitHubPR) -> Optional[str]:
    return commit_sha if pr.is_closed() else None


+def get_release_version(onto_branch: str) -> Optional[str]:
+    """
+    Return the release version if the target branch is a release branch
+    """
+    m = re.match(RELEASE_BRANCH_REGEX, onto_branch)
+    return m.group("version") if m else ""
+
+
+def get_tracker_issues(
+    org: str, project: str, onto_branch: str
+) -> List[Dict[str, Any]]:
+    """
+    Find the tracker issue from the repo. The tracker issue needs to have the title
+    like [VERSION] Release Tracker following the convention on PyTorch
+    """
+    version = get_release_version(onto_branch)
+    if not version:
+        return []
+
+    tracker_issues = gh_query_issues_by_labels(org, project, labels=["release tracker"])
+    if not tracker_issues:
+        return []
+
+    # Figure out the tracker issue from the list by looking at the title
+    return [issue for issue in tracker_issues if version in issue.get("title", "")]
+
+
 def cherry_pick(
    github_actor: str,
    repo: GitRepo,
@ -77,17 +105,49 @@ def cherry_pick(
    )

    try:
+        org, project = repo.gh_owner_and_name()
+
+        cherry_pick_pr = ""
        if not dry_run:
-            org, project = repo.gh_owner_and_name()
            cherry_pick_pr = submit_pr(repo, pr, cherry_pick_branch, onto_branch)

-            msg = f"The cherry pick PR is at {cherry_pick_pr}"
-            if fixes:
-                msg += f" and it is linked with issue {fixes}"
-            elif classification in REQUIRES_ISSUE:
-                msg += f" and it is recommended to link a {classification} cherry pick PR with an issue"
+        tracker_issues_comments = []
+        tracker_issues = get_tracker_issues(org, project, onto_branch)
+        for issue in tracker_issues:
+            issue_number = int(str(issue.get("number", "0")))
+            if not issue_number:
+                continue

-            post_comment(org, project, pr.pr_num, msg)
+            res = cast(
+                Dict[str, Any],
+                post_tracker_issue_comment(
+                    org,
+                    project,
+                    issue_number,
+                    pr.pr_num,
+                    cherry_pick_pr,
+                    classification,
+                    fixes,
+                    dry_run,
+                ),
+            )
+
+            comment_url = res.get("html_url", "")
+            if comment_url:
+                tracker_issues_comments.append(comment_url)
+
+        msg = f"The cherry pick PR is at {cherry_pick_pr}"
+        if fixes:
+            msg += f" and it is linked with issue {fixes}."
+        elif classification in REQUIRES_ISSUE:
+            msg += f" and it is recommended to link a {classification} cherry pick PR with an issue."
+
+        if tracker_issues_comments:
+            msg += " The following tracker issues are updated:\n"
+            for tracker_issues_comment in tracker_issues_comments:
+                msg += f"* {tracker_issues_comment}\n"
+
+        post_pr_comment(org, project, pr.pr_num, msg, dry_run)

    finally:
        if current_branch:
@ -159,7 +219,9 @@ def submit_pr(
        raise RuntimeError(msg) from error


-def post_comment(org: str, project: str, pr_num: int, msg: str) -> None:
+def post_pr_comment(
+    org: str, project: str, pr_num: int, msg: str, dry_run: bool = False
+) -> List[Dict[str, Any]]:
    """
    Post a comment on the PR itself to point to the cherry picking PR when success
    or print the error when failure
@ -182,7 +244,35 @@ def post_comment(org: str, project: str, pr_num: int, msg: str) -> None:
    comment = "\n".join(
        (f"### Cherry picking #{pr_num}", f"{msg}", "", f"{internal_debugging}")
    )
-    gh_post_pr_comment(org, project, pr_num, comment)
+    return gh_post_pr_comment(org, project, pr_num, comment, dry_run)
+
+
+def post_tracker_issue_comment(
+    org: str,
+    project: str,
+    issue_num: int,
+    pr_num: int,
+    cherry_pick_pr: str,
+    classification: str,
+    fixes: str,
+    dry_run: bool = False,
+) -> List[Dict[str, Any]]:
+    """
+    Post a comment on the tracker issue (if any) to record the cherry pick
+
+    The comment links the trunk PR (pr_num) and the release branch PR
+    (cherry_pick_pr), and lists the criteria category built from
+    classification and, when it is non-empty, fixes. Returns whatever
+    gh_post_pr_comment returns for the posted comment.
+    """
+    # Skip empty parts so that a missing fixes does not leave a dangling " - "
+    criteria = " - ".join(
+        part.capitalize() for part in (classification, fixes) if part
+    )
+    comment = "\n".join(
+        (
+            "Link to landed trunk PR (if applicable):",
+            f"* https://github.com/{org}/{project}/pull/{pr_num}",
+            "",
+            "Link to release branch PR:",
+            f"* {cherry_pick_pr}",
+            "",
+            "Criteria Category:",
+            criteria,
+        )
+    )
+    return gh_post_pr_comment(org, project, issue_num, comment, dry_run)


 def main() -> None:
@ -214,7 +304,7 @@ def main() -> None:

    except RuntimeError as error:
        if not args.dry_run:
-            post_comment(org, project, pr_num, str(error))
+            post_pr_comment(org, project, pr_num, str(error))
        else:
            raise error

--- a/.github/scripts/drci_mocks.json.gz
+++ b/.github/scripts/drci_mocks.json.gz
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -48,7 +48,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
    "12.1": (
@ -61,7 +61,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
    "12.4": (
@ -74,7 +74,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'"
    ),
@ -347,10 +347,6 @@ def generate_wheels_matrix(
    for python_version in python_versions:
        for arch_version in arches:
            gpu_arch_type = arch_type(arch_version)
-            # Disable py3.12 builds for ROCm because of triton dependency
-            # on llnl-hatchet, which doesn't have py3.12 wheels available
-            if gpu_arch_type == "rocm" and python_version == "3.12":
-                continue
            gpu_arch_version = (
                ""
                if arch_version == "cpu"
@ -390,6 +386,31 @@ def generate_wheels_matrix(
                        ),
                    }
                )
+                if arch_version != "cuda-aarch64":
+                    ret.append(
+                        {
+                            "python_version": python_version,
+                            "gpu_arch_type": gpu_arch_type,
+                            "gpu_arch_version": gpu_arch_version,
+                            "desired_cuda": translate_desired_cuda(
+                                gpu_arch_type, gpu_arch_version
+                            ),
+                            "use_split_build": "True",
+                            "devtoolset": (
+                                "cxx11-abi" if arch_version == "cuda-aarch64" else ""
+                            ),
+                            "container_image": WHEEL_CONTAINER_IMAGES[arch_version],
+                            "package_type": package_type,
+                            "pytorch_extra_install_requirements": (
+                                PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]  # fmt: skip
+                                if os != "linux-aarch64"
+                                else ""
+                            ),
+                            "build_name": f"{package_type}-py{python_version}-{gpu_arch_type}{gpu_arch_version}-split".replace(  # noqa: B950
+                                ".", "_"
+                            ),
+                        }
+                    )
            else:
                ret.append(
                    {
--- a/.github/scripts/get_workflow_type.py
+++ b/.github/scripts/get_workflow_type.py
@ -1,99 +0,0 @@
-import json
-from argparse import ArgumentParser
-from typing import Any
-
-from github import Auth, Github
-from github.Issue import Issue
-
-
-WORKFLOW_TYPE_LABEL = "label"
-WORKFLOW_TYPE_RG = "rg"
-WORKFLOW_TYPE_BOTH = "both"
-
-
-def parse_args() -> Any:
-    parser = ArgumentParser("Get dynamic rollout settings")
-    parser.add_argument("--github-token", type=str, required=True, help="GitHub token")
-    parser.add_argument(
-        "--github-repo",
-        type=str,
-        required=False,
-        default="pytorch/test-infra",
-        help="GitHub repo to get the issue",
-    )
-    parser.add_argument(
-        "--github-issue", type=int, required=True, help="GitHub issue umber"
-    )
-    parser.add_argument(
-        "--github-user", type=str, required=True, help="GitHub username"
-    )
-    parser.add_argument(
-        "--github-branch", type=str, required=True, help="Current GitHub branch"
-    )
-
-    return parser.parse_args()
-
-
-def get_gh_client(github_token: str) -> Github:
-    auth = Auth.Token(github_token)
-    return Github(auth=auth)
-
-
-def get_issue(gh: Github, repo: str, issue_num: int) -> Issue:
-    repo = gh.get_repo(repo)
-    return repo.get_issue(number=issue_num)
-
-
-def is_exception_branch(branch: str) -> bool:
-    return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
-
-
-def get_workflow_type(issue: Issue, username: str) -> str:
-    user_list = issue.get_comments()[0].body.split("\r\n")
-    try:
-        run_option = issue.get_comments()[1].body.split("\r\n")[0]
-    except Exception as e:
-        run_option = "single"
-
-    if user_list[0] == "!":
-        # Use old runners for everyone
-        return WORKFLOW_TYPE_LABEL
-    elif user_list[1] == "*":
-        if run_option == WORKFLOW_TYPE_BOTH:
-            # Use ARC runners and old runners for everyone
-            return WORKFLOW_TYPE_BOTH
-        else:
-            # Use only ARC runners for everyone
-            return WORKFLOW_TYPE_RG
-    elif username in user_list:
-        if run_option == WORKFLOW_TYPE_BOTH:
-            # Use ARC runners and old runners for a specific user
-            return WORKFLOW_TYPE_BOTH
-        else:
-            # Use only ARC runners for a specific user
-            return WORKFLOW_TYPE_RG
-    else:
-        # Use old runners by default
-        return WORKFLOW_TYPE_LABEL
-
-
-def main() -> None:
-    args = parse_args()
-
-    if is_exception_branch(args.github_branch):
-        output = {"workflow_type": WORKFLOW_TYPE_LABEL}
-    else:
-        try:
-            gh = get_gh_client(args.github_token)
-            issue = get_issue(gh, args.github_repo, args.github_issue)
-
-            output = {"workflow_type": get_workflow_type(issue, args.github_user)}
-        except Exception as e:
-            output = {"workflow_type": WORKFLOW_TYPE_LABEL}
-
-    json_output = json.dumps(output)
-    print(json_output)
-
-
-if __name__ == "__main__":
-    main()
--- a/.github/scripts/github_utils.py
+++ b/.github/scripts/github_utils.py
@ -202,3 +202,12 @@ def gh_update_pr_state(org: str, repo: str, pr_num: int, state: str = "open") ->
            )
        else:
            raise
+
+
+def gh_query_issues_by_labels(
+    org: str, repo: str, labels: List[str], state: str = "open"
+) -> List[Dict[str, Any]]:
+    url = f"{GITHUB_API_URL}/repos/{org}/{repo}/issues"
+    return gh_fetch_json(
+        url, method="GET", params={"labels": ",".join(labels), "state": state}
+    )
--- a/.github/scripts/gql_mocks.json.gz
+++ b/.github/scripts/gql_mocks.json.gz
--- a/.github/scripts/lintrunner.sh
+++ b/.github/scripts/lintrunner.sh
@ -29,6 +29,7 @@ python3 -m tools.pyi.gen_pyi \
    --native-functions-path aten/src/ATen/native/native_functions.yaml \
    --tags-path aten/src/ATen/native/tags.yaml \
    --deprecated-functions-path "tools/autograd/deprecated.yaml"
+python3 torch/utils/data/datapipes/gen_pyi.py

 RC=0
 # Run lintrunner on all files
--- a/.github/scripts/runner_determinator.py
+++ b/.github/scripts/runner_determinator.py
@ -0,0 +1,210 @@
+# flake8: noqa: G004
+
+import logging
+import os
+from argparse import ArgumentParser
+from logging import LogRecord
+from typing import Any, Iterable
+
+from github import Auth, Github
+from github.Issue import Issue
+
+
+WORKFLOW_LABEL_META = ""  # use meta runners
+WORKFLOW_LABEL_LF = "lf."  # use runners from the linux foundation
+
+GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
+GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
+
+
+class ColorFormatter(logging.Formatter):
+    """Color codes the log messages based on the log level"""
+
+    # ANSI escape sequence used as the prefix for each level name
+    COLORS = {
+        "WARNING": "\033[33m",  # Yellow
+        "ERROR": "\033[31m",  # Red
+        "CRITICAL": "\033[31m",  # Red
+        "INFO": "\033[0m",  # Reset
+        "DEBUG": "\033[0m",  # Reset
+    }
+
+    def format(self, record: LogRecord) -> str:
+        """
+        Return the formatted record with its message wrapped in the color
+        code for the record's level, followed by a reset code.
+        """
+        log_color = self.COLORS.get(record.levelname, "\033[0m")  # Default to reset
+        # The same record object is handed to every handler that processes
+        # it, so put the original message back once formatting is done
+        # instead of leaking the color codes to other handlers.
+        original_msg = record.msg
+        record.msg = f"{log_color}{record.msg}\033[0m"
+        try:
+            return super().format(record)
+        finally:
+            record.msg = original_msg
+
+
+handler = logging.StreamHandler()
+handler.setFormatter(ColorFormatter(fmt="%(levelname)-8s: %(message)s"))
+
+log = logging.getLogger(os.path.basename(__file__))
+log.addHandler(handler)
+log.setLevel(logging.INFO)
+
+
+def set_github_output(key: str, value: str) -> None:
+    """
+    Defines outputs of the github action that invokes this script
+    """
+    if not GITHUB_OUTPUT:
+        # See https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ for deprecation notice
+        log.warning(
+            "No env var found for GITHUB_OUTPUT, you must be running this code locally. Falling back to the deprecated print method."
+        )
+        print(f"::set-output name={key}::{value}")
+        return
+
+    with open(GITHUB_OUTPUT, "a") as f:
+        log.info(f"Setting output: {key}='{value}'")
+        f.write(f"{key}={value}\n")
+
+
+def parse_args() -> Any:
+    parser = ArgumentParser("Get dynamic rollout settings")
+    parser.add_argument("--github-token", type=str, required=True, help="GitHub token")
+    parser.add_argument(
+        "--github-issue-repo",
+        type=str,
+        required=False,
+        default="pytorch/test-infra",
+        help="GitHub repo to get the issue",
+    )
+    parser.add_argument(
+        "--github-repo",
+        type=str,
+        required=True,
+        help="GitHub repo where CI is running",
+    )
+    parser.add_argument(
+        "--github-issue", type=int, required=True, help="GitHub issue number"
+    )
+    parser.add_argument(
+        "--github-actor", type=str, required=True, help="GitHub triggering_actor"
+    )
+    parser.add_argument(
+        "--github-issue-owner", type=str, required=True, help="GitHub issue owner"
+    )
+    parser.add_argument(
+        "--github-branch", type=str, required=True, help="Current GitHub branch or tag"
+    )
+    parser.add_argument(
+        "--github-ref-type",
+        type=str,
+        required=True,
+        help="Current GitHub ref type, branch or tag",
+    )
+
+    return parser.parse_args()
+
+
+def get_gh_client(github_token: str) -> Github:
+    auth = Auth.Token(github_token)
+    return Github(auth=auth)
+
+
+def get_issue(gh: Github, repo: str, issue_num: int) -> Issue:
+    repo = gh.get_repo(repo)
+    return repo.get_issue(number=issue_num)
+
+
+def get_potential_pr_author(
+    gh: Github, repo: str, username: str, ref_type: str, ref_name: str
+) -> str:
+    """
+    Resolve the user on whose behalf the workflow is running.
+
+    Returns the login of the pull request author when the run was triggered
+    by pytorch-bot[bot] on a tag named ciflow/<name>/<pr-number>; in every
+    other case returns ``username`` unchanged.
+
+    Raises:
+        Exception: if the repository or the pull request cannot be fetched.
+    """
+    # If the trigger was a new tag added by a bot, this is a ciflow case
+    # Fetch the actual username from the original PR. The PR number is
+    # embedded in the tag name: ciflow/<name>/<pr-number>
+    if username == "pytorch-bot[bot]" and ref_type == "tag":
+        split_tag = ref_name.split("/")
+        if (
+            len(split_tag) == 3
+            and split_tag[0] == "ciflow"
+            and split_tag[2].isnumeric()
+        ):
+            pr_number = split_tag[2]
+            try:
+                repository = gh.get_repo(repo)
+                pull = repository.get_pull(number=int(pr_number))
+            except Exception as e:
+                # Report `repo` (the name passed in) rather than `repository`:
+                # the latter is unbound when gh.get_repo() is what failed.
+                raise Exception(  # noqa: TRY002
+                    f"issue with pull request {pr_number} from repo {repo}"
+                ) from e
+            return pull.user.login
+    # In all other cases, return the original input username
+    return username
+
+
+def is_exception_branch(branch: str) -> bool:
+    return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
+
+
+def get_workflow_type(issue: Issue, workflow_requestors: Iterable[str]) -> str:
+    """
+    Decide which runner label prefix to use based on the rollout issue.
+
+    The first comment of ``issue`` controls the rollout:
+      * starts with "!": WORKFLOW_LABEL_META is returned for everyone
+      * starts with "*": WORKFLOW_LABEL_LF is returned for everyone
+      * otherwise it is read as a whitespace-separated list of usernames
+        (surrounding "@" is stripped); WORKFLOW_LABEL_LF is returned if any
+        of ``workflow_requestors`` is listed, else WORKFLOW_LABEL_META
+
+    Any exception is logged and results in WORKFLOW_LABEL_META.
+    """
+    try:
+        # Materialize once: the requestors are iterated for the membership
+        # check and possibly again for the log message
+        requestors = tuple(workflow_requestors)
+        first_comment = issue.get_comments()[0].body.strip("\n\t ")
+
+        # startswith() is safe on an empty comment, unlike indexing [0]
+        if first_comment.startswith("!"):
+            log.info("LF Workflows are disabled for everyone. Using meta runners.")
+            return WORKFLOW_LABEL_META
+        elif first_comment.startswith("*"):
+            log.info("LF Workflows are enabled for everyone. Using LF runners.")
+            return WORKFLOW_LABEL_LF
+        else:
+            all_opted_in_users = {
+                usr_raw.strip("\n\t@ ") for usr_raw in first_comment.split()
+            }
+            opted_in_requestors = {
+                usr for usr in requestors if usr in all_opted_in_users
+            }
+            if opted_in_requestors:
+                log.info(
+                    f"LF Workflows are enabled for {', '.join(opted_in_requestors)}. Using LF runners."
+                )
+                return WORKFLOW_LABEL_LF
+            else:
+                log.info(
+                    f"LF Workflows are disabled for {', '.join(requestors)}. Using meta runners."
+                )
+                return WORKFLOW_LABEL_META
+
+    except Exception as e:
+        log.error(
+            f"Failed to determine workflow type. Falling back to meta runners. Exception: {e}"
+        )
+        return WORKFLOW_LABEL_META
+
+
+def main() -> None:
+    args = parse_args()
+
+    if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
+        log.info(f"Exception branch: '{args.github_branch}', using meta runners")
+        label_type = WORKFLOW_LABEL_META
+    else:
+        try:
+            gh = get_gh_client(args.github_token)
+            # The default issue we use - https://github.com/pytorch/test-infra/issues/5132
+            issue = get_issue(gh, args.github_issue_repo, args.github_issue)
+            username = get_potential_pr_author(
+                gh,
+                args.github_repo,
+                args.github_actor,
+                args.github_ref_type,
+                args.github_branch,
+            )
+            label_type = get_workflow_type(
+                issue,
+                (
+                    args.github_issue_owner,
+                    username,
+                ),
+            )
+        except Exception as e:
+            log.error(
+                f"Failed to get issue. Falling back to meta runners. Exception: {e}"
+            )
+            label_type = WORKFLOW_LABEL_META
+
+    set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, label_type)
+
+
+if __name__ == "__main__":
+    main()
--- a/.github/scripts/test_trymerge.py
+++ b/.github/scripts/test_trymerge.py
@ -180,6 +180,9 @@ def mock_gh_get_info() -> Any:
    return {
        "closed": False,
        "isCrossRepository": False,
+        "headRefName": "foo",
+        "baseRefName": "bar",
+        "baseRepository": {"defaultBranchRef": {"name": "bar"}},
        "files": {"nodes": [], "pageInfo": {"hasNextPage": False}},
        "changedFiles": 0,
    }
@ -394,6 +397,7 @@ class TestTryMerge(TestCase):
        # self.assertGreater(len(pr.get_checkrun_conclusions()), 3)
        self.assertGreater(pr.get_commit_count(), 60)

+    @skip("GitHub doesn't keep this data anymore")
    def test_gql_retrieve_checksuites(self, *args: Any) -> None:
        "Fetch comments and conclusions for PR with 60 commits"
        pr = GitHubPR("pytorch", "pytorch", 94787)
@ -891,6 +895,24 @@ class TestBypassFailures(TestCase):
        self.assertTrue(len(ignorable["FLAKY"]) == 1)
        self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 0)

+    def test_ignore_failures_older_run_same_workflow(self, *args: Any) -> None:
+        pr = GitHubPR("pytorch", "pytorch", 129013)
+        checks = pr.get_checkrun_conclusions()
+        checks = get_classifications(
+            pr.pr_num,
+            pr.project,
+            checks,
+            [],
+        )
+        pending, failed, ignorable = categorize_checks(
+            checks,
+            list(checks.keys()),
+        )
+        self.assertTrue(len(pending) == 0)
+        self.assertTrue(len(failed) == 0)
+        self.assertTrue(len(ignorable["FLAKY"]) == 2)
+        self.assertTrue(len(ignorable["UNSTABLE"]) == 13)
+
    @mock.patch("trymerge.read_merge_rules", side_effect=xla_merge_rules)
    def test_dont_ignore_flaky_failures(self, *args: Any) -> None:
        """
@ -1019,7 +1041,7 @@ class TestGitHubPRGhstackDependencies(TestCase):
        )

    @skip(
-        reason="This test is run against a mutalbe PR that has changed, so it no longer works. The test should be changed"
+        reason="This test is run against a mutable PR that has changed, so it no longer works. The test should be changed"
    )
    @mock.patch("trymerge.read_merge_rules")
    @mock.patch("trymerge.GitRepo")
--- a/.github/scripts/trymerge.py
+++ b/.github/scripts/trymerge.py
@ -81,9 +81,10 @@ JobNameToStateDict = Dict[str, JobCheckState]


 class WorkflowCheckState:
-    def __init__(self, name: str, url: str, status: Optional[str]):
+    def __init__(self, name: str, url: str, run_id: int, status: Optional[str]):
        self.name: str = name
        self.url: str = url
+        self.run_id: int = run_id
        self.status: Optional[str] = status
        self.jobs: JobNameToStateDict = {}

@ -122,6 +123,7 @@ fragment PRCheckSuites on CheckSuiteConnection {
      workflowRun {
        workflow {
          name
+          databaseId
        }
        databaseId
        url
@ -512,7 +514,7 @@ def add_workflow_conclusions(
    workflows: Dict[str, WorkflowCheckState] = {}

    # for the jobs that don't have a workflow
-    no_workflow_obj: WorkflowCheckState = WorkflowCheckState("", "", None)
+    no_workflow_obj: WorkflowCheckState = WorkflowCheckState("", "", 0, None)

    def add_conclusions(edges: Any) -> None:
        for edge_idx, edge in enumerate(edges):
@ -523,18 +525,30 @@ def add_workflow_conclusions(
            workflow_obj: WorkflowCheckState = no_workflow_obj

            if workflow_run is not None:
+                # This is the usual workflow run ID we see on GitHub
+                workflow_run_id = workflow_run["databaseId"]
+                # While this is the metadata name and ID of the workflow itself
                workflow_name = workflow_run["workflow"]["name"]
+                workflow_id = workflow_run["workflow"]["databaseId"]
+
                workflow_conclusion = node["conclusion"]
                # Do not override existing status with cancelled
                if workflow_conclusion == "CANCELLED" and workflow_name in workflows:
                    continue
-                if workflow_name not in workflows:
-                    workflows[workflow_name] = WorkflowCheckState(
+
+                # Only keep the latest workflow run for each workflow, heuristically,
+                # it's the run with largest run ID
+                if (
+                    workflow_id not in workflows
+                    or workflows[workflow_id].run_id < workflow_run_id
+                ):
+                    workflows[workflow_id] = WorkflowCheckState(
                        name=workflow_name,
                        status=workflow_conclusion,
                        url=workflow_run["url"],
+                        run_id=workflow_run_id,
                    )
-                workflow_obj = workflows[workflow_name]
+                workflow_obj = workflows[workflow_id]

            while checkruns is not None:
                for checkrun_node in checkruns["nodes"]:
@ -572,12 +586,12 @@ def add_workflow_conclusions(
    # the jobs in but don't put the workflow in.  We care more about the jobs in
    # the workflow that ran than the container workflow.
    res: JobNameToStateDict = {}
-    for workflow_name, workflow in workflows.items():
+    for workflow in workflows.values():
        if len(workflow.jobs) > 0:
            for job_name, job in workflow.jobs.items():
                res[job_name] = job
        else:
-            res[workflow_name] = JobCheckState(
+            res[workflow.name] = JobCheckState(
                workflow.name,
                workflow.url,
                workflow.status,
@ -1163,7 +1177,6 @@ class GitHubPR:
            # Finally, upload the record to Rockset. The list of pending and failed
            # checks are at the time of the merge
            save_merge_record(
-                collection=ROCKSET_MERGES_COLLECTION,
                comment_id=comment_id,
                pr_num=self.pr_num,
                owner=self.org,
@ -1179,10 +1192,8 @@ class GitHubPR:
                merge_base_sha=self.get_merge_base(),
                merge_commit_sha=merge_commit_sha,
                is_failed=False,
-                dry_run=dry_run,
                skip_mandatory_checks=skip_mandatory_checks,
                ignore_current=bool(ignore_current_checks),
-                workspace=ROCKSET_MERGES_WORKSPACE,
            )
        else:
            print("Missing comment ID or PR number, couldn't upload to Rockset")
@ -1489,7 +1500,6 @@ def checks_to_markdown_bullets(

@retries_decorator()
 def save_merge_record(
-    collection: str,
    comment_id: int,
    pr_num: int,
    owner: str,
@ -1505,59 +1515,44 @@ def save_merge_record(
    merge_base_sha: str,
    merge_commit_sha: str = "",
    is_failed: bool = False,
-    dry_run: bool = False,
    skip_mandatory_checks: bool = False,
    ignore_current: bool = False,
    error: str = "",
-    workspace: str = "commons",
 ) -> None:
    """
-    This saves the merge records into Rockset, so we can query them (for fun and profit)
+    This saves the merge records as a json, which can later be uploaded to s3
    """
-    if dry_run:
-        # Decide not to save the record to Rockset if dry-run is set to not pollute
-        # the collection
-        return

-    try:
-        import rockset  # type: ignore[import]
+    # Prepare the record to be serialized into merge_record.json
+    data = [
+        {
+            "comment_id": comment_id,
+            "pr_num": pr_num,
+            "owner": owner,
+            "project": project,
+            "author": author,
+            "pending_checks": pending_checks,
+            "failed_checks": failed_checks,
+            "ignore_current_checks": ignore_current_checks,
+            "broken_trunk_checks": broken_trunk_checks,
+            "flaky_checks": flaky_checks,
+            "unstable_checks": unstable_checks,
+            "last_commit_sha": last_commit_sha,
+            "merge_base_sha": merge_base_sha,
+            "merge_commit_sha": merge_commit_sha,
+            "is_failed": is_failed,
+            "skip_mandatory_checks": skip_mandatory_checks,
+            "ignore_current": ignore_current,
+            "error": error,
+            # This is a unique identifier for the record for deduping purposes
+            # in rockset.  Any unique string would work
+            "_id": f"{project}-{pr_num}-{comment_id}-{os.environ.get('GITHUB_RUN_ID')}",
+        }
+    ]
+    repo_root = Path(__file__).resolve().parent.parent.parent

-        # Prepare the record to be written into Rockset
-        data = [
-            {
-                "comment_id": comment_id,
-                "pr_num": pr_num,
-                "owner": owner,
-                "project": project,
-                "author": author,
-                "pending_checks": pending_checks,
-                "failed_checks": failed_checks,
-                "ignore_current_checks": ignore_current_checks,
-                "broken_trunk_checks": broken_trunk_checks,
-                "flaky_checks": flaky_checks,
-                "unstable_checks": unstable_checks,
-                "last_commit_sha": last_commit_sha,
-                "merge_base_sha": merge_base_sha,
-                "merge_commit_sha": merge_commit_sha,
-                "is_failed": is_failed,
-                "skip_mandatory_checks": skip_mandatory_checks,
-                "ignore_current": ignore_current,
-                "error": error,
-            }
-        ]
-
-        client = rockset.RocksetClient(
-            host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"]
-        )
-        client.Documents.add_documents(
-            collection=collection,
-            data=data,
-            workspace=workspace,
-        )
-
-    except ModuleNotFoundError:
-        print("Rockset is missing, no record will be saved")
-        return
+    with open(repo_root / "merge_record.json", "w") as f:
+        json.dump(data, f)


@retries_decorator(rc=[])
@ -2330,6 +2325,15 @@ def main() -> None:
            dry_run=args.dry_run,
        )
        return
+    if not pr.is_ghstack_pr() and pr.base_ref() != pr.default_branch():
+        gh_post_pr_comment(
+            org,
+            project,
+            args.pr_num,
+            f"PR targets {pr.base_ref()} rather than {pr.default_branch()}, refusing merge request",
+            dry_run=args.dry_run,
+        )
+        return

    if args.check_mergeability:
        if pr.is_ghstack_pr():
@ -2365,7 +2369,6 @@ def main() -> None:
            # list of pending and failed checks here, but they are not really
            # needed at the moment
            save_merge_record(
-                collection=ROCKSET_MERGES_COLLECTION,
                comment_id=args.comment_id,
                pr_num=args.pr_num,
                owner=org,
@ -2380,11 +2383,9 @@ def main() -> None:
                last_commit_sha=pr.last_commit().get("oid", ""),
                merge_base_sha=pr.get_merge_base(),
                is_failed=True,
-                dry_run=args.dry_run,
                skip_mandatory_checks=args.force,
                ignore_current=args.ignore_current,
                error=str(e),
-                workspace=ROCKSET_MERGES_WORKSPACE,
            )
        else:
            print("Missing comment ID or PR number, couldn't upload to Rockset")
--- a/.github/templates/upload.yml.j2
+++ b/.github/templates/upload.yml.j2
@ -30,6 +30,9 @@
  {%- if config["devtoolset"] %}
      DESIRED_DEVTOOLSET: !{{ config["devtoolset"] }}
  {%- endif %}
+  {%- if config.use_split_build is defined %}
+      use_split_build: !{{ config["use_split_build"] }}
+  {%- endif %}
 {%- endif %}
 {%- if config["package_type"] == "libtorch" %}
  {%- if config["libtorch_config"] %}
@ -44,6 +47,7 @@
      # without this value pip does not get installed for some reason
      DESIRED_PYTHON: "3.8"
  {%- endif %}
+
 {%- else %}
      DESIRED_PYTHON: "!{{ config["python_version"] }}"
 {%- endif %}
--- a/.github/workflows/_bazel-build-test.yml
+++ b/.github/workflows/_bazel-build-test.yml
@ -27,6 +27,11 @@ on:
        type: string
        description: |
          A JSON description of what configs to run later on.
+      runner:
+        required: false
+        type: string
+        default: "linux.large"
+        description: Runner type

 env:
  GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
@ -34,7 +39,7 @@ env:
 jobs:
  filter:
    if: github.repository_owner == 'pytorch'
-    runs-on: [self-hosted, linux.large]
+    runs-on: ${{ inputs.runner }}
    outputs:
      test-matrix: ${{ steps.filter.outputs.test-matrix }}
      is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }}
--- a/.github/workflows/_binary-build-linux.yml
+++ b/.github/workflows/_binary-build-linux.yml
@ -21,6 +21,13 @@ on:
        default: 210
        type: number
        description: timeout for the job
+      use_split_build:
+        description: |
+          [Experimental] Build a libtorch-only wheel and build pytorch such that
+          it is built from the libtorch wheel.
+        required: false
+        type: boolean
+        default: false
      ALPINE_IMAGE:
        required: false
        type: string
@ -110,6 +117,7 @@ jobs:
      PR_NUMBER: ${{ github.event.pull_request.number }}
      PYTORCH_FINAL_PACKAGE_DIR: /artifacts
      SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+      USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
    steps:
      - name: Make the env permanent during this workflow (but not the secrets)
        shell: bash
@ -137,6 +145,7 @@ jobs:
            echo "PR_NUMBER=${{ env.PR_NUMBER }}"
            echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
            echo "SHA1=${{ env.SHA1 }}"
+            # Read the key exactly as it is declared in the job-level env block
+            echo "USE_SPLIT_BUILD=${{ env.USE_SPLIT_BUILD }}"
          } >> "${GITHUB_ENV} }}"

      - name: List the env
@ -246,6 +255,7 @@ jobs:
            -e PYTORCH_ROOT \
            -e SKIP_ALL_TESTS \
            -e PYTORCH_EXTRA_INSTALL_REQUIREMENTS \
+            -e USE_SPLIT_BUILD \
            --tty \
            --detach \
            -v "${GITHUB_WORKSPACE}/pytorch:/pytorch" \
--- a/.github/workflows/_binary-test-linux.yml
+++ b/.github/workflows/_binary-test-linux.yml
@ -63,6 +63,13 @@ on:
        required: true
        type: string
        description: Hardware to run this job on. Valid values are linux.4xlarge, linux.4xlarge.nvidia.gpu, linux.arm64.2xlarge, and linux.rocm.gpu
+      use_split_build:
+        description: |
+          [Experimental] Build a libtorch-only wheel and build pytorch such that
+          it is built from the libtorch wheel.
+        required: false
+        type: boolean
+        default: false
    secrets:
      github-token:
        required: true
@ -97,6 +104,7 @@ jobs:
      PR_NUMBER: ${{ github.event.pull_request.number }}
      PYTORCH_FINAL_PACKAGE_DIR: /artifacts
      SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+      USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
    steps:
      - name: Make the env permanent during this workflow (but not the secrets)
        shell: bash
@ -124,6 +132,7 @@ jobs:
            echo "PR_NUMBER=${{ env.PR_NUMBER }}"
            echo "PYTORCH_FINAL_PACKAGE_DIR=${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
            echo "SHA1=${{ env.SHA1 }}"
+            echo "USE_SPLIT_BUILD=${{ env.USE_SPLIT_BUILD }}"
          } >> "${GITHUB_ENV} }}"

      - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
--- a/.github/workflows/_binary-upload.yml
+++ b/.github/workflows/_binary-upload.yml
@ -55,6 +55,13 @@ on:
        required: false
        type: string
        description: Desired python version
+      use_split_build:
+        description: |
+          [Experimental] Build a libtorch-only wheel and build pytorch such that
+          it is built from the libtorch wheel.
+        required: false
+        type: boolean
+        default: false
    secrets:
      github-token:
        required: true
@ -93,6 +100,7 @@ jobs:
      PR_NUMBER: ${{ github.event.pull_request.number }}
      PYTORCH_FINAL_PACKAGE_DIR: /artifacts
      SHA1: ${{ github.event.pull_request.head.sha || github.sha }}
+      USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
    steps:
      - name: Checkout PyTorch
        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
--- a/.github/workflows/_linux-build-label.yml
+++ b/.github/workflows/_linux-build-label.yml
@ -56,6 +56,13 @@ on:
        required: false
        type: string
        default: ""
+      use_split_build:
+        description: |
+          [Experimental] Build a libtorch-only wheel and build pytorch such that
+          it is built from the libtorch wheel.
+        required: false
+        type: boolean
+        default: false
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
@ -107,3 +114,4 @@ jobs:
          aws-role-to-assume: ${{ inputs.aws-role-to-assume }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          use_split_build: ${{ inputs.use_split_build }}
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@ -64,6 +64,14 @@ on:
        required: false
        type: string
        default: ""
+      use_split_build:
+        description: |
+          [Experimental] Build a libtorch-only wheel and build pytorch such that
+          it is built from the libtorch wheel.
+        required: false
+        type: boolean
+        default: false
+
    secrets:
      HUGGING_FACE_HUB_TOKEN:
        required: false
@ -181,6 +189,7 @@ jobs:
          DEBUG: ${{ inputs.build-with-debug && '1' || '0' }}
          OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          USE_SPLIT_BUILD: ${{ inputs.use_split_build }}
        run: |
          # detached container should get cleaned up by teardown_ec2_linux
          container_name=$(docker run \
@ -199,6 +208,7 @@ jobs:
            -e PR_LABELS \
            -e OUR_GITHUB_JOB_ID \
            -e HUGGING_FACE_HUB_TOKEN \
+            -e USE_SPLIT_BUILD \
            --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
            --security-opt seccomp=unconfined \
            --cap-add=SYS_PTRACE \
@ -218,7 +228,7 @@ jobs:

      - name: Store PyTorch Build Artifacts on S3
        uses: seemethere/upload-artifact-s3@v5
-        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped'
+        # use_split_build is a boolean input, so negate it directly; a comparison
+        # against the string 'true' never compares equal
+        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && !inputs.use_split_build
        with:
          name: ${{ inputs.build-environment }}
          retention-days: 14
@ -226,6 +236,16 @@ jobs:
          path: artifacts.zip
          s3-bucket: ${{ inputs.s3-bucket }}

+      - name: Store PyTorch Build Artifacts on S3
+        uses: seemethere/upload-artifact-s3@v5
+        # Complement of the step above: runs only when the boolean input is true
+        if: inputs.build-generates-artifacts && steps.build.outcome != 'skipped' && inputs.use_split_build
+        with:
+          name: ${{ inputs.build-environment }}-experimental-split-build
+          retention-days: 14
+          if-no-files-found: error
+          path: artifacts.zip
+          s3-bucket: ${{ inputs.s3-bucket }}
+
      - name: Upload sccache stats
        if: steps.build.outcome != 'skipped'
        uses: seemethere/upload-artifact-s3@v5
--- a/.github/workflows/_runner-determinator.yml
+++ b/.github/workflows/_runner-determinator.yml
@ -3,39 +3,272 @@ name: Check whether the workflow owner can use ARC runners
 on:
  workflow_call:
    inputs:
-      user_name:
+      triggering_actor:
        required: true
        type: string
-        description: The name of the workflow owner.
+        description: The triggering_actor for the workflow. Use github.triggering_actor
+      issue_owner:
+        required: true
+        type: string
+        description: The owner of the issue. Use github.event.pull_request.user.login || github.event.issue.user.login
      curr_branch:
        required: true
        type: string
-        description: Current branch.
+        description: Current branch or tag.
+      curr_ref_type:
+        required: false
+        type: string
+        default: branch
+        description: The value of "github.ref_type", "branch" or "tag"
      issue_number:
        required: false
        type: string
        default: "5132"
+        description: |
+          Fetches the GitHub issue from pytorch/test-infra
+          Example: https://github.com/pytorch/test-infra/issues/5132

    outputs:
-      workflow-type:
+      label-type:
        description: Type of runners to use
-        value: ${{ jobs.runner-determinator.outputs.workflow-type }}
+        value: ${{ jobs.runner-determinator.outputs.label-type }}

 jobs:
  runner-determinator:
-    runs-on: linux.4xlarge
+    runs-on: ubuntu-latest
    outputs:
-      workflow-type: ${{ steps.set-condition.outputs.workflow-type }}
+      label-type: ${{ steps.set-condition.outputs.label-type }}
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      ISSUE_NUMBER: ${{ inputs.issue_number }}
-      USERNAME: ${{ inputs.user_name }}
+      TRIGGERING_ACTOR: ${{ inputs.triggering_actor }}
+      ISSUE_OWNER: ${{ inputs.issue_owner }}
    steps:
-      - name: Checkout PyTorch
-        uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
-        with:
-          fetch-depth: 1
-          submodules: true
+      # - name: Checkout PyTorch
+      #   uses: pytorch/pytorch/.github/actions/checkout-pytorch@main
+      #   with:
+      #     fetch-depth: 1
+      #     submodules: true
+
+      # TODO: Remove the hardcoded step below
+      # Hardcoding below is temporary for testing ALI runners
+      # This file below should match the script found in .github/scripts/runner_determinator.py
+      - name: Hardcode runner-determinator script
+        run: |
+          cat <<EOF > runner_determinator.py
+          # flake8: noqa: G004
+
+          import logging
+          import os
+          from argparse import ArgumentParser
+          from logging import LogRecord
+          from typing import Any, Iterable
+
+          from github import Auth, Github
+          from github.Issue import Issue
+
+
+          WORKFLOW_LABEL_META = ""  # use meta runners
+          WORKFLOW_LABEL_LF = "lf."  # use runners from the linux foundation
+
+          GITHUB_OUTPUT = os.getenv("GITHUB_OUTPUT", "")
+          GH_OUTPUT_KEY_LABEL_TYPE = "label-type"
+
+
+          class ColorFormatter(logging.Formatter):
+              """Color codes the log messages based on the log level"""
+
+              COLORS = {
+                  "WARNING": "\033[33m",  # Yellow
+                  "ERROR": "\033[31m",  # Red
+                  "CRITICAL": "\033[31m",  # Red
+                  "INFO": "\033[0m",  # Reset
+                  "DEBUG": "\033[0m",  # Reset
+              }
+
+              def format(self, record: LogRecord) -> str:
+                  log_color = self.COLORS.get(record.levelname, "\033[0m")  # Default to reset
+                  record.msg = f"{log_color}{record.msg}\033[0m"
+                  return super().format(record)
+
+
+          handler = logging.StreamHandler()
+          handler.setFormatter(ColorFormatter(fmt="%(levelname)-8s: %(message)s"))
+
+          log = logging.getLogger(os.path.basename(__file__))
+          log.addHandler(handler)
+          log.setLevel(logging.INFO)
+
+
+          def set_github_output(key: str, value: str) -> None:
+              """
+              Defines outputs of the github action that invokes this script
+              """
+              if not GITHUB_OUTPUT:
+                  # See https://github.blog/changelog/2022-10-11-github-actions-deprecating-save-state-and-set-output-commands/ for deprecation notice
+                  log.warning(
+                      "No env var found for GITHUB_OUTPUT, you must be running this code locally. Falling back to the deprecated print method."
+                  )
+                  print(f"::set-output name={key}::{value}")
+                  return
+
+              with open(GITHUB_OUTPUT, "a") as f:
+                  log.info(f"Setting output: {key}='{value}'")
+                  f.write(f"{key}={value}\n")
+
+
+          def parse_args() -> Any:
+              parser = ArgumentParser("Get dynamic rollout settings")
+              parser.add_argument("--github-token", type=str, required=True, help="GitHub token")
+              parser.add_argument(
+                  "--github-issue-repo",
+                  type=str,
+                  required=False,
+                  default="pytorch/test-infra",
+                  help="GitHub repo to get the issue",
+              )
+              parser.add_argument(
+                  "--github-repo",
+                  type=str,
+                  required=True,
+                  help="GitHub repo where CI is running",
+              )
+              parser.add_argument(
+                  "--github-issue", type=int, required=True, help="GitHub issue number"
+              )
+              parser.add_argument(
+                  "--github-actor", type=str, required=True, help="GitHub triggering_actor"
+              )
+              parser.add_argument(
+                  "--github-issue-owner", type=str, required=True, help="GitHub issue owner"
+              )
+              parser.add_argument(
+                  "--github-branch", type=str, required=True, help="Current GitHub branch or tag"
+              )
+              parser.add_argument(
+                  "--github-ref-type",
+                  type=str,
+                  required=True,
+                  help="Current GitHub ref type, branch or tag",
+              )
+
+              return parser.parse_args()
+
+
+          def get_gh_client(github_token: str) -> Github:
+              auth = Auth.Token(github_token)
+              return Github(auth=auth)
+
+
+          def get_issue(gh: Github, repo: str, issue_num: int) -> Issue:
+              repo = gh.get_repo(repo)
+              return repo.get_issue(number=issue_num)
+
+
+          def get_potential_pr_author(
+              gh: Github, repo: str, username: str, ref_type: str, ref_name: str
+          ) -> str:
+              """
+              Returns the login to treat as the workflow requestor.
+
+              When pytorch-bot[bot] pushed a ciflow/<name>/<pr-number> tag, the author of
+              that pull request (looked up in repo) is returned; in every other case the
+              given username is returned unchanged. Raises Exception when the pull
+              request lookup fails.
+              """
+              # If the trigger was a new tag added by a bot, this is a ciflow case
+              # Fetch the actual username from the original PR. The PR number is
+              # embedded in the tag name: ciflow/<name>/<pr-number>
+              if username == "pytorch-bot[bot]" and ref_type == "tag":
+                  split_tag = ref_name.split("/")
+                  if (
+                      len(split_tag) == 3
+                      and split_tag[0] == "ciflow"
+                      and split_tag[2].isnumeric()
+                  ):
+                      pr_number = split_tag[2]
+                      try:
+                          repository = gh.get_repo(repo)
+                          pull = repository.get_pull(number=int(pr_number))
+                      except Exception as e:
+                          # Name the repo by its input string: the repository variable
+                          # is still unbound when get_repo is the call that raised
+                          raise Exception(  # noqa: TRY002
+                              f"issue with pull request {pr_number} from repo {repo}"
+                          ) from e
+                      return pull.user.login
+              # In all other cases, return the original input username
+              return username
+
+
+          def is_exception_branch(branch: str) -> bool:
+              return branch.split("/")[0] in {"main", "nightly", "release", "landchecks"}
+
+
+          def get_workflow_type(issue: Issue, workflow_requestors: Iterable[str]) -> str:
+              """
+              Picks the runner label prefix from the first comment of the rollout issue.
+
+              A comment starting with "!" selects meta runners for everyone and one
+              starting with "*" selects LF runners for everyone. Otherwise the comment is
+              read as a whitespace-separated list of opted-in users, and LF runners are
+              selected when any of workflow_requestors is listed. Any exception falls
+              back to meta runners.
+              """
+              try:
+                  first_comment = issue.get_comments()[0].body.strip("\n\t ")
+
+                  # startswith (rather than indexing [0]) keeps an empty comment on the
+                  # regular "nobody opted in" path instead of raising IndexError
+                  if first_comment.startswith("!"):
+                      log.info("LF Workflows are disabled for everyone. Using meta runners.")
+                      return WORKFLOW_LABEL_META
+                  elif first_comment.startswith("*"):
+                      log.info("LF Workflows are enabled for everyone. Using LF runners.")
+                      return WORKFLOW_LABEL_LF
+                  else:
+                      all_opted_in_users = {
+                          usr_raw.strip("\n\t@ ") for usr_raw in first_comment.split()
+                      }
+                      opted_in_requestors = {
+                          usr for usr in workflow_requestors if usr in all_opted_in_users
+                      }
+                      if opted_in_requestors:
+                          log.info(
+                              f"LF Workflows are enabled for {', '.join(opted_in_requestors)}. Using LF runners."
+                          )
+                          return WORKFLOW_LABEL_LF
+                      else:
+                          log.info(
+                              f"LF Workflows are disabled for {', '.join(workflow_requestors)}. Using meta runners."
+                          )
+                          return WORKFLOW_LABEL_META
+
+              except Exception as e:
+                  log.error(
+                      f"Failed to determine workflow type. Falling back to meta runners. Exception: {e}"
+                  )
+                  return WORKFLOW_LABEL_META
+
+
+          def main() -> None:
+              args = parse_args()
+
+              if args.github_ref_type == "branch" and is_exception_branch(args.github_branch):
+                  log.info(f"Exception branch: '{args.github_branch}', using meta runners")
+                  label_type = WORKFLOW_LABEL_META
+              else:
+                  try:
+                      gh = get_gh_client(args.github_token)
+                      # The default issue we use - https://github.com/pytorch/test-infra/issues/5132
+                      issue = get_issue(gh, args.github_issue_repo, args.github_issue)
+                      username = get_potential_pr_author(
+                          gh,
+                          args.github_repo,
+                          args.github_actor,
+                          args.github_ref_type,
+                          args.github_branch,
+                      )
+                      label_type = get_workflow_type(
+                          issue,
+                          (
+                              args.github_issue_owner,
+                              username,
+                          ),
+                      )
+                  except Exception as e:
+                      log.error(
+                          f"Failed to get issue. Falling back to meta runners. Exception: {e}"
+                      )
+                      label_type = WORKFLOW_LABEL_META
+
+              set_github_output(GH_OUTPUT_KEY_LABEL_TYPE, label_type)
+
+
+          if __name__ == "__main__":
+              main()
+          EOF
+
+          cat runner_determinator.py

      - name: Install dependencies
        run: python3 -m pip install urllib3==1.26.18 PyGithub==2.3.0
@ -44,15 +277,14 @@ jobs:
        id: set-condition
        run: |
          curr_branch="${{ inputs.curr_branch }}"
+          curr_ref_type="${{ inputs.curr_ref_type }}"
          echo "Current branch is '$curr_branch'"

-          output="$(python3 .github/scripts/get_workflow_type.py \
+          python3 runner_determinator.py \
            --github-token "$GITHUB_TOKEN" \
            --github-issue "$ISSUE_NUMBER" \
            --github-branch "$curr_branch" \
-            --github-user "$USERNAME")"
-
-          echo "Output: '${output}'"
-
-          WORKFLOW_TYPE=$(echo "${output}" | jq -r '.workflow_type')
-          echo "workflow-type=$WORKFLOW_TYPE" >> "$GITHUB_OUTPUT"
+            --github-actor "$TRIGGERING_ACTOR" \
+            --github-issue-owner "$ISSUE_OWNER" \
+            --github-ref-type "$curr_ref_type" \
+            --github-repo "$GITHUB_REPOSITORY"
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@ -47,6 +47,9 @@ jobs:
    timeout-minutes: 240
    outputs:
      test-matrix: ${{ steps.filter.outputs.test-matrix }}
+    defaults:
+      run:
+        shell: bash
    steps:
      # Duplicated in win-test because this MUST go before a checkout
      - name: Enable git symlinks on Windows and disable fsmonitor daemon
@ -89,6 +92,7 @@ jobs:

      - name: Parse ref
        id: parse-ref
+        shell: bash
        run: python3 .github/scripts/parse_ref.py

      - name: Get workflow job id
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@ -41,6 +41,9 @@ jobs:
      fail-fast: false
    runs-on: ${{ matrix.runner }}
    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
+    defaults:
+      run:
+        shell: bash
    steps:
      # Duplicated in win-build because this MUST go before a checkout
      - name: Enable git symlinks on Windows and disable fsmonitor daemon
@ -224,6 +227,7 @@ jobs:

      - name: Parse ref
        id: parse-ref
+        shell: bash
        run: python3 .github/scripts/parse_ref.py

      - name: Uninstall PyTorch
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@ -5,6 +5,11 @@ on:
    branches:
      - main
      - release/*
+    tags:
+      # Final Release tags look like: v1.11.0
+      - v[0-9]+.[0-9]+.[0-9]+
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
  release:
    types: [published]
  pull_request:
@ -18,6 +23,8 @@ jobs:
    # https://github.com/softprops/action-gh-release?tab=readme-ov-file#permissions
    permissions:
      contents: write
+    outputs:
+      pt_release_name: ${{ steps.release_name.outputs.pt_release_name }}
    steps:
      - uses: malfet/checkout@silent-checkout
        with:
@ -49,11 +56,44 @@ jobs:
            # Create archive
            tar -czf "$PT_RELEASE_FILE" "$PT_RELEASE_NAME"
            echo "Created source archive $PT_RELEASE_FILE with content: $(ls -a "$PT_RELEASE_NAME")"
-      - name: Upload source distribution
+      - name: Upload source distribution for release
        if: ${{ github.event_name == 'release' }}
        uses: softprops/action-gh-release@v1
        with:
          files: ${{env.PT_RELEASE_FILE}}
+      - name: Upload source distribution to GHA artifacts for release tags
+        if: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: ${{ env.PT_RELEASE_FILE }}
+          path: ${{ env.PT_RELEASE_FILE }}
+      - name: Set output
+        id: release_name
+        # Write to GITHUB_OUTPUT; the ::set-output workflow command is deprecated
+        run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}"
+
+  upload_source_code_to_s3:
+    if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
+    runs-on: linux.2xlarge
+    environment: sourcecode-upload
+    name: Upload source code to S3 for release tags
+    permissions:
+      id-token: write
+    needs: release
+    steps:
+      - uses: actions/download-artifact@v2
+        with:
+          name: ${{ needs.release.outputs.pt_release_name }}
+      - name: Configure AWS credentials(PyTorch account)
+        uses: aws-actions/configure-aws-credentials@v3
+        with:
+          role-to-assume: arn:aws:iam::749337293305:role/gha_pytorch_source_code_upload_role
+          aws-region: us-east-1
+      - uses: seemethere/upload-artifact-s3@v5
+        with:
+          s3-bucket: pytorch
+          s3-prefix: source_code/test
+          if-no-files-found: warn
+          path: ${{ needs.release.outputs.pt_release_name }}

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name }}
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -54,6 +54,7 @@ jobs:
          pytorch-linux-focal-py3-clang9-android-ndk-r21e,
          pytorch-linux-jammy-py3.8-gcc11,
          pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks,
+          pytorch-linux-jammy-py3.12-halide,
          pytorch-linux-jammy-xpu-2024.0-py3,
          pytorch-linux-jammy-py3-clang15-asan,
          pytorch-linux-focal-py3-clang10-onnx,
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -54,7 +54,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_8-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_8-cpu-aarch64-test:  # Testing
@ -162,7 +162,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_9-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cpu-aarch64-test:  # Testing
@ -270,7 +270,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cpu-aarch64-test:  # Testing
@ -378,7 +378,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cpu-aarch64-test:  # Testing
@ -486,7 +486,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cpu-aarch64
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cpu-aarch64-test:  # Testing
--- a/.github/workflows/generated-linux-binary-manywheel-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-main.yml
@ -48,7 +48,7 @@ jobs:
      DESIRED_PYTHON: "3.8"
      build_name: manywheel-py3_8-cuda11_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_8-cuda11_8-test:  # Testing
@ -72,6 +72,48 @@ jobs:
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}

+  manywheel-py3_8-cuda11_8-split-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
+      use_split_build: True
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cuda11_8-split
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_8-cuda11_8-split-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: manywheel-py3_8-cuda11_8-split-build
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu118
+      GPU_ARCH_VERSION: 11.8
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda11.8-main
+      use_split_build: True
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cuda11_8-split
+      build_environment: linux-binary-manywheel
+      runs_on: linux.4xlarge.nvidia.gpu
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+
  manywheel-py3_8-cuda12_1-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -88,7 +130,7 @@ jobs:
      DESIRED_PYTHON: "3.8"
      build_name: manywheel-py3_8-cuda12_1
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_8-cuda12_1-test:  # Testing
@ -112,6 +154,48 @@ jobs:
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}

+  manywheel-py3_8-cuda12_1-split-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu121
+      GPU_ARCH_VERSION: 12.1
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
+      use_split_build: True
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cuda12_1-split
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_8-cuda12_1-split-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: manywheel-py3_8-cuda12_1-split-build
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu121
+      GPU_ARCH_VERSION: 12.1
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.1-main
+      use_split_build: True
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cuda12_1-split
+      build_environment: linux-binary-manywheel
+      runs_on: linux.4xlarge.nvidia.gpu
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+
  manywheel-py3_8-cuda12_4-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -128,7 +212,7 @@ jobs:
      DESIRED_PYTHON: "3.8"
      build_name: manywheel-py3_8-cuda12_4
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_8-cuda12_4-test:  # Testing
@ -151,3 +235,45 @@ jobs:
      runs_on: linux.4xlarge.nvidia.gpu
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
+
+  manywheel-py3_8-cuda12_4-split-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_VERSION: 12.4
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
+      use_split_build: True
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cuda12_4-split
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_8-cuda12_4-split-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs: manywheel-py3_8-cuda12_4-split-build
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      BUILDER_ROOT: /builder
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu124
+      GPU_ARCH_VERSION: 12.4
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main
+      use_split_build: True
+      DESIRED_PYTHON: "3.8"
+      build_name: manywheel-py3_8-cuda12_4-split
+      build_environment: linux-binary-manywheel
+      runs_on: linux.4xlarge.nvidia.gpu
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
--- a/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-s390x-binary-manywheel-nightly.yml
@ -54,7 +54,7 @@ jobs:
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      build_name: manywheel-py3_8-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_8-cpu-s390x-test:  # Testing
@ -117,7 +117,7 @@ jobs:
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      build_name: manywheel-py3_9-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cpu-s390x-test:  # Testing
@ -180,7 +180,7 @@ jobs:
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      build_name: manywheel-py3_10-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cpu-s390x-test:  # Testing
@ -243,7 +243,7 @@ jobs:
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      build_name: manywheel-py3_11-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cpu-s390x-test:  # Testing
@ -306,7 +306,7 @@ jobs:
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      build_name: manywheel-py3_12-cpu-s390x
      build_environment: linux-s390x-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cpu-s390x-test:  # Testing
--- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml
@ -46,7 +46,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.8"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
      # For sccache access (only on non-forked PRs)
      AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -165,7 +165,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
      # For sccache access (only on non-forked PRs)
      AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -284,7 +284,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
      # For sccache access (only on non-forked PRs)
      AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -403,7 +403,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
      # For sccache access (only on non-forked PRs)
      AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
@ -522,7 +522,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
      # For sccache access (only on non-forked PRs)
      AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }}
--- a/.github/workflows/generated-windows-binary-wheel-nightly.yml
+++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml
@ -46,7 +46,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.8"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -290,7 +290,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.8"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -536,7 +536,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.8"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -782,7 +782,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.8"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -1027,7 +1027,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -1271,7 +1271,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -1517,7 +1517,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -1763,7 +1763,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.9"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -2008,7 +2008,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -2252,7 +2252,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -2498,7 +2498,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -2744,7 +2744,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.10"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -2989,7 +2989,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -3233,7 +3233,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -3479,7 +3479,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -3725,7 +3725,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.11"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -3970,7 +3970,7 @@ jobs:
      GPU_ARCH_TYPE: cpu
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -4214,7 +4214,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -4460,7 +4460,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
@ -4706,7 +4706,7 @@ jobs:
      GPU_ARCH_TYPE: cuda
      SKIP_ALL_TESTS: 1
      DESIRED_PYTHON: "3.12"
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.1.0.70; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.21.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'
    steps:
      - name: Display EC2 information
        shell: bash
--- a/.github/workflows/inductor-cu124.yml
+++ b/.github/workflows/inductor-cu124.yml
@ -28,7 +28,8 @@ jobs:
      cuda-arch-list: '8.6'
      test-matrix: |
        { include: [
-          { config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
          { config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
          { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
          { config: "inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -95,7 +96,8 @@ jobs:
      cuda-arch-list: '8.6'
      test-matrix: |
        { include: [
-          { config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
        ]}

  linux-focal-cuda12_4-py3_12-gcc9-inductor-test:
--- a/.github/workflows/inductor-periodic.yml
+++ b/.github/workflows/inductor-periodic.yml
@ -56,3 +56,29 @@ jobs:
      test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build.outputs.test-matrix }}
    secrets:
      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+
+  linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp:
+    name: cuda12.1-py3.10-gcc9-sm80
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
+      cuda-arch-list: '8.0'
+      test-matrix: |
+        { include: [
+          { config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
+        ]}
+    secrets:
+      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+
+  linux-focal-cuda12_1-py3_10-gcc9-inductor-test-gcp:
+    name: cuda12.1-py3.10-gcc9-sm80
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp
+    with:
+      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
+      docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp.outputs.test-matrix }}
+      use-gha: anything-non-empty-to-use-gha
+    secrets:
+      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@ -24,7 +24,8 @@ jobs:
      docker-image-name: pytorch-linux-focal-rocm-n-py3
      test-matrix: |
        { include: [
-          { config: "inductor", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.2" },
+          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.2" },
+          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.2" },
        ]}

  linux-focal-rocm6_1-py3_8-inductor-test:
@ -48,7 +49,8 @@ jobs:
      cuda-arch-list: '8.6'
      test-matrix: |
        { include: [
-          { config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
          { config: "inductor_distributed", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
          { config: "inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
          { config: "inductor_timm", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
@ -81,32 +83,6 @@ jobs:
    secrets:
      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}

-  linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp:
-    name: cuda12.1-py3.10-gcc9-sm80
-    uses: ./.github/workflows/_linux-build.yml
-    with:
-      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
-      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9-inductor-benchmarks
-      cuda-arch-list: '8.0'
-      test-matrix: |
-        { include: [
-          { config: "inductor_torchbench_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.gcp.a100" },
-        ]}
-    secrets:
-      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-
-  linux-focal-cuda12_1-py3_10-gcc9-inductor-test-gcp:
-    name: cuda12.1-py3.10-gcc9-sm80
-    uses: ./.github/workflows/_linux-test.yml
-    needs: linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp
-    with:
-      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80
-      docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build-gcp.outputs.test-matrix }}
-      use-gha: anything-non-empty-to-use-gha
-    secrets:
-      HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
-
  linux-focal-cuda12_1-py3_12-gcc9-inductor-build:
    name: cuda12.1-py3.12-gcc9-sm86
    uses: ./.github/workflows/_linux-build.yml
@ -116,7 +92,8 @@ jobs:
      cuda-arch-list: '8.6'
      test-matrix: |
        { include: [
-          { config: "inductor", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "inductor", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" },
        ]}

  linux-focal-cuda12_1-py3_12-gcc9-inductor-test:
@ -128,6 +105,26 @@ jobs:
      docker-image: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-focal-cuda12_1-py3_12-gcc9-inductor-build.outputs.test-matrix }}

+  linux-jammy-cpu-py3_12-inductor-halide-build:
+    name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      build-environment: linux-jammy-py3.12-gcc11
+      docker-image-name: pytorch-linux-jammy-py3.12-halide
+      test-matrix: |
+        { include: [
+          { config: "inductor-halide", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
+        ]}
+
+  linux-jammy-cpu-py3_12-inductor-halide-test:
+    name: linux-jammy-cpu-py3.12-gcc11-inductor-halide
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-jammy-cpu-py3_12-inductor-halide-build
+    with:
+      build-environment: linux-jammy-py3.12-gcc11
+      docker-image: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cpu-py3_12-inductor-halide-build.outputs.test-matrix }}
+
  linux-focal-cuda12_4-py3_10-gcc9-inductor-build:
    # Should be synced with the one in inductor-periodic.yml but this only runs inductor_timm
    name: cuda12.4-py3.10-gcc9-sm86
@ -175,11 +172,21 @@ jobs:
          { config: "cpu_inductor_timm_freezing", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
          { config: "cpu_inductor_torchbench_freezing", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
          { config: "cpu_inductor_torchbench_freezing", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
+          { config: "cpu_inductor_huggingface_amp_freezing", shard: 1, num_shards: 1, runner: "linux.16xlarge.spr" },
+          { config: "cpu_inductor_timm_amp_freezing", shard: 1, num_shards: 2, runner: "linux.16xlarge.spr" },
+          { config: "cpu_inductor_timm_amp_freezing", shard: 2, num_shards: 2, runner: "linux.16xlarge.spr" },
+          { config: "cpu_inductor_torchbench_amp_freezing", shard: 1, num_shards: 2, runner: "linux.16xlarge.spr" },
+          { config: "cpu_inductor_torchbench_amp_freezing", shard: 2, num_shards: 2, runner: "linux.16xlarge.spr" },
          { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
          { config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
          { config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
          { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
          { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
+          { config: "cpu_aot_inductor_huggingface_freezing", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
+          { config: "cpu_aot_inductor_timm_freezing", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
+          { config: "cpu_aot_inductor_timm_freezing", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
+          { config: "cpu_aot_inductor_torchbench_freezing", shard: 1, num_shards: 2, runner: "linux.12xlarge" },
+          { config: "cpu_aot_inductor_torchbench_freezing", shard: 2, num_shards: 2, runner: "linux.12xlarge" },
          { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.24xl.spr-metal" },
        ]}
    secrets:
--- a/.github/workflows/llm_td_retrieval.yml
+++ b/.github/workflows/llm_td_retrieval.yml
@ -36,33 +36,24 @@ jobs:
          ref: v0.0.2
          path: llm-target-determinator

-      - name: Setup Conda
-        uses: conda-incubator/setup-miniconda@v2.1.1
+      - name: Setup miniconda
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
        with:
-          miniconda-version: "py39_4.12.0"
-          python-version: 3.9
+          python-version: "3.9"

-      - name: Install Requirements
+      - name: Install requirements
        shell: bash -l {0}
        run: |
          set -euxo pipefail
-          conda create \
-            --yes \
-            --quiet \
-            --name "tdenv" \
-            "python=3.9"
-          conda activate tdenv
-          cd "${GITHUB_WORKSPACE}/llm-target-determinator"
-          pip install -r requirements.txt
-          cd ../codellama
-          pip install -e .
+          ${CONDA_RUN} pip install -r llm-target-determinator/requirements.txt
+          cd "${GITHUB_WORKSPACE}/codellama"
+          ${CONDA_RUN} pip install -e .

      - name: Fetch CodeLlama Checkpoint
        shell: bash -l {0}
        run: |
          set -euxo pipefail
-          conda activate tdenv
-          cd codellama/
+          cd "${GITHUB_WORKSPACE}/codellama"
          mkdir "CodeLlama-7b-Python"
          aws s3 cp "s3://target-determinator-assets/CodeLlama-7b-Python" "CodeLlama-7b-Python" --recursive --no-progress

@ -75,7 +66,7 @@ jobs:
          shell: bash
          command: |
            set -euxo pipefail
-            python3 -m pip install awscli==1.29.40
+            ${CONDA_RUN} python -m pip install awscli==1.29.40
            cd "${GITHUB_WORKSPACE}"/llm-target-determinator/assets
            aws s3 cp "s3://target-determinator-assets/indexes/latest" . --recursive

@ -88,9 +79,8 @@ jobs:
        shell: bash -l {0}
        run: |
          set -euxo pipefail
-          conda activate tdenv
          cd "${GITHUB_WORKSPACE}"/llm-target-determinator
-          torchrun \
+          ${CONDA_RUN} torchrun \
            --standalone \
            --nnodes=1 \
            --nproc-per-node=1 \
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@ -73,7 +73,6 @@ jobs:
          { config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
          { config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
          { config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
          { config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
          { config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
@ -295,3 +294,53 @@ jobs:
      build-environment: linux-focal-rocm6.1-py3.8
      docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
+
+  linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build:
+    name: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
+    uses: ./.github/workflows/_linux-build-label.yml
+    with:
+      use_split_build: true
+      build-environment: linux-focal-cuda12.1-py3.10-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
+      test-matrix: |
+        { include: [
+          { config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
+          { config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
+          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
+        ]}
+  linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build-test:
+    name: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build
+      - target-determination
+    with:
+      build-environment: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
+      docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
+
+
+  linux-focal-cuda11_8-py3_9-gcc9-experimental-split-build:
+    name: linux-focal-cuda11.8-py3.9-gcc9-experimental-split-build
+    uses: ./.github/workflows/_linux-build-label.yml
+    with:
+      use_split_build: true
+      build-environment: linux-focal-cuda11.8-py3.9-gcc9
+      docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
+      cuda-arch-list: 8.6
+      test-matrix: |
+        { include: [
+          { config: "multigpu", shard: 1, num_shards: 1, runner: "linux.g5.12xlarge.nvidia.gpu" },
+        ]}
+      build-with-debug: false
+
+  linux-focal-cuda11_8-py3_9-gcc9-experimental-split-build-test:
+    name: linux-focal-cuda11.8-py3.9-gcc9-experimental-split-build-test
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-focal-cuda11_8-py3_9-gcc9-experimental-split-build
+      - target-determination
+    with:
+      build-environment: linux-focal-cuda11.8-py3.9-gcc9-experimental-split-build
+      docker-image: ${{ needs.linux-focal-cuda11_8-py3_9-gcc9-experimental-split-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda11_8-py3_9-gcc9-experimental-split-build.outputs.test-matrix }}
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -35,22 +35,33 @@ jobs:
      id-token: write
      contents: read

+  get-label-type:
+    name: get-label-type
+    uses: ./.github/workflows/_runner-determinator.yml
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+
  linux-jammy-py3_8-gcc11-build:
    name: linux-jammy-py3.8-gcc11
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-jammy-py3.8-gcc11
      docker-image-name: pytorch-linux-jammy-py3.8-gcc11
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "docs_test", shard: 1, num_shards: 1,  runner: "linux.2xlarge" },
-          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-          { config: "backwards_compat", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
-          { config: "distributed", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "distributed", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
+          { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "docs_test", shard: 1, num_shards: 1,  runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "backwards_compat", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
        ]}

  linux-jammy-py3_8-gcc11-test:
@ -75,7 +86,9 @@ jobs:
  linux-jammy-py3_8-gcc11-no-ops:
    name: linux-jammy-py3.8-gcc11-no-ops
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-jammy-py3.8-gcc11-no-ops
      docker-image-name: pytorch-linux-jammy-py3.8-gcc11
      test-matrix: |
@ -86,7 +99,9 @@ jobs:
  linux-jammy-py3_8-gcc11-pch:
    name: linux-jammy-py3.8-gcc11-pch
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-jammy-py3.8-gcc11-pch
      docker-image-name: pytorch-linux-jammy-py3.8-gcc11
      test-matrix: |
@ -98,17 +113,19 @@ jobs:
  linux-jammy-py3_10-clang15-asan-build:
    name: linux-jammy-py3.10-clang15-asan
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-jammy-py3.10-clang15-asan
      docker-image-name: pytorch-linux-jammy-py3-clang15-asan
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 6, runner: "linux.4xlarge" },
-          { config: "default", shard: 2, num_shards: 6, runner: "linux.4xlarge" },
-          { config: "default", shard: 3, num_shards: 6, runner: "linux.4xlarge" },
-          { config: "default", shard: 4, num_shards: 6, runner: "linux.4xlarge" },
-          { config: "default", shard: 5, num_shards: 6, runner: "linux.4xlarge" },
-          { config: "default", shard: 6, num_shards: 6, runner: "linux.4xlarge" },
+          { config: "default", shard: 1, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "default", shard: 2, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "default", shard: 3, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "default", shard: 4, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "default", shard: 5, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "default", shard: 6, num_shards: 6, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
        ]}
      sync-tag: asan-build

@ -128,13 +145,15 @@ jobs:
  linux-focal-py3_8-clang10-onnx-build:
    name: linux-focal-py3.8-clang10-onnx
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-focal-py3.8-clang10-onnx
      docker-image-name: pytorch-linux-focal-py3-clang10-onnx
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
+          { config: "default", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
        ]}

  linux-focal-py3_8-clang10-onnx-test:
@ -151,19 +170,22 @@ jobs:
  linux-focal-py3_8-clang10-build:
    name: linux-focal-py3.8-clang10
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-focal-py3.8-clang10
      docker-image-name: pytorch-linux-focal-py3.8-clang10
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
+          { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
        ]}
  linux-focal-py3_8-clang10-test:
    name: linux-focal-py3.8-clang10
@ -179,22 +201,24 @@ jobs:
  linux-focal-py3_11-clang10-build:
    name: linux-focal-py3.11-clang10
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-focal-py3.11-clang10
      docker-image-name: pytorch-linux-focal-py3.11-clang10
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
+          { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "crossref", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "crossref", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
        ]}

-
  linux-focal-py3_11-clang10-test:
    name: linux-focal-py3.11-clang10
    uses: ./.github/workflows/_linux-test.yml
@ -209,17 +233,20 @@ jobs:
  linux-focal-py3_12-clang10-build:
    name: linux-focal-py3.12-clang10
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-focal-py3.12-clang10
      docker-image-name: pytorch-linux-focal-py3.12-clang10
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
-          { config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
+          { config: "default", shard: 1, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 2, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 3, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "default", shard: 4, num_shards: 4, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "dynamo", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
        ]}

  linux-focal-py3_12-clang10-test:
@ -235,14 +262,16 @@ jobs:
  linux-focal-cuda11_8-py3_10-gcc9-build:
    name: linux-focal-cuda11.8-py3.10-gcc9
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-focal-cuda11.8-py3.10-gcc9
      docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
      test-matrix: |
        { include: [
-          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
-          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
-          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.nvidia.gpu" },
        ]}

  linux-focal-cuda11_8-py3_10-gcc9-test:
@ -260,17 +289,18 @@ jobs:
  linux-focal-cuda12_1-py3_10-gcc9-build:
    name: linux-focal-cuda12.1-py3.10-gcc9
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-focal-cuda12.1-py3.10-gcc9
      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
-          { config: "deploy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
        ]}

  linux-focal-cuda12_1-py3_10-gcc9-test:
@ -288,7 +318,9 @@ jobs:
  linux-jammy-py3-clang12-mobile-build:
    name: linux-jammy-py3-clang12-mobile-build
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-jammy-py3-clang12-mobile-build
      docker-image-name: pytorch-linux-jammy-py3-clang15-asan
      build-generates-artifacts: false
@ -300,7 +332,9 @@ jobs:
  linux-jammy-cuda-11_8-cudnn9-py3_8-clang12-build:
    name: linux-jammy-cuda11.8-cudnn9-py3.8-clang12
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-jammy-cuda11.8-cudnn9-py3.8-clang12
      docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn9-py3.8-clang12
      test-matrix: |
@ -311,7 +345,9 @@ jobs:
  linux-focal-py3-clang9-mobile-custom-build-static:
    name: linux-focal-py3-clang9-mobile-custom-build-static
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-focal-py3-clang9-mobile-custom-build-static
      docker-image-name: pytorch-linux-focal-py3-clang9-android-ndk-r21e
      build-generates-artifacts: false
@ -323,12 +359,14 @@ jobs:
  linux-focal-py3_8-clang9-xla-build:
    name: linux-focal-py3_8-clang9-xla
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-focal-py3.8-clang9-xla
      docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.1-lite
      test-matrix: |
        { include: [
-          { config: "xla", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
+          { config: "xla", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.12xlarge" },
        ]}

  linux-focal-py3_8-clang9-xla-test:
@ -359,37 +397,43 @@ jobs:
  linux-focal-cpu-py3_10-gcc9-bazel-test:
    name: linux-focal-cpu-py3.10-gcc9-bazel-test
    uses: ./.github/workflows/_bazel-build-test.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.large"
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-bazel-test
      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
      cuda-version: cpu
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 1, runner: "linux.4xlarge" },
+          { config: "default", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
        ]}

  linux-focal-cuda12_1-py3_10-gcc9-bazel-test:
    name: linux-focal-cuda12.1-py3.10-gcc9-bazel-test
    uses: ./.github/workflows/_bazel-build-test.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.large"
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-bazel-test
      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
      cuda-version: "12.1"
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
        ]}

  linux-focal-cuda12_4-py3_10-gcc9-bazel-test:
    name: linux-focal-cuda12.4-py3.10-gcc9-bazel-test
    uses: ./.github/workflows/_bazel-build-test.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.large"
      build-environment: linux-focal-cuda12.4-py3.10-gcc9-bazel-test
      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
      cuda-version: "12.4"
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
        ]}

  linux-focal-py3-clang9-android-ndk-r21e-gradle-custom-build-single:
@ -417,7 +461,9 @@ jobs:
  linux-jammy-py3_8-gcc11-mobile-lightweight-dispatch-build:
    name: linux-jammy-py3.8-gcc11-mobile-lightweight-dispatch-build
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-jammy-py3.8-gcc111-mobile-lightweight-dispatch-build
      docker-image-name: pytorch-linux-jammy-py3.8-gcc11
      build-generates-artifacts: false
@ -431,7 +477,9 @@ jobs:
    if: github.event_name == 'pull_request'
    name: linux-focal-rocm6.1-py3.8
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-focal-rocm6.1-py3.8
      docker-image-name: pytorch-linux-focal-rocm-n-py3
      sync-tag: rocm-build
@ -445,17 +493,19 @@ jobs:
  linux-focal-cuda12_1-py3_10-gcc9-sm86-build:
    name: linux-focal-cuda12.1-py3.10-gcc9-sm86
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86
      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
      cuda-arch-list: 8.6
      test-matrix: |
        { include: [
-          { config: "default", shard: 1, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 2, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 3, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 4, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
-          { config: "default", shard: 5, num_shards: 5, runner: "linux.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g5.4xlarge.nvidia.gpu" },
        ]}

  linux-focal-cuda12_1-py3_10-gcc9-sm86-test:
@ -472,12 +522,14 @@ jobs:
  linux-jammy-py3-clang12-executorch-build:
    name: linux-jammy-py3-clang12-executorch
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-jammy-py3-clang12-executorch
      docker-image-name: pytorch-linux-jammy-py3-clang12-executorch
      test-matrix: |
        { include: [
-          { config: "executorch", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
+          { config: "executorch", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
        ]}

  linux-jammy-py3-clang12-executorch-test:
@ -488,3 +540,59 @@ jobs:
      build-environment: linux-jammy-py3-clang12-executorch
      docker-image: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-py3-clang12-executorch-build.outputs.test-matrix }}
+
+  linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build:
+    name: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
+    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
+    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
+      use_split_build: true
+      build-environment: linux-focal-cuda12.1-py3.10-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.1-cudnn9-py3-gcc9
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+        ]}
+
+  linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build-test:
+    name: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build
+      - target-determination
+    with:
+      timeout-minutes: 360
+      build-environment: linux-focal-cuda12.1-py3.10-gcc9-experimental-split-build
+      docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
+
+  linux-focal-py3_12-clang10-experimental-split-build:
+    name: linux-focal-py3.12-clang10-experimental-split-build
+    uses: ./.github/workflows/_linux-build-label.yml
+    with:
+      use_split_build: True
+      build-environment: linux-focal-py3.12-clang10
+      docker-image-name: pytorch-linux-focal-py3.12-clang10
+      test-matrix: |
+        { include: [
+          { config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
+          { config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
+          { config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
+          { config: "dynamo", shard: 1, num_shards: 3, runner: "linux.2xlarge" },
+          { config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" },
+          { config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" },
+        ]}
+  linux-focal-py3_12-clang10-experimental-split-build-test:
+    name: linux-focal-py3.12-clang10-experimental-split-build
+    uses: ./.github/workflows/_linux-test.yml
+    needs: linux-focal-py3_12-clang10-experimental-split-build
+    with:
+      build-environment: linux-focal-py3.12-clang10-experimental-split-build
+      docker-image: ${{ needs.linux-focal-py3_12-clang10-experimental-split-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-py3_12-clang10-experimental-split-build.outputs.test-matrix }}
+      timeout-minutes: 600
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@ -36,6 +36,15 @@ jobs:
      id-token: write
      contents: read

+  get-label-type:
+    name: get-label-type
+    uses: ./.github/workflows/_runner-determinator.yml
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+
  linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-build:
    name: linux-focal-cuda12.1-py3-gcc9-slow-gradcheck
    uses: ./.github/workflows/_linux-build.yml
@ -97,7 +106,8 @@ jobs:
      docker-image-name: pytorch-linux-focal-py3.8-clang10
      test-matrix: |
        { include: [
-          { config: "slow", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
+          { config: "slow", shard: 1, num_shards: 2, runner: "linux.2xlarge" },
+          { config: "slow", shard: 2, num_shards: 2, runner: "linux.2xlarge" },
        ]}

  linux-focal-py3_8-clang10-test:
@ -119,7 +129,8 @@ jobs:
      docker-image-name: pytorch-linux-focal-rocm-n-py3
      test-matrix: |
        { include: [
-          { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" },
+          { config: "slow", shard: 1, num_shards: 2, runner: "linux.rocm.gpu" },
+          { config: "slow", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" },
        ]}

  linux-focal-rocm6_1-py3_8-test:
@ -139,14 +150,16 @@ jobs:
  linux-jammy-py3_10-clang15-asan-build:
    name: linux-jammy-py3.10-clang15-asan
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-jammy-py3.10-clang15-asan
      docker-image-name: pytorch-linux-jammy-py3-clang15-asan
      test-matrix: |
        { include: [
-          { config: "slow", shard: 1, num_shards: 3, runner: "linux.4xlarge" },
-          { config: "slow", shard: 2, num_shards: 3, runner: "linux.4xlarge" },
-          { config: "slow", shard: 3, num_shards: 3, runner: "linux.4xlarge" },
+          { config: "slow", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "slow", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
+          { config: "slow", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" },
        ]}
      sync-tag: asan-build

--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@ -34,6 +34,15 @@ jobs:
      id-token: write
      contents: read

+  get-label-type:
+    name: get-label-type
+    uses: ./.github/workflows/_runner-determinator.yml
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+
  linux-focal-cuda12_4-py3_10-gcc9-sm86-build:
    name: linux-focal-cuda12.4-py3.10-gcc9-sm86
    uses: ./.github/workflows/_linux-build-label.yml
@ -213,7 +222,9 @@ jobs:
  linux-focal-rocm6_1-py3_8-build:
    name: linux-focal-rocm6.1-py3.8
    uses: ./.github/workflows/_linux-build-label.yml
+    needs: get-label-type
    with:
+      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
      build-environment: linux-focal-rocm6.1-py3.8
      docker-image-name: pytorch-linux-focal-rocm-n-py3
      sync-tag: rocm-build
@ -238,3 +249,59 @@ jobs:
      docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }}
      tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
+
+  linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build:
+    name: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build
+    uses: ./.github/workflows/_linux-build-label.yml
+    with:
+      use_split_build: true
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9
+      docker-image-name: pytorch-linux-focal-cuda12.4-cudnn9-py3-gcc9
+      test-matrix: |
+        { include: [
+          { config: "nogpu_AVX512", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
+          { config: "nogpu_NO_AVX2", shard: 1, num_shards: 1, runner: "linux.2xlarge" },
+          { config: "jit_legacy", shard: 1, num_shards: 1, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 1, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 2, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 3, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 4, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
+          { config: "default", shard: 5, num_shards: 5, runner: "linux.4xlarge.nvidia.gpu" },
+        ]}
+
+  linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build-test:
+    name: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build-test
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build
+      - target-determination
+    with:
+      build-environment: linux-focal-cuda12.4-py3.10-gcc9-experimental-split-build
+      docker-image: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda12_4-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
+
+  linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build:
+    name: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build
+    uses: ./.github/workflows/_linux-build-label.yml
+    with:
+      use_split_build: true
+      build-environment: linux-focal-cuda11.8-py3.10-gcc9
+      docker-image-name: pytorch-linux-focal-cuda11.8-cudnn9-py3-gcc9
+      test-matrix: |
+        { include: [
+          { config: "distributed", shard: 1, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 2, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
+          { config: "distributed", shard: 3, num_shards: 3, runner: "linux.8xlarge.nvidia.gpu" },
+        ]}
+
+  linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build-test:
+    name: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build-test
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build
+      - target-determination
+    with:
+      timeout-minutes: 360
+      build-environment: linux-focal-cuda11.8-py3.10-gcc9-experimental-split-build
+      docker-image: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-experimental-split-build.outputs.test-matrix }}
--- a/.github/workflows/trymerge.yml
+++ b/.github/workflows/trymerge.yml
@ -9,6 +9,8 @@ jobs:
    name: try_merge_pr_${{ github.event.client_payload.pr_num }}
    runs-on: linux.20_04.4x
    environment: mergebot
+    permissions:
+      id-token: write
    env:
        GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
    steps:
@ -43,6 +45,7 @@ jobs:
          IGNORE_CURRENT: ${{ github.event.client_payload.ignore_current }}
          ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }}
          DRCI_BOT_KEY: ${{ secrets.DRCI_BOT_KEY }}
+          GITHUB_RUN_ID: ${{ github.run_id }}
        run: |
          set -x
          if [ -n "${REBASE}" ]; then
@ -84,6 +87,22 @@ jobs:
          set -x
          python3 .github/scripts/comment_on_pr.py "${PR_NUM}" "merge"

+      - name: configure aws credentials
+        uses: aws-actions/configure-aws-credentials@v3
+        continue-on-error: true
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/upload_to_ossci_raw_job_status
+          aws-region: us-east-1
+
+      - name: Upload merge record to s3
+        if: always()
+        continue-on-error: true
+        uses: seemethere/upload-artifact-s3@v5
+        with:
+          s3-bucket: ossci-raw-job-status
+          s3-prefix: merges/${{ github.repository }}/${{ github.event.client_payload.pr_num }}/${{ github.event.client_payload.comment_id }}/${{ github.run_id }}
+          path: merge_record.json
+
 # We want newer merge commands to supercede old ones
 concurrency:
  group: try-merge-${{ github.event.client_payload.pr_num }}
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@ -25,10 +25,7 @@ jobs:

  upload-test-stats:
    needs: get_workflow_conclusion
-    if:
-      github.repository_owner == 'pytorch' &&
-      (github.event.workflow_run.conclusion == 'success' || github.event.workflow_run.conclusion == 'failure' ||
-      needs.get_workflow_conclusion.outputs.conclusion == 'success' || needs.get_workflow_conclusion.outputs.conclusion == 'failure')
+    if: github.repository_owner == 'pytorch'
    runs-on: ubuntu-22.04
    environment: upload-stats
    name: Upload test stats for ${{ github.event.workflow_run.id }}, attempt ${{ github.event.workflow_run.run_attempt }}
--- a/.gitignore
+++ b/.gitignore
@ -129,6 +129,7 @@ env
 scripts/release_notes/*.json
 sccache-stats*.json
 lint.json
+merge_record.json

 # These files get copied over on invoking setup.py
 torchgen/packaged/*
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -68,6 +68,8 @@ include_patterns = [
    'aten/src/ATen/native/cudnn/*.cpp',
    'c10/**/*.h',
    'c10/**/*.cpp',
+    'distributed/c10d/*DMAConnectivity.*',
+    'distributed/c10d/*SymmetricMemory.*',
    'torch/csrc/**/*.h',
    'torch/csrc/**/*.hpp',
    'torch/csrc/**/*.cpp',
@ -136,7 +138,7 @@ init_command = [
    'numpy==1.24.3 ; python_version == "3.8"',
    'numpy==1.26.0 ; python_version >= "3.9"',
    'expecttest==0.1.6',
-    'mypy==1.9.0',
+    'mypy==1.10.0',
    'sympy==1.11.1',
    'types-requests==2.27.25',
    'types-PyYAML==6.0.7',
@ -202,6 +204,8 @@ include_patterns = [
    'torch/csrc/*.cpp',
    'torch/csrc/**/*.h',
    'torch/csrc/**/*.cpp',
+    'torch/csrc/jit/serialization/*.h',
+    'torch/csrc/jit/serialization/*.cpp',
 ]
 exclude_patterns = [
    # The negative filters below are to exclude files that include onnx_pb.h or
@ -216,7 +220,6 @@ exclude_patterns = [
    'c10/util/complex_math.h',
    'c10/util/complex_utils.h',
    'c10/util/flat_hash_map.h',
-    'c10/util/Float8*.h',
    'c10/util/logging*.h',
    'c10/util/hash.h',
    'c10/util/strong_type.h',
@ -224,7 +227,6 @@ exclude_patterns = [
    'c10/util/win32-headers.h',
    'c10/util/*inl.h',
    'c10/test/**/*.h',
-    'aten/src/ATen/core/TensorImpl_test.cpp',
    'third_party/**/*',
    'torch/csrc/api/**',
    'torch/csrc/autograd/generated/**',
@ -232,10 +234,8 @@ exclude_patterns = [
    'torch/csrc/dynamo/eval_frame.h',
    'torch/csrc/inductor/**/*',
    'torch/csrc/jit/**/*',
-    'torch/csrc/jit/serialization/import_legacy.cpp',
-    'torch/csrc/jit/serialization/export.cpp',
+    'torch/csrc/jit/serialization/mobile_bytecode_generated.h',
    'torch/csrc/lazy/**/*',
-    'torch/csrc/mps/**/*',
 ]
 init_command = [
    'python3',
@ -999,7 +999,6 @@ command = [
 ]
 exclude_patterns = [
    'tools/gen_vulkan_spv.py',
-    'torch/__init__.py',  # Skip this file to format because it's part of the public API
    # We don't care too much about files in this directory, don't enforce
    # formatting on them
    'caffe2/**/*.py',
@ -1099,14 +1098,12 @@ exclude_patterns = [
    'test/test_namedtuple_return_api.py',
    'test/test_native_functions.py',
    'test/test_native_mha.py',
-    'test/test_nestedtensor.py',
    'test/test_nn.py',
    'test/test_out_dtype_op.py',
    'test/test_overrides.py',
    'test/test_prims.py',
    'test/test_proxy_tensor.py',
    'test/test_pruning_op.py',
-    'test/test_public_bindings.py',
    'test/test_quantization.py',
    'test/test_reductions.py',
    'test/test_scatter_gather_ops.py',
@ -1132,8 +1129,6 @@ exclude_patterns = [
    'test/test_type_promotion.py',
    'test/test_unary_ufuncs.py',
    'test/test_vulkan.py',
-    'test/test_xnnpack_integration.py',
-    'test/torch_np/numpy_test/**/*.py',
    'torch/_awaits/__init__.py',
    'torch/_custom_op/__init__.py',
    'torch/_custom_op/autograd.py',
@ -1194,9 +1189,6 @@ exclude_patterns = [
    'torch/_export/serde/upgrade.py',
    'torch/_export/trace.py',
    'torch/_export/verifier.py',
-    'torch/_higher_order_ops/__init__.py',
-    'torch/_higher_order_ops/out_dtype.py',
-    'torch/_higher_order_ops/wrap.py',
    'torch/_vendor/**',
    'torch/ao/__init__.py',
    'torch/ao/nn/__init__.py',
@ -1393,172 +1385,8 @@ exclude_patterns = [
    'torch/contrib/_tensorboard_vis.py',
    "torch/cuda/_gpu_trace.py",
    'torch/cuda/_memory_viz.py',  # mypy: Value of type "object" is not indexable
-    'torch/distributed/__init__.py',
-    'torch/distributed/_composable_state.py',
-    'torch/distributed/_shard/__init__.py',
-    'torch/distributed/_shard/_utils.py',
-    'torch/distributed/_shard/api.py',
-    'torch/distributed/_shard/checkpoint/__init__.py',
-    'torch/distributed/_shard/common_op_utils.py',
-    'torch/distributed/_shard/metadata.py',
-    'torch/distributed/_shard/op_registry_utils.py',
-    'torch/distributed/_shard/sharded_optim/__init__.py',
-    'torch/distributed/_shard/sharded_optim/api.py',
-    'torch/distributed/_shard/sharded_tensor/__init__.py',
-    'torch/distributed/_shard/sharded_tensor/_ops/__init__.py',
-    'torch/distributed/_shard/sharded_tensor/_ops/_common.py',
-    'torch/distributed/_shard/sharded_tensor/_ops/binary_cmp.py',
-    'torch/distributed/_shard/sharded_tensor/_ops/init.py',
-    'torch/distributed/_shard/sharded_tensor/_ops/misc_ops.py',
-    'torch/distributed/_shard/sharded_tensor/_ops/tensor_ops.py',
-    'torch/distributed/_shard/sharded_tensor/api.py',
-    'torch/distributed/_shard/sharded_tensor/logger.py',
-    'torch/distributed/_shard/sharded_tensor/logging_handlers.py',
-    'torch/distributed/_shard/sharded_tensor/metadata.py',
-    'torch/distributed/_shard/sharded_tensor/reshard.py',
-    'torch/distributed/_shard/sharded_tensor/shard.py',
-    'torch/distributed/_shard/sharded_tensor/utils.py',
-    'torch/distributed/_shard/sharder.py',
-    'torch/distributed/_shard/sharding_plan/__init__.py',
-    'torch/distributed/_shard/sharding_plan/api.py',
-    'torch/distributed/_shard/sharding_spec/__init__.py',
-    'torch/distributed/_shard/sharding_spec/_internals.py',
-    'torch/distributed/_shard/sharding_spec/api.py',
-    'torch/distributed/_shard/sharding_spec/chunk_sharding_spec.py',
-    'torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/__init__.py',
-    'torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/_common.py',
-    'torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding.py',
-    'torch/distributed/_shard/sharding_spec/chunk_sharding_spec_ops/embedding_bag.py',
-    'torch/distributed/_sharded_tensor/__init__.py',
-    'torch/distributed/_sharding_spec/__init__.py',
-    'torch/distributed/_tools/__init__.py',
-    'torch/distributed/_tools/memory_tracker.py',
-    'torch/distributed/algorithms/__init__.py',
-    'torch/distributed/algorithms/_checkpoint/__init__.py',
-    'torch/distributed/algorithms/_checkpoint/checkpoint_wrapper.py',
-    'torch/distributed/algorithms/_comm_hooks/__init__.py',
-    'torch/distributed/algorithms/_comm_hooks/default_hooks.py',
-    'torch/distributed/algorithms/_optimizer_overlap/__init__.py',
-    'torch/distributed/algorithms/_optimizer_overlap/optimizer_overlap.py',
-    'torch/distributed/algorithms/_quantization/__init__.py',
-    'torch/distributed/algorithms/_quantization/quantization.py',
-    'torch/distributed/algorithms/ddp_comm_hooks/__init__.py',
-    'torch/distributed/algorithms/ddp_comm_hooks/ddp_zero_hook.py',
-    'torch/distributed/algorithms/ddp_comm_hooks/debugging_hooks.py',
-    'torch/distributed/algorithms/ddp_comm_hooks/default_hooks.py',
-    'torch/distributed/algorithms/ddp_comm_hooks/mixed_precision_hooks.py',
-    'torch/distributed/algorithms/ddp_comm_hooks/optimizer_overlap_hooks.py',
-    'torch/distributed/algorithms/ddp_comm_hooks/post_localSGD_hook.py',
-    'torch/distributed/algorithms/ddp_comm_hooks/powerSGD_hook.py',
-    'torch/distributed/algorithms/ddp_comm_hooks/quantization_hooks.py',
-    'torch/distributed/algorithms/join.py',
-    'torch/distributed/algorithms/model_averaging/__init__.py',
-    'torch/distributed/algorithms/model_averaging/averagers.py',
-    'torch/distributed/algorithms/model_averaging/hierarchical_model_averager.py',
-    'torch/distributed/algorithms/model_averaging/utils.py',
-    'torch/distributed/argparse_util.py',
-    'torch/distributed/autograd/__init__.py',
-    'torch/distributed/benchmarks/benchmark_ddp_rpc.py',
-    'torch/distributed/c10d_logger.py',
-    'torch/distributed/collective_utils.py',
-    'torch/distributed/constants.py',
-    'torch/distributed/distributed_c10d.py',
-    'torch/distributed/elastic/__init__.py',
-    'torch/distributed/elastic/agent/__init__.py',
-    'torch/distributed/elastic/agent/server/__init__.py',
-    'torch/distributed/elastic/agent/server/api.py',
-    'torch/distributed/elastic/agent/server/local_elastic_agent.py',
-    'torch/distributed/elastic/events/__init__.py',
-    'torch/distributed/elastic/events/api.py',
-    'torch/distributed/elastic/events/handlers.py',
-    'torch/distributed/elastic/metrics/__init__.py',
-    'torch/distributed/elastic/metrics/api.py',
-    'torch/distributed/elastic/multiprocessing/__init__.py',
-    'torch/distributed/elastic/multiprocessing/api.py',
-    'torch/distributed/elastic/multiprocessing/errors/__init__.py',
-    'torch/distributed/elastic/multiprocessing/errors/error_handler.py',
-    'torch/distributed/elastic/multiprocessing/errors/handlers.py',
-    'torch/distributed/elastic/multiprocessing/redirects.py',
-    'torch/distributed/elastic/multiprocessing/tail_log.py',
-    'torch/distributed/elastic/rendezvous/__init__.py',
-    'torch/distributed/elastic/rendezvous/api.py',
-    'torch/distributed/elastic/rendezvous/c10d_rendezvous_backend.py',
-    'torch/distributed/elastic/rendezvous/dynamic_rendezvous.py',
-    'torch/distributed/elastic/rendezvous/etcd_rendezvous.py',
-    'torch/distributed/elastic/rendezvous/etcd_rendezvous_backend.py',
-    'torch/distributed/elastic/rendezvous/etcd_server.py',
-    'torch/distributed/elastic/rendezvous/etcd_store.py',
-    'torch/distributed/elastic/rendezvous/registry.py',
-    'torch/distributed/elastic/rendezvous/static_tcp_rendezvous.py',
-    'torch/distributed/elastic/rendezvous/utils.py',
-    'torch/distributed/elastic/timer/__init__.py',
-    'torch/distributed/elastic/timer/api.py',
-    'torch/distributed/elastic/timer/file_based_local_timer.py',
-    'torch/distributed/elastic/timer/local_timer.py',
-    'torch/distributed/elastic/utils/__init__.py',
-    'torch/distributed/elastic/utils/api.py',
-    'torch/distributed/elastic/utils/data/__init__.py',
-    'torch/distributed/elastic/utils/data/cycling_iterator.py',
-    'torch/distributed/elastic/utils/data/elastic_distributed_sampler.py',
-    'torch/distributed/elastic/utils/distributed.py',
-    'torch/distributed/elastic/utils/log_level.py',
-    'torch/distributed/elastic/utils/logging.py',
-    'torch/distributed/elastic/utils/store.py',
-    'torch/distributed/examples/memory_tracker_example.py',
-    'torch/distributed/launch.py',
-    'torch/distributed/launcher/__init__.py',
-    'torch/distributed/launcher/api.py',
-    'torch/distributed/logging_handlers.py',
-    'torch/distributed/nn/__init__.py',
-    'torch/distributed/nn/api/__init__.py',
-    'torch/distributed/nn/api/remote_module.py',
-    'torch/distributed/nn/functional.py',
-    'torch/distributed/nn/jit/__init__.py',
-    'torch/distributed/nn/jit/instantiator.py',
-    'torch/distributed/nn/jit/templates/__init__.py',
-    'torch/distributed/nn/jit/templates/remote_module_template.py',
-    'torch/distributed/optim/__init__.py',
-    'torch/distributed/optim/apply_optimizer_in_backward.py',
-    'torch/distributed/optim/functional_adadelta.py',
-    'torch/distributed/optim/functional_adagrad.py',
-    'torch/distributed/optim/functional_adam.py',
-    'torch/distributed/optim/functional_adamax.py',
-    'torch/distributed/optim/functional_adamw.py',
-    'torch/distributed/optim/functional_rmsprop.py',
-    'torch/distributed/optim/functional_rprop.py',
-    'torch/distributed/optim/functional_sgd.py',
-    'torch/distributed/optim/named_optimizer.py',
-    'torch/distributed/optim/optimizer.py',
-    'torch/distributed/optim/post_localSGD_optimizer.py',
-    'torch/distributed/optim/utils.py',
-    'torch/distributed/optim/zero_redundancy_optimizer.py',
-    'torch/distributed/remote_device.py',
-    'torch/distributed/rendezvous.py',
-    'torch/distributed/rpc/__init__.py',
-    'torch/distributed/rpc/_testing/__init__.py',
-    'torch/distributed/rpc/_testing/faulty_agent_backend_registry.py',
-    'torch/distributed/rpc/_utils.py',
-    'torch/distributed/rpc/api.py',
-    'torch/distributed/rpc/backend_registry.py',
-    'torch/distributed/rpc/constants.py',
-    'torch/distributed/rpc/functions.py',
-    'torch/distributed/rpc/internal.py',
-    'torch/distributed/rpc/options.py',
-    'torch/distributed/rpc/rref_proxy.py',
-    'torch/distributed/rpc/server_process_global_profiler.py',
-    'torch/distributed/run.py',
-    'torch/distributed/tensor/__init__.py',
-    'torch/distributed/tensor/parallel/__init__.py',
-    'torch/distributed/tensor/parallel/_utils.py',
-    'torch/distributed/tensor/parallel/_view_with_dim_change.py',
-    'torch/distributed/tensor/parallel/api.py',
-    'torch/distributed/tensor/parallel/fsdp.py',
-    'torch/distributed/tensor/parallel/input_reshard.py',
-    'torch/distributed/tensor/parallel/multihead_attention_tp.py',
-    'torch/distributed/tensor/parallel/style.py',
    'torch/fft/__init__.py',
    'torch/func/__init__.py',
-    'torch/functional.py',
    'torch/futures/__init__.py',
    'torch/fx/__init__.py',
    'torch/fx/_compatibility.py',
@ -1644,20 +1472,9 @@ exclude_patterns = [
    'torch/fx/subgraph_rewriter.py',
    'torch/fx/tensor_type.py',
    'torch/fx/traceback.py',
-    'torch/hub.py',
-    'torch/library.py',
    'torch/linalg/__init__.py',
    'torch/monitor/__init__.py',
    'torch/nested/__init__.py',
-    'torch/nn/__init__.py',
-    'torch/nn/_reduction.py',
-    'torch/nn/backends/__init__.py',
-    'torch/nn/backends/thnn.py',
-    'torch/nn/common_types.py',
-    'torch/nn/cpp.py',
-    'torch/nn/functional.py',
-    'torch/nn/grad.py',
-    'torch/nn/init.py',
    'torch/nn/intrinsic/__init__.py',
    'torch/nn/intrinsic/modules/__init__.py',
    'torch/nn/intrinsic/modules/fused.py',
@ -1674,40 +1491,6 @@ exclude_patterns = [
    'torch/nn/intrinsic/quantized/modules/bn_relu.py',
    'torch/nn/intrinsic/quantized/modules/conv_relu.py',
    'torch/nn/intrinsic/quantized/modules/linear_relu.py',
-    'torch/nn/modules/__init__.py',
-    'torch/nn/modules/_functions.py',
-    'torch/nn/modules/activation.py',
-    'torch/nn/modules/adaptive.py',
-    'torch/nn/modules/batchnorm.py',
-    'torch/nn/modules/channelshuffle.py',
-    'torch/nn/modules/container.py',
-    'torch/nn/modules/conv.py',
-    'torch/nn/modules/distance.py',
-    'torch/nn/modules/dropout.py',
-    'torch/nn/modules/flatten.py',
-    'torch/nn/modules/fold.py',
-    'torch/nn/modules/instancenorm.py',
-    'torch/nn/modules/lazy.py',
-    'torch/nn/modules/linear.py',
-    'torch/nn/modules/loss.py',
-    'torch/nn/modules/module.py',
-    'torch/nn/modules/normalization.py',
-    'torch/nn/modules/padding.py',
-    'torch/nn/modules/pixelshuffle.py',
-    'torch/nn/modules/pooling.py',
-    'torch/nn/modules/rnn.py',
-    'torch/nn/modules/sparse.py',
-    'torch/nn/modules/transformer.py',
-    'torch/nn/modules/upsampling.py',
-    'torch/nn/modules/utils.py',
-    'torch/nn/parallel/__init__.py',
-    'torch/nn/parallel/_functions.py',
-    'torch/nn/parallel/comm.py',
-    'torch/nn/parallel/data_parallel.py',
-    'torch/nn/parallel/parallel_apply.py',
-    'torch/nn/parallel/replicate.py',
-    'torch/nn/parallel/scatter_gather.py',
-    'torch/nn/parameter.py',
    'torch/nn/qat/__init__.py',
    'torch/nn/qat/dynamic/__init__.py',
    'torch/nn/qat/dynamic/modules/__init__.py',
@ -1745,35 +1528,6 @@ exclude_patterns = [
    'torch/nn/quantized/modules/normalization.py',
    'torch/nn/quantized/modules/rnn.py',
    'torch/nn/quantized/modules/utils.py',
-    'torch/nn/utils/__init__.py',
-    'torch/nn/utils/_deprecation_utils.py',
-    'torch/nn/utils/_expanded_weights/__init__.py',
-    'torch/nn/utils/_expanded_weights/conv_expanded_weights.py',
-    'torch/nn/utils/_expanded_weights/conv_utils.py',
-    'torch/nn/utils/_expanded_weights/embedding_expanded_weights.py',
-    'torch/nn/utils/_expanded_weights/expanded_weights_impl.py',
-    'torch/nn/utils/_expanded_weights/expanded_weights_utils.py',
-    'torch/nn/utils/_expanded_weights/group_norm_expanded_weights.py',
-    'torch/nn/utils/_expanded_weights/instance_norm_expanded_weights.py',
-    'torch/nn/utils/_expanded_weights/layer_norm_expanded_weights.py',
-    'torch/nn/utils/_expanded_weights/linear_expanded_weights.py',
-    'torch/nn/utils/_per_sample_grad.py',
-    'torch/nn/utils/clip_grad.py',
-    'torch/nn/utils/convert_parameters.py',
-    'torch/nn/utils/fusion.py',
-    'torch/nn/utils/init.py',
-    'torch/nn/utils/memory_format.py',
-    'torch/nn/utils/parametrizations.py',
-    'torch/nn/utils/parametrize.py',
-    'torch/nn/utils/prune.py',
-    'torch/nn/utils/rnn.py',
-    'torch/nn/utils/spectral_norm.py',
-    'torch/nn/utils/weight_norm.py',
-    'torch/overrides.py',
-    'torch/quasirandom.py',
-    'torch/random.py',
-    'torch/return_types.py',
-    'torch/serialization.py',
    'torch/signal/__init__.py',
    'torch/signal/windows/__init__.py',
    'torch/signal/windows/windows.py',
@ -1782,7 +1536,6 @@ exclude_patterns = [
    'torch/sparse/_triton_ops.py',
    'torch/sparse/semi_structured.py',
    'torch/special/__init__.py',
-    'torch/storage.py',
    'torch/testing/_internal/__init__.py',
    'torch/testing/_internal/autocast_test_lists.py',
    'torch/testing/_internal/autograd_function_db.py',
@ -1790,9 +1543,7 @@ exclude_patterns = [
    'torch/testing/_internal/codegen/__init__.py',
    'torch/testing/_internal/codegen/random_topo_test.py',
    'torch/testing/_internal/common_cuda.py',
-    'torch/testing/_internal/common_device_type.py',
    'torch/testing/_internal/common_distributed.py',
-    'torch/testing/_internal/common_dtype.py',
    'torch/testing/_internal/common_jit.py',
    'torch/testing/_internal/common_methods_invocations.py',
    'torch/testing/_internal/common_modules.py',
@ -1857,7 +1608,6 @@ exclude_patterns = [
    'torch/testing/_internal/test_module/__init__.py',
    'torch/testing/_internal/test_module/future_div.py',
    'torch/testing/_internal/test_module/no_future_div.py',
-    'torch/utils/__init__.py',
    'torch/utils/_contextlib.py',
    'torch/utils/_cpp_extension_versioner.py',
    'torch/utils/_crash_handler.py',
@ -1908,53 +1658,6 @@ exclude_patterns = [
    'torch/utils/collect_env.py',
    'torch/utils/cpp_backtrace.py',
    'torch/utils/cpp_extension.py',
-    'torch/utils/data/__init__.py',
-    'torch/utils/data/_utils/__init__.py',
-    'torch/utils/data/_utils/collate.py',
-    'torch/utils/data/_utils/fetch.py',
-    'torch/utils/data/_utils/pin_memory.py',
-    'torch/utils/data/_utils/serialization.py',
-    'torch/utils/data/_utils/signal_handling.py',
-    'torch/utils/data/_utils/worker.py',
-    'torch/utils/data/backward_compatibility.py',
-    'torch/utils/data/dataloader.py',
-    'torch/utils/data/datapipes/__init__.py',
-    'torch/utils/data/datapipes/_decorator.py',
-    'torch/utils/data/datapipes/_hook_iterator.py',
-    'torch/utils/data/datapipes/_typing.py',
-    'torch/utils/data/datapipes/dataframe/__init__.py',
-    'torch/utils/data/datapipes/dataframe/dataframe_wrapper.py',
-    'torch/utils/data/datapipes/dataframe/dataframes.py',
-    'torch/utils/data/datapipes/dataframe/datapipes.py',
-    'torch/utils/data/datapipes/dataframe/structures.py',
-    'torch/utils/data/datapipes/datapipe.py',
-    'torch/utils/data/datapipes/gen_pyi.py',
-    'torch/utils/data/datapipes/iter/__init__.py',
-    'torch/utils/data/datapipes/iter/callable.py',
-    'torch/utils/data/datapipes/iter/combinatorics.py',
-    'torch/utils/data/datapipes/iter/combining.py',
-    'torch/utils/data/datapipes/iter/filelister.py',
-    'torch/utils/data/datapipes/iter/fileopener.py',
-    'torch/utils/data/datapipes/iter/grouping.py',
-    'torch/utils/data/datapipes/iter/routeddecoder.py',
-    'torch/utils/data/datapipes/iter/selecting.py',
-    'torch/utils/data/datapipes/iter/sharding.py',
-    'torch/utils/data/datapipes/iter/streamreader.py',
-    'torch/utils/data/datapipes/iter/utils.py',
-    'torch/utils/data/datapipes/map/__init__.py',
-    'torch/utils/data/datapipes/map/callable.py',
-    'torch/utils/data/datapipes/map/combinatorics.py',
-    'torch/utils/data/datapipes/map/combining.py',
-    'torch/utils/data/datapipes/map/grouping.py',
-    'torch/utils/data/datapipes/map/utils.py',
-    'torch/utils/data/datapipes/utils/__init__.py',
-    'torch/utils/data/datapipes/utils/common.py',
-    'torch/utils/data/datapipes/utils/decoder.py',
-    'torch/utils/data/datapipes/utils/snapshot.py',
-    'torch/utils/data/distributed.py',
-    'torch/utils/data/graph.py',
-    'torch/utils/data/graph_settings.py',
-    'torch/utils/data/sampler.py',
    'torch/utils/dlpack.py',
    'torch/utils/file_baton.py',
    'torch/utils/flop_counter.py',
@ -1994,8 +1697,9 @@ init_command = [
    '--dry-run={{DRYRUN}}',
    '--no-black-binary',
    'black==23.12.1',
-    'ufmt==2.1.0',
-    'usort==1.0.6',
+    'ufmt==2.7.0',
+    'usort==1.0.8.post1',
+    'isort==5.13.2',
 ]
 is_formatter = true

@ -2079,7 +1783,7 @@ init_command = [
    'python3',
    'tools/linter/adapters/pip_init.py',
    '--dry-run={{DRYRUN}}',
-    'ruff==0.4.8',
+    'ruff==0.5.0',
 ]
 is_formatter = true

--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -461,7 +461,6 @@ filegroup(
 filegroup(
    name = "caffe2_perfkernels_srcs",
    srcs = [
-        "caffe2/perfkernels/embedding_lookup.cc",
        "caffe2/perfkernels/embedding_lookup_idx.cc",
    ],
 )
@ -499,7 +498,6 @@ cc_library(
    hdrs = [
        "caffe2/core/common.h",
        "caffe2/perfkernels/common.h",
-        "caffe2/perfkernels/embedding_lookup.h",
        "caffe2/perfkernels/embedding_lookup_idx.h",
        "caffe2/utils/fixed_divisor.h",
    ] + glob([
@ -746,6 +744,7 @@ cc_library(
            "torch/csrc/cuda/python_nccl.cpp",
            "torch/csrc/cuda/nccl.cpp",
            "torch/csrc/distributed/c10d/intra_node_comm.cu",
+            "torch/csrc/distributed/c10d/CUDASymmetricMemory.cu",
            "torch/csrc/distributed/c10d/Utils.cu",
            "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu",
        ],
@ -763,6 +762,7 @@ cc_library(
        ":torch_headers",
        "@kineto",
        "@cpp-httplib",
+        "@nlohmann",
    ] + if_cuda([
        "@cuda//:nvToolsExt",
        "@cutlass",
--- a/30
+++ b/30
@ -43,12 +43,12 @@ nn/qat/ @jerryzh168
 /torch/csrc/distributed/rpc/tensorpipe_agent.h @jiayisuse @osalpekar @lw

 # ONNX Export
-/torch/_dynamo/backends/onnxrt.py @bowenbao @thiagocrepaldi @wschin
-/torch/csrc/jit/passes/onnx.h @bowenbao @thiagocrepaldi
-/torch/csrc/jit/passes/onnx.cpp @bowenbao @thiagocrepaldi
-/torch/csrc/jit/passes/onnx/ @bowenbao @thiagocrepaldi
-/torch/onnx/ @bowenbao @thiagocrepaldi @wschin
-/test/onnx/ @bowenbao @thiagocrepaldi @wschin
+/torch/_dynamo/backends/onnxrt.py @wschin @xadupre
+/torch/csrc/jit/passes/onnx.h @titaiwangms @shubhambhokare1 @xadupre
+/torch/csrc/jit/passes/onnx.cpp @titaiwangms @shubhambhokare1 @xadupre
+/torch/csrc/jit/passes/onnx/ @titaiwangms @shubhambhokare1 @xadupre
+/torch/onnx/ @titaiwangms @shubhambhokare1 @justinchuby @wschin @xadupre
+/test/onnx/  @titaiwangms @shubhambhokare1 @justinchuby @wschin @xadupre

 # CI
 /.ci  @pytorch/pytorch-dev-infra
@ -57,6 +57,7 @@ nn/qat/ @jerryzh168
 /.ci/docker/ @jeffdaily
 /.ci/docker/ci_commit_pins/triton.txt @desertfire @Chillee @eellison @shunting314 @bertmaher @jeffdaily @jataylo @jithunnair-amd @pruthvistony
 /.ci/docker/ci_commit_pins/triton-rocm.txt @jeffdaily @jataylo @jithunnair-amd @pruthvistony
+/.ci/docker/ci_commit_pins/triton-xpu.txt @EikanWang @gujinghui

 # Github Actions
 # This list is for people wanting to be notified every time there's a change
@ -107,10 +108,10 @@ aten/src/ATen/detail/MTIAHooksInterface.h @egienvalue
 torch/csrc/mtia/ @egienvalue

 # Profiler
-torch/csrc/autograd/profiler* @aaronenyeshi
-torch/autograd/profiler* @aaronenyeshi
-torch/csrc/profiler/ @aaronenyeshi
-torch/profiler/ @aaronenyeshi
+torch/csrc/autograd/profiler* @aaronenyeshi @sraikund16
+torch/autograd/profiler* @aaronenyeshi @sraikund16
+torch/csrc/profiler/ @aaronenyeshi @sraikund16
+torch/profiler/ @aaronenyeshi @sraikund16

 # AOTDispatch tests
 test/functorch/test_aotdispatch.py @ezyang @Chillee
@ -132,6 +133,15 @@ caffe2/operators/hip @jeffdaily @jithunnair-amd
 caffe2/operators/rnn/hip @jeffdaily @jithunnair-amd
 caffe2/utils/hip @jeffdaily @jithunnair-amd

+# XPU-specific files
+/aten/src/ATen/xpu/ @EikanWang @gujinghui
+/c10/xpu/ @EikanWang @gujinghui
+/torch/csrc/xpu/ @EikanWang @gujinghui
+/torch/xpu/ @EikanWang @gujinghui
+/test/xpu/ @EikanWang @gujinghui
+/test/test_xpu.py @EikanWang @gujinghui
+/third_party/xpu.txt @EikanWang @gujinghui
+
 # torch.export
 /torch/export/ @avikchaudhuri @gmagogsfm @tugsbayasgalan @zhxchen17
 /torch/_export/ @avikchaudhuri @gmagogsfm @tugsbayasgalan @zhxchen17
--- a/5
+++ b/5
@ -77,6 +77,11 @@ RUN case ${TARGETPLATFORM} in \
    esac && \
    /opt/conda/bin/conda clean -ya
 RUN /opt/conda/bin/pip install torchelastic
+RUN IS_CUDA=$(python -c 'import torch ; print(torch.cuda._is_compiled())'); \
+    echo "Is torch compiled with cuda: ${IS_CUDA}"; \
+    if test "${IS_CUDA}" != "True" -a ! -z "${CUDA_VERSION}"; then \
+        exit 1; \
+    fi

 FROM ${BASE_IMAGE} as official
 ARG PYTORCH_VERSION
--- a/RELEASE.md
+++ b/RELEASE.md
@ -290,7 +290,7 @@ After the final RC is created. The following tasks should be performed :

 * Create validation issue for the release, see for example [Validations for 2.1.2 release](https://github.com/pytorch/pytorch/issues/114904) and perform required validations.

-* Run performance tests in [benchmark repository](https://github.com/pytorch/benchmark). Make sure there are no prerformance regressions.
+* Run performance tests in [benchmark repository](https://github.com/pytorch/benchmark). Make sure there are no performance regressions.

 * Prepare and stage PyPI binaries for promotion. This is done with this script:
 [`pytorch/builder:release/pypi/promote_pypi_to_staging.sh`](https://github.com/pytorch/builder/blob/main/release/pypi/promote_pypi_to_staging.sh)
@ -429,12 +429,12 @@ need to support these particular versions of software.

 ## Operating Systems
 Supported OS flavors are summarized in the table below:
-| Operating System family | Architectrue | Notes |
+| Operating System family | Architecture | Notes |
 | --- | --- | --- |
 | Linux | aarch64, x86_64 | Wheels are manylinux2014 compatible, i.e. they should be runnable on any Linux system with glibc-2.17 or above. |
 | MacOS | arm64 | Builds should be compatible with MacOS 11 (Big Sur) or newer, but are actively tested against MacOS 14 (Sonoma). |
 | MacOS | x86_64 | Requires MacOS Catalina or above, not supported after 2.2, see https://github.com/pytorch/pytorch/issues/114602 |
-| Windows | x86_64 | Buils are compatible with Windows-10 or newer. |
+| Windows | x86_64 | Builds are compatible with Windows-10 or newer. |

 # Submitting Tutorials

--- a/SECURITY.md
+++ b/SECURITY.md
@ -6,7 +6,7 @@
   - [Untrusted inputs](#untrusted-inputs)
   - [Data privacy](#data-privacy)
   - [Using distributed features](#using-distributed-features)
-
+- [**CI/CD security principles**](#cicd-security-principles)
 ## Reporting Security Issues

 Beware that none of the topics under [Using Pytorch Securely](#using-pytorch-securely) are considered vulnerabilities of Pytorch.
@ -61,3 +61,27 @@ If applicable, prepare your model against bad inputs and prompt injections. Some
 PyTorch can be used for distributed computing, and as such there is a `torch.distributed` package. PyTorch Distributed features are intended for internal communication only. They are not built for use in untrusted environments or networks.

 For performance reasons, none of the PyTorch Distributed primitives (including c10d, RPC, and TCPStore) include any authorization protocol and will send messages unencrypted. They accept connections from anywhere, and execute the workload sent without performing any checks. Therefore, if you run a PyTorch Distributed program on your network, anybody with access to the network can execute arbitrary code with the privileges of the user running PyTorch.
+
+## CI/CD security principles
+_Audience_: Contributors and reviewers, especially if modifying the workflow files/build system.
+
+PyTorch CI/CD security philosophy is based on finding a balance between open and transparent CI pipelines while keeping the environment efficient and safe.
+
+PyTorch testing requirements are complex, and a large part of the code base can only be tested on specialized powerful hardware, such as GPU, making it a lucrative target for resource misuse. To prevent this, we require workflow run approval for PRs from non-member contributors. To keep the volume of those approvals relatively low, we easily extend write permissions to the repository to regular contributors.
+
+More widespread write access to the repo presents challenges when it comes to reviewing changes, merging code into trunk, and creating releases. [Protected branches](https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/managing-protected-branches/about-protected-branches) are used to restrict the ability to merge to the trunk/release branches only to the repository administrators and merge bot. The merge bot is responsible for mechanistically merging the change and validating reviews against the path-based rules defined in [merge_rules.yaml](https://github.com/pytorch/pytorch/blob/main/.github/merge_rules.yaml). Once a PR has been reviewed by person(s) mentioned in these rules, leaving a `@pytorchbot merge` comment on the PR will initiate the merge process. To protect merge bot credentials from leaking, merge actions must be executed only on ephemeral runners (see definition below) using a specialized deployment environment.
+
+To speed up the CI system, build steps of the workflow rely on the distributed caching mechanism backed by [sccache](https://github.com/mozilla/sccache), making them susceptible to cache corruption compromises. For that reason, binary artifacts generated during CI should not be executed in an environment that has access to any sensitive/non-public information and should not be published for use by a general audience. One should not have any expectation about the lifetime of those artifacts, although in practice they likely remain accessible for about two weeks after the PR has been closed.
+
+To speed up CI system setup, PyTorch relies heavily on Docker to pre-build and pre-install the dependencies. To prevent a potentially malicious PR from altering ones that were published in the past, ECR has been configured to use immutable tags.
+
+To improve runner availability and more efficient resource utilization, some of the CI runners are non-ephemeral, i.e., workflow steps from completely unrelated PRs could be scheduled sequentially on the same runner, making them susceptible to reverse shell attacks. For that reason, PyTorch does not rely on the repository secrets mechanism, as these can easily be compromised in such attacks.
+
+### Release pipelines security
+
+To ensure safe binary releases, PyTorch release pipelines are built on the following principles:
+ - All binary builds/upload jobs must be run on ephemeral runners, i.e., on a machine that is allocated from the cloud to do the build and released back to the cloud after the build is finished. This protects those builds from interference from external actors, who potentially can get reverse shell access to a non-ephemeral runner and wait there for a binary build.
+ - All binary builds are cold-start builds, i.e., distributed caching/incremental builds are not permitted. This renders builds much slower than incremental CI builds but isolates them from potential compromises of the intermediate artifacts caching systems.
+ - All upload jobs are executed in [deployment environments](https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment) that are restricted to protected branches.
+ - Security credentials needed to upload binaries to PyPI/conda or stable indexes `download.pytorch.org/whl` are never uploaded to repo secrets storage/environment. This requires an extra manual step to publish the release but ensures that access to those would not be compromised by deliberate/accidental leaks of secrets stored in the cloud.
+ - No binary artifacts should be published to GitHub releases pages, as these are overwritable by anyone with write permission to the repo.
--- a/6
+++ b/6
@ -174,6 +174,12 @@ new_local_repository(
    path = "third_party/cpp-httplib",
 )

+new_local_repository(
+    name = "nlohmann",
+    build_file = "//third_party:nlohmann.BUILD",
+    path = "third_party/nlohmann",
+)
+
 new_local_repository(
    name = "tensorpipe",
    build_file = "//third_party:tensorpipe.BUILD",
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@ -53,11 +53,6 @@ if(NOT BUILD_LITE_INTERPRETER)
  file(GLOB_RECURSE ATen_CORE_TEST_SRCS "core/*_test.cpp")
 endif()
 EXCLUDE(ATen_CORE_SRCS "${ATen_CORE_SRCS}" ${ATen_CORE_TEST_SRCS})
-# Exclude TensorImpl_test.cpp if compiling without Caffe2
-if(NOT BUILD_LITE_INTERPRETER)
-  file(GLOB_RECURSE ATen_CORE_EXCLUDED_TEST_SRCS "core/TensorImpl_test.cpp")
-  EXCLUDE(ATen_CORE_TEST_SRCS "${ATen_CORE_TEST_SRCS}" ${ATen_CORE_EXCLUDED_TEST_SRCS})
-endif()

 file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/vec256/vsx/*.h" "cpu/vec/vec256/zarch/*.h" "cpu/vec/*.h" "quantized/*.h" "functorch/*.h")
 file(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp" "functorch/*.cpp")
@ -473,6 +468,7 @@ endif()

 if(USE_CUDA AND NOT USE_ROCM)
  list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include)
+  list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include)
  if($ENV{ATEN_STATIC_CUDA})
    list(APPEND ATen_CUDA_DEPENDENCY_LIBS
      ${CUDA_LIBRARIES}
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@ -56,6 +56,14 @@ void Context::setDeterministicCuDNN(bool b) {
  deterministic_cudnn = b;
 }

+bool Context::deterministicMkldnn() const {
+  return deterministic_mkldnn;
+}
+
+void Context::setDeterministicMkldnn(bool b) {
+  deterministic_mkldnn = b;
+}
+
 bool Context::deterministicAlgorithms() const {
  return _deterministic_algorithms;
 }
@ -145,6 +153,13 @@ void Context::setSDPUseCuDNN(bool e) {
  enabled_cudnnSDP = e;
 }

+void Context::setSDPUseOverrideable(bool e) {
+  enabled_overrideable = e;
+}
+
+bool Context::userEnabledOverrideableSDP() const {
+  return enabled_overrideable;
+}

 // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
 static const char cublas_config_var_name[] = "CUBLAS_WORKSPACE_CONFIG";
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@ -65,6 +65,8 @@ class TORCH_API Context {
        : at::getAccelerator(true).value();
    if (device_type == at::kCUDA) {
      return at::detail::getCUDAHooks();
+    } else if (device_type == at::kXPU) {
+      return at::detail::getXPUHooks();
    } else if (device_type == at::kMPS) {
      return at::detail::getMPSHooks();
    } else if (device_type == at::kPrivateUse1) {
@ -188,6 +190,8 @@ class TORCH_API Context {
  void setBenchmarkLimitCuDNN(int);
  bool deterministicCuDNN() const;
  void setDeterministicCuDNN(bool);
+  bool deterministicMkldnn() const;
+  void setDeterministicMkldnn(bool);
  bool userEnabledNNPACK() const;
  void setUserEnabledNNPACK(bool e);

@ -214,6 +218,9 @@ class TORCH_API Context {
  void setSDPUseCuDNN(bool);
  bool userEnabledCuDNNSDP() const;

+  void setSDPUseOverrideable(bool);
+  bool userEnabledOverrideableSDP() const;
+
  at::LinalgBackend linalgPreferredBackend() const;
  void setLinalgPreferredBackend(at::LinalgBackend);

@ -358,6 +365,7 @@ class TORCH_API Context {
  c10::once_flag thp_init;
  bool enabled_cudnn = true;
  bool deterministic_cudnn = false;
+  bool deterministic_mkldnn = false;
  bool _deterministic_algorithms = false;
  bool _deterministic_algorithms_warn_only = false;
  bool _deterministic_fill_uninitialized_memory = true;
@ -365,6 +373,7 @@ class TORCH_API Context {
  bool enabled_mem_efficientSDP = true;
  bool enabled_mathSDP = true;
  bool enabled_cudnnSDP = true;
+  bool enabled_overrideable = true;
 #ifdef USE_ROCM
  bool benchmark_cudnn = true;
 #else
--- a/aten/src/ATen/DeviceAccelerator.cpp
+++ b/aten/src/ATen/DeviceAccelerator.cpp
@ -1,39 +1,37 @@
-#include <ATen/DeviceAccelerator.h>
 #include <ATen/Context.h>
-
+#include <ATen/DeviceAccelerator.h>
 namespace at {

 C10_API std::optional<DeviceType> getAccelerator(bool checked) {
-#define CHECK_NO_CUDA \
-  TORCH_CHECK(!at::hasCUDA(), "Cannot have both CUDA and PrivateUse1");
+#define DETECT_AND_ASSIGN_ACCELERATOR(device_name) \
+  if (at::has##device_name()) {                    \
+    device_type = k##device_name;                  \
+    TORCH_CHECK(                                   \
+        !is_accelerator_detected,                  \
+        "Cannot have ",                            \
+        device_type.value(),                       \
+        " with other accelerators.");              \
+    is_accelerator_detected = true;                \
+  }

-#define CHECK_NO_PU1 \
-  TORCH_CHECK(!is_privateuse1_backend_registered(), "Cannot have both CUDA and PrivateUse1");
+  if (is_privateuse1_backend_registered()) {
+    // We explicitly allow PrivateUse1 and another device at the same time as we
+    // use this for testing. Whenever a PrivateUse1 device is registered, use it
+    // first.
+    return kPrivateUse1;
+  }
+  std::optional<DeviceType> device_type = std::nullopt;
+  bool is_accelerator_detected = false;
+  DETECT_AND_ASSIGN_ACCELERATOR(CUDA)
+  DETECT_AND_ASSIGN_ACCELERATOR(MTIA)
+  DETECT_AND_ASSIGN_ACCELERATOR(XPU)
+  if (checked) {
+    TORCH_CHECK(
+        device_type, "Cannot access accelerator device when none is available.")
+  }
+  return device_type;

-#define CHECK_NO_MTIA \
-  TORCH_CHECK(!at::hasMTIA(), "Cannot have MTIA with other devices");
-
-    if (is_privateuse1_backend_registered()) {
-        // We explicitly allow PrivateUse1 and another device at the same time
-        // as we use this for testing.
-        // Whenever a PrivateUse1 device is registered, use it first.
-        return kPrivateUse1;
-    } else if (at::hasCUDA()) {
-        CHECK_NO_PU1
-        CHECK_NO_MTIA
-        return kCUDA;
-    } else if (at::hasMTIA()) {
-        CHECK_NO_CUDA
-        CHECK_NO_PU1
-        return kMTIA;
-    } else {
-        TORCH_CHECK(!checked, "Cannot access accelerator device when none is available.")
-        return std::nullopt;
-    }
-
-#undef CHECK_NO_CUDA
-#undef CHECK_NO_PU1
+#undef DETECT_AND_ASSIGN_ACCELERATOR
 }

-
 } // namespace at
--- a/aten/src/ATen/DeviceAccelerator.h
+++ b/aten/src/ATen/DeviceAccelerator.h
@ -13,9 +13,9 @@
 // - It provides a set of common APIs as defined by AcceleratorHooksInterface
 //
 // As of today, accelerator devices are (in no particular order):
-// CUDA, MTIA, PrivateUse1
+// CUDA, MTIA, XPU, PrivateUse1
 // We want to add once all the proper APIs are supported and tested:
-// HIP, MPS, XPU
+// HIP, MPS

 namespace at {

--- a/aten/src/ATen/EmptyTensor.cpp
+++ b/aten/src/ATen/EmptyTensor.cpp
@ -29,6 +29,7 @@ c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) {
  return c10::GetCPUAllocator();
 }

+#ifndef C10_MOBILE
 constexpr uint64_t storage_max() {
  // int64_t and size_t are used somewhat inconsistently throughout ATen.
  // To be safe, storage size calculations must fit in both types.
@ -38,6 +39,7 @@ constexpr uint64_t storage_max() {
      std::numeric_limits<size_t>::max());
  return std::min(int64_max, size_max);
 }
+#endif

 inline void raise_warning_for_complex_half(ScalarType dtype) {
  if (dtype == kComplexHalf) {
--- a/aten/src/ATen/ExpandUtils.h
+++ b/aten/src/ATen/ExpandUtils.h
@ -462,7 +462,7 @@ inline Tensor _sum_to(
    reduce_dims.push_back(i);
  }
  for (int64_t i = leading_dims; i < static_cast<int64_t>(sizes.size()); ++i) {
-    if (shape[i - leading_dims] == 1 &&
+    if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(shape[i - leading_dims], 1)) &&
        TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(sizes[i], 1))) {
      reduce_dims.push_back(i);
    }
--- a/aten/src/ATen/FunctionalInverses.cpp
+++ b/aten/src/ATen/FunctionalInverses.cpp
@ -303,7 +303,7 @@ Tensor FunctionalInverses::_nested_view_from_buffer_inverse(const Tensor& base,
    return Tensor();
 }

-Tensor FunctionalInverses::_nested_view_from_jagged_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, const Tensor& offsets, const Tensor& dummy, const std::optional<Tensor>& lengths, int64_t ragged_idx) {
+Tensor FunctionalInverses::_nested_view_from_jagged_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, const Tensor& offsets, const Tensor& dummy, const std::optional<Tensor>& lengths, int64_t ragged_idx, const c10::optional<Tensor>& min_seqlen, const c10::optional<Tensor>& max_seqlen) {
  auto values = at::_nested_get_values(mutated_view);
  if (inverse_return_mode != InverseReturnMode::NeverView) {
    return values;
@ -317,7 +317,12 @@ Tensor FunctionalInverses::_nested_get_values_inverse(const Tensor& base, const
  auto lengths = at::_nested_get_lengths(base);
  auto ragged_idx = at::_nested_get_ragged_idx(base);
  auto dummy = at::_nested_get_jagged_dummy(base);
-  auto nt = at::_nested_view_from_jagged(mutated_view, offsets, dummy, lengths, ragged_idx);
+  auto min_seqlen = at::_nested_get_min_seqlen(base);
+  auto max_seqlen = at::_nested_get_max_seqlen(base);
+  auto nt = at::_nested_view_from_jagged(
+      mutated_view, offsets, dummy, lengths, ragged_idx,
+      (min_seqlen.defined() ? c10::optional<Tensor>(min_seqlen) : c10::nullopt),
+      (max_seqlen.defined() ? c10::optional<Tensor>(max_seqlen) : c10::nullopt));

  if (inverse_return_mode != InverseReturnMode::NeverView) {
    return nt;
--- a/aten/src/ATen/FunctionalTensorWrapper.cpp
+++ b/aten/src/ATen/FunctionalTensorWrapper.cpp
@ -514,6 +514,9 @@ c10::SymInt FunctionalTensorWrapper::sym_size_custom(int64_t d) const {
 c10::SymInt FunctionalTensorWrapper::sym_storage_offset_custom() const {
  return value_.unsafeGetTensorImpl()->sym_storage_offset();
 }
+c10::Layout FunctionalTensorWrapper::layout_impl() const {
+  return value_.unsafeGetTensorImpl()->layout();
+}

 namespace functionalization {
 namespace impl {
--- a/aten/src/ATen/FunctionalTensorWrapper.h
+++ b/aten/src/ATen/FunctionalTensorWrapper.h
@ -222,6 +222,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
  c10::SymIntArrayRef sym_strides_custom() const override;
  c10::SymInt sym_storage_offset_custom() const override;
  c10::Device device_custom() const override;
+  c10::Layout layout_impl() const override;

 private:
  const char* tensorimpl_type_name() const override;
--- a/aten/src/ATen/LegacyBatchedFallback.cpp
+++ b/aten/src/ATen/LegacyBatchedFallback.cpp
@ -139,7 +139,7 @@ static void batchedTensorInplaceForLoopFallback(const c10::OperatorHandle& op, t
    if (self_vmap_levels != (self_vmap_levels | other_vmap_levels)) {
      // Find one vmap level to complain about
      auto additional_bdims = (self_vmap_levels | other_vmap_levels) ^ self_vmap_levels;
-      auto offending_level = llvm::findLastSet(additional_bdims.to_ulong());
+      [[maybe_unused]] auto offending_level = llvm::findLastSet(additional_bdims.to_ulong());
      // The following prints out "vmap: aten::add_(tensor, ...) is not possible",
      // but it would be better to print out "tensor.add_(...) is not possible".
      // Afaict there's no official way to get the add_ and there is no way to
--- a/aten/src/ATen/MapAllocator.h
+++ b/aten/src/ATen/MapAllocator.h
@ -55,6 +55,10 @@ class TORCH_API MapAllocator {
    return base_ptr_;
  }

+  int flags() const {
+    return flags_;
+  }
+
  static MapAllocator* fromDataPtr(const at::DataPtr&);
  static at::DataPtr makeDataPtr(
      c10::string_view filename,
--- a/aten/src/ATen/NamedTensorUtils.cpp
+++ b/aten/src/ATen/NamedTensorUtils.cpp
@ -8,12 +8,14 @@

 namespace at {

+#ifndef STRIP_ERROR_MESSAGES
 // Returns "Tensor['N', 'C', 'H', 'W']" for a tensor with names ('N', 'C', 'H', 'W').
 static std::string toDimnameRepr(const Tensor& tensor) {
  std::ostringstream os;
  os << "Tensor" << tensor.names();
  return os.str();
 }
+#endif

 int64_t dimname_to_position(const Tensor& tensor, Dimname dim) {
  TORCH_CHECK(dim.type() != NameType::WILDCARD,
--- a/aten/src/ATen/ParallelCommon.cpp
+++ b/aten/src/ATen/ParallelCommon.cpp
@ -29,6 +29,7 @@ const char* get_env_var(
  return value ? value : def_value;
 }

+#ifndef C10_MOBILE
 size_t get_env_num_threads(const char* var_name, size_t def_value = 0) {
  try {
    if (auto* value = std::getenv(var_name)) {
@ -43,6 +44,7 @@ size_t get_env_num_threads(const char* var_name, size_t def_value = 0) {
  }
  return def_value;
 }
+#endif

 } // namespace

--- a/aten/src/ATen/SavedTensorHooks.cpp
+++ b/aten/src/ATen/SavedTensorHooks.cpp
@ -35,6 +35,12 @@ void SavedTensorDefaultHooks::enable() {
  tls.disabled_error_message = c10::nullopt;
 }

+// Stores `is_tracing` into tls.is_tracing and returns the value it replaced,
+// so a caller can pass that value back later to restore the earlier state.
+/* static */ bool SavedTensorDefaultHooks::set_tracing(bool is_tracing) {
+  bool prior  = tls.is_tracing;
+  tls.is_tracing = is_tracing;
+  return prior;
+}
+
 const std::optional<std::string>& SavedTensorDefaultHooks::get_disabled_error_message() {
  return tls.disabled_error_message;
 }
@ -59,25 +65,20 @@ void SavedTensorDefaultHooks::push_hooks(PyObject* pack_hook, PyObject* unpack_h
  tls.stack.emplace(pack_hook, unpack_hook);
 }

-void SavedTensorDefaultHooks::pop_hooks() {
+// Removes the top (pack_hook, unpack_hook) pair from tls.stack and returns it.
+// Asserts that hooks are initialized and that the stack is non-empty.
+std::pair<PyObject*, PyObject*> SavedTensorDefaultHooks::pop_hooks() {
   // Reference counting is handled by the caller of `pop_hooks`
   TORCH_INTERNAL_ASSERT(is_initialized && !tls.stack.empty());
+  // Copy the top entry before popping so it can still be returned afterwards.
+  std::pair<PyObject*, PyObject*> hooks = tls.stack.top();
   tls.stack.pop();
+  return hooks;
 }

+// Returns the top hook pair without removing it, or (nullptr, nullptr) when
+// hooks are not initialized, the stack is empty, or tls.is_tracing is set.
 std::pair<PyObject*, PyObject*> SavedTensorDefaultHooks::get_hooks() {
-  if (!is_initialized || tls.stack.empty()) {
+  // For tls.is_tracing, see NOTE: [Deferring tensor pack/unpack hooks until runtime]
+  if (!is_initialized || tls.stack.empty() || tls.is_tracing) {
     return std::make_pair(nullptr, nullptr);
   }
   return tls.stack.top();
 }

-std::stack<std::pair<PyObject*, PyObject*>> SavedTensorDefaultHooks::get_stack() {
-  return tls.stack;
-}
-
-void SavedTensorDefaultHooks::set_stack(std::stack<std::pair<PyObject*, PyObject*>> stack_) {
-  tls.stack = std::move(stack_);
-}
-
 }
--- a/aten/src/ATen/SavedTensorHooks.h
+++ b/aten/src/ATen/SavedTensorHooks.h
@ -22,17 +22,18 @@ struct TORCH_API SavedTensorDefaultHooksTLS {
  // We did this for efficiency (so we didn't have to keep a separate bool
  // around)
  std::optional<std::string> disabled_error_message;
+
+  // See NOTE: [Deferring tensor pack/unpack hooks until runtime]
+  bool is_tracing = false;
 };

 } // namespace impl

 struct TORCH_API SavedTensorDefaultHooks {
  static void push_hooks(PyObject* pack_hook, PyObject* unpack_hook);
-  static void pop_hooks();
+  static std::pair<PyObject*, PyObject*> pop_hooks();
  static std::pair<PyObject*, PyObject*> get_hooks();
  static void lazy_initialize();
-  static std::stack<std::pair<PyObject*, PyObject*>> get_stack();
-  static void set_stack(std::stack<std::pair<PyObject*, PyObject*>>);

  static const impl::SavedTensorDefaultHooksTLS& get_tls_state();
  static void set_tls_state(const impl::SavedTensorDefaultHooksTLS& tls);
@ -42,11 +43,20 @@ struct TORCH_API SavedTensorDefaultHooks {
  // hooks, especially if their feature does not work with it. If they are
  // disabled, then the following will raise an error:
  // - Attempting to push_hooks
-  // - calling disable(message) with a non-zero stack (from get_stack) size
+  // - calling disable(message) with a non-zero stack (hooks) size
  static void disable(const std::string& error_message);
  static void enable();
  static bool is_enabled();
  static const std::optional<std::string>& get_disabled_error_message();
+
+  // NOTE: [Deferring tensor pack/unpack hooks until runtime]
+  // To preserve eager semantics of pack/unpack hooks firing only once per saved
+  // variable, Dynamo/AOTAutograd need to defer hook firing until runtime. Using
+  // disable() would raise a loud error at trace time, and pushing a no-op hook would
+  // fail when the traced code is wrapped in a disable_saved_tensors_hooks ctx.
+  // To do so, we disable these hooks during tracing. See
+  // https://github.com/pytorch/pytorch/issues/113263.
+  static bool set_tracing(bool is_tracing);
 };

 } // namespace at
--- a/aten/src/ATen/SparseTensorImpl.h
+++ b/aten/src/ATen/SparseTensorImpl.h
@ -140,7 +140,7 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
        "), but got ",
        size.size());
    if (nnz() > 0) {
-      auto alt_options_msg =
+      [[maybe_unused]] auto constexpr alt_options_msg =
          "You could try the following options:\n\
 1. If you need an empty sparse tensor of this size, call `x = torch.sparse_coo_tensor(size)`.\n\
 2. If you need to resize this tensor, you have the following options:\n\
--- a/aten/src/ATen/core/List.h
+++ b/aten/src/ATen/core/List.h
@ -478,8 +478,6 @@ namespace impl {
 // (maybe except for some internal prim ops).
 using GenericList = List<IValue>;

-const IValue* ptr_to_first_element(const GenericList& list);
-
 }
 }

--- a/aten/src/ATen/core/List_inl.h
+++ b/aten/src/ATen/core/List_inl.h
@ -350,11 +350,4 @@ void List<T>::unsafeSetElementType(TypePtr t) {
  impl_->elementType = std::move(t);
 }

-namespace impl {
-
-inline const IValue* ptr_to_first_element(const GenericList& list) {
-  return &list.impl_->list[0];
-}
-
-}
 }
--- a/aten/src/ATen/core/MetaFallbackKernel.cpp
+++ b/aten/src/ATen/core/MetaFallbackKernel.cpp
@ -17,7 +17,7 @@ static void metaFallback(
      "while using an operator with PT2 compilation APIs (torch.compile/torch.export); "
      "in order to use this operator with those APIs you'll need to add a fake impl. "
      "Please see the following for next steps:  "
-      "https://pytorch.org/docs/main/notes/custom_operators.html");
+      "https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html");
 }

 TORCH_LIBRARY_IMPL(_, Meta, m) {
--- a/aten/src/ATen/core/TensorImpl_test.cpp
+++ b/aten/src/ATen/core/TensorImpl_test.cpp
@ -1,7 +0,0 @@
-#include <caffe2/core/tensor.h>
-#include <gtest/gtest.h>
-
-TEST(TensorImplTest, Caffe2Constructor) {
-  caffe2::Tensor tensor(caffe2::CPU);
-  ASSERT_EQ(tensor.strides()[0], 1);
-}
--- a/aten/src/ATen/core/VariableFallbackKernel.cpp
+++ b/aten/src/ATen/core/VariableFallbackKernel.cpp
@ -24,15 +24,6 @@ using c10::Stack;

 namespace {

-// Register fallthrough for Autograd backends dispatch keys
-// NB: But not the private use ones; maybe the extension wants
-// to override it themselves!
-
-void autograd_fallback(
-    const c10::OperatorHandle& op,
-    c10::DispatchKeySet dispatch_keys,
-    torch::jit::Stack* stack);
-
 #ifdef C10_MOBILE
 // NOTE [mobile/edge builds and the autograd fallback]
 // To save on binary size, some of the mobile configs don't include the
@ -47,6 +38,25 @@ void autograd_fallback(
 // As a result, on mobile we set the fallback to the fallthrough.
 #define AUTOGRAD_FALLBACK torch::CppFunction::makeFallthrough()
 #else
+
+// Register fallthrough for Autograd backends dispatch keys
+// NB: But not the private use ones; maybe the extension wants
+// to override it themselves!
+// Boxed kernel that AUTOGRAD_FALLBACK wraps when C10_MOBILE is not defined.
+// It is defined before the macro because the macro takes its address.
+void autograd_fallback(
+    const c10::OperatorHandle& op,
+    c10::DispatchKeySet dispatch_keys,
+    torch::jit::Stack* stack) {
+  // PyTorch has separate builds, some of which don't include autograd.
+  // So we define some behavior for when autograd isn't included and
+  // go through a layer of indirection (VariableHooksInterface) when it is.
+  // See aten/src/ATen/core/VariableHooksInterface.h for more details.
+  if (!at::impl::HasVariableHooks()) {
+    // No hooks registered: redispatch with the key set masked by
+    // c10::after_autograd_keyset.
+    op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack);
+    return;
+  }
+  at::impl::GetVariableHooks()->basic_autograd_not_implemented_fallback(op, dispatch_keys, stack);
+}
+
 #define AUTOGRAD_FALLBACK torch::CppFunction::makeFromBoxedFunction<&autograd_fallback>()
 #endif

@ -93,19 +103,4 @@ TORCH_LIBRARY_IMPL(_, AutogradHPU, m) {

 #undef AUTOGRAD_FALLBACK

-void autograd_fallback(
-    const c10::OperatorHandle& op,
-    c10::DispatchKeySet dispatch_keys,
-    torch::jit::Stack* stack) {
-  // PyTorch has separate builds, some of which don't include autograd.
-  // So we define some behavior for when autograd isn't included and
-  // go through a layer of indirection (VariableHooksInterface) when it is.
-  // See aten/src/ATen/core/VariableHooksInterface.h for more details.
-  if (!at::impl::HasVariableHooks()) {
-    op.redispatchBoxed(dispatch_keys & c10::after_autograd_keyset, stack);
-    return;
-  }
-  at::impl::GetVariableHooks()->basic_autograd_not_implemented_fallback(op, dispatch_keys, stack);
-}
-
 } // namespace
--- a/aten/src/ATen/core/ivalue.cpp
+++ b/aten/src/ATen/core/ivalue.cpp
@ -1178,6 +1178,7 @@ TORCH_API intrusive_ptr<ivalue::Future> collectAll(

 namespace {

+#ifndef STRIP_ERROR_MESSAGES
 std::string formatSetOfDevices(const std::vector<c10::Device>& devices) {
  std::ostringstream oss;
  std::copy(
@ -1186,6 +1187,7 @@ std::string formatSetOfDevices(const std::vector<c10::Device>& devices) {
      std::ostream_iterator<c10::Device>(oss, ", "));
  return oss.str();
 }
+#endif

 }

--- a/aten/src/ATen/core/library.cpp
+++ b/aten/src/ATen/core/library.cpp
@ -27,6 +27,7 @@ namespace {
 #endif
  }

+#ifndef STRIP_ERROR_MESSAGES
  const char* toString(Library::Kind kind) {
    switch (kind) {
      case Library::DEF:
@ -38,6 +39,7 @@ namespace {
    }
    return "(unknown)";
  }
+#endif

  constexpr auto CatchAll = c10::DispatchKey::CatchAll;
 } // anonymous namespace
--- a/aten/src/ATen/cpu/Utils.cpp
+++ b/aten/src/ATen/cpu/Utils.cpp
@ -2,6 +2,10 @@
 #if !defined(__s390x__ ) && !defined(__powerpc__)
 #include <cpuinfo.h>
 #endif
+#if defined(__linux__)
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif

 namespace at::cpu {
 bool is_cpu_support_avx2() {
@ -20,7 +24,7 @@ bool is_cpu_support_avx512() {
 #endif
 }

-bool is_cpu_support_vnni() {
+bool is_cpu_support_avx512_vnni() {
 #if !defined(__s390x__) && !defined(__powerpc__)
  return cpuinfo_initialize() && cpuinfo_has_x86_avx512vnni();
 #else
@ -28,4 +32,47 @@ bool is_cpu_support_vnni() {
 #endif
 }

+// Returns true when cpuinfo initializes successfully and reports x86 AMX tile
+// support. Always returns false when built for s390x or powerpc, where the
+// cpuinfo header is not included.
+bool is_cpu_support_amx_tile() {
+#if !defined(__s390x__) && !defined(__powerpc__)
+  return cpuinfo_initialize() && cpuinfo_has_x86_amx_tile();
+#else
+  return false;
+#endif
+}
+
+// Tries to make AMX usable for the calling process.
+// Returns false when is_cpu_support_amx_tile() is false. On x86-64 Linux
+// (excluding Android) it requests XTILEDATA permission through arch_prctl and
+// then reads the permission bitmask back, returning false if either syscall
+// fails. On every other platform it returns true without further checks.
+bool init_amx() {
+  if (!is_cpu_support_amx_tile()) {
+    return false;
+  }
+
+#if defined(__linux__) && !defined(__ANDROID__) && defined(__x86_64__)
+// Feature bit positions and masks used with the arch_prctl requests below.
+// NOTE(review): these macros are defined inside the function body and are not
+// #undef'd here, so they stay defined for whatever follows in this file.
+#define XFEATURE_XTILECFG 17
+#define XFEATURE_XTILEDATA 18
+#define XFEATURE_MASK_XTILECFG (1 << XFEATURE_XTILECFG)
+#define XFEATURE_MASK_XTILEDATA (1 << XFEATURE_XTILEDATA)
+#define XFEATURE_MASK_XTILE (XFEATURE_MASK_XTILECFG | XFEATURE_MASK_XTILEDATA)
+
+// arch_prctl request codes passed to syscall() below.
+#define ARCH_GET_XCOMP_PERM 0x1022
+#define ARCH_REQ_XCOMP_PERM 0x1023
+
+  unsigned long bitmask = 0;
+  // Request permission to use AMX instructions
+  // The request passes the feature number (XFEATURE_XTILEDATA), not a mask.
+  long rc = syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
+  if (rc) {
+      return false;
+  }
+  // Check if the system supports AMX instructions
+  rc = syscall(SYS_arch_prctl, ARCH_GET_XCOMP_PERM, &bitmask);
+  if (rc) {
+      return false;
+  }
+  // NOTE(review): this passes when either the XTILECFG or the XTILEDATA bit is
+  // set; confirm that requiring only one of the two bits is intended.
+  if (bitmask & XFEATURE_MASK_XTILE) {
+      return true;
+  }
+  return false;
+#else
+  return true;
+#endif
+}
+
 } // namespace at::cpu
--- a/aten/src/ATen/cpu/Utils.h
+++ b/aten/src/ATen/cpu/Utils.h
@ -8,6 +8,12 @@ TORCH_API bool is_cpu_support_avx2();
 TORCH_API bool is_cpu_support_avx512();

 // Detect if CPU support Vector Neural Network Instruction.
-TORCH_API bool is_cpu_support_vnni();
+TORCH_API bool is_cpu_support_avx512_vnni();
+
+// Detect if the CPU supports Advanced Matrix Extensions (AMX) tiles.
+TORCH_API bool is_cpu_support_amx_tile();
+
+// Enable the system to use AMX instructions. Returns true on success and
+// false if AMX tiles are unsupported or could not be enabled.
+TORCH_API bool init_amx();

 } // namespace at::cpu
--- a/Show More
+++ b/Show More