Remove pytorch quant docs since we are moving to torchao

Summary: att Test Plan: doc page generated from CI Reviewers: Subscribers: Tasks: Tags: ghstack-source-id: 14150e4e3a5d65f1a9eb5a4201fcd4bac09920f1 Pull Request resolved: https://github.com/pytorch/pytorch/pull/157766
Fix unbound local when an error occurs before pool is initialized (#156750 )
2025-10-21 21:49:24 +08:00 · 2025-07-09 15:31:30 -07:00 · 2025-07-08 00:28:21 +00:00 · 2025-07-07 17:13:21 -07:00 · 2025-07-08 00:11:52 +00:00 · 2025-07-08 00:11:52 +00:00
1460 changed files with 53750 additions and 14434 deletions
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -79,6 +79,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
    os.system(f"unzip {wheel_path} -d {folder}/tmp")
    libs_to_copy = [
        "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
+        "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so",
        "/usr/local/cuda/lib64/libcudnn.so.9",
        "/usr/local/cuda/lib64/libcublas.so.12",
        "/usr/local/cuda/lib64/libcublasLt.so.12",
@ -88,6 +89,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
        "/usr/local/cuda/lib64/libcusparseLt.so.0",
        "/usr/local/cuda/lib64/libcusolver.so.11",
        "/usr/local/cuda/lib64/libcurand.so.10",
+        "/usr/local/cuda/lib64/libnccl.so.2",
        "/usr/local/cuda/lib64/libnvJitLink.so.12",
        "/usr/local/cuda/lib64/libnvrtc.so.12",
        "/usr/local/cuda/lib64/libcudnn_adv.so.9",
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -275,17 +275,6 @@ case "$tag" in
    VISION=yes
    TRITON=yes
    ;;
-  pytorch-linux-jammy-py3-clang12-asan)
-    ANACONDA_PYTHON_VERSION=3.9
-    CLANG_VERSION=12
-    VISION=yes
-    TRITON=yes
-    ;;
-  pytorch-linux-jammy-py3-clang15-asan)
-    ANACONDA_PYTHON_VERSION=3.10
-    CLANG_VERSION=15
-    VISION=yes
-    ;;
  pytorch-linux-jammy-py3-clang18-asan)
    ANACONDA_PYTHON_VERSION=3.10
    CLANG_VERSION=18
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-c8757738a7418249896224430ce84888e8ecdd79
+ae848267bebc65c6181e8cc5e64a6357d2679260
--- a/.ci/docker/common/install_cpython.sh
+++ b/.ci/docker/common/install_cpython.sh
@ -3,11 +3,10 @@
 set -uex -o pipefail

 PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python
-PYTHON_DOWNLOAD_GITHUB_BRANCH=https://github.com/python/cpython/archive/refs/heads  # @lint-ignore
 GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py

 # Python versions to be installed in /opt/$VERSION_NO
-CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t"}
+CPYTHON_VERSIONS=${CPYTHON_VERSIONS:-"3.9.0 3.10.1 3.11.0 3.12.0 3.13.0 3.13.0t 3.14.0 3.14.0t"}

 function check_var {
    if [ -z "$1" ]; then
@ -24,9 +23,8 @@ function do_cpython_build {
    tar -xzf Python-$py_ver.tgz

    local additional_flags=""
-    if [ "$py_ver" == "3.13.0t" ]; then
+    if [[ "$py_ver" == *"t" ]]; then
        additional_flags=" --disable-gil"
-        mv cpython-3.13/ cpython-3.13t/
    fi

    pushd $py_folder
@ -76,24 +74,20 @@ function do_cpython_build {
 function build_cpython {
    local py_ver=$1
    check_var $py_ver
-    check_var $PYTHON_DOWNLOAD_URL
-    local py_ver_folder=$py_ver
+    local py_suffix=$py_ver
+    local py_folder=$py_ver

-    if [ "$py_ver" = "3.13.0t" ]; then
-        PY_VER_SHORT="3.13"
-        PYT_VER_SHORT="3.13t"
-        check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH
-        wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz
-        do_cpython_build $py_ver cpython-$PYT_VER_SHORT
-    elif [ "$py_ver" = "3.13.0" ]; then
-        PY_VER_SHORT="3.13"
-        check_var $PYTHON_DOWNLOAD_GITHUB_BRANCH
-        wget $PYTHON_DOWNLOAD_GITHUB_BRANCH/$PY_VER_SHORT.tar.gz -O Python-$py_ver.tgz
-        do_cpython_build $py_ver cpython-$PY_VER_SHORT
-    else
-        wget -q $PYTHON_DOWNLOAD_URL/$py_ver_folder/Python-$py_ver.tgz
-        do_cpython_build $py_ver Python-$py_ver
+    # Special handling for nogil
+    if [[ "${py_ver}" == *"t" ]]; then
+        py_suffix=${py_ver::-1}
+        py_folder=$py_suffix
    fi
+    # Only b3 is available now
+    if [ "$py_suffix" == "3.14.0" ]; then
+        py_suffix="3.14.0b3"
+    fi
+    wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz
+    do_cpython_build $py_ver Python-$py_suffix

    rm -f Python-$py_ver.tgz
 }
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -10,6 +10,8 @@ else
  arch_path='sbsa'
 fi

+NVSHMEM_VERSION=3.3.9
+
 function install_cuda {
  version=$1
  runfile=$2
@ -40,13 +42,52 @@ function install_cudnn {
  rm -rf tmp_cudnn
 }

+function install_nvshmem {
+  cuda_major_version=$1      # e.g. "12"
+  nvshmem_version=$2         # e.g. "3.3.9"
+
+  case "${arch_path}" in
+    sbsa)
+      dl_arch="aarch64"
+      ;;
+    x86_64)
+      dl_arch="x64"
+      ;;
+    *)
+      dl_arch="${arch}"
+      ;;
+  esac
+
+  tmpdir="tmp_nvshmem"
+  mkdir -p "${tmpdir}" && cd "${tmpdir}"
+
+  # nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
+  filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
+  url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"
+
+  # download, unpack, install
+  wget -q "${url}"
+  tar xf "${filename}.tar.gz"
+  cp -a "libnvshmem/include/"* /usr/local/include/
+  cp -a "libnvshmem/lib/"*     /usr/local/lib/
+
+  # cleanup
+  cd ..
+  rm -rf "${tmpdir}"
+
+  echo "nvSHMEM ${nvshmem_version} for CUDA ${cuda_major_version} (${arch_path}) installed."
+}
+
+
 function install_126 {
  CUDNN_VERSION=9.10.2.21
-  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
+  echo "Installing CUDA 12.6.3 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  install_cuda 12.6.3 cuda_12.6.3_560.35.05_linux

  install_cudnn 12 $CUDNN_VERSION

+  install_nvshmem 12 $NVSHMEM_VERSION
+
  CUDA_VERSION=12.6 bash install_nccl.sh

  CUDA_VERSION=12.6 bash install_cusparselt.sh
@ -56,13 +97,15 @@ function install_126 {

 function install_129 {
  CUDNN_VERSION=9.10.2.21
-  echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
+  echo "Installing CUDA 12.9.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  # install CUDA 12.9.1 in the same container
  install_cuda 12.9.1 cuda_12.9.1_575.57.08_linux

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  install_cudnn 12 $CUDNN_VERSION

+  install_nvshmem 12 $NVSHMEM_VERSION
+
  CUDA_VERSION=12.9 bash install_nccl.sh

  CUDA_VERSION=12.9 bash install_cusparselt.sh
@ -106,13 +149,15 @@ function prune_126 {

 function install_128 {
  CUDNN_VERSION=9.8.0.87
-  echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NCCL and cuSparseLt-0.7.1"
+  echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  # install CUDA 12.8.1 in the same container
  install_cuda 12.8.1 cuda_12.8.1_570.124.06_linux

  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  install_cudnn 12 $CUDNN_VERSION

+  install_nvshmem 12 $NVSHMEM_VERSION
+
  CUDA_VERSION=12.8 bash install_nccl.sh

  CUDA_VERSION=12.8 bash install_cusparselt.sh
--- a/.ci/docker/common/install_onnx.sh
+++ b/.ci/docker/common/install_onnx.sh
@ -20,7 +20,7 @@ pip_install \

 pip_install coloredlogs packaging
 pip_install onnxruntime==1.18.1
-pip_install onnxscript==0.3.0
+pip_install onnxscript==0.3.1

 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
 # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/
--- a/.ci/docker/common/install_rocm_magma.sh
+++ b/.ci/docker/common/install_rocm_magma.sh
@ -5,7 +5,12 @@ set -eou pipefail

 function do_install() {
    rocm_version=$1
-    rocm_version_nodot=${1//./}
+    if [[ ${rocm_version} =~ ^[0-9]+\.[0-9]+\.[0-9]+$ ]]; then
+        # chop off any patch version
+        rocm_version="${rocm_version%.*}"
+    fi
+
+    rocm_version_nodot=${rocm_version//./}

    # Version 2.7.2 + ROCm related updates
    MAGMA_VERSION=a1625ff4d9bc362906bd01f805dbbe12612953f6
--- a/.ci/docker/common/install_triton.sh
+++ b/.ci/docker/common/install_triton.sh
@ -98,6 +98,10 @@ fi
 if [ -n "${NUMPY_VERSION}" ]; then
  pip_install "numpy==${NUMPY_VERSION}"
 fi
+
+# IMPORTANT: helion needs to be installed without dependencies.
+# It depends on torch and triton. We don't want to install
+# triton and torch from production on Docker CI images
 if [[ "$ANACONDA_PYTHON_VERSION" != 3.9* ]]; then
-  pip_install helion
+  pip_install helion --no-deps
 fi
--- a/.ci/docker/libtorch/build.sh
+++ b/.ci/docker/libtorch/build.sh
@ -39,6 +39,10 @@ case ${DOCKER_TAG_PREFIX} in
        DOCKER_GPU_BUILD_ARG=""
        ;;
    rocm*)
+        # we want the patch version of 6.4 instead
+        if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
+            GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1"
+        fi
        BASE_TARGET=rocm
        GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete
        PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
--- a/.ci/docker/manywheel/Dockerfile_s390x
+++ b/.ci/docker/manywheel/Dockerfile_s390x
@ -131,6 +131,8 @@ RUN pip3 install flatbuffers && \
  git clone https://github.com/microsoft/onnxruntime && \
  cd onnxruntime && git checkout v1.21.0 && \
  git submodule update --init --recursive && \
+  wget https://github.com/microsoft/onnxruntime/commit/f57db79743c4d1a3553aa05cf95bcd10966030e6.patch && \
+  patch -p1 < f57db79743c4d1a3553aa05cf95bcd10966030e6.patch && \
  ./build.sh --config Release --parallel 0 --enable_pybind \
  --build_wheel --enable_training --enable_training_apis \
  --enable_training_ops --skip_tests --allow_running_as_root \
--- a/.ci/docker/manywheel/build.sh
+++ b/.ci/docker/manywheel/build.sh
@ -75,6 +75,10 @@ case ${image} in
        DOCKERFILE_SUFFIX="_cuda_aarch64"
        ;;
    manylinux2_28-builder:rocm*)
+        # we want the patch version of 6.4 instead
+        if [[ $(ver $GPU_ARCH_VERSION) -eq $(ver 6.4) ]]; then
+            GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.1"
+        fi
        TARGET=rocm_final
        MANY_LINUX_VERSION="2_28"
        DEVTOOLSET_VERSION="11"
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -339,7 +339,7 @@ onnx==1.18.0
 #Pinned versions:
 #test that import:

-onnxscript==0.2.6
+onnxscript==0.3.1
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
@ -383,6 +383,6 @@ cmake==4.0.0
 tlparse==0.3.30
 #Description: required for log parsing

-cuda-bindings>=12.0,<13.0
+cuda-bindings>=12.0,<13.0 ; platform_machine != "s390x"
 #Description: required for testing CUDAGraph::raw_cuda_graph(). See https://nvidia.github.io/cuda-python/cuda-bindings/latest/support.html for how this version was chosen. Note "Any fix in the latest bindings would be backported to the prior major version" means that only the newest version of cuda-bindings will get fixes. Depending on the latest version of 12.x is okay because all 12.y versions will be supported via "CUDA minor version compatibility". Pytorch builds against 13.z versions of cuda toolkit work with 12.x versions of cuda-bindings as well because newer drivers work with old toolkits.
 #test that import: test_cuda.py
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -19,9 +19,10 @@ sphinx_sitemap==2.6.0
 #Description: This is used to generate sitemap for PyTorch docs
 #Pinned versions: 2.6.0

-matplotlib==3.5.3
+matplotlib==3.5.3 ; python_version < "3.13"
+matplotlib==3.6.3 ; python_version >= "3.13"
 #Description: This is used to generate PyTorch docs
-#Pinned versions: 3.5.3
+#Pinned versions: 3.6.3 if python > 3.12. Otherwise 3.5.3.

 tensorboard==2.13.0 ; python_version < "3.13"
 tensorboard==2.18.0 ; python_version >= "3.13"
--- a/.ci/docker/triton_version.txt
+++ b/.ci/docker/triton_version.txt
@ -1 +1 @@
-3.3.1
+3.4.0
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -51,20 +51,22 @@ else
 fi

 cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
+EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")

-TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
 case ${CUDA_VERSION} in
-    12.8|12.9)
-        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX" #removing sm_50-sm_70 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
-        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
+    #removing sm_50-sm_70 as these architectures are deprecated in CUDA 12.8/9 and will be removed in future releases
+    12.8)
+        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0"
+        ;;
+    12.9)
+        TORCH_CUDA_ARCH_LIST="7.5;8.0;8.6;9.0;10.0;12.0+PTX"
        # WAR to resolve the ld error in libtorch build with CUDA 12.9
-        if [[ "$DESIRED_CUDA" == "cu129" && "$PACKAGE_TYPE" == "libtorch" ]]; then
+        if [[ "$PACKAGE_TYPE" == "libtorch" ]]; then
            TORCH_CUDA_ARCH_LIST="7.5;8.0;9.0;10.0;12.0+PTX"
        fi
        ;;
    12.6)
-        TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
-        EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
+        TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0"
        ;;
    *)
        echo "unknown cuda version $CUDA_VERSION"
@ -131,6 +133,8 @@ if [[ $CUDA_VERSION == 12* ]]; then
            "/usr/local/cuda/lib64/libnvrtc-builtins.so"
            "/usr/local/cuda/lib64/libcufile.so.0"
            "/usr/local/cuda/lib64/libcufile_rdma.so.1"
+            "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12"
+            "/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so"
        )
        DEPS_SONAME+=(
            "libcudnn_adv.so.9"
@ -149,7 +153,14 @@ if [[ $CUDA_VERSION == 12* ]]; then
            "libnvrtc-builtins.so"
            "libcufile.so.0"
            "libcufile_rdma.so.1"
+            "libcupti.so.12"
+            "libnvperf_host.so"
        )
+        # Add libnvToolsExt only if CUDA version is not 12.9
+        if [[ $CUDA_VERSION != 12.9* ]]; then
+            DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")
+            DEPS_SONAME+=("libnvToolsExt.so.1")
+        fi
    else
        echo "Using nvidia libs from pypi."
        CUDA_RPATHS=(
--- a/.ci/pytorch/build.sh
+++ b/.ci/pytorch/build.sh
@ -198,10 +198,8 @@ fi

 # We only build FlashAttention files for CUDA 8.0+, and they require large amounts of
 # memory to build and will OOM
-if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]] && [ -z "$MAX_JOBS_OVERRIDE" ]; then
-  echo "WARNING: FlashAttention files require large amounts of memory to build and will OOM"
-  echo "Setting MAX_JOBS=(nproc-2)/3 to reduce memory usage"
-  export MAX_JOBS="$(( $(nproc --ignore=2) / 3 ))"
+if [[ "$BUILD_ENVIRONMENT" == *cuda* ]] && [[ 1 -eq $(echo "${TORCH_CUDA_ARCH_LIST} >= 8.0" | bc) ]]; then
+  export BUILD_CUSTOM_STEP="ninja -C build flash_attention -j 2"
 fi

 if [[ "${BUILD_ENVIRONMENT}" == *clang* ]]; then
@ -395,10 +393,8 @@ else
    # This is an attempt to mitigate flaky libtorch build OOM error. By default, the build parallelization
    # is set to be the number of CPU minus 2. So, let's try a more conservative value here. A 4xlarge has
    # 16 CPUs
-    if [ -z "$MAX_JOBS_OVERRIDE" ]; then
-      MAX_JOBS=$(nproc --ignore=4)
-      export MAX_JOBS
-    fi
+    MAX_JOBS=$(nproc --ignore=4)
+    export MAX_JOBS

    # NB: Install outside of source directory (at the same level as the root
    # pytorch folder) so that it doesn't get cleaned away prior to docker push.
--- a/.ci/pytorch/common-build.sh
+++ b/.ci/pytorch/common-build.sh
@ -13,6 +13,13 @@ if [[ "$BUILD_ENVIRONMENT" != *win-* ]]; then
    fi

    if which sccache > /dev/null; then
+        # Clear SCCACHE_BUCKET and SCCACHE_REGION if they are empty, otherwise
+        # sccache will complain about invalid bucket configuration
+        if [[ -z "${SCCACHE_BUCKET:-}" ]]; then
+          unset SCCACHE_BUCKET
+          unset SCCACHE_REGION
+        fi
+
        # Save sccache logs to file
        sccache --stop-server > /dev/null  2>&1 || true
        rm -f ~/sccache_error.log || true
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -11,6 +11,8 @@ export TERM=vt100

 # shellcheck source=./common.sh
 source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
+# shellcheck source=./common-build.sh
+source "$(dirname "${BASH_SOURCE[0]}")/common-build.sh"

 # Do not change workspace permissions for ROCm and s390x CI jobs
 # as it can leave workspace with bad permissions for cancelled jobs
@ -163,8 +165,6 @@ elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then
  export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu"
  # setting PYTHON_TEST_EXTRA_OPTION
  export PYTHON_TEST_EXTRA_OPTION="--xpu"
-  # Disable sccache for xpu test due to flaky issue https://github.com/pytorch/pytorch/issues/143585
-  sudo rm -rf /opt/cache
 fi

 if [[ "$TEST_CONFIG" == *crossref* ]]; then
@ -327,9 +327,15 @@ test_h100_distributed() {
  time python test/run_test.py --include distributed/_composable/test_composability/test_pp_composability.py  $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  # This test requires multicast support
  time python test/run_test.py --include distributed/_composable/fsdp/test_fully_shard_comm.py -k TestFullyShardAllocFromPG $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  assert_git_not_dirty
+}
+
+test_h100_symm_mem() {
  # symmetric memory test
  time python test/run_test.py --include distributed/test_symmetric_memory.py  $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  time python test/run_test.py --include distributed/test_nvshmem.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --include distributed/test_nvshmem_triton.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+  time python test/run_test.py --include distributed/test_nccl.py $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
  assert_git_not_dirty
 }

@ -347,6 +353,7 @@ test_dynamo_wrapped_shard() {
    exit 1
  fi
  python tools/dynamo/verify_dynamo.py
+  python tools/dynamo/gb_id_mapping.py verify
  # PLEASE DO NOT ADD ADDITIONAL EXCLUDES HERE.
  # Instead, use @skipIfTorchDynamo on your tests.
  time python test/run_test.py --dynamo \
@ -361,6 +368,17 @@ test_dynamo_wrapped_shard() {
  assert_git_not_dirty
 }

+test_einops() {
+  pip install einops==0.6.1
+  time python test/run_test.py --einops --verbose --upload-artifacts-while-running
+  pip install einops==0.7.0
+  time python test/run_test.py --einops --verbose --upload-artifacts-while-running
+  pip install einops==0.8.1
+  time python test/run_test.py --einops --verbose --upload-artifacts-while-running
+  assert_git_not_dirty
+}
+
+
 test_inductor_distributed() {
  # Smuggle a few multi-gpu tests here so that we don't have to request another large node
  echo "Testing multi_gpu tests in test_torchinductor"
@ -418,14 +436,21 @@ test_inductor_aoti() {
    python3 tools/amd_build/build_amd.py
  fi
  if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
-    BUILD_AOT_INDUCTOR_TEST=1 TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop
+    BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python setup.py develop)
    # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
-    LD_LIBRARY_PATH=/opt/conda/envs/py_3.10/lib/:${TORCH_LIB_DIR}:$LD_LIBRARY_PATH
-    CPP_TESTS_DIR="${BUILD_BIN_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
+    TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
  else
-    BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop
-    CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
+    BUILD_COMMAND=(python setup.py develop)
+    TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
  fi
+
+  # aoti cmake custom command requires `torch` to be installed
+  # initialize the cmake build cache and install torch
+  /usr/bin/env "${BUILD_COMMAND[@]}"
+  # rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
+  /usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
+
+  /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference -dist=loadfile
 }

 test_inductor_cpp_wrapper_shard() {
@ -438,47 +463,26 @@ test_inductor_cpp_wrapper_shard() {
  TEST_REPORTS_DIR=$(pwd)/test/test-reports
  mkdir -p "$TEST_REPORTS_DIR"

-  if [[ "$1" -eq "2" ]]; then
-    # For now, manually put the opinfo tests in shard 2, and all other tests in
-    # shard 1.  Run all CPU tests, as well as specific GPU tests triggering past
-    # bugs, for now.
-    python test/run_test.py \
-      --include inductor/test_torchinductor_opinfo \
-      -k 'linalg or to_sparse or TestInductorOpInfoCPU' \
-      --verbose
-    exit
-  fi
-
  # Run certain inductor unit tests with cpp wrapper. In the end state, we
  # should be able to run all the inductor unit tests with cpp_wrapper.
+  #
+  # TODO: I'm pretty sure that "TestInductorOpInfoCPU" is not a valid filter,
+  # but change that in another PR to more accurately monitor the increased CI
+  # usage.
+  python test/run_test.py \
+    --include inductor/test_torchinductor_opinfo \
+    -k 'linalg or to_sparse or TestInductorOpInfoCPU' \
+    --shard "$1" "$NUM_TEST_SHARDS" \
+    --verbose
  python test/run_test.py \
    --include inductor/test_torchinductor inductor/test_max_autotune inductor/test_cpu_repro \
+    --shard "$1" "$NUM_TEST_SHARDS" \
+    --verbose
+  python test/run_test.py --inductor \
+    --include test_torch \
+    -k 'take' \
+    --shard "$1" "$NUM_TEST_SHARDS" \
    --verbose
-  python test/run_test.py --inductor --include test_torch -k 'take' --verbose
-
-  # Run inductor benchmark tests with cpp wrapper.
-  # Skip benchmark tests if it's in rerun-disabled-mode.
-  if [[ "${PYTORCH_TEST_RERUN_DISABLED_TESTS}" == "1" ]]; then
-    echo "skip dynamo benchmark tests for rerun-disabled-test"
-  else
-    echo "run dynamo benchmark tests with cpp wrapper"
-    python benchmarks/dynamo/timm_models.py --device cuda --accuracy --amp \
-    --training --inductor --disable-cudagraphs --only vit_base_patch16_224 \
-    --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv"
-    python benchmarks/dynamo/check_accuracy.py \
-      --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_training.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}inductor_timm_training.csv"
-
-    python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-      --bfloat16 --inference --inductor --only hf_T5 --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-    python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-      --bfloat16 --inference --inductor --only llama --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-    python benchmarks/dynamo/torchbench.py --device cuda --accuracy \
-      --bfloat16 --inference --inductor --only moco --output "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv"
-    python benchmarks/dynamo/check_accuracy.py \
-      --actual "$TEST_REPORTS_DIR/inductor_cpp_wrapper_inference.csv" \
-      --expected "benchmarks/dynamo/ci_expected_accuracy/${MAYBE_ROCM}inductor_torchbench_inference.csv"
-  fi
 }

 # "Global" flags for inductor benchmarking controlled by TEST_CONFIG
@ -599,8 +603,8 @@ test_perf_for_dashboard() {

  local device=cuda
  if [[ "${TEST_CONFIG}" == *cpu* ]]; then
-    if [[ "${TEST_CONFIG}" == *zen_cpu_x86* ]]; then
-      device=zen_cpu_x86
+    if [[ "${TEST_CONFIG}" == *cpu_x86_zen* ]]; then
+      device=cpu_x86_zen
    elif [[ "${TEST_CONFIG}" == *cpu_x86* ]]; then
      device=cpu_x86
    elif [[ "${TEST_CONFIG}" == *cpu_aarch64* ]]; then
@ -617,7 +621,11 @@ test_perf_for_dashboard() {

  for mode in "${modes[@]}"; do
    if [[ "$mode" == "inference" ]]; then
-      dtype=bfloat16
+      if [[ "$device" == "cpu_x86" ]]; then
+        dtype=amp
+      else
+        dtype=bfloat16
+      fi
    elif [[ "$mode" == "training" ]]; then
      dtype=amp
    fi
@ -629,6 +637,10 @@ test_perf_for_dashboard() {
        target_flag+=( --no-translation-validation)
      fi

+      if [[ "$DASHBOARD_TAG" == *freezing-true* ]]; then
+        target_flag+=( --freezing)
+      fi
+
      if [[ "$DASHBOARD_TAG" == *default-true* ]]; then
        $TASKSET python "benchmarks/dynamo/$suite.py" \
            "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" --disable-cudagraphs "$@" \
@ -1682,11 +1694,11 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
    PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id"
  fi
 elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
-  install_torchaudio cuda
  install_torchvision
-  checkout_install_torchbench hf_T5 llama moco
  PYTHONPATH=$(pwd)/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
-  test_inductor_aoti
+  if [[ "$SHARD_NUMBER" -eq "1" ]]; then
+    test_inductor_aoti
+  fi
 elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
  install_torchvision
  test_inductor_shard "${SHARD_NUMBER}"
@ -1695,6 +1707,8 @@ elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
      test_inductor_distributed
    fi
  fi
+elif [[ "${TEST_CONFIG}" == *einops* ]]; then
+  test_einops
 elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
  install_torchvision
  test_dynamo_wrapped_shard "${SHARD_NUMBER}"
@ -1744,6 +1758,8 @@ elif [[ "${TEST_CONFIG}" == smoke ]]; then
  test_python_smoke
 elif [[ "${TEST_CONFIG}" == h100_distributed ]]; then
  test_h100_distributed
+elif [[ "${TEST_CONFIG}" == "h100-symm-mem" ]]; then
+  test_h100_symm_mem
 else
  install_torchvision
  install_monkeytype
--- a/.ci/pytorch/test_example_code/CMakeLists.txt
+++ b/.ci/pytorch/test_example_code/CMakeLists.txt
@ -16,7 +16,7 @@ target_link_libraries(simple-torch-test CUDA::cudart CUDA::cufft CUDA::cusparse
 find_library(CUDNN_LIBRARY NAMES cudnn)
 target_link_libraries(simple-torch-test  ${CUDNN_LIBRARY} )
 if(MSVC)
-  file(GLOB TORCH_DLLS  "$ENV{CUDA_PATH}/bin/cudnn64_8.dll")
+  file(GLOB TORCH_DLLS  "$ENV{CUDA_PATH}/bin/cudnn64_8.dll" "$ENV{NVTOOLSEXT_PATH}/bin/x64/*.dll")
  message("dlls to copy "  ${TORCH_DLLS})
  add_custom_command(TARGET simple-torch-test
                     POST_BUILD
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -52,6 +52,9 @@ python -m pip install parameterized==0.8.1
 # Install pulp for testing ilps under torch\distributed\_tools
 python -m pip install pulp==2.9.0

+# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
+python -m pip install expecttest==0.3.0
+
 run_tests() {
    # Run nvidia-smi if available
    for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
--- a/.ci/pytorch/windows/cuda126.bat
+++ b/.ci/pytorch/windows/cuda126.bat
@ -18,6 +18,15 @@ REM Check for optional components
 set USE_CUDA=
 set CMAKE_GENERATOR=Visual Studio 15 2017 Win64

+IF "%NVTOOLSEXT_PATH%"=="" (
+    IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib"  (
+        set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
+    ) ELSE (
+        echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing
+        exit /b 1
+    )
+)
+
 IF "%CUDA_PATH_V126%"=="" (
    IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6\bin\nvcc.exe" (
        set "CUDA_PATH_V126=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.6"
--- a/.ci/pytorch/windows/cuda128.bat
+++ b/.ci/pytorch/windows/cuda128.bat
@ -18,6 +18,15 @@ REM Check for optional components
 set USE_CUDA=
 set CMAKE_GENERATOR=Visual Studio 15 2017 Win64

+IF "%NVTOOLSEXT_PATH%"=="" (
+    IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib"  (
+        set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
+    ) ELSE (
+        echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing
+        exit /b 1
+    )
+)
+
 IF "%CUDA_PATH_V128%"=="" (
    IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8\bin\nvcc.exe" (
        set "CUDA_PATH_V128=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.8"
--- a/.ci/pytorch/windows/cuda129.bat
+++ b/.ci/pytorch/windows/cuda129.bat
@ -18,6 +18,15 @@ REM Check for optional components
 set USE_CUDA=
 set CMAKE_GENERATOR=Visual Studio 15 2017 Win64

+IF "%NVTOOLSEXT_PATH%"=="" (
+    IF EXIST "C:\Program Files\NVIDIA Corporation\NvToolsExt\lib\x64\nvToolsExt64_1.lib"  (
+        set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt
+    ) ELSE (
+        echo NVTX ^(Visual Studio Extension ^for CUDA^) ^not installed, failing
+        exit /b 1
+    )
+)
+
 IF "%CUDA_PATH_V129%"=="" (
    IF EXIST "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9\bin\nvcc.exe" (
        set "CUDA_PATH_V129=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.9"
--- a/.ci/pytorch/windows/internal/copy.bat
+++ b/.ci/pytorch/windows/internal/copy.bat
@ -8,7 +8,9 @@ copy "%CUDA_PATH%\bin\cusolver*64_*.dll*" pytorch\torch\lib
 copy "%CUDA_PATH%\bin\cudnn*64_*.dll*" pytorch\torch\lib
 copy "%CUDA_PATH%\bin\nvrtc*64_*.dll*" pytorch\torch\lib
 copy "%CUDA_PATH%\extras\CUPTI\lib64\cupti64_*.dll*" pytorch\torch\lib
+copy "%CUDA_PATH%\extras\CUPTI\lib64\nvperf_host*.dll*" pytorch\torch\lib

+copy "C:\Program Files\NVIDIA Corporation\NvToolsExt\bin\x64\nvToolsExt64_1.dll*" pytorch\torch\lib
 copy "%PYTHON_LIB_PATH%\libiomp*5md.dll" pytorch\torch\lib

 :: Should be set in build_pytorch.bat
--- a/.ci/pytorch/windows/internal/cuda_install.bat
+++ b/.ci/pytorch/windows/internal/cuda_install.bat
@ -119,6 +119,11 @@ goto cuda_common
 :: If you cannot find the CUDA version you want to build for here then please
 :: add it @ https://github.com/pytorch/test-infra/tree/main/aws/ami/windows
 if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" (
+    if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" (
+        curl -k -L https://ossci-windows.s3.us-east-1.amazonaws.com/builder/NvToolsExt.7z --output "%SRC_DIR%\temp_build\NvToolsExt.7z"
+        if errorlevel 1 exit /b 1
+    )
+
    if not exist "%SRC_DIR%\temp_build\gpu_driver_dlls.zip" (
        curl -k -L "https://ossci-windows.s3.us-east-1.amazonaws.com/builder/additional_dlls.zip" --output "%SRC_DIR%\temp_build\gpu_driver_dlls.zip"
        if errorlevel 1 exit /b 1
@ -145,6 +150,15 @@ if not exist "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_
        xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\MSBuild\Microsoft\VC\v170\BuildCustomizations"
    )

+    echo Installing NvToolsExt...
+    7z x %SRC_DIR%\temp_build\NvToolsExt.7z -o"%SRC_DIR%\temp_build\NvToolsExt"
+    mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64"
+    mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include"
+    mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64"
+    xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\bin\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64"
+    xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\include\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include"
+    xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\lib\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64"
+
    echo Installing cuDNN...
    7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn"
    xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin"
@ -175,3 +189,4 @@ echo Setting up environment...
 set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%"
 set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%"
 set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%"
+set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt"
--- a/.ci/pytorch/windows/internal/install_python.bat
+++ b/.ci/pytorch/windows/internal/install_python.bat
@ -18,3 +18,5 @@ start /wait "" python-amd64.exe /quiet InstallAllUsers=1 PrependPath=0 Include_t
 if errorlevel 1 exit /b 1

 set "PATH=%CD%\Python\Scripts;%CD%\Python;%PATH%"
+%PYTHON_EXEC% -m pip install --upgrade pip setuptools packaging wheel
+if errorlevel 1 exit /b 1
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@ -75,8 +75,8 @@ TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
 # Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
 TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"

-# CUDA 12.8 builds have triton for Linux and Linux aarch64 binaries.
-if [[ "$DESIRED_CUDA" == cu128 ]]; then
+# CUDA 12.9 builds have triton for Linux and Linux aarch64 binaries.
+if [[ "$DESIRED_CUDA" == "cu129" ]]; then
  TRITON_CONSTRAINT="platform_system == 'Linux'"
 fi

--- a/.github/actions/filter-test-configs/action.yml
+++ b/.github/actions/filter-test-configs/action.yml
@ -125,7 +125,7 @@ runs:
        TAG: ${{ steps.parse-ref.outputs.tag }}
        EVENT_NAME: ${{ github.event_name }}
        SCHEDULE: ${{ github.event.schedule }}
-        HEAD_BRANCH: ${{ github.event.workflow_run.head_branch }}
+        HEAD_BRANCH: ${{ steps.parse-ref.outputs.branch }}
      id: filter
      run: |
        echo "Workflow: ${GITHUB_WORKFLOW}"
--- a/.github/actions/reuse-old-whl/reuse_old_whl.py
+++ b/.github/actions/reuse-old-whl/reuse_old_whl.py
@ -304,8 +304,7 @@ def unzip_artifact_and_replace_files() -> None:


 def set_output() -> None:
-    # Disable for now so we can monitor first
-    # pass
+    print("Setting output reuse=true")
    if os.getenv("GITHUB_OUTPUT"):
        with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
            print("reuse=true", file=env)
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-4e94321c54617dd738a05bfedfc28bc0fa635b5c
+70caf76066ef2c1054d6128b11769dc816a779e7
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-55a75404c9b75cd5fd62ab5d4deafc8c506b3af2
+1c00dea2c9adb2137903c86b4191e8c247f8fda9
--- a/.github/label_to_label.yml
+++ b/.github/label_to_label.yml
@ -48,3 +48,12 @@
  - "module: dynamic shapes"
  then:
  - "oncall: pt2"
+- any:
+  - "release notes: distributed (c10d)"
+  - "release notes: distributed (symm_mem)"
+  - "release notes: distributed (pipeline)"
+  - "release notes: distributed (fsdp)"
+  - "release notes: distributed (dtensor)"
+  - "oncall: distributed"
+  then:
+  - "ciflow/h100-distributed"
--- a/.github/merge_rules.yaml
+++ b/.github/merge_rules.yaml
@ -384,6 +384,7 @@
  - leslie-fang-intel
  - jgong5
  - EikanWang
+  - CaoE
  mandatory_checks_name:
  - EasyCLA
  - Lint
@ -435,6 +436,7 @@
  approved_by:
  - leslie-fang-intel
  - jgong5
+  - CaoE
  mandatory_checks_name:
  - EasyCLA
  - Lint
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -4,6 +4,7 @@ ciflow_push_tags:
 - ciflow/binaries
 - ciflow/binaries_libtorch
 - ciflow/binaries_wheel
+- ciflow/triton_binaries
 - ciflow/inductor
 - ciflow/inductor-periodic
 - ciflow/inductor-rocm
@ -30,6 +31,7 @@ ciflow_push_tags:
 - ciflow/pull
 - ciflow/h100
 - ciflow/h100-distributed
+- ciflow/h100-symm-mem
 retryable_workflows:
 - pull
 - trunk
--- a/.github/scripts/delete_old_branches.py
+++ b/.github/scripts/delete_old_branches.py
@ -275,7 +275,7 @@ def delete_branches() -> None:
        delete_branch(git_repo, branch)


-def delete_old_ciflow_tags() -> None:
+def delete_old_tags() -> None:
    # Deletes ciflow tags if they are associated with a closed PR or a specific
    # commit.  Lightweight tags don't have information about the date they were
    # created, so we can't check how old they are.  The script just assumes that
@ -288,23 +288,29 @@ def delete_old_ciflow_tags() -> None:
        delete_branch(git_repo, f"refs/tags/{tag}")

    tags = git_repo._run_git("tag").splitlines()
-    open_pr_numbers = [x["number"] for x in get_open_prs()]

+    CIFLOW_TAG_REGEX = re.compile(r"^ciflow\/.*\/(\d{5,6}|[0-9a-f]{40})$")
+    AUTO_REVERT_TAG_REGEX = re.compile(r"^trunk\/[0-9a-f]{40}$")
    for tag in tags:
        try:
            if ESTIMATED_TOKENS[0] > 400:
                print("Estimated tokens exceeded, exiting")
                break
-            if not tag.startswith("ciflow/"):
+
+            if not CIFLOW_TAG_REGEX.match(tag) and not AUTO_REVERT_TAG_REGEX.match(tag):
                continue
-            re_match_pr = re.match(r"^ciflow\/.*\/(\d{5,6})$", tag)
-            re_match_sha = re.match(r"^ciflow\/.*\/([0-9a-f]{40})$", tag)
-            if re_match_pr:
-                pr_number = int(re_match_pr.group(1))
-                if pr_number in open_pr_numbers:
-                    continue
-                delete_tag(tag)
-            elif re_match_sha:
+
+            # This checks the date of the commit associated with the tag instead
+            # of the tag itself since lightweight tags don't have this
+            # information.  I think it should be ok since this only runs once a
+            # day
+            tag_info = git_repo._run_git("show", "-s", "--format=%ct", tag)
+            tag_timestamp = int(tag_info.strip())
+            # Maybe some timezone issues, but a few hours shouldn't matter
+            tag_age_days = (datetime.now().timestamp() - tag_timestamp) / SEC_IN_DAY
+
+            if tag_age_days > 7:
+                print(f"[{tag}] Tag is older than 7 days, deleting")
                delete_tag(tag)
        except Exception as e:
            print(f"Failed to check tag {tag}: {e}")
@ -312,4 +318,4 @@ def delete_old_ciflow_tags() -> None:

 if __name__ == "__main__":
    delete_branches()
-    delete_old_ciflow_tags()
+    delete_old_tags()
--- a/.github/scripts/filter_test_configs.py
+++ b/.github/scripts/filter_test_configs.py
@ -18,6 +18,7 @@ import yaml


 REENABLE_TEST_REGEX = "(?i)(Close(d|s)?|Resolve(d|s)?|Fix(ed|es)?) (#|https://github.com/pytorch/pytorch/issues/)([0-9]+)"
+MAIN_BRANCH = "main"

 PREFIX = "test-config/"

@ -97,7 +98,7 @@ def parse_args() -> Any:
    parser.add_argument(
        "--branch",
        type=str,
-        default="main",
+        default=MAIN_BRANCH,
        help="the branch name",
    )
    return parser.parse_args()
@ -456,6 +457,7 @@ def download_json(url: str, headers: dict[str, str], num_retries: int = 3) -> An


 def set_output(name: str, val: Any) -> None:
+    print(f"Setting output {name}={val}")
    if os.getenv("GITHUB_OUTPUT"):
        with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
            print(f"{name}={val}", file=env)
@ -495,13 +497,20 @@ def check_for_setting(labels: set[str], body: str, setting: str) -> bool:


 def perform_misc_tasks(
-    labels: set[str], test_matrix: dict[str, list[Any]], job_name: str, pr_body: str
+    labels: set[str],
+    test_matrix: dict[str, list[Any]],
+    job_name: str,
+    pr_body: str,
+    branch: Optional[str] = None,
 ) -> None:
    """
    In addition to apply the filter logic, the script also does the following
    misc tasks to set keep-going and is-unstable variables
    """
-    set_output("keep-going", check_for_setting(labels, pr_body, "keep-going"))
+    set_output(
+        "keep-going",
+        branch == MAIN_BRANCH or check_for_setting(labels, pr_body, "keep-going"),
+    )
    set_output(
        "ci-verbose-test-logs",
        check_for_setting(labels, pr_body, "ci-verbose-test-logs"),
@ -624,6 +633,7 @@ def main() -> None:
        test_matrix=filtered_test_matrix,
        job_name=args.job_name,
        pr_body=pr_body if pr_body else "",
+        branch=args.branch,
    )

    # Set the filtered test matrix as the output
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -17,7 +17,7 @@ from typing import Optional

 # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
 CUDA_ARCHES = ["12.6", "12.8", "12.9"]
-CUDA_STABLE = "12.6"
+CUDA_STABLE = "12.8"
 CUDA_ARCHES_FULL_VERSION = {
    "12.6": "12.6.3",
    "12.8": "12.8.1",
@ -54,7 +54,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -71,7 +71,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -88,6 +88,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
--- a/.github/scripts/get_workflow_job_id.py
+++ b/.github/scripts/get_workflow_job_id.py
@ -136,10 +136,10 @@ def find_job_id_name(args: Any) -> tuple[str, str]:


 def set_output(name: str, val: Any) -> None:
+    print(f"Setting output {name}={val}")
    if os.getenv("GITHUB_OUTPUT"):
        with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
            print(f"{name}={val}", file=env)
-        print(f"setting {name}={val}")
    else:
        print(f"::set-output name={name}::{val}")

--- a/.github/scripts/parse_ref.py
+++ b/.github/scripts/parse_ref.py
@ -5,6 +5,7 @@ import re


 def set_output(name: str, val: str) -> None:
+    print(f"Setting output {name}={val}")
    if os.getenv("GITHUB_OUTPUT"):
        with open(str(os.getenv("GITHUB_OUTPUT")), "a") as env:
            print(f"{name}={val}", file=env)
--- a/.github/scripts/tag_docker_images_for_release.py
+++ b/.github/scripts/tag_docker_images_for_release.py
@ -1,64 +0,0 @@
-import argparse
-import subprocess
-
-import generate_binary_build_matrix
-
-
-def tag_image(
-    image: str,
-    default_tag: str,
-    release_version: str,
-    dry_run: str,
-    tagged_images: dict[str, bool],
-) -> None:
-    if image in tagged_images:
-        return
-    release_image = image.replace(f"-{default_tag}", f"-{release_version}")
-    print(f"Tagging {image} to {release_image} , dry_run: {dry_run}")
-
-    if dry_run == "disabled":
-        subprocess.check_call(["docker", "pull", image])
-        subprocess.check_call(["docker", "tag", image, release_image])
-        subprocess.check_call(["docker", "push", release_image])
-    tagged_images[image] = True
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--version",
-        help="Version to tag",
-        type=str,
-        default="2.2",
-    )
-    parser.add_argument(
-        "--dry-run",
-        help="No Runtime Error check",
-        type=str,
-        choices=["enabled", "disabled"],
-        default="enabled",
-    )
-
-    options = parser.parse_args()
-    tagged_images: dict[str, bool] = {}
-    platform_images = [
-        generate_binary_build_matrix.WHEEL_CONTAINER_IMAGES,
-        generate_binary_build_matrix.LIBTORCH_CONTAINER_IMAGES,
-    ]
-    default_tag = generate_binary_build_matrix.DEFAULT_TAG
-
-    for platform_image in platform_images:  # type: ignore[attr-defined]
-        for arch in platform_image.keys():  # type: ignore[attr-defined]
-            if arch == "cpu-s390x":
-                continue
-            tag_image(
-                platform_image[arch],  # type: ignore[index]
-                default_tag,
-                options.version,
-                options.dry_run,
-                tagged_images,
-            )
-
-
-if __name__ == "__main__":
-    main()
--- a/.github/scripts/test_delete_old_branches.py
+++ b/.github/scripts/test_delete_old_branches.py
@ -0,0 +1,56 @@
+import os
+import unittest
+from datetime import datetime
+from unittest.mock import MagicMock, patch
+
+
+os.environ["GITHUB_TOKEN"] = "test_token"
+
+from delete_old_branches import delete_old_tags
+
+
+@patch("delete_old_branches.delete_branch")
+@patch("gitutils.GitRepo._run_git")
+class TestDeleteTag(unittest.TestCase):
+    def test_delete_tag(
+        self, mock_run_git: "MagicMock", mock_delete_tag: "MagicMock"
+    ) -> None:
+        for tag in [
+            "ciflow/branch/12345",
+            "ciflow/commitsha/1234567890abcdef1234567890abcdef12345678",
+            "trunk/1234567890abcdef1234567890abcdef12345678",
+        ]:
+            mock_run_git.side_effect = [
+                tag,
+                str(int(datetime.now().timestamp() - 8 * 24 * 60 * 60)),  # 8 days ago
+            ]
+            delete_old_tags()
+            mock_delete_tag.assert_called_once()
+            mock_delete_tag.reset_mock()
+
+            # Don't delete if the tag is not old enough
+            mock_run_git.side_effect = [
+                tag,
+                str(int(datetime.now().timestamp() - 6 * 24 * 60 * 60)),  # 6 days ago
+            ]
+            delete_old_tags()
+            mock_delete_tag.assert_not_called()
+
+    def test_do_not_delete_tag(
+        self, mock_run_git: "MagicMock", mock_delete_tag: "MagicMock"
+    ) -> None:
+        for tag in [
+            "ciflow/doesntseemtomatch",
+            "trunk/doesntseemtomatch",
+            "doesntseemtomatch",
+        ]:
+            mock_run_git.side_effect = [
+                tag,
+                str(int(datetime.now().timestamp() - 8 * 24 * 60 * 60)),  # 8 days ago
+            ]
+            delete_old_tags()
+            mock_delete_tag.assert_not_called()
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/.github/scripts/windows/build_magma.bat
+++ b/.github/scripts/windows/build_magma.bat
@ -17,6 +17,7 @@ if errorlevel 1 exit /b 1

 set "PATH=C:\Tools;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%\libnvvp;%PATH%"
 set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUVER%
+set NVTOOLSEXT_PATH=C:\Program Files\NVIDIA Corporation\NvToolsExt

 mkdir magma_cuda%CUVER_NODOT%
 cd magma_cuda%CUVER_NODOT%
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@ -69,11 +69,6 @@ on:
        required: false
        type: string
        default: ""
-      max-jobs:
-        description: |
-          Overwrite the number of jobs to use for the build
-        required: false
-        type: string
      disable-monitor:
        description: |
          Disable utilization monitoring for build job
@ -266,7 +261,6 @@ jobs:
          OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
-          MAX_JOBS_OVERRIDE: ${{ inputs.max-jobs }}
        run: |
          START_TIME=$(date +%s)
          if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
@ -286,12 +280,6 @@ jobs:
            DOCKER_SHELL_CMD=
          fi

-          if [[ ${MAX_JOBS_OVERRIDE} == "" ]]; then
-            MAX_JOBS="$(nproc --ignore=2)"
-          else
-            MAX_JOBS="${MAX_JOBS_OVERRIDE}"
-          fi
-
          # Leaving 1GB for the runner and other things
          TOTAL_AVAILABLE_MEMORY_IN_GB=$(awk '/MemTotal/ { printf "%.3f \n", $2/1024/1024 - 1 }' /proc/meminfo)
          # https://docs.docker.com/engine/containers/resource_constraints/#--memory-swap-details, the 3GB swap
@ -303,8 +291,7 @@ jobs:
          # shellcheck disable=SC2086
          container_name=$(docker run \
            -e BUILD_ENVIRONMENT \
-            -e MAX_JOBS=${MAX_JOBS} \
-            -e MAX_JOBS_OVERRIDE \
+            -e MAX_JOBS="$(nproc --ignore=2)" \
            -e AWS_DEFAULT_REGION \
            -e PR_NUMBER \
            -e SHA1 \
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@ -90,10 +90,13 @@ jobs:
    environment: ${{ github.ref == 'refs/heads/main' && 'scribe-protected' || startsWith(github.ref, 'refs/heads/release/') && 'scribe-protected' || contains(github.event.pull_request.labels.*.name, 'ci-scribe') && 'scribe-pr' || '' }}
    runs-on: ${{ matrix.runner }}
    timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }}
+    permissions:
+      id-token: write
+      contents: read
    steps:
      - name: Setup SSH (Click me for login details)
        uses: pytorch/test-infra/.github/actions/setup-ssh@main
-        if: ${{ !contains(matrix.runner, 'gcp.a100') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
+        if: ${{ matrix.runner != 'B200' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
        with:
          github-secret: ${{ secrets.GITHUB_TOKEN }}
          instructions: |
@ -105,18 +108,31 @@ jobs:
        with:
          no-sudo: true

+      - name: Setup Python
+        if: matrix.runner == 'B200'
+        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
+        with:
+          python-version: '3.12'
+          cache: pip
+
      - name: Setup Linux
        uses: ./.github/actions/setup-linux
-        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
+        if: inputs.build-environment != 'linux-s390x-binary-manywheel' && matrix.runner != 'B200'

      - name: configure aws credentials
-        if : ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
+        if: ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
        with:
          role-to-assume: ${{ inputs.aws-role-to-assume }}
          role-session-name: gha-linux-test
          aws-region: us-east-1

+      - name: Login to Amazon ECR
+        if: ${{ inputs.aws-role-to-assume != '' && matrix.runner == 'B200' }}
+        id: login-ecr
+        continue-on-error: true
+        uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
+
      - name: Calculate docker image
        id: calculate-docker-image
        uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
@ -148,17 +164,17 @@ jobs:
      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        id: install-nvidia-driver
        uses: pytorch/test-infra/.github/actions/setup-nvidia@main
-        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' }}
+        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && matrix.runner != 'B200' }}

      - name: Setup GPU_FLAG for docker run
        id: setup-gpu-flag
        run: echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"
-        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
+        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && (steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' || matrix.runner == 'B200') }}

      - name: Setup SCCACHE_SERVER_PORT environment for docker run when on container
        id: setup-sscache-port-flag
        run: echo "SCCACHE_SERVER_PORT_DOCKER_FLAG=-e SCCACHE_SERVER_PORT=$((RUNNER_UID + 4226))" >> "${GITHUB_ENV}"
-        if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' }}
+        if: ${{ steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'true' && matrix.runner != 'B200' }}

      - name: Lock NVIDIA A100 40GB Frequency
        run: |
@ -225,6 +241,12 @@ jobs:
        run: |
          echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}"

+      - name: Preserve github env variables for use in docker
+        shell: bash
+        run: |
+          env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
+          env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
+
      - name: Test
        id: test
        timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
@ -253,8 +275,8 @@ jobs:
          NO_TD: ${{ steps.keep-going.outputs.ci-no-td }}
          TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }}
          # Do not set SCCACHE_S3_KEY_PREFIX to share the cache between all build jobs
-          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
-          SCCACHE_REGION: us-east-1
+          SCCACHE_BUCKET: ${{ matrix.runner != 'B200' && 'ossci-compiler-cache-circleci-v2' || '' }}
+          SCCACHE_REGION: ${{ matrix.runner != 'B200' && 'us-east-1' || '' }}
          SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }}
          DOCKER_IMAGE: ${{ inputs.docker-image }}
          XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }}
@ -264,7 +286,6 @@ jobs:
          DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
          SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.SCRIBE_GRAPHQL_ACCESS_TOKEN }}
-          IS_A100_RUNNER: ${{ contains(matrix.runner, 'a100') && '1' || '0' }}
          ARTIFACTS_FILE_SUFFIX: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }}
        run: |
          set -x
@ -290,10 +311,6 @@ jobs:
            # if for some reason cleanup action doesn't stop container
            # when job is cancelled
            DOCKER_SHELL_CMD="sleep 12h"
-
-            # since some steps are skipped on s390x, if they are necessary, run them here
-            env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
-            env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
          else
            SHM_OPTS="--shm-size=${SHM_SIZE}"
            JENKINS_USER="--user jenkins"
@ -345,7 +362,6 @@ jobs:
            -e HUGGING_FACE_HUB_TOKEN \
            -e SCRIBE_GRAPHQL_ACCESS_TOKEN \
            -e DASHBOARD_TAG \
-            -e IS_A100_RUNNER \
            -e ARTIFACTS_FILE_SUFFIX \
            --memory="${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}g" \
            --memory-swap="${TOTAL_MEMORY_WITH_SWAP}g" \
@ -384,6 +400,15 @@ jobs:
          test_config: ${{ matrix.config }}
          job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }}

+      - name: Authenticate with AWS
+        if: ${{ matrix.runner == 'B200' }}
+        uses: aws-actions/configure-aws-credentials@ececac1a45f3b08a01d2dd070d28d111c5fe6722 # v4.1.0
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
+          # The max duration enforced by the server side
+          role-duration-seconds: 18000
+          aws-region: us-east-1
+
      - name: Upload the benchmark results
        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
        if: inputs.build-environment != 'linux-s390x-binary-manywheel'
--- a/.github/workflows/_mac-build.yml
+++ b/.github/workflows/_mac-build.yml
@ -123,7 +123,7 @@ jobs:
            else
              # The runner has access to the S3 bucket via IAM profile without the need
              # for any credential
-              echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"0
+              echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}"
              echo "SCCACHE_S3_KEY_PREFIX=${GITHUB_WORKFLOW}" >> "${GITHUB_ENV}"
            fi

--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@ -88,6 +88,10 @@ jobs:
            pkill "${PROCESS}" || true
          done

+      - name: Clean up leftover miniconda installation
+        continue-on-error: true
+        run: brew uninstall miniconda || true
+
      - name: Clean up leftover local python3 site-packages on MacOS pet runner
        continue-on-error: true
        run: |
--- a/.github/workflows/_xpu-test.yml
+++ b/.github/workflows/_xpu-test.yml
@ -191,9 +191,6 @@ jobs:
          SHARD_NUMBER: ${{ matrix.shard }}
          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
          REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
-          SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
-          SCCACHE_REGION: us-east-1
-          SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
          DOCKER_IMAGE: ${{ inputs.docker-image }}
          XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla
          PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
--- a/.github/workflows/build-almalinux-images.yml
+++ b/.github/workflows/build-almalinux-images.yml
@ -23,7 +23,7 @@ on:
 env:
  DOCKER_REGISTRY: "docker.io"
  DOCKER_BUILDKIT: 1
-  WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }}
+  WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) }}

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -32,7 +32,7 @@ concurrency:
 jobs:
  build-docker:
    if: github.repository_owner == 'pytorch'
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
+    environment: ${{ (github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) && 'docker-build') || '' }}
    runs-on: linux.9xlarge.ephemeral
    strategy:
      matrix:
--- a/.github/workflows/build-libtorch-images.yml
+++ b/.github/workflows/build-libtorch-images.yml
@ -22,7 +22,7 @@ on:
 env:
  DOCKER_REGISTRY: "docker.io"
  DOCKER_BUILDKIT: 1
-  WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }}
+  WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) }}

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -40,7 +40,7 @@ jobs:
      curr_ref_type: ${{ github.ref_type }}

  build:
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
+    environment: ${{ (github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) && 'docker-build') || '' }}
    needs: get-label-type
    runs-on: ${{ needs.get-label-type.outputs.label-type }}linux.9xlarge.ephemeral
    name: libtorch-cxx11-builder:${{ matrix.tag }}
--- a/.github/workflows/build-manywheel-images-s390x.yml
+++ b/.github/workflows/build-manywheel-images-s390x.yml
@ -12,7 +12,7 @@ on:
 env:
  DOCKER_REGISTRY: "docker.io"
  DOCKER_BUILDKIT: 1
-  WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }}
+  WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) }}

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -21,7 +21,7 @@ concurrency:
 jobs:
  build-docker-cpu-s390x:
    if: github.repository_owner == 'pytorch'
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
+    environment: ${{ (github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) && 'docker-build') || '' }}
    runs-on: linux.s390x
    steps:
      - name: Checkout PyTorch
--- a/.github/workflows/build-manywheel-images.yml
+++ b/.github/workflows/build-manywheel-images.yml
@ -23,8 +23,7 @@ on:
 env:
  DOCKER_REGISTRY: "docker.io"
  DOCKER_BUILDKIT: 1
-  WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release')) }}
-
+  WITH_PUSH: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) }}
 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
@ -41,7 +40,7 @@ jobs:
      curr_ref_type: ${{ github.ref_type }}

  build:
-    environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }}
+    environment: ${{ (github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/release') || startsWith(github.ref, 'refs/tags/v')) && 'docker-build') || '' }}
    needs: get-label-type
    strategy:
      fail-fast: false
--- a/.github/workflows/build-triton-wheel.yml
+++ b/.github/workflows/build-triton-wheel.yml
@ -8,6 +8,7 @@ on:
      # NOTE: Binary build pipelines should only get triggered on release candidate builds
      # Release candidate tags look like: v1.11.0-rc1
      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+      - 'ciflow/triton_binaries/*'
    paths:
      - .github/workflows/build-triton-wheel.yml
      - .github/scripts/build_triton_wheel.py
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@ -82,7 +82,7 @@ jobs:
          path: ${{ env.PT_RELEASE_FILE }}
      - name: Set output
        id: release_name
-        run: echo "name=pt_release_name::${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}"
+        run: echo "pt_release_name=${{ env.PT_RELEASE_NAME }}.tar.gz" >> "${GITHUB_OUTPUT}"

  upload_source_code_to_s3:
    if: ${{ github.repository == 'pytorch/pytorch' && github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') && contains(github.ref, 'rc') }}
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -69,7 +69,6 @@ jobs:
          pytorch-linux-jammy-py3.12-halide,
          pytorch-linux-jammy-xpu-2025.0-py3,
          pytorch-linux-jammy-xpu-2025.1-py3,
-          pytorch-linux-jammy-py3-clang15-asan,
          pytorch-linux-jammy-py3-clang18-asan,
          pytorch-linux-jammy-py3-clang12-onnx,
          pytorch-linux-jammy-linter,
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -136,7 +136,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_9-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -252,7 +252,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -368,7 +368,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -484,7 +484,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -600,7 +600,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -716,7 +716,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generated-linux-binary-manywheel-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-main.yml
@ -61,7 +61,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_6-test:  # Testing
@ -108,7 +108,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_8-test:  # Testing
@ -155,7 +155,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_9-test:  # Testing
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@ -131,7 +131,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_6-test:  # Testing
@ -200,7 +200,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_8-test:  # Testing
@ -269,7 +269,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_9-test:  # Testing
@ -744,7 +744,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_6-test:  # Testing
@ -813,7 +813,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_8-test:  # Testing
@ -882,7 +882,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_9-test:  # Testing
@ -1357,7 +1357,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_6-test:  # Testing
@ -1407,74 +1407,6 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

-  manywheel-py3_11-cuda12_6-full-build:
-    if: ${{ github.repository_owner == 'pytorch' }}
-    uses: ./.github/workflows/_binary-build-linux.yml
-    needs: get-label-type
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
-      use_split_build: False
-      DESIRED_PYTHON: "3.11"
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_11-cuda12_6-full
-      build_environment: linux-binary-manywheel
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_11-cuda12_6-full-test:  # Testing
-    if: ${{ github.repository_owner == 'pytorch' }}
-    needs:
-      - manywheel-py3_11-cuda12_6-full-build
-      - get-label-type
-    uses: ./.github/workflows/_binary-test-linux.yml
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
-      use_split_build: False
-      DESIRED_PYTHON: "3.11"
-      build_name: manywheel-py3_11-cuda12_6-full
-      build_environment: linux-binary-manywheel
-      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      runs_on: linux.4xlarge.nvidia.gpu # for other cuda versions, we use 4xlarge runner
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_11-cuda12_6-full-upload:  # Uploading
-    if: ${{ github.repository_owner == 'pytorch' }}
-    permissions:
-      id-token: write
-      contents: read
-    needs: manywheel-py3_11-cuda12_6-full-test
-    with:
-      PYTORCH_ROOT: /pytorch
-      PACKAGE_TYPE: manywheel
-      # TODO: This is a legacy variable that we eventually want to get rid of in
-      #       favor of GPU_ARCH_VERSION
-      DESIRED_CUDA: cu126
-      GPU_ARCH_VERSION: 12.6
-      GPU_ARCH_TYPE: cuda
-      DOCKER_IMAGE: manylinux2_28-builder
-      DOCKER_IMAGE_TAG_PREFIX: cuda12.6
-      use_split_build: False
-      DESIRED_PYTHON: "3.11"
-      build_name: manywheel-py3_11-cuda12_6-full
-    secrets:
-      github-token: ${{ secrets.GITHUB_TOKEN }}
-    uses: ./.github/workflows/_binary-upload.yml
-
  manywheel-py3_11-cuda12_8-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -1494,7 +1426,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_8-test:  # Testing
@ -1544,6 +1476,74 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_11-cuda12_8-full-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
+      use_split_build: False
+      DESIRED_PYTHON: "3.11"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: manywheel-py3_11-cuda12_8-full
+      build_environment: linux-binary-manywheel
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_11-cuda12_8-full-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_11-cuda12_8-full-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
+      use_split_build: False
+      DESIRED_PYTHON: "3.11"
+      build_name: manywheel-py3_11-cuda12_8-full
+      build_environment: linux-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu  # 12.8 and 12.9 build need sm_70+ runner
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_11-cuda12_8-full-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_11-cuda12_8-full-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu128
+      GPU_ARCH_VERSION: 12.8
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.8
+      use_split_build: False
+      DESIRED_PYTHON: "3.11"
+      build_name: manywheel-py3_11-cuda12_8-full
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_11-cuda12_9-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -1563,7 +1563,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_9-test:  # Testing
@ -2038,7 +2038,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_6-test:  # Testing
@ -2107,7 +2107,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_8-test:  # Testing
@ -2176,7 +2176,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_9-test:  # Testing
@ -2651,7 +2651,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_6-test:  # Testing
@ -2720,7 +2720,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_8-test:  # Testing
@ -2789,7 +2789,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_9-test:  # Testing
@ -3264,7 +3264,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_6-test:  # Testing
@ -3333,7 +3333,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.2.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_8-test:  # Testing
@ -3402,7 +3402,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_9-test:  # Testing
--- a/.github/workflows/h100-distributed.yml
+++ b/.github/workflows/h100-distributed.yml
@ -8,6 +8,8 @@ on:
  push:
    tags:
      - ciflow/h100-distributed/*
+  schedule:
+    - cron: 46 8 * * *  # about 1:46am PDT

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
--- a/.github/workflows/h100-symm-mem.yml
+++ b/.github/workflows/h100-symm-mem.yml
@ -0,0 +1,54 @@
+name: Limited CI for symmetric memory tests on H100
+
+on:
+  pull_request:
+    paths:
+      - .github/workflows/h100-symm-mem.yml
+  workflow_dispatch:
+  push:
+    tags:
+      - ciflow/h100-symm-mem/*
+  schedule:
+    - cron: 22 8 * * *  # about 1:22am PDT
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+jobs:
+
+  get-label-type:
+    if: github.repository_owner == 'pytorch'
+    name: get-label-type
+    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
+    with:
+      triggering_actor: ${{ github.triggering_actor }}
+      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
+      curr_branch: ${{ github.head_ref || github.ref_name }}
+      curr_ref_type: ${{ github.ref_type }}
+
+  linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
+    uses: ./.github/workflows/_linux-build.yml
+    needs: get-label-type
+    with:
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      cuda-arch-list: '9.0'
+      test-matrix: |
+        { include: [
+          { config: "h100-symm-mem", shard: 1, num_shards: 1, runner: "linux.aws.h100.4" },
+        ]}
+    secrets: inherit
+
+  linux-jammy-cuda12_8-py3_10-gcc11-sm90-test:
+    name: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
+    uses: ./.github/workflows/_linux-test.yml
+    needs:
+      - linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm
+    with:
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc11-sm90-symm
+      docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm.outputs.docker-image }}
+      test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-sm90-build-symm.outputs.test-matrix }}
+    secrets: inherit
--- a/.github/workflows/inductor-micro-benchmark-x86.yml
+++ b/.github/workflows/inductor-micro-benchmark-x86.yml
@ -13,7 +13,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  linux-jammy-cpu-py3_9-gcc11-inductor-build:
--- a/.github/workflows/inductor-micro-benchmark.yml
+++ b/.github/workflows/inductor-micro-benchmark.yml
@ -13,7 +13,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  get-default-label-prefix:
--- a/.github/workflows/inductor-nightly.yml
+++ b/.github/workflows/inductor-nightly.yml
@ -16,7 +16,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  get-default-label-prefix:
--- a/.github/workflows/inductor-perf-compare.yml
+++ b/.github/workflows/inductor-perf-compare.yml
@ -10,7 +10,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  get-default-label-prefix:
--- a/.github/workflows/inductor-perf-test-nightly-aarch64.yml
+++ b/.github/workflows/inductor-perf-test-nightly-aarch64.yml
@ -48,7 +48,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  get-label-type:
--- a/.github/workflows/inductor-perf-test-nightly-h100.yml
+++ b/.github/workflows/inductor-perf-test-nightly-h100.yml
@ -63,7 +63,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  get-label-type:
--- a/.github/workflows/inductor-perf-test-nightly-rocm.yml
+++ b/.github/workflows/inductor-perf-test-nightly-rocm.yml
@ -5,7 +5,7 @@ on:
    tags:
      - ciflow/inductor-perf-test-nightly-rocm/*
  schedule:
-    - cron: 0 7 * * 0
+    - cron: 0 7 * * 0,3
  # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
  # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
  workflow_dispatch:
@ -88,18 +88,23 @@ jobs:
      docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
      test-matrix: |
        { include: [
-          { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" },
-          { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" },
-          { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi300.2" },
+          { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
+          { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
+          { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
+          { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
          { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
          { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
          { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
          { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
          { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi300.2" },
-          { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
-          { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
-          { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
-          { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.mi300.2" },
+          { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
+          { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
+          { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
+          { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
+          { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
+          { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
+          { config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
+          { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.mi300.2" },
        ]}
    secrets: inherit

--- a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml
+++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml
@ -47,13 +47,15 @@ on:
        description: The list of configs used the benchmark
        required: false
        type: string
-        default: inductor_huggingface_perf_zen_cpu_x86,inductor_timm_perf_zen_cpu_x86,inductor_torchbench_perf_zen_cpu_x86
+        default: inductor_huggingface_perf_cpu_x86_zen,inductor_timm_perf_cpu_x86_zen,inductor_torchbench_perf_cpu_x86_zen

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  get-label-type:
@ -77,18 +79,18 @@ jobs:
      docker-image-name: ci-image:pytorch-linux-jammy-py3.9-gcc11-inductor-benchmarks
      test-matrix: |
        { include: [
-          { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" },
-          { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 2, num_shards: 3, runner: "linux.24xlarge.amd" },
-          { config: "inductor_huggingface_perf_zen_cpu_x86", shard: 3, num_shards: 3, runner: "linux.24xlarge.amd" },
-          { config: "inductor_timm_perf_zen_cpu_x86", shard: 1, num_shards: 5, runner: "linux.24xlarge.amd" },
-          { config: "inductor_timm_perf_zen_cpu_x86", shard: 2, num_shards: 5, runner: "linux.24xlarge.amd" },
-          { config: "inductor_timm_perf_zen_cpu_x86", shard: 3, num_shards: 5, runner: "linux.24xlarge.amd" },
-          { config: "inductor_timm_perf_zen_cpu_x86", shard: 4, num_shards: 5, runner: "linux.24xlarge.amd" },
-          { config: "inductor_timm_perf_zen_cpu_x86", shard: 5, num_shards: 5, runner: "linux.24xlarge.amd" },
-          { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 1, num_shards: 4, runner: "linux.24xlarge.amd" },
-          { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 2, num_shards: 4, runner: "linux.24xlarge.amd" },
-          { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 3, num_shards: 4, runner: "linux.24xlarge.amd" },
-          { config: "inductor_torchbench_perf_zen_cpu_x86", shard: 4, num_shards: 4, runner: "linux.24xlarge.amd" },
+          { config: "inductor_huggingface_perf_cpu_x86_zen", shard: 1, num_shards: 3, runner: "linux.24xlarge.amd" },
+          { config: "inductor_huggingface_perf_cpu_x86_zen", shard: 2, num_shards: 3, runner: "linux.24xlarge.amd" },
+          { config: "inductor_huggingface_perf_cpu_x86_zen", shard: 3, num_shards: 3, runner: "linux.24xlarge.amd" },
+          { config: "inductor_timm_perf_cpu_x86_zen", shard: 1, num_shards: 5, runner: "linux.24xlarge.amd" },
+          { config: "inductor_timm_perf_cpu_x86_zen", shard: 2, num_shards: 5, runner: "linux.24xlarge.amd" },
+          { config: "inductor_timm_perf_cpu_x86_zen", shard: 3, num_shards: 5, runner: "linux.24xlarge.amd" },
+          { config: "inductor_timm_perf_cpu_x86_zen", shard: 4, num_shards: 5, runner: "linux.24xlarge.amd" },
+          { config: "inductor_timm_perf_cpu_x86_zen", shard: 5, num_shards: 5, runner: "linux.24xlarge.amd" },
+          { config: "inductor_torchbench_perf_cpu_x86_zen", shard: 1, num_shards: 4, runner: "linux.24xlarge.amd" },
+          { config: "inductor_torchbench_perf_cpu_x86_zen", shard: 2, num_shards: 4, runner: "linux.24xlarge.amd" },
+          { config: "inductor_torchbench_perf_cpu_x86_zen", shard: 3, num_shards: 4, runner: "linux.24xlarge.amd" },
+          { config: "inductor_torchbench_perf_cpu_x86_zen", shard: 4, num_shards: 4, runner: "linux.24xlarge.amd" },
        ]}
      selected-test-configs: ${{ inputs.benchmark_configs }}
    secrets: inherit
--- a/.github/workflows/inductor-perf-test-nightly-x86.yml
+++ b/.github/workflows/inductor-perf-test-nightly-x86.yml
@ -1,6 +1,9 @@
 name: inductor-perf-nightly-x86

 on:
+  pull_request:
+    paths:
+      - .github/workflows/inductor-perf-test-nightly-x86.yml
  schedule:
    # - cron: 0 7 * * 1-6
    # - cron: 0 7 * * 0
@ -40,6 +43,11 @@ on:
        required: false
        type: boolean
        default: false
+      freezing:
+        description: Run freezing?
+        required: false
+        type: boolean
+        default: true
      benchmark_configs:
        description: The list of configs used the benchmark
        required: false
@ -47,10 +55,12 @@ on:
        default: inductor_huggingface_perf_cpu_x86,inductor_timm_perf_cpu_x86,inductor_torchbench_perf_cpu_x86

 concurrency:
-  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  get-label-type:
@ -90,15 +100,14 @@ jobs:
      selected-test-configs: ${{ inputs.benchmark_configs }}
    secrets: inherit

-
-  linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly:
+  linux-jammy-cpu-py3_9-gcc11-inductor-test-nightly-freezing:
    name: linux-jammy-cpu-py3.9-gcc11-inductor
    uses: ./.github/workflows/_linux-test.yml
    needs: linux-jammy-cpu-py3_9-gcc11-inductor-build
    if: github.event.schedule == '0 7 * * *'
    with:
      build-environment: linux-jammy-py3.9-gcc11-build
-      dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
+      dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
      docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
      timeout-minutes: 720
@ -108,7 +117,6 @@ jobs:
      monitor-data-collect-interval: 4
    secrets: inherit

-
  linux-jammy-cpu-py3_9-gcc11-inductor-test:
    name: linux-jammy-cpu-py3.9-gcc11-inductor
    uses: ./.github/workflows/_linux-test.yml
@ -116,7 +124,7 @@ jobs:
    if: github.event_name == 'workflow_dispatch'
    with:
      build-environment: linux-jammy-py3.9-gcc11-build
-      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
+      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }}
      docker-image: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.linux-jammy-cpu-py3_9-gcc11-inductor-build.outputs.test-matrix }}
      timeout-minutes: 720
--- a/.github/workflows/inductor-perf-test-nightly.yml
+++ b/.github/workflows/inductor-perf-test-nightly.yml
@ -63,7 +63,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  get-label-type:
--- a/.github/workflows/inductor-periodic.yml
+++ b/.github/workflows/inductor-periodic.yml
@ -15,7 +15,9 @@ concurrency:
  cancel-in-progress: true


-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  get-default-label-prefix:
--- a/.github/workflows/inductor-unittest.yml
+++ b/.github/workflows/inductor-unittest.yml
@ -12,7 +12,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-unittest
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  get-label-type:
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@ -22,7 +22,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  unit-test:
--- a/.github/workflows/operator_benchmark.yml
+++ b/.github/workflows/operator_benchmark.yml
@ -19,7 +19,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  linux-jammy-cpu-py3_9-gcc11-opbenchmark-build:
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@ -20,7 +20,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  llm-td:
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -19,7 +19,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  llm-td:
@ -201,6 +203,7 @@ jobs:
          { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
          { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
          { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "einops", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }
        ]}
    secrets: inherit

@ -236,6 +239,7 @@ jobs:
          { config: "dynamo_wrapped", shard: 1, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
          { config: "dynamo_wrapped", shard: 2, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
          { config: "dynamo_wrapped", shard: 3, num_shards: 3, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
+          { config: "einops", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }
        ]}
    secrets: inherit

--- a/.github/workflows/s390x-periodic.yml
+++ b/.github/workflows/s390x-periodic.yml
@ -15,7 +15,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  llm-td:
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@ -18,7 +18,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  llm-td:
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@ -16,7 +16,9 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

-permissions: read-all
+permissions:
+  id-token: write
+  contents: read

 jobs:
  llm-td:
--- a/.github/workflows/update-viablestrict.yml
+++ b/.github/workflows/update-viablestrict.yml
@ -7,7 +7,7 @@ on:

 concurrency:
  group: ${{ github.workflow }}
-  cancel-in-progress: false
+  cancel-in-progress: true

 jobs:
  do_update_viablestrict:
--- a/.github/workflows/upload-test-stats.yml
+++ b/.github/workflows/upload-test-stats.yml
@ -16,6 +16,7 @@ on:
      - rocm-mi300
      - inductor-micro-benchmark
      - inductor-micro-benchmark-x86
+      - inductor-cu124
      - inductor-rocm
      - inductor-rocm-mi300
      - mac-mps
--- a/.github/workflows/xpu.yml
+++ b/.github/workflows/xpu.yml
@ -5,6 +5,10 @@ on:
    tags:
      - ciflow/xpu/*
  workflow_dispatch:
+  schedule:
+    # Run 3 times on weekdays and less frequently on weekends.
+    - cron: 45 0,8,16 * * 1-5
+    - cron: 45 4 * * 0,6

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
--- a/.gitmodules
+++ b/.gitmodules
@ -129,3 +129,6 @@
 [submodule "third_party/flash-attention"]
 	path = third_party/flash-attention
 	url = https://github.com/Dao-AILab/flash-attention.git
+[submodule "third_party/aiter"]
+	path = third_party/aiter
+	url = https://github.com/ROCm/aiter.git
--- a/.lintrunner.toml
+++ b/.lintrunner.toml
@ -87,7 +87,7 @@ include_patterns = [
    'torch/csrc/**/*.cpp',
    'torch/nativert/**/*.h',
    'torch/nativert/**/*.cpp',
-    'torch/standalone/**/*.h',
+    'torch/headeronly/**/*.h',
    'test/cpp/**/*.h',
    'test/cpp/**/*.cpp',
 ]
@ -122,6 +122,7 @@ is_formatter = true
 [[linter]]
 code = 'MYPY'
 include_patterns = [
+    'setup.py',
    'torch/**/*.py',
    'torch/**/*.pyi',
    'caffe2/**/*.py',
@ -241,7 +242,7 @@ include_patterns = [
    'torch/nativert/*.cpp',
    'torch/nativert/**/*.h',
    'torch/nativert/**/*.cpp',
-    'torch/standalone/**/*.h',
+    'torch/headeronly/**/*.h',
 ]
 exclude_patterns = [
    # The negative filters below are to exclude files that include onnx_pb.h or
@ -1156,6 +1157,7 @@ exclude_patterns = [
    'torch/_vendor/**',
    'torch/_inductor/fx_passes/serialized_patterns/**',
    'torch/_inductor/autoheuristic/artifacts/**',
+    'torch/utils/model_dump/preact.mjs',
    # These files are all grandfathered in, feel free to remove from this list
    # as necessary
    # NOTE: remove the patterns in the order they are listed
@ -1167,18 +1169,10 @@ exclude_patterns = [
    'test/**',
    'test/test_*',
    'test/[a-hA-h]*/**',
-    'test/inductor/**',
-    'test/dynamo/**',
    'test/distributed/**',
    'torch/**',
    'torch/_*/**',
-    'torch/ao/**',
-    'torch/fx/**',
    'torch/distributed/tensor/**',
-    'torch/[j-o]*/**',
-    'torch/utils/**',
-    'torch/csrc/jit/**',
-    'torch/csrc/jit/[a-o]*/**',
 ]
 init_command = [
    'python3',
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -671,14 +671,6 @@ flatbuffer_cc_library(
    out_prefix = "torch/csrc/jit/serialization/",
 )

-cc_library(
-    name = "torch_standalone_headers",
-    hdrs = glob([
-        "torch/standalone/**/*.h"
-    ]),
-    visibility = ["//visibility:public"],
-)
-
 cc_library(
    name = "torch_headers",
    hdrs = if_cuda(
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,31 +1,50 @@
-include MANIFEST.in
+# Reference: https://setuptools.pypa.io/en/latest/userguide/miscellaneous.html
+
+# Include source files in SDist
 include CMakeLists.txt
-include CITATION.cff
-include LICENSE
-include NOTICE
-include .gitmodules
-include build_variables.bzl
-include mypy.ini
-include requirements.txt
-include ufunc_defs.bzl
+include *.bzl *.bazel .bazel* BUILD *.BUILD BUILD.* WORKSPACE
+include BUCK BUCK.*
+include requirements*.txt
 include version.txt
-recursive-include android *.*
-recursive-include aten *.*
-recursive-include binaries *.*
-recursive-include c10 *.*
-recursive-include caffe2 *.*
-recursive-include cmake *.*
-recursive-include torch *.*
-recursive-include tools *.*
-recursive-include test *.*
-recursive-include docs *.*
-recursive-include ios *.*
-recursive-include third_party *
-recursive-include test *.*
-recursive-include benchmarks *.*
-recursive-include scripts *.*
-recursive-include mypy_plugins *.*
-recursive-include modules *.*
-recursive-include functorch *.*
+include [Mm]akefile *.[Mm]akefile [Mm]akefile.*
+include [Dd]ockerfile *.[Dd]ockerfile [Dd]ockerfile.* .dockerignore
+graft android
+graft aten
+graft binaries
+graft c10
+graft caffe2
+graft cmake
+graft functorch
+graft third_party
+graft tools
+graft torch
+graft torchgen
+# FIXME: torch-xla build during codegen will fail if include this file in wheel
+exclude torchgen/BUILD.bazel
+
+# Misc files and directories in SDist
+include *.md
+include CITATION.cff
+include LICENSE NOTICE
+include mypy*.ini
+graft benchmarks
+graft docs
+graft mypy_plugins
+graft scripts
+
+# Misc files needed for custom setuptools command
+include .gitignore
+include .gitmodules
+
+# Include test suites in SDist
+graft test
+include pytest.ini
+include .coveragerc
+
+# Prune generated/compiled files
+prune torchgen/packaged
 prune */__pycache__
-global-exclude *.o *.so *.dylib *.a .git *.pyc *.swp
+global-exclude *.o *.obj *.so *.a *.dylib *.pxd *.dll *.lib *.py[cod]
+
+prune */.git
+global-exclude .git *~ *.swp
--- a/6
+++ b/6
@ -57,7 +57,8 @@ setup-env-cuda:
 setup-env-rocm:
 	$(MAKE) setup-env PYTHON="$(PYTHON)" NIGHTLY_TOOL_OPTS="$(NIGHTLY_TOOL_OPTS) --rocm"

-.lintbin/.lintrunner.sha256: requirements.txt pyproject.toml .lintrunner.toml
+.PHONY: setup-lint
+setup-lint .lintbin/.lintrunner.sha256: requirements.txt pyproject.toml .lintrunner.toml
 	@echo "Setting up lintrunner..."
 	$(PIP) install lintrunner
 	lintrunner init
@ -65,9 +66,6 @@ setup-env-rocm:
 	@mkdir -p .lintbin
 	@sha256sum requirements.txt pyproject.toml .lintrunner.toml > .lintbin/.lintrunner.sha256

-.PHONY: setup-lint
-setup-lint: .lintbin/.lintrunner.sha256
-
 .PHONY: lazy-setup-lint
 lazy-setup-lint: .lintbin/.lintrunner.sha256
 	@if [ ! -x "$(shell command -v lintrunner)" ]; then \
--- a/README.md
+++ b/README.md
@ -200,7 +200,7 @@ If you want to compile with CUDA support, [select a supported version of CUDA fr
 - [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above
 - [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA

-Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware
+Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/backend/latest/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver, and NVIDIA hardware.

 If you want to disable CUDA support, export the environment variable `USE_CUDA=0`.
 Other potentially useful environment variables may be found in `setup.py`.  If
--- a/RELEASE.md
+++ b/RELEASE.md
@ -50,6 +50,7 @@ Following is the Release Compatibility Matrix for PyTorch releases:

 | PyTorch version | Python | C++ | Stable CUDA | Experimental CUDA | Stable ROCm |
 | --- | --- | --- | --- | --- | --- |
+| 2.8 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 12.6 (CUDNN 9.10.2.21), CUDA 12.8 (CUDNN 9.10.2.21) | CUDA 12.9 (CUDNN 9.10.2.21) | ROCm 6.4 |
 | 2.7 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8 (CUDNN 9.1.0.70), CUDA 12.6 (CUDNN 9.5.1.17) | CUDA 12.8 (CUDNN 9.7.1.26) | ROCm 6.3 |
 | 2.6 | >=3.9, <=3.13, (3.13t experimental) | C++17 | CUDA 11.8, CUDA 12.4 (CUDNN 9.1.0.70) | CUDA 12.6 (CUDNN 9.5.1.17) | ROCm 6.2.4 |
 | 2.5 | >=3.9, <=3.12, (3.13 experimental) | C++17 | CUDA 11.8, CUDA 12.1, CUDA 12.4, CUDNN 9.1.0.70  | None | ROCm 6.2 |
@ -73,9 +74,9 @@ Following is the release cadence. All future dates below are tentative. For late
 | 2.4 | Jun 2024 | Jul 2024 | Sept 2024 | Not planned |
 | 2.5 | Sep 2024 | Oct 2024 | Nov 2024 |  Not planned |
 | 2.6 | Dec 2024 | Jan 2025 | Not planned | Not planned |
-| 2.7 | Mar 2025 | Apr 2025 | (May 2025) | (Jun 2025) |
+| 2.7 | Mar 2025 | Apr 2025 | Jun 2025 | Not planned |
 | 2.8 | Jun 2025 | Jul 2025 | (Aug 2025) | (Sep 2025) |
-| 2.9 | Aug 2025 | Oct 2025 | (Nov 2025) | (Dec 2025) |
+| 2.9 | Sept 2025 | Oct 2025 | (Nov 2025) | (Dec 2025) |
 | 2.10 | Dec 2025 | Jan 2026 | (Feb 2026) | (Mar 2026) |
 | 2.11 | Mar 2026 | Apr 2026 | (Jun 2026) | (Jul 2026) |

--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@ -193,6 +193,10 @@ if(USE_FLASH_ATTENTION)
        add_subdirectory(native/transformers/hip/flash_attn/ck)
        file(GLOB flash_attention_hip_ck_hip "native/transformers/hip/flash_attn/ck/*.hip")
        list(APPEND native_transformers_hip_hip ${flash_attention_hip_ck_hip})
+        # FAv3 Generation
+        add_subdirectory(native/transformers/hip/flash_attn/ck/fav_v3)
+        file(GLOB flash_attention_v3_hip "native/transformers/hip/flash_attn/ck/fav_v3/*.hip")
+        list(APPEND native_transformers_hip_hip ${flash_attention_v3_hip})
      endif()
  endif()
  file(GLOB flash_attention_hip_aot_hip "native/transformers/hip/flash_attn/aot/*.hip")
@ -392,6 +396,7 @@ if(USE_ROCM)
  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/include)
  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/composable_kernel/library/include)
  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/composable_kernel)
+  list(APPEND ATen_HIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/aiter/csrc/include)
  _pytorch_rocm_generate_ck_conf()

  # Next two lines are needed because TunableOp uses third-party/fmt
--- a/aten/src/ATen/Context.cpp
+++ b/aten/src/ATen/Context.cpp
@ -19,9 +19,69 @@
 #if defined(__aarch64__) && !defined(C10_MOBILE)
 #include <cpuinfo.h>
 #endif
-
 namespace at {

+namespace {
+
+/*
+  These const variables defined the fp32 precisions for different backend
+  We have "generic", "cuda", "mkldnn" backend now and we can choose fp32
+  prevision from "ieee", "tf32", "bf16" and "none". The "ieee" precision means
+  IEEE standard floating point format "tf32" and "bf16" means we are allowed to
+  use "tf32" or "bf16" as internal computation data types for fp32 computations.
+  And "none" means it is override-able by parent's node
+
+  generic->mkldnn->matmul
+                ->conv
+                ->rnn
+         ->cuda ->matmul
+                ->conv
+                ->rnn
+*/
+const std::map<std::string, std::vector<std::string>> _fp32_precisions = {
+    {"generic", {{"ieee", "tf32", "bf16", "none"}}},
+    {"mkldnn", {{"ieee", "bf16", "none"}}},
+    {"cuda", {{"ieee", "tf32", "none"}}}};
+
+// Check whether the backend and op are legal
+void check_fp32_prec_backend_and_op(
+    const std::string& backend,
+    const std::string& op) {
+  static std::vector<std::string> backends = {"generic", "mkldnn", "cuda"};
+  static std::vector<std::string> operators = {"conv", "matmul", "rnn", "all"};
+  TORCH_CHECK(
+      std::find(backends.begin(), backends.end(), backend) != backends.end(),
+      "Invalid backend: ",
+      backend);
+  TORCH_CHECK(
+      std::find(operators.begin(), operators.end(), op) != operators.end(),
+      "Invalid operator: ",
+      op);
+  if (backend == "generic") {
+    TORCH_CHECK(op == "all", "Invalid operation for generic backend: ", op);
+  }
+  }
+
+  // Return whether the precision is supported by backends
+  bool validate_fp32_prec(
+      const std::string& backend,
+      const std::string& precision) {
+    auto iterp = _fp32_precisions.find(backend);
+    TORCH_CHECK(iterp != _fp32_precisions.end());
+    auto precisions = iterp->second;
+    bool valid = std::find(precisions.begin(), precisions.end(), precision) !=
+        precisions.end();
+    return valid;
+  }
+
+  C10_ALWAYS_INLINE void warn_deprecated_fp32_precision_api(){
+    TORCH_WARN_ONCE(
+      "This API is going to be deprecated, please see "
+      "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices"
+    );
+  }
+} // namespace
+
 Context::Context() = default;

 // TODO: This could be bad juju if someone calls globalContext() in the
@ -115,12 +175,29 @@ void Context::setUserEnabledNNPACK(bool e) {
  enabled_nnpack = e;
 }

-bool Context::allowTF32CuDNN() const {
+bool Context::allowTF32CuDNN(const std::string& op) const {
+  if (op.size() == 0){
+    bool allow_tf32_rnn = float32Precision("cuda", "rnn") == "tf32";
+    bool allow_tf32_conv = float32Precision("cuda", "conv") == "tf32";
+    TORCH_CHECK(
+        allow_tf32_rnn == allow_tf32_conv && allow_tf32_rnn == allow_tf32_cudnn,
+        "PyTorch is checking whether allow_tf32 is enabled for cuDNN without a specific operator name,",
+        "but the current flag(s) indicate that cuDNN conv and cuDNN RNN have different TF32 flags.",
+        "This combination indicates that you have used a mix of the legacy and new APIs to set the TF32 flags. ",
+        "We suggest only using the new API to set the TF32 flag(s). See also: ",
+        "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
+  } else {
+    return float32Precision("cuda", op) == "tf32";
+  }
+  warn_deprecated_fp32_precision_api();
  return allow_tf32_cudnn;
 }

 void Context::setAllowTF32CuDNN(bool b) {
+  setFloat32Precision("cuda", "rnn", b ? "tf32" : "none");
+  setFloat32Precision("cuda", "conv", b ? "tf32" : "none");
  allow_tf32_cudnn = b;
+  warn_deprecated_fp32_precision_api();
 }

 void Context::setSDPPriorityOrder(const std::vector<int64_t>& order) {
@ -141,12 +218,13 @@ bool Context::allowTF32OneDNN() const {
  return allow_tf32_onednn;
 }

-void Context::setAllowTF32OneDNN(bool b){
-#ifdef USE_XPU
+  // NOLINTNEXTLINE(clang-diagnostic-unused-parameter)
+  void Context::setAllowTF32OneDNN(bool b){
+  #ifdef USE_XPU
  allow_tf32_onednn = b;
-#else
+  #else
  TORCH_WARN("TF32 acceleration on top of oneDNN is available for Intel GPUs. The current Torch version does not have Intel GPU Support.");
-#endif
+  #endif
 }

 bool Context::userEnabledFlashSDP() const {
@ -259,7 +337,16 @@ bool Context::allowTF32CuBLAS() const {
      return false;
    }
 #endif
-  return float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
+  bool legacy_allow_tf32 = float32_matmul_precision != at::Float32MatmulPrecision::HIGHEST;
+  bool allow_tf32_new = float32Precision("cuda", "matmul") == "tf32";
+  TORCH_CHECK(
+      legacy_allow_tf32 == allow_tf32_new,
+      "PyTorch is checking whether allow_tf32_new is enabled for cuBlas matmul,",
+      "Current status indicate that you have used mix of the legacy and new APIs to set the TF32 status for cublas matmul. ",
+      "We suggest only using the new API to set the TF32 flag. See also: ",
+      "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
+  warn_deprecated_fp32_precision_api();
+  return allow_tf32_new;
 }

 void Context::setAllowTF32CuBLAS(bool b) {
@ -272,27 +359,54 @@ void Context::setAllowTF32CuBLAS(bool b) {
  }
 #endif
  float32_matmul_precision = b ? at::Float32MatmulPrecision::HIGH : at::Float32MatmulPrecision::HIGHEST;
+  setFloat32Precision("cuda", "matmul", b ? "tf32" : "ieee");
 }

 Float32MatmulPrecision Context::float32MatmulPrecision() const {
+  bool invalid = float32Precision("cuda", "matmul") == "tf32" &&
+      float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST;
+  invalid = invalid ||
+      (float32Precision("mkldnn", "matmul") == "bf16" &&
+       float32_matmul_precision != at::Float32MatmulPrecision::MEDIUM);
+  TORCH_CHECK(
+      !invalid,
+      "PyTorch is checking the matmul precision without a specific backend name,",
+      "Current status indicate that you have used mix of the legacy and new APIs to set the matmul precision. ",
+      "We suggest only using the new API for matmul precision. See also: ",
+      "https://pytorch.org/docs/main/notes/cuda.html#tensorfloat-32-tf32-on-ampere-and-later-devices");
+  warn_deprecated_fp32_precision_api();
  return float32_matmul_precision;
 }

-void Context::setFloat32MatmulPrecision(Float32MatmulPrecision p) {
-  float32_matmul_precision = p;
+std::string Context::float32Precision(const std::string& backend, const std::string& op) const {
+  check_fp32_prec_backend_and_op(backend, op);
+  auto precision = fp32_precision.find(backend)->second.find(op)->second;
+  if (precision == "none")
+    precision = fp32_precision.find(backend)->second.find("all")->second;
+  if (precision == "none")
+    precision = fp32_precision.find("generic")->second.find("all")->second;
+  bool valid_prec = validate_fp32_prec(backend, precision);
+  return valid_prec ? precision : "none";
 }

 void Context::setFloat32MatmulPrecision(const std::string &s) {
  auto match = [this](const std::string & s_) {
+    warn_deprecated_fp32_precision_api();
    // TODO: consider if CuDNN field needs to also be set for potential future CuDNN ops like multi-headed attention
    if (s_ == "highest") {
      float32_matmul_precision = at::Float32MatmulPrecision::HIGHEST;
+      setFloat32Precision("cuda", "matmul", "ieee");
+      setFloat32Precision("mkldnn", "matmul", "ieee");
      return true;
    } else if (s_ == "high") {
      float32_matmul_precision = at::Float32MatmulPrecision::HIGH;
+      setFloat32Precision("cuda", "matmul", "tf32");
+      setFloat32Precision("mkldnn", "matmul", "ieee");
      return true;
    } else if (s_ == "medium") {
      float32_matmul_precision = at::Float32MatmulPrecision::MEDIUM;
+      setFloat32Precision("cuda", "matmul", "tf32");
+      setFloat32Precision("mkldnn", "matmul", "bf16");
      return true;
    }
    return false;
@ -306,6 +420,27 @@ void Context::setFloat32MatmulPrecision(const std::string &s) {
    "setFloat32MatmulPrecision call has no effect.");
 }

+void Context::setFloat32Precision(const std::string& backend, const std::string& op, const std::string& p) {
+  check_fp32_prec_backend_and_op(backend, op);
+  if (validate_fp32_prec(backend, p)) {
+    fp32_precision[backend][op] = p;
+  } else {
+    std::string msg;
+    auto iterp = _fp32_precisions.find(backend);
+    TORCH_CHECK(iterp != _fp32_precisions.end());
+    for (auto p : iterp->second) {
+      msg += p;
+      msg += " ";
+    }
+    TORCH_WARN(
+        "you have set wrong precision for backend:",
+        backend,
+        " setFloat32Precision call has no effect.",
+        "Please choose precision from: ",
+        msg);
+  }
+}
+
 at::LinalgBackend Context::linalgPreferredBackend() const {
  return linalg_preferred_backend;
 }
@ -535,13 +670,14 @@ at::QEngine Context::qEngine() const {
 #endif
    return qengine;
  }();
-  return quantized_engine.value_or(_quantized_engine);
+  auto qt_engine = quantized_engine.load();
+  return qt_engine == at::QEngine::NoQEngine ? _quantized_engine : qt_engine;
 }

 void Context::setQEngine(at::QEngine e) {
  const auto& qengines = supportedQEngines();
  if (std::find(qengines.begin(), qengines.end(), e) != qengines.end()) {
-    quantized_engine = e;
+    quantized_engine.store(e);
    return;
  }
  TORCH_CHECK(false, "quantized engine ", toString(e), " is not supported");
@ -553,17 +689,9 @@ const std::vector<at::QEngine>& Context::supportedQEngines() {
    // Engines are listed in priority order: later one wins
    // By default we prefer FBGEMM if we're running on server side
    // QNNPACK on server side has some issue, so we disable it by default.
-#ifdef C10_MOBILE
-    engines.push_back(at::kNoQEngine);
 #ifdef USE_PYTORCH_QNNPACK
    engines.push_back(at::kQNNPACK);
 #endif
-#else  // C10_MOBILE
-#ifdef USE_PYTORCH_QNNPACK
-    engines.push_back(at::kQNNPACK);
-#endif
-    engines.push_back(at::kNoQEngine);
-#endif // C10_MOBILE

 #if AT_MKLDNN_ENABLED()
    engines.push_back(at::kONEDNN);
@ -695,6 +823,7 @@ void Context::setAllowFP16ReductionCPU(bool b) {
 #if defined(__aarch64__) && !defined(C10_MOBILE)
    if (!cpuinfo_initialize() || !cpuinfo_has_arm_fp16_arith())
 #else
+    // NOLINTNEXTLINE(facebook-hte-MissingBraces)
    if (true)
 #endif
      TORCH_CHECK(false, "Float16 arithmetic is not supported by the CPU!");
--- a/aten/src/ATen/Context.h
+++ b/aten/src/ATen/Context.h
@ -28,6 +28,7 @@
 #include <c10/util/irange.h>

 #include <cstdint>
+#include <map>
 #include <mutex>

 namespace at {
@ -336,14 +337,20 @@ class TORCH_API Context {
  void alertCuBLASConfigNotDeterministic() const;

  void setFloat32MatmulPrecision(const std::string& s);
-  bool allowTF32CuDNN() const;
+  void setFloat32Precision(
+      const std::string& backend,
+      const std::string& op,
+      const std::string& s);
+  bool allowTF32CuDNN(const std::string& op = std::string()) const;
  void setAllowTF32CuDNN(bool);
  bool allowTF32OneDNN() const;
  void setAllowTF32OneDNN(bool);
  bool allowTF32CuBLAS() const;
  void setAllowTF32CuBLAS(bool);
  Float32MatmulPrecision float32MatmulPrecision() const;
-  void setFloat32MatmulPrecision(Float32MatmulPrecision p);
+  std::string float32Precision(
+      const std::string& backend,
+      const std::string& op) const;
  bool allowFP16ReductionCuBLAS() const;
  void setAllowFP16ReductionCuBLAS(bool);
  bool allowBF16ReductionCuBLAS() const;
@ -465,10 +472,27 @@ class TORCH_API Context {
  bool release_original_weights = false;
 #endif
  bool display_vmap_fallback_warnings_ = false;
-  std::optional<at::QEngine> quantized_engine = std::nullopt;
+  std::atomic<at::QEngine> quantized_engine = at::QEngine::NoQEngine;
  bool enable_sparse_tensor_invariant_checks = false;
  bool allow_fp16_reduction_cpu = false;

+  std::map<std::string, std::map<std::string, std::string>> fp32_precision = {
+      {"generic", {{"all", "none"}}},
+      {"mkldnn",
+       {{"matmul", "none"},
+        {"conv", "none"},
+        {"rnn", "none"},
+        {"all", "none"}}},
+      {"cuda",
+       {{"matmul",
+         float32_matmul_precision == at::Float32MatmulPrecision::HIGHEST
+             ? "none"
+             : "tf32"},
+        {"conv", "tf32"},
+        {"rnn", "tf32"},
+        {"all", "none"}}},
+  };
+
  Allocator* prev_allocator_ptr_{nullptr};
 };

--- a/aten/src/ATen/DLConvertor.cpp
+++ b/aten/src/ATen/DLConvertor.cpp
@ -266,19 +266,38 @@ ScalarType toScalarType(const DLDataType& dtype) {
 }

 namespace {
+
+// The templated classes below are needed for supporting both:
+//   - DLManagedTensor
+//   - DLManagedTensorVersioned
+template <class T>
 struct ATenDLMTensor {
  Tensor handle;
-  DLManagedTensor tensor{};
+  T tensor{};
 };
-} // namespace

-static void deleter(DLManagedTensor* arg) {
-  delete static_cast<ATenDLMTensor*>(arg->manager_ctx);
+template <class T>
+void deleter(T* arg) {
+  delete static_cast<ATenDLMTensor<T>*>(arg->manager_ctx);
+}
+
+// Adds version information for DLManagedTensorVersioned.
+// This is a no-op for the other types.
+template <class T>
+void fillVersion(T* tensor) {}
+
+template <>
+void fillVersion<DLManagedTensorVersioned>(
+    DLManagedTensorVersioned* tensor) {
+  tensor->flags = 0;
+  tensor->version.major = DLPACK_MAJOR_VERSION;
+  tensor->version.minor = DLPACK_MINOR_VERSION;
 }

 // This function returns a shared_ptr to memory managed DLpack tensor
 // constructed out of ATen tensor
-DLManagedTensor* toDLPack(const Tensor& src) {
+template <class T>
+T* toDLPackImpl(const Tensor& src) {
  // create a new tensor with possibly normalized strides
  // gh-83069
  auto shape = src.sizes();
@ -290,10 +309,10 @@ DLManagedTensor* toDLPack(const Tensor& src) {
  }

  auto view = src.as_strided(shape, strides, src.storage_offset());
-  ATenDLMTensor* atDLMTensor(new ATenDLMTensor);
+  ATenDLMTensor<T>* atDLMTensor(new ATenDLMTensor<T>);
  atDLMTensor->handle = view;
  atDLMTensor->tensor.manager_ctx = atDLMTensor;
-  atDLMTensor->tensor.deleter = &deleter;
+  atDLMTensor->tensor.deleter = &deleter<T>;
  atDLMTensor->tensor.dl_tensor.data = view.data_ptr();
  c10::DeviceIndex device_id = 0;
  if (src.is_cuda() || src.is_privateuseone()) {
@ -305,35 +324,68 @@ DLManagedTensor* toDLPack(const Tensor& src) {
  atDLMTensor->tensor.dl_tensor.shape = view.sizes().data();
  atDLMTensor->tensor.dl_tensor.strides = view.strides().data();
  atDLMTensor->tensor.dl_tensor.byte_offset = 0;
+  fillVersion(&atDLMTensor->tensor);
+
  return &(atDLMTensor->tensor);
 }

-Tensor fromDLPack(DLManagedTensor* src) {
-  auto deleter = [src](void* self [[maybe_unused]]) {
-    if (src->deleter) {
-      src->deleter(src);
-    }
-  };
-  return fromDLPack(src, std::move(deleter));
-}
+// Explicitly instantiate the template above for both classes.
+template DLManagedTensor* toDLPackImpl<DLManagedTensor>(const Tensor&);
+template DLManagedTensorVersioned* toDLPackImpl<DLManagedTensorVersioned>(const Tensor&);

-Tensor fromDLPack(DLManagedTensor* src, std::function<void(void*)> deleter) {
-  Device device = getATenDevice(src->dl_tensor.device, src->dl_tensor.data);
-  ScalarType stype = toScalarType(src->dl_tensor.dtype);
-  if (!src->dl_tensor.strides) {
+// This function constructs a Tensor from a memory managed DLPack which
+// may be represented as either: DLManagedTensor and DLManagedTensorVersioned.
+template <class T>
+at::Tensor fromDLPackImpl(T* src, std::function<void(void*)> deleter) {
+  if (!deleter) {
+    deleter = [src](void* self [[maybe_unused]]) {
+      if (src->deleter) {
+        src->deleter(src);
+      }
+    };
+  }
+
+  DLTensor& dl_tensor = src->dl_tensor;
+  Device device = getATenDevice(dl_tensor.device, dl_tensor.data);
+  ScalarType stype = toScalarType(dl_tensor.dtype);
+
+  if (!dl_tensor.strides) {
    return at::from_blob(
-        src->dl_tensor.data,
-        IntArrayRef(src->dl_tensor.shape, src->dl_tensor.ndim),
+        dl_tensor.data,
+        IntArrayRef(dl_tensor.shape, dl_tensor.ndim),
        std::move(deleter),
        at::device(device).dtype(stype),
        {device});
  }
  return at::from_blob(
-      src->dl_tensor.data,
-      IntArrayRef(src->dl_tensor.shape, src->dl_tensor.ndim),
-      IntArrayRef(src->dl_tensor.strides, src->dl_tensor.ndim),
+      dl_tensor.data,
+      IntArrayRef(dl_tensor.shape, dl_tensor.ndim),
+      IntArrayRef(dl_tensor.strides, dl_tensor.ndim),
      deleter,
      at::device(device).dtype(stype),
      {device});
 }
+
+// Explicitly instantiate the template above for both classes.
+template at::Tensor fromDLPackImpl<DLManagedTensor>(DLManagedTensor* src, std::function<void(void*)> deleter);
+template at::Tensor fromDLPackImpl<DLManagedTensorVersioned>(DLManagedTensorVersioned* src, std::function<void(void*)> deleter);
+
+} // namespace
+
+DLManagedTensor* toDLPack(const Tensor& src) {
+  return toDLPackImpl<DLManagedTensor>(src);
+}
+
+DLManagedTensorVersioned* toDLPackVersioned(const Tensor& src) {
+  return toDLPackImpl<DLManagedTensorVersioned>(src);
+}
+
+Tensor fromDLPack(DLManagedTensor* src, std::function<void(void*)> deleter) {
+  return fromDLPackImpl<DLManagedTensor>(src, std::move(deleter));
+}
+
+Tensor fromDLPackVersioned(DLManagedTensorVersioned* src, std::function<void(void*)> deleter) {
+  return fromDLPackImpl<DLManagedTensorVersioned>(src, std::move(deleter));
+}
+
 } // namespace at
--- a/aten/src/ATen/DLConvertor.h
+++ b/aten/src/ATen/DLConvertor.h
@ -12,10 +12,48 @@ namespace at {

 TORCH_API ScalarType toScalarType(const DLDataType& dtype);
 TORCH_API DLManagedTensor* toDLPack(const Tensor& src);
-TORCH_API Tensor fromDLPack(DLManagedTensor* src);
+TORCH_API struct DLManagedTensorVersioned* toDLPackVersioned(const Tensor& src);
 TORCH_API Tensor
-fromDLPack(DLManagedTensor* src, std::function<void(void*)> deleter);
+fromDLPack(DLManagedTensor* src, std::function<void(void*)> deleter = {});
+TORCH_API Tensor fromDLPackVersioned(
+    DLManagedTensorVersioned* src,
+    std::function<void(void*)> deleter = {});
 TORCH_API DLDataType getDLDataType(const Tensor& t);
 TORCH_API DLDevice getDLContext(const Tensor& tensor, const int64_t& device_id);

+// This trait class is used for retrieving different attributes, such as the
+// PyCapsule names and conversion functions for both DLPack tensor classes:
+// `DLManagedTensor` and `DLManagedTensorVersioned`.
+//
+// Each specialization should contain the following 2 traits:
+//   - `capsule`: actual name of the capsule
+//   - `used`: name of the capsule after using it
+//   - `toDLPack`: function for converting a tensor into a DLPack capsule
+//   - `fromDLPack`: function for creating a tensor from a DLPack capsule
+//
+// While `toDLPack` is the directly exposed to Python, `fromDLPack` is not.
+// Although it contains the core implementation, it lacks the required book
+// keeping logic contained in its caller `tensor_fromDLPack`.
+//
+// That said, `fromDLPack` is used directly in a few DLPack tests that live
+// inside ATen (no Python available).
+template <class T>
+struct DLPackTraits {};
+
+template <>
+struct DLPackTraits<DLManagedTensor> {
+  inline static const char* capsule = "dltensor";
+  inline static const char* used = "used_dltensor";
+  inline static auto toDLPack = at::toDLPack;
+  inline static auto fromDLPack = at::fromDLPack;
+};
+
+template <>
+struct DLPackTraits<DLManagedTensorVersioned> {
+  inline static const char* capsule = "dltensor_versioned";
+  inline static const char* used = "used_dltensor_versioned";
+  inline static auto toDLPack = at::toDLPackVersioned;
+  inline static auto fromDLPack = at::fromDLPackVersioned;
+};
+
 } // namespace at
--- a/aten/src/ATen/DeviceAccelerator.h
+++ b/aten/src/ATen/DeviceAccelerator.h
@ -1,6 +1,5 @@
 #pragma once

-#include <c10/core/CachingDeviceAllocator.h>
 #include <c10/core/DeviceType.h>
 #include <c10/macros/Macros.h>

@ -31,7 +30,7 @@ TORCH_API bool isAccelerator(c10::DeviceType device_type);
 template <
    typename... T,
    typename = std::enable_if_t<(std::is_same_v<T, c10::DeviceType> && ...)>>
-TORCH_API inline bool isAcceleratorExcluded(
+inline bool isAcceleratorExcluded(
    c10::DeviceType device_type,
    c10::DeviceType first_excluded,
    T... rest_excluded) {
@ -73,27 +72,6 @@ TORCH_API c10::DeviceIndex exchangeDevice(c10::DeviceIndex device_index);
 // original device index that was active before the change.
 TORCH_API c10::DeviceIndex maybeExchangeDevice(c10::DeviceIndex device_index);

-TORCH_API inline void emptyCache() {
-  const auto device_type = getAccelerator(true).value();
-  at::getDeviceAllocator(device_type)->emptyCache();
-}
-
-TORCH_API inline at::CachingDeviceAllocator::DeviceStats getDeviceStats(
-    c10::DeviceIndex device_index) {
-  const auto device_type = getAccelerator(true).value();
-  return at::getDeviceAllocator(device_type)->getDeviceStats(device_index);
-}
-
-TORCH_API inline void resetAccumulatedStats(c10::DeviceIndex device_index) {
-  const auto device_type = getAccelerator(true).value();
-  at::getDeviceAllocator(device_type)->resetAccumulatedStats(device_index);
-}
-
-TORCH_API inline void resetPeakStats(c10::DeviceIndex device_index) {
-  const auto device_type = getAccelerator(true).value();
-  at::getDeviceAllocator(device_type)->resetPeakStats(device_index);
-}
-
 } // namespace at::accelerator

 namespace at {
--- a/aten/src/ATen/FunctionalStorageImpl.h
+++ b/aten/src/ATen/FunctionalStorageImpl.h
@ -122,6 +122,9 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {

  ~FunctionalStorageImpl() override = default;

+  uint64_t mutation_counter() {
+    return mutation_counter_;
+  }
  void mark_mutation() {
    mutation_counter_++;
  }
@ -150,12 +153,17 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
  void mark_inductor_storage_resize(c10::SymInt new_size) {
    inductor_storage_resized_ = true;
    curr_storage_size_ = std::move(new_size);
+    inductor_storage_resized_counter_++;
  }

  bool was_inductor_storage_resized() {
    return inductor_storage_resized_;
  }

+  uint64_t inductor_storage_resized_counter() {
+    return inductor_storage_resized_counter_;
+  }
+
 private:
  // NB: base_ should always point to a tensor BELOW the current
  // functionalization layer. This is mainly to avoid reference cycles. e.g.
@ -201,6 +209,7 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
  // (1) There were any storage resizes on a graph input
  // (2) The original/curr storage size tell us if these resizes result in a nop
  bool inductor_storage_resized_ = false;
+  uint64_t inductor_storage_resized_counter_ = 0;
  c10::SymInt original_storage_size_;
  c10::SymInt curr_storage_size_;
 };
--- a/aten/src/ATen/FunctionalTensorWrapper.cpp
+++ b/aten/src/ATen/FunctionalTensorWrapper.cpp
@ -178,7 +178,7 @@ bool FunctionalTensorWrapper::is_up_to_date() const {
 // See Note [Functionalization Pass - Inplace View Ops]
 void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) {
  view_metas_.push_back(meta);
-  // Manually track the fact that this tensor recieved a metadata mutation!
+  // Manually track the fact that this tensor received a metadata mutation!
  has_metadata_mutation_ = true;
  // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation.
  maybe_mark_symbolic(meta);
@ -273,7 +273,7 @@ void FunctionalTensorWrapper::set__impl(const FunctionalTensorWrapper* other) {
  // (We could check if the updated value has a new storage than the original value,
  // but this won't also let us uniquely determine if the tensor **also**
  // experienced a data mutation).
-  was_storage_changed_ = true;
+  mark_storage_changed();

  auto sizes_ = value_.sym_sizes();
  auto strides_ = value_.sym_strides();
@ -499,8 +499,8 @@ int64_t FunctionalTensorWrapper::dim_custom() const {
 int64_t FunctionalTensorWrapper::numel_custom() const {
  return value_.unsafeGetTensorImpl()->numel();
 }
-bool FunctionalTensorWrapper::is_contiguous_custom(at::MemoryFormat memory_format) const {
-  return value_.unsafeGetTensorImpl()->is_contiguous(memory_format);
+c10::SymBool FunctionalTensorWrapper::sym_is_contiguous_custom(at::MemoryFormat memory_format) const {
+  return value_.unsafeGetTensorImpl()->sym_is_contiguous(memory_format);
 }
 c10::SymIntArrayRef FunctionalTensorWrapper::sym_sizes_custom() const {
  return value_.unsafeGetTensorImpl()->sym_sizes();
@ -579,7 +579,7 @@ std::vector<Tensor> from_functional_tensor(ITensorListRef t_list) {
  for (const auto& tensor : t_list) {
    // from_functional_tensor(Tensor) has asserts to make sure you don't accidentally call
    // it on a non-functional input,
-    // but from_functional_tensor(TensorList) can recieve a list containing both
+    // but from_functional_tensor(TensorList) can receive a list containing both
    // functional and non-functional tensors.
    // Example of when that can happen: torch.cat(function_input_tensor, global_state_tensor).
    // When that happens, we're okay with only unwrapping the functional tensors.
--- a/aten/src/ATen/FunctionalTensorWrapper.h
+++ b/aten/src/ATen/FunctionalTensorWrapper.h
@ -74,7 +74,9 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
  bool has_metadata_mutation() const {
    return has_metadata_mutation_;
  }
-
+  uint64_t mutation_counter() const {
+    return functional_storage_impl()->mutation_counter();
+  }
  void mark_mutation() {
    functional_storage_impl()->mark_mutation();
  }
@ -161,8 +163,13 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
    return was_storage_changed_;
  }

-  void set_storage_changed() {
+  void mark_storage_changed() {
    was_storage_changed_ = true;
+    storage_changed_counter_++;
+  }
+
+  uint64_t storage_changed_counter() {
+    return storage_changed_counter_;
  }

  // A FunctionalTensor is considered a base if its not a view of another
@ -181,6 +188,9 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
    return functional_storage_impl()->was_inductor_storage_resized();
  }

+  bool inductor_storage_resized_counter() {
+    return functional_storage_impl()->inductor_storage_resized_counter();
+  }
  // The functionalization pass can be used to remove mutations.
  // It does so by replacing any mutation op with it's corresponding
  // out-of-place op, followed by a call to replace_(). e.g:
@ -226,7 +236,8 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
  at::IntArrayRef strides_custom() const override;
  int64_t dim_custom() const override;
  int64_t numel_custom() const override;
-  bool is_contiguous_custom(at::MemoryFormat memory_format) const override;
+  c10::SymBool sym_is_contiguous_custom(
+      at::MemoryFormat memory_format) const override;
  c10::SymIntArrayRef sym_sizes_custom() const override;
  c10::SymInt sym_size_custom(int64_t d) const override;
  c10::SymIntArrayRef sym_strides_custom() const override;
@ -269,6 +280,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
  bool is_multi_output_view_ = false;
  // Did the tensor experience a set_() call.
  bool was_storage_changed_ = false;
+  uint64_t storage_changed_counter_ = 0;
  // Did the tensor experience any view operation with symbolic int.
  bool is_symbolic_ = false;

@ -288,7 +300,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
 namespace functionalization {
 namespace impl {

-TORCH_API inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(
+inline FunctionalTensorWrapper* unsafeGetFunctionalWrapper(
    const Tensor& tensor) {
  auto functional_impl =
      static_cast<FunctionalTensorWrapper*>(tensor.unsafeGetTensorImpl());
--- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp
+++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp
@ -320,11 +320,9 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt
  auto stride = at::detail::computeStride(self.sym_sizes(), self.sym_strides(), inferred_size);

  if (!stride.has_value()) {
-    // With unbacked symints, computeStride could fail even on contiguous
-    // tensors. In this case, we can use the strides of an empty tensor of
-    // inferred_size.
-    TORCH_CHECK(
-        self.is_contiguous(),
+
+    TORCH_SYM_CHECK(
+        self.sym_is_contiguous(),
        "View is not valid from size:",
        self.sym_sizes(),
        " stride: ",
@ -333,6 +331,9 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt
        inferred_size,
        " in case of unbacked symbols consider adding torch.check to guide computing strides.");

+    // With unbacked symints, computeStride could fail even on contiguous
+    // tensors. In this case, we can use the strides of an empty tensor of
+    // inferred_size.
    stride = at::detail::empty_symint_meta(
                 inferred_size,
                 std::nullopt,
--- a/aten/src/ATen/LegacyBatchedTensorImpl.cpp
+++ b/aten/src/ATen/LegacyBatchedTensorImpl.cpp
@ -84,7 +84,7 @@ IntArrayRef BatchedTensorImpl::strides_custom() const {

 // TODO: implement proper contiguity on batched tensor, then put
 // sizes_strides_policy back to Default
-bool BatchedTensorImpl::is_contiguous_custom(at::MemoryFormat memory_format) const {
+c10::SymBool BatchedTensorImpl::sym_is_contiguous_custom(at::MemoryFormat memory_format) const {
  TORCH_CHECK(memory_format == MemoryFormat::Contiguous,
      "NYI: querying is_contiguous inside of vmap for memory_format ",
      "other than torch.contiguous_format");
--- a/aten/src/ATen/LegacyBatchedTensorImpl.h
+++ b/aten/src/ATen/LegacyBatchedTensorImpl.h
@ -82,7 +82,8 @@ struct TORCH_API BatchedTensorImpl : public c10::TensorImpl {
  IntArrayRef strides_custom() const override;
  // Override a bunch of methods inherited from TensorImpl to return error
  // messages.
-  bool is_contiguous_custom(at::MemoryFormat memory_format) const override;
+  c10::SymBool sym_is_contiguous_custom(
+      at::MemoryFormat memory_format) const override;
  void set_size(int64_t dim, int64_t new_size) override;
  void set_stride(int64_t dim, int64_t new_stride) override;
  void set_storage_offset(int64_t storage_offset) override;
--- a/Show More
+++ b/Show More
 @ -1 +1 @@
 .3.1
 .4.0