[Minor][Inductor] move some combo kernel log from warning to debug (#166993 )

Combo kernel warns for long reduction and large pointwise. This becomes too spammy for users such as vLLM. This PR moves these logs from warn to debug. I validated the spammy log is removed on llama-3.1-8B. Pull Request resolved: https://github.com/pytorch/pytorch/pull/166993 Approved by: https://github.com/zou3519, https://github.com/eellison (cherry picked from commit e020fb3431371ea335a0d5db5094810c9f1e104d)
Update triton to 3.5.1 release (#166968 ) (#167008 )
2025-11-13 21:59:56 +08:00 · 2025-11-04 22:20:58 +00:00 · 2025-11-04 16:57:01 -05:00 · 2025-11-04 15:55:21 -05:00 · 2025-11-04 15:52:08 -05:00 · 2025-11-04 15:21:29 -05:00
170 changed files with 4017 additions and 1496 deletions
--- a/.ci/aarch64_linux/aarch64_ci_build.sh
+++ b/.ci/aarch64_linux/aarch64_ci_build.sh
@ -5,9 +5,11 @@ GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

 # Set CUDA architecture lists to match x86 build_cuda.sh
 if [[ "$GPU_ARCH_VERSION" == *"12.6"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;8.0;9.0"
+    export TORCH_CUDA_ARCH_LIST="8.0;9.0"
 elif [[ "$GPU_ARCH_VERSION" == *"12.8"* ]]; then
-    export TORCH_CUDA_ARCH_LIST="7.0;8.0;9.0;10.0;12.0"
+    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
+elif [[ "$GPU_ARCH_VERSION" == *"12.9"* ]]; then
+    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;12.0"
 elif [[ "$GPU_ARCH_VERSION" == *"13.0"* ]]; then
    export TORCH_CUDA_ARCH_LIST="8.0;9.0;10.0;11.0;12.0+PTX"
 fi
@ -15,6 +17,8 @@ fi
 # Compress the fatbin with -compress-mode=size for CUDA 13
 if [[ "$DESIRED_CUDA" == *"13"* ]]; then
    export TORCH_NVCC_FLAGS="-compress-mode=size"
+    # Bundle ptxas into the cu13 wheel, see https://github.com/pytorch/pytorch/issues/163801
+    export BUILD_BUNDLE_PTXAS=1
 fi

 SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
@ -42,9 +46,6 @@ else
        echo "Bundling CUDA libraries with wheel for aarch64."
    else
        echo "Using nvidia libs from pypi for aarch64."
-        # Fix platform constraints in PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64
-        # Replace 'platform_machine == "x86_64"' with 'platform_machine == "aarch64"'
-        export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS//platform_machine == \'x86_64\'/platform_machine == \'aarch64\'}"
        echo "Updated PYTORCH_EXTRA_INSTALL_REQUIREMENTS for aarch64: $PYTORCH_EXTRA_INSTALL_REQUIREMENTS"
        export USE_NVIDIA_PYPI_LIBS=1
    fi
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -213,7 +213,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
        ]

        # CUDA version-specific libraries
-        if "130" in desired_cuda:
+        if "13" in desired_cuda:
+            minor_version = desired_cuda[-1]
            version_specific_libs = [
                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.13",
                "/usr/local/cuda/lib64/libcublas.so.13",
@ -223,7 +224,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
                "/usr/local/cuda/lib64/libcusolver.so.12",
                "/usr/local/cuda/lib64/libnvJitLink.so.13",
                "/usr/local/cuda/lib64/libnvrtc.so.13",
-                "/usr/local/cuda/lib64/libnvrtc-builtins.so.13.0",
+                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.13.{minor_version}",
            ]
        elif "12" in desired_cuda:
            # Get the last character for libnvrtc-builtins version (e.g., "129" -> "9")
@ -239,6 +240,8 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
                "/usr/local/cuda/lib64/libnvrtc.so.12",
                f"/usr/local/cuda/lib64/libnvrtc-builtins.so.12.{minor_version}",
            ]
+        else:
+            raise ValueError(f"Unsupported CUDA version: {desired_cuda}.")

        # Combine all libraries
        libs_to_copy = common_libs + version_specific_libs
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -214,8 +214,7 @@ case "$tag" in
    TRITON=yes
    ;;
  pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
-    # TODO (huydhn): Upgrade this to Python >= 3.10
-    ANACONDA_PYTHON_VERSION=3.9
+    ANACONDA_PYTHON_VERSION=3.10
    GCC_VERSION=11
    VISION=yes
    KATEX=yes
@ -263,13 +262,10 @@ case "$tag" in
    TRITON_CPU=yes
    ;;
  pytorch-linux-jammy-linter)
-    # TODO: Use 3.9 here because of this issue https://github.com/python/mypy/issues/13627.
-    # We will need to update mypy version eventually, but that's for another day. The task
-    # would be to upgrade mypy to 1.0.0 with Python 3.11
-    PYTHON_VERSION=3.9
+    PYTHON_VERSION=3.10
    ;;
-  pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter)
-    PYTHON_VERSION=3.9
+  pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter)
+    PYTHON_VERSION=3.10
    CUDA_VERSION=12.8.1
    ;;
  pytorch-linux-jammy-aarch64-py3.10-gcc11)
--- a/.ci/docker/ci_commit_pins/triton.txt
+++ b/.ci/docker/ci_commit_pins/triton.txt
@ -1 +1 @@
-fccfc522864cf8bc172abe0cd58ae5581e2d44b9
+bfeb066872bc1e8b2d2bc0a3b295b99dd77206e7
--- a/.ci/docker/common/patch_libstdc.sh
+++ b/.ci/docker/common/patch_libstdc.sh
@ -0,0 +1,9 @@
+#!/bin/bash
+set -xe
+# Script used in Linux x86 and aarch64 CD pipeline
+
+# Workaround for exposing statically linked libstdc++ CXX11 ABI symbols.
+# see: https://github.com/pytorch/pytorch/issues/133437
+LIBNONSHARED=$(gcc -print-file-name=libstdc++_nonshared.a)
+nm -g $LIBNONSHARED | grep " T " | grep recursive_directory_iterator | cut -c 20-  > weaken-symbols.txt
+objcopy --weaken-symbols weaken-symbols.txt $LIBNONSHARED $LIBNONSHARED
--- a/.ci/docker/manywheel/Dockerfile_2_28
+++ b/.ci/docker/manywheel/Dockerfile_2_28
@ -130,7 +130,8 @@ ENV LD_LIBRARY_PATH=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64:/op
 RUN for cpython_version in "cp312-cp312" "cp313-cp313" "cp313-cp313t"; do \
    /opt/python/${cpython_version}/bin/python -m pip install setuptools wheel; \
    done;
-
+ADD ./common/patch_libstdc.sh patch_libstdc.sh
+RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh

 # cmake-3.18.4 from pip; force in case cmake3 already exists
 RUN yum install -y python3-pip && \
--- a/.ci/docker/manywheel/Dockerfile_2_28_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_2_28_aarch64
@ -71,3 +71,5 @@ RUN rm -rf /opt/python/cp33-cp33m /opt/_internal/cpython-3.3.6
 RUN rm -rf /opt/python/cp34-cp34m /opt/_internal/cpython-3.4.6
 COPY --from=openblas     /opt/OpenBLAS/  /opt/OpenBLAS/
 ENV LD_LIBRARY_PATH=/opt/OpenBLAS/lib:$LD_LIBRARY_PATH
+ADD ./common/patch_libstdc.sh patch_libstdc.sh
+RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
--- a/.ci/docker/manywheel/Dockerfile_cuda_aarch64
+++ b/.ci/docker/manywheel/Dockerfile_cuda_aarch64
@ -95,3 +95,5 @@ COPY --from=nvpl /opt/nvpl/lib/  /usr/local/lib/
 COPY --from=nvpl /opt/nvpl/include/  /usr/local/include/
 RUN ln -sf /usr/local/cuda-${BASE_CUDA_VERSION} /usr/local/cuda
 ENV PATH=/usr/local/cuda/bin:$PATH
+ADD ./common/patch_libstdc.sh patch_libstdc.sh
+RUN bash ./patch_libstdc.sh && rm patch_libstdc.sh
--- a/.ci/docker/requirements-ci.txt
+++ b/.ci/docker/requirements-ci.txt
@ -93,8 +93,9 @@ librosa==0.10.2 ; python_version == "3.12" and platform_machine != "s390x"
 #Pinned versions:
 #test that import:

-mypy==1.16.0
+mypy==1.16.0 ; platform_system != "Windows"
 # Pin MyPy version because new errors are likely to appear with each release
+# Skip on Windows as lots of type annotations are POSIX specific
 #Description: linter
 #Pinned versions: 1.16.0
 #test that import: test_typing.py, test_type_hints.py
--- a/.ci/docker/requirements-docs.txt
+++ b/.ci/docker/requirements-docs.txt
@ -1,7 +1,7 @@
 sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 5.3.0
-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@1657ad2fc1acdc98aa719eebecbb0128a7c13ce4#egg=pytorch_sphinx_theme2
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2

 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
--- a/.ci/docker/triton_version.txt
+++ b/.ci/docker/triton_version.txt
@ -1 +1 @@
-3.5.0
+3.5.1
--- a/.ci/libtorch/build.sh
+++ b/.ci/libtorch/build.sh
@ -7,4 +7,4 @@ set -ex

 SCRIPTPATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

-USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.9" ${SCRIPTPATH}/../manywheel/build.sh
+USE_NVSHMEM=0 USE_CUSPARSELT=0 BUILD_PYTHONLESS=1 DESIRED_PYTHON="3.10" ${SCRIPTPATH}/../manywheel/build.sh
--- a/.ci/lumen_cli/cli/lib/core/vllm/lib.py
+++ b/.ci/lumen_cli/cli/lib/core/vllm/lib.py
@ -41,7 +41,6 @@ def sample_vllm_test_library():
                "pytest -v -s basic_correctness/test_cumem.py",
                "pytest -v -s basic_correctness/test_basic_correctness.py",
                "pytest -v -s basic_correctness/test_cpu_offload.py",
-                "VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py",
            ],
        },
        "vllm_basic_models_test": {
@ -68,15 +67,12 @@ def sample_vllm_test_library():
                        "-v",
                        "-s",
                        "entrypoints/llm",
-                        "--ignore=entrypoints/llm/test_lazy_outlines.py",
                        "--ignore=entrypoints/llm/test_generate.py",
-                        "--ignore=entrypoints/llm/test_generate_multiple_loras.py",
                        "--ignore=entrypoints/llm/test_collective_rpc.py",
                    ]
                ),
-                "pytest -v -s entrypoints/llm/test_lazy_outlines.py",
-                "pytest -v -s entrypoints/llm/test_generate.py ",
-                "VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode",
+                "pytest -v -s entrypoints/llm/test_generate.py",
+                "pytest -v -s entrypoints/offline_mode",
            ],
        },
        "vllm_regression_test": {
--- a/.ci/manywheel/build_cuda.sh
+++ b/.ci/manywheel/build_cuda.sh
@ -187,19 +187,22 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
            export USE_CUFILE=0
        else
            DEPS_LIST+=(
-                "/usr/local/cuda/lib64/libnvToolsExt.so.1"
                "/usr/local/cuda/lib64/libcublas.so.12"
                "/usr/local/cuda/lib64/libcublasLt.so.12"
                "/usr/local/cuda/lib64/libcudart.so.12"
                "/usr/local/cuda/lib64/libnvrtc.so.12"
                "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12")
            DEPS_SONAME+=(
-                "libnvToolsExt.so.1"
                "libcublas.so.12"
                "libcublasLt.so.12"
                "libcudart.so.12"
                "libnvrtc.so.12"
                "libcupti.so.12")
+
+            if [[ $CUDA_VERSION != 12.9* ]]; then
+                DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")
+                DEPS_SONAME+=("libnvToolsExt.so.1")
+            fi
        fi
    else
        echo "Using nvidia libs from pypi."
--- a/.ci/pytorch/check_binary.sh
+++ b/.ci/pytorch/check_binary.sh
@ -67,7 +67,7 @@ fi
 #       wheels with cxx11-abi

 echo "Checking that the gcc ABI is what we expect"
-if [[ "$(uname)" != 'Darwin' ]]; then
+if [[ "$(uname)" != 'Darwin' &&  "$(uname -m)" != "s390x" ]]; then
  # We also check that there are cxx11 symbols in libtorch
  #
  echo "Checking that symbols in libtorch.so have the right gcc abi"
--- a/.ci/pytorch/cpp_doc_push_script.sh
+++ b/.ci/pytorch/cpp_doc_push_script.sh
@ -58,7 +58,7 @@ time python tools/setup_helpers/generate_code.py \

 # Build the docs
 pushd docs/cpp
-time make VERBOSE=1 html -j
+time make VERBOSE=1 html

 popd
 popd
--- a/.ci/pytorch/numba-cuda-13.patch
+++ b/.ci/pytorch/numba-cuda-13.patch
@ -0,0 +1,25 @@
+From 6e08c9d08e9de59c7af28b720289debbbd384764 Mon Sep 17 00:00:00 2001
+From: Michael Wang <13521008+isVoid@users.noreply.github.com>
+Date: Tue, 1 Apr 2025 17:28:05 -0700
+Subject: [PATCH] Avoid bumping certain driver API to avoid future breakage
+ (#185)
+
+Co-authored-by: isVoid <isVoid@users.noreply.github.com>
+---
+ numba_cuda/numba/cuda/cudadrv/driver.py | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/numba_cuda/numba/cuda/cudadrv/driver.py b/numba_cuda/numba/cuda/cudadrv/driver.py
+index 1641bf77..233e9ed7 100644
+--- a/numba_cuda/numba/cuda/cudadrv/driver.py
+++ b/numba_cuda/numba/cuda/cudadrv/driver.py
+@@ -365,6 +365,9 @@ def _find_api(self, fname):
+         else:
+             variants = ('_v2', '')
+ 
+        if fname in ("cuCtxGetDevice", "cuCtxSynchronize"):
+            return getattr(self.lib, fname)
+
+         for variant in variants:
+             try:
+                 return getattr(self.lib, f'{fname}{variant}')
--- a/.ci/pytorch/smoke_test/check_binary_symbols.py
+++ b/.ci/pytorch/smoke_test/check_binary_symbols.py
@ -32,6 +32,9 @@ LIBTORCH_NAMESPACE_LIST = (
    "torch::",
 )

+# Patterns for detecting statically linked libstdc++ symbols
+STATICALLY_LINKED_CXX11_ABI = [re.compile(r".*recursive_directory_iterator.*")]
+

 def _apply_libtorch_symbols(symbols):
    return [
@ -53,12 +56,17 @@ def get_symbols(lib: str) -> list[tuple[str, str, str]]:
    return [x.split(" ", 2) for x in lines.decode("latin1").split("\n")[:-1]]


-def grep_symbols(lib: str, patterns: list[Any]) -> list[str]:
+def grep_symbols(
+    lib: str, patterns: list[Any], symbol_type: str | None = None
+) -> list[str]:
    def _grep_symbols(
        symbols: list[tuple[str, str, str]], patterns: list[Any]
    ) -> list[str]:
        rc = []
        for _s_addr, _s_type, s_name in symbols:
+            # Filter by symbol type if specified
+            if symbol_type and _s_type != symbol_type:
+                continue
            for pattern in patterns:
                if pattern.match(s_name):
                    rc.append(s_name)
@ -80,6 +88,18 @@ def grep_symbols(lib: str, patterns: list[Any]) -> list[str]:
        return functools.reduce(list.__add__, (x.result() for x in tasks), [])


+def check_lib_statically_linked_libstdc_cxx_abi_symbols(lib: str) -> None:
+    cxx11_statically_linked_symbols = grep_symbols(
+        lib, STATICALLY_LINKED_CXX11_ABI, symbol_type="T"
+    )
+    num_statically_linked_symbols = len(cxx11_statically_linked_symbols)
+    print(f"num_statically_linked_symbols (T): {num_statically_linked_symbols}")
+    if num_statically_linked_symbols > 0:
+        raise RuntimeError(
+            f"Found statically linked libstdc++ symbols (recursive_directory_iterator): {cxx11_statically_linked_symbols[:100]}"
+        )
+
+
 def check_lib_symbols_for_abi_correctness(lib: str) -> None:
    print(f"lib: {lib}")
    cxx11_symbols = grep_symbols(lib, LIBTORCH_CXX11_PATTERNS)
@ -107,6 +127,7 @@ def main() -> None:

    libtorch_cpu_path = str(install_root / "lib" / "libtorch_cpu.so")
    check_lib_symbols_for_abi_correctness(libtorch_cpu_path)
+    check_lib_statically_linked_libstdc_cxx_abi_symbols(libtorch_cpu_path)


 if __name__ == "__main__":
--- a/.ci/pytorch/test.sh
+++ b/.ci/pytorch/test.sh
@ -32,6 +32,16 @@ if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /v
  git config --global --add safe.directory /var/lib/jenkins/workspace
 fi

+
+# Patch numba to avoid CUDA-13 crash, see https://github.com/pytorch/pytorch/issues/162878
+NUMBA_CUDA_DIR=$(python -c "import os;import numba.cuda; print(os.path.dirname(numba.cuda.__file__))" 2>/dev/null || true)
+if [ -n "$NUMBA_CUDA_DIR" ]; then
+  NUMBA_PATCH="$(dirname "$(realpath "${BASH_SOURCE[0]}")")/numba-cuda-13.patch"
+  pushd "$NUMBA_CUDA_DIR"
+  patch -p4 <"$NUMBA_PATCH"
+  popd
+fi
+
 echo "Environment variables:"
 env

@ -1614,6 +1624,25 @@ test_operator_benchmark() {
      --expected "expected_ci_operator_benchmark_eager_float32_cpu.csv"
 }

+test_operator_microbenchmark() {
+  TEST_REPORTS_DIR=$(pwd)/test/test-reports
+  mkdir -p "$TEST_REPORTS_DIR"
+  TEST_DIR=$(pwd)
+
+  cd benchmarks/operator_benchmark/pt_extension
+  python -m pip install .
+
+  cd "${TEST_DIR}"/benchmarks/operator_benchmark
+
+  for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
+    $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
+      --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
+      --benchmark-name "PyTorch operator microbenchmark" --use-compile
+    $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
+      --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}.json" \
+      --benchmark-name "PyTorch operator microbenchmark"
+  done
+}

 if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then
  (cd test && python -c "import torch; print(torch.__config__.show())")
@ -1668,6 +1697,8 @@ elif [[ "${TEST_CONFIG}" == *operator_benchmark* ]]; then
    test_operator_benchmark cpu ${TEST_MODE}

  fi
+elif [[ "${TEST_CONFIG}" == *operator_microbenchmark* ]]; then
+  test_operator_microbenchmark
 elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then
  test_inductor_distributed
 elif [[ "${TEST_CONFIG}" == *inductor-halide* ]]; then
@ -1721,11 +1752,6 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
 elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
  install_torchvision
  test_inductor_shard "${SHARD_NUMBER}"
-  if [[ "${SHARD_NUMBER}" == 1 ]]; then
-    if [[ "${BUILD_ENVIRONMENT}" != linux-jammy-py3.9-gcc11-build ]]; then
-      test_inductor_distributed
-    fi
-  fi
 elif [[ "${TEST_CONFIG}" == *einops* ]]; then
  test_einops
 elif [[ "${TEST_CONFIG}" == *dynamo_wrapped* ]]; then
--- a/.ci/pytorch/win-test-helpers/build_pytorch.bat
+++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat
@ -137,7 +137,7 @@ sccache --show-stats
 python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])"
 (
  if "%BUILD_ENVIRONMENT%"=="" (
-    echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3` in Command Prompt before running Git Bash.
+    echo NOTE: To run `import torch`, please make sure to activate the conda environment by running `call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%\envs\py_tmp` in Command Prompt before running Git Bash.
  ) else (
    copy /Y "dist\*.whl" "%PYTORCH_FINAL_PACKAGE_DIR%"

--- a/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
+++ b/.ci/pytorch/win-test-helpers/installation-helpers/activate_miniconda3.bat
@ -3,12 +3,12 @@ if "%BUILD_ENVIRONMENT%"=="" (
 ) else (
  set CONDA_PARENT_DIR=C:\Jenkins
 )
-
+set CONDA_ROOT_DIR=%CONDA_PARENT_DIR%\Miniconda3

 :: Be conservative here when rolling out the new AMI with conda. This will try
 :: to install conda as before if it couldn't find the conda installation. This
 :: can be removed eventually after we gain enough confidence in the AMI
-if not exist %CONDA_PARENT_DIR%\Miniconda3 (
+if not exist %CONDA_ROOT_DIR% (
  set INSTALL_FRESH_CONDA=1
 )

@ -17,10 +17,14 @@ if "%INSTALL_FRESH_CONDA%"=="1" (
  if errorlevel 1 exit /b
  if not errorlevel 0 exit /b

-  %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_PARENT_DIR%\Miniconda3
+  %TMP_DIR_WIN%\Miniconda3-latest-Windows-x86_64.exe /InstallationType=JustMe /RegisterPython=0 /S /AddToPath=0 /D=%CONDA_ROOT_DIR%
  if errorlevel 1 exit /b
  if not errorlevel 0 exit /b
 )

 :: Activate conda so that we can use its commands, i.e. conda, python, pip
-call %CONDA_PARENT_DIR%\Miniconda3\Scripts\activate.bat %CONDA_PARENT_DIR%\Miniconda3
+call %CONDA_ROOT_DIR%\Scripts\activate.bat %CONDA_ROOT_DIR%
+:: Activate conda so that we can use its commands, i.e. conda, python, pip
+call conda activate py_tmp
+
+call pip install -r .ci/docker/requirements-ci.txt
--- a/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
+++ b/.ci/pytorch/win-test-helpers/setup_pytorch_env.bat
@ -14,7 +14,7 @@ if not errorlevel 0 exit /b
 :: build\torch. Rather than changing all these references, making a copy of torch folder
 :: from conda to the current workspace is easier. The workspace will be cleaned up after
 :: the job anyway
-xcopy /s %CONDA_PARENT_DIR%\Miniconda3\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\
+xcopy /s %CONDA_ROOT_DIR%\envs\py_tmp\Lib\site-packages\torch %TMP_DIR_WIN%\build\torch\

 pushd .
 if "%VC_VERSION%" == "" (
--- a/.ci/pytorch/win-test.sh
+++ b/.ci/pytorch/win-test.sh
@ -38,7 +38,14 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* ]]; then
 fi

 # TODO: Move both of them to Windows AMI
-python -m pip install pytest-rerunfailures==10.3 pytest-cpp==2.3.0 tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
+python -m pip install tensorboard==2.13.0 protobuf==5.29.4 pytest-subtests==0.13.1
+
+# Copied from https://github.com/pytorch/test-infra/blob/be01a40157c36cd5a48391fdf44a7bc3ebd4c7e3/aws/ami/windows/scripts/Installers/Install-Pip-Dependencies.ps1#L16 with some adjustments
+# pytest-rerunfailures==10.3 as 10.2 fails with INTERNALERROR> pluggy._manager.PluginValidationError: unknown hook 'pytest_configure_node'
+# scipy from 1.6.3 to 1.10
+# expecttest from 0.1.3 to 0.3.0
+# xdoctest from 1.0.2 to 1.3.0
+python -m pip install "future==0.18.2" "hypothesis==5.35.1" "expecttest==0.3.0" "librosa>=0.6.2" "scipy==1.10.1" "psutil==5.9.1" "pynvml==11.4.1" "pillow==9.2.0" "unittest-xml-reporting<=3.2.0,>=2.0.0" "pytest==7.1.3" "pytest-xdist==2.5.0" "pytest-flakefinder==1.1.0" "pytest-rerunfailures==10.3" "pytest-shard==0.1.2" "sympy==1.11.1" "xdoctest==1.3.0" "pygments==2.12.0" "opt-einsum>=3.3" "networkx==2.8.8" "mpmath==1.2.1" "pytest-cpp==2.3.0" "boto3==1.35.42"

 # Install Z3 optional dependency for Windows builds.
 python -m pip install z3-solver==4.15.1.0
@ -52,9 +59,6 @@ python -m pip install parameterized==0.8.1
 # Install pulp for testing ilps under torch\distributed\_tools
 python -m pip install pulp==2.9.0

-# Install expecttest to merge https://github.com/pytorch/pytorch/pull/155308
-python -m pip install expecttest==0.3.0
-
 run_tests() {
    # Run nvidia-smi if available
    for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do
--- a/.ci/pytorch/windows/cuda128.bat
+++ b/.ci/pytorch/windows/cuda128.bat
@ -37,10 +37,10 @@ IF "%CUDA_PATH_V128%"=="" (
 )

 IF "%BUILD_VISION%" == "" (
-    set TORCH_CUDA_ARCH_LIST=6.1;7.0;7.5;8.0;8.6;9.0;10.0;12.0
+    set TORCH_CUDA_ARCH_LIST=7.0;7.5;8.0;8.6;9.0;10.0;12.0
    set TORCH_NVCC_FLAGS=-Xfatbin -compress-all
 ) ELSE (
-    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
+    set NVCC_FLAGS=-D__CUDA_NO_HALF_OPERATORS__ --expt-relaxed-constexpr -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=compute_80 -gencode=arch=compute_86,code=compute_86 -gencode=arch=compute_90,code=compute_90 -gencode=arch=compute_100,code=compute_100 -gencode=arch=compute_120,code=compute_120
 )

 set "CUDA_PATH=%CUDA_PATH_V128%"
--- a/.circleci/scripts/binary_populate_env.sh
+++ b/.circleci/scripts/binary_populate_env.sh
@ -71,14 +71,7 @@ export PYTORCH_BUILD_NUMBER=1

 # Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS
 TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt)
-
-# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT
-TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64'"
-
-# CUDA 12.9/13.0 builds have triton for Linux and Linux aarch64 binaries.
-if [[ "$DESIRED_CUDA" == "cu129" ]] || [[ "$DESIRED_CUDA" == "cu130" ]]; then
-  TRITON_CONSTRAINT="platform_system == 'Linux'"
-fi
+TRITON_CONSTRAINT="platform_system == 'Linux'"

 if [[ "$PACKAGE_TYPE" =~ .*wheel.* &&  -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" && ! "$PYTORCH_BUILD_VERSION" =~ .*xpu.* ]]; then
  TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}"
--- a/.github/actions/setup-win/action.yml
+++ b/.github/actions/setup-win/action.yml
@ -6,6 +6,12 @@ inputs:
  cuda-version:
    description: which cuda version to install, 'cpu' for none
    required: true
+  python-version:
+    required: false
+    type: string
+    default: "3.10"
+    description: |
+      The python version to be used. Will be 3.10 by default

 runs:
  using: composite
@ -38,18 +44,24 @@ runs:
        CONDA="C:\Jenkins\Miniconda3\condabin\conda.bat"

        {
+          echo "CONDA=${CONDA}";
          echo "CONDA_RUN=${CONDA} run --no-capture-output";
          echo "CONDA_BUILD=${CONDA} run conda-build";
          echo "CONDA_INSTALL=${CONDA} install";
        } >> "${GITHUB_ENV}"

    - name: Setup Python3
+      env:
+          PYTHON_VERSION: ${{ inputs.python-version }}
      shell: bash
      run: |
        set +e
        set -x

-        PYTHON3=$(${CONDA_RUN} which python3)
+        # Create new py_tmp env with python-version
+        ${CONDA} create -y -n py_tmp python=${PYTHON_VERSION} intel-openmp libuv
+
+        PYTHON3=$(${CONDA_RUN} -n py_tmp which python3)
        EXIT_CODE=$?

        if [[ "${EXIT_CODE}" == "0" ]]; then
@ -62,7 +74,7 @@ runs:
          # installation, which is Python 3 based. Its Python is default to Python 3. Further, there
          # is also the Miniconda installation that is Python 2 based, and both can be installed if
          # needed. In both cases, Python binary is just called python
-          PYTHON=$(${CONDA_RUN} which python)
+          PYTHON=$(${CONDA_RUN} -n py_tmp which python)
          EXIT_CODE=$?

          if [[ "${EXIT_CODE}" == "0" ]]; then
--- a/.github/ci_commit_pins/vllm.txt
+++ b/.github/ci_commit_pins/vllm.txt
@ -1 +1 @@
-e10fef08838612b4560e9c72e5cb1414a5edfa13
+78a47f87ce259a48f0391fa9ae15add05ea7432b
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -16,16 +16,18 @@ from typing import Optional


 # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
-CUDA_ARCHES = ["12.6", "12.8", "13.0"]
+CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
 CUDA_STABLE = "12.8"
 CUDA_ARCHES_FULL_VERSION = {
    "12.6": "12.6.3",
    "12.8": "12.8.1",
+    "12.9": "12.9.1",
    "13.0": "13.0.0",
 }
 CUDA_ARCHES_CUDNN_VERSION = {
    "12.6": "9",
    "12.8": "9",
+    "12.9": "9",
    "13.0": "9",
 }

@ -38,60 +40,77 @@ CPU_AARCH64_ARCH = ["cpu-aarch64"]

 CPU_S390X_ARCH = ["cpu-s390x"]

-CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "13.0-aarch64"]
+CUDA_AARCH64_ARCHES = ["12.6-aarch64", "12.8-aarch64", "12.9-aarch64", "13.0-aarch64"]


 PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
    "12.6": (
-        "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
+        "nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | "
+        "nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | "
+        "nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | "
+        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
+        "nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | "
+        "nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | "
+        "nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | "
+        "nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | "
+        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
+        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
+        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
+        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
+        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
+        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
    ),
    "12.8": (
-        "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
+        "nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | "
+        "nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | "
+        "nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | "
+        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
+        "nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | "
+        "nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | "
+        "nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | "
+        "nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | "
+        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
+        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
+        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
+        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
+        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
+        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
+    ),
+    "12.9": (
+        "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | "
+        "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | "
+        "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | "
+        "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | "
+        "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | "
+        "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | "
+        "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | "
+        "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | "
+        "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | "
+        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
+        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | "
+        "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | "
+        "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | "
+        "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'"
    ),
    "13.0": (
-        "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'"
+        "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
+        "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
+        "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
+        "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
+        "nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
+        "nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
+        "nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
+        "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
+        "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
+        "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
+        "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
+        "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
+        "nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
+        "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
+        "nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
    ),
    "xpu": (
        "intel-cmplr-lib-rt==2025.2.1 | "
@ -222,7 +241,11 @@ def generate_libtorch_matrix(
            arches += CUDA_ARCHES
            arches += ROCM_ARCHES
        elif os == "windows":
-            arches += CUDA_ARCHES
+            # TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up
+            # in 2.10
+            windows_cuda_arches = CUDA_ARCHES.copy()
+            windows_cuda_arches.remove("12.9")
+            arches += windows_cuda_arches
    if libtorch_variants is None:
        libtorch_variants = [
            "shared-with-deps",
@ -286,7 +309,11 @@ def generate_wheels_matrix(
        if os == "linux":
            arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
        elif os == "windows":
-            arches += CUDA_ARCHES + XPU_ARCHES
+            # TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up
+            # in 2.10
+            windows_cuda_arches = CUDA_ARCHES.copy()
+            windows_cuda_arches.remove("12.9")
+            arches += windows_cuda_arches + XPU_ARCHES
        elif os == "linux-aarch64":
            # Separate new if as the CPU type is different and
            # uses different build/test scripts
@ -322,7 +349,7 @@ def generate_wheels_matrix(
            # cuda linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install

            if (
-                arch_version in ["13.0", "12.8", "12.6"]
+                arch_version in ["13.0", "12.9", "12.8", "12.6"]
                and os == "linux"
                or arch_version in CUDA_AARCH64_ARCHES
            ):
@ -386,5 +413,6 @@ def generate_wheels_matrix(


 validate_nccl_dep_consistency("13.0")
+validate_nccl_dep_consistency("12.9")
 validate_nccl_dep_consistency("12.8")
 validate_nccl_dep_consistency("12.6")
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@ -135,7 +135,7 @@ ROCM_SMOKE_WORKFLOWS = [
        build_configs=generate_binary_build_matrix.generate_wheels_matrix(
            OperatingSystem.LINUX,
            arches=["6.4"],
-            python_versions=["3.9"],
+            python_versions=["3.10"],
        ),
        ciflow_config=CIFlowConfig(
            labels={
--- a/.github/templates/linux_binary_build_workflow.yml.j2
+++ b/.github/templates/linux_binary_build_workflow.yml.j2
@ -77,6 +77,9 @@ jobs:
      runs_on: linux.s390x
      ALPINE_IMAGE: "docker.io/s390x/alpine"
      timeout-minutes: 420
+      {%- elif config["gpu_arch_type"] == "rocm" %}
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      {%- elif "conda" in build_environment and config["gpu_arch_type"] == "cuda" %}
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      runs_on: linux.24xlarge.ephemeral
--- a/.github/workflows/_binary-test-linux.yml
+++ b/.github/workflows/_binary-test-linux.yml
@ -186,8 +186,6 @@ jobs:

      - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG
        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.9
-        with:
-          driver-version: ${{ startsWith(inputs.GPU_ARCH_VERSION, '13') && '580.65.06' || '570.133.07' }}
        if: ${{ inputs.GPU_ARCH_TYPE == 'cuda' && steps.filter.outputs.is-test-matrix-empty == 'False' }}

      - name: configure aws credentials
--- a/.github/workflows/_docs.yml
+++ b/.github/workflows/_docs.yml
@ -67,7 +67,7 @@ jobs:
            # an OOM issue when running the job, so this upgrades the runner from 4xlarge
            # to the next available tier of 12xlarge. So much memory just to generate cpp
            # doc
-            runner: ${{ inputs.runner_prefix }}linux.12xlarge
+            runner: ${{ inputs.runner_prefix }}linux.12xlarge.memory
            # TODO: Nightly cpp docs take longer and longer to finish (more than 3h now)
            # Let's try to figure out how this can be improved
            timeout-minutes: 360
--- a/.github/workflows/_linux-test.yml
+++ b/.github/workflows/_linux-test.yml
@ -169,7 +169,7 @@ jobs:
        id: install-nvidia-driver
        uses: pytorch/test-infra/.github/actions/setup-nvidia@release/2.9
        with:
-          driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '570.133.07' }}
+          driver-version: ${{ matrix.config == 'legacy_nvidia_driver' && '525.105.17' || '580.82.07' }}
        if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_container_runner.outputs.IN_CONTAINER_RUNNER == 'false' && !contains(matrix.runner, 'b200') }}

      - name: Setup GPU_FLAG for docker run
@ -273,6 +273,8 @@ jobs:
          TEST_CONFIG: ${{ matrix.config }}
          SHARD_NUMBER: ${{ matrix.shard }}
          NUM_TEST_SHARDS: ${{ matrix.num_shards }}
+          EXTRA_FLAGS: ${{ matrix.extra_flags || '' }}
+          OP_BENCHMARK_TESTS: ${{ matrix.op_benchmark_tests }}
          REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }}
          CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
--- a/.github/workflows/_win-build.yml
+++ b/.github/workflows/_win-build.yml
@ -151,7 +151,7 @@ jobs:
          BUILD_WHEEL: 1
          MAX_JOBS: 8
          CUDA_VERSION: ${{ inputs.cuda-version }}
-          PYTHON_VERSION: "3.9"
+          PYTHON_VERSION: "3.10"
          SCCACHE_BUCKET: "ossci-compiler-cache"
          SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }}
          SCCACHE_REGION: us-east-1
--- a/.github/workflows/_win-test.yml
+++ b/.github/workflows/_win-test.yml
@ -184,7 +184,7 @@ jobs:
        env:
          USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }}
          INSTALL_WINDOWS_SDK: 1
-          PYTHON_VERSION: 3.9
+          PYTHON_VERSION: "3.10"
          CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }}
          VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }}
          TEST_SHOWLOCALS: ${{ steps.keep-going.outputs.ci-test-showlocals }}
--- a/.github/workflows/build-manywheel-images.yml
+++ b/.github/workflows/build-manywheel-images.yml
@ -46,10 +46,12 @@ jobs:
      fail-fast: false
      matrix:
        include: [
-          { name: "manylinux2_28-builder",          tag: "cuda13.0",         runner: "linux.9xlarge.ephemeral" },
+          { name: "manylinux2_28-builder",          tag: "cuda13.0",          runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "cuda12.8",          runner: "linux.9xlarge.ephemeral" },
+          { name: "manylinux2_28-builder",          tag: "cuda12.9",          runner: "linux.9xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "cuda12.6",          runner: "linux.9xlarge.ephemeral" },
          { name: "manylinuxaarch64-builder",       tag: "cuda13.0",          runner: "linux.arm64.2xlarge.ephemeral" },
+          { name: "manylinuxaarch64-builder",       tag: "cuda12.9",          runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinuxaarch64-builder",       tag: "cuda12.8",          runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinuxaarch64-builder",       tag: "cuda12.6",          runner: "linux.arm64.2xlarge.ephemeral" },
          { name: "manylinux2_28-builder",          tag: "rocm6.3",           runner: "linux.9xlarge.ephemeral" },
--- a/.github/workflows/docker-builds.yml
+++ b/.github/workflows/docker-builds.yml
@ -70,7 +70,7 @@ jobs:
          pytorch-linux-jammy-py3-clang18-asan,
          pytorch-linux-jammy-py3-clang12-onnx,
          pytorch-linux-jammy-linter,
-          pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter,
+          pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter,
          # Executorch pin needs update
          # pytorch-linux-jammy-py3-clang12-executorch,
          pytorch-linux-jammy-py3.12-triton-cpu,
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -132,7 +132,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -178,7 +178,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -204,6 +204,52 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_10-cuda-aarch64-12_9-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: manylinuxaarch64-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.10"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      ALPINE_IMAGE: "arm64v8/alpine"
+      build_name: manywheel-py3_10-cuda-aarch64-12_9
+      build_environment: linux-aarch64-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      timeout-minutes: 420
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_10-cuda-aarch64-12_9-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_10-cuda-aarch64-12_9-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: manylinuxaarch64-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.10"
+      build_name: manywheel-py3_10-cuda-aarch64-12_9
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_10-cuda-aarch64-13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -224,7 +270,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -335,7 +381,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -381,7 +427,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -407,6 +453,52 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_11-cuda-aarch64-12_9-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: manylinuxaarch64-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.11"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      ALPINE_IMAGE: "arm64v8/alpine"
+      build_name: manywheel-py3_11-cuda-aarch64-12_9
+      build_environment: linux-aarch64-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      timeout-minutes: 420
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_11-cuda-aarch64-12_9-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_11-cuda-aarch64-12_9-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: manylinuxaarch64-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.11"
+      build_name: manywheel-py3_11-cuda-aarch64-12_9
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_11-cuda-aarch64-13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -427,7 +519,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -538,7 +630,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -584,7 +676,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -610,6 +702,52 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_12-cuda-aarch64-12_9-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: manylinuxaarch64-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.12"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      ALPINE_IMAGE: "arm64v8/alpine"
+      build_name: manywheel-py3_12-cuda-aarch64-12_9
+      build_environment: linux-aarch64-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      timeout-minutes: 420
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_12-cuda-aarch64-12_9-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_12-cuda-aarch64-12_9-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: manylinuxaarch64-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.12"
+      build_name: manywheel-py3_12-cuda-aarch64-12_9
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_12-cuda-aarch64-13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -630,7 +768,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -741,7 +879,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -787,7 +925,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -813,6 +951,52 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_13-cuda-aarch64-12_9-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: manylinuxaarch64-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.13"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      ALPINE_IMAGE: "arm64v8/alpine"
+      build_name: manywheel-py3_13-cuda-aarch64-12_9
+      build_environment: linux-aarch64-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      timeout-minutes: 420
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13-cuda-aarch64-12_9-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_13-cuda-aarch64-12_9-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: manylinuxaarch64-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.13"
+      build_name: manywheel-py3_13-cuda-aarch64-12_9
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_13-cuda-aarch64-13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -833,7 +1017,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -944,7 +1128,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -990,7 +1174,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1016,6 +1200,52 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_13t-cuda-aarch64-12_9-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: manylinuxaarch64-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.13t"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      ALPINE_IMAGE: "arm64v8/alpine"
+      build_name: manywheel-py3_13t-cuda-aarch64-12_9
+      build_environment: linux-aarch64-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      timeout-minutes: 420
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13t-cuda-aarch64-12_9-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_13t-cuda-aarch64-12_9-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: manylinuxaarch64-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.13t"
+      build_name: manywheel-py3_13t-cuda-aarch64-12_9
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_13t-cuda-aarch64-13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -1036,7 +1266,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1147,7 +1377,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1193,7 +1423,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1219,6 +1449,52 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_14-cuda-aarch64-12_9-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: manylinuxaarch64-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.14"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      ALPINE_IMAGE: "arm64v8/alpine"
+      build_name: manywheel-py3_14-cuda-aarch64-12_9
+      build_environment: linux-aarch64-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      timeout-minutes: 420
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_14-cuda-aarch64-12_9-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_14-cuda-aarch64-12_9-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: manylinuxaarch64-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.14"
+      build_name: manywheel-py3_14-cuda-aarch64-12_9
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_14-cuda-aarch64-13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -1239,7 +1515,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1350,7 +1626,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_6
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1396,7 +1672,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_8
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1422,6 +1698,52 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_14t-cuda-aarch64-12_9-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: manylinuxaarch64-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.14t"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.arm64.m7g.4xlarge.ephemeral
+      ALPINE_IMAGE: "arm64v8/alpine"
+      build_name: manywheel-py3_14t-cuda-aarch64-12_9
+      build_environment: linux-aarch64-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+      timeout-minutes: 420
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_14t-cuda-aarch64-12_9-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_14t-cuda-aarch64-12_9-build
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9-aarch64"
+      GPU_ARCH_TYPE: cuda-aarch64
+      DOCKER_IMAGE: manylinuxaarch64-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.14t"
+      build_name: manywheel-py3_14t-cuda-aarch64-12_9
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_14t-cuda-aarch64-13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -1442,7 +1764,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-13_0
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generated-linux-binary-libtorch-nightly.yml
+++ b/.github/workflows/generated-linux-binary-libtorch-nightly.yml
@ -248,6 +248,74 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  libtorch-cuda12_9-shared-with-deps-release-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: libtorch-cxx11-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: libtorch-cuda12_9-shared-with-deps-release
+      build_environment: linux-binary-libtorch
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  libtorch-cuda12_9-shared-with-deps-release-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - libtorch-cuda12_9-shared-with-deps-release-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: libtorch-cxx11-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      build_name: libtorch-cuda12_9-shared-with-deps-release
+      build_environment: linux-binary-libtorch
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  libtorch-cuda12_9-shared-with-deps-release-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: libtorch-cuda12_9-shared-with-deps-release-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: libtorch
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: libtorch-cxx11-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      LIBTORCH_CONFIG: release
+      LIBTORCH_VARIANT: shared-with-deps
+      build_name: libtorch-cuda12_9-shared-with-deps-release
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  libtorch-cuda13_0-shared-with-deps-release-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -333,6 +401,7 @@ jobs:
      LIBTORCH_CONFIG: release
      LIBTORCH_VARIANT: shared-with-deps
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: libtorch-rocm6_3-shared-with-deps-release
      build_environment: linux-binary-libtorch
    secrets:
@ -446,6 +515,7 @@ jobs:
      LIBTORCH_CONFIG: release
      LIBTORCH_VARIANT: shared-with-deps
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: libtorch-rocm6_4-shared-with-deps-release
      build_environment: linux-binary-libtorch
    secrets:
--- a/.github/workflows/generated-linux-binary-manywheel-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-main.yml
@ -60,7 +60,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_8-test:  # Testing
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@ -127,7 +127,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_6-test:  # Testing
@ -193,7 +193,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_8-test:  # Testing
@ -241,6 +241,72 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_10-cuda12_9-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.10"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: manywheel-py3_10-cuda12_9
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_10-cuda12_9-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_10-cuda12_9-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.10"
+      build_name: manywheel-py3_10-cuda12_9
+      build_environment: linux-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_10-cuda12_9-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_10-cuda12_9-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.10"
+      build_name: manywheel-py3_10-cuda12_9
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_10-cuda13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -259,7 +325,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda13_0-test:  # Testing
@ -323,6 +389,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_10-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -433,6 +500,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_10-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -716,7 +784,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_6-test:  # Testing
@ -782,7 +850,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_8-test:  # Testing
@ -830,6 +898,72 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_11-cuda12_9-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.11"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: manywheel-py3_11-cuda12_9
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_11-cuda12_9-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_11-cuda12_9-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.11"
+      build_name: manywheel-py3_11-cuda12_9
+      build_environment: linux-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_11-cuda12_9-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_11-cuda12_9-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.11"
+      build_name: manywheel-py3_11-cuda12_9
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_11-cuda13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -848,7 +982,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda13_0-test:  # Testing
@ -912,6 +1046,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.11"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_11-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -1022,6 +1157,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.11"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_11-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -1305,7 +1441,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_6-test:  # Testing
@ -1371,7 +1507,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_8-test:  # Testing
@ -1419,6 +1555,72 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_12-cuda12_9-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.12"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: manywheel-py3_12-cuda12_9
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_12-cuda12_9-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_12-cuda12_9-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.12"
+      build_name: manywheel-py3_12-cuda12_9
+      build_environment: linux-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_12-cuda12_9-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_12-cuda12_9-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.12"
+      build_name: manywheel-py3_12-cuda12_9
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_12-cuda13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -1437,7 +1639,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda13_0-test:  # Testing
@ -1501,6 +1703,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_12-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -1611,6 +1814,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.12"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_12-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -1894,7 +2098,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_6-test:  # Testing
@ -1960,7 +2164,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_8-test:  # Testing
@ -2008,6 +2212,72 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_13-cuda12_9-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.13"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: manywheel-py3_13-cuda12_9
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13-cuda12_9-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_13-cuda12_9-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.13"
+      build_name: manywheel-py3_13-cuda12_9
+      build_environment: linux-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13-cuda12_9-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_13-cuda12_9-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.13"
+      build_name: manywheel-py3_13-cuda12_9
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_13-cuda13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -2026,7 +2296,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda13_0-test:  # Testing
@ -2090,6 +2360,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.13"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_13-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -2200,6 +2471,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.13"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_13-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -2483,7 +2755,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_6-test:  # Testing
@ -2549,7 +2821,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_8-test:  # Testing
@ -2597,6 +2869,72 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_13t-cuda12_9-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.13t"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: manywheel-py3_13t-cuda12_9
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13t-cuda12_9-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_13t-cuda12_9-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.13t"
+      build_name: manywheel-py3_13t-cuda12_9
+      build_environment: linux-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_13t-cuda12_9-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_13t-cuda12_9-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.13t"
+      build_name: manywheel-py3_13t-cuda12_9
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_13t-cuda13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -2615,7 +2953,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda13_0-test:  # Testing
@ -2679,6 +3017,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.13t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_13t-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -2789,6 +3128,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.13t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_13t-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -3072,7 +3412,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14-cuda12_6-test:  # Testing
@ -3138,7 +3478,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14-cuda12_8-test:  # Testing
@ -3186,6 +3526,72 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_14-cuda12_9-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.14"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: manywheel-py3_14-cuda12_9
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_14-cuda12_9-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_14-cuda12_9-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.14"
+      build_name: manywheel-py3_14-cuda12_9
+      build_environment: linux-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_14-cuda12_9-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_14-cuda12_9-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.14"
+      build_name: manywheel-py3_14-cuda12_9
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_14-cuda13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -3204,7 +3610,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14-cuda13_0-test:  # Testing
@ -3268,6 +3674,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.14"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_14-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -3378,6 +3785,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.14"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_14-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
@ -3661,7 +4069,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14t-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14t-cuda12_6-test:  # Testing
@ -3727,7 +4135,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14t-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14t-cuda12_8-test:  # Testing
@ -3775,6 +4183,72 @@ jobs:
      github-token: ${{ secrets.GITHUB_TOKEN }}
    uses: ./.github/workflows/_binary-upload.yml

+  manywheel-py3_14t-cuda12_9-build:
+    if: ${{ github.repository_owner == 'pytorch' }}
+    uses: ./.github/workflows/_binary-build-linux.yml
+    needs: get-label-type
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.14t"
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      build_name: manywheel-py3_14t-cuda12_9
+      build_environment: linux-binary-manywheel
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_14t-cuda12_9-test:  # Testing
+    if: ${{ github.repository_owner == 'pytorch' }}
+    needs:
+      - manywheel-py3_14t-cuda12_9-build
+      - get-label-type
+    uses: ./.github/workflows/_binary-test-linux.yml
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.14t"
+      build_name: manywheel-py3_14t-cuda12_9
+      build_environment: linux-binary-manywheel
+      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      runs_on: linux.g4dn.4xlarge.nvidia.gpu # 12.8+ builds need sm_70+ runner
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+  manywheel-py3_14t-cuda12_9-upload:  # Uploading
+    if: ${{ github.repository_owner == 'pytorch' }}
+    permissions:
+      id-token: write
+      contents: read
+    needs: manywheel-py3_14t-cuda12_9-test
+    with:
+      PYTORCH_ROOT: /pytorch
+      PACKAGE_TYPE: manywheel
+      # TODO: This is a legacy variable that we eventually want to get rid of in
+      #       favor of GPU_ARCH_VERSION
+      DESIRED_CUDA: cu129
+      GPU_ARCH_VERSION: "12.9"
+      GPU_ARCH_TYPE: cuda
+      DOCKER_IMAGE: manylinux2_28-builder
+      DOCKER_IMAGE_TAG_PREFIX: cuda12.9
+      DESIRED_PYTHON: "3.14t"
+      build_name: manywheel-py3_14t-cuda12_9
+    secrets:
+      github-token: ${{ secrets.GITHUB_TOKEN }}
+    uses: ./.github/workflows/_binary-upload.yml
+
  manywheel-py3_14t-cuda13_0-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
@ -3793,7 +4267,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14t-cuda13_0
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand==10.4.0.35; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile==1.15.0.42; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14t-cuda13_0-test:  # Testing
@ -3857,6 +4331,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.3
      DESIRED_PYTHON: "3.14t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_14t-rocm6_3
      build_environment: linux-binary-manywheel
    secrets:
@ -3967,6 +4442,7 @@ jobs:
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
      DESIRED_PYTHON: "3.14t"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
+      timeout-minutes: 300
      build_name: manywheel-py3_14t-rocm6_4
      build_environment: linux-binary-manywheel
    secrets:
--- a/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-rocm-main.yml
@ -44,7 +44,7 @@ jobs:
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}
-  manywheel-py3_9-rocm6_4-build:
+  manywheel-py3_10-rocm6_4-build:
    if: ${{ github.repository_owner == 'pytorch' }}
    uses: ./.github/workflows/_binary-build-linux.yml
    needs: get-label-type
@ -58,16 +58,17 @@ jobs:
      GPU_ARCH_TYPE: rocm
      DOCKER_IMAGE: manylinux2_28-builder
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build_name: manywheel-py3_9-rocm6_4
+      timeout-minutes: 300
+      build_name: manywheel-py3_10-rocm6_4
      build_environment: linux-binary-manywheel-rocm
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
-  manywheel-py3_9-rocm6_4-test:  # Testing
+  manywheel-py3_10-rocm6_4-test:  # Testing
    if: ${{ github.repository_owner == 'pytorch' }}
    needs:
-      - manywheel-py3_9-rocm6_4-build
+      - manywheel-py3_10-rocm6_4-build
      - get-label-type
    runs-on: linux.rocm.gpu.mi250
    timeout-minutes: 240
@ -82,14 +83,14 @@ jobs:
      SKIP_ALL_TESTS: 1
      DOCKER_IMAGE: manylinux2_28-builder
      DOCKER_IMAGE_TAG_PREFIX: rocm6.4
-      DESIRED_PYTHON: "3.9"
+      DESIRED_PYTHON: "3.10"
    steps:
      - name: Setup ROCm
        uses: ./.github/actions/setup-rocm
      - uses: actions/download-artifact@v4.1.7
        name: Download Build Artifacts
        with:
-          name: manywheel-py3_9-rocm6_4
+          name: manywheel-py3_10-rocm6_4
          path: "${{ runner.temp }}/artifacts/"
      - name: Checkout PyTorch
        uses: actions/checkout@v4
--- a/.github/workflows/inductor-nightly.yml
+++ b/.github/workflows/inductor-nightly.yml
@ -37,7 +37,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-default-label-prefix
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
      test-matrix: |
@ -56,7 +56,7 @@ jobs:
    uses: ./.github/workflows/_linux-test.yml
    needs: nightly-dynamo-benchmarks-build
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      docker-image: ${{ needs.nightly-dynamo-benchmarks-build.outputs.docker-image }}
      test-matrix: ${{ needs.nightly-dynamo-benchmarks-build.outputs.test-matrix }}
      timeout-minutes: 720
--- a/.github/workflows/inductor-perf-test-nightly-x86-zen.yml
+++ b/.github/workflows/inductor-perf-test-nightly-x86-zen.yml
@ -75,7 +75,7 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      test-matrix: |
        { include: [
@ -101,7 +101,7 @@ jobs:
    needs: inductor-build
    if: github.event.schedule == '0 7 * * *'
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true
      docker-image: ${{ needs.inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
@ -118,7 +118,7 @@ jobs:
    needs: inductor-build
    if: github.event_name == 'workflow_dispatch'
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}
      docker-image: ${{ needs.inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
--- a/.github/workflows/inductor-perf-test-nightly-x86.yml
+++ b/.github/workflows/inductor-perf-test-nightly-x86.yml
@ -80,7 +80,7 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      test-matrix: |
        { include: [
@ -107,7 +107,7 @@ jobs:
    needs: inductor-build
    if: github.event.schedule == '0 7 * * *'
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      dashboard-tag: training-false-inference-true-default-true-dynamic-true-cppwrapper-true-aotinductor-true-freezing-true
      docker-image: ${{ needs.inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
@ -124,7 +124,7 @@ jobs:
    needs: inductor-build
    if: github.event_name == 'workflow_dispatch'
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-freezing-${{ inputs.freezing }}
      docker-image: ${{ needs.inductor-build.outputs.docker-image }}
      test-matrix: ${{ needs.inductor-build.outputs.test-matrix }}
--- a/.github/workflows/inductor-periodic.yml
+++ b/.github/workflows/inductor-periodic.yml
@ -154,7 +154,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-default-label-prefix
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      runner_prefix: "${{ needs.get-default-label-prefix.outputs.label-type }}"
      test-matrix: |
@ -200,7 +200,7 @@ jobs:
    uses: ./.github/workflows/_linux-test.yml
    needs: periodic-dynamo-benchmarks-cpu-build
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      docker-image: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.docker-image }}
      test-matrix: ${{ needs.periodic-dynamo-benchmarks-cpu-build.outputs.test-matrix }}
    secrets: inherit
--- a/.github/workflows/inductor-unittest.yml
+++ b/.github/workflows/inductor-unittest.yml
@ -110,7 +110,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      test-matrix: |
@ -127,7 +127,7 @@ jobs:
    uses: ./.github/workflows/_linux-test.yml
    needs: inductor-cpu-build
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
      test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
    secrets: inherit
--- a/.github/workflows/inductor.yml
+++ b/.github/workflows/inductor.yml
@ -79,7 +79,7 @@ jobs:
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      test-matrix: |
@ -101,7 +101,7 @@ jobs:
    uses: ./.github/workflows/_linux-test.yml
    needs: inductor-cpu-build
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      docker-image: ${{ needs.inductor-cpu-build.outputs.docker-image }}
      test-matrix: ${{ needs.inductor-cpu-build.outputs.test-matrix }}
    secrets: inherit
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@ -53,7 +53,7 @@ jobs:
    with:
      timeout: 120
      runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
-      docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.9-linter
+      docker-image: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.10-linter
      # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout
      # to run git rev-parse HEAD~:.ci/docker when a new image is needed
      fetch-depth: 0
@ -265,10 +265,10 @@ jobs:
        with:
          submodules: false
          fetch-depth: 1
-      - name: Setup Python 3.9
+      - name: Setup Python 3.10
        uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
        with:
-          python-version: '3.9'
+          python-version: '3.10'
          architecture: x64
          cache: pip
      - name: Install dependencies
--- a/.github/workflows/operator_benchmark.yml
+++ b/.github/workflows/operator_benchmark.yml
@ -14,6 +14,10 @@ on:
  schedule:
    # Run at 07:00 UTC every Sunday
    - cron: 0 7 * * 0
+  pull_request:
+    paths:
+      - benchmarks/operator_benchmark/**
+      - .github/workflows/operator_benchmark.yml

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
@ -29,7 +33,7 @@ jobs:
    name: opbenchmark-build
    uses: ./.github/workflows/_linux-build.yml
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      test-matrix: |
        { include: [
@ -42,7 +46,7 @@ jobs:
    name: opbenchmark-on-demand-build
    uses: ./.github/workflows/_linux-build.yml
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      test-matrix: |
        { include: [
@ -55,7 +59,7 @@ jobs:
    uses: ./.github/workflows/_linux-test.yml
    needs: opbenchmark-build
    with:
-      build-environment: linux-jammy-py3.9-gcc11-build
+      build-environment: linux-jammy-py3.10-gcc11-build
      docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
      test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
    secrets: inherit
--- a/.github/workflows/operator_microbenchmark.yml
+++ b/.github/workflows/operator_microbenchmark.yml
@ -0,0 +1,46 @@
+name: operator_microbenchmark
+
+on:
+  push:
+    tags:
+      - ciflow/op-benchmark/*
+  workflow_dispatch:
+  schedule:
+    # Run at 06:00 UTC everyday
+    - cron: 0 6 * * *
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write
+  contents: read
+
+jobs:
+  opmicrobenchmark-build:
+    if: github.repository_owner == 'pytorch'
+    name: opmicrobenchmark-build
+    uses: ./.github/workflows/_linux-build.yml
+    with:
+      runner: linux.12xlarge.memory
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
+      cuda-arch-list: '8.0 9.0'
+      test-matrix: |
+        { include: [
+          { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.h100" },
+          { config: "operator_microbenchmark_test", shard: 1, num_shards: 1, runner: "linux.aws.a100" },
+        ]}
+    secrets: inherit
+
+  opmicrobenchmark-test:
+    name: opmicrobenchmark-test
+    uses: ./.github/workflows/_linux-test.yml
+    needs: opmicrobenchmark-build
+    with:
+      timeout-minutes: 500
+      build-environment: linux-jammy-cuda12.8-py3.10-gcc9-sm80
+      docker-image: ${{ needs.opmicrobenchmark-build.outputs.docker-image }}
+      test-matrix: ${{ needs.opmicrobenchmark-build.outputs.test-matrix }}
+    secrets: inherit
--- a/.github/workflows/periodic.yml
+++ b/.github/workflows/periodic.yml
@ -59,13 +59,14 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build-environment: linux-jammy-cuda12.4-py3.10-gcc11
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11
+      cuda-arch-list: 7.5
      test-matrix: |
        { include: [
-          { config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
-          { config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
-          { config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
-          { config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
-          { config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu" },
+          { config: "legacy_nvidia_driver", shard: 1, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "legacy_nvidia_driver", shard: 2, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "legacy_nvidia_driver", shard: 3, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "legacy_nvidia_driver", shard: 4, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
+          { config: "legacy_nvidia_driver", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g4dn.4xlarge.nvidia.gpu" },
        ]}
    secrets: inherit

--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@ -240,7 +240,7 @@ jobs:
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3.9-gcc11
+      build-environment: linux-jammy-py3.10-gcc11
      docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
      test-matrix: |
        { include: [
@ -255,7 +255,7 @@ jobs:
      - verify-cachebench-cpu-build
      - target-determination
    with:
-      build-environment: linux-jammy-py3.9-gcc11
+      build-environment: linux-jammy-py3.10-gcc11
      docker-image: ${{ needs.verify-cachebench-cpu-build.outputs.docker-image }}
      test-matrix: ${{ needs.verify-cachebench-cpu-build.outputs.test-matrix }}
    secrets: inherit
--- a/.gitignore
+++ b/.gitignore
@ -82,6 +82,7 @@ torch/return_types.pyi
 torch/nn/functional.pyi
 torch/utils/data/datapipes/datapipe.pyi
 torch/csrc/autograd/generated/*
+torch/csrc/functionalization/generated/*
 torch/csrc/lazy/generated/*.[!m]*
 torch_compile_debug/
 # Listed manually because some files in this directory are not generated
--- a/BUILD.bazel
+++ b/BUILD.bazel
@ -91,6 +91,8 @@ generated_cpu_cpp = [
    "aten/src/ATen/NativeMetaFunctions.h",
    "aten/src/ATen/RegistrationDeclarations.h",
    "aten/src/ATen/VmapGeneratedPlumbing.h",
+    "aten/src/ATen/ViewMetaClasses.h",
+    "aten/src/ATen/ViewMetaClasses.cpp",
    "aten/src/ATen/core/aten_interned_strings.h",
    "aten/src/ATen/core/enum_tag.h",
    "aten/src/ATen/core/TensorBody.h",
@ -1106,6 +1108,7 @@ test_suite(
        "aten/src/ATen/templates/LazyNonNativeIr.h",
        "aten/src/ATen/templates/RegisterDispatchKey.cpp",
        "aten/src/ATen/templates/RegisterDispatchDefinitions.ini",
+        "aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp",
        "aten/src/ATen/native/native_functions.yaml",
        "aten/src/ATen/native/tags.yaml",
        "aten/src/ATen/native/ts_native_functions.yaml",
--- a/5
+++ b/5
@ -50,11 +50,10 @@ RUN git submodule update --init --recursive
 FROM conda as conda-installs
 ARG PYTHON_VERSION=3.11
 ARG CUDA_PATH=cu121
-ARG CUDA_CHANNEL=nvidia
 ARG INSTALL_CHANNEL=whl/nightly
 # Automatically set by buildx
-RUN /opt/conda/bin/conda update -y -n base -c defaults conda
-RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION}
+# pinning version of conda here see: https://github.com/pytorch/pytorch/issues/164574
+RUN /opt/conda/bin/conda install -y python=${PYTHON_VERSION} conda=25.7.0

 ARG TARGETPLATFORM

--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-![PyTorch Logo](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/pytorch-logo-dark.png)
+![PyTorch Logo](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/pytorch-logo-dark.png)

 --------------------------------------------------------------------------------

@ -72,7 +72,7 @@ Elaborating Further:

 If you use NumPy, then you have used Tensors (a.k.a. ndarray).

-![Tensor illustration](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/tensor_illustration.png)
+![Tensor illustration](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/tensor_illustration.png)

 PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the
 computation by a huge amount.
@ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc
 While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
 You get the best of speed and flexibility for your crazy research.

-![Dynamic graph](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/dynamic_graph.gif)
+![Dynamic graph](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/dynamic_graph.gif)

 ### Python First

--- a/aten/src/ATen/FunctionalStorageImpl.cpp
+++ b/aten/src/ATen/FunctionalStorageImpl.cpp
@ -9,11 +9,6 @@

 namespace at::functionalization {

-ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
-  if (out_idx == this->out_index) return *this;
-  return ViewMeta(forward_fn, reverse_fn, has_symbolic_inputs, is_multi_output, is_as_strided, out_idx);
-}
-
 // Note [Functionalization: Alias Removal Part 2]
 // See Note [Functionalization: Alias Removal] for more details.
 // This function applies a single update from one of the views to the StorageImpl.
@ -42,12 +37,12 @@ ViewMeta ViewMeta::to_out_idx(int64_t out_idx) {
 static const Tensor apply_update(const FunctionalStorageImpl::Update& update, const Tensor& base) {
  at::Tensor t = update.new_val;
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
-  if (update.view_metas.empty()) return t;
+  if (update.view_metas.empty()) { return t; }

  std::vector<at::Tensor> tmp_values({base});
  tmp_values.reserve(update.view_metas.size());
  for (size_t i = 0; i < update.view_metas.size() - 1; ++i) {
-    at::Tensor next_view = update.view_metas[i].forward_fn(tmp_values.back(), update.view_metas[i].out_index);
+    at::Tensor next_view = update.view_metas[i]->forward(tmp_values.back());
    // NB: We only actually need tmp_values for ops like select/slice/diagonal/squeeze/as_strided
    // All of these ops require additional information to recover the sizes of the original tensor.
    // If need to, we could probably apply this optimization and only bother computing tmp_values
@ -55,9 +50,8 @@ static const Tensor apply_update(const FunctionalStorageImpl::Update& update, co
    tmp_values.push_back(std::move(next_view));
  }
  for(int64_t i = static_cast<int64_t>(update.view_metas.size()) - 1; i >= 0; --i) {
-    int64_t out_idx = update.view_metas[i].out_index;
    // Each view inverse is implemented in ViewInverses.cpp.
-    t = update.view_metas[i].reverse_fn(tmp_values[i], t, out_idx);
+    t = update.view_metas[i]->reverse(tmp_values[i], t);
  }
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
  return t;
@ -111,13 +105,13 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base)
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(base_));
 }

-void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<ViewMeta>& metas) {
+void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector<std::shared_ptr<ViewMeta>>& metas) {
  TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage");

  if (metas.size() > 1) {
    for (size_t i = 1; i < metas.size(); ++i) {
      // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI
-      TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided,
+      TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i]->is_as_strided,
 "During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i,
 " was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today,"
 "so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you "
--- a/aten/src/ATen/FunctionalStorageImpl.h
+++ b/aten/src/ATen/FunctionalStorageImpl.h
@ -8,44 +8,89 @@ namespace at::functionalization {

 // See Note [Functionalization Pass In Core]

+enum class InverseReturnMode {
+  /// Specifies that functional inverses should always return a view.
+  AlwaysView,
+  /// Specifies that functional inverses should always return a non-view / copy.
+  NeverView,
+  /// Specifies that functional inverses should return a view unless a (copying)
+  /// scatter
+  /// inverse exists, in which case that will be used instead.
+  /// This avoids as_strided() calls that can be difficult for subclasses to
+  /// handle.
+  ViewOrScatterInverse,
+};
+
+#define FUNCTIONALIZATION_VIEWMETA_NAME(TYPE) \
+  static const char* name() {                 \
+    return #TYPE;                             \
+  }
+
+#define FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(...) \
+  using SerializableTuple = std::tuple<__VA_ARGS__>
+
 // ViewMeta is a class used by the functionalization pass to navigate between
 // a base tensor and a view tensor.
 // For example, if I call `b = a.view1(...)`
-// the functionalization pass will generate and store a ViewMeta on b that looks
-// like:
+// the functionalization pass will generate and store a ViewMeta specialization
+// for `view1` operation on b that looks like:
 //
-// ViewMeta(
-//   [<captures>](const Tensor& base, int64_t mutated_view_idx) {
-//     return base.view1(...);
-//   },
-//   [<captures>](const at::Tensor& base, const at::Tensor& mutated_view,
-//   int64_t mutated_view_idx) -> at::Tensor {
-//     return at::functionalization::impl::view1_inverse(base, mutated_view,
-//     ...);
+// struct TORCH_API view1_ViewMeta : public ViewMeta {
+//   FUNCTIONALIZATION_VIEWMETA_NAME(view1_ViewMeta);
+//   FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
+//       bool /* reapply_views */,
+//       const std::vector<int64_t>&);
+//
+//   view1_ViewMeta(const SerializableTuple& tpl)
+//       : view1_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
+//
+//   view1_ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
+//       : ViewMeta(/*has_symbolic_inputs=*/false),
+//         reapply_views(reapply_views),
+//         size(size) {}
+//
+//   Tensor forward(const Tensor& base) override {
+//       return base.view1(...);
 //   }
 //
-// The forward_fn lambda describes how to replay view1 on a tensor.
+//   Tensor reverse(const Tensor& base, const Tensor& mutated_view) override {
+//       return at::functionalization::impl::view1_inverse(base, mutated_view,
+//       ...);
+//   }
 //
-// The reverse_fn lambda describes how, given a tensor that is already a view,
+//   SerializableTuple to_serializable_tuple() {
+//     return std::make_tuple(reapply_views, size);
+//   }
+//
+//   bool reapply_views;
+//   std::vector<int64_t> size;
+// };
+//
+// The forward function describes how to replay view1 on a tensor.
+//
+// The reverse function describes how, given a tensor that is already a view,
 // how to get the corresponding base tensor. See Note [Functionalization Pass:
 // View Inverses] for details.
+//
+// `SerializedTuple` is a typedef that defines an `std::tuple<...>` type
+// representing the `ViewMeta` instance state. Methods that take in/return such
+// a type are used for supporting pickle serialization.
 struct ViewMeta {
  ViewMeta(
-      std::function<Tensor(const Tensor&, int64_t)> forward,
-      std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse,
      bool has_symbolic_inputs,
      bool is_multi_output = false,
      bool is_as_strided = false,
      int64_t out_idx = 0)
-      : forward_fn(std::move(forward)),
-        reverse_fn(std::move(reverse)),
-        out_index(out_idx),
+      : out_index(out_idx),
        is_multi_output(is_multi_output),
        is_as_strided(is_as_strided),
        has_symbolic_inputs(has_symbolic_inputs) {}

-  std::function<Tensor(const Tensor&, int64_t)> forward_fn;
-  std::function<Tensor(const Tensor&, const Tensor&, int64_t)> reverse_fn;
+  virtual ~ViewMeta() = default;
+
+  virtual Tensor forward(const Tensor& base) = 0;
+  virtual Tensor reverse(const Tensor& base, const Tensor& mutated_view) = 0;
+
  // See Note [out_idx in ViewMeta]
  int64_t out_index;

@ -57,10 +102,17 @@ struct ViewMeta {
  // Tells us if this view operation has any symbolic inputs
  bool has_symbolic_inputs;

-  // Returns a copy of the current ViewMeta, if out_idx matches the current
-  // out_index. Otherwise, returns a new ViewMeta with the same forward/reverse
+  // Returns a new ViewMeta with the same forward/reverse
  // functions, but a new out index.
-  ViewMeta to_out_idx(int64_t out_idx);
+  //
+  // This method should be implemented by those `ViewMeta` that have more than
+  // one output.
+  virtual std::shared_ptr<ViewMeta> to_out_index(int64_t out_index) {
+    TORCH_CHECK_NOT_IMPLEMENTED(
+        false,
+        "ViewMeta::to_out_index not implemented. ",
+        "Likely because there's only one output.");
+  }
 };

 // FunctionalStorageImpl is a subclass of StorageImpl used by the
@ -93,14 +145,14 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl {
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
    const at::Tensor new_val;
    // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members)
-    const std::vector<ViewMeta> view_metas;
+    const std::vector<std::shared_ptr<ViewMeta>> view_metas;
  };

  explicit FunctionalStorageImpl(const Tensor& value);

  void add_update(
      const Tensor& updated_val,
-      const std::vector<ViewMeta>& view_metas);
+      const std::vector<std::shared_ptr<ViewMeta>>& view_metas);
  bool apply_updates();
  const Tensor& base() {
    return base_;
--- a/aten/src/ATen/FunctionalTensorWrapper.cpp
+++ b/aten/src/ATen/FunctionalTensorWrapper.cpp
@ -129,17 +129,19 @@ void FunctionalTensorWrapper::freeze_storage() const {
 // - view_value: The output tensor that we need to wrap.
 // - base: The "base" of the view that `view_value` was generated from.
 // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic.
-FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta)
-  : c10::TensorImpl(
-      c10::DispatchKeySet(DispatchKey::Functionalize),
-      view_value.dtype(),
-      view_value.device()
-    ),
-    value_(view_value),
-    is_multi_output_view_(base->is_multi_output_view_ || meta.is_multi_output),
-    was_storage_changed_(base->was_storage_changed_),
-    is_symbolic_(base->is_symbolic_)
-{
+FunctionalTensorWrapper::FunctionalTensorWrapper(
+    const Tensor& view_value,
+    const FunctionalTensorWrapper* base,
+    const std::shared_ptr<functionalization::ViewMeta>& meta)
+    : c10::TensorImpl(
+          c10::DispatchKeySet(DispatchKey::Functionalize),
+          view_value.dtype(),
+          view_value.device()),
+      value_(view_value),
+      is_multi_output_view_(
+          base->is_multi_output_view_ || meta->is_multi_output),
+      was_storage_changed_(base->was_storage_changed_),
+      is_symbolic_(base->is_symbolic_) {
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(value_));
  TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
  set_constructor_metadata();
@ -148,11 +150,10 @@ FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const
      view_metas_ = base->view_metas_;  // copy
  }
  view_metas_.push_back(meta);
-  maybe_mark_symbolic(meta);
+  maybe_mark_symbolic(meta.get());
  storage_ = base->storage_; // alias this tensor's storage with the base tensor's
 }

-
 functionalization::FunctionalStorageImpl* FunctionalTensorWrapper::functional_storage_impl() const {
  return static_cast<functionalization::FunctionalStorageImpl*>(storage_.unsafeGetStorageImpl());
 }
@ -176,18 +177,18 @@ bool FunctionalTensorWrapper::is_up_to_date() const {
 }

 // See Note [Functionalization Pass - Inplace View Ops]
-void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) {
+void FunctionalTensorWrapper::mutate_view_meta(const std::shared_ptr<at::functionalization::ViewMeta>& meta) {
  view_metas_.push_back(meta);
  // Manually track the fact that this tensor received a metadata mutation!
  has_metadata_mutation_ = true;
  // Mark this tensor as being symbolic if there are any symbolic inputs used by the view operation.
-  maybe_mark_symbolic(meta);
+  maybe_mark_symbolic(meta.get());
  // Note [Functionalization Pass - Inplace View Ops]
  // So, these ops are special - they're mutation AND view ops. They get special codegen.
  // An example is transpose_, e.g. `a.transpose_()`
  // Calling transpose_() should ensure that a gets an alias, and append the new ViewMeta to a's current list of ViewMetas.
  at::AutoDispatchSkipFunctionalize guard;
-  value_ = meta.forward_fn(value_, meta.out_index);
+  value_ = meta->forward(value_);
  TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize));
 }

@ -368,15 +369,8 @@ void FunctionalTensorWrapper::sync_() {
  regenerate_from_base();
 }

-Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) {
-  auto t = base;
-
-  // Reapply views to get the viewed tensor from the base in alias_
-  for (auto& view_meta: view_metas_) {
-    t = view_meta.forward_fn(t, view_meta.out_index);
-  }
-
-  return t;
+const std::vector<std::shared_ptr<functionalization::ViewMeta>>& FunctionalTensorWrapper::view_metas() const {
+  return view_metas_;
 }

 void FunctionalTensorWrapper::regenerate_from_base() {
@ -385,7 +379,7 @@ void FunctionalTensorWrapper::regenerate_from_base() {
  auto t = storage_impl->base();

  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));
-  t = apply_view_metas(t);
+  t = at::functionalization::impl::apply_view_meta_sequence(t, view_metas_);
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t));

  replace_(t, /*from_lazy_regenerate=*/true);
@ -724,11 +718,11 @@ bool isFunctionalTensor(const std::optional<Tensor>& t) {
 }

 bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {
-  if (t_list.empty()) return false;
+  if (t_list.empty()) { return false; }
  auto functional_count = 0;
  for (const auto i : c10::irange(t_list.size())) {
    auto const & e= t_list[i];
-    if (!e.has_value() || !e->defined()) continue;
+    if (!e.has_value() || !e->defined()) { continue; }
    if (isFunctionalTensor(e)) {
      ++functional_count;
    }
@ -738,10 +732,10 @@ bool isFunctionalTensor(const c10::List<::std::optional<Tensor>>& t_list) {

 template <typename T>
 static bool isFunctionalTensorIListRef(c10::IListRef<T> list) {
-  if (list.size() == 0) return false;
+  if (list.size() == 0) { return false; }
  auto functional_count = 0;
  for (const auto& tensor : list) {
-    if (!tensor.defined()) continue;
+    if (!tensor.defined()) { continue; }
    if (isFunctionalTensor(tensor)) {
      ++functional_count;
    }
@ -759,20 +753,28 @@ void freeze_functional_tensor(const Tensor& tensor) {
  functional_base_impl->freeze_storage();
 }

-Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta, int64_t out_idx) {
+Tensor create_functional_tensor_with_view_meta(
+    const at::Tensor& view_to_wrap,
+    const at::Tensor& base,
+    const std::shared_ptr<functionalization::ViewMeta>& meta,
+    int64_t out_idx) {
  TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(view_to_wrap));
  TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(base));
  auto functional_base_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(base);
+  auto meta_ = meta;
  if (out_idx != 0) {
    // Note [out_idx in ViewMeta]
    // When a view op outputs multiple tensors, each output needs its own separate ViewMeta.
    // Each ViewMeta also tracks the index of the particular output tensor, which is needed in the reverse function.
-    meta = meta.to_out_idx(out_idx);
+    meta_ = meta->to_out_index(out_idx);
  }
-  return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta);
+  return at::detail::make_tensor<FunctionalTensorWrapper>(view_to_wrap, functional_base_impl, meta_);
 }

-std::vector<Tensor> create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) {
+std::vector<Tensor> create_functional_tensor_with_view_meta(
+    ITensorListRef view_to_wrap,
+    const at::Tensor& base,
+    const std::shared_ptr<functionalization::ViewMeta>& meta) {
  std::vector<Tensor> outputs(view_to_wrap.size());
  int64_t i = 0;
  for (const auto& tensor : view_to_wrap) {
@ -782,12 +784,22 @@ std::vector<Tensor> create_functional_tensor_with_view_meta(ITensorListRef view_
  return outputs;
 }

-void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) {
+void mutate_view_meta(const at::Tensor& self, const std::shared_ptr<functionalization::ViewMeta>& meta) {
  TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self));
  auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self);
  self_impl->mutate_view_meta(meta);
 }

+Tensor apply_view_meta_sequence(
+    const Tensor& base,
+    const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence) {
+  Tensor r = base;
+  for (auto& vm : sequence) {
+    r = vm->forward(r);
+  }
+  return r;
+}
+
 // Note [Propagating strides in the functionalization pass]
 // In order to properly compute stride information, the functionalization pass
 // calls each {view} reference implementations with meta tensors.
@ -881,7 +893,7 @@ void functionalize_op_helper(const c10::OperatorHandle& op, torch::jit::Stack* s
    const auto& ivalue = returns[idx];
    if (ivalue.isTensor()) {
      const auto& t = ivalue.toTensor();
-      if (!t.defined()) continue;
+      if (!t.defined()) { continue; }
      at::functionalization::impl::sync(t);
      auto t_new = c10::IValue(at::functionalization::impl::from_functional_tensor(t));
      (*stack)[returns_begin + idx] = t_new;
--- a/aten/src/ATen/FunctionalTensorWrapper.h
+++ b/aten/src/ATen/FunctionalTensorWrapper.h
@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
  explicit FunctionalTensorWrapper(
      const Tensor& view_value,
      const FunctionalTensorWrapper* base,
-      const functionalization::ViewMeta& meta);
+      const std::shared_ptr<functionalization::ViewMeta>& meta);

  // Get the underlying, actual tensor, that doesn't know anything about
  // functionalization.
@ -99,17 +99,17 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
        ->are_all_mutations_under_no_grad_or_inference_mode();
  }

-  void maybe_mark_symbolic(const functionalization::ViewMeta& meta) {
-    is_symbolic_ = is_symbolic_ | meta.has_symbolic_inputs;
+  void maybe_mark_symbolic(functionalization::ViewMeta* meta) {
+    is_symbolic_ = is_symbolic_ | meta->has_symbolic_inputs;
  }

  bool is_symbolic() const {
    return is_symbolic_;
  }

-  // Runs the forward_fn of every ViewMeta collected in the current instance
-  // to some other base.
-  Tensor apply_view_metas(const Tensor& base);
+  // Retrieves the ViewMeta sequence of this tensor.
+  const std::vector<std::shared_ptr<functionalization::ViewMeta>>& view_metas()
+      const;

  // Sync's the underlying tensor with its alias, if it's out of date. This
  // involves two steps: 1) Apply any pending updates/mutations to the alias 2)
@ -146,7 +146,8 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
  // from the base tensor. This method is used by inplace-view ops like
  // transpose_. It appends a ViewMeta to the existing stack, and refreshes the
  // tensor by replaying the views off of the alias.
-  void mutate_view_meta(const at::functionalization::ViewMeta& meta);
+  void mutate_view_meta(
+      const std::shared_ptr<at::functionalization::ViewMeta>& meta);

  // Custom implementation of self.set_(src)
  void set__impl(const FunctionalTensorWrapper* other);
@ -285,7 +286,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl {
  bool is_symbolic_ = false;

  size_t generation_ = 0;
-  std::vector<at::functionalization::ViewMeta> view_metas_;
+  std::vector<std::shared_ptr<at::functionalization::ViewMeta>> view_metas_;

 protected:
  static void copy_tensor_metadata(
@ -377,16 +378,20 @@ TORCH_API void propagate_xla_data_direct(
 Tensor create_functional_tensor_with_view_meta(
    const Tensor& view_to_wrap,
    const Tensor& base,
-    functionalization::ViewMeta meta,
+    const std::shared_ptr<functionalization::ViewMeta>& meta,
    int64_t out_idx = 0);
 std::vector<Tensor> create_functional_tensor_with_view_meta(
    ITensorListRef view_to_wrap,
    const Tensor& base,
-    const functionalization::ViewMeta& meta);
+    const std::shared_ptr<functionalization::ViewMeta>& meta);

 void mutate_view_meta(
    const Tensor& self,
-    const functionalization::ViewMeta& meta);
+    const std::shared_ptr<functionalization::ViewMeta>& meta);
+
+TORCH_API Tensor apply_view_meta_sequence(
+    const Tensor& base,
+    const std::vector<std::shared_ptr<functionalization::ViewMeta>>& sequence);

 void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out);
 void set_sizes_strides_offset(
--- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp
+++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp
@ -1,3 +1,5 @@
+#include <ATen/FunctionalizeFallbackKernel.h>
+
 #include <ATen/core/dispatch/Dispatcher.h>
 #include <ATen/core/LegacyTypeDispatch.h>
 #include <ATen/EmptyTensor.h>
@ -7,7 +9,6 @@
 #include <torch/library.h>
 #include <c10/util/irange.h>
 #include <c10/util/strides.h>
-#include <ATen/EmptyTensor.h>

 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/ATen.h>
@ -28,6 +29,31 @@
 #include <utility>
 #endif

+namespace at::functionalization {
+
+Tensor resize__ViewMeta::forward(const Tensor& base) {
+  if (reapply_views) {
+    return base.as_strided(size, c10::contiguous_strides(size));
+  } else {
+    return at::as_strided_copy(base, size, c10::contiguous_strides(size));
+  }
+}
+
+Tensor resize__ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
+  return base.as_strided_scatter(
+      mutated_view, size, c10::contiguous_strides(size));
+}
+
+Tensor _unsafe_view_ViewMeta::forward(const Tensor& base) {
+  return at::_unsafe_view_symint(base, size);
+}
+
+Tensor _unsafe_view_ViewMeta::reverse(const Tensor& base, const Tensor& mutated_view) {
+  return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
+}
+
+} // namespace at::functionalization
+
 namespace {
  void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet [[maybe_unused]], torch::jit::Stack* stack) {
    const auto& schema = op.schema();
@ -106,7 +132,9 @@ namespace {
      const auto& ivalue = returns[idx];
      if (ivalue.isTensor() && should_wrap_outputs) {
        const auto& t = ivalue.toTensor();
-        if (!t.defined()) continue;
+        if (!t.defined()) {
+          continue;
+        }
        auto t_new = c10::IValue(at::functionalization::impl::to_functional_tensor(t));
        (*stack)[returns_begin + idx] = t_new;
      } else if (ivalue.isTensorList() && should_wrap_outputs) {
@ -169,19 +197,8 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch
  // The output of resizing is equivalent to taking a slice of a larger tensor.
  // We have to emulate this "slicing" with an as_strided call.
  auto reapply_views = at::functionalization::impl::getFunctionalizationReapplyViewsTLS();
-  at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
-    [reapply_views = reapply_views, size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
-      if (reapply_views) {
-        return base.as_strided(size, c10::contiguous_strides(size));
-      } else {
-        return at::as_strided_copy(base, size, c10::contiguous_strides(size));
-      }
-    },
-    [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
-      return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size));
-    },
-    /*has_symbolic_inputs=*/false
-  );
+  auto view_meta = std::make_shared<at::functionalization::resize__ViewMeta>(
+      reapply_views, size.vec());
  at::functionalization::impl::mutate_view_meta(self, view_meta);
  return self;
 }
@ -300,17 +317,11 @@ static at::Tensor _unsafe_view_functionalize(const at::Tensor & self, at::SymInt
    tmp_output = at::_unsafe_view_symint(self_, size);
  }

-  bool has_symbolic_inputs = std::any_of(size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
-
-  at::functionalization::ViewMeta view_meta = at::functionalization::ViewMeta(
-    [size = size.vec()](const at::Tensor & base, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
-      return at::_unsafe_view_symint(base, size);
-    },
-    [size = size.vec()](const at::Tensor & base, const at::Tensor & mutated_view, int64_t mutated_view_idx [[maybe_unused]]) -> at::Tensor {
-      return at::_unsafe_view_symint(mutated_view, base.sym_sizes());
-    },
-    /*has_symbolic_inputs=*/has_symbolic_inputs
-  );
+  bool has_symbolic_inputs = std::any_of(
+      size.begin(), size.end(), [=](auto& s) { return s.is_symbolic(); });
+  auto view_meta =
+      std::make_shared<at::functionalization::_unsafe_view_ViewMeta>(
+          has_symbolic_inputs, size.vec());

  auto out = at::functionalization::impl::create_functional_tensor_with_view_meta(tmp_output, self, std::move(view_meta));
  // See  Note [Propagating strides in the functionalization pass]
--- a/aten/src/ATen/FunctionalizeFallbackKernel.h
+++ b/aten/src/ATen/FunctionalizeFallbackKernel.h
@ -0,0 +1,58 @@
+#pragma once
+
+#include <ATen/FunctionalStorageImpl.h>
+
+namespace at::functionalization {
+
+// `ViewMeta` implementation for `resize_` operation.
+struct TORCH_API resize__ViewMeta : public ViewMeta {
+  FUNCTIONALIZATION_VIEWMETA_NAME(resize__ViewMeta)
+  FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
+      bool /* reapply_views */,
+      const std::vector<int64_t>&);
+
+  resize__ViewMeta(const SerializableTuple& tpl)
+      : resize__ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
+
+  resize__ViewMeta(bool reapply_views, const std::vector<int64_t>& size)
+      : ViewMeta(/*has_symbolic_inputs=*/false),
+        reapply_views(reapply_views),
+        size(size) {}
+
+  Tensor forward(const Tensor& base) override;
+  Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
+
+  SerializableTuple to_serializable_tuple() {
+    return std::make_tuple(reapply_views, size);
+  }
+
+  bool reapply_views;
+  std::vector<int64_t> size;
+};
+
+// `ViewMeta` implementation for `_unsafe_view` operation.
+struct TORCH_API _unsafe_view_ViewMeta : public ViewMeta {
+  FUNCTIONALIZATION_VIEWMETA_NAME(_unsafe_view_ViewMeta)
+  FUNCTIONALIZATION_VIEWMETA_SERIALIZABLE_TUPLE(
+      bool /* has_symbolic_inputs */,
+      const std::vector<c10::SymInt>&);
+
+  _unsafe_view_ViewMeta(const SerializableTuple& tpl)
+      : _unsafe_view_ViewMeta(std::get<0>(tpl), std::get<1>(tpl)) {}
+
+  _unsafe_view_ViewMeta(
+      bool has_symbolic_inputs,
+      const std::vector<c10::SymInt>& size)
+      : ViewMeta(has_symbolic_inputs), size(size) {}
+
+  Tensor forward(const Tensor& base) override;
+  Tensor reverse(const Tensor& base, const Tensor& mutated_view) override;
+
+  SerializableTuple to_serializable_tuple() {
+    return std::make_tuple(has_symbolic_inputs, size);
+  }
+
+  std::vector<c10::SymInt> size;
+};
+
+} // namespace at::functionalization
--- a/aten/src/ATen/mps/EmptyTensor.cpp
+++ b/aten/src/ATen/mps/EmptyTensor.cpp
@ -12,7 +12,7 @@

 #define MPS_ERROR_NOT_COMPILED "PyTorch code is not compiled with MPS enabled"
 #define MPS_ERROR_RUNTIME_TOO_LOW \
-  "The MPS backend is supported on MacOS 13.0+.", \
+  "The MPS backend is supported on MacOS 14.0+. ", \
  "Current OS version can be queried using `sw_vers`"
 #define MPS_ERROR_DOUBLE_NOT_SUPPORTED "Cannot convert a MPS Tensor to float64 dtype " \
  "as the MPS framework doesn't support float64. Please use float32 instead."
--- a/aten/src/ATen/mps/MPSHooks.mm
+++ b/aten/src/ATen/mps/MPSHooks.mm
@ -70,7 +70,10 @@ void MPSHooks::commitStream() const {
 }

 void* MPSHooks::getCommandBuffer() const {
-  return at::mps::getDefaultMPSStream()->commandBuffer();
+  auto stream = at::mps::getDefaultMPSStream();
+  // Release pending computeCommandEncoder, as extensions is likely to allocate new one
+  stream->endKernelCoalescing();
+  return stream->commandBuffer();
 }

 void* MPSHooks::getDispatchQueue() const {
--- a/aten/src/ATen/mps/MPSStream.mm
+++ b/aten/src/ATen/mps/MPSStream.mm
@ -158,7 +158,18 @@ void MPSStream::fill(id<MTLBuffer> buffer, uint8_t value, size_t length, size_t
      endKernelCoalescing();
      id<MTLBlitCommandEncoder> blitEncoder = [commandBuffer() blitCommandEncoder];

-      [blitEncoder fillBuffer:buffer range:NSMakeRange(offset, length) value:value];
+      // For some reason fillBufferfor stopped working for lengh > 4Gb on MacOS 26
+      // See https://github.com/pytorch/pytorch/issues/163962
+      // Workaround by batching copy commands into 4Gb chunks
+      constexpr size_t max_copy_size = 0x100000000; // 4GB
+      size_t bytes_filled = 0;
+      size_t bytes_remains = length;
+      while (bytes_remains > 0) {
+        NSUInteger bytes_to_copy = std::min(max_copy_size, bytes_remains);
+        [blitEncoder fillBuffer:buffer range:NSMakeRange(offset + bytes_filled, bytes_to_copy) value:value];
+        bytes_filled += bytes_to_copy;
+        bytes_remains -= bytes_to_copy;
+      }
      [blitEncoder endEncoding];
      synchronize(syncType);
    }
--- a/aten/src/ATen/native/Convolution.cpp
+++ b/aten/src/ATen/native/Convolution.cpp
@ -410,11 +410,23 @@ struct ConvParams {
  // cudnn and miopen are guaranteed not to be on mobile, and T102591915 / T110194934 suggest
  // that maybe the compiledWithCuDNN() check sometimes segfaults (though I can't imagine how)
 #if !defined(C10_MOBILE)
-    if (!detail::getCUDAHooks().compiledWithCuDNN()) {
+    if (!detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda() || !cudnn_enabled) {
      return false;
    }
+    static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
+    // broken on cuDNN 9.8 - 9.14
+    if (cudnn_version >= 90800 && cudnn_version < 91500) {
+      if (cudnn_conv_suggest_memory_format(input, weight) == at::MemoryFormat::Contiguous &&
+          (input.scalar_type() == at::kBFloat16 || input.scalar_type() == at::kHalf) &&
+          weight.dim() == 5) {
+        for (int i = 2; i < weight.dim(); i++) {
+          if (weight.size(i) != 1) {
+            return false;
+          }
+        }
+      }
+    }
    if (needs_64bit_indexing_no_split(input, weight)) {
-      static long cudnn_version = detail::getCUDAHooks().versionCuDNN();
      if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
        TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
                        " if the V8 API is not enabled or before cuDNN version 9.3+."
@ -422,9 +434,6 @@ struct ConvParams {
        return false;
      }
    }
-    if (!input.is_cuda() || !cudnn_enabled) {
-      return false;
-    }
    if (input.scalar_type() == at::kBFloat16 || weight.scalar_type() == at::kBFloat16) {
      if (!(detail::getCUDAHooks().supportsBFloat16ConvolutionWithCuDNNv8() && at::native::cudnnv8_enabled_check_debug())) {
        return false;
@ -443,16 +452,19 @@ struct ConvParams {

  // Use cudnn for FP16 depthwise convolutions
  bool use_cudnn_depthwise(const at::Tensor& input, const at::Tensor& weight) const  {
-    if (!detail::getCUDAHooks().compiledWithCuDNN()) {
+    if (!cudnn_enabled || !detail::getCUDAHooks().compiledWithCuDNN() || !input.is_cuda()) {
      return false;
    }
-    if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous && use_cudnn(input, weight)) {
-      // always use cudnn_depthwise for channels_last format
-      return true;
-    }
    // native kernel doesn't support 64-bit non-splittable case
-    if (cudnn_enabled && !(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
+    if (!(canUse32BitIndexMath(input) && canUse32BitIndexMath(weight))) {
      static long cudnn_version = detail::getCUDAHooks().compiledWithCuDNN() ? detail::getCUDAHooks().versionCuDNN() : -1;
+      // TODO(eqy): remove this once cuDNN fixes 64-bit depthwise support, first broken in 9.11x
+      if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
+        if (cudnn_version < 0 || cudnn_version > 91000) {
+          return false;
+        }
+      }
+
      if (!(cudnn_version >= 90300 && at::native::cudnnv8_enabled_check_debug())) {
        TORCH_WARN_ONCE("cuDNN cannot be used for large non-batch-splittable convolutions"
                        " if the V8 API is not enabled or before cuDNN version 9.3+."
@ -462,6 +474,10 @@ struct ConvParams {
        return true;
      }
    }
+    if (cudnn_conv_suggest_memory_format(input, weight) != at::MemoryFormat::Contiguous) {
+      // always use cudnn_depthwise for channels_last format
+      return true;
+    }
    if (detail::getCUDAHooks().supportsDepthwiseConvolutionWithCuDNN()) {
      bool kernel_cond =  (use_cudnn(input, weight) &&
                           input.scalar_type() == kHalf && // only for FP16
--- a/aten/src/ATen/native/mps/kernels/Attention.metal
+++ b/aten/src/ATen/native/mps/kernels/Attention.metal
@ -14,8 +14,8 @@ template <typename T, int D, int V = D>
    device T* out [[buffer(3)]],
    const constant uint& gqa_factor [[buffer(4)]],
    const constant uint& N [[buffer(5)]],
-    const constant uint2& k_head_seq_stride [[buffer(6)]],
-    const constant uint2& v_head_seq_stride [[buffer(7)]],
+    const constant uint3& qkv_head_strides [[buffer(6)]],
+    const constant uint3& qkv_seq_strides [[buffer(7)]],
    const constant float& scale [[buffer(8)]],
    const device bool* mask [[buffer(9)]],
    const constant uint3& mask_strides [[buffer(10)]],
@ -28,10 +28,12 @@ template <typename T, int D, int V = D>
  constexpr uint BD = 32;
  constexpr uint qk_per_thread = D / BD;
  constexpr uint v_per_thread = V / BD;
-  const uint k_head_stride = k_head_seq_stride.x;
-  const uint k_seq_stride = k_head_seq_stride.y;
-  const uint v_head_stride = v_head_seq_stride.x;
-  const uint v_seq_stride = v_head_seq_stride.y;
+  const uint q_head_stride = qkv_head_strides.x;
+  const uint q_seq_stride = qkv_seq_strides.x;
+  const uint k_head_stride = qkv_head_strides.y;
+  const uint k_seq_stride = qkv_seq_strides.y;
+  const uint v_head_stride = qkv_head_strides.z;
+  const uint v_seq_stride = qkv_seq_strides.z;
  const uint mask_head_stride = mask_strides.x;
  const uint mask_kv_seq_stride = mask_strides.y;
  const uint mask_q_seq_stride = mask_strides.z;
@ -54,9 +56,9 @@ template <typename T, int D, int V = D>
  const int kv_head_idx = head_idx / gqa_factor;
  const int Q = tpg.y;
  const int group_offset = head_idx * Q + q_seq_idx;
-  const int q_offset = group_offset;
  const int o_offset = group_offset;
-  queries += q_offset * D + simd_lid * qk_per_thread;
+  queries += head_idx * q_head_stride + q_seq_idx * q_seq_stride +
+      simd_lid * qk_per_thread;
  keys += kv_head_idx * k_head_stride + simd_gid * k_seq_stride +
      simd_lid * qk_per_thread;
  values += kv_head_idx * v_head_stride + simd_gid * v_seq_stride +
@ -156,8 +158,8 @@ template <typename T, int D, int V = D>
    device float* maxs [[buffer(5)]],
    const constant uint& gqa_factor [[buffer(6)]],
    const constant uint& N [[buffer(7)]],
-    const constant uint2& k_head_seq_stride [[buffer(8)]],
-    const constant uint2& v_head_seq_stride [[buffer(9)]],
+    const constant uint3& qkv_head_strides [[buffer(8)]],
+    const constant uint3& qkv_seq_strides [[buffer(9)]],
    const constant float& scale [[buffer(10)]],
    const device bool* mask [[buffer(11)]],
    const constant uint3& mask_strides [[buffer(12)]],
@ -170,10 +172,12 @@ template <typename T, int D, int V = D>
  constexpr int BD = 32;
  constexpr int qk_per_thread = D / BD;
  constexpr int v_per_thread = V / BD;
-  const int k_head_stride = k_head_seq_stride.x;
-  const int k_seq_stride = k_head_seq_stride.y;
-  const int v_head_stride = v_head_seq_stride.x;
-  const int v_seq_stride = v_head_seq_stride.y;
+  const int q_head_stride = qkv_head_strides.x;
+  const int q_seq_stride = qkv_seq_strides.x;
+  const int k_head_stride = qkv_head_strides.y;
+  const int k_seq_stride = qkv_seq_strides.y;
+  const int v_head_stride = qkv_head_strides.z;
+  const int v_seq_stride = qkv_seq_strides.z;
  const int mask_kv_seq_stride = mask_strides.x;
  const int mask_q_seq_stride = mask_strides.y;
  const int mask_head_stride = mask_strides.z;
@ -196,10 +200,10 @@ template <typename T, int D, int V = D>
  const int head_idx = tid.x;
  const int q_seq_idx = tid.y;
  const int o_offset = head_idx * tpg.y + q_seq_idx;
-  const int q_offset = o_offset;
  const int kv_head_idx = head_idx / gqa_factor;

-  queries += q_offset * D + simd_lid * qk_per_thread;
+  queries += head_idx * q_head_stride + q_seq_idx * q_seq_stride +
+      simd_lid * qk_per_thread;
  keys += kv_head_idx * k_head_stride +
      (block_idx * BN + simd_gid) * k_seq_stride + simd_lid * qk_per_thread;
  values += kv_head_idx * v_head_stride +
@ -520,25 +524,25 @@ kernel void attention(
  }
 }

-#define INSTANTIATE_SDPA_VECTOR(DTYPE, QK_DIM, VALUE_DIM)    \
-  template [[host_name("sdpa_vector_" #DTYPE "_" #QK_DIM     \
-                       "_" #VALUE_DIM)]] kernel void         \
-  sdpa_vector<DTYPE, QK_DIM, VALUE_DIM>(                     \
-      const device DTYPE* queries [[buffer(0)]],             \
-      const device DTYPE* keys [[buffer(1)]],                \
-      const device DTYPE* values [[buffer(2)]],              \
-      device DTYPE* out [[buffer(3)]],                       \
-      const constant uint& gqa_factor [[buffer(4)]],         \
-      const constant uint& N [[buffer(5)]],                  \
-      const constant uint2& k_head_seq_stride [[buffer(6)]], \
-      const constant uint2& v_head_seq_stride [[buffer(7)]], \
-      const constant float& scale [[buffer(8)]],             \
-      const device bool* mask [[buffer(9)]],                 \
-      const constant uint3& mask_strides [[buffer(10)]],     \
-      const constant bool& has_mask [[buffer(11)]],          \
-      uint3 tid [[threadgroup_position_in_grid]],            \
-      uint3 tpg [[threadgroups_per_grid]],                   \
-      uint simd_gid [[simdgroup_index_in_threadgroup]],      \
+#define INSTANTIATE_SDPA_VECTOR(DTYPE, QK_DIM, VALUE_DIM)   \
+  template [[host_name("sdpa_vector_" #DTYPE "_" #QK_DIM    \
+                       "_" #VALUE_DIM)]] kernel void        \
+  sdpa_vector<DTYPE, QK_DIM, VALUE_DIM>(                    \
+      const device DTYPE* queries [[buffer(0)]],            \
+      const device DTYPE* keys [[buffer(1)]],               \
+      const device DTYPE* values [[buffer(2)]],             \
+      device DTYPE* out [[buffer(3)]],                      \
+      const constant uint& gqa_factor [[buffer(4)]],        \
+      const constant uint& N [[buffer(5)]],                 \
+      const constant uint3& qkv_head_strides [[buffer(6)]], \
+      const constant uint3& qkv_seq_strides [[buffer(7)]],  \
+      const constant float& scale [[buffer(8)]],            \
+      const device bool* mask [[buffer(9)]],                \
+      const constant uint3& mask_strides [[buffer(10)]],    \
+      const constant bool& has_mask [[buffer(11)]],         \
+      uint3 tid [[threadgroup_position_in_grid]],           \
+      uint3 tpg [[threadgroups_per_grid]],                  \
+      uint simd_gid [[simdgroup_index_in_threadgroup]],     \
      uint simd_lid [[thread_index_in_simdgroup]]);

 #define INSTANTIATE_SDPA_VECTOR_2PASS_1(DTYPE, QK_DIM, VALUE_DIM) \
@ -553,8 +557,8 @@ kernel void attention(
      device float* maxs [[buffer(5)]],                           \
      const constant uint& gqa_factor [[buffer(6)]],              \
      const constant uint& N [[buffer(7)]],                       \
-      const constant uint2& k_head_seq_stride [[buffer(8)]],      \
-      const constant uint2& v_head_seq_stride [[buffer(9)]],      \
+      const constant uint3& qkv_head_strides [[buffer(8)]],       \
+      const constant uint3& qkv_seq_strides [[buffer(9)]],        \
      const constant float& scale [[buffer(10)]],                 \
      const device bool* mask [[buffer(11)]],                     \
      const constant uint3& mask_strides [[buffer(12)]],          \
--- a/aten/src/ATen/native/mps/operations/Attention.mm
+++ b/aten/src/ATen/native/mps/operations/Attention.mm
@ -182,6 +182,8 @@ static std::tuple<Tensor, Tensor> sdpa_vector_fast_mps(const Tensor& q_,
  uint maxSeqLength = k_.size(2);
  uint N = k_.size(2);
  uint B = q_.size(0) * q_.size(1);
+  uint q_head_stride = q_.stride(1);
+  uint q_seq_stride = q_.stride(2);
  uint k_head_stride = k_.stride(1);
  uint k_seq_stride = k_.stride(2);
  uint v_head_stride = v_.stride(1);
@ -209,8 +211,8 @@ static std::tuple<Tensor, Tensor> sdpa_vector_fast_mps(const Tensor& q_,
                  out,
                  1,
                  N,
-                  std::array<uint32_t, 2>{k_head_stride, k_seq_stride},
-                  std::array<uint32_t, 2>{v_head_stride, v_seq_stride},
+                  std::array<uint32_t, 3>{q_head_stride, k_head_stride, v_head_stride},
+                  std::array<uint32_t, 3>{q_seq_stride, k_seq_stride, v_seq_stride},
                  scale_factor);

      if (has_mask) {
@ -257,6 +259,8 @@ static std::tuple<Tensor, Tensor> sdpa_vector_2pass_mps(const Tensor& q_,
  uint B = batchSize * num_heads;
  uint gqa_factor = q_.size(1) / k_.size(1);

+  uint q_head_stride = q_.stride(1);
+  uint q_seq_stride = q_.stride(2);
  uint k_head_stride = k_.stride(1);
  uint k_seq_stride = k_.stride(2);
  uint v_head_stride = v_.stride(1);
@ -294,8 +298,8 @@ static std::tuple<Tensor, Tensor> sdpa_vector_2pass_mps(const Tensor& q_,
                  maxs,
                  gqa_factor,
                  N,
-                  std::array<uint32_t, 2>{k_head_stride, k_seq_stride},
-                  std::array<uint32_t, 2>{v_head_stride, v_seq_stride},
+                  std::array<uint32_t, 3>{q_head_stride, k_head_stride, v_head_stride},
+                  std::array<uint32_t, 3>{q_seq_stride, k_seq_stride, v_seq_stride},
                  scale_factor);

      if (has_mask) {
--- a/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
+++ b/aten/src/ATen/native/transformers/cuda/sdp_utils.cpp
@ -666,6 +666,15 @@ bool can_use_cudnn_attention(const sdp_params& params, bool debug) {
    TORCH_WARN(CUDNN_VERSION, " cuDNN version too old to use cuDNN Attention (< v9.0.0)");
  }
  return false;
+#endif
+#if defined(CUDNN_VERSION)
+  static auto cudnn_version = cudnnGetVersion();
+  if (params.dropout > 0.0 && cudnn_version > 91100 && cudnn_version < 91400) {
+    if (debug) {
+      TORCH_WARN(CUDNN_VERSION, " cuDNN version does not support droppout in SDPA (9.11 - 9.13).");
+    }
+    return false;
+  }
 #endif
  // Define gate functions that determine if a flash kernel can be ran
  // Replace with std::to_array when we migrate to c++20
--- a/aten/src/ATen/templates/FunctionalInverses.h
+++ b/aten/src/ATen/templates/FunctionalInverses.h
@ -2,22 +2,12 @@

 // ${generated_comment}

+#include <ATen/FunctionalStorageImpl.h>
 #include <ATen/Tensor.h>

 namespace at {
 namespace functionalization {

-enum class InverseReturnMode {
-  /// Specifies that functional inverses should always return a view.
-  AlwaysView,
-  /// Specifies that functional inverses should always return a non-view / copy.
-  NeverView,
-  /// Specifies that functional inverses should return a view unless a (copying) scatter
-  /// inverse exists, in which case that will be used instead.
-  /// This avoids as_strided() calls that can be difficult for subclasses to handle.
-  ViewOrScatterInverse,
-};
-
 struct FunctionalInverses {

 ${view_inverse_declarations}
--- a/aten/src/ATen/templates/RegisterFunctionalization.cpp
+++ b/aten/src/ATen/templates/RegisterFunctionalization.cpp
@ -4,7 +4,7 @@
 #include <ATen/core/LegacyTypeDispatch.h>
 #include <ATen/EmptyTensor.h>
 #include <ATen/FunctionalTensorWrapper.h>
-#include <ATen/FunctionalInverses.h>
+#include <ATen/ViewMetaClasses.h>
 #include <ATen/MemoryOverlap.h>
 #include <torch/library.h>

--- a/aten/src/ATen/templates/ViewMetaClasses.cpp
+++ b/aten/src/ATen/templates/ViewMetaClasses.cpp
@ -0,0 +1,19 @@
+// ${generated_comment}
+
+#include <ATen/FunctionalInverses.h>
+#include <ATen/ViewMetaClasses.h>
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Operators.h>
+#include <ATen/NativeFunctions.h>
+#else
+${op_headers}
+#endif
+
+namespace at {
+namespace functionalization {
+
+${view_meta_implementations}
+
+} // namespace functionalization
+} // namespace at
--- a/aten/src/ATen/templates/ViewMetaClasses.h
+++ b/aten/src/ATen/templates/ViewMetaClasses.h
@ -0,0 +1,12 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+// ${generated_comment}
+
+#include <ATen/FunctionalStorageImpl.h>
+
+namespace at {
+namespace functionalization {
+
+${view_meta_declarations}
+
+} // namespace functionalization
+} // namespace at
--- a/aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp
+++ b/aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp
@ -0,0 +1,11 @@
+#include <ATen/ViewMetaClasses.h>
+#include <torch/csrc/functionalization/Module.h>
+
+namespace torch::functionalization {
+
+void initGenerated(PyObject* module) {
+  auto functionalization = py::handle(module).cast<py::module>();
+  $view_meta_bindings
+}
+
+} // namespace torch::functionalization
--- a/aten/src/ATen/test/cuda_allocator_test.cpp
+++ b/aten/src/ATen/test/cuda_allocator_test.cpp
@ -5,51 +5,6 @@

 #include <ATen/test/allocator_clone_test.h>

-#include <torch/csrc/cuda/CUDAPluggableAllocator.h>
-
 TEST(AllocatorTestCUDA, test_clone) {
  test_allocator_clone(c10::cuda::CUDACachingAllocator::get());
 }
-
-static int called_dummy_free_0 = 0;
-static int called_dummy_free_1 = 0;
-
-void* dummy_alloc_0(size_t size, int device, void* stream) {return nullptr;}
-void dummy_free_0(void* data, size_t size, int device, void* stream) {
-  called_dummy_free_0++;
-}
-void dummy_free_1(void* data, size_t size, int device, void* stream) {
-  called_dummy_free_1++;
-}
-
-// Tests that data_ptrs have their respective deleters
-// when mixing allocators
-TEST(AllocatorTestCUDA, test_pluggable_allocator_deleters) {
-  // Create a tensor with dummy_allocator_0, where dummy_free_0 is the deleter
-  auto dummy_allocator_0 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_0);
-  c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_0.get());
-  at::Tensor a = at::empty({0}, at::TensorOptions().device(at::kCUDA));
-
-  // Create a tensor with dummy_allocator_1, where dummy_free_1 is the deleter
-  auto dummy_allocator_1 = torch::cuda::CUDAPluggableAllocator::createCustomAllocator(dummy_alloc_0, dummy_free_1);
-  c10::cuda::CUDACachingAllocator::allocator.store(dummy_allocator_1.get());
-  at::Tensor b = at::empty({0}, at::TensorOptions().device(at::kCUDA));
-
-  // Manually use a's deleter
-  auto* ctx = a.storage().data_ptr().get_context();
-  a.storage().data_ptr().get_deleter()(ctx);
-  a.storage().mutable_data_ptr().release_context();
-
-  // a's deleter is dummy_free_0
-  // dummy_free_0 should be called above, so called_dummy_free_0 should be 1
-  ASSERT_TRUE(called_dummy_free_0 == 1);
-
-  // Manually use b's deleter
-  ctx = b.storage().data_ptr().get_context();
-  b.storage().data_ptr().get_deleter()(ctx);
-  b.storage().mutable_data_ptr().release_context();
-
-  // b's deleter is dummy_free_1
-  // dummy_free_1 should be called above, so called_dummy_free_1 should be 1
-  ASSERT_TRUE(called_dummy_free_1 == 1);
-}
--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_amp_freezing_torchbench_inference.csv
@ -98,11 +98,11 @@ dlrm,pass,0



-doctr_det_predictor,pass,5
+doctr_det_predictor,pass,3



-doctr_reco_predictor,pass,4
+doctr_reco_predictor,pass,1



--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_freezing_torchbench_inference.csv
@ -98,11 +98,11 @@ dlrm,pass,0



-doctr_det_predictor,pass,5
+doctr_det_predictor,pass,3



-doctr_reco_predictor,pass,4
+doctr_reco_predictor,pass,1



--- a/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/cpu_inductor_torchbench_inference.csv
@ -98,11 +98,11 @@ dlrm,pass,0



-doctr_det_predictor,pass,5
+doctr_det_predictor,pass,3



-doctr_reco_predictor,pass,4
+doctr_reco_predictor,pass,1



--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_inductor_torchbench_inference.csv
@ -82,11 +82,11 @@ dlrm,pass,0



-doctr_det_predictor,pass,5
+doctr_det_predictor,pass,3



-doctr_reco_predictor,pass,4
+doctr_reco_predictor,pass,1



--- a/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/dynamic_cpu_max_autotune_inductor_amp_freezing_torchbench_inference.csv
@ -98,11 +98,11 @@ dlrm,pass,0



-doctr_det_predictor,pass,5
+doctr_det_predictor,pass,3



-doctr_reco_predictor,pass,4
+doctr_reco_predictor,pass,1



--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/aot_eager_torchbench_inference.csv
@ -106,11 +106,11 @@ dlrm,pass,0



-doctr_det_predictor,eager_fail_to_run,5
+doctr_det_predictor,eager_fail_to_run,3



-doctr_reco_predictor,eager_fail_to_run,4
+doctr_reco_predictor,eager_fail_to_run,1



--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_aot_eager_torchbench_inference.csv
@ -106,11 +106,11 @@ dlrm,pass,0



-doctr_det_predictor,eager_fail_to_run,5
+doctr_det_predictor,eager_fail_to_run,3



-doctr_reco_predictor,eager_fail_to_run,4
+doctr_reco_predictor,eager_fail_to_run,1



--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamic_inductor_torchbench_inference.csv
@ -106,11 +106,11 @@ dlrm,pass,0



-doctr_det_predictor,eager_fail_to_run,5
+doctr_det_predictor,eager_fail_to_run,3



-doctr_reco_predictor,eager_fail_to_run,4
+doctr_reco_predictor,eager_fail_to_run,1



--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/dynamo_eager_torchbench_inference.csv
@ -106,11 +106,11 @@ dlrm,pass,0



-doctr_det_predictor,eager_fail_to_run,5
+doctr_det_predictor,eager_fail_to_run,3



-doctr_reco_predictor,eager_fail_to_run,4
+doctr_reco_predictor,eager_fail_to_run,1



--- a/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv
+++ b/benchmarks/dynamo/ci_expected_accuracy/rocm/inductor_torchbench_inference.csv
@ -106,11 +106,11 @@ dlrm,pass,0



-doctr_det_predictor,eager_fail_to_run,5
+doctr_det_predictor,eager_fail_to_run,3



-doctr_reco_predictor,eager_fail_to_run,4
+doctr_reco_predictor,eager_fail_to_run,1



--- a/benchmarks/operator_benchmark/benchmark_core.py
+++ b/benchmarks/operator_benchmark/benchmark_core.py
@ -4,6 +4,7 @@ import csv
 import functools
 import json
 import os
+import platform
 import timeit
 from collections import namedtuple
 from dataclasses import asdict, dataclass
@ -17,6 +18,7 @@ import torch

 # needs to be imported after torch
 import torch.utils.cpp_extension as cpp_extension  # noqa: F401
+from torch.utils.benchmark import Timer


 """Performance microbenchmarks.
@ -191,6 +193,11 @@ class BenchmarkRunner:
        self.predefined_minimum_secs = 1
        self.max_iters = 1e6
        self.use_jit = args.use_jit
+        self.use_compile = args.use_compile
+        if self.use_jit and self.use_compile:
+            raise ValueError(
+                "use_jit and use_compile are mutually exclusive, please specify one."
+            )
        self.num_runs = args.num_runs
        self.print_per_iter = False
        self.output_csv = args.output_csv
@ -222,7 +229,7 @@ class BenchmarkRunner:
            if self.args.operators:
                print(f"# {self.args.operators}")

-    def _print_perf_result(self, reported_run_time_us, test_case):
+    def _print_perf_result(self, results, test_case):
        if self.args.report_aibench:
            # Output for AIBench
            # Print out per iteration execution time instead of avg time
@ -236,12 +243,14 @@ class BenchmarkRunner:
                            "type": test_name,
                            "metric": "latency",
                            "unit": "us",
-                            "value": str(reported_run_time_us[run]),
+                            "value": str(results["reported_run_time_us"[run]]),
                        }
                    )
                )
        else:
-            print(f"# Mode: {'JIT' if self.use_jit else 'Eager'}")
+            print(
+                f"# Mode: {'JIT' if self.use_jit else 'Compile' if self.use_compile else 'Eager'}"
+            )
            print(
                f"# Name: {test_case.test_config.test_name}\n# Input: {test_case.test_config.input_config}"
            )
@ -250,25 +259,33 @@ class BenchmarkRunner:
            if self.num_runs > 1:
                for run in range(self.num_runs):
                    print(
-                        f"Run: {run}, {mode} Execution Time (us) : {reported_run_time_us[run]:.3f}"
+                        f"Run: {run}, {mode} Execution Time (us) : {results['reported_run_time_us'][run]:.3f}"
                    )
                print()
            else:
-                print(f"{mode} Execution Time (us) : {reported_run_time_us[0]:.3f}\n")
+                print(
+                    f"{mode} Execution Time (us) : {results['reported_run_time_us'][0]:.3f}"
+                )
+                print(f"Peak Memory (KB) : {results['peak_memory']}\n")

-    def _perf_result_to_dict(self, reported_run_time_us, test_case):
+    def _perf_result_to_dict(self, results, test_case):
        """This function is the parallel of _print_perf_result, which instead of
        writing information to terminal, returns a dictionary.
        """
        if self.args.report_aibench:
            return {}
+
        out = {
            "test_name": test_case.test_config.test_name,
            "input_config": test_case.test_config.input_config,
-            "mode": "JIT" if self.use_jit else "Eager",
+            "runtime": (
+                "JIT" if self.use_jit else "Compile" if self.use_compile else "Eager"
+            ),
            "run": "Backward" if test_case.test_config.run_backward else "Forward",
-            "latency": round(reported_run_time_us[0], 3),
+            "latency": round(results["reported_run_time_us"][0], 3),
            "latency unit": "us",
+            "peak memory": results["peak_memory"],
+            "memory unit": "KB",
        }

        # parsing test_case.test_config.input_config, adding it as entries to the 'out' dictionary
@ -330,10 +347,26 @@ class BenchmarkRunner:
        func = test_case.run_forward
        if self.use_jit:
            func = test_case.run_jit_forward
-        forward_time = timeit.timeit(
-            functools.partial(func, iters, print_per_iter, cuda_sync), number=1
+        if self.use_compile:
+            func = test_case.run_compile_forward
+
+        if not cuda_sync:
+            forward_time = timeit.timeit(
+                functools.partial(func, iters, print_per_iter, cuda_sync), number=1
+            )
+            return forward_time
+        # Stable timing with Timer
+        timer = Timer(
+            stmt="func(iters, print_per_iter, cuda_sync)",
+            globals={
+                "func": func,
+                "iters": iters,
+                "print_per_iter": print_per_iter,
+                "cuda_sync": cuda_sync,
+            },
        )
-        return forward_time
+        result = timer.adaptive_autorange(min_run_time=0.0001)
+        return result.median * iters

    def _launch_backward(self, test_case, iters, print_per_iter=False):
        """This function runs forward path of an op to get an output. Then the backward path is executed
@ -346,7 +379,7 @@ class BenchmarkRunner:
        )
        return backward_time

-    def _measure_time(self, launch_test, test_case, iters, print_per_iter):
+    def _measure_metrics(self, launch_test, test_case, iters, print_per_iter):
        """
        This function execute the operator for <iters> iterations then look at the time.
        If it's not significant, the number of iterations will be increased before rerun.
@ -354,8 +387,25 @@ class BenchmarkRunner:
        """
        curr_test_total_time = 0
        time_trace = []
+        peak_memory = 0
+        input_values = test_case.op_bench.inputs.values()
+        device, device_module = None, None
+        if input_values and isinstance(next(iter(input_values)), torch.Tensor):
+            # The device and device module information are crucial for memory metric calculation,
+            # In case of ops where inputs are integers (not tensor), memory metrics need not be calculated.
+            sample_input = next(iter(input_values))
+            device = sample_input.device
+            device_module = torch.get_device_module(device.type)
+        # TODO: add support for cpu memory measurement
        while True:
+            if hasattr(device_module, "reset_peak_memory_stats"):
+                device_module.reset_peak_memory_stats(device)
            run_time_sec = launch_test(test_case, iters, print_per_iter)
+            if hasattr(device_module, "synchronize"):
+                device_module.synchronize(device)
+            # Memory measurement process
+            if hasattr(device_module, "max_memory_allocated"):
+                peak_memory = device_module.max_memory_allocated(device)
            curr_test_total_time += run_time_sec
            # Analyze time after each run to decide if the result is stable
            results_are_significant = self._iteration_result_is_significant(
@ -369,7 +419,13 @@ class BenchmarkRunner:
            time_trace.append(report_run_time)
            # Print out the time spent in each epoch in ms
            if self.args.report_aibench:
-                mode = "JIT" if self.use_jit else "Eager"
+                mode = (
+                    "JIT"
+                    if self.use_jit
+                    else "Compile"
+                    if self.use_compile
+                    else "Eager"
+                )
                test_name = "_".join(
                    [test_case.framework, test_case.test_config.test_name, mode]
                )
@ -381,7 +437,7 @@ class BenchmarkRunner:
                            "metric": "latency",
                            "unit": "ms",
                            "value": str(report_run_time / 1e3),
-                        }
+                        },
                    )
                )
            if results_are_significant:
@ -391,7 +447,7 @@ class BenchmarkRunner:
            # iteration count, and run the benchmark again...
            iters = self._predict_num_iter_needed(iters)
        reported_run_time_us = np.percentile(np.array(time_trace), 50)
-        return reported_run_time_us
+        return reported_run_time_us, peak_memory / 1024

    def _check_keep(self, test_flag, cmd_flag):
        return cmd_flag is None or test_flag == cmd_flag
@ -478,6 +534,7 @@ class BenchmarkRunner:
        self,
        perf_list,
        output_file,
+        benchmark_name="PyTorch operator benchmark",
    ):
        """
        Write the result into JSON format, so that it can be uploaded to the benchmark database
@ -495,8 +552,10 @@ class BenchmarkRunner:
            input_config = perf_item.get("input_config", "")
            run_type = perf_item.get("run")
            latency = perf_item.get("latency", 0)
-
-            dtype = "float32"  # default
+            peak_memory = perf_item.get("peak memory", 0)
+            device = perf_item.get("device", "unknown")
+            dtype = perf_item.get("dtype", "torch.float").split(".")[1]
+            runtime = perf_item.get("runtime", None)

            # Extract mode based on run_type
            mode = None
@ -505,6 +564,22 @@ class BenchmarkRunner:
            elif run_type == "Backward":
                mode = "training"

+            # Extract use_compile from it
+            if runtime == "Compile":
+                use_compile = True
+            elif runtime == "Eager":
+                use_compile = False
+            else:
+                use_compile = None
+
+            device_arch = (
+                torch.cuda.get_device_name(0)
+                if device == "cuda"
+                else platform.processor()
+                if device == "cpu"
+                else "unknown"
+            )
+
            # Create the record
            @dataclass
            class BenchmarkInfo:
@ -532,12 +607,18 @@ class BenchmarkRunner:
                model: ModelInfo
                metric: MetricInfo

-            record = BenchmarkRecord(
+            # Add record for latency
+            record_latency = BenchmarkRecord(
                benchmark=BenchmarkInfo(
-                    name="PyTorch operator benchmark",
+                    name=benchmark_name,
                    mode=mode,
                    dtype=dtype,
-                    extra_info={"input_config": input_config},
+                    extra_info={
+                        "input_config": input_config,
+                        "device": device,
+                        "arch": device_arch,
+                        "use_compile": use_compile,
+                    },
                ),
                model=ModelInfo(
                    name=test_name, type="micro-benchmark", origins=["pytorch"]
@ -549,8 +630,17 @@ class BenchmarkRunner:
                    target_value=None,
                ),
            )
+            records.append(asdict(record_latency))

-            records.append(asdict(record))
+            # Add record for peak memory
+            record_memory = copy.deepcopy(record_latency)
+            record_memory.metric = MetricInfo(
+                name="peak memory",
+                unit="KB",
+                benchmark_values=[peak_memory],
+                target_value=None,
+            )
+            records.append(asdict(record_memory))

        # Write all records to the output file
        with open(output_file, "w", encoding="utf-8") as f:
@ -566,6 +656,7 @@ class BenchmarkRunner:
            "tag",
            "run_backward",
            "Execution Time",
+            "Peak Memory (KB)",
        ]

        if self.args.output_json or self.args.output_json_for_dashboard:
@ -603,13 +694,16 @@ class BenchmarkRunner:
                    test_case, self.args.warmup_iterations, print_per_iter=False
                )
                # Actual Execution
-                reported_time = [
-                    self._measure_time(
+                results = [
+                    self._measure_metrics(
                        launch_func, test_case, self.iters, self.print_per_iter
                    )
                    for _ in range(self.num_runs)
                ]
-                self._print_perf_result(reported_time, test_case)
+                result_dict = dict()
+                result_dict["reported_run_time_us"] = [r[0] for r in results]
+                result_dict["peak_memory"] = results[0][1]
+                self._print_perf_result(results=result_dict, test_case=test_case)

                # output results to csv
                self._output_csv(
@ -625,16 +719,17 @@ class BenchmarkRunner:
                        ),
                        test_case.test_config.tag,
                        test_case.test_config.run_backward,
-                        reported_time[0],
+                        result_dict["reported_run_time_us"][0],
+                        result_dict["peak_memory"],
                    ],
                )
                if self.args.output_json or self.args.output_json_for_dashboard:
-                    perf_list.append(
-                        self._perf_result_to_dict(reported_time, test_case)
-                    )
+                    perf_list.append(self._perf_result_to_dict(result_dict, test_case))

        if self.args.output_json_for_dashboard:
-            self._output_json(perf_list, self.args.output_json_for_dashboard)
+            self._output_json(
+                perf_list, self.args.output_json_for_dashboard, self.args.benchmark_name
+            )

        if self.args.output_json:
            with open(self.args.output_json, "w") as f:
--- a/benchmarks/operator_benchmark/benchmark_pytorch.py
+++ b/benchmarks/operator_benchmark/benchmark_pytorch.py
@ -4,6 +4,15 @@ import time
 import torch


+# Import the C++ extension to register the _consume operator
+try:
+    import benchmark_cpp_extension  # noqa: F401
+except ImportError as err:
+    # If the extension isn't built, the script must raise an error
+    raise ImportError(
+        "Failed to import C++ extension, please build it using \ncd pt_extension \npython -m pip install ."
+    ) from err
+
 """PyTorch performance microbenchmarks.

 This module contains PyTorch-specific functionalities for performance
@ -71,6 +80,16 @@ class TorchBenchmarkBase(torch.nn.Module):
        for _ in range(iters):
            torch.ops.operator_benchmark._consume(self.forward_impl())

+    def forward_impl_eager(self):
+        # This is to supply the inputs to the forward function which
+        # will be called in both the eager and compile mode of local runs
+        return self.forward(*self.get_inputs())
+
+    def forward_consume_eager(self, iters: int):
+        # Eager version of forward_consume without decorators (compilation handled by torch.compile)
+        for _ in range(iters):
+            torch.ops.operator_benchmark._consume(self.forward_impl_eager())
+
    def module_name(self):
        """this is used to label the operator being benchmarked"""
        if self.user_given_name:
@ -117,18 +136,34 @@ class PyTorchOperatorTestCase:
        self.framework = "PyTorch"
        self.time_series = []
        self._jit_forward_graph = None
+        self._compile_forward_graph = None

    def _generate_jit_forward_graph(self):
        """generate a graph for the forward function via scripting"""
        scripted_op_bench = torch.jit.script(self.op_bench)
        return scripted_op_bench.forward_consume

+    def _generate_compile_forward_graph(self):
+        """generate a compiled graph for the forward function via torch.compile"""
+        compiled_forward_consume = torch.compile(
+            self.op_bench.forward_consume_eager, backend="inductor"
+        )
+        return compiled_forward_consume
+
    def run_jit_forward(self, num_runs, print_per_iter=False, cuda_sync=False):
        """Run the forward path of an op with JIT mode"""
        if self._jit_forward_graph is None:
            self._jit_forward_graph = self._generate_jit_forward_graph()
        self._jit_forward_graph(num_runs)

+    def run_compile_forward(self, num_runs, print_per_iter=False, cuda_sync=False):
+        """Run the forward path of an op with compile mode"""
+        if self._compile_forward_graph is None:
+            self._compile_forward_graph = self._generate_compile_forward_graph()
+        self._compile_forward_graph(num_runs)
+        if cuda_sync:
+            torch.cuda.synchronize(torch.cuda.current_device())
+
    def _print_per_iter(self):
        # print last 50 values
        length = min(len(self.time_series), 50)
@ -150,14 +185,14 @@ class PyTorchOperatorTestCase:
        if print_per_iter:
            for _ in range(num_runs):
                start_time = time.time()
-                self.output = self.op_bench.forward_impl()
+                self.output = self.op_bench.forward_impl_eager()
                if cuda_sync:
                    torch.cuda.synchronize(torch.cuda.current_device())
                end_time = time.time()
                self.time_series.append((end_time - start_time) * 1e3)
        else:
            for _ in range(num_runs):
-                self.output = self.op_bench.forward_impl()
+                self.output = self.op_bench.forward_impl_eager()
            if cuda_sync:
                torch.cuda.synchronize(torch.cuda.current_device())

--- a/benchmarks/operator_benchmark/benchmark_runner.py
+++ b/benchmarks/operator_benchmark/benchmark_runner.py
@ -62,6 +62,13 @@ def parse_args():
        default=None,
    )

+    parser.add_argument(
+        "--benchmark-name",
+        "--benchmark_name",
+        help="Name of the benchmark to store results to",
+        default="PyTorch operator benchmark",
+    )
+
    parser.add_argument(
        "--list-tests",
        "--list_tests",
@ -135,6 +142,16 @@ def parse_args():
        help="Run operators with PyTorch JIT mode",
    )

+    parser.add_argument(
+        "--use-compile",
+        "--use_compile",
+        type=benchmark_utils.str2bool,
+        nargs="?",
+        const=True,
+        default=False,
+        help="Run operators with PyTorch Compile mode",
+    )
+
    parser.add_argument(
        "--forward-only",
        "--forward_only",
@ -162,7 +179,7 @@ def parse_args():
        "--output-json-for-dashboard",
        "--output_json_for_dashboard",
        help="Save results in JSON format for display on the OSS dashboard",
-        default="False",
+        default="benchmark-results.json",
    )

    args, _ = parser.parse_known_args()
--- a/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv
+++ b/benchmarks/operator_benchmark/expected_ci_operator_benchmark_eager_float32_cpu.csv
@ -1,5 +1,5 @@
 Benchmarking Framework,Benchmarking Module Name,Case Name,tag,run_backward,Execution Time
-PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,3.9497
+PyTorch,add,add_M1_N1_K1_cpu,short,FALSE,2.459
 PyTorch,add,add_M64_N64_K64_cpu,short,FALSE,14.3181
 PyTorch,add,add_M64_N64_K128_cpu,short,FALSE,14.6826
 PyTorch,add,add_M1_N1_K1_cpu_bwdall_BACKWARD,short,TRUE,58.1449
@ -376,10 +376,10 @@ PyTorch,relu6,"relu6_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",sho
 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,9.6588
 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,9.5969
 PyTorch,relu6,"relu6_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,9.547
-PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,68.739
+PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.21375
 PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,45.14133333
 PyTorch,relu6,"relu6_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,52.6664
-PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,69.1875
+PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,51.49525
 PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,48.3458
 PyTorch,relu6,"relu6_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,62.0719
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,7.5728
@ -388,10 +388,10 @@ PyTorch,functional.hardtanh,"functional.hardtanh_dims(3,4,5)_contigFalse_inplace
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,8.1647
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,8.1768
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(2,3,4,5)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,8.0619
-PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.118
+PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,48.88475
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,43.702
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(512,512)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,50.3613
-PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,67.436
+PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,50.3995
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint8",short,FALSE,46.9813
 PyTorch,functional.hardtanh,"functional.hardtanh_dims(256,1024)_contigFalse_inplaceFalse_dtypetorch.qint32",short,FALSE,59.2295
 PyTorch,functional.hardsigmoid,"functional.hardsigmoid_dims(3,4,5)_contigFalse_inplaceFalse_dtypetorch.quint8",short,FALSE,6.5189
@ -1316,4 +1316,4 @@ PyTorch,where,"where_cond_shape(8,16,1)_input_shape(1,)_other_shape(1,)_cpu_dtyp
 PyTorch,where,"where_cond_shape(8,16,1)_input_shape(16,1)_other_shape(8,16,1)_cpu_dtypetorch.float32",short,FALSE,5.763
 PyTorch,where,"where_cond_shape(8,16,1)_input_shape(8,1,1)_other_shape(1,)_cpu_dtypetorch.float32",short,FALSE,5.744666667
 PyTorch,clamp,clamp_M512_N512_cpu,short,FALSE,15.26233333
-PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667
+PyTorch,gelu,gelu_M512_N512_cpu,short,FALSE,31.33166667
--- a/benchmarks/operator_benchmark/pt/add_test.py
+++ b/benchmarks/operator_benchmark/pt/add_test.py
@ -52,27 +52,6 @@ class AddBenchmark(op_bench.TorchBenchmarkBase):
 op_bench.generate_pt_test(add_long_configs + add_short_configs, AddBenchmark)
 op_bench.generate_pt_gradient_test(add_long_configs + add_short_configs, AddBenchmark)

-
-"""Mircobenchmark for addmm operator."""
-
-
-class AddmmBenchmark(op_bench.TorchBenchmarkBase):
-    def init(self, M, N, K, device):
-        self.inputs = {
-            "input_one": torch.rand(M, K, device=device, requires_grad=self.auto_set()),
-            "mat1": torch.rand(M, N, device=device, requires_grad=self.auto_set()),
-            "mat2": torch.rand(N, K, device=device, requires_grad=self.auto_set()),
-        }
-        self.set_module_name("addmm")
-
-    def forward(self, input_one, mat1, mat2):
-        return torch.addmm(input_one, mat1, mat2)
-
-
-op_bench.generate_pt_test(add_long_configs + add_short_configs, AddmmBenchmark)
-op_bench.generate_pt_gradient_test(add_long_configs + add_short_configs, AddmmBenchmark)
-
-
 """Mircobenchmark for addr operator."""


@ -106,46 +85,5 @@ addr_configs = op_bench.cross_product_configs(
 op_bench.generate_pt_test(addr_configs, AddrBenchmark)
 op_bench.generate_pt_gradient_test(addr_configs, AddrBenchmark)

-
-"""Mircobenchmark for addbmm operator."""
-
-
-class AddbmmBenchmark(op_bench.TorchBenchmarkBase):
-    def init(self, B, M, N, K, device):
-        self.inputs = {
-            "input_one": torch.rand(
-                (M, N), device=device, requires_grad=self.auto_set()
-            ),
-            "batch1": torch.rand(
-                (B, M, K), device=device, requires_grad=self.auto_set()
-            ),
-            "batch2": torch.rand(
-                (
-                    B,
-                    K,
-                    N,
-                ),
-                device=device,
-                requires_grad=self.auto_set(),
-            ),
-        }
-        self.set_module_name("addbmm")
-
-    def forward(self, input_one, batch1, batch2):
-        return torch.addbmm(input_one, batch1, batch2)
-
-
-addbmm_configs = op_bench.cross_product_configs(
-    B=[2, 100],
-    M=[8, 256],
-    N=[256, 16],
-    K=[15, 16],
-    device=["cpu", "cuda"],
-    tags=["addbmm"],
-)
-
-op_bench.generate_pt_test(addbmm_configs, AddbmmBenchmark)
-op_bench.generate_pt_gradient_test(addbmm_configs, AddbmmBenchmark)
-
 if __name__ == "__main__":
    op_bench.benchmark_runner.main()
--- a/benchmarks/operator_benchmark/pt/addmm_test.py
+++ b/benchmarks/operator_benchmark/pt/addmm_test.py
@ -0,0 +1,115 @@
+import operator_benchmark as op_bench
+
+import torch
+
+
+"""Microbenchmarks for add_(matmul) operator. Supports both Caffe2/PyTorch."""
+
+# Configs for PT add operator
+addmm_long_configs = op_bench.cross_product_configs(
+    M=[256, 1024, 3000],
+    N=[512, 4096],
+    K=[512, 4096],
+    device=["cuda"],
+    tags=["long"],
+    dtype=[torch.float16, torch.bfloat16, torch.float32],
+)
+
+
+addmm_short_configs = op_bench.config_list(
+    attr_names=["M", "N", "K"],
+    attrs=[
+        [1, 1, 1],
+        [64, 64, 64],
+        [64, 64, 128],
+    ],
+    cross_product_configs={
+        "device": ["cpu", "cuda"],
+        "dtype": [torch.float],
+    },
+    tags=["short"],
+)
+
+
+"""Mircobenchmark for addmm operator."""
+
+
+class AddmmBenchmark(op_bench.TorchBenchmarkBase):
+    def init(self, M, N, K, device, dtype):
+        self.inputs = {
+            "input_one": torch.rand(
+                M, K, device=device, requires_grad=self.auto_set(), dtype=dtype
+            ),
+            "mat1": torch.rand(
+                M, N, device=device, requires_grad=self.auto_set(), dtype=dtype
+            ),
+            "mat2": torch.rand(
+                N, K, device=device, requires_grad=self.auto_set(), dtype=dtype
+            ),
+        }
+        self.set_module_name("addmm")
+
+    def forward(self, input_one, mat1, mat2):
+        return torch.addmm(input_one, mat1, mat2)
+
+
+op_bench.generate_pt_test(addmm_long_configs + addmm_long_configs, AddmmBenchmark)
+op_bench.generate_pt_gradient_test(
+    addmm_long_configs + addmm_long_configs, AddmmBenchmark
+)
+
+"""Mircobenchmark for addbmm operator."""
+
+
+class AddbmmBenchmark(op_bench.TorchBenchmarkBase):
+    def init(self, B, M, N, K, device, dtype):
+        self.inputs = {
+            "input_one": torch.rand(
+                (M, N), device=device, requires_grad=self.auto_set(), dtype=dtype
+            ),
+            "batch1": torch.rand(
+                (B, M, K), device=device, requires_grad=self.auto_set(), dtype=dtype
+            ),
+            "batch2": torch.rand(
+                (
+                    B,
+                    K,
+                    N,
+                ),
+                device=device,
+                requires_grad=self.auto_set(),
+                dtype=dtype,
+            ),
+        }
+        self.set_module_name("addbmm")
+
+    def forward(self, input_one, batch1, batch2):
+        return torch.addbmm(input_one, batch1, batch2)
+
+
+addbmm_long_configs = op_bench.cross_product_configs(
+    B=[8, 32],
+    M=[256, 1024],
+    N=[256, 1024],
+    K=[64, 128],
+    device=["cuda"],
+    dtype=[torch.float16, torch.bfloat16, torch.float32],
+    tags=["long"],
+)
+addbmm_short_configs = op_bench.cross_product_configs(
+    B=[1, 8],
+    M=[8, 128],
+    N=[32, 64],
+    K=[256, 512],
+    device=["cpu", "cuda"],
+    dtype=[torch.float16, torch.bfloat16, torch.float32],
+    tags=["short"],
+)
+
+op_bench.generate_pt_test(addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark)
+op_bench.generate_pt_gradient_test(
+    addbmm_long_configs + addbmm_short_configs, AddbmmBenchmark
+)
+
+if __name__ == "__main__":
+    op_bench.benchmark_runner.main()
--- a/benchmarks/operator_benchmark/pt/bmm_test.py
+++ b/benchmarks/operator_benchmark/pt/bmm_test.py
@ -27,12 +27,12 @@ batched_binary_configs_short = op_bench.config_list(
 )

 batched_binary_configs_long = op_bench.cross_product_configs(
-    B=[1, 128],
-    M=[8, 128],
-    N=[32, 64],
-    K=[4, 256],
-    device=["cpu", "cuda"],
-    dtype=[torch.float, torch.bfloat16],
+    B=[8, 32],
+    M=[256, 1024],
+    N=[256, 1024],
+    K=[64, 128],
+    device=["cuda"],
+    dtype=[torch.float32, torch.bfloat16, torch.float16],
    tags=["long"],
 )

@ -40,8 +40,12 @@ batched_binary_configs_long = op_bench.cross_product_configs(
 class BatchedBinaryOpBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, B, M, N, K, device, dtype, op_func):
        self.inputs = {
-            "batch1": torch.rand((B, M, N), device=device).to(dtype=dtype),
-            "batch2": torch.rand((B, N, K), device=device).to(dtype=dtype),
+            "batch1": torch.rand(
+                (B, M, N), device=device, dtype=dtype, requires_grad=self.auto_set()
+            ),
+            "batch2": torch.rand(
+                (B, N, K), device=device, dtype=dtype, requires_grad=self.auto_set()
+            ),
        }
        self.op_func = op_func

@ -54,6 +58,11 @@ op_bench.generate_pt_tests_from_op_list(
    batched_binary_configs_short + batched_binary_configs_long,
    BatchedBinaryOpBenchmark,
 )
+op_bench.generate_pt_gradient_tests_from_op_list(
+    batched_binary_ops,
+    batched_binary_configs_long,
+    BatchedBinaryOpBenchmark,
+)


 # batched ternary ops
@ -66,9 +75,15 @@ batched_ternary_ops = op_bench.op_list(
 class BatchedTernaryOpBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, B, M, N, K, device, dtype, op_func):
        self.inputs = {
-            "input_": torch.rand((B, M, K), device=device).to(dtype=dtype),
-            "batch1": torch.rand((B, M, N), device=device).to(dtype=dtype),
-            "batch2": torch.rand((B, N, K), device=device).to(dtype=dtype),
+            "input_": torch.rand(
+                (B, M, K), device=device, dtype=dtype, requires_grad=self.auto_set()
+            ),
+            "batch1": torch.rand(
+                (B, M, N), device=device, dtype=dtype, requires_grad=self.auto_set()
+            ),
+            "batch2": torch.rand(
+                (B, N, K), device=device, dtype=dtype, requires_grad=self.auto_set()
+            ),
        }
        self.op_func = op_func

@ -81,6 +96,12 @@ op_bench.generate_pt_tests_from_op_list(
    batched_binary_configs_short + batched_binary_configs_long,
    BatchedTernaryOpBenchmark,
 )
+op_bench.generate_pt_gradient_tests_from_op_list(
+    batched_ternary_ops,
+    batched_binary_configs_long,
+    BatchedTernaryOpBenchmark,
+)
+

 # TODO: does it automatically register new scripts?

--- a/benchmarks/operator_benchmark/pt/matmul_test.py
+++ b/benchmarks/operator_benchmark/pt/matmul_test.py
@ -13,33 +13,46 @@ mm_short_configs = op_bench.config_list(
        [128, 128, 128, True, False],
        [256, 256, 256, False, True],
    ],
-    cross_product_configs={
-        "device": ["cpu", "cuda"],
-    },
+    cross_product_configs={"device": ["cpu", "cuda"]},
    tags=["short"],
 )


 mm_long_configs = op_bench.cross_product_configs(
-    M=[32],
-    N=[512, 128],
-    K=[64],
+    M=[256, 1024, 3000],
+    N=[512, 4096],
+    K=[512, 4096],
    trans_a=[False, True],
    trans_b=[True, False],
-    device=["cpu", "cuda"],
+    device=["cuda"],
+    dtype=[torch.float16, torch.bfloat16, torch.float32],
    tags=["long"],
 )


 class MatMulBenchmark(op_bench.TorchBenchmarkBase):
-    def init(self, M, N, K, trans_a, trans_b, device):
+    def init(self, M, N, K, trans_a, trans_b, device, dtype=torch.float):
+        # Create tensors without requires_grad first, then set it separately
+        # This avoids creating graph leaves that cannot be deep copied
+        if trans_a:
+            input_one = torch.rand(M, N, device=device, dtype=dtype)
+        else:
+            input_one = torch.rand(N, M, device=device, dtype=dtype).t()
+
+        if trans_b:
+            input_two = torch.rand(N, K, device=device, dtype=dtype)
+        else:
+            input_two = torch.rand(K, N, device=device, dtype=dtype).t()
+
+        # Set requires_grad after tensor creation to avoid graph leaf issues
+        if self.auto_set():
+            input_one.requires_grad_(True)
+        if self.auto_set():
+            input_two.requires_grad_(True)
+
        self.inputs = {
-            "input_one": torch.rand(M, N, device=device)
-            if trans_a
-            else torch.rand(N, M, device=device).t(),
-            "input_two": torch.rand(N, K, device=device)
-            if trans_b
-            else torch.rand(K, N, device=device).t(),
+            "input_one": input_one,
+            "input_two": input_two,
        }
        self.set_module_name("matmul")

@ -48,6 +61,7 @@ class MatMulBenchmark(op_bench.TorchBenchmarkBase):


 op_bench.generate_pt_test(mm_long_configs + mm_short_configs, MatMulBenchmark)
+op_bench.generate_pt_gradient_test(mm_long_configs, MatMulBenchmark)


 if __name__ == "__main__":
--- a/benchmarks/operator_benchmark/pt/mm_test.py
+++ b/benchmarks/operator_benchmark/pt/mm_test.py
@ -23,11 +23,11 @@ mm_short_configs = op_bench.config_list(
 )

 mm_long_configs = op_bench.cross_product_configs(
-    M=[8, 128],
-    N=[32, 64],
-    K=[256, 512],
-    device=["cpu", "cuda"],
-    dtype=[torch.float, torch.bfloat16],
+    M=[256, 1024, 3000],
+    N=[512, 4096],
+    K=[512, 4096],
+    device=["cuda"],
+    dtype=[torch.float16, torch.bfloat16, torch.float32],
    tags=["long"],
 )

@ -35,8 +35,12 @@ mm_long_configs = op_bench.cross_product_configs(
 class MmOpBenchmark(op_bench.TorchBenchmarkBase):
    def init(self, M, N, K, device, dtype, op_func):
        self.inputs = {
-            "input_one": torch.randn(M, N, device=device).to(dtype=dtype),
-            "input_two": torch.randn(N, K, device=device).to(dtype=dtype),
+            "input_one": torch.randn(
+                M, N, device=device, requires_grad=self.auto_set(), dtype=dtype
+            ),
+            "input_two": torch.randn(
+                N, K, device=device, requires_grad=self.auto_set(), dtype=dtype
+            ),
        }
        self.op_func = op_func

@ -47,6 +51,9 @@ class MmOpBenchmark(op_bench.TorchBenchmarkBase):
 op_bench.generate_pt_tests_from_op_list(
    ops_list, mm_short_configs + mm_long_configs, MmOpBenchmark
 )
+op_bench.generate_pt_gradient_tests_from_op_list(
+    ops_list, mm_long_configs, MmOpBenchmark
+)


 if __name__ == "__main__":
--- a/buckbuild.bzl
+++ b/buckbuild.bzl
@ -391,6 +391,8 @@ def get_aten_generated_files(enabled_backends):
        "CompositeExplicitAutogradFunctions_inl.h",
        "CompositeExplicitAutogradNonFunctionalFunctions.h",
        "CompositeExplicitAutogradNonFunctionalFunctions_inl.h",
+        "ViewMetaClasses.h",
+        "ViewMetaClasses.cpp",
        "VmapGeneratedPlumbing.h",
        "core/ATenOpList.cpp",
        "core/TensorBody.h",
@ -1192,6 +1194,7 @@ def define_buck_targets(
            "NativeMetaFunctions.h": ":gen_aten[NativeMetaFunctions.h]",
            "Operators.h": ":gen_aten[Operators.h]",
            "RedispatchFunctions.h": ":gen_aten[RedispatchFunctions.h]",
+            "ViewMetaClasses.h": ":gen_aten[ViewMetaClasses.h]",
            "core/TensorBody.h": ":gen_aten[core/TensorBody.h]",
            "core/aten_interned_strings.h": ":gen_aten[core/aten_interned_strings.h]",
            "core/enum_tag.h": ":gen_aten[core/enum_tag.h]",
--- a/build.bzl
+++ b/build.bzl
@ -118,6 +118,9 @@ def define_targets(rules):
            ":LazyNonNativeIr.h",
            ":RegisterDispatchDefinitions.ini",
            ":RegisterDispatchKey.cpp",
+            ":ViewMetaClassesPythonBinding.cpp",
+            ":ViewMetaClasses.cpp",
+            ":ViewMetaClasses.h",
            ":native_functions.yaml",
            ":shape_inference.h",
            ":tags.yaml",
@ -170,6 +173,7 @@ GENERATED_H = [
    "FunctionalInverses.h",
    "RedispatchFunctions.h",
    "RegistrationDeclarations.h",
+    "ViewMetaClasses.h",
    "VmapGeneratedPlumbing.h",
 ]

@ -246,6 +250,7 @@ GENERATED_CPP = [
    "RegisterFunctionalization_1.cpp",
    "RegisterFunctionalization_2.cpp",
    "RegisterFunctionalization_3.cpp",
+    "ViewMetaClasses.cpp",
 ]

 GENERATED_CPP_CORE = [
@ -307,6 +312,7 @@ _GENERATED_AUTOGRAD_PYTHON_CPP = [
    "torch/csrc/autograd/generated/python_torch_functions_1.cpp",
    "torch/csrc/autograd/generated/python_torch_functions_2.cpp",
    "torch/csrc/autograd/generated/python_variable_methods.cpp",
+    "torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp"
 ]

 GENERATED_AUTOGRAD_PYTHON = _GENERATED_AUTOGRAD_PYTHON_HEADERS + _GENERATED_AUTOGRAD_PYTHON_CPP
--- a/build_variables.bzl
+++ b/build_variables.bzl
@ -1007,6 +1007,7 @@ libtorch_python_core_sources = [
    "torch/csrc/utils/disable_torch_function.cpp",
    "torch/csrc/utils/verbose.cpp",
    "torch/csrc/cpu/Module.cpp",
+    "torch/csrc/functionalization/Module.cpp",
    "torch/csrc/instruction_counter/Module.cpp",
    "torch/nativert/python/Bindings.cpp",
 ] + lazy_tensor_core_python_sources
@ -1049,6 +1050,7 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"):
        "torch/csrc/autograd/generated/python_torch_functions_1.cpp",
        "torch/csrc/autograd/generated/python_torch_functions_2.cpp",
        "torch/csrc/autograd/generated/python_variable_methods.cpp",
+        "torch/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp",
    ]]

    _libtorch_python_sources.extend(libtorch_python_core_sources)
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@ -316,6 +316,7 @@ set(GENERATED_CXX_PYTHON
  "${TORCH_SRC_DIR}/csrc/autograd/generated/python_special_functions.cpp"
  "${TORCH_SRC_DIR}/csrc/autograd/generated/python_return_types.cpp"
  "${TORCH_SRC_DIR}/csrc/autograd/generated/python_enum_tag.cpp"
+  "${TORCH_SRC_DIR}/csrc/functionalization/generated/ViewMetaClassesPythonBinding.cpp"
  )

 set(GENERATED_H_PYTHON
@ -379,6 +380,9 @@ add_custom_command(
    "${TORCH_ROOT}/aten/src/ATen/templates/LazyIr.h"
    "${TORCH_ROOT}/aten/src/ATen/templates/LazyNonNativeIr.h"
    "${TORCH_ROOT}/aten/src/ATen/templates/RegisterDispatchKey.cpp"
+    "${TORCH_ROOT}/aten/src/ATen/templates/ViewMetaClasses.h"
+    "${TORCH_ROOT}/aten/src/ATen/templates/ViewMetaClasses.cpp"
+    "${TORCH_ROOT}/aten/src/ATen/templates/ViewMetaClassesPythonBinding.cpp"
    ${autograd_python}
    ${autograd_yaml}
    ${autograd_templates}
--- a/docs/cpp/source/conf.py
+++ b/docs/cpp/source/conf.py
@ -40,7 +40,34 @@ extensions = [
    "sphinx.ext.intersphinx",
 ] + (["breathe", "exhale"] if run_doxygen else [])

-intersphinx_mapping = {"pytorch": ("https://pytorch.org/docs/main", None)}
+intersphinx_mapping = {"pytorch": ("https://docs.pytorch.org/docs/main", None)}
+
+# Configure Sphinx warnings and error handling
+suppress_warnings = [
+    "ref.citation",
+    "ref.footnote",
+    "ref.doc",
+    "toc.excluded",
+    "toc.not_readable",
+    "misc.highlighting_failure",
+]
+
+# Configure Breathe
+breathe_show_define_initializer = True
+breathe_show_enumvalue_initializer = True
+breathe_default_members = ("members", "undoc-members")
+
+
+# Fix for Python 3.10+ compatibility with exhale 2.3.0
+# MutableMapping was moved from collections to collections.abc in Python 3.10
+try:
+    import collections
+    from collections.abc import MutableMapping
+
+    if not hasattr(collections, "MutableMapping"):
+        collections.MutableMapping = MutableMapping
+except ImportError:
+    pass

 # Setup absolute paths for communicating with breathe / exhale where
 # items are expected / should be trimmed by.
@ -101,6 +128,21 @@ exhale_args = {
        Welcome to the developer reference for the PyTorch C++ API.
    """
    ),
+    ############################################################################
+    # Duplicate handling and error management.                                 #
+    ############################################################################
+    # Note: Using Doxyfile instead of stdin configuration
+    # "exhaleDoxygenStdin" is not compatible with "exhaleUseDoxyfile"
+    # Handle unresolved references more gracefully
+    "unabridgedOrphanKinds": {
+        "function",
+        "define",
+        "enum",
+        "enumvalue",
+        "typedef",
+        "variable",
+    },
+    "fullToctreeMaxDepth": 2,
 }

 # Tell sphinx what the primary language being documented is.
@ -174,6 +216,7 @@ html_theme = "pytorch_sphinx_theme2"
 #
 html_theme_options = {
    "canonical_url": "https://pytorch.org/docs/stable/",
+    "analytics_id": "GTM-T8XT4PS",
    "collapse_navigation": False,
    "logo": {"text": "Home"},
    "icon_links": [
--- a/Show More
+++ b/Show More
 @ -1 +1 @@
 .5.0
 .5.1