Compare commits


6 Commits

Author SHA1 Message Date
47aaf03798 [dict] Support dict.update() with no args
ghstack-source-id: c36750bbc4a12c016986ba84ee25a549d96b4b52
Pull-Request: https://github.com/pytorch/pytorch/pull/158061
2025-10-14 12:28:27 -03:00
00e728c4dd [OrderedDict] Implement hasattr(..., IteratorVariable)
ghstack-source-id: 2c7221fd1cefe8f7009972e8a54570e48dcd8ed9
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155501
2025-10-14 12:28:27 -03:00
312bda8417 [dict] Implement dict subclass fromkeys classmethod
ghstack-source-id: d024c6b95f293642fb6be338002c0a991fe57620
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155608
2025-10-14 12:28:26 -03:00
ae89b899cb [OrderedDict] Add bool(OrderedDict)
ghstack-source-id: 8e0befb6c60da5df505106b51df25ead970e454a
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155503
2025-10-14 12:28:25 -03:00
4c722fe44e [OrderedDict] Set the correct dict class in UserDefinedDictVariable
ghstack-source-id: efbc483057c41b1cf3f2beb6f71a97cd00f75ac9
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155502
2025-10-14 12:28:25 -03:00
8345670234 [dict] Implement __eq__ for dict_items
ghstack-source-id: d5360cc0d97524f57bc88763955527ca4622f48c
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155154
2025-10-14 12:28:24 -03:00
2375 changed files with 30914 additions and 75865 deletions
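
Taken together, the six commits above extend Dynamo's modeling of `dict`/`OrderedDict`. As a rough sketch (illustrative only; the linked PRs carry the authoritative tests), these are the plain-Python behaviors they make traceable under `torch.compile`:

```python
from collections import OrderedDict

d = {"a": 1}
d.update()                                   # dict.update() with no args is a no-op
assert d == {"a": 1}

class MyDict(dict):
    pass

m = MyDict.fromkeys(["x", "y"], 0)           # fromkeys is a classmethod, so it
assert type(m) is MyDict                     # constructs the subclass

od = OrderedDict(a=1)
assert bool(od) and not bool(OrderedDict())  # bool(OrderedDict) reflects emptiness
assert hasattr(iter(od), "__next__")         # hasattr on a keys iterator

assert {"a": 1}.items() == {"a": 1}.items()  # dict_items implements __eq__
```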

View File

@@ -13,4 +13,3 @@ exclude:
 - "**/benchmarks/**"
 - "**/test_*.py"
 - "**/*_test.py"
-- "tools/**"

View File

@@ -113,7 +113,6 @@ case "$tag" in
 UCX_COMMIT=${_UCX_COMMIT}
 UCC_COMMIT=${_UCC_COMMIT}
 TRITON=yes
-INSTALL_MINGW=yes
 ;;
 pytorch-linux-jammy-cuda13.0-cudnn9-py3-gcc11)
 CUDA_VERSION=13.0.0
@@ -195,16 +194,13 @@ case "$tag" in
 NINJA_VERSION=1.9.0
 TRITON=yes
 ;;
-pytorch-linux-jammy-xpu-n-py3 | pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks)
+pytorch-linux-jammy-xpu-n-py3)
 ANACONDA_PYTHON_VERSION=3.10
 GCC_VERSION=11
 VISION=yes
 XPU_VERSION=2025.2
 NINJA_VERSION=1.9.0
 TRITON=yes
-if [[ $tag =~ "benchmarks" ]]; then
-INDUCTOR_BENCHMARKS=yes
-fi
 ;;
 pytorch-linux-jammy-py3-gcc11-inductor-benchmarks)
 ANACONDA_PYTHON_VERSION=3.10
@@ -365,7 +361,6 @@ docker build \
 --build-arg "OPENBLAS=${OPENBLAS:-}" \
 --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \
 --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \
---build-arg "INSTALL_MINGW=${INSTALL_MINGW:-}" \
 -f $(dirname ${DOCKERFILE})/Dockerfile \
 -t "$tmp_tag" \
 "$@" \

View File

@@ -3,7 +3,7 @@
 set -eux
-ACL_VERSION=${ACL_VERSION:-"v52.6.0"}
+ACL_VERSION=${ACL_VERSION:-"v25.02"}
 ACL_INSTALL_DIR="/acl"
 # Clone ACL

View File

@@ -49,20 +49,12 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then
 export SYSROOT_DEP="sysroot_linux-64=2.17"
 fi
-# Install correct Python version
-# Also ensure sysroot is using a modern GLIBC to match system compilers
-if [ "$ANACONDA_PYTHON_VERSION" = "3.14" ]; then
-as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
-python="3.14.0" \
-${SYSROOT_DEP} \
--c conda-forge
-else
 # Install correct Python version
 # Also ensure sysroot is using a modern GLIBC to match system compilers
 as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\
 python="$ANACONDA_PYTHON_VERSION" \
 ${SYSROOT_DEP}
-fi
 # libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30
 # which is provided in libstdcxx 12 and up.
 conda_install libstdcxx-ng=12.3.0 --update-deps -c conda-forge

View File

@@ -83,6 +83,10 @@ function build_cpython {
 py_suffix=${py_ver::-1}
 py_folder=$py_suffix
 fi
+# Update to rc2 due to https://github.com/python/cpython/commit/c72699086fe4
+if [ "$py_suffix" == "3.14.0" ]; then
+py_suffix="3.14.0rc2"
+fi
 wget -q $PYTHON_DOWNLOAD_URL/$py_folder/Python-$py_suffix.tgz -O Python-$py_ver.tgz
 do_cpython_build $py_ver Python-$py_suffix

View File

@@ -10,7 +10,7 @@ else
 arch_path='sbsa'
 fi
-NVSHMEM_VERSION=3.4.5
+NVSHMEM_VERSION=3.3.24
 function install_cuda {
 version=$1
@@ -150,7 +150,7 @@ function install_130 {
 CUDNN_VERSION=9.13.0.50
 echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
 # install CUDA 13.0 in the same container
-install_cuda 13.0.2 cuda_13.0.2_580.95.05_linux
+install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
 # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
 install_cudnn 13 $CUDNN_VERSION

View File

@@ -1,10 +0,0 @@
#!/bin/bash
set -ex
# Install MinGW-w64 for Windows cross-compilation
apt-get update
apt-get install -y g++-mingw-w64-x86-64-posix
echo "MinGW-w64 installed successfully"
x86_64-w64-mingw32-g++ --version

View File

@@ -19,8 +19,8 @@ pip_install \
 transformers==4.36.2
 pip_install coloredlogs packaging
-pip_install onnxruntime==1.23.1
-pip_install onnxscript==0.5.4
+pip_install onnxruntime==1.23.0
+pip_install onnxscript==0.5.3
 # Cache the transformers model to be used later by ONNX tests. We need to run the transformers
 # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/

View File

@@ -40,7 +40,11 @@ EOF
 # Default url values
 rocm_baseurl="http://repo.radeon.com/rocm/apt/${ROCM_VERSION}"
+amdgpu_baseurl="https://repo.radeon.com/amdgpu/${ROCM_VERSION}/ubuntu"
+# Add amdgpu repository
 UBUNTU_VERSION_NAME=`cat /etc/os-release | grep UBUNTU_CODENAME | awk -F= '{print $2}'`
+echo "deb [arch=amd64] ${amdgpu_baseurl} ${UBUNTU_VERSION_NAME} main" > /etc/apt/sources.list.d/amdgpu.list
 # Add rocm repository
 wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -

View File

@@ -12,8 +12,8 @@ function do_install() {
 rocm_version_nodot=${rocm_version//./}
-# post merge of https://github.com/icl-utk-edu/magma/pull/65
-MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
+# https://github.com/icl-utk-edu/magma/pull/65
+MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
 magma_archive="magma-rocm${rocm_version_nodot}-${MAGMA_VERSION}-1.tar.bz2"
 rocm_dir="/opt/rocm"

View File

@@ -39,13 +39,9 @@ case ${DOCKER_TAG_PREFIX} in
 DOCKER_GPU_BUILD_ARG=""
 ;;
 rocm*)
-# we want the patch version of 7.0 instead
-if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
-fi
 # we want the patch version of 6.4 instead
 if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
-GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4"
+GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
 fi
 BASE_TARGET=rocm
 GPU_IMAGE=rocm/dev-ubuntu-22.04:${GPU_ARCH_VERSION}-complete

View File

@@ -149,7 +149,7 @@ FROM cpu_final as rocm_final
 ARG ROCM_VERSION=6.0
 ARG PYTORCH_ROCM_ARCH
 ENV PYTORCH_ROCM_ARCH ${PYTORCH_ROCM_ARCH}
-ARG DEVTOOLSET_VERSION=13
+ARG DEVTOOLSET_VERSION=11
 ENV LDFLAGS="-Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib64 -Wl,-rpath=/opt/rh/gcc-toolset-${DEVTOOLSET_VERSION}/root/usr/lib"
 # Somewhere in ROCm stack, we still use non-existing /opt/rocm/hip path,
 # below workaround helps avoid error

View File

@@ -75,13 +75,9 @@ case ${image} in
 DOCKERFILE_SUFFIX="_cuda_aarch64"
 ;;
 manylinux2_28-builder:rocm*)
-# we want the patch version of 7.0 instead
-if [[ "$GPU_ARCH_VERSION" == *"7.0"* ]]; then
-GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
-fi
 # we want the patch version of 6.4 instead
 if [[ "$GPU_ARCH_VERSION" == *"6.4"* ]]; then
-GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.4"
+GPU_ARCH_VERSION="${GPU_ARCH_VERSION}.2"
 fi
 TARGET=rocm_final
 MANY_LINUX_VERSION="2_28"
@@ -97,7 +93,7 @@ case ${image} in
 manylinux2_28-builder:xpu)
 TARGET=xpu_final
 GPU_IMAGE=amd64/almalinux:8
-DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=13"
+DOCKER_GPU_BUILD_ARG=" --build-arg DEVTOOLSET_VERSION=11"
 MANY_LINUX_VERSION="2_28"
 ;;
 *)

View File

@@ -138,12 +138,10 @@ numba==0.60.0 ; python_version == "3.12" and platform_machine != "s390x"
 #test_binary_ufuncs.py
 numpy==1.22.4; python_version == "3.10"
 numpy==1.26.2; python_version == "3.11" or python_version == "3.12"
-numpy==2.1.2; python_version >= "3.13" and python_version < "3.14"
-numpy==2.3.4; python_version >= "3.14"
+numpy==2.1.2; python_version >= "3.13"
 pandas==2.0.3; python_version < "3.13"
-pandas==2.2.3; python_version >= "3.13" and python_version < "3.14"
-pandas==2.3.3; python_version >= "3.14"
+pandas==2.2.3; python_version >= "3.13"
 #onnxruntime
 #Description: scoring engine for Open Neural Network Exchange (ONNX) models
@@ -155,8 +153,7 @@ opt-einsum==3.3
 #Pinned versions: 3.3
 #test that import: test_linalg.py
-optree==0.13.0 ; python_version < "3.14"
-optree==0.17.0 ; python_version >= "3.14"
+optree==0.13.0
 #Description: A library for tree manipulation
 #Pinned versions: 0.13.0
 #test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py,
@@ -255,8 +252,7 @@ scikit-image==0.22.0
 #test that import:
 scipy==1.10.1 ; python_version <= "3.11"
-scipy==1.14.1 ; python_version > "3.11" and python_version < "3.14"
-scipy==1.16.2 ; python_version >= "3.14"
+scipy==1.14.1 ; python_version >= "3.12"
 # Pin SciPy because of failing distribution tests (see #60347)
 #Description: scientific python
 #Pinned versions: 1.10.1
@@ -328,8 +324,7 @@ pywavelets==1.7.0 ; python_version >= "3.12"
 #Pinned versions: 1.4.1
 #test that import:
-lxml==5.3.0 ; python_version < "3.14"
-lxml==6.0.2 ; python_version >= "3.14"
+lxml==5.3.0
 #Description: This is a requirement of unittest-xml-reporting
 PyGithub==2.3.0
@@ -339,14 +334,12 @@ sympy==1.13.3
 #Pinned versions:
 #test that import:
-onnx==1.19.1 ; python_version < "3.14"
-# Unpin once Python 3.14 is supported. See onnxruntime issue 26309.
-onnx==1.18.0 ; python_version == "3.14"
+onnx==1.18.0
 #Description: Required by onnx tests, and mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
-onnxscript==0.5.4
+onnxscript==0.5.3
 #Description: Required by mypy and test_public_bindings.py when checking torch.onnx._internal
 #Pinned versions:
 #test that import:
@@ -366,7 +359,7 @@ pwlf==2.2.1
 #test that import: test_sac_estimator.py
 # To build PyTorch itself
-pyyaml==6.0.3
+pyyaml==6.0.2
 pyzstd
 setuptools==78.1.1
 packaging==23.1

View File

@@ -1,11 +1,15 @@
-sphinx==7.2.6
+sphinx==5.3.0
 #Description: This is used to generate PyTorch docs
-#Pinned versions: 7.2.6
-pytorch_sphinx_theme2==0.2.0
-#Description: This is needed to generate PyTorch docs
-#Pinned versions: 0.2.0
+#Pinned versions: 5.3.0
+standard-imghdr==3.13.0; python_version >= "3.13"
+#Description: This is needed by Sphinx, so it needs to be added here.
+# The reasons are as follows:
+# 1) This module has been removed from the Python standard library since Python 3.13(https://peps.python.org/pep-0594/#imghdr);
+# 2) The current version of Sphinx (5.3.0) is not compatible with Python 3.13.
+# Once Sphinx is upgraded to a version compatible with Python 3.13 or later, we can remove this dependency.
+-e git+https://github.com/pytorch/pytorch_sphinx_theme.git@71e55749be14ceb56e7f8211a9fb649866b87ad4#egg=pytorch_sphinx_theme2
 # TODO: sphinxcontrib.katex 0.9.0 adds a local KaTeX server to speed up pre-rendering
 # but it doesn't seem to work and hangs around idly. The initial thought that it is probably
 # something related to Docker setup. We can investigate this later.
@@ -32,17 +36,17 @@ tensorboard==2.18.0 ; python_version >= "3.13"
 #Description: This is used to generate PyTorch docs
 #Pinned versions: 2.13.0
-breathe==4.36.0
+breathe==4.34.0
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 4.36.0
-exhale==0.3.7
+#Pinned versions: 4.34.0
+exhale==0.2.3
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 0.3.7
-docutils==0.20
+#Pinned versions: 0.2.3
+docutils==0.16
 #Description: This is used to generate PyTorch C++ docs
-#Pinned versions: 0.20
+#Pinned versions: 0.16
 bs4==0.0.1
 #Description: This is used to generate PyTorch C++ docs
@@ -52,13 +56,13 @@ IPython==8.12.0
 #Description: This is used to generate PyTorch functorch docs
 #Pinned versions: 8.12.0
-myst-nb==1.3.0
+myst-nb==0.17.2
 #Description: This is used to generate PyTorch functorch and torch.compile docs.
-#Pinned versions: 1.3.0
+#Pinned versions: 0.17.2
 # The following are required to build torch.distributed.elastic.rendezvous.etcd* docs
 python-etcd==0.4.5
 sphinx-copybutton==0.5.0
-sphinx-design==0.6.1
+sphinx-design==0.4.0
 sphinxcontrib-mermaid==1.0.0
-myst-parser==4.0.1
+myst-parser==0.18.1

View File

@@ -54,15 +54,12 @@ ENV OPENSSL_DIR /opt/openssl
 RUN rm install_openssl.sh
 ARG INDUCTOR_BENCHMARKS
-ARG ANACONDA_PYTHON_VERSION
-ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
 COPY ci_commit_pins/timm.txt timm.txt
-COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
 # Install XPU Dependencies
 ARG XPU_VERSION

View File

@@ -100,16 +100,9 @@ COPY ./common/common_utils.sh common_utils.sh
 COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
 COPY ci_commit_pins/timm.txt timm.txt
 COPY ci_commit_pins/torchbench.txt torchbench.txt
-# Only build aoti cpp tests when INDUCTOR_BENCHMARKS is set to True
-ENV BUILD_AOT_INDUCTOR_TEST ${INDUCTOR_BENCHMARKS}
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
 RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
-ARG INSTALL_MINGW
-COPY ./common/install_mingw.sh install_mingw.sh
-RUN if [ -n "${INSTALL_MINGW}" ]; then bash ./install_mingw.sh; fi
-RUN rm install_mingw.sh
 ARG TRITON
 ARG TRITON_CPU

View File

@@ -57,8 +57,8 @@ def clone_external_repo(target: str, repo: str, dst: str = "", update_submodules
 logger.info("Successfully cloned %s", target)
 return r, commit
-except GitCommandError:
-logger.exception("Git operation failed")
+except GitCommandError as e:
+logger.error("Git operation failed: %s", e)
 raise

View File

@@ -6,7 +6,7 @@ dependencies = [
 "GitPython==3.1.45",
 "docker==7.1.0",
 "pytest==7.3.2",
-"uv==0.9.6"
+"uv==0.8.6"
 ]
 [tool.setuptools]

View File

@@ -1,7 +1,7 @@
 SHELL=/usr/bin/env bash
 DOCKER_CMD ?= docker
-DESIRED_ROCM ?= 7.1
+DESIRED_ROCM ?= 7.0
 DESIRED_ROCM_SHORT = $(subst .,,$(DESIRED_ROCM))
 PACKAGE_NAME = magma-rocm
 # inherit this from underlying docker image, do not pass this env var to docker
@@ -16,7 +16,6 @@ DOCKER_RUN = set -eou pipefail; ${DOCKER_CMD} run --rm -i \
 magma-rocm/build_magma.sh
 .PHONY: all
-all: magma-rocm71
 all: magma-rocm70
 all: magma-rocm64
@@ -25,11 +24,6 @@ clean:
 $(RM) -r magma-*
 $(RM) -r output
-.PHONY: magma-rocm71
-magma-rocm71: DESIRED_ROCM := 7.1
-magma-rocm71:
-$(DOCKER_RUN)
 .PHONY: magma-rocm70
 magma-rocm70: DESIRED_ROCM := 7.0
 magma-rocm70:

View File

@@ -6,8 +6,8 @@ set -eou pipefail
 # The script expects DESIRED_CUDA and PACKAGE_NAME to be set
 ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
-# post merge of https://github.com/icl-utk-edu/magma/pull/65
-MAGMA_VERSION=c0792ae825fb36872784892ea643dd6f3456bc5f
+# https://github.com/icl-utk-edu/magma/pull/65
+MAGMA_VERSION=d6e4117bc88e73f06d26c6c2e14f064e8fc3d1ec
 # Folders for the build
 PACKAGE_FILES=${ROOT_DIR}/magma-rocm/package_files # metadata
@@ -20,7 +20,7 @@ mkdir -p ${PACKAGE_DIR} ${PACKAGE_OUTPUT}/linux-64 ${PACKAGE_BUILD} ${PACKAGE_RE
 # Fetch magma sources and verify checksum
 pushd ${PACKAGE_DIR}
-git clone https://github.com/icl-utk-edu/magma
+git clone https://github.com/jeffdaily/magma
 pushd magma
 git checkout ${MAGMA_VERSION}
 popd

View File

@@ -187,22 +187,19 @@ if [[ $CUDA_VERSION == 12* || $CUDA_VERSION == 13* ]]; then
 export USE_CUFILE=0
 else
 DEPS_LIST+=(
+"/usr/local/cuda/lib64/libnvToolsExt.so.1"
 "/usr/local/cuda/lib64/libcublas.so.12"
 "/usr/local/cuda/lib64/libcublasLt.so.12"
 "/usr/local/cuda/lib64/libcudart.so.12"
 "/usr/local/cuda/lib64/libnvrtc.so.12"
 "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12")
 DEPS_SONAME+=(
+"libnvToolsExt.so.1"
 "libcublas.so.12"
 "libcublasLt.so.12"
 "libcudart.so.12"
 "libnvrtc.so.12"
 "libcupti.so.12")
-if [[ $CUDA_VERSION != 12.9* ]]; then
-DEPS_LIST+=("/usr/local/cuda/lib64/libnvToolsExt.so.1")
-DEPS_SONAME+=("libnvToolsExt.so.1")
-fi
 fi
 else
 echo "Using nvidia libs from pypi."

View File

@@ -426,7 +426,7 @@ fi
 if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; then
 # export test times so that potential sharded tests that'll branch off this build will use consistent data
 # don't do this for libtorch as libtorch is C++ only and thus won't have python tests run on its build
-PYTHONPATH=. python tools/stats/export_test_times.py
+python tools/stats/export_test_times.py
 fi
 # don't do this for bazel or s390x or riscv64 as they don't use sccache
 if [[ "$BUILD_ENVIRONMENT" != *s390x* && "$BUILD_ENVIRONMENT" != *riscv64* && "$BUILD_ENVIRONMENT" != *-bazel-* ]]; then

View File

@@ -89,41 +89,23 @@ if [ "$is_main_doc" = true ]; then
 make coverage
 # Now we have the coverage report, we need to make sure it is empty.
-# Sphinx 7.2.6+ format: python.txt contains a statistics table with a TOTAL row
-# showing the undocumented count in the third column.
-# Example: | TOTAL | 99.83% | 2 |
+# Count the number of lines in the file and turn that number into a variable
+# $lines. The `cut -f1 ...` is to only parse the number, not the filename
+# Skip the report header by subtracting 2: the header will be output even if
+# there are no undocumented items.
 #
 # Also: see docs/source/conf.py for "coverage_ignore*" items, which should
 # be documented then removed from there.
-# Extract undocumented count from TOTAL row in Sphinx 7.2.6 statistics table
-# The table format is: | Module | Coverage | Undocumented |
-# Extract the third column (undocumented count) from the TOTAL row
-undocumented=$(grep "| TOTAL" build/coverage/python.txt | awk -F'|' '{print $4}' | tr -d ' ')
-if [ -z "$undocumented" ] || ! [[ "$undocumented" =~ ^[0-9]+$ ]]; then
+lines=$(wc -l build/coverage/python.txt 2>/dev/null |cut -f1 -d' ')
+undocumented=$((lines - 2))
+if [ $undocumented -lt 0 ]; then
 echo coverage output not found
 exit 1
-elif [ "$undocumented" -gt 0 ]; then
-set +x # Disable command echoing for cleaner output
-echo ""
-echo "====================="
-echo "UNDOCUMENTED OBJECTS:"
-echo "====================="
-echo ""
-# Find the line number of the TOTAL row and print only what comes after it
-total_line=$(grep -n "| TOTAL" build/coverage/python.txt | cut -d: -f1)
-if [ -n "$total_line" ]; then
-# Print only the detailed list (skip the statistics table)
-tail -n +$((total_line + 2)) build/coverage/python.txt
-else
-# Fallback to showing entire file if TOTAL line not found
-cat build/coverage/python.txt
-fi
-echo ""
+elif [ $undocumented -gt 0 ]; then
+echo undocumented objects found:
+cat build/coverage/python.txt
 echo "Make sure you've updated relevant .rsts in docs/source!"
-echo "You can reproduce locally by running 'cd docs && make coverage && tail -n +\$((grep -n \"| TOTAL\" build/coverage/python.txt | cut -d: -f1) + 2)) build/coverage/python.txt'"
-set -x # Re-enable command echoing
+echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'"
 exit 1
 fi
 else

View File

@@ -337,7 +337,7 @@ test_python() {
 test_python_smoke() {
 # Smoke tests for H100/B200
-time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune inductor/test_cutedsl_grouped_mm $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
+time python test/run_test.py --include test_matmul_cuda test_scaled_matmul_cuda inductor/test_fp8 inductor/test_max_autotune $PYTHON_TEST_EXTRA_OPTION --upload-artifacts-while-running
 assert_git_not_dirty
 }
@@ -460,37 +460,31 @@ test_inductor_shard() {
 --verbose
 }
-test_inductor_aoti_cpp() {
+test_inductor_aoti() {
+# docker build uses bdist_wheel which does not work with test_aot_inductor
+# TODO: need a faster way to build
 if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then
 # We need to hipify before building again
 python3 tools/amd_build/build_amd.py
 fi
 if [[ "$BUILD_ENVIRONMENT" == *sm86* ]]; then
-BUILD_COMMAND=(TORCH_CUDA_ARCH_LIST=8.6 USE_FLASH_ATTENTION=OFF python -m pip install --no-build-isolation -v -e .)
 # TODO: Replace me completely, as one should not use conda libstdc++, nor need special path to TORCH_LIB
 TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="/opt/conda/envs/py_3.10/lib:${TORCH_LIB_DIR}:${LD_LIBRARY_PATH}")
 else
-BUILD_COMMAND=(python -m pip install --no-build-isolation -v -e .)
 TEST_ENVS=(CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}")
 fi
-# aoti cmake custom command requires `torch` to be installed
-# initialize the cmake build cache and install torch
-/usr/bin/env "${BUILD_COMMAND[@]}"
-# rebuild with the build cache with `BUILD_AOT_INDUCTOR_TEST` enabled
-/usr/bin/env CMAKE_FRESH=1 BUILD_AOT_INDUCTOR_TEST=1 "${BUILD_COMMAND[@]}"
 /usr/bin/env "${TEST_ENVS[@]}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference cpp/test_vec_half_AVX2 -dist=loadfile
 }
-test_inductor_aoti_cross_compile_for_windows() {
-TEST_REPORTS_DIR=$(pwd)/test/test-reports
-mkdir -p "$TEST_REPORTS_DIR"
-# Set WINDOWS_CUDA_HOME environment variable
-WINDOWS_CUDA_HOME="$(pwd)/win-torch-wheel-extracted"
-export WINDOWS_CUDA_HOME
-echo "WINDOWS_CUDA_HOME is set to: $WINDOWS_CUDA_HOME"
-echo "Contents:"
-ls -lah "$(pwd)/win-torch-wheel-extracted/lib/x64/" || true
-python test/inductor/test_aoti_cross_compile_windows.py -k compile --package-dir "$TEST_REPORTS_DIR" --win-torch-lib-dir "$(pwd)/win-torch-wheel-extracted/torch/lib"
-}
 test_inductor_cpp_wrapper_shard() {
 if [[ -z "$NUM_TEST_SHARDS" ]]; then
 echo "NUM_TEST_SHARDS must be defined to run a Python test shard"
@@ -572,8 +566,6 @@ fi
 if [[ "${TEST_CONFIG}" == *cpu* ]]; then
 DYNAMO_BENCHMARK_FLAGS+=(--device cpu)
-elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
-DYNAMO_BENCHMARK_FLAGS+=(--device xpu)
 else
 DYNAMO_BENCHMARK_FLAGS+=(--device cuda)
 fi
@@ -667,8 +659,6 @@ test_perf_for_dashboard() {
 device=cuda_b200
 elif [[ "${TEST_CONFIG}" == *rocm* ]]; then
 device=rocm
-elif [[ "${TEST_CONFIG}" == *xpu* ]]; then
-device=xpu
 fi
 for mode in "${modes[@]}"; do
@@ -910,7 +900,7 @@ test_inductor_set_cpu_affinity(){
 export LD_PRELOAD="$JEMALLOC_LIB":"$LD_PRELOAD"
 export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1"
-if [[ "$(uname -m)" != "aarch64" ]]; then
+if [[ "${TEST_CONFIG}" != *aarch64* ]]; then
 # Use Intel OpenMP for x86
 IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so"
 export LD_PRELOAD="$IOMP_LIB":"$LD_PRELOAD"
@@ -924,7 +914,7 @@ test_inductor_set_cpu_affinity(){
 cores=$((cpus / thread_per_core))
 # Set number of cores to 16 on aarch64 for performance runs
-if [[ "$(uname -m)" == "aarch64" && $cores -gt 16 ]]; then
+if [[ "${TEST_CONFIG}" == *aarch64* && $cores -gt 16 ]]; then
 cores=16
 fi
 export OMP_NUM_THREADS=$cores
@@ -1625,7 +1615,6 @@ test_operator_benchmark() {
 TEST_REPORTS_DIR=$(pwd)/test/test-reports
 mkdir -p "$TEST_REPORTS_DIR"
 TEST_DIR=$(pwd)
-ARCH=$(uname -m)
 test_inductor_set_cpu_affinity
@@ -1640,7 +1629,7 @@ test_operator_benchmark() {
 pip_install pandas
 python check_perf_csv.py \
 --actual "${TEST_REPORTS_DIR}/operator_benchmark_eager_float32_cpu.csv" \
---expected "${ARCH}_expected_ci_operator_benchmark_eager_float32_cpu.csv"
+--expected "expected_ci_operator_benchmark_eager_float32_cpu.csv"
 }
 test_operator_microbenchmark() {
@@ -1653,7 +1642,7 @@ test_operator_microbenchmark() {
 cd "${TEST_DIR}"/benchmarks/operator_benchmark
-for OP_BENCHMARK_TESTS in matmul mm addmm bmm conv; do
+for OP_BENCHMARK_TESTS in matmul mm addmm bmm; do
 $TASKSET python -m pt.${OP_BENCHMARK_TESTS}_test --tag-filter long \
 --output-json-for-dashboard "${TEST_REPORTS_DIR}/operator_microbenchmark_${OP_BENCHMARK_TESTS}_compile.json" \
 --benchmark-name "PyTorch operator microbenchmark" --use-compile
@@ -1677,7 +1666,7 @@ if [[ "${TEST_CONFIG}" == *numpy_2* ]]; then
 python -m pip install --pre numpy==2.0.2 scipy==1.13.1 numba==0.60.0
 fi
 python test/run_test.py --include dynamo/test_functions.py dynamo/test_unspec.py test_binary_ufuncs.py test_fake_tensor.py test_linalg.py test_numpy_interop.py test_tensor_creation_ops.py test_torch.py torch_np/test_basic.py
-elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" == 'default' ]]; then
+elif [[ "${BUILD_ENVIRONMENT}" == *aarch64* && "${TEST_CONFIG}" != *perf_cpu_aarch64* ]]; then
 test_linux_aarch64
 elif [[ "${TEST_CONFIG}" == *backward* ]]; then
 test_forward_backward_compatibility
@@ -1728,8 +1717,6 @@ elif [[ "${TEST_CONFIG}" == *inductor-triton-cpu* ]]; then
 test_inductor_triton_cpu
 elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then
 test_inductor_micro_benchmark
-elif [[ "${TEST_CONFIG}" == *aoti_cross_compile_for_windows* ]]; then
-test_inductor_aoti_cross_compile_for_windows
 elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then
 install_torchvision
 id=$((SHARD_NUMBER-1))
@@ -1761,7 +1748,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
 else
 # Do this after checkout_install_torchbench to ensure we clobber any
 # nightlies that torchbench may pull in
-if [[ "${TEST_CONFIG}" != *cpu* && "${TEST_CONFIG}" != *xpu* ]]; then
+if [[ "${TEST_CONFIG}" != *cpu* ]]; then
 install_torchrec_and_fbgemm
 fi
 PYTHONPATH=/torchbench test_dynamo_benchmark torchbench "$id"
@@ -1770,7 +1757,7 @@ elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
 install_torchvision
 PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
 if [[ "$SHARD_NUMBER" -eq "1" ]]; then
-test_inductor_aoti_cpp
+test_inductor_aoti
 fi
 elif [[ "${TEST_CONFIG}" == *inductor* ]]; then
 install_torchvision

View File

@@ -7,9 +7,12 @@ if "%DESIRED_PYTHON%" == "3.13t" (
 set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.13.0/python-3.13.0-amd64.exe"
 set ADDITIONAL_OPTIONS="Include_freethreaded=1"
 set PYTHON_EXEC="python3.13t"
+) else if "%DESIRED_PYTHON%"=="3.14" (
+echo Python version is set to 3.14 or 3.14t
+set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
 ) else if "%DESIRED_PYTHON%"=="3.14t" (
 echo Python version is set to 3.14 or 3.14t
-set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0-amd64.exe"
+set "PYTHON_INSTALLER_URL=https://www.python.org/ftp/python/3.14.0/python-3.14.0rc1-amd64.exe"
 set ADDITIONAL_OPTIONS="Include_freethreaded=1"
 set PYTHON_EXEC="python3.14t"
 ) else (

View File

@@ -163,13 +163,8 @@ if [[ "$(uname)" != Darwin ]]; then
 MEMORY_LIMIT_MAX_JOBS=12
 NUM_CPUS=$(( $(nproc) - 2 ))
-if [[ "$(uname)" == Linux ]]; then
-# Defaults here for **binary** linux builds so they can be changed in one place
-export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
-else
-# For other builds
-export MAX_JOBS=${NUM_CPUS}
-fi
+# Defaults here for **binary** linux builds so they can be changed in one place
+export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
 cat >>"$envfile" <<EOL
 export MAX_JOBS="${MAX_JOBS}"

View File

@@ -60,11 +60,9 @@ performance-*,
 readability-container-size-empty,
 readability-delete-null-pointer,
 readability-duplicate-include,
-readability-named-parameter,
 readability-misplaced-array-index,
 readability-redundant*,
 readability-simplify-subscript-expr,
-readability-static-definition-in-anonymous-namespace
 readability-string-compare,
 -readability-redundant-access-specifiers,
 -readability-redundant-control-flow,

View File

@@ -1,319 +0,0 @@
---
name: add-uint-support
description: Add unsigned integer (uint) type support to PyTorch operators by updating AT_DISPATCH macros. Use when adding support for uint16, uint32, uint64 types to operators, kernels, or when user mentions enabling unsigned types, barebones unsigned types, or uint support.
---
# Add Unsigned Integer (uint) Support to Operators
This skill helps add support for unsigned integer types (uint16, uint32, uint64) to PyTorch operators by updating their AT_DISPATCH macros.
## When to use this skill
Use this skill when:
- Adding uint16, uint32, or uint64 support to an operator
- User mentions "unsigned types", "uint support", "barebones unsigned types"
- Enabling support for kUInt16, kUInt32, kUInt64 in kernels
- Working with operator implementations that need expanded type coverage
## Quick reference
**Add unsigned types to existing dispatch:**
```cpp
// Before
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES));
// After (method 1: add unsigned types explicitly)
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES));
// After (method 2: use V2 integral types if AT_INTEGRAL_TYPES present)
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_INTEGRAL_TYPES_V2), AT_EXPAND(AT_FLOATING_TYPES));
```
## Type group reference
**Unsigned type groups:**
- `AT_BAREBONES_UNSIGNED_TYPES`: kUInt16, kUInt32, kUInt64
- `AT_INTEGRAL_TYPES_V2`: AT_INTEGRAL_TYPES + AT_BAREBONES_UNSIGNED_TYPES
**Relationship:**
```cpp
AT_INTEGRAL_TYPES // kByte, kChar, kInt, kLong, kShort
AT_BAREBONES_UNSIGNED_TYPES // kUInt16, kUInt32, kUInt64
AT_INTEGRAL_TYPES_V2 // INTEGRAL_TYPES + BAREBONES_UNSIGNED_TYPES
```
## Instructions
### Step 1: Determine if conversion to V2 is needed
Check if the file uses AT_DISPATCH_V2:
**If using old AT_DISPATCH:**
- First convert to AT_DISPATCH_V2 using the at-dispatch-v2 skill
- Then proceed with adding uint support
**If already using AT_DISPATCH_V2:**
- Proceed directly to Step 2
### Step 2: Analyze the current dispatch macro
Identify what type groups are currently in use:
```cpp
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
// body
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
^^^^^^^^^^^^^^^^^^^^^^^^^
Current type coverage
```
Common patterns:
- `AT_EXPAND(AT_ALL_TYPES)` → includes AT_INTEGRAL_TYPES + AT_FLOATING_TYPES
- `AT_EXPAND(AT_INTEGRAL_TYPES)` → signed integers only
- `AT_EXPAND(AT_FLOATING_TYPES)` → floating point types
### Step 3: Choose the uint addition method
Two approaches:
**Method 1: Add AT_BAREBONES_UNSIGNED_TYPES explicitly**
- Use when: You want to be explicit about adding uint support
- Add `AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)` to the type list
**Method 2: Substitute AT_INTEGRAL_TYPES with AT_INTEGRAL_TYPES_V2**
- Use when: The dispatch already uses `AT_EXPAND(AT_INTEGRAL_TYPES)`
- More concise: replaces one type group with its superset
- Only applicable if AT_INTEGRAL_TYPES is present
### Step 4: Apply the transformation
**Method 1 example:**
```cpp
// Before
AT_DISPATCH_V2(
dtype,
"min_values_cuda",
AT_WRAP([&]() {
kernel_impl<scalar_t>(iter);
}),
AT_EXPAND(AT_ALL_TYPES),
kBFloat16, kHalf, kBool
);
// After (add unsigned types)
AT_DISPATCH_V2(
dtype,
"min_values_cuda",
AT_WRAP([&]() {
kernel_impl<scalar_t>(iter);
}),
AT_EXPAND(AT_ALL_TYPES),
AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
kBFloat16, kHalf, kBool
);
```
**Method 2 example:**
```cpp
// Before
AT_DISPATCH_V2(
dtype,
"integral_op",
AT_WRAP([&]() {
kernel<scalar_t>();
}),
AT_EXPAND(AT_INTEGRAL_TYPES)
);
// After (substitute with V2)
AT_DISPATCH_V2(
dtype,
"integral_op",
AT_WRAP([&]() {
kernel<scalar_t>();
}),
AT_EXPAND(AT_INTEGRAL_TYPES_V2)
);
```
### Step 5: Handle AT_ALL_TYPES vs individual type groups
If the dispatch uses `AT_EXPAND(AT_ALL_TYPES)`:
- `AT_ALL_TYPES` = `AT_INTEGRAL_TYPES` + `AT_FLOATING_TYPES`
- To add uint: add `AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)` to the list
If the dispatch separately lists INTEGRAL and FLOATING:
```cpp
// Before
AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES)
// After (Method 2 preferred)
AT_EXPAND(AT_INTEGRAL_TYPES_V2), AT_EXPAND(AT_FLOATING_TYPES)
```
### Step 6: Verify all dispatch sites
Check the file for ALL dispatch macros that need uint support:
- Some operators have multiple dispatch sites (CPU, CUDA, different functions)
- Apply the transformation consistently across all sites
- Ensure each gets the same type coverage updates
### Step 7: Validate the changes
Check that:
- [ ] AT_DISPATCH_V2 format is used (not old AT_DISPATCH)
- [ ] Unsigned types are added via one of the two methods
- [ ] All relevant dispatch sites in the file are updated
- [ ] Type groups use `AT_EXPAND()`
- [ ] Arguments are properly formatted and comma-separated
## Common patterns
### Pattern 1: AT_ALL_TYPES + extras
```cpp
// Before
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
// After
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
```
### Pattern 2: Separate INTEGRAL + FLOATING
```cpp
// Before
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_INTEGRAL_TYPES), AT_EXPAND(AT_FLOATING_TYPES));
// After
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_INTEGRAL_TYPES_V2), AT_EXPAND(AT_FLOATING_TYPES));
```
### Pattern 3: Old dispatch needs conversion first
```cpp
// Before (needs v2 conversion first)
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "op", [&]() {
kernel<scalar_t>();
});
// After v2 conversion
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
// After adding uint support
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kHalf, kBFloat16);
```
## Multiple dispatch sites example
For a file with multiple functions:
```cpp
void min_values_kernel_cuda(TensorIterator& iter) {
AT_DISPATCH_V2(iter.dtype(), "min_values_cuda", AT_WRAP([&]() {
impl<scalar_t>(iter);
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf);
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// Added uint support
}
void min_launch_kernel(TensorIterator &iter) {
AT_DISPATCH_V2(iter.input_dtype(), "min_cuda", AT_WRAP([&]() {
gpu_reduce_kernel<scalar_t>(iter);
}), AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES), kBFloat16, kHalf);
// ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
// Added uint support here too
}
```
## Decision tree
Use this decision tree to determine the approach:
```
Is the file using AT_DISPATCH_V2?
├─ No → Use at-dispatch-v2 skill first, then continue
└─ Yes
└─ Does it use AT_EXPAND(AT_INTEGRAL_TYPES)?
├─ Yes → Replace with AT_EXPAND(AT_INTEGRAL_TYPES_V2)
└─ No → Add AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES) to type list
```
## Edge cases
### Case 1: Dispatch with only floating types
If the operator only supports floating point types, don't add uint support:
```cpp
// Leave as-is - floating point only operator
AT_DISPATCH_V2(dtype, "float_op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_FLOATING_TYPES), kHalf);
```
### Case 2: Complex types present
Unsigned types work alongside complex types:
```cpp
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES),
AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES),
AT_EXPAND(AT_COMPLEX_TYPES),
kHalf, kBFloat16);
```
### Case 3: Already has uint support
Check if uint types are already present:
- If `AT_INTEGRAL_TYPES_V2` is used → already has uint support
- If `AT_BAREBONES_UNSIGNED_TYPES` is already in list → already has uint support
- Skip the file if uint support is already present
## Workflow
When asked to add uint support:
1. Read the target file
2. Check if using AT_DISPATCH_V2:
- If not → use at-dispatch-v2 skill first
3. Identify all dispatch macro sites
4. For each dispatch:
- Analyze current type groups
- Choose method (add BAREBONES_UNSIGNED or upgrade to V2)
- Apply transformation with Edit tool
5. Show the user the changes
6. Explain what was modified
## Important notes
- Always check if v2 conversion is needed first
- Apply changes consistently across all dispatch sites in the file
- Method 2 (AT_INTEGRAL_TYPES_V2) is cleaner when applicable
- Method 1 (explicit AT_BAREBONES_UNSIGNED_TYPES) is more explicit
- Unsigned types are: kUInt16, kUInt32, kUInt64 (not kByte which is uint8)
- Some operators may not semantically support unsigned types - use judgment
## Testing
After adding uint support, the operator should accept uint16, uint32, and uint64 tensors. The user is responsible for functional testing.
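
A minimal spot-check along those lines might look like the following, assuming a build where the updated operator gained uint coverage (the dtype and op here are chosen purely for illustration):

```python
import torch

# Hypothetical check: substitute whichever operator was actually updated.
x = torch.tensor([3, 1, 2], dtype=torch.uint16)
print(x.min())  # should dispatch instead of raising "not implemented for 'UInt16'"
```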

View File

@@ -1,305 +0,0 @@
---
name: at-dispatch-v2
description: Convert PyTorch AT_DISPATCH macros to AT_DISPATCH_V2 format in ATen C++ code. Use when porting AT_DISPATCH_ALL_TYPES_AND*, AT_DISPATCH_FLOATING_TYPES*, or other dispatch macros to the new v2 API. For ATen kernel files, CUDA kernels, and native operator implementations.
---
# AT_DISPATCH to AT_DISPATCH_V2 Converter
This skill helps convert PyTorch's legacy AT_DISPATCH macros to the new AT_DISPATCH_V2 format, as defined in `aten/src/ATen/Dispatch_v2.h`.
## When to use this skill
Use this skill when:
- Converting AT_DISPATCH_* macros to AT_DISPATCH_V2
- Porting ATen kernels to use the new dispatch API
- Working with files in `aten/src/ATen/native/` that use dispatch macros
- User mentions "AT_DISPATCH", "dispatch v2", "Dispatch_v2.h", or macro conversion
## Quick reference
**Old format:**
```cpp
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kHalf, kBool, dtype, "kernel_name", [&]() {
// lambda body
});
```
**New format:**
```cpp
AT_DISPATCH_V2(dtype, "kernel_name", AT_WRAP([&]() {
// lambda body
}), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, kBool);
```
## Key transformations
1. **Reorder arguments**: `scalar_type` and `name` come first, then lambda, then types
2. **Wrap the lambda**: Use `AT_WRAP(lambda)` to handle internal commas
3. **Expand type groups**: Use `AT_EXPAND(AT_ALL_TYPES)` instead of implicit expansion
4. **List individual types**: Add extra types (kHalf, kBFloat16, etc.) after expanded groups
5. **Add include**: `#include <ATen/Dispatch_v2.h>` near other Dispatch includes
## Instructions
### Step 1: Add the Dispatch_v2.h include
Add the v2 header near the existing `#include <ATen/Dispatch.h>`:
```cpp
#include <ATen/Dispatch.h>
#include <ATen/Dispatch_v2.h>
```
Keep the old Dispatch.h include for now (other code may still need it).
### Step 2: Identify the old dispatch pattern
Common patterns to convert:
- `AT_DISPATCH_ALL_TYPES_AND{2,3,4}(type1, type2, ..., scalar_type, name, lambda)`
- `AT_DISPATCH_FLOATING_TYPES_AND{2,3}(type1, type2, ..., scalar_type, name, lambda)`
- `AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND{2,3}(type1, ..., scalar_type, name, lambda)`
- `AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND{2,3}(type1, ..., scalar_type, name, lambda)`
### Step 3: Map the old macro to type groups
Identify which type group macro corresponds to the base types:
| Old macro base | AT_DISPATCH_V2 type group |
|----------------|---------------------------|
| `ALL_TYPES` | `AT_EXPAND(AT_ALL_TYPES)` |
| `FLOATING_TYPES` | `AT_EXPAND(AT_FLOATING_TYPES)` |
| `INTEGRAL_TYPES` | `AT_EXPAND(AT_INTEGRAL_TYPES)` |
| `COMPLEX_TYPES` | `AT_EXPAND(AT_COMPLEX_TYPES)` |
| `ALL_TYPES_AND_COMPLEX` | `AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX)` |
For combined patterns, use multiple `AT_EXPAND()` entries:
```cpp
// Old: AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(...)
// New: AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_COMPLEX_TYPES), type1, type2
```
### Step 4: Extract the individual types
From `AT_DISPATCH_*_AND2(type1, type2, ...)` or `AT_DISPATCH_*_AND3(type1, type2, type3, ...)`, extract the individual types (type1, type2, etc.).
These become the trailing arguments after the type group:
```cpp
AT_DISPATCH_V2(..., AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, kBool)
^^^^^^^^^^^^^^^^^^^^^^^^
Individual types from AND3
```
### Step 5: Transform to AT_DISPATCH_V2
Apply the transformation:
**Pattern:**
```cpp
AT_DISPATCH_V2(
scalar_type, // 1st: The dtype expression
"name", // 2nd: The debug string
AT_WRAP(lambda), // 3rd: The lambda wrapped in AT_WRAP
type_groups, // 4th+: Type groups with AT_EXPAND()
individual_types // Last: Individual types
)
```
**Example transformation:**
```cpp
// BEFORE
AT_DISPATCH_ALL_TYPES_AND3(
kBFloat16, kHalf, kBool,
iter.dtype(),
"min_values_cuda",
[&]() {
min_values_kernel_cuda_impl<scalar_t>(iter);
}
);
// AFTER
AT_DISPATCH_V2(
iter.dtype(),
"min_values_cuda",
AT_WRAP([&]() {
min_values_kernel_cuda_impl<scalar_t>(iter);
}),
AT_EXPAND(AT_ALL_TYPES),
kBFloat16, kHalf, kBool
);
```
### Step 6: Handle multi-line lambdas
For lambdas with internal commas or complex expressions, AT_WRAP is essential:
```cpp
AT_DISPATCH_V2(
dtype,
"complex_kernel",
AT_WRAP([&]() {
gpu_reduce_kernel<scalar_t, scalar_t>(
iter,
MinOps<scalar_t>{},
thrust::pair<scalar_t, int64_t>(upper_bound(), 0) // Commas inside!
);
}),
AT_EXPAND(AT_ALL_TYPES)
);
```
### Step 7: Verify the conversion
Check that:
- [ ] `AT_WRAP()` wraps the entire lambda
- [ ] Type groups use `AT_EXPAND()`
- [ ] Individual types don't have `AT_EXPAND()` (just `kBFloat16`, not `AT_EXPAND(kBFloat16)`)
- [ ] Argument order is: scalar_type, name, lambda, types
- [ ] Include added: `#include <ATen/Dispatch_v2.h>`
## Type group reference
Available type group macros (use with `AT_EXPAND()`):
```cpp
AT_INTEGRAL_TYPES // kByte, kChar, kInt, kLong, kShort
AT_FLOATING_TYPES // kDouble, kFloat
AT_COMPLEX_TYPES // kComplexDouble, kComplexFloat
AT_QINT_TYPES // kQInt8, kQUInt8, kQInt32
AT_ALL_TYPES // INTEGRAL_TYPES + FLOATING_TYPES
AT_ALL_TYPES_AND_COMPLEX // ALL_TYPES + COMPLEX_TYPES
AT_INTEGRAL_TYPES_V2 // INTEGRAL_TYPES + unsigned types
AT_BAREBONES_UNSIGNED_TYPES // kUInt16, kUInt32, kUInt64
AT_FLOAT8_TYPES // Float8 variants
```
## Common patterns
### Pattern: AT_DISPATCH_ALL_TYPES_AND2
```cpp
// Before
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBFloat16, dtype, "op", [&]() {
kernel<scalar_t>(data);
});
// After
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>(data);
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBFloat16);
```
### Pattern: AT_DISPATCH_FLOATING_TYPES_AND3
```cpp
// Before
AT_DISPATCH_FLOATING_TYPES_AND3(kHalf, kBFloat16, kFloat8_e4m3fn,
tensor.scalar_type(), "float_op", [&] {
process<scalar_t>(tensor);
});
// After
AT_DISPATCH_V2(tensor.scalar_type(), "float_op", AT_WRAP([&] {
process<scalar_t>(tensor);
}), AT_EXPAND(AT_FLOATING_TYPES), kHalf, kBFloat16, kFloat8_e4m3fn);
```
### Pattern: AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2
```cpp
// Before
AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(
kComplexHalf, kHalf,
self.scalar_type(),
"complex_op",
[&] {
result = compute<scalar_t>(self);
}
);
// After
AT_DISPATCH_V2(
self.scalar_type(),
"complex_op",
AT_WRAP([&] {
result = compute<scalar_t>(self);
}),
AT_EXPAND(AT_ALL_TYPES),
AT_EXPAND(AT_COMPLEX_TYPES),
kComplexHalf,
kHalf
);
```
## Edge cases
### Case 1: No extra types (rare)
```cpp
// Before
AT_DISPATCH_ALL_TYPES(dtype, "op", [&]() { kernel<scalar_t>(); });
// After
AT_DISPATCH_V2(dtype, "op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES));
```
### Case 2: Many individual types (AND4, AND5, etc.)
```cpp
// Before
AT_DISPATCH_FLOATING_TYPES_AND4(kHalf, kBFloat16, kFloat8_e4m3fn, kFloat8_e5m2,
dtype, "float8_op", [&]() { kernel<scalar_t>(); });
// After
AT_DISPATCH_V2(dtype, "float8_op", AT_WRAP([&]() {
kernel<scalar_t>();
}), AT_EXPAND(AT_FLOATING_TYPES), kHalf, kBFloat16, kFloat8_e4m3fn, kFloat8_e5m2);
```
### Case 3: Lambda with no captures
```cpp
// Before
AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, dtype, "op", []() {
static_kernel<scalar_t>();
});
// After
AT_DISPATCH_V2(dtype, "op", AT_WRAP([]() {
static_kernel<scalar_t>();
}), AT_EXPAND(AT_ALL_TYPES), kHalf, kBool);
```
## Benefits of AT_DISPATCH_V2
1. **No arity in macro name**: Don't need different macros for AND2, AND3, AND4
2. **Composable type sets**: Mix and match type groups with `AT_EXPAND()`
3. **Extensible**: Easy to add more types without hitting macro limits
4. **Clearer**: Type groups are explicit, not implicit in macro name
## Important notes
- Keep `#include <ATen/Dispatch.h>` - other code may need it
- The `AT_WRAP()` is mandatory - prevents comma parsing issues in the lambda
- Type groups need `AT_EXPAND()`, individual types don't
- The v2 API is in `aten/src/ATen/Dispatch_v2.h` - refer to it for full docs
- See the header file for the Python script to regenerate the macro implementation
## Workflow
When asked to convert AT_DISPATCH macros:
1. Read the file to identify all AT_DISPATCH uses
2. Add `#include <ATen/Dispatch_v2.h>` if not present
3. For each dispatch macro:
- Identify the pattern and extract components
- Map the base type group
- Extract individual types
- Construct the AT_DISPATCH_V2 call
- Apply with Edit tool
4. Show the user the complete converted file
5. Explain what was changed
Do NOT compile or test the code - focus on accurate conversion only.

View File

@@ -1,359 +0,0 @@
---
name: docstring
description: Write docstrings for PyTorch functions and methods following PyTorch conventions. Use when writing or updating docstrings in PyTorch code.
---
# PyTorch Docstring Writing Guide
This skill describes how to write docstrings for functions and methods in the PyTorch project, following the conventions in `torch/_tensor_docs.py` and `torch/nn/functional.py`.
## General Principles
- Use **raw strings** (`r"""..."""`) for all docstrings to avoid issues with LaTeX/math backslashes (see the toy example after this list)
- Follow **Sphinx/reStructuredText** (reST) format for documentation
- Be **concise but complete** - include all essential information
- Always include **examples** when possible
- Use **cross-references** to related functions/classes
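A quick toy example (not PyTorch code) of why the `r` prefix matters with LaTeX backslashes:
```python
plain = "\text{min}"   # "\t" is interpreted as a tab: the backslash is lost
raw = r"\text{min}"    # raw string keeps the backslash for Sphinx to render

print(len(plain), len(raw))  # 9 10
print(repr(plain))           # '\text{min}'   (tab + "ext{min}")
print(repr(raw))             # '\\text{min}'  (backslash preserved)
```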
## Docstring Structure
### 1. Function Signature (First Line)
Start with the function signature showing all parameters:
```python
r"""function_name(param1, param2, *, kwarg1=default1, kwarg2=default2) -> ReturnType
```
**Notes:**
- Include the function name
- Show positional and keyword-only arguments (use `*` separator)
- Include default values
- Show return type annotation
- This line should NOT end with a period
### 2. Brief Description
Provide a one-line description of what the function does:
```python
r"""conv2d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) -> Tensor
Applies a 2D convolution over an input image composed of several input
planes.
```
### 3. Mathematical Formulas (if applicable)
Use Sphinx math directives for mathematical expressions:
```python
.. math::
\text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}
```
Or inline math: `:math:\`x^2\``
### 4. Cross-References
Link to related classes and functions using Sphinx roles:
- `:class:\`~torch.nn.ModuleName\`` - Link to a class
- `:func:\`torch.function_name\`` - Link to a function
- `:meth:\`~Tensor.method_name\`` - Link to a method
- `:attr:\`attribute_name\`` - Reference an attribute
- The `~` prefix shows only the last component (e.g., `Conv2d` instead of `torch.nn.Conv2d`)
**Example:**
```python
See :class:`~torch.nn.Conv2d` for details and output shape.
```
### 5. Notes and Warnings
Use admonitions for important information:
```python
.. note::
This function doesn't work directly with NLLLoss,
which expects the Log to be computed between the Softmax and itself.
Use log_softmax instead (it's faster and has better numerical properties).
.. warning::
:func:`new_tensor` always copies :attr:`data`. If you have a Tensor
``data`` and want to avoid a copy, use :func:`torch.Tensor.requires_grad_`
or :func:`torch.Tensor.detach`.
```
### 6. Args Section
Document all parameters with type annotations and descriptions:
```python
Args:
input (Tensor): input tensor of shape :math:`(\text{minibatch} , \text{in\_channels} , iH , iW)`
weight (Tensor): filters of shape :math:`(\text{out\_channels} , kH , kW)`
bias (Tensor, optional): optional bias tensor of shape :math:`(\text{out\_channels})`. Default: ``None``
stride (int or tuple): the stride of the convolving kernel. Can be a single number or a
tuple `(sH, sW)`. Default: 1
```
**Formatting rules:**
- Parameter name in **lowercase**
- Type in parentheses: `(Type)`, `(Type, optional)` for optional parameters
- Description follows the type
- For optional parameters, include "Default: ``value``" at the end
- Use double backticks for inline code: ``` ``None`` ```
- Indent continuation lines by 2 spaces
### 7. Keyword Args Section (if applicable)
Sometimes keyword arguments are documented separately:
```python
Keyword args:
dtype (:class:`torch.dtype`, optional): the desired type of returned tensor.
Default: if None, same :class:`torch.dtype` as this tensor.
device (:class:`torch.device`, optional): the desired device of returned tensor.
Default: if None, same :class:`torch.device` as this tensor.
requires_grad (bool, optional): If autograd should record operations on the
returned tensor. Default: ``False``.
```
### 8. Returns Section (if needed)
Document the return value:
```python
Returns:
Tensor: Sampled tensor of same shape as `logits` from the Gumbel-Softmax distribution.
If ``hard=True``, the returned samples will be one-hot, otherwise they will
be probability distributions that sum to 1 across `dim`.
```
Or simply include it in the function signature line if obvious from context.
### 9. Examples Section
Always include examples when possible:
```python
Examples::
>>> inputs = torch.randn(33, 16, 30)
>>> filters = torch.randn(20, 16, 5)
>>> F.conv1d(inputs, filters)
>>> # With square kernels and equal stride
>>> filters = torch.randn(8, 4, 3, 3)
>>> inputs = torch.randn(1, 4, 5, 5)
>>> F.conv2d(inputs, filters, padding=1)
```
**Formatting rules:**
- Use `Examples::` with double colon
- Use `>>>` prompt for Python code
- Include comments with `#` when helpful
- Show actual output when it helps understanding (indent without `>>>`)
### 10. External References
Link to papers or external documentation:
```python
.. _Link Name:
https://arxiv.org/abs/1611.00712
```
Reference them in text: ```See `Link Name`_```
## Method Types
### Native Python Functions
For regular Python functions, use a standard docstring:
```python
def relu(input: Tensor, inplace: bool = False) -> Tensor:
r"""relu(input, inplace=False) -> Tensor
Applies the rectified linear unit function element-wise. See
:class:`~torch.nn.ReLU` for more details.
"""
# implementation
```
### C-Bound Functions (using add_docstr)
For C-bound functions, use `_add_docstr`:
```python
conv1d = _add_docstr(
torch.conv1d,
r"""
conv1d(input, weight, bias=None, stride=1, padding=0, dilation=1, groups=1) -> Tensor
Applies a 1D convolution over an input signal composed of several input
planes.
See :class:`~torch.nn.Conv1d` for details and output shape.
Args:
input: input tensor of shape :math:`(\text{minibatch} , \text{in\_channels} , iW)`
weight: filters of shape :math:`(\text{out\_channels} , kW)`
...
""",
)
```
### In-Place Variants
For in-place operations (ending with `_`), reference the original:
```python
add_docstr_all(
"abs_",
r"""
abs_() -> Tensor
In-place version of :meth:`~Tensor.abs`
""",
)
```
### Alias Functions
For aliases, simply reference the original:
```python
add_docstr_all(
"absolute",
r"""
absolute() -> Tensor
Alias for :func:`abs`
""",
)
```
## Common Patterns
### Shape Documentation
Use LaTeX math notation for tensor shapes:
```python
:math:`(\text{minibatch} , \text{in\_channels} , iH , iW)`
```
### Reusable Argument Definitions
For commonly used arguments, define them once and reuse:
```python
common_args = parse_kwargs(
"""
dtype (:class:`torch.dtype`, optional): the desired type of returned tensor.
Default: if None, same as this tensor.
"""
)
# Then use with .format():
r"""
...
Keyword args:
{dtype}
{device}
""".format(**common_args)
```
### Template Insertion
Insert reproducibility notes or other common text:
```python
r"""
{tf32_note}
{cudnn_reproducibility_note}
""".format(**reproducibility_notes, **tf32_notes)
```
## Complete Example
Here's a complete example showing all elements:
```python
def gumbel_softmax(
logits: Tensor,
tau: float = 1,
hard: bool = False,
eps: float = 1e-10,
dim: int = -1,
) -> Tensor:
r"""
Sample from the Gumbel-Softmax distribution and optionally discretize.
Args:
logits (Tensor): `[..., num_features]` unnormalized log probabilities
tau (float): non-negative scalar temperature
hard (bool): if ``True``, the returned samples will be discretized as one-hot vectors,
but will be differentiated as if it is the soft sample in autograd. Default: ``False``
dim (int): A dimension along which softmax will be computed. Default: -1
Returns:
Tensor: Sampled tensor of same shape as `logits` from the Gumbel-Softmax distribution.
If ``hard=True``, the returned samples will be one-hot, otherwise they will
be probability distributions that sum to 1 across `dim`.
.. note::
This function is here for legacy reasons and may be removed from nn.Functional in the future.
Examples::
>>> logits = torch.randn(20, 32)
>>> # Sample soft categorical using reparametrization trick:
>>> F.gumbel_softmax(logits, tau=1, hard=False)
>>> # Sample hard categorical using "Straight-through" trick:
>>> F.gumbel_softmax(logits, tau=1, hard=True)
.. _Link 1:
https://arxiv.org/abs/1611.00712
"""
# implementation
```
## Quick Checklist
When writing a PyTorch docstring, ensure:
- [ ] Use raw string (`r"""`)
- [ ] Include function signature on first line
- [ ] Provide brief description
- [ ] Document all parameters in Args section with types
- [ ] Include default values for optional parameters
- [ ] Use Sphinx cross-references (`:func:`, `:class:`, `:meth:`)
- [ ] Add mathematical formulas if applicable
- [ ] Include at least one example in Examples section
- [ ] Add warnings/notes for important caveats
- [ ] Link to related module class with `:class:`
- [ ] Use proper math notation for tensor shapes
- [ ] Follow consistent formatting and indentation
## Common Sphinx Roles Reference
- `:class:\`~torch.nn.Module\`` - Class reference
- `:func:\`torch.function\`` - Function reference
- `:meth:\`~Tensor.method\`` - Method reference
- `:attr:\`attribute\`` - Attribute reference
- `:math:\`equation\`` - Inline math
- `:ref:\`label\`` - Internal reference
- ``` ``code`` ``` - Inline code (use double backticks)
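A small hypothetical docstring exercising several of these roles together (the function itself is made up; `torch.clamp` and `Tensor.clamp_` are the real APIs it points at):
```python
def clamp_demo(input, min=None, max=None):
    r"""clamp_demo(input, min=None, max=None) -> Tensor

    Clamp :attr:`input` into the range :math:`[\text{min}, \text{max}]`.
    See :func:`torch.clamp` and :meth:`~Tensor.clamp_` for the real API;
    this function exists only to illustrate the roles above.

    Args:
        input (Tensor): the input tensor
        min (Number, optional): lower bound. Default: ``None``
        max (Number, optional): upper bound. Default: ``None``
    """
```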
## Additional Notes
- **Indentation**: Use 4 spaces for code, 2 spaces for continuation of parameter descriptions
- **Line length**: Try to keep lines under 100 characters when possible
- **Periods**: End sentences with periods, but not the signature line
- **Backticks**: Use double backticks for code: ``` ``True`` ``None`` ``False`` ```
- **Types**: Common types are `Tensor`, `int`, `float`, `bool`, `str`, `tuple`, `list`, etc.

View File

@ -1,385 +0,0 @@
---
name: skill-writer
description: Guide users through creating Agent Skills for Claude Code. Use when the user wants to create, write, author, or design a new Skill, or needs help with SKILL.md files, frontmatter, or skill structure.
---
# Skill Writer
This Skill helps you create well-structured Agent Skills for Claude Code that follow best practices and validation requirements.
## When to use this Skill
Use this Skill when:
- Creating a new Agent Skill
- Writing or updating SKILL.md files
- Designing skill structure and frontmatter
- Troubleshooting skill discovery issues
- Converting existing prompts or workflows into Skills
## Instructions
### Step 1: Determine Skill scope
First, understand what the Skill should do:
1. **Ask clarifying questions**:
- What specific capability should this Skill provide?
- When should Claude use this Skill?
- What tools or resources does it need?
- Is this for personal use or team sharing?
2. **Keep it focused**: One Skill = one capability
- Good: "PDF form filling", "Excel data analysis"
- Too broad: "Document processing", "Data tools"
### Step 2: Choose Skill location
Determine where to create the Skill:
**Personal Skills** (`~/.claude/skills/`):
- Individual workflows and preferences
- Experimental Skills
- Personal productivity tools
**Project Skills** (`.claude/skills/`):
- Team workflows and conventions
- Project-specific expertise
- Shared utilities (committed to git)
### Step 3: Create Skill structure
Create the directory and files:
```bash
# Personal
mkdir -p ~/.claude/skills/skill-name
# Project
mkdir -p .claude/skills/skill-name
```
For multi-file Skills:
```
skill-name/
├── SKILL.md (required)
├── reference.md (optional)
├── examples.md (optional)
├── scripts/
│ └── helper.py (optional)
└── templates/
└── template.txt (optional)
```
### Step 4: Write SKILL.md frontmatter
Create YAML frontmatter with required fields:
```yaml
---
name: skill-name
description: Brief description of what this does and when to use it
---
```
**Field requirements**:
- **name**:
- Lowercase letters, numbers, hyphens only
- Max 64 characters
- Must match directory name
- Good: `pdf-processor`, `git-commit-helper`
- Bad: `PDF_Processor`, `Git Commits!`
- **description**:
- Max 1024 characters
- Include BOTH what it does AND when to use it
- Use specific trigger words users would say
- Mention file types, operations, and context
**Optional frontmatter fields**:
- **allowed-tools**: Restrict tool access (comma-separated list)
```yaml
allowed-tools: Read, Grep, Glob
```
Use for:
- Read-only Skills
- Security-sensitive workflows
- Limited-scope operations
### Step 5: Write effective descriptions
The description is critical for Claude to discover your Skill.
**Formula**: `[What it does] + [When to use it] + [Key triggers]`
**Examples**:
✅ **Good**:
```yaml
description: Extract text and tables from PDF files, fill forms, merge documents. Use when working with PDF files or when the user mentions PDFs, forms, or document extraction.
```
✅ **Good**:
```yaml
description: Analyze Excel spreadsheets, create pivot tables, and generate charts. Use when working with Excel files, spreadsheets, or analyzing tabular data in .xlsx format.
```
❌ **Too vague**:
```yaml
description: Helps with documents
description: For data analysis
```
**Tips**:
- Include specific file extensions (.pdf, .xlsx, .json)
- Mention common user phrases ("analyze", "extract", "generate")
- List concrete operations (not generic verbs)
- Add context clues ("Use when...", "For...")
### Step 6: Structure the Skill content
Use clear Markdown sections:
```markdown
# Skill Name
Brief overview of what this Skill does.
## Quick start
Provide a simple example to get started immediately.
## Instructions
Step-by-step guidance for Claude:
1. First step with clear action
2. Second step with expected outcome
3. Handle edge cases
## Examples
Show concrete usage examples with code or commands.
## Best practices
- Key conventions to follow
- Common pitfalls to avoid
- When to use vs. not use
## Requirements
List any dependencies or prerequisites:
```bash
pip install package-name
```
## Advanced usage
For complex scenarios, see [reference.md](reference.md).
```
### Step 7: Add supporting files (optional)
Create additional files for progressive disclosure:
**reference.md**: Detailed API docs, advanced options
**examples.md**: Extended examples and use cases
**scripts/**: Helper scripts and utilities
**templates/**: File templates or boilerplate
Reference them from SKILL.md:
```markdown
For advanced usage, see [reference.md](reference.md).
Run the helper script:
\`\`\`bash
python scripts/helper.py input.txt
\`\`\`
```
### Step 8: Validate the Skill
Check these requirements (a small validation sketch follows the checklist):
✅ **File structure**:
- [ ] SKILL.md exists in correct location
- [ ] Directory name matches frontmatter `name`
✅ **YAML frontmatter**:
- [ ] Opening `---` on line 1
- [ ] Closing `---` before content
- [ ] Valid YAML (no tabs, correct indentation)
- [ ] `name` follows naming rules
- [ ] `description` is specific and < 1024 chars
✅ **Content quality**:
- [ ] Clear instructions for Claude
- [ ] Concrete examples provided
- [ ] Edge cases handled
- [ ] Dependencies listed (if any)
✅ **Testing**:
- [ ] Description matches user questions
- [ ] Skill activates on relevant queries
- [ ] Instructions are clear and actionable
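As an optional aid, here is a small validation sketch. It assumes PyYAML is installed and checks only the documented constraints, nothing more:
```python
import re
import sys
from pathlib import Path

import yaml  # pip install pyyaml

def check_skill(path: str) -> list[str]:
    """Return a list of problems found in a SKILL.md file's frontmatter."""
    problems = []
    text = Path(path).read_text(encoding="utf-8")
    m = re.match(r"^---\n(.*?)\n---\n", text, re.DOTALL)
    if m is None:
        return ["missing YAML frontmatter delimited by --- lines"]
    meta = yaml.safe_load(m.group(1)) or {}
    name = str(meta.get("name") or "")
    desc = str(meta.get("description") or "")
    if not re.fullmatch(r"[a-z0-9-]{1,64}", name):
        problems.append("name: lowercase letters, numbers, hyphens only; max 64 chars")
    if not desc or len(desc) > 1024:
        problems.append("description: required, max 1024 chars")
    if Path(path).parent.name != name:
        problems.append("directory name must match frontmatter name")
    return problems

if __name__ == "__main__":
    for msg in check_skill(sys.argv[1]) or ["frontmatter looks OK"]:
        print(msg)
```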
### Step 9: Test the Skill
1. **Restart Claude Code** (if running) to load the Skill
2. **Ask relevant questions** that match the description:
```
Can you help me extract text from this PDF?
```
3. **Verify activation**: Claude should use the Skill automatically
4. **Check behavior**: Confirm Claude follows the instructions correctly
### Step 10: Debug if needed
If Claude doesn't use the Skill:
1. **Make description more specific**:
- Add trigger words
- Include file types
- Mention common user phrases
2. **Check file location**:
```bash
ls ~/.claude/skills/skill-name/SKILL.md
ls .claude/skills/skill-name/SKILL.md
```
3. **Validate YAML**:
```bash
cat SKILL.md | head -n 10
```
4. **Run debug mode**:
```bash
claude --debug
```
## Common patterns
### Read-only Skill
```yaml
---
name: code-reader
description: Read and analyze code without making changes. Use for code review, understanding codebases, or documentation.
allowed-tools: Read, Grep, Glob
---
```
### Script-based Skill
```yaml
---
name: data-processor
description: Process CSV and JSON data files with Python scripts. Use when analyzing data files or transforming datasets.
---
# Data Processor
## Instructions
1. Use the processing script:
\`\`\`bash
python scripts/process.py input.csv --output results.json
\`\`\`
2. Validate output with:
\`\`\`bash
python scripts/validate.py results.json
\`\`\`
```
### Multi-file Skill with progressive disclosure
```yaml
---
name: api-designer
description: Design REST APIs following best practices. Use when creating API endpoints, designing routes, or planning API architecture.
---
# API Designer
Quick start: See [examples.md](examples.md)
Detailed reference: See [reference.md](reference.md)
## Instructions
1. Gather requirements
2. Design endpoints (see examples.md)
3. Document with OpenAPI spec
4. Review against best practices (see reference.md)
```
## Best practices for Skill authors
1. **One Skill, one purpose**: Don't create mega-Skills
2. **Specific descriptions**: Include trigger words users will say
3. **Clear instructions**: Write for Claude, not humans
4. **Concrete examples**: Show real code, not pseudocode
5. **List dependencies**: Mention required packages in description
6. **Test with teammates**: Verify activation and clarity
7. **Version your Skills**: Document changes in content
8. **Use progressive disclosure**: Put advanced details in separate files
## Validation checklist
Before finalizing a Skill, verify:
- [ ] Name is lowercase, hyphens only, max 64 chars
- [ ] Description is specific and < 1024 chars
- [ ] Description includes "what" and "when"
- [ ] YAML frontmatter is valid
- [ ] Instructions are step-by-step
- [ ] Examples are concrete and realistic
- [ ] Dependencies are documented
- [ ] File paths use forward slashes
- [ ] Skill activates on relevant queries
- [ ] Claude follows instructions correctly
## Troubleshooting
**Skill doesn't activate**:
- Make description more specific with trigger words
- Include file types and operations in description
- Add "Use when..." clause with user phrases
**Multiple Skills conflict**:
- Make descriptions more distinct
- Use different trigger words
- Narrow the scope of each Skill
**Skill has errors**:
- Check YAML syntax (no tabs, proper indentation)
- Verify file paths (use forward slashes)
- Ensure scripts have execute permissions
- List all dependencies
## Examples
See the documentation for complete examples:
- Simple single-file Skill (commit-helper)
- Skill with tool permissions (code-reviewer)
- Multi-file Skill (pdf-processing)
## Output format
When creating a Skill, I will:
1. Ask clarifying questions about scope and requirements
2. Suggest a Skill name and location
3. Create the SKILL.md file with proper frontmatter
4. Include clear instructions and examples
5. Add supporting files if needed
6. Provide testing instructions
7. Validate against all requirements
The result will be a complete, working Skill that follows all best practices and validation rules.

View File

@ -7,12 +7,16 @@ max-line-length = 120
# C408 ignored because we like the dict keyword argument syntax # C408 ignored because we like the dict keyword argument syntax
# E501 is not flexible enough, we're using B950 instead # E501 is not flexible enough, we're using B950 instead
ignore = ignore =
E203,E305,E402,E501,E704,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824, E203,E305,E402,E501,E704,E721,E741,F405,F841,F999,W503,W504,C408,E302,W291,E303,F824,
# shebang has extra meaning in fbcode lints, so I think it's not worth trying # shebang has extra meaning in fbcode lints, so I think it's not worth trying
# to line this up with executable bit # to line this up with executable bit
EXE001, EXE001,
# these ignores are from flake8-bugbear; please fix! # these ignores are from flake8-bugbear; please fix!
B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910 B007,B008,B017,B019,B023,B028,B903,B905,B906,B907,B908,B910
# these ignores are from flake8-comprehensions; please fix!
C407,
# these ignores are from flake8-logging-format; please fix!
G100,G101,G200
# these ignores are from flake8-simplify. please fix or ignore with commented reason # these ignores are from flake8-simplify. please fix or ignore with commented reason
SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12, SIM105,SIM108,SIM110,SIM111,SIM113,SIM114,SIM115,SIM116,SIM117,SIM118,SIM119,SIM12,
# SIM104 is already covered by pyupgrade ruff # SIM104 is already covered by pyupgrade ruff

View File

@ -8,7 +8,6 @@ assignees: ''
--- ---
> NOTE: Remember to label this issue with "`ci: sev`" > NOTE: Remember to label this issue with "`ci: sev`"
> If you want autorevert to be disabled, keep the ci: disable-autorevert label
<!-- Add the `merge blocking` label to this PR to prevent PRs from being merged while this issue is open --> <!-- Add the `merge blocking` label to this PR to prevent PRs from being merged while this issue is open -->

View File

@ -1,7 +1,7 @@
--- ---
name: "D❌\U0001F519 ISABLE AUTOREVERT" name: DISABLE AUTOREVERT
about: Disables autorevert when open about: Disables autorevert when open
title: "[DISABLE AUTOREVERT]" title: "❌​\U0001F519 [DISABLE AUTOREVERT]"
labels: 'ci: disable-autorevert' labels: 'ci: disable-autorevert'
assignees: '' assignees: ''

View File

@ -65,7 +65,7 @@ runs:
cd .ci/lumen_cli cd .ci/lumen_cli
python3 -m pip install -e . python3 -m pip install -e .
) )
MAX_JOBS="$(nproc --ignore=10)" MAX_JOBS="$(nproc --ignore=6)"
export MAX_JOBS export MAX_JOBS
# Split the comma-separated list and build each target # Split the comma-separated list and build each target

View File

@ -27,9 +27,7 @@ runs:
docker system prune -af docker system prune -af
diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //') diskspace_new=$(df -H --output=pcent ${docker_root_dir} | sed -n 2p | sed 's/%//' | sed 's/ //')
if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then if [[ "$diskspace_new" -gt "$diskspace_cutoff" ]] ; then
diskspace_cutoff_int=$((diskspace_cutoff + 0)) echo "Error: Available diskspace is less than $diskspace_cutoff percent. Not enough diskspace."
difference=$((100 - diskspace_cutoff_int))
echo "Error: Available diskspace is less than $difference percent. Not enough diskspace."
echo "$msg" echo "$msg"
exit 1 exit 1
else else

View File

@ -124,10 +124,3 @@ runs:
id: login-ecr id: login-ecr
continue-on-error: true continue-on-error: true
uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1 uses: aws-actions/amazon-ecr-login@062b18b96a7aff071d4dc91bc00c4c1a7945b076 # v2.0.1
- name: Preserve github env variables for use in docker
shell: bash
run: |
env | grep '^GITHUB' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}"
env | grep '^RUNNER' >> "${RUNNER_TEMP}/github_env_${GITHUB_RUN_ID}"

View File

@ -1 +1 @@
3b0e7a6f192ca2715e7e6cbe5db007aea7165fe2 8ad2aa5d354d1bf432339113860185d5a5d1abbd

View File

@ -1 +1 @@
cfbc5c2f1c798991715a6b06bb3ce46478c4487c f5c6c2ec6490455e86f67b2a25c10390d60a27f7

View File

@ -1 +1 @@
c8b09f5f77d6bf6fb7ed7a9aa83e5d8156b3a5e9 0fa6e3129e61143224663e1ec67980d12b7ec4eb

View File

@ -283,9 +283,6 @@ RUN --mount=type=bind,source=${TORCH_WHEELS_PATH},target=/dist \
uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \ uv pip install --system $(cat torch_build_versions.txt | xargs) --index-url https://download.pytorch.org/whl/nightly/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
fi fi
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --pre apache-tvm-ffi==0.1.0b15
# Install the vllm wheel from previous stage # Install the vllm wheel from previous stage
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system /wheels/vllm/*.whl --verbose uv pip install --system /wheels/vllm/*.whl --verbose
@ -298,8 +295,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0' ARG torch_cuda_arch_list='8.0;8.9;9.0a;10.0a;12.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# TODO(elainewy): remove this once vllm commit is updated, and install flashinfer from pip
# see https://github.com/pytorch/pytorch/pull/165274#issuecomment-3408531784
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git" ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
ARG FLASHINFER_GIT_REF="v0.2.14.post1" ARG FLASHINFER_GIT_REF="v0.2.14.post1"

View File

@ -15,11 +15,6 @@
- "module: reinplacing" - "module: reinplacing"
then: then:
- "module: pt2-dispatcher" - "module: pt2-dispatcher"
- any:
- "vllm-compile"
then:
- "module: vllm"
- "oncall: pt2"
- any: - any:
- "module: vmap" - "module: vmap"
then: then:
@ -32,6 +27,10 @@
- "module: pt2 optimizer" - "module: pt2 optimizer"
then: then:
- "module: dynamo" - "module: dynamo"
- any:
- "module: flex attention"
then:
- "module: higher order operators"
- any: - any:
- "module: aotinductor" - "module: aotinductor"
then: then:

.github/labeler.yml
View File

@ -133,32 +133,3 @@
"ciflow/vllm": "ciflow/vllm":
- .github/ci_commit_pins/vllm.txt - .github/ci_commit_pins/vllm.txt
"ciflow/b200":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm
"ciflow/h100":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/**/*cublas*
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm
"ciflow/rocm":
- test/test_matmul_cuda.py
- test/test_scaled_matmul_cuda.py
- test/inductor/test_fp8.py
- aten/src/ATen/native/cuda/Blas.cpp
- torch/_inductor/kernel/mm.py
- test/inductor/test_max_autotune.py
- third_party/fbgemm

View File

@ -540,26 +540,6 @@
- Lint - Lint
- pull - pull
- name: PrivateUse1
patterns:
- torch/accelerator/**
- torch/utils/backend_registration.py
- torch/csrc/acc/**
- torch/csrc/DeviceAccelerator.*
- torch/csrc/profiler/standalone/privateuse1_observer.*
- aten/src/ATen/DeviceAccelerator.*
- aten/src/ATen/core/GeneratorForPrivateuseone.*
- aten/src/ATen/detail/PrivateUse1HooksInterface.*
- docs/source/accelerator/**
- test/cpp_extensions/open_registration_extension/torch_openreg/**
approved_by:
- albanD
- fffrog
mandatory_checks_name:
- EasyCLA
- Lint
- pull
- name: superuser - name: superuser
patterns: patterns:
- '*' - '*'

View File

@ -3,7 +3,6 @@ ciflow_tracking_issue: 64124
ciflow_push_tags: ciflow_push_tags:
- ciflow/b200 - ciflow/b200
- ciflow/b200-symm-mem - ciflow/b200-symm-mem
- ciflow/b200-distributed
- ciflow/binaries - ciflow/binaries
- ciflow/binaries_libtorch - ciflow/binaries_libtorch
- ciflow/binaries_wheel - ciflow/binaries_wheel
@ -16,10 +15,8 @@ ciflow_push_tags:
- ciflow/inductor-micro-benchmark - ciflow/inductor-micro-benchmark
- ciflow/inductor-micro-benchmark-cpu-x86 - ciflow/inductor-micro-benchmark-cpu-x86
- ciflow/inductor-perf-compare - ciflow/inductor-perf-compare
- ciflow/inductor-perf-test-nightly-rocm-mi300 - ciflow/inductor-perf-test-nightly-rocm
- ciflow/inductor-perf-test-nightly-rocm-mi355
- ciflow/inductor-perf-test-nightly-x86-zen - ciflow/inductor-perf-test-nightly-x86-zen
- ciflow/inductor-perf-test-nightly-xpu
- ciflow/inductor-periodic - ciflow/inductor-periodic
- ciflow/inductor-rocm - ciflow/inductor-rocm
- ciflow/linux-aarch64 - ciflow/linux-aarch64
@ -27,7 +24,6 @@ ciflow_push_tags:
- ciflow/nightly - ciflow/nightly
- ciflow/op-benchmark - ciflow/op-benchmark
- ciflow/periodic - ciflow/periodic
- ciflow/periodic-rocm-mi200
- ciflow/periodic-rocm-mi300 - ciflow/periodic-rocm-mi300
- ciflow/pull - ciflow/pull
- ciflow/quantization-periodic - ciflow/quantization-periodic
@ -35,7 +31,6 @@ ciflow_push_tags:
- ciflow/rocm - ciflow/rocm
- ciflow/rocm-mi300 - ciflow/rocm-mi300
- ciflow/rocm-mi355 - ciflow/rocm-mi355
- ciflow/rocm-navi31
- ciflow/s390 - ciflow/s390
- ciflow/slow - ciflow/slow
- ciflow/torchbench - ciflow/torchbench

View File

@ -11,17 +11,11 @@ architectures:
* Latest XPU * Latest XPU
""" """
import json
import os import os
import re
from pathlib import Path
from typing import Optional from typing import Optional
SCRIPT_DIR = Path(__file__).absolute().parent # NOTE: Please also update the CUDA sources in `PIP_SOURCES` in tools/nightly.py when changing this
REPO_ROOT = SCRIPT_DIR.parent.parent
CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"] CUDA_ARCHES = ["12.6", "12.8", "12.9", "13.0"]
CUDA_STABLE = "12.8" CUDA_STABLE = "12.8"
CUDA_ARCHES_FULL_VERSION = { CUDA_ARCHES_FULL_VERSION = {
@ -37,7 +31,8 @@ CUDA_ARCHES_CUDNN_VERSION = {
"13.0": "9", "13.0": "9",
} }
ROCM_ARCHES = ["7.0", "7.1"] # NOTE: Please also update the ROCm sources in `PIP_SOURCES` in tools/nightly.py when changing this
ROCM_ARCHES = ["6.4", "7.0"]
XPU_ARCHES = ["xpu"] XPU_ARCHES = ["xpu"]
@ -61,7 +56,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | " "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | " "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | " "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | " "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'" "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'"
@ -78,44 +73,44 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | " "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | "
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | " "nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | " "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | " "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'" "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'"
), ),
"12.9": ( "12.9": (
"nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | " "nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | " "nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | " "nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | " "nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | " "nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | " "nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | " "nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | " "nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | " "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | " "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | " "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | " "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | " "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | " "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'" "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
), ),
"13.0": ( "13.0": (
"nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | " "nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | " "nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | "
"nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | " "nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | "
"nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | " "nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | "
"nvidia-cublas==13.1.0.3; platform_system == 'Linux' | " "nvidia-cublas==13.0.0.19; platform_system == 'Linux' | "
"nvidia-cufft==12.0.0.61; platform_system == 'Linux' | " "nvidia-cufft==12.0.0.15; platform_system == 'Linux' | "
"nvidia-curand==10.4.0.35; platform_system == 'Linux' | " "nvidia-curand==10.4.0.35; platform_system == 'Linux' | "
"nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | " "nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | "
"nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | " "nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | "
"nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | " "nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | "
"nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | " "nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | "
"nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | " "nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | "
"nvidia-nvtx==13.0.85; platform_system == 'Linux' | " "nvidia-nvtx==13.0.39; platform_system == 'Linux' | "
"nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | " "nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | "
"nvidia-cufile==1.15.1.6; platform_system == 'Linux'" "nvidia-cufile==1.15.0.42; platform_system == 'Linux'"
), ),
"xpu": ( "xpu": (
"intel-cmplr-lib-rt==2025.2.1 | " "intel-cmplr-lib-rt==2025.2.1 | "
@ -142,48 +137,9 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
} }
# Used by tools/nightly.py
PYTORCH_NIGHTLY_PIP_INDEX_URL = "https://download.pytorch.org/whl/nightly"
NIGHTLY_SOURCE_MATRIX = {
"cpu": dict(
name="cpu",
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cpu",
supported_platforms=["Linux", "macOS", "Windows"],
accelerator="cpu",
)
}
CUDA_NIGHTLY_SOURCE_MATRIX = {
f"cuda-{major}.{minor}": dict(
name=f"cuda-{major}.{minor}",
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/cu{major}{minor}",
supported_platforms=["Linux", "Windows"],
accelerator="cuda",
)
for major, minor in (map(int, version.split(".")) for version in CUDA_ARCHES)
}
ROCM_NIGHTLY_SOURCE_MATRIX = {
f"rocm-{major}.{minor}": dict(
name=f"rocm-{major}.{minor}",
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/rocm{major}.{minor}",
supported_platforms=["Linux"],
accelerator="rocm",
)
for major, minor in (map(int, version.split(".")) for version in ROCM_ARCHES)
}
XPU_NIGHTLY_SOURCE_MATRIX = {
"xpu": dict(
name="xpu",
index_url=f"{PYTORCH_NIGHTLY_PIP_INDEX_URL}/xpu",
supported_platforms=["Linux"],
accelerator="xpu",
)
}
NIGHTLY_SOURCE_MATRIX.update(CUDA_NIGHTLY_SOURCE_MATRIX)
NIGHTLY_SOURCE_MATRIX.update(ROCM_NIGHTLY_SOURCE_MATRIX)
NIGHTLY_SOURCE_MATRIX.update(XPU_NIGHTLY_SOURCE_MATRIX)
def get_nccl_wheel_version(arch_version: str) -> str: def get_nccl_wheel_version(arch_version: str) -> str:
import re
requirements = map( requirements = map(
str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version]) str.strip, re.split("[;|]", PYTORCH_EXTRA_INSTALL_REQUIREMENTS[arch_version])
) )
@ -191,14 +147,17 @@ def get_nccl_wheel_version(arch_version: str) -> str:
def read_nccl_pin(arch_version: str) -> str: def read_nccl_pin(arch_version: str) -> str:
nccl_pin_path = ( from pathlib import Path
REPO_ROOT
/ ".ci" nccl_pin_path = os.path.join(
/ "docker" Path(__file__).absolute().parents[2],
/ "ci_commit_pins" ".ci",
/ f"nccl-cu{arch_version[:2]}.txt" "docker",
"ci_commit_pins",
f"nccl-cu{arch_version[:2]}.txt",
) )
return nccl_pin_path.read_text().strip() with open(nccl_pin_path) as f:
return f.read().strip()
def validate_nccl_dep_consistency(arch_version: str) -> None: def validate_nccl_dep_consistency(arch_version: str) -> None:
@ -206,8 +165,7 @@ def validate_nccl_dep_consistency(arch_version: str) -> None:
wheel_ver = get_nccl_wheel_version(arch_version) wheel_ver = get_nccl_wheel_version(arch_version)
if not nccl_release_tag.startswith(f"v{wheel_ver}"): if not nccl_release_tag.startswith(f"v{wheel_ver}"):
raise RuntimeError( raise RuntimeError(
f"{arch_version} NCCL release tag version {nccl_release_tag} " f"{arch_version} NCCL release tag version {nccl_release_tag} does not correspond to wheel version {wheel_ver}"
f"does not correspond to wheel version {wheel_ver}"
) )
@ -283,11 +241,7 @@ def generate_libtorch_matrix(
arches += CUDA_ARCHES arches += CUDA_ARCHES
arches += ROCM_ARCHES arches += ROCM_ARCHES
elif os == "windows": elif os == "windows":
# TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up arches += CUDA_ARCHES
# in 2.10
windows_cuda_arches = CUDA_ARCHES.copy()
windows_cuda_arches.remove("12.9")
arches += windows_cuda_arches
if libtorch_variants is None: if libtorch_variants is None:
libtorch_variants = [ libtorch_variants = [
"shared-with-deps", "shared-with-deps",
@ -351,11 +305,7 @@ def generate_wheels_matrix(
if os == "linux": if os == "linux":
arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES arches += CUDA_ARCHES + ROCM_ARCHES + XPU_ARCHES
elif os == "windows": elif os == "windows":
# TODO (huydhn): Only build CUDA 12.9 for Linux. This logic is to be cleaned up arches += CUDA_ARCHES + XPU_ARCHES
# in 2.10
windows_cuda_arches = CUDA_ARCHES.copy()
windows_cuda_arches.remove("12.9")
arches += windows_cuda_arches + XPU_ARCHES
elif os == "linux-aarch64": elif os == "linux-aarch64":
# Separate new if as the CPU type is different and # Separate new if as the CPU type is different and
# uses different build/test scripts # uses different build/test scripts
@ -454,14 +404,7 @@ def generate_wheels_matrix(
return ret return ret
arch_version = "" validate_nccl_dep_consistency("13.0")
for arch_version in CUDA_ARCHES: validate_nccl_dep_consistency("12.9")
validate_nccl_dep_consistency(arch_version) validate_nccl_dep_consistency("12.8")
del arch_version validate_nccl_dep_consistency("12.6")
if __name__ == "__main__":
# Used by tools/nightly.py
(SCRIPT_DIR / "nightly_source_matrix.json").write_text(
json.dumps(NIGHTLY_SOURCE_MATRIX, indent=4) + "\n"
)

View File

@ -1092,7 +1092,7 @@ class GitHubPR:
editor = node["editor"] editor = node["editor"]
return GitHubComment( return GitHubComment(
body_text=node["bodyText"], body_text=node["bodyText"],
created_at=node.get("createdAt", ""), created_at=node["createdAt"] if "createdAt" in node else "",
author_login=node["author"]["login"], author_login=node["author"]["login"],
author_url=node["author"].get("url", None), author_url=node["author"].get("url", None),
author_association=node["authorAssociation"], author_association=node["authorAssociation"],

View File

@ -26,8 +26,9 @@ name: !{{ build_environment }}
- name: Setup Python - name: Setup Python
uses: actions/setup-python@v6 uses: actions/setup-python@v6
with: with:
# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3 # .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "!{{ py_ver.strip('t') + ('.4' if '3.14' not in py_ver else '.0') }}" python-version: "!{{ (py_ver.strip('t') + '.4') if '3.14' not in py_ver else '3.14.0-rc.2' }}"
freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }} freethreaded: !{{ "true" if py_ver.endswith('t') else "false" }}
{%- endmacro %} {%- endmacro %}

View File

@ -79,9 +79,9 @@ jobs:
runs-on: "windows-11-arm64-preview" runs-on: "windows-11-arm64-preview"
{%- else %} {%- else %}
{%- if branches == "nightly" %} {%- if branches == "nightly" %}
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge" runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
{%- else %} {%- else %}
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge.nonephemeral" runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
{%- endif %} {%- endif %}
{%- endif %} {%- endif %}
timeout-minutes: !{{ common.timeout_minutes_windows_binary }} timeout-minutes: !{{ common.timeout_minutes_windows_binary }}

View File

@ -37,7 +37,7 @@ on:
runner: runner:
required: false required: false
type: string type: string
default: "linux.c7i.2xlarge" default: "linux.2xlarge"
description: | description: |
Label of the runner this job should run on. Label of the runner this job should run on.
test-matrix: test-matrix:

View File

@ -224,46 +224,6 @@ jobs:
continue-on-error: true continue-on-error: true
uses: ./.github/actions/download-td-artifacts uses: ./.github/actions/download-td-artifacts
- name: Download Windows torch wheel for cross-compilation
if: matrix.win_torch_wheel_artifact != ''
uses: seemethere/download-artifact-s3@1da556a7aa0a088e3153970611f6c432d58e80e6 # v4.2.0
with:
name: ${{ matrix.win_torch_wheel_artifact }}
path: win-torch-wheel
- name: Extract Windows wheel and setup CUDA libraries
if: matrix.win_torch_wheel_artifact != ''
shell: bash
run: |
set -x
# Find the wheel file
WHEEL_FILE=$(find win-torch-wheel -name "*.whl" -type f | head -n 1)
if [ -z "$WHEEL_FILE" ]; then
echo "Error: No wheel file found in win-torch-wheel directory"
exit 1
fi
echo "Found wheel file: $WHEEL_FILE"
# Unzip the wheel file
unzip -q "$WHEEL_FILE" -d win-torch-wheel-extracted
echo "Extracted wheel contents"
# Setup CUDA libraries (cuda.lib and cudart.lib) directory
mkdir -p win-torch-wheel-extracted/lib/x64
if [ -f "win-torch-wheel/cuda.lib" ]; then
mv win-torch-wheel/cuda.lib win-torch-wheel-extracted/lib/x64/
echo "Moved cuda.lib to win-torch-wheel-extracted/lib/x64/"
fi
if [ -f "win-torch-wheel/cudart.lib" ]; then
mv win-torch-wheel/cudart.lib win-torch-wheel-extracted/lib/x64/
echo "Moved cudart.lib to win-torch-wheel-extracted/lib/x64/"
fi
# Verify CUDA libraries are present
echo "CUDA libraries:"
ls -la win-torch-wheel-extracted/lib/x64/ || echo "No CUDA libraries found"
- name: Parse ref - name: Parse ref
id: parse-ref id: parse-ref
run: .github/scripts/parse_ref.py run: .github/scripts/parse_ref.py

View File

@ -168,31 +168,6 @@ jobs:
run: | run: |
.ci/pytorch/win-build.sh .ci/pytorch/win-build.sh
# Collect Windows torch libs and CUDA libs for cross-compilation
- name: Collect Windows CUDA libs for cross-compilation
if: steps.build.outcome != 'skipped' && inputs.cuda-version != 'cpu'
shell: bash
run: |
set -ex
# Create directory structure if does not exist
mkdir -p /c/${{ github.run_id }}/build-results
# Copy CUDA libs
CUDA_PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${{ inputs.cuda-version }}"
if [ -f "${CUDA_PATH}/lib/x64/cuda.lib" ]; then
cp "${CUDA_PATH}/lib/x64/cuda.lib" /c/${{ github.run_id }}/build-results/
fi
if [ -f "${CUDA_PATH}/lib/x64/cudart.lib" ]; then
cp "${CUDA_PATH}/lib/x64/cudart.lib" /c/${{ github.run_id }}/build-results/
fi
# List collected files
echo "Collected CUDA libs:"
ls -lah /c/${{ github.run_id }}/build-results/*.lib
# Upload to github so that people can click and download artifacts # Upload to github so that people can click and download artifacts
- name: Upload artifacts to s3 - name: Upload artifacts to s3
if: steps.build.outcome != 'skipped' if: steps.build.outcome != 'skipped'

View File

@ -38,10 +38,6 @@ on:
default: "" default: ""
description: | description: |
List of tests to include (empty string implies default list) List of tests to include (empty string implies default list)
dashboard-tag:
required: false
type: string
default: ""
disable-monitor: disable-monitor:
description: | description: |
[Experimental] Disable utilization monitoring for tests. [Experimental] Disable utilization monitoring for tests.
@ -62,11 +58,6 @@ on:
required: false required: false
type: number type: number
default: 1 default: 1
secrets:
HUGGING_FACE_HUB_TOKEN:
required: false
description: |
HF Auth token to avoid rate limits when downloading models or datasets from hub
permissions: permissions:
id-token: write id-token: write
contents: read contents: read
@ -205,8 +196,6 @@ jobs:
PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }}
PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }}
TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }} TESTS_TO_INCLUDE: ${{ inputs.tests-to-include }}
DASHBOARD_TAG: ${{ inputs.dashboard-tag }}
HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }}
run: | run: |
# Fetch aws credential from IMDs # Fetch aws credential from IMDs
@ -257,8 +246,6 @@ jobs:
-e PYTORCH_TEST_RERUN_DISABLED_TESTS \ -e PYTORCH_TEST_RERUN_DISABLED_TESTS \
-e TESTS_TO_INCLUDE \ -e TESTS_TO_INCLUDE \
-e ZE_AFFINITY_MASK \ -e ZE_AFFINITY_MASK \
-e HUGGING_FACE_HUB_TOKEN \
-e DASHBOARD_TAG \
--env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \
--ulimit stack=10485760:83886080 \ --ulimit stack=10485760:83886080 \
--ulimit core=0 \ --ulimit core=0 \

View File

@ -1,62 +0,0 @@
name: CI for distributed tests on B200
on:
pull_request:
paths:
- .github/workflows/b200-distributed.yml
workflow_dispatch:
push:
tags:
- ciflow/b200-distributed/*
schedule:
- cron: 46 8 * * * # about 1:46am PDT
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
if: github.repository_owner == 'pytorch'
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200:
name: linux-jammy-cuda12.8-py3.10-gcc11-build-distributed-b200
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runner: linux.12xlarge.memory
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc11
cuda-arch-list: '10.0'
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 2, runner: "linux.dgx.b200.8" },
{ config: "distributed", shard: 2, num_shards: 2, runner: "linux.dgx.b200.8" },
]}
secrets: inherit
linux-jammy-cuda12_8-py3_10-gcc11-test-distributed-b200:
name: linux-jammy-cuda12.8-py3.10-gcc11-test-b200
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200
with:
timeout-minutes: 1200
build-environment: linux-jammy-cuda12.8-py3.10-gcc11-distributed-b200
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build-distributed-b200.outputs.test-matrix }}
aws-role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
secrets: inherit

View File

@ -36,7 +36,7 @@ jobs:
runs-on: linux.9xlarge.ephemeral runs-on: linux.9xlarge.ephemeral
strategy: strategy:
matrix: matrix:
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm7.0", "rocm7.1", "cpu"] tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.4", "rocm7.0", "cpu"]
steps: steps:
- name: Build docker image - name: Build docker image
uses: pytorch/pytorch/.github/actions/binary-docker-build@main uses: pytorch/pytorch/.github/actions/binary-docker-build@main

View File

@ -52,8 +52,8 @@ jobs:
{ tag: "cuda12.9" }, { tag: "cuda12.9" },
{ tag: "cuda12.8" }, { tag: "cuda12.8" },
{ tag: "cuda12.6" }, { tag: "cuda12.6" },
{ tag: "rocm6.4" },
{ tag: "rocm7.0" }, { tag: "rocm7.0" },
{ tag: "rocm7.1" },
{ tag: "cpu" }, { tag: "cpu" },
] ]
steps: steps:

View File

@ -34,7 +34,7 @@ jobs:
id-token: write id-token: write
strategy: strategy:
matrix: matrix:
rocm_version: ["71", "70"] rocm_version: ["70", "64"]
steps: steps:
- name: Checkout PyTorch - name: Checkout PyTorch
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@ -54,8 +54,8 @@ jobs:
{ name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda12.9", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda12.8", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinuxaarch64-builder", tag: "cuda12.6", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm6.4", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "rocm7.0", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "rocm7.1", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "cpu", runner: "linux.9xlarge.ephemeral" },
{ name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" }, { name: "manylinux2_28_aarch64-builder", tag: "cpu-aarch64", runner: "linux.arm64.2xlarge.ephemeral" },
{ name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" }, { name: "manylinux2_28-builder", tag: "xpu", runner: "linux.9xlarge.ephemeral" },

View File

@ -55,7 +55,7 @@ jobs:
docker-image: ["pytorch/manylinux2_28-builder:cpu"] docker-image: ["pytorch/manylinux2_28-builder:cpu"]
include: include:
- device: "rocm" - device: "rocm"
rocm_version: "7.1" rocm_version: "7.0"
runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge" runs_on: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge"
- device: "cuda" - device: "cuda"
rocm_version: "" rocm_version: ""
@ -159,7 +159,12 @@ jobs:
WITH_CLANG_LDD="--with-clang-ldd" WITH_CLANG_LDD="--with-clang-ldd"
fi fi
docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD" if [[ "${BUILD_DEVICE}" == xpu ]]; then
docker exec -t "${container_name}" bash -c "dnf install -y gcc-toolset-13-gcc-c++"
docker exec -t "${container_name}" bash -c "source /opt/rh/gcc-toolset-13/enable && ${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE"
else
docker exec -t "${container_name}" bash -c "${PYTHON_EXECUTABLE} /pytorch/.github/scripts/build_triton_wheel.py --device=$BUILD_DEVICE $RELEASE $WITH_CLANG_LDD"
fi
if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "xpu") ]]; then if [[ ("${{ matrix.device }}" == "cuda" || "${{ matrix.device }}" == "xpu") ]]; then
docker exec -t "${container_name}" bash -c "auditwheel repair --plat ${PLATFORM} //artifacts/*.whl" docker exec -t "${container_name}" bash -c "auditwheel repair --plat ${PLATFORM} //artifacts/*.whl"

View File

@ -27,8 +27,9 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
python-version: [ '3.12' ] python-version: [ '3.12' ]
# TODO (huydhn): Add cu130 after https://github.com/vllm-project/vllm/issues/24464 is resolved
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ] platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129', 'cu130' ] device: [ 'cu128', 'cu129' ]
include: include:
- platform: manylinux_2_28_x86_64 - platform: manylinux_2_28_x86_64
device: cu128 device: cu128
@ -38,10 +39,6 @@ jobs:
device: cu129 device: cu129
manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9' manylinux-image: 'pytorch/manylinux2_28-builder:cuda12.9'
runner: linux.12xlarge.memory runner: linux.12xlarge.memory
- platform: manylinux_2_28_x86_64
device: cu130
manylinux-image: 'pytorch/manylinux2_28-builder:cuda13.0'
runner: linux.12xlarge.memory
- platform: manylinux_2_28_aarch64 - platform: manylinux_2_28_aarch64
device: cu128 device: cu128
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8' manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.8'
@ -50,11 +47,6 @@ jobs:
device: cu129 device: cu129
manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9' manylinux-image: 'pytorch/manylinuxaarch64-builder:cuda12.9'
runner: linux.arm64.r7g.12xlarge.memory runner: linux.arm64.r7g.12xlarge.memory
exclude:
# TODO (huydhn): Add cu130 aarch64 once PyTorch is on 2.9+ and
# xformers is update to support 13.0
- platform: manylinux_2_28_aarch64
device: cu130
name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}" name: "Build ${{ matrix.device }} vLLM wheel on ${{ matrix.platform }}"
runs-on: ${{ matrix.runner }} runs-on: ${{ matrix.runner }}
timeout-minutes: 480 timeout-minutes: 480
@ -177,12 +169,7 @@ jobs:
fail-fast: false fail-fast: false
matrix: matrix:
platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ] platform: [ 'manylinux_2_28_x86_64', 'manylinux_2_28_aarch64' ]
device: [ 'cu128', 'cu129', 'cu130' ] device: [ 'cu128', 'cu129' ]
exclude:
# TODO (huydhn): Add cu130 aarch64 once PyTorch is on 2.9+ and
# xformers is update to support 13.0
- platform: manylinux_2_28_aarch64
device: cu130
env: env:
PLATFORM: ${{ matrix.platform }} PLATFORM: ${{ matrix.platform }}
BUILD_DEVICE: ${{ matrix.device }} BUILD_DEVICE: ${{ matrix.device }}

View File

@ -57,7 +57,6 @@ jobs:
pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11, pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11,
pytorch-linux-jammy-py3.10-clang12, pytorch-linux-jammy-py3.10-clang12,
pytorch-linux-jammy-py3.13-clang12, pytorch-linux-jammy-py3.13-clang12,
pytorch-linux-jammy-py3.14-clang12,
pytorch-linux-jammy-rocm-n-py3, pytorch-linux-jammy-rocm-n-py3,
pytorch-linux-noble-rocm-n-py3, pytorch-linux-noble-rocm-n-py3,
pytorch-linux-jammy-rocm-n-py3-benchmarks, pytorch-linux-jammy-rocm-n-py3-benchmarks,
@ -67,7 +66,6 @@ jobs:
pytorch-linux-jammy-py3.12-halide, pytorch-linux-jammy-py3.12-halide,
pytorch-linux-jammy-xpu-n-1-py3, pytorch-linux-jammy-xpu-n-1-py3,
pytorch-linux-jammy-xpu-n-py3, pytorch-linux-jammy-xpu-n-py3,
pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks,
pytorch-linux-jammy-py3-clang18-asan, pytorch-linux-jammy-py3-clang18-asan,
pytorch-linux-jammy-py3-clang12-onnx, pytorch-linux-jammy-py3-clang12-onnx,
pytorch-linux-jammy-linter, pytorch-linux-jammy-linter,

View File

@@ -8,7 +8,6 @@ on:
    - docker.Makefile
    - .github/workflows/docker-release.yml
    - .github/scripts/generate_docker_release_matrix.py
-   - .github/scripts/generate_binary_build_matrix.py
  push:
    branches:
      - nightly

View File

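Each hunk below edits a single PYTORCH_EXTRA_INSTALL_REQUIREMENTS line: the nvidia-nvshmem pin moves between versions everywhere, the CUDA 13.0 rows change several pins at once, and the CUDA 12.9 rows append `and platform_machine == 'x86_64'` to every PEP 508 marker. Markers like these decide at install time whether a pinned dependency applies; below is a minimal sketch of evaluating one against explicit environments, assuming the `packaging` library is available:

from packaging.markers import Marker

marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")

# Evaluate against explicit environments instead of the running interpreter.
linux_x86 = {"platform_system": "Linux", "platform_machine": "x86_64"}
linux_arm = {"platform_system": "Linux", "platform_machine": "aarch64"}

print(marker.evaluate(linux_x86))  # True  -> the pin applies on x86_64 Linux
print(marker.evaluate(linux_arm))  # False -> the pin is skipped on aarch64

In other words, with the x86_64-qualified markers an aarch64 Linux install simply skips those CUDA pins rather than trying to resolve them.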
@@ -132,7 +132,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_10-cuda-aarch64-12_6
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -178,7 +178,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_10-cuda-aarch64-12_8
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -224,7 +224,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_10-cuda-aarch64-12_9
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -270,7 +270,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_10-cuda-aarch64-13_0
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -381,7 +381,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_11-cuda-aarch64-12_6
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -427,7 +427,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_11-cuda-aarch64-12_8
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -473,7 +473,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_11-cuda-aarch64-12_9
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -519,7 +519,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_11-cuda-aarch64-13_0
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -630,7 +630,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_12-cuda-aarch64-12_6
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -676,7 +676,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_12-cuda-aarch64-12_8
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -722,7 +722,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_12-cuda-aarch64-12_9
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -768,7 +768,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_12-cuda-aarch64-13_0
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -879,7 +879,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_13-cuda-aarch64-12_6
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -925,7 +925,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_13-cuda-aarch64-12_8
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -971,7 +971,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_13-cuda-aarch64-12_9
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1017,7 +1017,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_13-cuda-aarch64-13_0
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1128,7 +1128,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_13t-cuda-aarch64-12_6
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1174,7 +1174,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_13t-cuda-aarch64-12_8
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1220,7 +1220,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_13t-cuda-aarch64-12_9
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1266,7 +1266,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_13t-cuda-aarch64-13_0
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1377,7 +1377,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_14-cuda-aarch64-12_6
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1423,7 +1423,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_14-cuda-aarch64-12_8
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1469,7 +1469,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_14-cuda-aarch64-12_9
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -1515,7 +1515,7 @@ jobs:
    ALPINE_IMAGE: "arm64v8/alpine"
    build_name: manywheel-py3_14-cuda-aarch64-13_0
    build_environment: linux-aarch64-binary-manywheel
-   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+   PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
    timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1626,7 +1626,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_6
build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1672,7 +1672,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_8
build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.3.24; platform_system == 'Linux' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1718,7 +1718,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' | nvidia-nvshmem-cu12==3.4.5; platform_system == 'Linux' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -1764,7 +1764,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-13_0
build_environment: linux-aarch64-binary-manywheel
-PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.88; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.96; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.85; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.1.0.3; platform_system == 'Linux' | nvidia-cufft==12.0.0.61; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.4.66; platform_system == 'Linux' | nvidia-cusparse==12.6.3.3; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.4.5; platform_system == 'Linux' | nvidia-nvtx==13.0.85; platform_system == 'Linux' | nvidia-nvjitlink==13.0.88; platform_system == 'Linux' | nvidia-cufile==1.15.1.6; platform_system == 'Linux'
+PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc==13.0.48; platform_system == 'Linux' | nvidia-cuda-runtime==13.0.48; platform_system == 'Linux' | nvidia-cuda-cupti==13.0.48; platform_system == 'Linux' | nvidia-cudnn-cu13==9.13.0.50; platform_system == 'Linux' | nvidia-cublas==13.0.0.19; platform_system == 'Linux' | nvidia-cufft==12.0.0.15; platform_system == 'Linux' | nvidia-curand==10.4.0.35; platform_system == 'Linux' | nvidia-cusolver==12.0.3.29; platform_system == 'Linux' | nvidia-cusparse==12.6.2.49; platform_system == 'Linux' | nvidia-cusparselt-cu13==0.8.0; platform_system == 'Linux' | nvidia-nccl-cu13==2.27.7; platform_system == 'Linux' | nvidia-nvshmem-cu13==3.3.24; platform_system == 'Linux' | nvidia-nvtx==13.0.39; platform_system == 'Linux' | nvidia-nvjitlink==13.0.39; platform_system == 'Linux' | nvidia-cufile==1.15.0.42; platform_system == 'Linux'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
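Note: the PYTORCH_EXTRA_INSTALL_REQUIREMENTS values above are pipe-separated PEP 508 requirement strings; the text after each `;` is an environment marker that pip evaluates at install time, so the `and platform_machine == 'x86_64'` guard keeps those CUDA wheels from being selected on non-x86_64 hosts. A minimal sketch of how such a marker evaluates, assuming the third-party `packaging` library (which pip vendors) is installed:

```python
# Minimal sketch: evaluate a PEP 508 environment marker the way pip does.
# Assumes the third-party "packaging" library is installed.
from packaging.markers import Marker

req = "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64'"
_, marker_text = req.split(";", 1)
marker = Marker(marker_text.strip())

# An explicit environment dict overrides the current interpreter's values,
# which lets us check what pip would decide on other platforms.
print(marker.evaluate({"platform_system": "Linux", "platform_machine": "x86_64"}))   # True
print(marker.evaluate({"platform_system": "Linux", "platform_machine": "aarch64"}))  # False
```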


@ -384,6 +384,124 @@ jobs:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm6_4-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: "6.4"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm6_4-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm6_4-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm6_4-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: "6.4"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm6_4-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm6.4
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm6_4-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm6_4-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm6.4
GPU_ARCH_VERSION: "6.4"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm6.4
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm6_4-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
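The GPU_FLAG value exported by the "ROCm set GPU_FLAG" step above is a set of `docker run` arguments that pass the ROCm device nodes and group memberships through to the test container. A hedged sketch of how those flags would be spliced into a container invocation (the image and command here are placeholders, not taken from the workflow):

```python
import shlex

# The flag string as exported by the workflow step above.
gpu_flag = ("--device=/dev/mem --device=/dev/kfd --device=/dev/dri "
            "--group-add video --group-add daemon")

# Hypothetical invocation: splice the flags into a docker run command line.
cmd = ["docker", "run", "--rm", *shlex.split(gpu_flag), "ubuntu:22.04", "rocminfo"]
print(shlex.join(cmd))
# subprocess.run(cmd, check=True) would require a ROCm host with Docker.
```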
libtorch-rocm7_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
@ -501,121 +619,3 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-rocm7_1-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.1
GPU_ARCH_VERSION: "7.1"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
timeout-minutes: 300
build_name: libtorch-rocm7_1-shared-with-deps-release
build_environment: linux-binary-libtorch
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
libtorch-rocm7_1-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-rocm7_1-shared-with-deps-release-build
- get-label-type
runs-on: linux.rocm.gpu.mi250
timeout-minutes: 240
env:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.1
GPU_ARCH_VERSION: "7.1"
GPU_ARCH_TYPE: rocm
SKIP_ALL_TESTS: 1
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
permissions:
id-token: write
contents: read
steps:
- name: Setup ROCm
uses: ./.github/actions/setup-rocm
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-rocm7_1-shared-with-deps-release
path: "${{ runner.temp }}/artifacts/"
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: ROCm set GPU_FLAG
run: |
echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
- name: configure aws credentials
id: aws_creds
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') }}
uses: aws-actions/configure-aws-credentials@v4
with:
role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only
aws-region: us-east-1
role-duration-seconds: 18000
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
with:
docker-registry: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/') && '308535385114.dkr.ecr.us-east-1.amazonaws.com' || 'docker.io' }}
docker-image-name: libtorch-cxx11-builder
custom-tag-prefix: rocm7.1
docker-build-dir: .ci/docker
working-directory: pytorch
- name: Pull Docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Test Pytorch binary
uses: ./pytorch/.github/actions/test-pytorch-binary
env:
DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
- name: Teardown ROCm
uses: ./.github/actions/teardown-rocm
libtorch-rocm7_1-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-rocm7_1-shared-with-deps-release-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: rocm7.1
GPU_ARCH_VERSION: "7.1"
GPU_ARCH_TYPE: rocm
DOCKER_IMAGE: libtorch-cxx11-builder
DOCKER_IMAGE_TAG_PREFIX: rocm7.1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
build_name: libtorch-rocm7_1-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
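Job trios like the rocm6.4 block added above and the rocm7.1 block removed here follow one template per GPU arch version; in the PyTorch repo such binary workflows are regenerated by a script rather than edited by hand. A rough, hypothetical sketch of the fan-out, mirroring only the naming seen in this diff (not the repo's actual generator):

```python
# Hypothetical sketch: expand ROCm versions into the build/test/upload job
# names seen in this diff. The real generator is template-driven; this only
# reproduces the naming convention.
ROCM_VERSIONS = ["6.4", "7.0", "7.1"]  # illustrative input

def libtorch_jobs(gpu_arch_version: str) -> list[str]:
    tag = "rocm" + gpu_arch_version.replace(".", "_")  # e.g. "rocm6_4"
    base = f"libtorch-{tag}-shared-with-deps-release"
    return [f"{base}-build", f"{base}-test", f"{base}-upload"]

for version in ROCM_VERSIONS:
    print(libtorch_jobs(version))
```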

File diff suppressed because it is too large


@ -63,6 +63,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
+# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false


@ -59,6 +59,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
+# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.10.4"
freethreaded: false
@ -168,6 +169,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
+# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.11.4"
freethreaded: false
@ -277,6 +279,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
+# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.12.4"
freethreaded: false
@ -386,6 +389,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
+# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: false
@ -495,6 +499,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
+# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
python-version: "3.13.4"
freethreaded: true
@ -604,8 +609,9 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
+# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
-python-version: "3.14.0"
+python-version: "3.14.0-rc.2"
freethreaded: false
- name: Checkout PyTorch
uses: actions/checkout@v4
@ -713,8 +719,9 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v6
with:
+# TODO: Removeme once 3.14 is out
# .4 version is min minor for 3.10, and also no-gil version of 3.13 needs at least 3.13.3
-python-version: "3.14.0"
+python-version: "3.14.0-rc.2"
freethreaded: true
- name: Checkout PyTorch
uses: actions/checkout@v4
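The `freethreaded:` input in the hunks above selects CPython's no-GIL build of the requested version. A small check, under the assumption that a later step wants to verify which build it actually got (`sys._is_gil_enabled()` is a private API added in 3.13, hence the guard):

```python
import sys
import sysconfig

# Py_GIL_DISABLED is 1 only in free-threaded ("t") CPython builds.
print("free-threaded build:", bool(sysconfig.get_config_var("Py_GIL_DISABLED")))

# On 3.13+ the GIL can still be re-enabled at runtime, e.g. by an
# incompatible extension module, so report the live state where available.
if hasattr(sys, "_is_gil_enabled"):
    print("GIL currently enabled:", sys._is_gil_enabled())
```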


@ -44,7 +44,7 @@ jobs:
libtorch-cpu-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
-runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
+runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -291,7 +291,7 @@ jobs:
libtorch-cuda12_6-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
-runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
+runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -541,7 +541,7 @@ jobs:
libtorch-cuda12_8-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
-runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
+runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -788,10 +788,260 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda12_9-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda12_9-shared-with-deps-debug
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_9-shared-with-deps-debug-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda12_9-shared-with-deps-debug-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda12_9-shared-with-deps-debug
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_9-shared-with-deps-debug-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda12_9-shared-with-deps-debug-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: debug
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.10"
build_name: libtorch-cuda12_9-shared-with-deps-debug
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
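The "Display EC2 information" steps in the jobs above implement the IMDSv2 handshake: a PUT request mints a short-lived session token, and every metadata read then presents that token in a header. The same two calls in stdlib Python, as a sketch of the protocol rather than CI code (it only works from inside an EC2 instance):

```python
import urllib.request

# Step 1: mint a short-lived IMDSv2 session token via PUT with a TTL header.
token_request = urllib.request.Request(
    "http://169.254.169.254/latest/api/token",
    method="PUT",
    headers={"X-aws-ec2-metadata-token-ttl-seconds": "30"},
)
token = urllib.request.urlopen(token_request, timeout=2).read().decode()

# Step 2: present the token on each metadata read.
def get_ec2_metadata(category: str) -> str:
    request = urllib.request.Request(
        f"http://169.254.169.254/latest/meta-data/{category}",
        headers={"X-aws-ec2-metadata-token": token},
    )
    return urllib.request.urlopen(request, timeout=2).read().decode()

print("instance-type:", get_ec2_metadata("instance-type"))
```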
libtorch-cuda13_0-shared-with-deps-debug-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
-runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
+runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch


@ -44,7 +44,7 @@ jobs:
libtorch-cpu-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
-runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
+runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -291,7 +291,7 @@ jobs:
libtorch-cuda12_6-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
-runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
+runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -541,7 +541,7 @@ jobs:
libtorch-cuda12_8-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
-runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
+runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
@ -788,10 +788,260 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
libtorch-cuda12_9-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.10"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh"
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: libtorch-cuda12_9-shared-with-deps-release
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_9-shared-with-deps-release-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- libtorch-cuda12_9-shared-with-deps-release-build
- get-label-type
runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.g4dn.xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
SKIP_ALL_TESTS: 1
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.10"
steps:
- name: Display EC2 information
shell: bash
run: |
set -euo pipefail
function get_ec2_metadata() {
# Pulled from instance metadata endpoint for EC2
# see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html
category=$1
curl -H "X-aws-ec2-metadata-token: $(curl -s -X PUT "http://169.254.169.254/latest/api/token" -H "X-aws-ec2-metadata-token-ttl-seconds: 30")" -fsSL "http://169.254.169.254/latest/meta-data/${category}"
}
echo "ami-id: $(get_ec2_metadata ami-id)"
echo "instance-id: $(get_ec2_metadata instance-id)"
echo "instance-type: $(get_ec2_metadata instance-type)"
echo "system info $(uname -a)"
- name: "[FB EMPLOYEES] Enable SSH (Click me for login details)"
uses: pytorch/test-infra/.github/actions/setup-ssh@main
continue-on-error: true
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
- name: Enable git long paths and symlinks on Windows and disable fsmonitor daemon
shell: bash
run: |
git config --global core.longpaths true
git config --global core.symlinks true
# https://git-scm.com/docs/git-fsmonitor--daemon. The daemon could lock
# the directory on Windows and prevent GHA from checking out as reported
# in https://github.com/actions/checkout/issues/1018
git config --global core.fsmonitor false
# Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560
- name: Enable long paths on Windows
shell: powershell
run: |
Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1
# Since it's just a defensive command, the workflow should continue even the command fails. This step can be
# removed once Windows Defender is removed from the AMI
- name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch
continue-on-error: true
shell: powershell
run: |
Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore
# Let's both exclude the path and disable Windows Defender completely just to be sure
# that it doesn't interfere
Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}"
- uses: actions/download-artifact@v4.1.7
name: Download Build Artifacts
with:
name: libtorch-cuda12_9-shared-with-deps-release
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
- name: Populate binary env
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Test PyTorch binary
shell: bash
run: |
"${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh"
- name: Wait until all sessions have drained
shell: powershell
working-directory: pytorch
if: always()
timeout-minutes: 120
run: |
.github\scripts\wait_for_ssh_to_drain.ps1
- name: Kill active ssh sessions if still around (Useful if workflow was cancelled)
shell: powershell
working-directory: pytorch
if: always()
run: |
.github\scripts\kill_active_ssh_sessions.ps1
libtorch-cuda12_9-shared-with-deps-release-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: libtorch-cuda12_9-shared-with-deps-release-test
with:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: libtorch
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: "12.9"
GPU_ARCH_TYPE: cuda
LIBTORCH_CONFIG: release
LIBTORCH_VARIANT: shared-with-deps
# This is a dummy value for libtorch to work correctly with our batch scripts
# without this value pip does not get installed for some reason
DESIRED_PYTHON: "3.10"
build_name: libtorch-cuda12_9-shared-with-deps-release
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
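The repeated "Populate binary env" steps above work through GitHub Actions environment files: appending NAME=value lines to the file named by $GITHUB_ENV exports those variables to later steps of the same job, which is why an echo without the >> "${GITHUB_ENV}" redirect would only print to the log. The same append from Python, as a sketch:

```python
import os

# GITHUB_ENV names a per-job file; NAME=value lines appended to it become
# environment variables for all subsequent steps of the job.
env_file = os.environ.get("GITHUB_ENV")  # unset outside GitHub Actions
if env_file:
    runner_temp = os.environ.get("RUNNER_TEMP", "/tmp")
    with open(env_file, "a", encoding="utf-8") as fh:
        fh.write(f"PYTORCH_FINAL_PACKAGE_DIR={runner_temp}/artifacts\n")
```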
libtorch-cuda13_0-shared-with-deps-release-build:
if: ${{ github.repository_owner == 'pytorch' }}
needs: get-label-type
-runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.12xlarge"
+runs-on: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge"
timeout-minutes: 360
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch

File diff suppressed because it is too large


@ -1,132 +0,0 @@
name: inductor-perf-nightly-rocm-mi300
on:
push:
tags:
- ciflow/inductor-perf-test-nightly-rocm-mi300/*
schedule:
- cron: 15 0 * * *
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
inputs:
training:
description: Run training (on by default)?
required: false
type: boolean
default: true
inference:
description: Run inference (on by default)?
required: false
type: boolean
default: true
default:
description: Run inductor_default?
required: false
type: boolean
default: false
dynamic:
description: Run inductor_dynamic_shapes?
required: false
type: boolean
default: false
cppwrapper:
description: Run inductor_cpp_wrapper?
required: false
type: boolean
default: false
cudagraphs:
description: Run inductor_cudagraphs?
required: false
type: boolean
default: true
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
type: boolean
default: false
aotinductor:
description: Run aot_inductor for inference?
required: false
type: boolean
default: false
maxautotune:
description: Run inductor_max_autotune?
required: false
type: boolean
default: false
benchmark_configs:
description: The list of configs used the benchmark
required: false
type: string
default: inductor_huggingface_perf_rocm_mi300,inductor_timm_perf_rocm_mi300,inductor_torchbench_perf_rocm_mi300
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
linux-jammy-rocm-py3_10-inductor-benchmark-build:
if: github.repository_owner == 'pytorch'
name: rocm-py3_10-inductor-benchmark-build
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-jammy-rocm-py3_10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi300", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 1, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 2, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 3, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 4, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 5, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 6, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi300", shard: 7, num_shards: 7, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 2, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi300", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.gfx942.1" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-inductor-benchmark-test:
permissions:
id-token: write
contents: read
name: rocm-py3_10-inductor-benchmark-test
uses: ./.github/workflows/_rocm-test.yml
needs: linux-jammy-rocm-py3_10-inductor-benchmark-build
with:
build-environment: linux-jammy-rocm-py3_10
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-cppwrapper-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true
docker-image: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-inductor-benchmark-build.outputs.test-matrix }}
timeout-minutes: 720
# Disable monitor in perf tests for more investigation
disable-monitor: true
monitor-log-interval: 10
monitor-data-collect-interval: 2
secrets: inherit
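The deleted mi300 workflow above lists every shard of its test-matrix by hand; each entry is just a config name, a 1-based shard index, the shard count, and a fixed runner label. A hypothetical helper that would emit the same JSON (the names are copied from the workflow; the function itself is not part of the repo):

```python
import json

# Hypothetical helper: rebuild the {config, shard, num_shards, runner}
# entries that the removed workflow spelled out one line at a time.
def shard_entries(config: str, num_shards: int, runner: str) -> list[dict]:
    return [
        {"config": config, "shard": shard, "num_shards": num_shards, "runner": runner}
        for shard in range(1, num_shards + 1)
    ]

RUNNER = "linux.rocm.gpu.gfx942.1"
matrix = {"include": (
    shard_entries("inductor_huggingface_perf_rocm_mi300", 5, RUNNER)
    + shard_entries("inductor_timm_perf_rocm_mi300", 7, RUNNER)
    + shard_entries("inductor_torchbench_perf_rocm_mi300", 9, RUNNER)
)}
print(json.dumps(matrix, indent=2))
```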


@ -1,11 +1,11 @@
-name: inductor-perf-nightly-rocm-mi355
+name: inductor-perf-nightly-rocm
on:
push:
tags:
-- ciflow/inductor-perf-test-nightly-rocm-mi355/*
+- ciflow/inductor-perf-test-nightly-rocm/*
schedule:
-- cron: 15 0 * * *
+- cron: 0 7 * * 0,3
# NB: GitHub has an upper limit of 10 inputs here, so before we can sort it
# out, let try to run torchao cudagraphs_low_precision as part of cudagraphs
workflow_dispatch:
@ -59,7 +59,7 @@ on:
description: The list of configs used the benchmark
required: false
type: string
-default: inductor_huggingface_perf_rocm_mi355,inductor_timm_perf_rocm_mi355,inductor_torchbench_perf_rocm_mi355
+default: inductor_huggingface_perf_rocm,inductor_timm_perf_rocm,inductor_torchbench_perf_rocm
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@ -88,27 +88,23 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
test-matrix: | test-matrix: |
{ include: [ { include: [
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_huggingface_perf_rocm", shard: 1, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_huggingface_perf_rocm", shard: 2, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_huggingface_perf_rocm", shard: 3, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_huggingface_perf_rocm", shard: 4, num_shards: 4, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_huggingface_perf_rocm_mi355", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_timm_perf_rocm", shard: 1, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 1, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_timm_perf_rocm", shard: 2, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 2, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_timm_perf_rocm", shard: 3, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 3, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_timm_perf_rocm", shard: 4, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 4, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_timm_perf_rocm", shard: 5, num_shards: 5, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 5, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_torchbench_perf_rocm", shard: 1, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 6, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_torchbench_perf_rocm", shard: 2, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_timm_perf_rocm_mi355", shard: 7, num_shards: 7, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_torchbench_perf_rocm", shard: 3, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 1, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_torchbench_perf_rocm", shard: 4, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 2, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_torchbench_perf_rocm", shard: 5, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 3, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_torchbench_perf_rocm", shard: 6, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 4, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_torchbench_perf_rocm", shard: 7, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 5, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" }, { config: "inductor_torchbench_perf_rocm", shard: 8, num_shards: 8, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 6, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 7, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 8, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
{ config: "inductor_torchbench_perf_rocm_mi355", shard: 9, num_shards: 9, runner: "linux.rocm.gpu.mi355.1" },
]} ]}
secrets: inherit secrets: inherit
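The shard/num_shards pairs above split one benchmark config across several runners. A minimal sketch of the usual round-robin semantics, assuming tests are assigned by index (the real partitioning lives in PyTorch's test tooling, not in the workflow):

```bash
# Hypothetical round-robin sharding: shard k of n takes every n-th test,
# offset by its 1-based index. Test names below are illustrative only.
shard=2
num_shards=5
tests=(test_a test_b test_c test_d test_e test_f test_g)
for i in "${!tests[@]}"; do
  if (( i % num_shards == shard - 1 )); then
    echo "shard ${shard} runs: ${tests[$i]}"
  fi
done
```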

View File

@ -1,148 +0,0 @@
name: inductor-perf-nightly-xpu
on:
push:
tags:
- ciflow/inductor-perf-test-nightly-xpu/*
schedule:
- cron: 30 17 * * *
workflow_dispatch:
inputs:
training:
description: Run training (on by default)?
required: false
type: boolean
default: true
inference:
description: Run inference (on by default)?
required: false
type: boolean
default: true
default:
description: Run inductor_default?
required: false
type: boolean
default: false
dynamic:
description: Run inductor_dynamic_shapes?
required: false
type: boolean
default: false
cppwrapper:
description: Run inductor_cpp_wrapper?
required: false
type: boolean
default: false
cudagraphs:
description: Run inductor_cudagraphs?
required: false
type: boolean
default: false
freezing_cudagraphs:
description: Run inductor_cudagraphs with freezing for inference?
required: false
type: boolean
default: false
aotinductor:
description: Run aot_inductor for inference?
required: false
type: boolean
default: false
maxautotune:
description: Run inductor_max_autotune?
required: false
type: boolean
default: false
benchmark_configs:
description: The list of configs used the benchmark
required: false
type: string
default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf,cachebench
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
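This concurrency expression, repeated across the workflows in this diff, deduplicates runs: PR runs share a key per PR number (so a newer push cancels the stale run), while branch pushes mix in the commit SHA so each push keeps its own group. A rough shell analogue with assumed stand-in values:

```bash
# Assumed stand-ins for the github.* context values used in the key.
workflow="inductor-perf-nightly-xpu"   # github.workflow
pr_number="12345"                      # github.event.pull_request.number (empty outside PRs)
ref_name="main"                        # github.ref_name fallback
sha_part=""                            # github.sha is mixed in only for branch pushes
group="${workflow}-${pr_number:-$ref_name}-${sha_part}"
echo "concurrency group: ${group}"
```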
permissions: read-all
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
xpu-n-py3_10-inductor-benchmark-build:
name: xpu-n-py3.10-inductor-benchmark
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3-inductor-benchmarks
runner: linux.c7i.12xlarge
test-matrix: |
{ include: [
{ config: "inductor_huggingface_perf_xpu", shard: 1, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_huggingface_perf_xpu", shard: 2, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_huggingface_perf_xpu", shard: 3, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_huggingface_perf_xpu", shard: 4, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_huggingface_perf_xpu", shard: 5, num_shards: 5, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_timm_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 1, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 2, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 3, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 4, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 5, num_shards: 6, runner: "linux.idc.xpu" },
{ config: "inductor_torchbench_perf_xpu", shard: 6, num_shards: 6, runner: "linux.idc.xpu" },
]}
secrets: inherit
xpu-n-py3_10-inductor-benchmark-test-nightly:
permissions:
id-token: write
contents: read
if: github.event_name != 'workflow_dispatch'
name: xpu-n-py3.10-inductor-benchmark
uses: ./.github/workflows/_xpu-test.yml
needs: xpu-n-py3_10-inductor-benchmark-build
with:
build-environment: linux-jammy-xpu-n-py3.10
dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-false-cppwrapper-true-aotinductor-true-freezing_cudagraphs-false-cudagraphs_low_precision-false
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
timeout-minutes: 720
# Disable monitor in perf tests for more investigation
disable-monitor: true
monitor-log-interval: 10
monitor-data-collect-interval: 2
secrets: inherit
xpu-n-py3_10-inductor-benchmark-test:
permissions:
id-token: write
contents: read
if: github.event_name == 'workflow_dispatch'
name: xpu-n-py3.10-inductor-test
uses: ./.github/workflows/_xpu-test.yml
needs: xpu-n-py3_10-inductor-benchmark-build
with:
build-environment: linux-jammy-xpu-n-py3.10
dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }}
docker-image: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.xpu-n-py3_10-inductor-benchmark-build.outputs.test-matrix }}
timeout-minutes: 720
disable-monitor: false
monitor-log-interval: 15
monitor-data-collect-interval: 4
secrets: inherit

View File

@ -88,6 +88,7 @@ jobs:
with: with:
build-environment: linux-jammy-rocm-py3_10 build-environment: linux-jammy-rocm-py3_10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3-benchmarks
sync-tag: rocm-build
test-matrix: | test-matrix: |
{ include: [ { include: [
{ config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" }, { config: "dynamo_eager_torchbench", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },

View File

@ -1,10 +1,9 @@
name: inductor-rocm name: inductor-rocm
on: on:
schedule:
- cron: 0 * * * *
push: push:
branches: branches:
- main
- release/* - release/*
tags: tags:
- ciflow/inductor-rocm/* - ciflow/inductor-rocm/*

View File

@ -115,10 +115,10 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: | test-matrix: |
{ include: [ { include: [
{ config: "inductor_amx", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, { config: "inductor_amx", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "inductor_amx", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, { config: "inductor_amx", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "inductor_avx2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.avx2" }, { config: "inductor_avx2", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" },
{ config: "inductor_avx2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.avx2" }, { config: "inductor_avx2", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.10xlarge.avx2" },
]} ]}
secrets: inherit secrets: inherit

View File

@ -84,13 +84,13 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
test-matrix: | test-matrix: |
{ include: [ { include: [
{ config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, { config: "cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, { config: "cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, { config: "dynamic_cpu_inductor_huggingface", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, { config: "dynamic_cpu_inductor_timm", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, { config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge.amx" }, { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.8xlarge.amx" },
{ config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.24xl.spr-metal" }, { config: "inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.24xl.spr-metal" },
]} ]}
build-additional-packages: "vision audio torchao" build-additional-packages: "vision audio torchao"

View File

@ -76,12 +76,11 @@ jobs:
# NOTE: mypy needs its own job because it depends on --all-files, without assessing all files it sometimes # NOTE: mypy needs its own job because it depends on --all-files, without assessing all files it sometimes
# fails to find types when it should # fails to find types when it should
# NOTE: We should be able to disable this and consolidate with Pyrefly lintrunner-mypy:
lintrunner-pyrefly:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
name: lintrunner-pyrefly-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }} name: lintrunner-mypy-${{ needs.get-changed-files.outputs.changed-files == '*' && 'all' || 'partial' }}
needs: [get-label-type, get-changed-files] needs: [get-label-type, get-changed-files]
# Only run if there are changed files relevant to pyrefly # Only run if there are changed files relevant to mypy
if: | if: |
github.repository_owner == 'pytorch' && ( github.repository_owner == 'pytorch' && (
needs.get-changed-files.outputs.changed-files == '*' || needs.get-changed-files.outputs.changed-files == '*' ||
@ -99,8 +98,8 @@ jobs:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
script: | script: |
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}" CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running pyrefly" echo "Running mypy"
ADDITIONAL_LINTRUNNER_ARGS="--take PYREFLY --all-files" .github/scripts/lintrunner.sh ADDITIONAL_LINTRUNNER_ARGS="--take MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
lintrunner-noclang: lintrunner-noclang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -119,9 +118,9 @@ jobs:
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}" CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running all other linters" echo "Running all other linters"
if [ "$CHANGED_FILES" = '*' ]; then if [ "$CHANGED_FILES" = '*' ]; then
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,PYREFLY --all-files" .github/scripts/lintrunner.sh ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
else else
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,PYREFLY ${CHANGED_FILES}" .github/scripts/lintrunner.sh ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh
fi fi
quick-checks: quick-checks:

View File

@ -41,7 +41,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml uses: ./.github/workflows/_linux-build.yml
needs: get-label-type needs: get-label-type
with: with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge"
build-environment: linux-jammy-py3.10-gcc11 build-environment: linux-jammy-py3.10-gcc11
docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11 docker-image-name: ci-image:pytorch-linux-jammy-py3.10-gcc11
secrets: inherit secrets: inherit

View File

@ -7,11 +7,9 @@ on:
workflow_dispatch: workflow_dispatch:
inputs: inputs:
test_mode: test_mode:
type: choice required: false
options: type: string
- 'short' default: 'short'
- 'long'
- 'all'
description: tag filter for operator benchmarks, options from long, short, all description: tag filter for operator benchmarks, options from long, short, all
schedule: schedule:
# Run at 07:00 UTC every Sunday # Run at 07:00 UTC every Sunday
@ -30,49 +28,38 @@ permissions:
contents: read contents: read
jobs: jobs:
x86-opbenchmark-build: opbenchmark-build:
if: github.repository_owner == 'pytorch' if: github.repository_owner == 'pytorch'
name: x86-opbenchmark-build name: opbenchmark-build
uses: ./.github/workflows/_linux-build.yml uses: ./.github/workflows/_linux-build.yml
with: with:
build-environment: linux-jammy-py3.10-gcc11-build build-environment: linux-jammy-py3.10-gcc11-build
docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
test-matrix: | test-matrix: |
{ include: [ { include: [
{ config: "cpu_operator_benchmark_${{ inputs.test_mode || 'short' }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, { config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]} ]}
secrets: inherit secrets: inherit
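The `${{ inputs.test_mode || 'short' }}` expression above lets one build job serve both triggers: on `workflow_dispatch` the chosen input is used, and on the Sunday schedule (where `inputs` is empty) it falls back to `short`. A shell analogue, with a hypothetical `TEST_MODE` variable standing in for the dispatch input:

```bash
# Scheduled runs have no dispatch input, so the default applies.
test_mode="${TEST_MODE:-short}"
echo "config: cpu_operator_benchmark_${test_mode}"
```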
x86-opbenchmark-test: opbenchmark-on-demand-build:
name: x86-opbenchmark-test if: ${{ github.event_name == 'workflow_dispatch' && github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_linux-test.yml name: opbenchmark-on-demand-build
needs: x86-opbenchmark-build
with:
build-environment: linux-jammy-py3.10-gcc11-build
docker-image: ${{ needs.x86-opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.x86-opbenchmark-build.outputs.test-matrix }}
secrets: inherit
aarch64-opbenchmark-build:
if: github.repository_owner == 'pytorch'
name: aarch64-opbenchmark-build
uses: ./.github/workflows/_linux-build.yml uses: ./.github/workflows/_linux-build.yml
with: with:
build-environment: linux-jammy-aarch64-py3.10 build-environment: linux-jammy-py3.10-gcc11-build
runner: linux.arm64.m7g.4xlarge docker-image-name: ci-image:pytorch-linux-jammy-py3-gcc11-inductor-benchmarks
docker-image-name: ci-image:pytorch-linux-jammy-aarch64-py3.10-gcc11
test-matrix: | test-matrix: |
{ include: [ { include: [
{ config: "cpu_operator_benchmark_short", shard: 1, num_shards: 1, runner: "linux.arm64.m8g.4xlarge" }, { config: "cpu_operator_benchmark_${{ inputs.test_mode }}", shard: 1, num_shards: 1, runner: "linux.12xlarge" },
]} ]}
secrets: inherit secrets: inherit
aarch64-opbenchmark-test: opbenchmark-test:
name: aarch64-opbenchmark-test name: opbenchmark-test
uses: ./.github/workflows/_linux-test.yml uses: ./.github/workflows/_linux-test.yml
needs: aarch64-opbenchmark-build needs: opbenchmark-build
with: with:
build-environment: linux-jammy-aarch64-py3.10 build-environment: linux-jammy-py3.10-gcc11-build
docker-image: ${{ needs.aarch64-opbenchmark-build.outputs.docker-image }} docker-image: ${{ needs.opbenchmark-build.outputs.docker-image }}
test-matrix: ${{ needs.aarch64-opbenchmark-build.outputs.test-matrix }} test-matrix: ${{ needs.opbenchmark-build.outputs.test-matrix }}
secrets: inherit secrets: inherit

View File

@ -1,84 +0,0 @@
name: periodic-rocm-mi200
on:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 0,8,16 * * 1-5
- cron: 45 4 * * 0,6
- cron: 45 4,12,20 * * 1-5
- cron: 45 12 * * 0,6
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
push:
tags:
- ciflow/periodic/*
- ciflow/periodic-rocm-mi200/*
branches:
- release/*
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
cancel-in-progress: true
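The comment on the cron list above refers to a common gating pattern: a step inspects `github.event.schedule` to run extra work only on certain crons. A sketch, assuming the cron string is exported into the step (e.g. via `env: SCHEDULE: ${{ github.event.schedule }}`):

```bash
# Branch on which cron fired this run (SCHEDULE is an assumed env var).
if [ "$SCHEDULE" = "29 8 * * *" ]; then
  echo "mem leak check / rerun-disabled-tests run"
else
  echo "regular periodic run"
fi
```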
permissions:
id-token: write
contents: read
jobs:
llm-td:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/llm_td_retrieval.yml
permissions:
id-token: write
contents: read
target-determination:
name: before-test
uses: ./.github/workflows/target_determination.yml
needs: llm-td
permissions:
id-token: write
contents: read
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch'
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.4", owners: ["module:rocm", "oncall:distributed"] },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit

View File

@ -147,16 +147,15 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug build-environment: linux-jammy-cuda12.8-py3.10-gcc9-debug
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9 docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3-gcc9
cuda-arch-list: 8.9
test-matrix: | test-matrix: |
{ include: [ { include: [
{ config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, { config: "default", shard: 1, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },
{ config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, { config: "default", shard: 2, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },
{ config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, { config: "default", shard: 3, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },
{ config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, { config: "default", shard: 4, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },
{ config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, { config: "default", shard: 5, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },
{ config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, { config: "default", shard: 6, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },
{ config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", owners: ["oncall:debug-build"] }, { config: "default", shard: 7, num_shards: 7, runner: "${{ needs.get-label-type.outputs.label-type }}linux.4xlarge.nvidia.gpu", owners: ["oncall:debug-build"] },
]} ]}
secrets: inherit secrets: inherit
@ -204,6 +203,37 @@ jobs:
test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }} test-matrix: ${{ needs.linux-jammy-cuda13_0-py3_10-gcc11-build.outputs.test-matrix }}
secrets: inherit secrets: inherit
linux-jammy-rocm-py3_10-build:
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
test-matrix: |
{ include: [
{ config: "distributed", shard: 1, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 2, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
{ config: "distributed", shard: 3, num_shards: 3, runner: "linux.rocm.gpu.mi250.4", owners: ["module:rocm", "oncall:distributed"] },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build: linux-jammy-cuda12_8-py3-gcc11-slow-gradcheck-build:
name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck name: linux-jammy-cuda12.8-py3-gcc11-slow-gradcheck
uses: ./.github/workflows/_linux-build.yml uses: ./.github/workflows/_linux-build.yml

View File

@ -66,10 +66,10 @@ jobs:
{ config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "default", shard: 5, num_shards: 5, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "docs_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "docs_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "jit_legacy", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "backwards_compat", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, { config: "backwards_compat", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "distributed", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" }, { config: "distributed", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "numpy_2_x", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, { config: "numpy_2_x", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]} ]}
secrets: inherit secrets: inherit
@ -167,8 +167,8 @@ jobs:
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx
test-matrix: | test-matrix: |
{ include: [ { include: [
{ config: "default", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, { config: "default", shard: 1, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
{ config: "default", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.c7i.2xlarge" }, { config: "default", shard: 2, num_shards: 2, runner: "${{ needs.get-label-type.outputs.label-type }}linux.2xlarge" },
]} ]}
secrets: inherit secrets: inherit
@ -347,8 +347,7 @@ jobs:
uses: ./.github/workflows/_linux-build.yml uses: ./.github/workflows/_linux-build.yml
needs: get-label-type needs: get-label-type
with: with:
# This should sync with the build in xpu.yml but xpu uses a larger runner sync-tag: linux-xpu-n-build
# sync-tag: linux-xpu-n-build
runner_prefix: ${{ needs.get-label-type.outputs.label-type }} runner_prefix: ${{ needs.get-label-type.outputs.label-type }}
build-environment: linux-jammy-xpu-n-py3.10 build-environment: linux-jammy-xpu-n-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3 docker-image-name: ci-image:pytorch-linux-jammy-xpu-n-py3

View File

@ -45,6 +45,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-noble-rocm-py3.12-mi300 build-environment: linux-noble-rocm-py3.12-mi300
docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3 docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3
sync-tag: rocm-build
test-matrix: | test-matrix: |
{ include: [ { include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" }, { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.gfx942.1" },

View File

@ -42,14 +42,15 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-noble-rocm-py3.12-mi355 build-environment: linux-noble-rocm-py3.12-mi355
docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3 docker-image-name: ci-image:pytorch-linux-noble-rocm-n-py3
sync-tag: rocm-build
test-matrix: | test-matrix: |
{ include: [ { include: [
{ config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, { config: "default", shard: 1, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, { config: "default", shard: 2, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, { config: "default", shard: 3, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, { config: "default", shard: 4, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, { config: "default", shard: 5, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
{ config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.1" }, { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.mi355.2" },
]} ]}
secrets: inherit secrets: inherit

View File

@ -1,75 +0,0 @@
name: rocm-navi31
on:
push:
tags:
- ciflow/rocm-navi31/*
workflow_dispatch:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 45 */2 * * 1-5
- cron: 45 4,12 * * 0,6
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
cancel-in-progress: true
permissions: read-all
jobs:
target-determination:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/target_determination.yml
permissions:
id-token: write
contents: read
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-rocm-py3_10-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3_10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: >-
${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
inductor/test_flex_attention inductor/test_max_autotune' || '' }}
secrets: inherit

View File

@ -3,13 +3,13 @@ name: rocm
on: on:
push: push:
branches: branches:
- main
- release/* - release/*
tags: tags:
- ciflow/rocm/* - ciflow/rocm/*
workflow_dispatch: workflow_dispatch:
schedule: schedule:
- cron: 29 8 * * * # about 1:29am PDT - cron: 29 8 * * * # about 1:29am PDT
- cron: 0 * * * *
concurrency: concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@ -26,23 +26,11 @@ jobs:
id-token: write id-token: write
contents: read contents: read
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
linux-jammy-rocm-py3_10-build: linux-jammy-rocm-py3_10-build:
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }} if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
name: linux-jammy-rocm-py3.10 name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with: with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10 build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3 docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build sync-tag: rocm-build
@ -71,3 +59,29 @@ jobs:
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }} docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }} test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
secrets: inherit secrets: inherit
linux-jammy-rocm-py3_10-gfx1100-test:
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3_10-gfx1100
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx1100" },
]}
tests-to-include: >
test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs
test_autograd inductor/test_torchinductor inductor/test_kernel_benchmark
inductor/test_pad_mm inductor/test_benchmark_fusion inductor/test_aot_inductor
inductor/test_torchinductor inductor/test_decompose_mem_bound_mm
inductor/test_flex_attention inductor/test_max_autotune
secrets: inherit
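The `tests-to-include: >` scalar above uses YAML folding: the newlines collapse to spaces, so the test runner receives one space-separated list of test modules. A hypothetical shell view of the result:

```bash
# Folded scalar as the runner sees it (abbreviated to four entries).
tests_to_include="test_nn test_torch test_cuda inductor/test_flex_attention"
for t in $tests_to_include; do   # word-splitting on spaces is intentional here
  echo "selected: $t"
done
```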

View File

@ -58,10 +58,8 @@ jobs:
else else
COMMIT_SHA="${{ github.sha }}" COMMIT_SHA="${{ github.sha }}"
fi fi
{ echo "sha=${COMMIT_SHA}" >> "${GITHUB_OUTPUT}"
echo "sha=${COMMIT_SHA}" echo "tag_name=trunk/${COMMIT_SHA}" >> "${GITHUB_OUTPUT}"
echo "tag_name=trunk/${COMMIT_SHA}"
} >> "${GITHUB_OUTPUT}"
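Unfused, the new side of this hunk groups both `echo`s under a single append, so `GITHUB_OUTPUT` is opened once rather than per line:

```bash
{
  echo "sha=${COMMIT_SHA}"
  echo "tag_name=trunk/${COMMIT_SHA}"
} >> "${GITHUB_OUTPUT}"
```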
- name: Validate commit SHA - name: Validate commit SHA
run: | run: |
@ -89,7 +87,7 @@ jobs:
echo "✅ Commit ${COMMIT_SHA} is valid (automatic push trigger)" echo "✅ Commit ${COMMIT_SHA} is valid (automatic push trigger)"
fi fi
- name: Create and push tag(s) with retry - name: Create and push tag with retry
id: check_tag id: check_tag
env: env:
TAG_NAME: ${{ steps.commit.outputs.tag_name }} TAG_NAME: ${{ steps.commit.outputs.tag_name }}
@ -114,23 +112,14 @@ jobs:
return 1 return 1
} }
# Counters for summary reporting # Exit early if tag already exists
created_count=0 if check_tag_exists; then
skipped_count=0 echo "✅ Tag already exists - no action needed"
failed_count=0 echo "exists=true" >> "${GITHUB_OUTPUT}"
exit 0
fi
# Always write outputs once on exit echo "Tag ${TAG_NAME} does not exist, proceeding with creation"
finish() {
set +e
if [ -n "${GITHUB_OUTPUT:-}" ]; then
{
echo "created_count=${created_count}"
echo "skipped_count=${skipped_count}"
echo "failed_count=${failed_count}"
} >> "${GITHUB_OUTPUT}"
fi
}
trap finish EXIT
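Restated cleanly, the added block registers an EXIT trap so the three counters are written to `GITHUB_OUTPUT` exactly once, whichever branch ends the script:

```bash
finish() {
  set +e
  if [ -n "${GITHUB_OUTPUT:-}" ]; then
    {
      echo "created_count=${created_count}"
      echo "skipped_count=${skipped_count}"
      echo "failed_count=${failed_count}"
    } >> "${GITHUB_OUTPUT}"
  fi
}
trap finish EXIT
```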
# Retry configuration # Retry configuration
MAX_RETRIES=5 MAX_RETRIES=5
@ -205,111 +194,31 @@ jobs:
} }
} }
# New behavior for push events: enumerate commits in the push and tag each one. # Execute with retry
# For workflow_dispatch, retain existing single-SHA behavior. if retry_with_backoff "tag_with_retry" "Creating tag ${TAG_NAME} for commit ${COMMIT_SHA}"; then
echo "exists=false" >> "${GITHUB_OUTPUT}"
# Always fetch tags once up front to improve idempotency in loops
git fetch origin --tags --quiet || true
if [ "${{ github.event_name }}" = "push" ]; then
BEFORE_SHA="${{ github.event.before }}"
AFTER_SHA="${{ github.sha }}" # same as event.after
# List commits introduced by this push (old..new), oldest first for stable ordering
commits_file="$(mktemp)"
git rev-list --reverse "${BEFORE_SHA}..${AFTER_SHA}" > "${commits_file}"
if [ ! -s "${commits_file}" ]; then
echo "No new commits found between ${BEFORE_SHA}..${AFTER_SHA}; nothing to tag."
rm -f "${commits_file}"
exit 0
fi
commit_count="$(wc -l < "${commits_file}" | tr -d ' ')"
echo "Found ${commit_count} commit(s) to tag for push:"
while IFS= read -r sha; do
printf ' %s\n' "${sha}"
done < "${commits_file}"
while IFS= read -r sha; do
TAG_NAME="trunk/${sha}"
COMMIT_SHA="${sha}"
# If tag already exists locally or remotely, skip (idempotent)
if check_tag_exists; then
echo "✅ Tag ${TAG_NAME} already exists - skipping"
skipped_count=$((skipped_count + 1))
continue
fi
echo "Tag ${TAG_NAME} does not exist, proceeding with creation"
if retry_with_backoff "tag_with_retry" "Creating tag ${TAG_NAME} for commit ${COMMIT_SHA}"; then
created_count=$((created_count + 1))
else
echo "Tag creation failed after all retry attempts for ${TAG_NAME}"
failed_count=$((failed_count + 1))
fi
done < "${commits_file}"
rm -f "${commits_file}"
if [ "${failed_count}" -gt 0 ]; then
exit 1
fi
exit 0 exit 0
else else
# workflow_dispatch path (single SHA tagging preserved) echo "Tag creation failed after all retry attempts"
exit 1
# Exit early if tag already exists
if check_tag_exists; then
echo "✅ Tag already exists - no action needed"
skipped_count=1
exit 0
fi
echo "Tag ${TAG_NAME} does not exist, proceeding with creation"
if retry_with_backoff "tag_with_retry" "Creating tag ${TAG_NAME} for commit ${COMMIT_SHA}"; then
created_count=1
exit 0
else
echo "Tag creation failed after all retry attempts"
failed_count=1
exit 1
fi
fi fi
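For reference, a minimal sketch of the retry helper both code paths call; this is an assumed shape (the real `retry_with_backoff` is defined earlier in the step, outside the lines shown here):

```bash
retry_with_backoff() {
  local fn="$1" desc="$2"
  local max_retries=5 delay=2
  for attempt in $(seq 1 "$max_retries"); do
    echo "${desc} (attempt ${attempt}/${max_retries})"
    if "$fn"; then
      return 0
    fi
    sleep "$delay"
    delay=$(( delay * 2 ))   # exponential backoff between attempts
  done
  return 1
}
```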
- name: Tag creation summary - name: Tag creation summary
if: always() if: always()
run: | run: |
if [ "${{ github.event_name }}" = "push" ]; then if [ "${{ steps.check_tag.outputs.exists }}" = "true" ]; then
echo "Trigger: push on main" echo "✅ Tag ${{ steps.commit.outputs.tag_name }} already existed - no action needed"
echo "Created: ${{ steps.check_tag.outputs.created_count }}" elif [ "${{ job.status }}" = "success" ]; then
echo "Skipped (already existed): ${{ steps.check_tag.outputs.skipped_count }}" echo "✅ Successfully created tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}"
echo "Failed: ${{ steps.check_tag.outputs.failed_count }}"
if [ "${{ steps.check_tag.outputs.failed_count }}" = "0" ]; then
echo "✅ Completed tagging for push range ${{ github.event.before }}..${{ github.sha }}"
else
echo "❌ Some tags failed to create for push range ${{ github.event.before }}..${{ github.sha }}"
fi
else else
if [ "${{ steps.check_tag.outputs.failed_count }}" = "0" ]; then echo "❌ Failed to create tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}"
if [ "${{ steps.check_tag.outputs.created_count }}" = "0" ]; then fi
echo "✅ Tag ${{ steps.commit.outputs.tag_name }} already existed - no action needed"
else echo ""
echo "✅ Successfully created tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}" echo "Tag details:"
fi echo " Name: ${{ steps.commit.outputs.tag_name }}"
else echo " Commit: ${{ steps.commit.outputs.sha }}"
echo "❌ Failed to create tag ${{ steps.commit.outputs.tag_name }} for commit ${{ steps.commit.outputs.sha }}" echo " Trigger: ${{ github.event_name }}"
fi if [ -n "${{ github.event.inputs.commit_sha }}" ]; then
echo " Manual commit: ${{ github.event.inputs.commit_sha }}"
echo ""
echo "Tag details:"
echo " Name: ${{ steps.commit.outputs.tag_name }}"
echo " Commit: ${{ steps.commit.outputs.sha }}"
echo " Trigger: ${{ github.event_name }}"
if [ -n "${{ github.event.inputs.commit_sha }}" ]; then
echo " Manual commit: ${{ github.event.inputs.commit_sha }}"
fi
fi fi

View File

@ -190,41 +190,6 @@ jobs:
runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral" runner: "${{ needs.get-label-type.outputs.label-type }}windows.4xlarge.nonephemeral"
secrets: inherit secrets: inherit
linux-jammy-rocm-py3_10-build:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-rocm-py3.10
docker-image-name: ci-image:pytorch-linux-jammy-rocm-n-py3
sync-tag: rocm-build
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "default", shard: 2, num_shards: 2, runner: "linux.rocm.gpu.gfx942.1" },
{ config: "distributed", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.gfx942.4" },
]}
secrets: inherit
linux-jammy-rocm-py3_10-test:
if: ${{ startsWith(github.event.ref, 'refs/tags/ciflow/trunk') }}
permissions:
id-token: write
contents: read
name: linux-jammy-rocm-py3.10
uses: ./.github/workflows/_rocm-test.yml
needs:
- linux-jammy-rocm-py3_10-build
- target-determination
with:
build-environment: linux-jammy-rocm-py3.10
docker-image: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-rocm-py3_10-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor distributed/test_c10d_common distributed/test_c10d_nccl"
secrets: inherit
inductor-build: inductor-build:
name: inductor-build name: inductor-build
uses: ./.github/workflows/_linux-build.yml uses: ./.github/workflows/_linux-build.yml
@ -235,23 +200,6 @@ jobs:
cuda-arch-list: '8.0' cuda-arch-list: '8.0'
secrets: inherit secrets: inherit
# Test cross-compiled models with Windows libs extracted from wheel
cross-compile-linux-test:
name: cross-compile-linux-test
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-cuda12_8-py3_10-gcc11-build
- get-label-type
- win-vs2022-cuda12_8-py3-build
with:
build-environment: linux-jammy-cuda12.8-py3.10-gcc11
docker-image: ${{ needs.linux-jammy-cuda12_8-py3_10-gcc11-build.outputs.docker-image }}
test-matrix: |
{ include: [
{ config: "aoti_cross_compile_for_windows", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu", win_torch_wheel_artifact: "win-vs2022-cuda12.8-py3" },
]}
secrets: inherit
verify-cachebench-cpu-build: verify-cachebench-cpu-build:
name: verify-cachebench-cpu-build name: verify-cachebench-cpu-build
uses: ./.github/workflows/_linux-build.yml uses: ./.github/workflows/_linux-build.yml

View File

@ -6,7 +6,6 @@ on:
- pull - pull
- trunk - trunk
- periodic - periodic
- periodic-rocm-mi200
- periodic-rocm-mi300 - periodic-rocm-mi300
- inductor - inductor
- unstable - unstable

View File

@ -59,18 +59,14 @@ jobs:
runner: linux.c7i.12xlarge runner: linux.c7i.12xlarge
test-matrix: | test-matrix: |
{ include: [ { include: [
{ config: "default", shard: 1, num_shards: 12, runner: "linux.idc.xpu" }, { config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 12, runner: "linux.idc.xpu" }, { config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 12, runner: "linux.idc.xpu" }, { config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 12, runner: "linux.idc.xpu" }, { config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 12, runner: "linux.idc.xpu" }, { config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 12, runner: "linux.idc.xpu" }, { config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 7, num_shards: 12, runner: "linux.idc.xpu" }, { config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 8, num_shards: 12, runner: "linux.idc.xpu" }, { config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 9, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 10, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 11, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 12, num_shards: 12, runner: "linux.idc.xpu" },
]} ]}
secrets: inherit secrets: inherit

.gitignore (vendored)
View File

@ -127,7 +127,6 @@ torch/test/
torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h torch/utils/benchmark/utils/valgrind_wrapper/callgrind.h
torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h torch/utils/benchmark/utils/valgrind_wrapper/valgrind.h
torch/version.py torch/version.py
torch/_inductor/kernel/vendored_templates/*
minifier_launcher.py minifier_launcher.py
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d* aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_fwd_d*
aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d* aten/src/ATen/native/transformers/hip/flash_attn/ck/fmha_bwd_d*
@ -144,7 +143,6 @@ scripts/release_notes/*.json
sccache-stats*.json sccache-stats*.json
lint.json lint.json
merge_record.json merge_record.json
.github/scripts/nightly_source_matrix.json
# These files get copied over on invoking setup.py # These files get copied over on invoking setup.py
torchgen/packaged/* torchgen/packaged/*
@ -376,7 +374,6 @@ third_party/ruy/
third_party/glog/ third_party/glog/
# Virtualenv # Virtualenv
.venv/
venv/ venv/
# Log files # Log files
@ -399,4 +396,3 @@ CLAUDE.local.md
/test_*.py /test_*.py
/debug_*.py /debug_*.py
CLAUDE_CONTEXT/ CLAUDE_CONTEXT/
/.claude/settings.local.json

View File

@ -121,22 +121,35 @@ command = [
] ]
is_formatter = true is_formatter = true
[[linter]] [[linter]]
code = 'PYREFLY' code = 'MYPY'
include_patterns = [ include_patterns = [
'setup.py',
'functorch/dim/**/*.py',
'torch/**/*.py', 'torch/**/*.py',
'torch/**/*.pyi', 'torch/**/*.pyi',
'torchgen/**/*.py', 'caffe2/**/*.py',
'torchgen/**/*.pyi', 'caffe2/**/*.pyi',
'functorch/**/*.py', 'test/test_bundled_images.py',
'functorch/**/*.pyi', 'test/test_bundled_inputs.py',
'test/test_complex.py',
'test/test_datapipe.py',
'test/test_futures.py',
'test/test_numpy_interop.py',
'test/test_torch.py',
'test/test_type_hints.py',
'test/test_type_info.py',
'test/test_utils.py',
]
exclude_patterns = [
'**/fb/**',
] ]
exclude_patterns = []
command = [ command = [
'python3', 'python3',
'tools/linter/adapters/pyrefly_linter.py', 'tools/linter/adapters/mypy_linter.py',
'--config=pyrefly.toml', '--config=mypy.ini',
'--',
'@{{PATHSFILE}}'
] ]
init_command = [ init_command = [
'python3', 'python3',
@ -145,7 +158,7 @@ init_command = [
'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"', 'numpy==1.26.4 ; python_version >= "3.10" and python_version <= "3.11"',
'numpy==2.1.0 ; python_version >= "3.12"', 'numpy==2.1.0 ; python_version >= "3.12"',
'expecttest==0.3.0', 'expecttest==0.3.0',
'pyrefly==0.36.2', 'mypy==1.16.0',
'sympy==1.13.3', 'sympy==1.13.3',
'types-requests==2.27.25', 'types-requests==2.27.25',
'types-pyyaml==6.0.2', 'types-pyyaml==6.0.2',
@ -157,9 +170,43 @@ init_command = [
'filelock==3.18.0', 'filelock==3.18.0',
'junitparser==2.1.1', 'junitparser==2.1.1',
'rich==14.1.0', 'rich==14.1.0',
'optree==0.17.0', 'pyyaml==6.0.2',
'types-openpyxl==3.1.5.20250919', 'optree==0.13.0',
'types-python-dateutil==2.9.0.20251008' 'dataclasses-json==0.6.7',
'pandas==2.2.3',
]
[[linter]]
code = 'MYPYSTRICT'
include_patterns = [
'.github/**/*.py',
'benchmarks/instruction_counts/**/*.py',
'tools/**/*.py',
'torchgen/**/*.py',
'torch/utils/_pytree.py',
'torch/utils/_cxx_pytree.py',
'torch/utils/benchmark/utils/common.py',
'torch/utils/benchmark/utils/timer.py',
'torch/utils/benchmark/utils/valgrind_wrapper/**/*.py',
]
exclude_patterns = [
# (linbinyu) copied from internal repo
'**/fb/**',
'tools/code_analyzer/gen_operators_yaml.py',
'tools/dynamo/verify_dynamo.py',
'tools/gen_vulkan_spv.py',
'tools/test/gen_operators_yaml_test.py',
'tools/test/gen_oplist_test.py',
'tools/test/test_selective_build.py',
'tools/experimental/torchfuzz/**',
]
command = [
'python3',
'tools/linter/adapters/mypy_linter.py',
'--config=mypy-strict.ini',
'--code=MYPYSTRICT',
'--',
'@{{PATHSFILE}}'
] ]
[[linter]] [[linter]]
@ -211,6 +258,7 @@ exclude_patterns = [
'**/*pb.h', '**/*pb.h',
'**/*inl.h', '**/*inl.h',
'aten/src/ATen/cpu/FlushDenormal.cpp', 'aten/src/ATen/cpu/FlushDenormal.cpp',
'aten/src/ATen/cpu/Utils.cpp',
'aten/src/ATen/cpu/vml.h', 'aten/src/ATen/cpu/vml.h',
'aten/src/ATen/CPUFixedAllocator.h', 'aten/src/ATen/CPUFixedAllocator.h',
'aten/src/ATen/Parallel*.h', 'aten/src/ATen/Parallel*.h',
@ -229,6 +277,8 @@ exclude_patterns = [
'c10/util/win32-headers.h', 'c10/util/win32-headers.h',
'c10/test/**/*.h', 'c10/test/**/*.h',
'third_party/**/*', 'third_party/**/*',
'torch/csrc/api/include/torch/nn/modules/common.h',
'torch/csrc/api/include/torch/linalg.h',
'torch/csrc/autograd/generated/**', 'torch/csrc/autograd/generated/**',
'torch/csrc/distributed/**/*.cu', 'torch/csrc/distributed/**/*.cu',
'torch/csrc/distributed/c10d/WinSockUtils.hpp', 'torch/csrc/distributed/c10d/WinSockUtils.hpp',
@ -240,6 +290,7 @@ exclude_patterns = [
'torch/csrc/utils/generated_serialization_types.h', 'torch/csrc/utils/generated_serialization_types.h',
'torch/csrc/utils/pythoncapi_compat.h', 'torch/csrc/utils/pythoncapi_compat.h',
'torch/csrc/inductor/aoti_runtime/sycl_runtime_wrappers.h', 'torch/csrc/inductor/aoti_runtime/sycl_runtime_wrappers.h',
'aten/src/ATen/ExpandBase.h',
] ]
init_command = [ init_command = [
'python3', 'python3',
@ -742,7 +793,8 @@ exclude_patterns = [
command = [ command = [
'python3', 'python3',
'tools/linter/adapters/grep_linter.py', 'tools/linter/adapters/grep_linter.py',
'--pattern=(cudaSetDevice|cudaGetDevice)\\(', '--pattern=cudaSetDevice(',
'--pattern=cudaGetDevice(',
'--linter-name=RAWCUDADEVICE', '--linter-name=RAWCUDADEVICE',
'--error-name=raw CUDA API usage', '--error-name=raw CUDA API usage',
"""--error-description=\ """--error-description=\
@ -1046,8 +1098,11 @@ command = [
[[linter]] [[linter]]
code = 'WORKFLOWSYNC' code = 'WORKFLOWSYNC'
include_patterns = [ include_patterns = [
'.github/workflows/*.yml', '.github/workflows/pull.yml',
'.github/workflows/*.yaml', '.github/workflows/trunk.yml',
'.github/workflows/periodic.yml',
'.github/workflows/mac-mps.yml',
'.github/workflows/slow.yml',
] ]
command = [ command = [
'python3', 'python3',

View File

@ -374,7 +374,7 @@ cmake_dependent_option(
"Build the lazy Torchscript backend, not compatible with mobile builds" ON "Build the lazy Torchscript backend, not compatible with mobile builds" ON
"NOT INTERN_BUILD_MOBILE" OFF) "NOT INTERN_BUILD_MOBILE" OFF)
cmake_dependent_option(BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF) cmake_dependent_option(BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF)
cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder" cmake_dependent_option(BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin fodler"
OFF "USE_CUDA" OFF) OFF "USE_CUDA" OFF)
cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON cmake_dependent_option(USE_KLEIDIAI "Use KleidiAI for the ARM CPU & AARCH64 architecture." ON
"CPU_AARCH64" OFF) "CPU_AARCH64" OFF)

View File

@ -201,17 +201,3 @@ torch/backends/cudnn/ @eqy @syed-ahmed @Aidyn-A
/torch/csrc/stable/ @janeyx99 @mikaylagawarecki /torch/csrc/stable/ @janeyx99 @mikaylagawarecki
/torch/headeronly/ @janeyx99 /torch/headeronly/ @janeyx99
/torch/header_only_apis.txt @janeyx99 /torch/header_only_apis.txt @janeyx99
# FlexAttention
/torch/nn/attention/flex_attention.py @drisspg
/torch/_higher_order_ops/flex_attention.py @drisspg
/torch/_inductor/kernel/flex/ @drisspg
/torch/_inductor/codegen/cpp_flex_attention_template.py @drisspg
/test/inductor/test_flex_attention.py @drisspg
/test/inductor/test_flex_decoding.py @drisspg
# Low Precision GEMMs
/aten/src/ATen/native/cuda/Blas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.cpp @drisspg @slayton58
/aten/src/ATen/cuda/CUDABlas.h @drisspg @slayton58
/test/test_scaled_matmul_cuda.py @drisspg @slayton58

View File

@ -11,6 +11,7 @@ aspects of contributing to PyTorch.
<!-- toc --> <!-- toc -->
- [Developing PyTorch](#developing-pytorch) - [Developing PyTorch](#developing-pytorch)
- [Setup the development environment](#setup-the-development-environment)
- [Tips and Debugging](#tips-and-debugging) - [Tips and Debugging](#tips-and-debugging)
- [Nightly Checkout & Pull](#nightly-checkout--pull) - [Nightly Checkout & Pull](#nightly-checkout--pull)
- [Codebase structure](#codebase-structure) - [Codebase structure](#codebase-structure)
@ -66,6 +67,23 @@ aspects of contributing to PyTorch.
Follow the instructions for [installing PyTorch from source](https://github.com/pytorch/pytorch#from-source). If you get stuck when developing PyTorch on your machine, check out the [tips and debugging](#tips-and-debugging) section below for common solutions. Follow the instructions for [installing PyTorch from source](https://github.com/pytorch/pytorch#from-source). If you get stuck when developing PyTorch on your machine, check out the [tips and debugging](#tips-and-debugging) section below for common solutions.
### Setup the development environment
First, you need to [fork the PyTorch project on GitHub](https://github.com/pytorch/pytorch/fork) and follow the instructions at [Connecting to GitHub with SSH](https://docs.github.com/en/authentication/connecting-to-github-with-ssh) to set up your SSH authentication credentials.
Then clone the PyTorch project and set up the development environment:
```bash
git clone git@github.com:<USERNAME>/pytorch.git
cd pytorch
git remote add upstream git@github.com:pytorch/pytorch.git
make setup-env
# Or run `make setup-env-cuda` for pre-built CUDA binaries
# Or run `make setup-env-rocm` for pre-built ROCm binaries
source venv/bin/activate # or `. .\venv\Scripts\activate` on Windows
```
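A quick sanity check after the setup, our suggestion rather than part of the guide: confirm the virtualenv activates and that `torch` imports once the build or pre-built install has finished.

```bash
source venv/bin/activate
python -c "import torch; print(torch.__version__)"
```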
### Tips and Debugging ### Tips and Debugging
* If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below. * If you want to have no-op incremental rebuilds (which are fast), see [Make no-op build fast](#make-no-op-build-fast) below.

README.md (View File)

@@ -1,4 +1,4 @@
-![PyTorch Logo](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/pytorch-logo-dark.png)
+![PyTorch Logo](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/pytorch-logo-dark.png)

 --------------------------------------------------------------------------------

@@ -72,7 +72,7 @@ Elaborating Further:

 If you use NumPy, then you have used Tensors (a.k.a. ndarray).

-![Tensor illustration](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/tensor_illustration.png)
+![Tensor illustration](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/tensor_illustration.png)

 PyTorch provides Tensors that can live either on the CPU or the GPU and accelerates the
 computation by a huge amount.

@@ -99,7 +99,7 @@ from several research papers on this topic, as well as current and past work suc

 While this technique is not unique to PyTorch, it's one of the fastest implementations of it to date.
 You get the best of speed and flexibility for your crazy research.

-![Dynamic graph](https://github.com/pytorch/pytorch/raw/main/docs/source/_static/img/dynamic_graph.gif)
+![Dynamic graph](https://github.com/pytorch/pytorch/blob/9708fcf92db88b80b9010c68662d634434da3106/docs/source/_static/img/dynamic_graph.gif)

 ### Python First

SECURITY.md (View File)

@@ -1,7 +1,7 @@
 # Security Policy

 - [**Reporting a Vulnerability**](#reporting-a-vulnerability)
-- [**Using PyTorch Securely**](#using-pytorch-securely)
+- [**Using Pytorch Securely**](#using-pytorch-securely)
   - [Untrusted models](#untrusted-models)
   - [TorchScript models](#torchscript-models)
   - [Untrusted inputs](#untrusted-inputs)
@@ -10,30 +10,30 @@
   - [**CI/CD security principles**](#cicd-security-principles)

 ## Reporting Security Issues

-Beware that none of the topics under [Using PyTorch Securely](#using-pytorch-securely) are considered vulnerabilities of PyTorch.
+Beware that none of the topics under [Using Pytorch Securely](#using-pytorch-securely) are considered vulnerabilities of Pytorch.

 However, if you believe you have found a security vulnerability in PyTorch, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.

 Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new

-All reports submitted through the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework.
+All reports submitted thru the security advisories mechanism would **either be made public or dismissed by the team within 90 days of the submission**. If advisory has been closed on the grounds that it is not a security issue, please do not hesitate to create an [new issue](https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml) as it is still likely a valid issue within the framework.

 Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported:

 https://www.facebook.com/whitehat

-## Using PyTorch Securely
+## Using Pytorch Securely

-**PyTorch models are programs**, so treat its security seriously -- running untrusted models is equivalent to running untrusted code. In general we recommend that model weights and the python code for the model are distributed independently. That said, be careful about where you get the python code from and who wrote it (preferentially check for a provenance or checksums, do not run any pip installed package).
+**Pytorch models are programs**, so treat its security seriously -- running untrusted models is equivalent to running untrusted code. In general we recommend that model weights and the python code for the model are distributed independently. That said, be careful about where you get the python code from and who wrote it (preferentially check for a provenance or checksums, do not run any pip installed package).

 ### Untrusted models

 Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources[^data-poisoning-sources].

 **Prefer to execute untrusted models within a secure, isolated environment such as a sandbox** (e.g., containers, virtual machines). This helps protect your system from potentially malicious code. You can find further details and instructions in [this page](https://developers.google.com/code-sandboxing).

-**Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [Safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) has a significantly larger surface of attack but is more flexible in what it can serialize. See the documentation for more details.
-
-Even for more secure serialization formats, unexpected inputs to the downstream system can cause diverse security threats (e.g. denial of service, out of bound reads/writes) and thus we recommend extensive validation of any untrusted inputs.
+**Be mindful of risky model formats**. Give preference to share and load weights with the appropriate format for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) with `weights_only=True` is also secure to our knowledge even though it offers significantly larger surface of attack. Loading un-trusted checkpoint with `weights_only=False` MUST never be done.

 Important Note: The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance.

@@ -43,7 +43,7 @@ Important Note: The trustworthiness of a model is not binary. You must always de

 ### TorchScript models

-TorchScript models should be treated the same way as locally executable code from an unknown source. Only run TorchScript models if you trust the provider. Please note, that tools for introspecting TorchScript models (such as `torch.utils.model_dump`) may also execute partial or full code stored in those models, therefore they should be used only if you trust the provider of the binary you are about to load.
+TorchScript models should treated the same way as locally executable code from an unknown source. Only run TorchScript models if you trust the provider. Please note, that tools for introspecting TorchScript models (such as `torch.utils.model_dump`) may also execute partial or full code stored in those models, therefore they should be used only if you trust the provider of the binary you are about to load.

 ### Untrusted inputs during training and prediction

@@ -59,9 +59,9 @@ If applicable, prepare your model against bad inputs and prompt injections. Some

 ### Data privacy

-**Take special security measures if you train your models with sensitive data**. Prioritize [sandboxing](https://developers.google.com/code-sandboxing) your models and:
-- Do not feed sensitive data to an untrusted model (even if runs in a sandboxed environment)
-- If you consider publishing a model that was partially trained with sensitive data, be aware that data can potentially be recovered from the trained weights (especially if the model overfits).
+**Take special security measures if your model if you train models with sensitive data**. Prioritize [sandboxing](https://developers.google.com/code-sandboxing) your models and:
+- Do not feed sensitive data to untrusted model (even if runs in a sandboxed environment)
+- If you consider publishing a model that was partially trained with sensitive data, be aware that data can potentially be recovered from the trained weights (especially if model overfits).

 ### Using distributed features

aten/src/ATen/CMakeLists.txt (View File)

@@ -38,7 +38,7 @@ set_bool(AT_HIPSPARSELT_ENABLED CAFFE2_USE_HIPSPARSELT)
 configure_file(Config.h.in "${CMAKE_CURRENT_SOURCE_DIR}/Config.h")

 # TODO: Do not generate CUDAConfig.h for ROCm BUILDS
-# At the moment, `jit_macros.h` include CUDAConfig.h for both CUDA and HIP builds
+# At the moment, `jit_macors.h` include CUDAConfig.h for both CUDA and HIP builds
 if(USE_CUDA OR USE_ROCM)
   configure_file(cuda/CUDAConfig.h.in "${CMAKE_CURRENT_SOURCE_DIR}/cuda/CUDAConfig.h")
 endif()
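As a reminder of what `configure_file()` does in this hunk: it expands a `*.h.in` template with the configure-time values of CMake variables to produce a real header. A minimal sketch, assuming an illustrative template `demo_config.h.in` rather than the real `CUDAConfig.h.in`:

```cmake
# Assumed template demo_config.h.in (not a real PyTorch file):
#   #define DEMO_ROCM_ENABLED() @DEMO_ROCM_ENABLED@
set(DEMO_ROCM_ENABLED 0)  # value substituted for @DEMO_ROCM_ENABLED@ at configure time
configure_file(demo_config.h.in "${CMAKE_CURRENT_BINARY_DIR}/demo_config.h")
```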
@@ -256,11 +256,10 @@ endif()
 IF(USE_FBGEMM_GENAI)
   set(FBGEMM_THIRD_PARTY ${PROJECT_SOURCE_DIR}/third_party/fbgemm/external/)
   set(FBGEMM_GENAI_SRCS ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize)
   if(USE_CUDA)
     # To avoid increasing the build time/binary size unnecessarily, use an allow-list of kernels to build.
     # If you want to integrate a kernel from FBGEMM into torch, you have to add it here.
-    set(FBGEMM_CUTLASS_KERNELS_REGEX ".*(mx8mx8bf16_grouped|f4f4bf16_grouped|f4f4bf16).*")
+    set(FBGEMM_CUTLASS_KERNELS_REGEX ".*mx8mx8bf16_grouped.*")
     file(GLOB_RECURSE fbgemm_genai_native_cuda_cu
       "${FBGEMM_GENAI_SRCS}/cutlass_extensions/*.cu"
       "${FBGEMM_GENAI_SRCS}/cutlass_extensions/**/*.cu")
@@ -289,71 +288,62 @@ IF(USE_FBGEMM_GENAI)
     set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)

-    set(fbgemm_genai_cuh
+    set(fbgemm_genai_mx8mx8bf16_grouped
       "${FBGEMM_GENAI_SRCS}/cutlass_extensions/mx8mx8bf16_grouped/"
-      "${FBGEMM_GENAI_SRCS}/cutlass_extensions/f4f4bf16_grouped/"
-      "${FBGEMM_GENAI_SRCS}/"
     )

-    target_include_directories(fbgemm_genai PRIVATE
+    target_include_directories(fbgemm_genai PUBLIC
       ${FBGEMM_THIRD_PARTY}/cutlass/include
       ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
-      ${fbgemm_genai_cuh}
+      ${fbgemm_genai_mx8mx8bf16_grouped}
       ${FBGEMM_GENAI_SRCS}/common/include/   # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
       ${FBGEMM_GENAI_SRCS}/include/          # includes fbgemm_gpu/torch_ops.h
     )
-
-    # Add FBGEMM_GENAI include directories for torch_ops.h
-    list(APPEND ATen_CUDA_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
-    list(APPEND ATen_CUDA_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
-  elseif(USE_ROCM)
-    # Only include the kernels we want to build to avoid increasing binary size.
-    file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
-      "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
-      "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
-    set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
-    # Add additional HIPCC compiler flags for performance
-    set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
-      -mllvm
-      -enable-post-misched=0
-      -mllvm
-      -greedy-reverse-local-assignment=1
-      -fhip-new-launch-api)
-    if(DEFINED ROCM_VERSION_DEV AND ROCM_VERSION_DEV VERSION_LESS "7.2.0")
-      list(PREPEND FBGEMM_GENAI_EXTRA_HIPCC_FLAGS -mllvm -amdgpu-coerce-illegal-types=1)
-    endif()
-    # Only compile for gfx942 for now.
-    # This is rather hacky, I could not figure out a clean solution :(
-    set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
-    string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
-    if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
-      list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
-    endif()
-    set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
-    hip_add_library(
-      fbgemm_genai STATIC
-      ${fbgemm_genai_native_rocm_hip}
-      HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
-    set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
-    set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
-    target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
-    target_include_directories(fbgemm_genai PRIVATE
-      # FBGEMM version of Composable Kernel is used due to some customizations
-      ${FBGEMM_THIRD_PARTY}/composable_kernel/include
-      ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
-      ${FBGEMM_THIRD_PARTY}/cutlass/include
-      ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
-      ${FBGEMM_GENAI_SRCS}/common/include/   # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
-      ${FBGEMM_GENAI_SRCS}/include/          # includes fbgemm_gpu/torch_ops.h
-    )
-    # Add FBGEMM_GENAI include directories for torch_ops.h
-    list(APPEND ATen_HIP_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
-    list(APPEND ATen_HIP_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
+  else()
+    if(USE_ROCM)
+      # Only include the kernels we want to build to avoid increasing binary size.
+      file(GLOB_RECURSE fbgemm_genai_native_rocm_hip
+        "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/kernels/fp8_rowwise_grouped*.hip"
+        "${FBGEMM_GENAI_SRCS}/ck_extensions/fp8_rowwise_grouped/fp8_rowwise_grouped_gemm.hip")
+      set_source_files_properties(${fbgemm_genai_native_rocm_hip} PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
+      # Add additional HIPCC compiler flags for performance
+      set(FBGEMM_GENAI_EXTRA_HIPCC_FLAGS
+        -mllvm
+        -amdgpu-coerce-illegal-types=1
+        -mllvm
+        -enable-post-misched=0
+        -mllvm
+        -greedy-reverse-local-assignment=1
+        -fhip-new-launch-api)
+      # Only compile for gfx942 for now.
+      # This is rather hacky, I could not figure out a clean solution :(
+      set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})
+      string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")
+      if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
+        list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942;)
+      endif()
+      set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
+      hip_add_library(
+        fbgemm_genai STATIC
+        ${fbgemm_genai_native_rocm_hip}
+        HIPCC_OPTIONS ${HIP_HCC_FLAGS} ${FBGEMM_GENAI_EXTRA_HIPCC_FLAGS})
+      set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})
+      set_target_properties(fbgemm_genai PROPERTIES POSITION_INDEPENDENT_CODE ON)
+      target_compile_definitions(fbgemm_genai PRIVATE FBGEMM_GENAI_NO_EXTENDED_SHAPES)
+      target_include_directories(fbgemm_genai PUBLIC
+        # FBGEMM version of Composable Kernel is used due to some customizations
+        ${FBGEMM_THIRD_PARTY}/composable_kernel/include
+        ${FBGEMM_THIRD_PARTY}/composable_kernel/library/include
+        ${FBGEMM_THIRD_PARTY}/cutlass/include
+        ${FBGEMM_THIRD_PARTY}/cutlass/tools/util/include
+        ${FBGEMM_GENAI_SRCS}/common/include/   # includes fbgemm_gpu/quantize/utils.h, fbgemm_gpu/quantize/tuning_cache.hpp
+        ${FBGEMM_GENAI_SRCS}/include/          # includes fbgemm_gpu/torch_ops.h
+      )
+    endif()
   endif()
 endif()
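Both versions of the ROCm branch above rely on the same flag-rewriting trick: temporarily rewrite the global HIP flags so a single target is built only for a restricted set of GPU architectures, then restore the originals. A condensed sketch of just that trick; the demo values of `HIP_CLANG_FLAGS` and `PYTORCH_ROCM_ARCH` are assumptions for illustration:

```cmake
set(HIP_CLANG_FLAGS "-O3 --offload-arch=gfx90a --offload-arch=gfx942")  # demo value
set(PYTORCH_ROCM_ARCH gfx90a gfx942)                                    # demo value

# Save the global flags so they can be restored after the target is declared.
set(HIP_CLANG_FLAGS_ORIGINAL ${HIP_CLANG_FLAGS})

# Drop every existing --offload-arch=<gpu> token from the flag string...
string(REGEX REPLACE "--offload-arch=[^ ]*" "" FILTERED_HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS}")

# ...and re-add only the architecture this target actually supports.
if("gfx942" IN_LIST PYTORCH_ROCM_ARCH)
  list(APPEND FILTERED_HIP_CLANG_FLAGS --offload-arch=gfx942)
endif()

set(HIP_CLANG_FLAGS ${FILTERED_HIP_CLANG_FLAGS})
# ... declare the HIP target here (hip_add_library in the hunk above) ...
set(HIP_CLANG_FLAGS ${HIP_CLANG_FLAGS_ORIGINAL})  # restore for later targets
```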
@@ -702,6 +692,12 @@ if(USE_CUDA AND NOT USE_ROCM)
   list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/include)
   list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/cutlass/tools/util/include)
+
+  # Add FBGEMM_GENAI include directories for torch_ops.h
+  if(USE_FBGEMM_GENAI)
+    list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/include)
+    list(APPEND ATen_CUDA_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/fbgemm/fbgemm_gpu/experimental/gen_ai/src/quantize/common/include)
+  endif()
+
   if($ENV{ATEN_STATIC_CUDA})
     if(CUDA_VERSION VERSION_LESS_EQUAL 12.9)
       list(APPEND ATen_CUDA_DEPENDENCY_LIBS

Some files were not shown because too many files have changed in this diff.