test

2025-11-04 16:04:58 +08:00 · 2025-08-19 17:29:55 -07:00
188 changed files with 2626 additions and 5855 deletions
--- a/.ci/aarch64_linux/aarch64_wheel_ci_build.py
+++ b/.ci/aarch64_linux/aarch64_wheel_ci_build.py
@ -92,7 +92,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
        "/usr/local/cuda/lib64/libnccl.so.2",
        "/usr/local/cuda/lib64/libnvJitLink.so.12",
        "/usr/local/cuda/lib64/libnvrtc.so.12",
        "/usr/local/cuda/lib64/libnvshmem_host.so.3",
        "/usr/local/cuda/lib64/libcudnn_adv.so.9",
        "/usr/local/cuda/lib64/libcudnn_cnn.so.9",
        "/usr/local/cuda/lib64/libcudnn_graph.so.9",
@ -210,6 +209,8 @@ if __name__ == "__main__":
    # MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
    if enable_cuda:
        build_vars += "MAX_JOBS=5 "
        # nvshmem is broken for aarch64 see https://github.com/pytorch/pytorch/issues/160425
        build_vars += "USE_NVSHMEM=OFF "
    override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
    desired_cuda = os.getenv("DESIRED_CUDA")
--- a/.ci/docker/almalinux/Dockerfile
+++ b/.ci/docker/almalinux/Dockerfile
@ -64,10 +64,6 @@ FROM cuda as cuda12.9
 RUN bash ./install_cuda.sh 12.9
 ENV DESIRED_CUDA=12.9
 FROM cuda as cuda13.0
 RUN bash ./install_cuda.sh 13.0
 ENV DESIRED_CUDA=13.0
 FROM ${ROCM_IMAGE} as rocm
 ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
 ADD ./common/install_mkl.sh install_mkl.sh
@ -83,7 +79,6 @@ FROM base as all_cuda
 COPY --from=cuda12.6  /usr/local/cuda-12.6 /usr/local/cuda-12.6
 COPY --from=cuda12.8  /usr/local/cuda-12.8 /usr/local/cuda-12.8
 COPY --from=cuda12.9  /usr/local/cuda-12.9 /usr/local/cuda-12.9
 COPY --from=cuda13.0  /usr/local/cuda-13.0 /usr/local/cuda-13.0
 # Final step
 FROM ${BASE_TARGET} as final
--- a/.ci/docker/build.sh
+++ b/.ci/docker/build.sh
@ -168,7 +168,7 @@ case "$tag" in
    TRITON=yes
    ;;
  pytorch-linux-jammy-py3-clang12-onnx)
-    ANACONDA_PYTHON_VERSION=3.10
+    ANACONDA_PYTHON_VERSION=3.9
    CLANG_VERSION=12
    VISION=yes
    ONNX=yes
@ -288,6 +288,7 @@ case "$tag" in
    GCC_VERSION=11
    ACL=yes
    VISION=yes
    CONDA_CMAKE=yes
    OPENBLAS=yes
    # snadampal: skipping llvm src build install because the current version
    # from pytorch/llvm:9.0.1 is x86 specific
@ -298,6 +299,7 @@ case "$tag" in
    GCC_VERSION=11
    ACL=yes
    VISION=yes
    CONDA_CMAKE=yes
    OPENBLAS=yes
    # snadampal: skipping llvm src build install because the current version
    # from pytorch/llvm:9.0.1 is x86 specific
--- a/.ci/docker/ci_commit_pins/huggingface-requirements.txt
+++ b/.ci/docker/ci_commit_pins/huggingface-requirements.txt
@ -1,2 +0,0 @@
 transformers==4.54.0
 soxr==0.5.0
--- a/.ci/docker/ci_commit_pins/huggingface.txt
+++ b/.ci/docker/ci_commit_pins/huggingface.txt
@ -0,0 +1 @@
 v4.54.0
--- a/.ci/docker/ci_commit_pins/nccl-cu13.txt
+++ b/.ci/docker/ci_commit_pins/nccl-cu13.txt
@ -1 +0,0 @@
 v2.27.7-1
--- a/.ci/docker/common/install_cuda.sh
+++ b/.ci/docker/common/install_cuda.sh
@ -10,7 +10,7 @@ else
  arch_path='sbsa'
 fi
-NVSHMEM_VERSION=3.3.20
+NVSHMEM_VERSION=3.3.9
 function install_cuda {
  version=$1
@ -62,16 +62,14 @@ function install_nvshmem {
  mkdir -p "${tmpdir}" && cd "${tmpdir}"
  # nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
-  # This pattern is a lie as it is not consistent across versions, for 3.3.9 it was cuda_ver-arch-nvshhem-ver
+  filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
-  filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
+  url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"
  suffix=".tar.xz"
  url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}${suffix}"
  # download, unpack, install
  wget -q "${url}"
-  tar xf "${filename}${suffix}"
+  tar xf "${filename}.tar.gz"
-  cp -a "${filename}/include/"* /usr/local/cuda/include/
+  cp -a "libnvshmem/include/"* /usr/local/cuda/include/
-  cp -a "${filename}/lib/"*     /usr/local/cuda/lib64/
+  cp -a "libnvshmem/lib/"*     /usr/local/cuda/lib64/
  # cleanup
  cd ..
@ -128,6 +126,74 @@ function install_129 {
  ldconfig
 }
 function prune_124 {
  echo "Pruning CUDA 12.4"
  #####################################################################################
  # CUDA 12.4 prune static libs
  #####################################################################################
  export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
  export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
  if [[ -n "$OVERRIDE_GENCODE" ]]; then
      export GENCODE=$OVERRIDE_GENCODE
  fi
  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
  fi
  # all CUDA libs except CuDNN and CuBLAS
  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
      | xargs -I {} bash -c \
                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
  # prune CuDNN and CuBLAS
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
  #####################################################################################
  # CUDA 12.4 prune visual tools
  #####################################################################################
  export CUDA_BASE="/usr/local/cuda-12.4/"
  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
 }
 function prune_126 {
  echo "Pruning CUDA 12.6"
  #####################################################################################
  # CUDA 12.6 prune static libs
  #####################################################################################
  export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
  export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
  export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
  export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
  if [[ -n "$OVERRIDE_GENCODE" ]]; then
      export GENCODE=$OVERRIDE_GENCODE
  fi
  if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
      export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
  fi
  # all CUDA libs except CuDNN and CuBLAS
  ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis"  \
      | xargs -I {} bash -c \
                "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
  # prune CuDNN and CuBLAS
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
  $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
  #####################################################################################
  # CUDA 12.6 prune visual tools
  #####################################################################################
  export CUDA_BASE="/usr/local/cuda-12.6/"
  rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
 }
 function install_128 {
  CUDNN_VERSION=9.8.0.87
  echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
@ -146,39 +212,18 @@ function install_128 {
  ldconfig
 }
 function install_130 {
  CUDNN_VERSION=9.12.0.46
  NVSHMEM_VERSION=3.3.20
  echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
  # install CUDA 13.0 in the same container
  install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
  # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
  install_cudnn 13 $CUDNN_VERSION
  install_nvshmem 13 $NVSHMEM_VERSION
  CUDA_VERSION=13.0 bash install_nccl.sh
  CUDA_VERSION=13.0 bash install_cusparselt.sh
  ldconfig
 }
 # idiomatic parameter and option handling in sh
 while test $# -gt 0
 do
    case "$1" in
-    12.4) install_124;
+    12.4) install_124; prune_124
        ;;
-    12.6|12.6.*) install_126;
+    12.6|12.6.*) install_126; prune_126
        ;;
    12.8|12.8.*) install_128;
        ;;
    12.9|12.9.*) install_129;
        ;;
    13.0|13.0.*) install_130;
        ;;
    *) echo "bad argument $1"; exit 1
        ;;
    esac
--- a/.ci/docker/common/install_cusparselt.sh
+++ b/.ci/docker/common/install_cusparselt.sh
@ -5,15 +5,7 @@ set -ex
 # cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
 mkdir tmp_cusparselt && cd tmp_cusparselt
-if [[ ${CUDA_VERSION:0:4} =~ "13" ]]; then
+if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
    arch_path='sbsa'
    export TARGETARCH=${TARGETARCH:-$(uname -m)}
    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
        arch_path='x86_64'
    fi
    CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.8.0.4_cuda13-archive"
    curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
 elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
    arch_path='sbsa'
    export TARGETARCH=${TARGETARCH:-$(uname -m)}
    if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
--- a/.ci/docker/common/install_inductor_benchmark_deps.sh
+++ b/.ci/docker/common/install_inductor_benchmark_deps.sh
@ -5,7 +5,9 @@ set -ex
 source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
 function install_huggingface() {
-  pip_install -r huggingface-requirements.txt
+  local version
  commit=$(get_pinned_commit huggingface)
  pip_install "git+https://github.com/huggingface/transformers@${commit}"
 }
 function install_timm() {
@ -24,6 +26,9 @@ function install_torchbench() {
  python install.py --continue_on_fail
  # soxr comes from https://github.com/huggingface/transformers/pull/39429
  pip install transformers==4.54.0 soxr==0.5.0
  echo "Print all dependencies after TorchBench is installed"
  python -mpip freeze
  popd
--- a/.ci/docker/common/install_nccl.sh
+++ b/.ci/docker/common/install_nccl.sh
@ -7,8 +7,6 @@ if [[ ${CUDA_VERSION:0:2} == "11" ]]; then
  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt)
 elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt)
 elif [[ ${CUDA_VERSION:0:2} == "13" ]]; then
  NCCL_VERSION=$(cat ci_commit_pins/nccl-cu13.txt)
 else
  echo "Unexpected CUDA_VERSION ${CUDA_VERSION}"
  exit 1
--- a/.ci/docker/ubuntu-rocm/Dockerfile
+++ b/.ci/docker/ubuntu-rocm/Dockerfile
@ -96,11 +96,11 @@ ARG ANACONDA_PYTHON_VERSION
 ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
+COPY ci_commit_pins/huggingface.txt huggingface.txt
 COPY ci_commit_pins/timm.txt timm.txt
 COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
 # (optional) Install non-default Ninja version
 ARG NINJA_VERSION
--- a/.ci/docker/ubuntu-xpu/Dockerfile
+++ b/.ci/docker/ubuntu-xpu/Dockerfile
@ -56,10 +56,10 @@ RUN rm install_openssl.sh
 ARG INDUCTOR_BENCHMARKS
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
+COPY ci_commit_pins/huggingface.txt huggingface.txt
 COPY ci_commit_pins/timm.txt timm.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
 # Install XPU Dependencies
 ARG XPU_VERSION
--- a/.ci/docker/ubuntu/Dockerfile
+++ b/.ci/docker/ubuntu/Dockerfile
@ -96,11 +96,11 @@ RUN rm install_openssl.sh
 ARG INDUCTOR_BENCHMARKS
 COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
 COPY ./common/common_utils.sh common_utils.sh
-COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
+COPY ci_commit_pins/huggingface.txt huggingface.txt
 COPY ci_commit_pins/timm.txt timm.txt
 COPY ci_commit_pins/torchbench.txt torchbench.txt
 RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
-RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
+RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
 ARG TRITON
 ARG TRITON_CPU
--- a/.ci/pytorch/macos-test.sh
+++ b/.ci/pytorch/macos-test.sh
@ -174,15 +174,17 @@ checkout_install_torchbench() {
    # to install and test other models
    python install.py --continue_on_fail
  fi
  popd
-  pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt
+  # soxr comes from https://github.com/huggingface/transformers/pull/39429
  pip install transformers==4.54.0 soxr==0.5.0
  # https://github.com/pytorch/pytorch/issues/160689 to remove torchao because
  # its current version 0.12.0 doesn't work with transformers 4.54.0
  pip uninstall -y torchao
  echo "Print all dependencies after TorchBench is installed"
  python -mpip freeze
  popd
 }
 torchbench_setup_macos() {
--- a/.github/ci_commit_pins/audio.txt
+++ b/.github/ci_commit_pins/audio.txt
@ -1 +1 @@
-02351a683668dd65bc82343e55245e308eb97b4e
+f92ceca80df7a36194468665d62b0f791b1826c5
--- a/.github/ci_commit_pins/vllm.txt
+++ b/.github/ci_commit_pins/vllm.txt
@ -1 +1 @@
-0fc8fa751a4321d6531467537ff77cf3c1c70260
+0ca2393b47e72c4424a49aa3b32c7c5d0e378a72
--- a/.github/ci_commit_pins/xla.txt
+++ b/.github/ci_commit_pins/xla.txt
@ -1 +1 @@
-a1c6ee92c85e8b0955c20892ed68f032a6015c09
+095faec1e7b6cc47220181e74ae9cde2605f9b00
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@ -1,20 +0,0 @@
 version: 2
 updates:
  # Update to the latest transformers version with dependabot
  - package-ecosystem: "pip"
    directory: "/.ci/docker/ci_commit_pins"
    schedule:
      interval: "daily"
    target-branch: "main"
    allow:
      - dependency-name: "transformers"
    commit-message:
      prefix: "[Dependabot] Update"
      include: "scope"
    labels:
      - "dependencies"
      - "open source"
      - "python"
      - "topic: not user facing"
      - "module: ci"
      - "module: inductor"
--- a/.github/pytorch-probot.yml
+++ b/.github/pytorch-probot.yml
@ -27,7 +27,6 @@ ciflow_push_tags:
 - ciflow/trunk
 - ciflow/unstable
 - ciflow/xpu
 - ciflow/vllm
 - ciflow/torchbench
 - ciflow/op-benchmark
 - ciflow/pull
--- a/.github/scripts/generate_binary_build_matrix.py
+++ b/.github/scripts/generate_binary_build_matrix.py
@ -54,7 +54,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -71,7 +71,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -88,7 +88,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
        "nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
-        "nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
+        "nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
--- a/.github/workflows/_linux-build.yml
+++ b/.github/workflows/_linux-build.yml
@ -96,13 +96,6 @@ on:
        required: false
        type: string
        default: ""
      build-external-packages:
        description: |
          If set, the build external packages and saves their wheels as artifacts
          use command separated list of packages to build ex: 'vllm,transformers'.
        required: false
        type: string
        default: ""
    secrets:
      HUGGING_FACE_HUB_TOKEN:
@ -363,26 +356,6 @@ jobs:
          END_TIME=$(date +%s)
          echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"
      - name: Build external packages
        id: build-external-packages
        if: inputs.build-external-packages != '' &&  steps.build.outcome != 'skipped'
        uses: ./.github/actions/build-external-packages
        with:
          build-targets: ${{ inputs.build-external-packages }}
          docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
          cuda-arch-list: ${{ inputs.cuda-arch-list }}
          output-dir: external
      - name: Move external packages to dist
        if: steps.build-external-packages.outputs.output_dir != '' && steps.build-external-packages.outcome != 'skipped'
        shell: bash
        run: |
          src="${{ steps.build-external-packages.outputs.output_dir }}"
          if [ -d "$src" ]; then
            mkdir -p "dist/$(dirname "$src")"
            mv "$src" "dist/$(dirname "$src")/"
          fi
      - name: Stop monitoring script
        if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
        shell: bash
--- a/.github/workflows/_mac-test.yml
+++ b/.github/workflows/_mac-test.yml
@ -136,7 +136,7 @@ jobs:
          MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
          MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
        run: |
-          "$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_json==0.6.7
+          "$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_sajson==0.6.7
          "$VENV_PATH/bin/python3" -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
          echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"
--- a/.github/workflows/build-almalinux-images.yml
+++ b/.github/workflows/build-almalinux-images.yml
@ -36,7 +36,7 @@ jobs:
    runs-on: linux.9xlarge.ephemeral
    strategy:
      matrix:
-        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"]
+        tag: ["cuda12.6", "cuda12.8", "cuda12.9", "rocm6.3", "rocm6.4", "cpu"]
    steps:
      - name: Build docker image
        uses: pytorch/pytorch/.github/actions/binary-docker-build@main
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@ -57,11 +57,6 @@ jobs:
          echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
      - name: Checkout optional submodules
        run: python3 tools/optional_submodules.py
      - name: Copy docs requirements for inclusion
        run: |
          # Replace symlink with actual file
          rm docs/requirements.txt || true
          cp .ci/docker/requirements-docs.txt docs/requirements.txt
      - name: Create source distribution
        run: |
            # Create new folder with specified name so extracting the archive yields that
--- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml
@ -132,7 +132,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_9-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -243,7 +243,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_10-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -354,7 +354,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_11-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -465,7 +465,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_12-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -576,7 +576,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -687,7 +687,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_13t-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -798,7 +798,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
@ -909,7 +909,7 @@ jobs:
      ALPINE_IMAGE: "arm64v8/alpine"
      build_name: manywheel-py3_14t-cuda-aarch64-12_9
      build_environment: linux-aarch64-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
      timeout-minutes: 420
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/generated-linux-binary-manywheel-main.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-main.yml
@ -60,7 +60,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_8-test:  # Testing
--- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml
+++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml
@ -127,7 +127,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_6-test:  # Testing
@ -193,7 +193,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_8-test:  # Testing
@ -259,7 +259,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_9-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_9-cuda12_9-test:  # Testing
@ -719,7 +719,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_6-test:  # Testing
@ -785,7 +785,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_8-test:  # Testing
@ -851,7 +851,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_10-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_10-cuda12_9-test:  # Testing
@ -1311,7 +1311,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_6-test:  # Testing
@ -1377,7 +1377,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_8-test:  # Testing
@ -1508,7 +1508,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_11-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_11-cuda12_9-test:  # Testing
@ -1968,7 +1968,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_6-test:  # Testing
@ -2034,7 +2034,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_8-test:  # Testing
@ -2100,7 +2100,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_12-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_12-cuda12_9-test:  # Testing
@ -2560,7 +2560,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_6-test:  # Testing
@ -2626,7 +2626,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_8-test:  # Testing
@ -2692,7 +2692,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13-cuda12_9-test:  # Testing
@ -3152,7 +3152,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_6-test:  # Testing
@ -3218,7 +3218,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_8-test:  # Testing
@ -3284,7 +3284,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_13t-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_13t-cuda12_9-test:  # Testing
@ -3744,7 +3744,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14-cuda12_6-test:  # Testing
@ -3810,7 +3810,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14-cuda12_8-test:  # Testing
@ -3876,7 +3876,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14-cuda12_9-test:  # Testing
@ -4336,7 +4336,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14t-cuda12_6
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14t-cuda12_6-test:  # Testing
@ -4402,7 +4402,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14t-cuda12_8
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14t-cuda12_8-test:  # Testing
@ -4468,7 +4468,7 @@ jobs:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
      build_name: manywheel-py3_14t-cuda12_9
      build_environment: linux-binary-manywheel
-      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+      PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
    secrets:
      github-token: ${{ secrets.GITHUB_TOKEN }}
  manywheel-py3_14t-cuda12_9-test:  # Testing
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@ -93,7 +93,7 @@ jobs:
      script: |
        CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
        echo "Running mypy"
-        ADDITIONAL_LINTRUNNER_ARGS="--take MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
+        ADDITIONAL_LINTRUNNER_ARGS="--take MYPY --all-files" .github/scripts/lintrunner.sh
  lintrunner-noclang:
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -111,9 +111,9 @@ jobs:
        CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
        echo "Running all other linters"
        if [ "$CHANGED_FILES" = '*' ]; then
-          ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
+          ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY --all-files" .github/scripts/lintrunner.sh
        else
-          ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh
+          ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY ${CHANGED_FILES}" .github/scripts/lintrunner.sh
        fi
  quick-checks:
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@ -156,13 +156,13 @@ jobs:
      sync-tag: asan-test
    secrets: inherit
-  linux-jammy-py3_10-clang12-onnx-build:
+  linux-jammy-py3_9-clang12-onnx-build:
-    name: linux-jammy-py3.10-clang12-onnx
+    name: linux-jammy-py3.9-clang12-onnx
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
-      build-environment: linux-jammy-py3.10-clang12-onnx
+      build-environment: linux-jammy-py3.9-clang12-onnx
      docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx
      test-matrix: |
        { include: [
@ -171,16 +171,16 @@ jobs:
        ]}
    secrets: inherit
-  linux-jammy-py3_10-clang12-onnx-test:
+  linux-jammy-py3_9-clang12-onnx-test:
-    name: linux-jammy-py3.10-clang12-onnx
+    name: linux-jammy-py3.9-clang12-onnx
    uses: ./.github/workflows/_linux-test.yml
    needs:
-      - linux-jammy-py3_10-clang12-onnx-build
+      - linux-jammy-py3_9-clang12-onnx-build
      - target-determination
    with:
-      build-environment: linux-jammy-py3.10-clang12-onnx
+      build-environment: linux-jammy-py3.9-clang12-onnx
-      docker-image: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.docker-image }}
+      docker-image: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.docker-image }}
-      test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }}
+      test-matrix: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.test-matrix }}
    secrets: inherit
  linux-jammy-py3_9-clang12-build:
--- a/.github/workflows/vllm.yml
+++ b/.github/workflows/vllm.yml
@ -1,45 +0,0 @@
 name: vllm-test
 on:
  push:
    tags:
      - ciflow/vllm/*
  workflow_dispatch:
 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
  cancel-in-progress: true
 permissions:
  id-token: write
  contents: read
 jobs:
  get-label-type:
    name: get-label-type
    uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
    if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
    with:
      triggering_actor: ${{ github.triggering_actor }}
      issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
      curr_branch: ${{ github.head_ref || github.ref_name }}
      curr_ref_type: ${{ github.ref_type }}
      opt_out_experiments: lf
  torch-build-sm89:
    name: sm89-vllm-test
    uses: ./.github/workflows/_linux-build.yml
    needs: get-label-type
    with:
      build-additional-packages: "vision audio torchao"
      build-external-packages: "vllm"
      build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm89
      docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
      cuda-arch-list: '8.9'
      runner: linux.24xlarge.memory
      test-matrix: |
        { include: [
          { config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu"  },
          { config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
        ]}
    secrets: inherit
--- a/.gitignore
+++ b/.gitignore
@ -32,7 +32,6 @@ coverage.xml
 aten/build/
 aten/src/ATen/Config.h
 aten/src/ATen/cuda/CUDAConfig.h
 aten/src/ATen/hip/HIPConfig.h
 benchmarks/.data
 caffe2/cpp_test/
 dist/
--- a/aten/src/ATen/WrapDimUtils.h
+++ b/aten/src/ATen/WrapDimUtils.h
@ -121,7 +121,7 @@ inline int64_t legacy_cat_wrap_dim_symint(
    const std::vector<std::vector<c10::SymInt>>& tensor_sizes) {
  for (auto& sizes : tensor_sizes) {
    if (sizes.size() == 1) {
-      if (TORCH_GUARD_OR_FALSE(sizes[0].sym_eq(0))) {
+      if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[0].sym_eq(0))) {
        continue;
      }
    }
@ -135,7 +135,7 @@ inline int64_t legacy_cat_wrap_dim(
    const MaterializedITensorListRef& tensors) {
  for (const Tensor& tensor : tensors) {
    if (tensor.dim() == 1) {
-      if (TORCH_GUARD_OR_FALSE(tensor.sym_sizes()[0].sym_eq(0))) {
+      if (TORCH_GUARD_SIZE_OBLIVIOUS(tensor.sym_sizes()[0].sym_eq(0))) {
        continue;
      }
    }
--- a/aten/src/ATen/cuda/CUDABlas.cpp
+++ b/aten/src/ATen/cuda/CUDABlas.cpp
@ -1847,12 +1847,8 @@ int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fa
  switch (scaling_type) {
    case ScalingType::BlockWise1x32:
      TORCH_CHECK(scale_dtype == kFloat8_e8m0fnu);
-#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000)
+#if CUDA_VERSION >= 12080
 #ifdef USE_ROCM
      return HIPBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0;
 #else
      return CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0;
 #endif // USE_ROCM
 #else
      TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales of 1x32 blocks is only supported for CUDA 12.8 and above");
 #endif // if CUDA_VERSION >= 12080
@ -1950,26 +1946,12 @@ void scaled_gemm(
  // hipblaslt supported row-wise before cublas, and did so their own way (via
  // the SCALE_POINTERSs), but then migrated to match how cublas does it (via
  // the SCALE_MODEs). Here we check for this early custom mode.
  bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise);
 #if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
-  if (use_rowwise) {
+  if (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise) {
    matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
    matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
  }
-  else if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) {
+#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
  #if ROCM_VERSION >= 70000
            if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) {
                // TODO: add constraints based on hipblaslt internals
                TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
                           "Matrix dimensions must be multiples of 32 for MX format. "
                           "Got m=", m, ", n=", n, ", k=", k);
            }
  #endif
  }
 #else
  // rowwise isn't supported using cublaslt or older hipblaslt
  TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt");
 #endif  // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
  computeDesc.setAttribute(matmulDescA, mat1_scale_ptr);
  computeDesc.setAttribute(matmulDescB, mat2_scale_ptr);
  if (result_scale_ptr != nullptr) {
@ -2008,16 +1990,15 @@ void scaled_gemm(
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, CUBLASLT_EPILOGUE_BIAS);
    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype));
  }
-    // For other data types, use the get_scale_mode function based on scaling type
+
-    // The SCALE_MODE attrs only exist in cuBLAS 12.8+/ROCm 7.0 or in recent hipblaslt,
+  // The SCALE_MODE attrs only exist in cuBLAS 12.8+ or in recent hipblaslt,
-    // but we must invoke get_scale_mode anyways to trigger the version checks.
+  // but we must invoke get_scale_mode anyways to trigger the version checks.
-    // Note that AMD/ROCm follows OCP Spec 1.0, which is different from NVIDIA's implementation. See get_scale_mode() for details.
+  [[maybe_unused]] int a_scale_mode = get_scale_mode(mat1_scaling_type, mat1_scale_dtype, use_fast_accum);
-    [[maybe_unused]] int a_scale_mode = get_scale_mode(mat1_scaling_type, mat1_scale_dtype, use_fast_accum);
+  [[maybe_unused]] int b_scale_mode = get_scale_mode(mat2_scaling_type, mat2_scale_dtype, use_fast_accum);
-    [[maybe_unused]] int b_scale_mode = get_scale_mode(mat2_scaling_type, mat2_scale_dtype, use_fast_accum);
+#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC))
-#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000 && defined(HIPBLASLT_OUTER_VEC))
+  computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, a_scale_mode);
-    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, a_scale_mode);
+  computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode);
-    computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode);
+#endif
 #endif // if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000 && defined(HIPBLASLT_OUTER_VEC))
  CuBlasLtMatmulPreference preference;
  auto ltworkspace = CublasLtWorkspace();
--- a/aten/src/ATen/cuda/CUDADataType.h
+++ b/aten/src/ATen/cuda/CUDADataType.h
@ -90,7 +90,7 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type)
    case c10::ScalarType::Float8_e5m2fnuz:
      return HIP_R_8F_E5M2_FNUZ;
 #endif
-#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080) || (defined(USE_ROCM) && ROCM_VERSION >= 70000)
+#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080)
    case c10::ScalarType::Float4_e2m1fn_x2:
      return CUDA_R_4F_E2M1;
 #endif
--- a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h
+++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h
@ -85,15 +85,6 @@ constexpr hipDataType HipDataTypeFor<c10::Float8_e8m0fnu>() {
  return static_cast<hipDataType>(500);
 }
 template <>
 constexpr hipDataType HipDataTypeFor<c10::Float4_e2m1fn_x2>() {
 #if ROCM_VERSION >= 70000
  return HIP_R_4F_E2M1;
 #else
  return static_cast<hipDataType>(33);
 #endif
 }
 template <typename T>
 int GetBatchFromParams(const GemmParams<T>* params) {
  return 1;
--- a/aten/src/ATen/native/cuda/Blas.cpp
+++ b/aten/src/ATen/native/cuda/Blas.cpp
@ -1283,35 +1283,15 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
  if (use_fast_accum) {
    TORCH_CHECK(mat1.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat2.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat1` or `mat2` tensors have the `Float4_e2m1fn_x2` dtype.");
  }
 #ifdef USE_ROCM
  if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2 || mat2.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
    TORCH_CHECK(ROCM_VERSION >= 70000, "Float4_e2m1fn_x2 is only supported for ROCm 7.0 and above");
  }
  if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) {
    TORCH_CHECK(ROCM_VERSION >= 60500, "Float8_e5m2 is only supported for ROCm 6.5 and above");
  }
  if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) {
    TORCH_CHECK(ROCM_VERSION >= 60500, "Float8_e4m3fn is only supported for ROCm 6.5 and above");
  }
 #endif
  if (bias) {
-    TORCH_CHECK(out.scalar_type() != kFloat,
+    TORCH_CHECK(out.scalar_type() != kFloat, "Bias is not supported when out_dtype is set to Float32");
-        "Bias is not supported when out_dtype is set to Float32");
+    TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || bias->scalar_type() == ScalarType::Half,
-
+         "Bias must be either Half or BFloat16, but got ", bias->scalar_type());
-    TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 ||
+    TORCH_CHECK((out.scalar_type() != kFloat && out.scalar_type() != ScalarType::BFloat16) ||
-                bias->scalar_type() == ScalarType::Half,
+          bias->scalar_type() == ScalarType::BFloat16,
-        "Bias must be BFloat16 or Half, but got ", bias->scalar_type());
+          "Bias must be BFloat16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type());
-
+    TORCH_CHECK(out.scalar_type() != ScalarType::Half || bias->scalar_type() == ScalarType::Half,
-    TORCH_CHECK((out.scalar_type() != kFloat &&
+          "Bias must be Float16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type());
                 out.scalar_type() != ScalarType::BFloat16) ||
                bias->scalar_type() == ScalarType::BFloat16,
        "Bias must be BFloat16 to compute ", out.scalar_type(),
        " output, but got ", bias->scalar_type());
    TORCH_CHECK(out.scalar_type() != ScalarType::Half ||
                bias->scalar_type() == ScalarType::Half,
        "Bias must be Float16 to compute ", out.scalar_type(),
        " output, but got ", bias->scalar_type());
  }
  {
    auto bias_ = bias.value_or(Tensor());
@ -1373,22 +1353,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
    TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16,
         "hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type());
  }
  else if (scaling_choice_a == ScalingType::BlockWise1x32 && scaling_choice_b == ScalingType::BlockWise1x32) {
    #if ROCM_VERSION >= 70000
    TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
                "Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
    TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
                mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
                "Matrix dimensions must be multiples of 32 for block-wise scaling");
    TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
                out.scalar_type() == ScalarType::Half,
                "Block-wise scaling only supports BFloat16 or Half output types");
 #else
    TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
 #endif
  }
 #endif
  cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, scale_result, scaling_choice_a, scaling_choice_b);
@ -1466,14 +1430,12 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
      params.k = args.k;
      params.a = args.mata->data_ptr();
      params.a_scale_ptr = args.scale_mata_ptr;
      params.a_scale_dtype = args.scale_mata_dtype.value();
      params.lda = args.lda;
      params.a_dtype = args.mata->scalar_type();
      params.a_scale_dtype = args.scale_mata_dtype.value();
      params.a_scaling_type = args.scaling_mata_type.value();
      params.b = args.matb->data_ptr();
      params.b_scale_ptr = args.scale_matb_ptr;
      params.b_scale_dtype = args.scale_matb_dtype.value();
      params.ldb = args.ldb;
      params.b_dtype = args.matb->scalar_type();
      params.b_scale_dtype = args.scale_matb_dtype.value();
--- a/aten/src/ATen/native/mps/kernels/GridSampler.metal
+++ b/aten/src/ATen/native/mps/kernels/GridSampler.metal
@ -19,7 +19,9 @@ struct GridSamplerOffsets {
 static GridSamplerOffsets find_grid_sampler_offsets(
    constant int32_t* output_sizes,
    constant int32_t* output_strides,
    constant int32_t* input_sizes,
    constant int32_t* input_strides,
    constant int32_t* grid_sizes,
    constant int32_t* grid_strides,
    int32_t sampler_dims,
    uint tid) {
@ -276,13 +278,16 @@ kernel void grid_sampler(
  auto output_strides = params.output_strides.data();
  auto input_sizes = params.input_sizes.data();
  auto input_strides = params.input_strides.data();
  auto grid_sizes = params.grid_sizes.data();
  auto grid_strides = params.grid_strides.data();
  auto sampler_dims = params.sampler_dims;
  auto offsets = find_grid_sampler_offsets(
      output_sizes,
      output_strides,
      input_sizes,
      input_strides,
      grid_sizes,
      grid_strides,
      sampler_dims,
      tid);
--- a/aten/src/ATen/native/mps/operations/ReduceOps.mm
+++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm
@ -456,7 +456,7 @@ static Tensor std_var_common_impl_mps(const Tensor& input_t,
    errMessage += ": reduction dim must be in the range of input shape";
    for (const auto dim : dim_value) {
      auto wrap_dim = maybe_wrap_dim(dim, num_input_dims);
-      TORCH_CHECK(wrap_dim < (num_input_dims ? num_input_dims : 1), errMessage.c_str())
+      TORCH_CHECK(wrap_dim < static_cast<decltype(wrap_dim)>(input_shape.size()), errMessage.c_str())
    }
  }
--- a/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip
+++ b/aten/src/ATen/native/transformers/hip/flash_attn/aot/mha_all_aot.hip
@ -243,6 +243,12 @@ mha_fwd_aot(const at::Tensor &q,         // batch_size x seqlen_q x num_heads x
  } else {
    softmax_fa_t = at::empty({ 0, 0, 0, 0 }, opts);
  }
  at::Tensor atomic_counter;
  if (is_causal) {
    atomic_counter = at::zeros({1}, opts.dtype(at::kInt));
  }
  auto [needs_swa, window_left, window_right] = calculate_swa(window_size_left,
                                                              window_size_right,
                                                              seqlen_q,
@ -256,14 +262,6 @@ mha_fwd_aot(const at::Tensor &q,         // batch_size x seqlen_q x num_heads x
  constexpr bool uses_swa = false;
 #endif
  // SWA in AOTriton Kernels is treated as "Generalized Causal masks"
  is_causal = is_causal || uses_swa;
  at::Tensor atomic_counter;
  if (is_causal) {
    atomic_counter = at::zeros({1}, opts.dtype(at::kInt));
  }
  hipError_t err; // TODO: Error handling
  using aotriton::v2::flash::attn_fwd;
  using sdp::aotriton_adapter::mk_aotensor;
@ -457,9 +455,6 @@ mha_varlen_fwd_aot(const at::Tensor &q,  // total_q x num_heads x head_size, tot
  constexpr bool uses_swa = false;
 #endif
  // SWA in AOTriton Kernels is treated as "Generalized Causal masks"
  is_causal = is_causal || needs_swa;
  auto [seed_t, offset_t, philox_state, use_philox_state] =
    prepare_philox_arguments(p_dropout, batch_size * num_heads * 32);
--- a/benchmarks/dynamo/common.py
+++ b/benchmarks/dynamo/common.py
@ -4190,7 +4190,7 @@ def run(runner, args, original_dir=None):
                nonlocal marked
                for i, s in enumerate(t.size()):
                    if s == batch_size:
-                        torch._dynamo.maybe_mark_dynamic(t, i)
+                        torch._dynamo.mark_dynamic(t, i)
                        marked = True
                        break
--- a/benchmarks/dynamo/huggingface.py
+++ b/benchmarks/dynamo/huggingface.py
@ -370,7 +370,6 @@ class HuggingfaceRunner(BenchmarkRunner):
        return name in [
            "ElectraForQuestionAnswering",
            "MegatronBertForQuestionAnswering",
            "GPT2ForSequenceClassification",
        ]
    def _get_model_cls_and_config(self, model_name):
--- a/build_variables.bzl
+++ b/build_variables.bzl
@ -631,7 +631,6 @@ libtorch_nativert_sources = [
    "torch/nativert/kernels/NativeKernels.cpp",
    "torch/nativert/kernels/GeneratedStaticDispatchKernels.cpp",
    "torch/nativert/kernels/GeneratedNativeStaticDispatchKernels.cpp",
    "torch/nativert/graph/passes/SubgraphRewriter.cpp",
 ]
 torch_mobile_tracer_sources = [
--- a/requirements.txt
+++ b/requirements.txt
@ -10,7 +10,7 @@ filelock
 fsspec>=0.8.5
 hypothesis
 jinja2
-lintrunner ; platform_machine != "s390x" and platform_machine != "riscv64"
+lintrunner ; platform_machine != "s390x"
 networkx>=2.5.1
 optree>=0.13.0
 psutil
--- a/test/cpp/nativert/CMakeLists.txt
+++ b/test/cpp/nativert/CMakeLists.txt
@ -36,7 +36,6 @@ set(NATIVERT_TEST_SRCS
  ${TORCH_ROOT}/torch/nativert/kernels/AutoFunctionalizeKernel.cpp
  ${TORCH_ROOT}/torch/nativert/kernels/CallTorchBindKernel.cpp
  ${TORCH_ROOT}/torch/nativert/kernels/HigherOrderKernel.cpp
  ${TORCH_ROOT}/torch/nativert/graph/passes/SubgraphRewriter.cpp
 )
 add_executable(test_nativert
--- a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp
+++ b/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/csrc/kernel.cpp
@ -288,16 +288,6 @@ void boxed_empty_like(StableIValue* stack, uint64_t num_args, uint64_t num_outpu
  stack[0] = from(res);
 }
 bool my_is_cpu(Tensor t) {
  return t.is_cpu();
 }
 void boxed_my_is_cpu(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
  auto res = my_is_cpu(to<Tensor>(stack[0]));
  stack[0] = from(res);
 }
 Tensor fill_infinity(Tensor t) {
  auto value = std::numeric_limits<float>::infinity();
  return fill_(t, value);
@ -354,7 +344,6 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
  m.impl("my_transpose", &boxed_my_transpose);
  m.impl("my_empty_like", &boxed_empty_like);
  m.impl("fill_infinity", &boxed_fill_infinity);
  m.impl("my_is_cpu", &boxed_my_is_cpu);
 }
 STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeImplicitAutograd, m) {
@ -373,8 +362,6 @@ void boxed_my_zero_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs
 STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
  m.def("my_zero_(Tensor(a!) t) -> Tensor(a!)");
  m.def("my_is_cpu(Tensor t) -> bool");
 }
 STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {
--- a/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/ops.py
+++ b/test/cpp_extensions/libtorch_agnostic_extension/libtorch_agnostic/ops.py
@ -51,19 +51,6 @@ def my_abs(t) -> Tensor:
    return torch.ops.libtorch_agnostic.my_abs.default(t)
 def my_is_cpu(t) -> bool:
    """
    Returns is_cpu on the input tensor.
    Args:
        t: any Tensor
    Returns:
        a bool
    """
    return torch.ops.libtorch_agnostic.my_is_cpu.default(t)
 def my_ones_like(tensor, device) -> Tensor:
    """
    Returns a new Tensor like the input tensor, but with all ones
--- a/test/cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py
+++ b/test/cpp_extensions/libtorch_agnostic_extension/test/test_libtorch_agnostic.py
@ -209,13 +209,6 @@ if not IS_WINDOWS:
            self.assertEqual(id(out), id(t))
            self.assertEqual(out, torch.zeros_like(t))
        def test_my_is_cpu(self, device):
            import libtorch_agnostic
            t = torch.rand(2, 7, device=device)
            out = libtorch_agnostic.ops.my_is_cpu(t)
            self.assertEqual(out, t.is_cpu)
        def test_fill_infinity(self, device):
            import libtorch_agnostic
--- a/test/dynamo/cpython/3_13/test_collections.diff
+++ b/test/dynamo/cpython/3_13/test_collections.diff
@ -1,5 +1,5 @@
 diff --git a/test/dynamo/cpython/3_13/test_collections.py b/test/dynamo/cpython/3_13/test_collections.py
-index cafc44007d1..4571e5a14fd 100644
+index cafc44007d1..1ee548abc7d 100644
 --- a/test/dynamo/cpython/3_13/test_collections.py
 +++ b/test/dynamo/cpython/3_13/test_collections.py
@@ -1,3 +1,23 @@
@ -35,21 +35,7 @@ index cafc44007d1..4571e5a14fd 100644
     def _superset_test(self, a, b):
         self.assertGreaterEqual(
             set(dir(a)),
-@@ -73,9 +93,10 @@ class TestUserObjects(unittest.TestCase):
+@@ -85,7 +105,7 @@ class TestUserObjects(unittest.TestCase):
         self._copy_test(obj)
     def test_dict_missing(self):
 -        class A(UserDict):
 -            def __missing__(self, key):
 -                return 456
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class A(UserDict):
 +                def __missing__(self, key):
 +                    return 456
         self.assertEqual(A()[123], 456)
         # get() ignores __missing__ on dict
         self.assertIs(A().get(123), None)
@@ -85,7 +106,7 @@ class TestUserObjects(unittest.TestCase):
 ### ChainMap (helper class for configparser and the string module)
 ################################################################################
@ -58,69 +44,7 @@ index cafc44007d1..4571e5a14fd 100644
     def test_basics(self):
         c = ChainMap()
-@@ -172,9 +193,10 @@ class TestChainMap(unittest.TestCase):
+@@ -315,7 +335,7 @@ class TestChainMap(unittest.TestCase):
         self.assertTrue(ChainMap({}, {1:2}))
     def test_missing(self):
 -        class DefaultChainMap(ChainMap):
 -            def __missing__(self, key):
 -                return 999
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class DefaultChainMap(ChainMap):
 +                def __missing__(self, key):
 +                    return 999
         d = DefaultChainMap(dict(a=1, b=2), dict(b=20, c=30))
         for k, v in dict(a=1, b=2, c=30, d=999).items():
             self.assertEqual(d[k], v)                                  # check __getitem__ w/missing
@@ -206,13 +228,14 @@ class TestChainMap(unittest.TestCase):
              ('i', 9999), ('j', 0)])
     def test_iter_not_calling_getitem_on_maps(self):
 -        class DictWithGetItem(UserDict):
 -            def __init__(self, *args, **kwds):
 -                self.called = False
 -                UserDict.__init__(self, *args, **kwds)
 -            def __getitem__(self, item):
 -                self.called = True
 -                UserDict.__getitem__(self, item)
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class DictWithGetItem(UserDict):
 +                def __init__(self, *args, **kwds):
 +                    self.called = False
 +                    UserDict.__init__(self, *args, **kwds)
 +                def __getitem__(self, item):
 +                    self.called = True
 +                    UserDict.__getitem__(self, item)
         d = DictWithGetItem(a=1)
         c = ChainMap(d)
@@ -237,15 +260,16 @@ class TestChainMap(unittest.TestCase):
         self.assertIs(m, d.maps[0])
         # Use a different map than a dict
 -        class lowerdict(dict):
 -            def __getitem__(self, key):
 -                if isinstance(key, str):
 -                    key = key.lower()
 -                return dict.__getitem__(self, key)
 -            def __contains__(self, key):
 -                if isinstance(key, str):
 -                    key = key.lower()
 -                return dict.__contains__(self, key)
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class lowerdict(dict):
 +                def __getitem__(self, key):
 +                    if isinstance(key, str):
 +                        key = key.lower()
 +                    return dict.__getitem__(self, key)
 +                def __contains__(self, key):
 +                    if isinstance(key, str):
 +                        key = key.lower()
 +                    return dict.__contains__(self, key)
         c = ChainMap()
         c['a'] = 1
@@ -315,7 +339,7 @@ class TestChainMap(unittest.TestCase):
 TestNT = namedtuple('TestNT', 'x y z')    # type used for pickle tests
@ -129,19 +53,7 @@ index cafc44007d1..4571e5a14fd 100644
     def test_factory(self):
         Point = namedtuple('Point', 'x y')
-@@ -666,8 +690,9 @@ class TestNamedTuple(unittest.TestCase):
+@@ -722,7 +742,7 @@ class TestNamedTuple(unittest.TestCase):
             NT = namedtuple('NT', ['abc', 'def'], False, True)
     def test_namedtuple_subclass_issue_24931(self):
 -        class Point(namedtuple('_Point', ['x', 'y'])):
 -            pass
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class Point(namedtuple('_Point', ['x', 'y'])):
 +                pass
         a = Point(3, 4)
         self.assertEqual(a._asdict(), OrderedDict([('x', 3), ('y', 4)]))
@@ -722,21 +747,26 @@ class TestNamedTuple(unittest.TestCase):
 ### Abstract Base Classes
 ################################################################################
@ -150,750 +62,7 @@ index cafc44007d1..4571e5a14fd 100644
     def validate_abstract_methods(self, abc, *names):
         methodstubs = dict.fromkeys(names, lambda s, *args: 0)
- 
+@@ -2059,7 +2079,7 @@ class CounterSubclassWithGet(Counter):
         # everything should work will all required methods are present
 -        C = type('C', (abc,), methodstubs)
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            C = type('C', (abc,), methodstubs)
         C()
 +        # Dynamo raises a hard error here that we can't easily capture
 +        # Commenting this part as this would also fail in eager if a user
 +        # attempt to run the same code
 +
         # instantiation should fail if a required method is missing
 -        for name in names:
 -            stubs = methodstubs.copy()
 -            del stubs[name]
 -            C = type('C', (abc,), stubs)
 -            self.assertRaises(TypeError, C, name)
 +        # for name in names:
 +        #     stubs = methodstubs.copy()
 +        #     del stubs[name]
 +        #     C = type('C', (abc,), stubs)
 +        #     self.assertRaises(TypeError, C, name)
     def validate_isinstance(self, abc, name):
         stub = lambda s, *args: 0
@@ -981,19 +1011,21 @@ class TestOneTrickPonyABCs(ABCTestCase):
         for x in samples:
             self.assertIsInstance(x, Iterable)
             self.assertTrue(issubclass(type(x), Iterable), repr(type(x)))
 -        # Check direct subclassing
 -        class I(Iterable):
 -            def __iter__(self):
 -                return super().__iter__()
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            # Check direct subclassing
 +            class I(Iterable):
 +                def __iter__(self):
 +                    return super().__iter__()
         self.assertEqual(list(I()), [])
         self.assertFalse(issubclass(str, I))
         self.validate_abstract_methods(Iterable, '__iter__')
         self.validate_isinstance(Iterable, '__iter__')
 -        # Check None blocking
 -        class It:
 -            def __iter__(self): return iter([])
 -        class ItBlocked(It):
 -            __iter__ = None
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            # Check None blocking
 +            class It:
 +                def __iter__(self): return iter([])
 +            class ItBlocked(It):
 +                __iter__ = None
         self.assertTrue(issubclass(It, Iterable))
         self.assertTrue(isinstance(It(), Iterable))
         self.assertFalse(issubclass(ItBlocked, Iterable))
@@ -1023,32 +1055,35 @@ class TestOneTrickPonyABCs(ABCTestCase):
         self.assertTrue(issubclass(Sequence, Reversible), repr(Sequence))
         self.assertFalse(issubclass(Mapping, Reversible), repr(Mapping))
         self.assertFalse(issubclass(MutableMapping, Reversible), repr(MutableMapping))
 -        # Check direct subclassing
 -        class R(Reversible):
 -            def __iter__(self):
 -                return iter(list())
 -            def __reversed__(self):
 -                return iter(list())
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            # Check direct subclassing
 +            class R(Reversible):
 +                def __iter__(self):
 +                    return iter(list())
 +                def __reversed__(self):
 +                    return iter(list())
         self.assertEqual(list(reversed(R())), [])
         self.assertFalse(issubclass(float, R))
         self.validate_abstract_methods(Reversible, '__reversed__', '__iter__')
 -        # Check reversible non-iterable (which is not Reversible)
 -        class RevNoIter:
 -            def __reversed__(self): return reversed([])
 -        class RevPlusIter(RevNoIter):
 -            def __iter__(self): return iter([])
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            # Check reversible non-iterable (which is not Reversible)
 +            class RevNoIter:
 +                def __reversed__(self): return reversed([])
 +            class RevPlusIter(RevNoIter):
 +                def __iter__(self): return iter([])
         self.assertFalse(issubclass(RevNoIter, Reversible))
         self.assertFalse(isinstance(RevNoIter(), Reversible))
         self.assertTrue(issubclass(RevPlusIter, Reversible))
         self.assertTrue(isinstance(RevPlusIter(), Reversible))
 -        # Check None blocking
 -        class Rev:
 -            def __iter__(self): return iter([])
 -            def __reversed__(self): return reversed([])
 -        class RevItBlocked(Rev):
 -            __iter__ = None
 -        class RevRevBlocked(Rev):
 -            __reversed__ = None
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            # Check None blocking
 +            class Rev:
 +                def __iter__(self): return iter([])
 +                def __reversed__(self): return reversed([])
 +            class RevItBlocked(Rev):
 +                __iter__ = None
 +            class RevRevBlocked(Rev):
 +                __reversed__ = None
         self.assertTrue(issubclass(Rev, Reversible))
         self.assertTrue(isinstance(Rev(), Reversible))
         self.assertFalse(issubclass(RevItBlocked, Reversible))
@@ -1082,15 +1117,16 @@ class TestOneTrickPonyABCs(ABCTestCase):
         self.assertTrue(issubclass(Set, Collection), repr(Set))
         self.assertTrue(issubclass(MutableSet, Collection), repr(MutableSet))
         self.assertTrue(issubclass(Sequence, Collection), repr(MutableSet))
 -        # Check direct subclassing
 -        class Col(Collection):
 -            def __iter__(self):
 -                return iter(list())
 -            def __len__(self):
 -                return 0
 -            def __contains__(self, item):
 -                return False
 -        class DerCol(Col): pass
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            # Check direct subclassing
 +            class Col(Collection):
 +                def __iter__(self):
 +                    return iter(list())
 +                def __len__(self):
 +                    return 0
 +                def __contains__(self, item):
 +                    return False
 +            class DerCol(Col): pass
         self.assertEqual(list(iter(Col())), [])
         self.assertFalse(issubclass(list, Col))
         self.assertFalse(issubclass(set, Col))
@@ -1102,44 +1138,48 @@ class TestOneTrickPonyABCs(ABCTestCase):
         self.validate_abstract_methods(Collection, '__len__', '__iter__',
                                                    '__contains__')
         # Check sized container non-iterable (which is not Collection) etc.
 -        class ColNoIter:
 -            def __len__(self): return 0
 -            def __contains__(self, item): return False
 -        class ColNoSize:
 -            def __iter__(self): return iter([])
 -            def __contains__(self, item): return False
 -        class ColNoCont:
 -            def __iter__(self): return iter([])
 -            def __len__(self): return 0
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class ColNoIter:
 +                def __len__(self): return 0
 +                def __contains__(self, item): return False
 +            class ColNoSize:
 +                def __iter__(self): return iter([])
 +                def __contains__(self, item): return False
 +            class ColNoCont:
 +                def __iter__(self): return iter([])
 +                def __len__(self): return 0
         self.assertFalse(issubclass(ColNoIter, Collection))
         self.assertFalse(isinstance(ColNoIter(), Collection))
         self.assertFalse(issubclass(ColNoSize, Collection))
         self.assertFalse(isinstance(ColNoSize(), Collection))
         self.assertFalse(issubclass(ColNoCont, Collection))
         self.assertFalse(isinstance(ColNoCont(), Collection))
 -        # Check None blocking
 -        class SizeBlock:
 -            def __iter__(self): return iter([])
 -            def __contains__(self): return False
 -            __len__ = None
 -        class IterBlock:
 -            def __len__(self): return 0
 -            def __contains__(self): return True
 -            __iter__ = None
 +
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            # Check None blocking
 +            class SizeBlock:
 +                def __iter__(self): return iter([])
 +                def __contains__(self): return False
 +                __len__ = None
 +            class IterBlock:
 +                def __len__(self): return 0
 +                def __contains__(self): return True
 +                __iter__ = None
         self.assertFalse(issubclass(SizeBlock, Collection))
         self.assertFalse(isinstance(SizeBlock(), Collection))
         self.assertFalse(issubclass(IterBlock, Collection))
         self.assertFalse(isinstance(IterBlock(), Collection))
 -        # Check None blocking in subclass
 -        class ColImpl:
 -            def __iter__(self):
 -                return iter(list())
 -            def __len__(self):
 -                return 0
 -            def __contains__(self, item):
 -                return False
 -        class NonCol(ColImpl):
 -            __contains__ = None
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            # Check None blocking in subclass
 +            class ColImpl:
 +                def __iter__(self):
 +                    return iter(list())
 +                def __len__(self):
 +                    return 0
 +                def __contains__(self, item):
 +                    return False
 +            class NonCol(ColImpl):
 +                __contains__ = None
         self.assertFalse(issubclass(NonCol, Collection))
         self.assertFalse(isinstance(NonCol(), Collection))
@@ -1162,30 +1202,32 @@ class TestOneTrickPonyABCs(ABCTestCase):
             self.assertTrue(issubclass(type(x), Iterator), repr(type(x)))
         self.validate_abstract_methods(Iterator, '__next__', '__iter__')
 -        # Issue 10565
 -        class NextOnly:
 -            def __next__(self):
 -                yield 1
 -                return
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            # Issue 10565
 +            class NextOnly:
 +                def __next__(self):
 +                    yield 1
 +                    return
         self.assertNotIsInstance(NextOnly(), Iterator)
     def test_Generator(self):
 -        class NonGen1:
 -            def __iter__(self): return self
 -            def __next__(self): return None
 -            def close(self): pass
 -            def throw(self, typ, val=None, tb=None): pass
 -
 -        class NonGen2:
 -            def __iter__(self): return self
 -            def __next__(self): return None
 -            def close(self): pass
 -            def send(self, value): return value
 -
 -        class NonGen3:
 -            def close(self): pass
 -            def send(self, value): return value
 -            def throw(self, typ, val=None, tb=None): pass
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class NonGen1:
 +                def __iter__(self): return self
 +                def __next__(self): return None
 +                def close(self): pass
 +                def throw(self, typ, val=None, tb=None): pass
 +
 +            class NonGen2:
 +                def __iter__(self): return self
 +                def __next__(self): return None
 +                def close(self): pass
 +                def send(self, value): return value
 +
 +            class NonGen3:
 +                def close(self): pass
 +                def send(self, value): return value
 +                def throw(self, typ, val=None, tb=None): pass
         non_samples = [
             None, 42, 3.14, 1j, b"", "", (), [], {}, set(),
@@ -1194,18 +1236,19 @@ class TestOneTrickPonyABCs(ABCTestCase):
             self.assertNotIsInstance(x, Generator)
             self.assertFalse(issubclass(type(x), Generator), repr(type(x)))
 -        class Gen:
 -            def __iter__(self): return self
 -            def __next__(self): return None
 -            def close(self): pass
 -            def send(self, value): return value
 -            def throw(self, typ, val=None, tb=None): pass
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class Gen:
 +                def __iter__(self): return self
 +                def __next__(self): return None
 +                def close(self): pass
 +                def send(self, value): return value
 +                def throw(self, typ, val=None, tb=None): pass
 -        class MinimalGen(Generator):
 -            def send(self, value):
 -                return value
 -            def throw(self, typ, val=None, tb=None):
 -                super().throw(typ, val, tb)
 +            class MinimalGen(Generator):
 +                def send(self, value):
 +                    return value
 +                def throw(self, typ, val=None, tb=None):
 +                    super().throw(typ, val, tb)
         def gen():
             yield 1
@@ -1228,15 +1271,17 @@ class TestOneTrickPonyABCs(ABCTestCase):
                                mgen.throw, ValueError, ValueError("huhu"))
         self.assertRaises(StopIteration, mgen.throw, StopIteration())
 -        class FailOnClose(Generator):
 -            def send(self, value): return value
 -            def throw(self, *args): raise ValueError
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class FailOnClose(Generator):
 +                def send(self, value): return value
 +                def throw(self, *args): raise ValueError
         self.assertRaises(ValueError, FailOnClose().close)
 -        class IgnoreGeneratorExit(Generator):
 -            def send(self, value): return value
 -            def throw(self, *args): pass
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class IgnoreGeneratorExit(Generator):
 +                def send(self, value): return value
 +                def throw(self, *args): pass
         self.assertRaises(RuntimeError, IgnoreGeneratorExit().close)
@@ -1379,15 +1424,17 @@ class TestOneTrickPonyABCs(ABCTestCase):
     def test_direct_subclassing(self):
         for B in Hashable, Iterable, Iterator, Reversible, Sized, Container, Callable:
 -            class C(B):
 -                pass
 +            with torch._dynamo.set_fullgraph(fullgraph=False):
 +                class C(B):
 +                    pass
             self.assertTrue(issubclass(C, B))
             self.assertFalse(issubclass(int, C))
     def test_registration(self):
         for B in Hashable, Iterable, Iterator, Reversible, Sized, Container, Callable:
 -            class C:
 -                __hash__ = None  # Make sure it isn't hashable by default
 +            with torch._dynamo.set_fullgraph(fullgraph=False):
 +                class C:
 +                    __hash__ = None  # Make sure it isn't hashable by default
             self.assertFalse(issubclass(C, B), B.__name__)
             B.register(C)
             self.assertTrue(issubclass(C, B))
@@ -1423,13 +1470,14 @@ class TestCollectionABCs(ABCTestCase):
             self.assertIsInstance(sample(), Set)
             self.assertTrue(issubclass(sample, Set))
         self.validate_abstract_methods(Set, '__contains__', '__iter__', '__len__')
 -        class MySet(Set):
 -            def __contains__(self, x):
 -                return False
 -            def __len__(self):
 -                return 0
 -            def __iter__(self):
 -                return iter([])
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class MySet(Set):
 +                def __contains__(self, x):
 +                    return False
 +                def __len__(self):
 +                    return 0
 +                def __iter__(self):
 +                    return iter([])
         self.validate_comparison(MySet())
     def test_hash_Set(self):
@@ -1448,15 +1496,16 @@ class TestCollectionABCs(ABCTestCase):
         self.assertTrue(hash(a) == hash(b))
     def test_isdisjoint_Set(self):
 -        class MySet(Set):
 -            def __init__(self, itr):
 -                self.contents = itr
 -            def __contains__(self, x):
 -                return x in self.contents
 -            def __iter__(self):
 -                return iter(self.contents)
 -            def __len__(self):
 -                return len([x for x in self.contents])
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class MySet(Set):
 +                def __init__(self, itr):
 +                    self.contents = itr
 +                def __contains__(self, x):
 +                    return x in self.contents
 +                def __iter__(self):
 +                    return iter(self.contents)
 +                def __len__(self):
 +                    return len([x for x in self.contents])
         s1 = MySet((1, 2, 3))
         s2 = MySet((4, 5, 6))
         s3 = MySet((1, 5, 6))
@@ -1464,15 +1513,16 @@ class TestCollectionABCs(ABCTestCase):
         self.assertFalse(s1.isdisjoint(s3))
     def test_equality_Set(self):
 -        class MySet(Set):
 -            def __init__(self, itr):
 -                self.contents = itr
 -            def __contains__(self, x):
 -                return x in self.contents
 -            def __iter__(self):
 -                return iter(self.contents)
 -            def __len__(self):
 -                return len([x for x in self.contents])
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class MySet(Set):
 +                def __init__(self, itr):
 +                    self.contents = itr
 +                def __contains__(self, x):
 +                    return x in self.contents
 +                def __iter__(self):
 +                    return iter(self.contents)
 +                def __len__(self):
 +                    return len([x for x in self.contents])
         s1 = MySet((1,))
         s2 = MySet((1, 2))
         s3 = MySet((3, 4))
@@ -1486,15 +1536,16 @@ class TestCollectionABCs(ABCTestCase):
         self.assertNotEqual(s2, s3)
     def test_arithmetic_Set(self):
 -        class MySet(Set):
 -            def __init__(self, itr):
 -                self.contents = itr
 -            def __contains__(self, x):
 -                return x in self.contents
 -            def __iter__(self):
 -                return iter(self.contents)
 -            def __len__(self):
 -                return len([x for x in self.contents])
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class MySet(Set):
 +                def __init__(self, itr):
 +                    self.contents = itr
 +                def __contains__(self, x):
 +                    return x in self.contents
 +                def __iter__(self):
 +                    return iter(self.contents)
 +                def __len__(self):
 +                    return len([x for x in self.contents])
         s1 = MySet((1, 2, 3))
         s2 = MySet((3, 4, 5))
         s3 = s1 & s2
@@ -1516,28 +1567,29 @@ class TestCollectionABCs(ABCTestCase):
     def test_issue_4920(self):
         # MutableSet.pop() method did not work
 -        class MySet(MutableSet):
 -            __slots__=['__s']
 -            def __init__(self,items=None):
 -                if items is None:
 -                    items=[]
 -                self.__s=set(items)
 -            def __contains__(self,v):
 -                return v in self.__s
 -            def __iter__(self):
 -                return iter(self.__s)
 -            def __len__(self):
 -                return len(self.__s)
 -            def add(self,v):
 -                result=v not in self.__s
 -                self.__s.add(v)
 -                return result
 -            def discard(self,v):
 -                result=v in self.__s
 -                self.__s.discard(v)
 -                return result
 -            def __repr__(self):
 -                return "MySet(%s)" % repr(list(self))
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class MySet(MutableSet):
 +                __slots__=['__s']
 +                def __init__(self,items=None):
 +                    if items is None:
 +                        items=[]
 +                    self.__s=set(items)
 +                def __contains__(self,v):
 +                    return v in self.__s
 +                def __iter__(self):
 +                    return iter(self.__s)
 +                def __len__(self):
 +                    return len(self.__s)
 +                def add(self,v):
 +                    result=v not in self.__s
 +                    self.__s.add(v)
 +                    return result
 +                def discard(self,v):
 +                    result=v in self.__s
 +                    self.__s.discard(v)
 +                    return result
 +                def __repr__(self):
 +                    return "MySet(%s)" % repr(list(self))
         items = [5,43,2,1]
         s = MySet(items)
         r = s.pop()
@@ -1563,24 +1615,25 @@ class TestCollectionABCs(ABCTestCase):
     def test_issue16373(self):
         # Recursion error comparing comparable and noncomparable
         # Set instances
 -        class MyComparableSet(Set):
 -            def __contains__(self, x):
 -                return False
 -            def __len__(self):
 -                return 0
 -            def __iter__(self):
 -                return iter([])
 -        class MyNonComparableSet(Set):
 -            def __contains__(self, x):
 -                return False
 -            def __len__(self):
 -                return 0
 -            def __iter__(self):
 -                return iter([])
 -            def __le__(self, x):
 -                return NotImplemented
 -            def __lt__(self, x):
 -                return NotImplemented
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class MyComparableSet(Set):
 +                def __contains__(self, x):
 +                    return False
 +                def __len__(self):
 +                    return 0
 +                def __iter__(self):
 +                    return iter([])
 +            class MyNonComparableSet(Set):
 +                def __contains__(self, x):
 +                    return False
 +                def __len__(self):
 +                    return 0
 +                def __iter__(self):
 +                    return iter([])
 +                def __le__(self, x):
 +                    return NotImplemented
 +                def __lt__(self, x):
 +                    return NotImplemented
         cs = MyComparableSet()
         ncs = MyNonComparableSet()
@@ -1591,13 +1644,14 @@ class TestCollectionABCs(ABCTestCase):
     def test_issue26915(self):
         # Container membership test should check identity first
 -        class CustomSequence(Sequence):
 -            def __init__(self, seq):
 -                self._seq = seq
 -            def __getitem__(self, index):
 -                return self._seq[index]
 -            def __len__(self):
 -                return len(self._seq)
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class CustomSequence(Sequence):
 +                def __init__(self, seq):
 +                    self._seq = seq
 +                def __getitem__(self, index):
 +                    return self._seq[index]
 +                def __len__(self):
 +                    return len(self._seq)
         nan = float('nan')
         obj = support.NEVER_EQ
@@ -1622,30 +1676,31 @@ class TestCollectionABCs(ABCTestCase):
     def test_Set_from_iterable(self):
         """Verify _from_iterable overridden to an instance method works."""
 -        class SetUsingInstanceFromIterable(MutableSet):
 -            def __init__(self, values, created_by):
 -                if not created_by:
 -                    raise ValueError('created_by must be specified')
 -                self.created_by = created_by
 -                self._values = set(values)
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class SetUsingInstanceFromIterable(MutableSet):
 +                def __init__(self, values, created_by):
 +                    if not created_by:
 +                        raise ValueError('created_by must be specified')
 +                    self.created_by = created_by
 +                    self._values = set(values)
 -            def _from_iterable(self, values):
 -                return type(self)(values, 'from_iterable')
 +                def _from_iterable(self, values):
 +                    return type(self)(values, 'from_iterable')
 -            def __contains__(self, value):
 -                return value in self._values
 +                def __contains__(self, value):
 +                    return value in self._values
 -            def __iter__(self):
 -                yield from self._values
 +                def __iter__(self):
 +                    yield from self._values
 -            def __len__(self):
 -                return len(self._values)
 +                def __len__(self):
 +                    return len(self._values)
 -            def add(self, value):
 -                self._values.add(value)
 +                def add(self, value):
 +                    self._values.add(value)
 -            def discard(self, value):
 -                self._values.discard(value)
 +                def discard(self, value):
 +                    self._values.discard(value)
         impl = SetUsingInstanceFromIterable([1, 2, 3], 'test')
@@ -1678,20 +1733,21 @@ class TestCollectionABCs(ABCTestCase):
     def test_Set_interoperability_with_real_sets(self):
         # Issue: 8743
 -        class ListSet(Set):
 -            def __init__(self, elements=()):
 -                self.data = []
 -                for elem in elements:
 -                    if elem not in self.data:
 -                        self.data.append(elem)
 -            def __contains__(self, elem):
 -                return elem in self.data
 -            def __iter__(self):
 -                return iter(self.data)
 -            def __len__(self):
 -                return len(self.data)
 -            def __repr__(self):
 -                return 'Set({!r})'.format(self.data)
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class ListSet(Set):
 +                def __init__(self, elements=()):
 +                    self.data = []
 +                    for elem in elements:
 +                        if elem not in self.data:
 +                            self.data.append(elem)
 +                def __contains__(self, elem):
 +                    return elem in self.data
 +                def __iter__(self):
 +                    return iter(self.data)
 +                def __len__(self):
 +                    return len(self.data)
 +                def __repr__(self):
 +                    return 'Set({!r})'.format(self.data)
         r1 = set('abc')
         r2 = set('bcd')
@@ -1846,13 +1902,14 @@ class TestCollectionABCs(ABCTestCase):
             self.assertTrue(issubclass(sample, Mapping))
         self.validate_abstract_methods(Mapping, '__contains__', '__iter__', '__len__',
             '__getitem__')
 -        class MyMapping(Mapping):
 -            def __len__(self):
 -                return 0
 -            def __getitem__(self, i):
 -                raise IndexError
 -            def __iter__(self):
 -                return iter(())
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class MyMapping(Mapping):
 +                def __len__(self):
 +                    return 0
 +                def __getitem__(self, i):
 +                    raise IndexError
 +                def __iter__(self):
 +                    return iter(())
         self.validate_comparison(MyMapping())
         self.assertRaises(TypeError, reversed, MyMapping())
@@ -1860,7 +1917,7 @@ class TestCollectionABCs(ABCTestCase):
         for sample in [dict]:
             self.assertIsInstance(sample(), MutableMapping)
             self.assertTrue(issubclass(sample, MutableMapping))
 -        self.validate_abstract_methods(MutableMapping, '__contains__', '__iter__', '__len__',
 +        self.validate_abstract_methods(MutableMapping, '__iter__', '__len__',
             '__getitem__', '__setitem__', '__delitem__')
     def test_MutableMapping_subclass(self):
@@ -1903,15 +1960,16 @@ class TestCollectionABCs(ABCTestCase):
             '__getitem__')
     def test_Sequence_mixins(self):
 -        class SequenceSubclass(Sequence):
 -            def __init__(self, seq=()):
 -                self.seq = seq
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class SequenceSubclass(Sequence):
 +                def __init__(self, seq=()):
 +                    self.seq = seq
 -            def __getitem__(self, index):
 -                return self.seq[index]
 +                def __getitem__(self, index):
 +                    return self.seq[index]
 -            def __len__(self):
 -                return len(self.seq)
 +                def __len__(self):
 +                    return len(self.seq)
         # Compare Sequence.index() behavior to (list|str).index() behavior
         def assert_index_same(seq1, seq2, index_args):
@@ -1983,24 +2041,25 @@ class TestCollectionABCs(ABCTestCase):
     def test_MutableSequence_mixins(self):
         # Test the mixins of MutableSequence by creating a minimal concrete
         # class inherited from it.
 -        class MutableSequenceSubclass(MutableSequence):
 -            def __init__(self):
 -                self.lst = []
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class MutableSequenceSubclass(MutableSequence):
 +                def __init__(self):
 +                    self.lst = []
 -            def __setitem__(self, index, value):
 -                self.lst[index] = value
 +                def __setitem__(self, index, value):
 +                    self.lst[index] = value
 -            def __getitem__(self, index):
 -                return self.lst[index]
 +                def __getitem__(self, index):
 +                    return self.lst[index]
 -            def __len__(self):
 -                return len(self.lst)
 +                def __len__(self):
 +                    return len(self.lst)
 -            def __delitem__(self, index):
 -                del self.lst[index]
 +                def __delitem__(self, index):
 +                    del self.lst[index]
 -            def insert(self, index, value):
 -                self.lst.insert(index, value)
 +                def insert(self, index, value):
 +                    self.lst.insert(index, value)
         mss = MutableSequenceSubclass()
         mss.append(0)
@@ -2059,7 +2118,7 @@ class CounterSubclassWithGet(Counter):
         self.called = True
         return Counter.get(self, key, default)
@ -902,19 +71,7 @@ index cafc44007d1..4571e5a14fd 100644
     def test_basics(self):
         c = Counter('abcaba')
-@@ -2225,8 +2284,9 @@ class TestCounter(unittest.TestCase):
+@@ -2402,10 +2422,5 @@ class TestCounter(unittest.TestCase):
         check(Counter(words))
     def test_copy_subclass(self):
 -        class MyCounter(Counter):
 -            pass
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            class MyCounter(Counter):
 +                pass
         c = MyCounter('slartibartfast')
         d = c.copy()
         self.assertEqual(d, c)
@@ -2402,10 +2462,5 @@ class TestCounter(unittest.TestCase):
         self.assertFalse(Counter(a=2, b=1, c=0) > Counter('aab'))
--- a/test/dynamo/cpython/3_13/test_collections.py
+++ b/test/dynamo/cpython/3_13/test_collections.py
@ -93,10 +93,9 @@ class TestUserObjects(__TestCase):
        self._copy_test(obj)
    def test_dict_missing(self):
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class A(UserDict):
-            class A(UserDict):
+            def __missing__(self, key):
-                def __missing__(self, key):
+                return 456
                    return 456
        self.assertEqual(A()[123], 456)
        # get() ignores __missing__ on dict
        self.assertIs(A().get(123), None)
@ -193,10 +192,9 @@ class TestChainMap(__TestCase):
        self.assertTrue(ChainMap({}, {1:2}))
    def test_missing(self):
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class DefaultChainMap(ChainMap):
-            class DefaultChainMap(ChainMap):
+            def __missing__(self, key):
-                def __missing__(self, key):
+                return 999
                    return 999
        d = DefaultChainMap(dict(a=1, b=2), dict(b=20, c=30))
        for k, v in dict(a=1, b=2, c=30, d=999).items():
            self.assertEqual(d[k], v)                                  # check __getitem__ w/missing
@ -228,14 +226,13 @@ class TestChainMap(__TestCase):
             ('i', 9999), ('j', 0)])
    def test_iter_not_calling_getitem_on_maps(self):
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class DictWithGetItem(UserDict):
-            class DictWithGetItem(UserDict):
+            def __init__(self, *args, **kwds):
-                def __init__(self, *args, **kwds):
+                self.called = False
-                    self.called = False
+                UserDict.__init__(self, *args, **kwds)
-                    UserDict.__init__(self, *args, **kwds)
+            def __getitem__(self, item):
-                def __getitem__(self, item):
+                self.called = True
-                    self.called = True
+                UserDict.__getitem__(self, item)
                    UserDict.__getitem__(self, item)
        d = DictWithGetItem(a=1)
        c = ChainMap(d)
@ -260,16 +257,15 @@ class TestChainMap(__TestCase):
        self.assertIs(m, d.maps[0])
        # Use a different map than a dict
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class lowerdict(dict):
-            class lowerdict(dict):
+            def __getitem__(self, key):
-                def __getitem__(self, key):
+                if isinstance(key, str):
-                    if isinstance(key, str):
+                    key = key.lower()
-                        key = key.lower()
+                return dict.__getitem__(self, key)
-                    return dict.__getitem__(self, key)
+            def __contains__(self, key):
-                def __contains__(self, key):
+                if isinstance(key, str):
-                    if isinstance(key, str):
+                    key = key.lower()
-                        key = key.lower()
+                return dict.__contains__(self, key)
                    return dict.__contains__(self, key)
        c = ChainMap()
        c['a'] = 1
@ -690,9 +686,8 @@ class TestNamedTuple(__TestCase):
            NT = namedtuple('NT', ['abc', 'def'], False, True)
    def test_namedtuple_subclass_issue_24931(self):
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class Point(namedtuple('_Point', ['x', 'y'])):
-            class Point(namedtuple('_Point', ['x', 'y'])):
+            pass
                pass
        a = Point(3, 4)
        self.assertEqual(a._asdict(), OrderedDict([('x', 3), ('y', 4)]))
@ -753,20 +748,15 @@ class ABCTestCase(__TestCase):
        methodstubs = dict.fromkeys(names, lambda s, *args: 0)
        # everything should work will all required methods are present
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        C = type('C', (abc,), methodstubs)
            C = type('C', (abc,), methodstubs)
        C()
        # Dynamo raises a hard error here that we can't easily capture
        # Commenting this part as this would also fail in eager if a user
        # attempt to run the same code
        # instantiation should fail if a required method is missing
-        # for name in names:
+        for name in names:
-        #     stubs = methodstubs.copy()
+            stubs = methodstubs.copy()
-        #     del stubs[name]
+            del stubs[name]
-        #     C = type('C', (abc,), stubs)
+            C = type('C', (abc,), stubs)
-        #     self.assertRaises(TypeError, C, name)
+            self.assertRaises(TypeError, C, name)
    def validate_isinstance(self, abc, name):
        stub = lambda s, *args: 0
@ -1011,21 +1001,19 @@ class TestOneTrickPonyABCs(ABCTestCase):
        for x in samples:
            self.assertIsInstance(x, Iterable)
            self.assertTrue(issubclass(type(x), Iterable), repr(type(x)))
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        # Check direct subclassing
-            # Check direct subclassing
+        class I(Iterable):
-            class I(Iterable):
+            def __iter__(self):
-                def __iter__(self):
+                return super().__iter__()
                    return super().__iter__()
        self.assertEqual(list(I()), [])
        self.assertFalse(issubclass(str, I))
        self.validate_abstract_methods(Iterable, '__iter__')
        self.validate_isinstance(Iterable, '__iter__')
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        # Check None blocking
-            # Check None blocking
+        class It:
-            class It:
+            def __iter__(self): return iter([])
-                def __iter__(self): return iter([])
+        class ItBlocked(It):
-            class ItBlocked(It):
+            __iter__ = None
                __iter__ = None
        self.assertTrue(issubclass(It, Iterable))
        self.assertTrue(isinstance(It(), Iterable))
        self.assertFalse(issubclass(ItBlocked, Iterable))
@ -1055,35 +1043,32 @@ class TestOneTrickPonyABCs(ABCTestCase):
        self.assertTrue(issubclass(Sequence, Reversible), repr(Sequence))
        self.assertFalse(issubclass(Mapping, Reversible), repr(Mapping))
        self.assertFalse(issubclass(MutableMapping, Reversible), repr(MutableMapping))
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        # Check direct subclassing
-            # Check direct subclassing
+        class R(Reversible):
-            class R(Reversible):
+            def __iter__(self):
-                def __iter__(self):
+                return iter(list())
-                    return iter(list())
+            def __reversed__(self):
-                def __reversed__(self):
+                return iter(list())
                    return iter(list())
        self.assertEqual(list(reversed(R())), [])
        self.assertFalse(issubclass(float, R))
        self.validate_abstract_methods(Reversible, '__reversed__', '__iter__')
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        # Check reversible non-iterable (which is not Reversible)
-            # Check reversible non-iterable (which is not Reversible)
+        class RevNoIter:
-            class RevNoIter:
+            def __reversed__(self): return reversed([])
-                def __reversed__(self): return reversed([])
+        class RevPlusIter(RevNoIter):
-            class RevPlusIter(RevNoIter):
+            def __iter__(self): return iter([])
                def __iter__(self): return iter([])
        self.assertFalse(issubclass(RevNoIter, Reversible))
        self.assertFalse(isinstance(RevNoIter(), Reversible))
        self.assertTrue(issubclass(RevPlusIter, Reversible))
        self.assertTrue(isinstance(RevPlusIter(), Reversible))
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        # Check None blocking
-            # Check None blocking
+        class Rev:
-            class Rev:
+            def __iter__(self): return iter([])
-                def __iter__(self): return iter([])
+            def __reversed__(self): return reversed([])
-                def __reversed__(self): return reversed([])
+        class RevItBlocked(Rev):
-            class RevItBlocked(Rev):
+            __iter__ = None
-                __iter__ = None
+        class RevRevBlocked(Rev):
-            class RevRevBlocked(Rev):
+            __reversed__ = None
                __reversed__ = None
        self.assertTrue(issubclass(Rev, Reversible))
        self.assertTrue(isinstance(Rev(), Reversible))
        self.assertFalse(issubclass(RevItBlocked, Reversible))
@ -1117,16 +1102,15 @@ class TestOneTrickPonyABCs(ABCTestCase):
        self.assertTrue(issubclass(Set, Collection), repr(Set))
        self.assertTrue(issubclass(MutableSet, Collection), repr(MutableSet))
        self.assertTrue(issubclass(Sequence, Collection), repr(MutableSet))
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        # Check direct subclassing
-            # Check direct subclassing
+        class Col(Collection):
-            class Col(Collection):
+            def __iter__(self):
-                def __iter__(self):
+                return iter(list())
-                    return iter(list())
+            def __len__(self):
-                def __len__(self):
+                return 0
-                    return 0
+            def __contains__(self, item):
-                def __contains__(self, item):
+                return False
-                    return False
+        class DerCol(Col): pass
            class DerCol(Col): pass
        self.assertEqual(list(iter(Col())), [])
        self.assertFalse(issubclass(list, Col))
        self.assertFalse(issubclass(set, Col))
@ -1138,48 +1122,44 @@ class TestOneTrickPonyABCs(ABCTestCase):
        self.validate_abstract_methods(Collection, '__len__', '__iter__',
                                                   '__contains__')
        # Check sized container non-iterable (which is not Collection) etc.
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class ColNoIter:
-            class ColNoIter:
+            def __len__(self): return 0
-                def __len__(self): return 0
+            def __contains__(self, item): return False
-                def __contains__(self, item): return False
+        class ColNoSize:
-            class ColNoSize:
+            def __iter__(self): return iter([])
-                def __iter__(self): return iter([])
+            def __contains__(self, item): return False
-                def __contains__(self, item): return False
+        class ColNoCont:
-            class ColNoCont:
+            def __iter__(self): return iter([])
-                def __iter__(self): return iter([])
+            def __len__(self): return 0
                def __len__(self): return 0
        self.assertFalse(issubclass(ColNoIter, Collection))
        self.assertFalse(isinstance(ColNoIter(), Collection))
        self.assertFalse(issubclass(ColNoSize, Collection))
        self.assertFalse(isinstance(ColNoSize(), Collection))
        self.assertFalse(issubclass(ColNoCont, Collection))
        self.assertFalse(isinstance(ColNoCont(), Collection))
-
+        # Check None blocking
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class SizeBlock:
-            # Check None blocking
+            def __iter__(self): return iter([])
-            class SizeBlock:
+            def __contains__(self): return False
-                def __iter__(self): return iter([])
+            __len__ = None
-                def __contains__(self): return False
+        class IterBlock:
-                __len__ = None
+            def __len__(self): return 0
-            class IterBlock:
+            def __contains__(self): return True
-                def __len__(self): return 0
+            __iter__ = None
                def __contains__(self): return True
                __iter__ = None
        self.assertFalse(issubclass(SizeBlock, Collection))
        self.assertFalse(isinstance(SizeBlock(), Collection))
        self.assertFalse(issubclass(IterBlock, Collection))
        self.assertFalse(isinstance(IterBlock(), Collection))
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        # Check None blocking in subclass
-            # Check None blocking in subclass
+        class ColImpl:
-            class ColImpl:
+            def __iter__(self):
-                def __iter__(self):
+                return iter(list())
-                    return iter(list())
+            def __len__(self):
-                def __len__(self):
+                return 0
-                    return 0
+            def __contains__(self, item):
-                def __contains__(self, item):
+                return False
-                    return False
+        class NonCol(ColImpl):
-            class NonCol(ColImpl):
+            __contains__ = None
                __contains__ = None
        self.assertFalse(issubclass(NonCol, Collection))
        self.assertFalse(isinstance(NonCol(), Collection))
@ -1202,32 +1182,30 @@ class TestOneTrickPonyABCs(ABCTestCase):
            self.assertTrue(issubclass(type(x), Iterator), repr(type(x)))
        self.validate_abstract_methods(Iterator, '__next__', '__iter__')
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        # Issue 10565
-            # Issue 10565
+        class NextOnly:
-            class NextOnly:
+            def __next__(self):
-                def __next__(self):
+                yield 1
-                    yield 1
+                return
                    return
        self.assertNotIsInstance(NextOnly(), Iterator)
    def test_Generator(self):
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class NonGen1:
-            class NonGen1:
+            def __iter__(self): return self
-                def __iter__(self): return self
+            def __next__(self): return None
-                def __next__(self): return None
+            def close(self): pass
-                def close(self): pass
+            def throw(self, typ, val=None, tb=None): pass
                def throw(self, typ, val=None, tb=None): pass
-            class NonGen2:
+        class NonGen2:
-                def __iter__(self): return self
+            def __iter__(self): return self
-                def __next__(self): return None
+            def __next__(self): return None
-                def close(self): pass
+            def close(self): pass
-                def send(self, value): return value
+            def send(self, value): return value
-            class NonGen3:
+        class NonGen3:
-                def close(self): pass
+            def close(self): pass
-                def send(self, value): return value
+            def send(self, value): return value
-                def throw(self, typ, val=None, tb=None): pass
+            def throw(self, typ, val=None, tb=None): pass
        non_samples = [
            None, 42, 3.14, 1j, b"", "", (), [], {}, set(),
@ -1236,19 +1214,18 @@ class TestOneTrickPonyABCs(ABCTestCase):
            self.assertNotIsInstance(x, Generator)
            self.assertFalse(issubclass(type(x), Generator), repr(type(x)))
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class Gen:
-            class Gen:
+            def __iter__(self): return self
-                def __iter__(self): return self
+            def __next__(self): return None
-                def __next__(self): return None
+            def close(self): pass
-                def close(self): pass
+            def send(self, value): return value
-                def send(self, value): return value
+            def throw(self, typ, val=None, tb=None): pass
                def throw(self, typ, val=None, tb=None): pass
-            class MinimalGen(Generator):
+        class MinimalGen(Generator):
-                def send(self, value):
+            def send(self, value):
-                    return value
+                return value
-                def throw(self, typ, val=None, tb=None):
+            def throw(self, typ, val=None, tb=None):
-                    super().throw(typ, val, tb)
+                super().throw(typ, val, tb)
        def gen():
            yield 1
@ -1271,17 +1248,15 @@ class TestOneTrickPonyABCs(ABCTestCase):
                               mgen.throw, ValueError, ValueError("huhu"))
        self.assertRaises(StopIteration, mgen.throw, StopIteration())
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class FailOnClose(Generator):
-            class FailOnClose(Generator):
+            def send(self, value): return value
-                def send(self, value): return value
+            def throw(self, *args): raise ValueError
                def throw(self, *args): raise ValueError
        self.assertRaises(ValueError, FailOnClose().close)
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class IgnoreGeneratorExit(Generator):
-            class IgnoreGeneratorExit(Generator):
+            def send(self, value): return value
-                def send(self, value): return value
+            def throw(self, *args): pass
                def throw(self, *args): pass
        self.assertRaises(RuntimeError, IgnoreGeneratorExit().close)
@ -1424,17 +1399,15 @@ class TestOneTrickPonyABCs(ABCTestCase):
    def test_direct_subclassing(self):
        for B in Hashable, Iterable, Iterator, Reversible, Sized, Container, Callable:
-            with torch._dynamo.set_fullgraph(fullgraph=False):
+            class C(B):
-                class C(B):
+                pass
                    pass
            self.assertTrue(issubclass(C, B))
            self.assertFalse(issubclass(int, C))
    def test_registration(self):
        for B in Hashable, Iterable, Iterator, Reversible, Sized, Container, Callable:
-            with torch._dynamo.set_fullgraph(fullgraph=False):
+            class C:
-                class C:
+                __hash__ = None  # Make sure it isn't hashable by default
                    __hash__ = None  # Make sure it isn't hashable by default
            self.assertFalse(issubclass(C, B), B.__name__)
            B.register(C)
            self.assertTrue(issubclass(C, B))
@ -1470,14 +1443,13 @@ class TestCollectionABCs(ABCTestCase):
            self.assertIsInstance(sample(), Set)
            self.assertTrue(issubclass(sample, Set))
        self.validate_abstract_methods(Set, '__contains__', '__iter__', '__len__')
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class MySet(Set):
-            class MySet(Set):
+            def __contains__(self, x):
-                def __contains__(self, x):
+                return False
-                    return False
+            def __len__(self):
-                def __len__(self):
+                return 0
-                    return 0
+            def __iter__(self):
-                def __iter__(self):
+                return iter([])
                    return iter([])
        self.validate_comparison(MySet())
    def test_hash_Set(self):
@ -1496,16 +1468,15 @@ class TestCollectionABCs(ABCTestCase):
        self.assertTrue(hash(a) == hash(b))
    def test_isdisjoint_Set(self):
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class MySet(Set):
-            class MySet(Set):
+            def __init__(self, itr):
-                def __init__(self, itr):
+                self.contents = itr
-                    self.contents = itr
+            def __contains__(self, x):
-                def __contains__(self, x):
+                return x in self.contents
-                    return x in self.contents
+            def __iter__(self):
-                def __iter__(self):
+                return iter(self.contents)
-                    return iter(self.contents)
+            def __len__(self):
-                def __len__(self):
+                return len([x for x in self.contents])
                    return len([x for x in self.contents])
        s1 = MySet((1, 2, 3))
        s2 = MySet((4, 5, 6))
        s3 = MySet((1, 5, 6))
@ -1513,16 +1484,15 @@ class TestCollectionABCs(ABCTestCase):
        self.assertFalse(s1.isdisjoint(s3))
    def test_equality_Set(self):
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class MySet(Set):
-            class MySet(Set):
+            def __init__(self, itr):
-                def __init__(self, itr):
+                self.contents = itr
-                    self.contents = itr
+            def __contains__(self, x):
-                def __contains__(self, x):
+                return x in self.contents
-                    return x in self.contents
+            def __iter__(self):
-                def __iter__(self):
+                return iter(self.contents)
-                    return iter(self.contents)
+            def __len__(self):
-                def __len__(self):
+                return len([x for x in self.contents])
                    return len([x for x in self.contents])
        s1 = MySet((1,))
        s2 = MySet((1, 2))
        s3 = MySet((3, 4))
@ -1536,16 +1506,15 @@ class TestCollectionABCs(ABCTestCase):
        self.assertNotEqual(s2, s3)
    def test_arithmetic_Set(self):
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class MySet(Set):
-            class MySet(Set):
+            def __init__(self, itr):
-                def __init__(self, itr):
+                self.contents = itr
-                    self.contents = itr
+            def __contains__(self, x):
-                def __contains__(self, x):
+                return x in self.contents
-                    return x in self.contents
+            def __iter__(self):
-                def __iter__(self):
+                return iter(self.contents)
-                    return iter(self.contents)
+            def __len__(self):
-                def __len__(self):
+                return len([x for x in self.contents])
                    return len([x for x in self.contents])
        s1 = MySet((1, 2, 3))
        s2 = MySet((3, 4, 5))
        s3 = s1 & s2
@ -1567,29 +1536,28 @@ class TestCollectionABCs(ABCTestCase):
    def test_issue_4920(self):
        # MutableSet.pop() method did not work
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class MySet(MutableSet):
-            class MySet(MutableSet):
+            __slots__=['__s']
-                __slots__=['__s']
+            def __init__(self,items=None):
-                def __init__(self,items=None):
+                if items is None:
-                    if items is None:
+                    items=[]
-                        items=[]
+                self.__s=set(items)
-                    self.__s=set(items)
+            def __contains__(self,v):
-                def __contains__(self,v):
+                return v in self.__s
-                    return v in self.__s
+            def __iter__(self):
-                def __iter__(self):
+                return iter(self.__s)
-                    return iter(self.__s)
+            def __len__(self):
-                def __len__(self):
+                return len(self.__s)
-                    return len(self.__s)
+            def add(self,v):
-                def add(self,v):
+                result=v not in self.__s
-                    result=v not in self.__s
+                self.__s.add(v)
-                    self.__s.add(v)
+                return result
-                    return result
+            def discard(self,v):
-                def discard(self,v):
+                result=v in self.__s
-                    result=v in self.__s
+                self.__s.discard(v)
-                    self.__s.discard(v)
+                return result
-                    return result
+            def __repr__(self):
-                def __repr__(self):
+                return "MySet(%s)" % repr(list(self))
                    return "MySet(%s)" % repr(list(self))
        items = [5,43,2,1]
        s = MySet(items)
        r = s.pop()
@ -1615,25 +1583,24 @@ class TestCollectionABCs(ABCTestCase):
    def test_issue16373(self):
        # Recursion error comparing comparable and noncomparable
        # Set instances
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class MyComparableSet(Set):
-            class MyComparableSet(Set):
+            def __contains__(self, x):
-                def __contains__(self, x):
+                return False
-                    return False
+            def __len__(self):
-                def __len__(self):
+                return 0
-                    return 0
+            def __iter__(self):
-                def __iter__(self):
+                return iter([])
-                    return iter([])
+        class MyNonComparableSet(Set):
-            class MyNonComparableSet(Set):
+            def __contains__(self, x):
-                def __contains__(self, x):
+                return False
-                    return False
+            def __len__(self):
-                def __len__(self):
+                return 0
-                    return 0
+            def __iter__(self):
-                def __iter__(self):
+                return iter([])
-                    return iter([])
+            def __le__(self, x):
-                def __le__(self, x):
+                return NotImplemented
-                    return NotImplemented
+            def __lt__(self, x):
-                def __lt__(self, x):
+                return NotImplemented
                    return NotImplemented
        cs = MyComparableSet()
        ncs = MyNonComparableSet()
@ -1644,14 +1611,13 @@ class TestCollectionABCs(ABCTestCase):
    def test_issue26915(self):
        # Container membership test should check identity first
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class CustomSequence(Sequence):
-            class CustomSequence(Sequence):
+            def __init__(self, seq):
-                def __init__(self, seq):
+                self._seq = seq
-                    self._seq = seq
+            def __getitem__(self, index):
-                def __getitem__(self, index):
+                return self._seq[index]
-                    return self._seq[index]
+            def __len__(self):
-                def __len__(self):
+                return len(self._seq)
                    return len(self._seq)
        nan = float('nan')
        obj = support.NEVER_EQ
@ -1676,31 +1642,30 @@ class TestCollectionABCs(ABCTestCase):
    def test_Set_from_iterable(self):
        """Verify _from_iterable overridden to an instance method works."""
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class SetUsingInstanceFromIterable(MutableSet):
-            class SetUsingInstanceFromIterable(MutableSet):
+            def __init__(self, values, created_by):
-                def __init__(self, values, created_by):
+                if not created_by:
-                    if not created_by:
+                    raise ValueError('created_by must be specified')
-                        raise ValueError('created_by must be specified')
+                self.created_by = created_by
-                    self.created_by = created_by
+                self._values = set(values)
                    self._values = set(values)
-                def _from_iterable(self, values):
+            def _from_iterable(self, values):
-                    return type(self)(values, 'from_iterable')
+                return type(self)(values, 'from_iterable')
-                def __contains__(self, value):
+            def __contains__(self, value):
-                    return value in self._values
+                return value in self._values
-                def __iter__(self):
+            def __iter__(self):
-                    yield from self._values
+                yield from self._values
-                def __len__(self):
+            def __len__(self):
-                    return len(self._values)
+                return len(self._values)
-                def add(self, value):
+            def add(self, value):
-                    self._values.add(value)
+                self._values.add(value)
-                def discard(self, value):
+            def discard(self, value):
-                    self._values.discard(value)
+                self._values.discard(value)
        impl = SetUsingInstanceFromIterable([1, 2, 3], 'test')
@ -1733,21 +1698,20 @@ class TestCollectionABCs(ABCTestCase):
    def test_Set_interoperability_with_real_sets(self):
        # Issue: 8743
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class ListSet(Set):
-            class ListSet(Set):
+            def __init__(self, elements=()):
-                def __init__(self, elements=()):
+                self.data = []
-                    self.data = []
+                for elem in elements:
-                    for elem in elements:
+                    if elem not in self.data:
-                        if elem not in self.data:
+                        self.data.append(elem)
-                            self.data.append(elem)
+            def __contains__(self, elem):
-                def __contains__(self, elem):
+                return elem in self.data
-                    return elem in self.data
+            def __iter__(self):
-                def __iter__(self):
+                return iter(self.data)
-                    return iter(self.data)
+            def __len__(self):
-                def __len__(self):
+                return len(self.data)
-                    return len(self.data)
+            def __repr__(self):
-                def __repr__(self):
+                return 'Set({!r})'.format(self.data)
                    return 'Set({!r})'.format(self.data)
        r1 = set('abc')
        r2 = set('bcd')
@ -1902,14 +1866,13 @@ class TestCollectionABCs(ABCTestCase):
            self.assertTrue(issubclass(sample, Mapping))
        self.validate_abstract_methods(Mapping, '__contains__', '__iter__', '__len__',
            '__getitem__')
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class MyMapping(Mapping):
-            class MyMapping(Mapping):
+            def __len__(self):
-                def __len__(self):
+                return 0
-                    return 0
+            def __getitem__(self, i):
-                def __getitem__(self, i):
+                raise IndexError
-                    raise IndexError
+            def __iter__(self):
-                def __iter__(self):
+                return iter(())
                    return iter(())
        self.validate_comparison(MyMapping())
        self.assertRaises(TypeError, reversed, MyMapping())
@ -1917,7 +1880,7 @@ class TestCollectionABCs(ABCTestCase):
        for sample in [dict]:
            self.assertIsInstance(sample(), MutableMapping)
            self.assertTrue(issubclass(sample, MutableMapping))
-        self.validate_abstract_methods(MutableMapping, '__iter__', '__len__',
+        self.validate_abstract_methods(MutableMapping, '__contains__', '__iter__', '__len__',
            '__getitem__', '__setitem__', '__delitem__')
    def test_MutableMapping_subclass(self):
@ -1960,16 +1923,15 @@ class TestCollectionABCs(ABCTestCase):
            '__getitem__')
    def test_Sequence_mixins(self):
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class SequenceSubclass(Sequence):
-            class SequenceSubclass(Sequence):
+            def __init__(self, seq=()):
-                def __init__(self, seq=()):
+                self.seq = seq
                    self.seq = seq
-                def __getitem__(self, index):
+            def __getitem__(self, index):
-                    return self.seq[index]
+                return self.seq[index]
-                def __len__(self):
+            def __len__(self):
-                    return len(self.seq)
+                return len(self.seq)
        # Compare Sequence.index() behavior to (list|str).index() behavior
        def assert_index_same(seq1, seq2, index_args):
@ -2041,25 +2003,24 @@ class TestCollectionABCs(ABCTestCase):
    def test_MutableSequence_mixins(self):
        # Test the mixins of MutableSequence by creating a minimal concrete
        # class inherited from it.
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class MutableSequenceSubclass(MutableSequence):
-            class MutableSequenceSubclass(MutableSequence):
+            def __init__(self):
-                def __init__(self):
+                self.lst = []
                    self.lst = []
-                def __setitem__(self, index, value):
+            def __setitem__(self, index, value):
-                    self.lst[index] = value
+                self.lst[index] = value
-                def __getitem__(self, index):
+            def __getitem__(self, index):
-                    return self.lst[index]
+                return self.lst[index]
-                def __len__(self):
+            def __len__(self):
-                    return len(self.lst)
+                return len(self.lst)
-                def __delitem__(self, index):
+            def __delitem__(self, index):
-                    del self.lst[index]
+                del self.lst[index]
-                def insert(self, index, value):
+            def insert(self, index, value):
-                    self.lst.insert(index, value)
+                self.lst.insert(index, value)
        mss = MutableSequenceSubclass()
        mss.append(0)
@ -2284,9 +2245,8 @@ class TestCounter(__TestCase):
        check(Counter(words))
    def test_copy_subclass(self):
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        class MyCounter(Counter):
-            class MyCounter(Counter):
+            pass
                pass
        c = MyCounter('slartibartfast')
        d = c.copy()
        self.assertEqual(d, c)
--- a/test/dynamo/cpython/3_13/test_itertools.diff
+++ b/test/dynamo/cpython/3_13/test_itertools.diff
@ -1,5 +1,5 @@
 diff --git a/test/dynamo/cpython/3_13/test_itertools.py b/test/dynamo/cpython/3_13/test_itertools.py
-index 7d5ba727389..8d462284884 100644
+index 7d5ba727389..d15d83a2184 100644
 --- a/test/dynamo/cpython/3_13/test_itertools.py
 +++ b/test/dynamo/cpython/3_13/test_itertools.py
@@ -1,3 +1,25 @@
@ -151,7 +151,7 @@ index 7d5ba727389..8d462284884 100644
             _, g = next(it)
             next(it)
             next(it)
-@@ -1002,29 +1015,30 @@ class TestBasicOps(unittest.TestCase):
+@@ -1002,27 +1015,29 @@ class TestBasicOps(unittest.TestCase):
         self.assertEqual(list(filter(None, [0,1,0,2,0])), [1,2])
         self.assertEqual(list(filter(bool, [0,1,0,2,0])), [1,2])
         self.assertEqual(take(4, filter(isEven, count())), [0,2,4,6])
@ -198,24 +198,8 @@ index 7d5ba727389..8d462284884 100644
 +        #     c = filter(isEven, range(6))
 +        #     self.pickletest(proto, c)
-    @pickle_deprecated
+     @pickle_deprecated
     def test_filterfalse(self):
         self.assertEqual(list(filterfalse(isEven, range(6))), [1,3,5])
         self.assertEqual(list(filterfalse(None, [0,1,0,2,0])), [0,0,0])
@@ -1034,9 +1048,10 @@ class TestBasicOps(unittest.TestCase):
         self.assertRaises(TypeError, filterfalse, lambda x:x)
         self.assertRaises(TypeError, filterfalse, lambda x:x, range(6), 7)
         self.assertRaises(TypeError, filterfalse, isEven, 3)
 -        self.assertRaises(TypeError, next, filterfalse(range(6), range(6)))
 -        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
 -            self.pickletest(proto, filterfalse(isEven, range(6)))
 +        with torch._dynamo.set_fullgraph(fullgraph=False):
 +            self.assertRaises(TypeError, next, filterfalse(range(6), range(6)))
 +            for proto in range(pickle.HIGHEST_PROTOCOL + 1):
 +                self.pickletest(proto, filterfalse(isEven, range(6)))
     def test_zip(self):
         # XXX This is rather silly now that builtin zip() calls zip()...
@@ -1047,8 +1062,8 @@ class TestBasicOps(unittest.TestCase):
         self.assertEqual(take(3,zip('abcdef', count())), lzip('abcdef', range(3)))
         self.assertEqual(list(zip('abcdef')), lzip('abcdef'))
--- a/test/dynamo/cpython/3_13/test_itertools.py
+++ b/test/dynamo/cpython/3_13/test_itertools.py
@ -1039,6 +1039,7 @@ class TestBasicOps(__TestCase):
        #     c = filter(isEven, range(6))
        #     self.pickletest(proto, c)
    @pickle_deprecated
    def test_filterfalse(self):
        self.assertEqual(list(filterfalse(isEven, range(6))), [1,3,5])
        self.assertEqual(list(filterfalse(None, [0,1,0,2,0])), [0,0,0])
@ -1048,10 +1049,9 @@ class TestBasicOps(__TestCase):
        self.assertRaises(TypeError, filterfalse, lambda x:x)
        self.assertRaises(TypeError, filterfalse, lambda x:x, range(6), 7)
        self.assertRaises(TypeError, filterfalse, isEven, 3)
-        with torch._dynamo.set_fullgraph(fullgraph=False):
+        self.assertRaises(TypeError, next, filterfalse(range(6), range(6)))
-            self.assertRaises(TypeError, next, filterfalse(range(6), range(6)))
+        for proto in range(pickle.HIGHEST_PROTOCOL + 1):
-            for proto in range(pickle.HIGHEST_PROTOCOL + 1):
+            self.pickletest(proto, filterfalse(isEven, range(6)))
                self.pickletest(proto, filterfalse(isEven, range(6)))
    def test_zip(self):
        # XXX This is rather silly now that builtin zip() calls zip()...
--- a/test/dynamo/test_ctx_manager.py
+++ b/test/dynamo/test_ctx_manager.py
@ -1742,83 +1742,6 @@ class GraphModule(torch.nn.Module):
        opt_f = torch.compile(f, backend="eager")
        opt_f(torch.randn(2, 2))
    # Regression test to make sure dynamo won't crash on these kwargs.
    def test_sdpa_kernel_ctx_manager_kwargs(self):
        backends = [torch.nn.attention.SDPBackend.MATH]
        @torch._dynamo.allow_in_graph
        def check_backend_state_is_modified():
            self.assertEqual(
                set(torch.nn.attention._cur_sdpa_kernel_backends()),
                set(backends),
            )
        def f(x):
            with torch.nn.attention.sdpa_kernel(backends=backends, set_priority=True):
                x = x + 1
                check_backend_state_is_modified()
                x = x + 1
            return x
        opt_f = torch.compile(f, backend="eager")
        opt_f(torch.randn(2, 2))
    # Regression test to make sure dynamo won't graph break on calling functions
    # decorated with special context manager.
    def test_sdpa_kernel_ctx_manager_as_decorator(self):
        SDPA_BACKEND_PRIORITY = [
            torch.nn.attention.SDPBackend.MATH,
            torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION,
            torch.nn.attention.SDPBackend.FLASH_ATTENTION,
        ]
        @torch.nn.attention.sdpa_kernel(
            backends=SDPA_BACKEND_PRIORITY, set_priority=True
        )
        def scaled_dot_product_attention(q, k, v, *args, **kwargs):
            return torch.nn.functional.scaled_dot_product_attention(
                q, k, v, *args, **kwargs
            )
        def f(x):
            return scaled_dot_product_attention(x, x, x)
        opt_f = torch.compile(f, backend="eager", fullgraph=True)
        x = torch.rand(16, 16, 64, 256, dtype=torch.float16)
        ref = f(x)
        res = opt_f(x)
        self.assertEqual(ref, res)
    # Regression test to make sure the value of set_priority is used correctly.
    def test_sdpa_kernel_ctx_manager_set_priority(self):
        backends = [torch.nn.attention.SDPBackend.MATH]
        default_priority = torch._C._get_sdp_priority_order()
        @torch._dynamo.allow_in_graph
        def check_backend_priority(changed: bool):
            self.assertEqual(
                changed,
                torch._C._get_sdp_priority_order() != default_priority,
            )
        def f(x):
            with torch.nn.attention.sdpa_kernel(backends=backends, set_priority=True):
                x = x + 1
                check_backend_priority(changed=True)
                x = x + 1
            with torch.nn.attention.sdpa_kernel(backends=backends, set_priority=False):
                x = x + 1
                check_backend_priority(changed=False)
                x = x + 1
            return x
        opt_f = torch.compile(f, backend="eager")
        opt_f(torch.randn(2, 2))
    def test_torch_profiler_use_after_with_block(self):
        counters.clear()
--- a/test/dynamo/test_functions.py
+++ b/test/dynamo/test_functions.py
@ -310,12 +310,6 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
        itertools.permutations(filter(lambda x: True, [1, 2]))
        return a
    @make_test
    def test_itertools_filterfalse_basic(a, b):
        for x in itertools.filterfalse(lambda x: x > 0, [-0.5, 0, 0.5]):
            a += x
        return a
    @make_test
    def test_itertools_chain(a, b):
        v = a
@ -568,11 +562,6 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
        args = [a, b]
        return sub(*args)
    @make_test
    def test_tuple_map(a, b):
        t = tuple(map(torch.sin, [a, b]))
        return t[0] + t[1]
    def test_size_tuple_add(self):
        def fn():
            size = torch.Size([])
@ -2027,21 +2016,6 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
        tmp = mytuple(a, xy=b)
        return mytuple(tmp.x, tmp[1], tmp.xy + b)
    @make_test
    def test_namedtuple_replace(a, b):
        # Builds a namedtuple, calls `_replace` on it, and returns the sum of
        # the fields of the original instance.
        mytuple = collections.namedtuple("mytuple", ["x", "y"])
        t = mytuple(a, b)
        # The tuple returned by `_replace` is discarded; the return expression
        # below reads `t` itself.
        t._replace(x=b)
        return t.x + t.y
    @make_test
    def test_namedtuple_fields(a, b):
        # Branches on the namedtuple class's `_fields` attribute: returns
        # a + b when it equals ("x", "y"), otherwise a - b.
        mytuple = collections.namedtuple("mytuple", ["x", "y"])
        if mytuple._fields == ("x", "y"):
            return a + b
        else:
            return a - b
    class MyNamedTuple(NamedTuple):
        first: torch.Tensor
        second: torch.Tensor
--- a/test/dynamo/test_graph_deduplication.py
+++ b/test/dynamo/test_graph_deduplication.py
@ -4,16 +4,13 @@ import contextlib
 import torch
 import torch.fx
 from torch._dynamo.graph_deduplication import apply_graph_deduplication
 from torch._dynamo.graph_utils import _detect_cycles
 from torch._dynamo.output_graph import FakeRootModule
 from torch._dynamo.test_case import TestCase
 from torch._dynamo.testing import (
    AotEagerAndRecordGraphs,
    extract_graph_and_tracker,
    normalize_gm,
 )
 from torch.compiler import allow_in_graph
 from torch.utils._ordered_set import OrderedSet
@ -1109,104 +1106,6 @@ def forward(self, L_x_ : torch.Tensor, L_y_ : torch.Tensor):
    """,
        )
    def test_tuple_return(self):
        # Compiles (fullgraph) a function that calls the same inner function
        # three times, where the inner function ends in an allow_in_graph
        # callable returning a tuple, and compares against the eager result.
        @allow_in_graph
        def tuple_return(x, y):
            return x, y
        def inner_fn(x, y):
            x0 = x + x + 1
            y0 = y + y + 1
            return tuple_return(x0, y0)
        def fn(x0, x1, x2, y0, y1, y2):
            # Each call returns a 2-tuple, so the overall result is a tuple of tuples.
            x0 = inner_fn(x0, y0)
            x1 = inner_fn(x1, y1)
            x2 = inner_fn(x2, y2)
            return x0, x1, x2
        fn_opt = torch.compile(fn, fullgraph=True)
        # Six inputs: one per positional parameter of `fn`.
        inps = [torch.rand(10, 10) for _ in range(6)]
        result_compiled = fn_opt(*inps)
        result_eager = fn(*inps)
        self.assertEqual(result_compiled, result_eager)
    def test_tuple_inputs(self):
        # Runs apply_graph_deduplication by hand on a graph whose repeated
        # region starts with torch.split, after removing the split nodes from
        # the tracker's duplicate map, and pins the resulting graph text.
        with (
            torch._dynamo.config.patch("use_graph_deduplication", False),
            torch._dynamo.config.patch("track_nodes_for_deduplication", True),
        ):
            def inner(x, y):
                x0, x1 = torch.split(x, 5)
                return x0 + x1 + y
            def fn(x, y):
                # Four identical calls to `inner` with the same arguments.
                o1 = inner(x, y)
                o2 = inner(x, y)
                o3 = inner(x, y)
                o4 = inner(x, y)
                return o1.sum() + o2.sum() + o3.sum() + o4.sum()
            graph, tracker = extract_graph_and_tracker(
                fn, torch.rand(10, 10), torch.rand(5, 10)
            )
            # Minimal stand-in exposing the attributes and method set up below
            # (graph, region_tracker, nn_modules, install_subgraph).
            class MockOutputGraph:
                def __init__(self):
                    self.graph = graph
                    self.region_tracker = tracker
                    self.nn_modules = FakeRootModule({})
                def install_subgraph(self, name, subgraph):
                    return ""
            splits = [
                n
                for n in graph.nodes
                if n.op == "call_function" and n.target == torch.split
            ]
            # Drop the split nodes from the tracker's duplicate map before
            # deduplication; in the expected graph below they stay in the outer
            # graph and their getitem results are passed to invoke_subgraph.
            for split in splits:
                tracker.node_to_duplicates.pop(split)
            apply_graph_deduplication(MockOutputGraph())
            self.assertExpectedInline(
                graph,
                """\
 graph():
    %_unnamed : [num_users=4] = get_attr[target=]
    %l_x_ : torch.Tensor [num_users=4] = placeholder[target=L_x_]
    %l_y_ : torch.Tensor [num_users=4] = placeholder[target=L_y_]
    %split : [num_users=2] = call_function[target=torch.functional.split](args = (%l_x_, 5), kwargs = {})
    %x0 : [num_users=1] = call_function[target=operator.getitem](args = (%split, 0), kwargs = {})
    %x1 : [num_users=1] = call_function[target=operator.getitem](args = (%split, 1), kwargs = {})
    %split_1 : [num_users=2] = call_function[target=torch.functional.split](args = (%l_x_, 5), kwargs = {})
    %x0_1 : [num_users=1] = call_function[target=operator.getitem](args = (%split_1, 0), kwargs = {})
    %x1_1 : [num_users=1] = call_function[target=operator.getitem](args = (%split_1, 1), kwargs = {})
    %split_2 : [num_users=2] = call_function[target=torch.functional.split](args = (%l_x_, 5), kwargs = {})
    %x0_2 : [num_users=1] = call_function[target=operator.getitem](args = (%split_2, 0), kwargs = {})
    %x1_2 : [num_users=1] = call_function[target=operator.getitem](args = (%split_2, 1), kwargs = {})
    %split_3 : [num_users=2] = call_function[target=torch.functional.split](args = (%l_x_, 5), kwargs = {})
    %x0_3 : [num_users=1] = call_function[target=operator.getitem](args = (%split_3, 0), kwargs = {})
    %x1_3 : [num_users=1] = call_function[target=operator.getitem](args = (%split_3, 1), kwargs = {})
    %invoke_subgraph : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%_unnamed, , %x0, %x1, %l_y_), kwargs = {})
    %getitem_8 : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph, 0), kwargs = {})
    %sum_1 : [num_users=1] = call_method[target=sum](args = (%getitem_8,), kwargs = {})
    %invoke_subgraph_1 : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%_unnamed, , %x0_1, %x1_1, %l_y_), kwargs = {})
    %getitem_9 : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph_1, 0), kwargs = {})
    %sum_2 : [num_users=1] = call_method[target=sum](args = (%getitem_9,), kwargs = {})
    %add_8 : [num_users=1] = call_function[target=operator.add](args = (%sum_1, %sum_2), kwargs = {})
    %invoke_subgraph_2 : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%_unnamed, , %x0_2, %x1_2, %l_y_), kwargs = {})
    %getitem_10 : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph_2, 0), kwargs = {})
    %sum_3 : [num_users=1] = call_method[target=sum](args = (%getitem_10,), kwargs = {})
    %add_9 : [num_users=1] = call_function[target=operator.add](args = (%add_8, %sum_3), kwargs = {})
    %invoke_subgraph_3 : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%_unnamed, , %x0_3, %x1_3, %l_y_), kwargs = {})
    %getitem_11 : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph_3, 0), kwargs = {})
    %sum_4 : [num_users=1] = call_method[target=sum](args = (%getitem_11,), kwargs = {})
    %add_10 : [num_users=1] = call_function[target=operator.add](args = (%add_9, %sum_4), kwargs = {})
    return (add_10,)""",
            )
    def test_param_transfer_to_submodule(self):
        def inner_fn(x, y):
            return x + y + y + x
--- a/test/dynamo/test_graph_region_tracker.py
+++ b/test/dynamo/test_graph_region_tracker.py
@ -9,6 +9,28 @@ from torch._dynamo.testing import extract_graph_and_tracker
 from torch.utils._pytree import tree_map
 def get_nodes_by_name(graph, names):
    """Return the nodes of ``graph`` whose ``name`` is in ``names``, in graph order."""
    return [candidate for candidate in graph.nodes if candidate.name in names]
 # Module-level counter; bumped once per track_same_nodes call.
 unique_ind = 0
 def track_same_nodes(names, graph, region_tracker):
    """Track every node of ``graph`` named in ``names`` as one code location.

    Each call bumps the module-level ``unique_ind`` and registers all matching
    nodes with ``region_tracker.track_node`` under the same ("x", unique_ind)
    pair, as if they had been created at the same code location.
    """
    global unique_ind
    unique_ind += 1
    for matched in get_nodes_by_name(graph, names):
        region_tracker.track_node("x", unique_ind, matched)
 class GraphRegionTrackerTests(TestCase):
    def setUp(self):
        self.exit_stack = contextlib.ExitStack()
--- a/test/dynamo/test_guard_manager.py
+++ b/test/dynamo/test_guard_manager.py
@ -1205,45 +1205,6 @@ class TagSafetyChecks(RecursiveDictTagTests):
        with install_guard_manager_testing_hook(hook):
            opt_fn(torch.randn(4, 4))
    def test_nn_module_tag_overridden_getattr_safe(self):
        # Compiles a function that calls an nn.Module subclass with a custom
        # __getattr__, then uses the guard-manager testing hook to assert that
        # the guard manager for `baz` reports tag-safe and tag-safe-root.
        class Baz(torch.nn.Module, metaclass=abc.ABCMeta):
            def __init__(self):
                super().__init__()
                self.norm = 2
            def __getattr__(self, key):
                # Serve "a" directly; defer every other lookup to the parent class.
                if key == "a":
                    return 5
                return super().__getattr__(key)
            def forward(self, x):
                # `self.a` is resolved by the overridden __getattr__ above.
                return x + self.a + self.norm
        baz = Baz()
        def fn(x):
            x = x + baz(x)
            return x
        # Works both when the test is imported as part of a package and when
        # it is run with `utils` importable as a top-level module.
        try:
            from .utils import install_guard_manager_testing_hook
        except ImportError:
            from utils import install_guard_manager_testing_hook
        def hook(guard_wrapper, f_locals, builder):
            from torch._dynamo.source import LocalSource
            baz_source = LocalSource("baz")
            # Check tagness of baz
            baz_mgr = builder.get_guard_manager_from_source(baz_source)
            self.assertTrue(baz_mgr.is_tag_safe())
            self.assertTrue(baz_mgr.is_tag_safe_root())
        opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
        # The assertions run inside `hook` while it is installed.
        with install_guard_manager_testing_hook(hook):
            opt_fn(torch.randn(4, 4))
 class RecursiveDictGuardTests(RecursiveDictTagTests):
    def test_disabling(self):
--- a/test/dynamo/test_guard_serialization.py
+++ b/test/dynamo/test_guard_serialization.py
@ -261,7 +261,6 @@ class TestGuardSerialization(torch._inductor.test_case.TestCase):
    def _test_serialization(self, guard_type, fn, *args, **kwargs):
        # kwargs might contain a callable that generates kwargs
        torch._dynamo.reset()
        kwarg_gen_fn = kwargs.get("_gen_fn", None)
        if kwarg_gen_fn is not None:
            kwargs = kwarg_gen_fn()
@ -347,7 +346,7 @@ class TestGuardSerialization(torch._inductor.test_case.TestCase):
                    self._frame_state.f_code,
                    tracer.output,
                    guard_filter_fn=guard_filter_fn,
-                    save_guards=True,
+                    guards_serialization_mode="save",
                )
                guards_state = check_fn_manager.guards_state
                self._cached_guards_state = guards_state
@ -358,6 +357,7 @@ class TestGuardSerialization(torch._inductor.test_case.TestCase):
                check_fn_manager = CheckFunctionManager(
                    self._frame_state.f_code,
                    guards_state.output_graph,
                    guards_serialization_mode="load",
                    shape_code_parts=guards_state.shape_code_parts,
                    runtime_global_scope=self._frame_state.f_globals,
                )
@ -1180,6 +1180,7 @@ class TestGuardSerialization(torch._inductor.test_case.TestCase):
            check_fn_manager = CheckFunctionManager(
                self._cached_f_code,
                guards_state.output_graph,
                guards_serialization_mode="load",
                shape_code_parts=guards_state.shape_code_parts,
            )
            loaded = check_fn_manager.guard_manager
--- a/test/dynamo/test_misc.py
+++ b/test/dynamo/test_misc.py
@ -1705,17 +1705,16 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
            if hasattr(packed, "b"):
                b = packed.b + 1
            c = packed[2]
-            d = len(packed._fields)
+            return a + b + c
            return a + b + c + d
        v1 = torch.Tensor([1])
        v2 = torch.Tensor([2])
        v3 = torch.Tensor([3])
        cnts = torch._dynamo.testing.CompileCounter()
        opt_fn = torch.compile(fn, backend=cnts)
-        self.assertEqual(opt_fn(MyTuple(v1, v2, v3))[0], 10)
+        self.assertEqual(opt_fn(MyTuple(v1, v2, v3))[0], 7)
        self.assertEqual(cnts.frame_count, 1)
-        self.assertEqual(cnts.op_count, 4)
+        self.assertEqual(cnts.op_count, 3)
    def test_namedtuple3(self):
        def fn(x, packed):
@ -1962,31 +1961,6 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
        self.assertEqual(exp, act)
    def test_class_binop(self):
        # Checks that `a + b` on a user class defining __add__ compiles into a
        # single frame with one op, and that `a - b` (no __sub__ defined)
        # raises Unsupported under fullgraph=True.
        class Foo:
            def __init__(self, x):
                self.x = x
            def __add__(self, other):
                return Foo(self.x + other.x)
        def fn(a, b):
            return a + b
        x = torch.randn(2)
        a, b = Foo(x), Foo(x + 1)
        cnts = torch._dynamo.testing.CompileCounter()
        opt_fn = torch.compile(fn, backend=cnts)
        # x + (x + 1) == 2 * x + 1
        self.assertEqual(opt_fn(a, b).x, 2 * x + 1)
        self.assertEqual(cnts.frame_count, 1)
        self.assertEqual(cnts.op_count, 1)
        # Rebind `fn` to an operator that Foo does not implement.
        def fn(a, b):
            return a - b
        opt_fn = torch.compile(fn, backend=cnts, fullgraph=True)
        self.assertRaises(torch._dynamo.exc.Unsupported, opt_fn, a, b)
    def test_user_getattr1(self):
        class MyConfig(dict):
            def __getattr__(self, name):
@ -8572,6 +8546,7 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
            guard_manager = torch._dynamo.guards.CheckFunctionManager(
                foo.__code__,
                guards_state.output_graph,
                guards_serialization_mode="load",
                shape_code_parts=guards_state.shape_code_parts,
                runtime_global_scope=new_globals,
            ).guard_manager
--- a/test/dynamo/test_package.py
+++ b/test/dynamo/test_package.py
@ -16,7 +16,7 @@ from torch._dynamo.package import CompilePackage, DiskDynamoStore, DynamoCache
 from torch._dynamo.precompile_context import PrecompileContext
 from torch._dynamo.testing import reduce_to_scalar_loss
 from torch._functorch import config as functorch_config
-from torch._inductor.mock_cache import global_stats, PatchCaches
+from torch._inductor.mock_cache import global_stats, PatchCaches, Stats
 from torch._inductor.runtime.runtime_utils import cache_dir
 from torch.testing._internal.common_utils import (
    instantiate_parametrized_tests,
@ -452,33 +452,27 @@ def add(x, y):
        def fn(x, y):
            return x.sin() + y
-        arg1 = torch.randn(32, 32, device=device)
+        arg1 = torch.randn(3, 3, device=device)
-        arg2 = torch.randn(32, 32, device=device)
+        arg2 = torch.randn(3, 3, device=device)
        expected = fn(arg1, arg2).clone()
        with PatchCaches():
            compiled_fn1 = torch.compile(fn, mode="max-autotune")
            result = compiled_fn1(arg1, arg2).clone()
            self.assertEqual(expected, result)
-            self.assertEqual(global_stats.autotune_local.num_get_miss, 1)
+            self.assertEqual(global_stats.autotune_local, Stats(1, 0, 1))
            DynamoCache.clear()
            total_frames = torch._dynamo.convert_frame.FRAME_COUNTER
            self._save_and_reload(
                expected_backends=1, expected_dynamo=1, expected_autotune=1
            )
            # During save, we check the autotune cache another time, and now it should hit
            self.assertEqual(global_stats.autotune_local.num_get_hit, 1)
            compiled_fn1 = torch.compile(fn, mode="max-autotune")
            with torch.compiler.set_stance("fail_on_recompile"):
                result1 = compiled_fn1(arg1, arg2).clone()
                self.assertEqual(expected, result1)
            self.assertEqual(torch._dynamo.convert_frame.FRAME_COUNTER, total_frames)
-            # No new hits or misses
+            self.assertEqual(global_stats.autotune_local, Stats(2, 1, 1))
            # Unfortunately, we don't *actually* know how many puts there will be, because
            # it's possible the best autotune config was found by coordesc.
            self.assertEqual(global_stats.autotune_local.num_get_hit, 1)
            self.assertEqual(global_stats.autotune_local.num_get_miss, 1)
    @parametrize("device", ("cpu", "cuda", "xpu"))
    @torch._dynamo.config.patch(caching_precompile=True)
--- a/test/dynamo/test_pgo.py
+++ b/test/dynamo/test_pgo.py
@ -362,74 +362,6 @@ def run(cnt):
        write_load_and_run(path2)
        self.assertEqual(cnts.frame_count, 1)
    @torch._dynamo.config.patch(
        automatic_dynamic_remote_pgo=True, automatic_dynamic_local_pgo=False
    )
    def test_sticky_pgo_read_write(self):
        # Exercises the `pgo_extra_write_key` / `pgo_extra_read_key` compiler
        # config options under mock caches, using the compile frame count to
        # tell whether a run started with dynamic shapes or had to recompile.
        cnts = CompileCounter()
        @torch.compile(backend=cnts, fullgraph=True)
        def f(x, y):
            return x * 2, y * 3
        def t(x, y):
            # Shorthand for a random tensor of shape (x, y).
            return torch.randn(x, y)
        with mock_cache.PatchCaches():
            # we pretend to disable the default remote cache, by keying different job ids per run
            with torch.compiler.config.patch(job_id="a"):
                f(t(2, 2), t(2, 2))
                f(t(2, 4), t(2, 2))
                self.assertEqual(cnts.frame_count, 2)
            # first test we're not reading from local/default remote cache;
            # we should recompile when x wobbles
            self.reset()
            cnts.clear()
            with torch.compiler.config.patch(
                job_id="b", pgo_extra_write_key="sticky_0"
            ):
                f(t(2, 2), t(2, 2))
                f(t(2, 4), t(2, 2))
                self.assertEqual(cnts.frame_count, 2)
            # now with the extra sticky_0 key, we start with dynamic x;
            # no recompiles
            self.reset()
            cnts.clear()
            with torch.compiler.config.patch(job_id="c", pgo_extra_read_key="sticky_0"):
                f(t(2, 2), t(2, 2))
                f(t(2, 4), t(2, 2))
                self.assertEqual(cnts.frame_count, 1)
            # last test: wobble y and write to sticky_1 key
            self.reset()
            cnts.clear()
            with torch.compiler.config.patch(
                job_id="d", pgo_extra_write_key="sticky_1"
            ):
                f(t(2, 2), t(2, 2))
                f(t(2, 2), t(2, 4))
                f(t(2, 2), t(4, 4))
                self.assertEqual(cnts.frame_count, 3)
            # start using default remote PGO (no job_id override), create run
            # that wobbles x; the frame count is not asserted for this run
            self.reset()
            cnts.clear()
            f(t(2, 2), t(2, 2))
            f(t(2, 4), t(2, 2))
            f(t(4, 2), t(2, 2))
            # with default remote (dynamic x) + extra remote (dynamic y),
            # we should be able to wobble x & y with no recompiles.
            self.reset()
            cnts.clear()
            with torch.compiler.config.patch(pgo_extra_read_key="sticky_1"):
                f(t(2, 2), t(2, 2))
                f(t(2, 4), t(4, 2))
                f(t(4, 2), t(2, 4))
                self.assertEqual(cnts.frame_count, 1)
 if __name__ == "__main__":
    from torch._dynamo.test_case import run_tests
--- a/test/dynamo/test_repros.py
+++ b/test/dynamo/test_repros.py
@ -66,7 +66,6 @@ from torch.testing._internal.common_utils import (
    parametrize,
    serialTest,
    skipIfHpu,
    skipIfRocm,
    skipIfWindows,
    TEST_WITH_ROCM,
 )
@ -7406,7 +7405,6 @@ class ReproTestsDevice(torch._dynamo.test_case.TestCase):
            out = f_compiled(x, s0, s1, s2)
            self.assertEqual(out_ref, out)
    @skipIfRocm
    @unittest.skipIf(not PLATFORM_SUPPORTS_FP8, "requires gpu with fp8 support")
    @requires_cuda
    def test_partitioner_saves_weights_for_bw(self):
--- a/test/dynamo/test_structured_trace.py
+++ b/test/dynamo/test_structured_trace.py
@ -28,6 +28,7 @@ from torch.testing._internal.triton_utils import requires_cuda_and_triton
 if torch.distributed.is_available():
    from torch.testing._internal.distributed.fake_pg import FakeStore
 HAS_TLPARSE = shutil.which("tlparse") is not None
 requires_tlparse = unittest.skipUnless(HAS_TLPARSE, "requires tlparse")
 requires_distributed = functools.partial(
@ -1197,13 +1198,13 @@ def forward(self, x_1: "f32[2][1]cpu"):
    @contextmanager
    def _setup_runtime_estimates_capture(self):
-        """Helper to turn on and capture the combined 'inductor_runtime_and_tensor_meta' structured trace."""
+        """Helper to turn on and capture the 'inductor_tlparse_runtime' structured trace."""
        payload_buffer = io.StringIO()
        payload_handler = logging.StreamHandler(payload_buffer)
        payload_handler.setLevel(logging.DEBUG)
        payload_handler.setFormatter(StructuredTracePayloadFormatter())
        payload_handler.addFilter(
-            StructuredTraceTestingFilter("inductor_runtime_and_tensor_meta")
+            StructuredTraceTestingFilter("inductor_tlparse_runtime")
        )
        trace_log.addHandler(payload_handler)
        try:
@ -1244,10 +1245,8 @@ def forward(self, x_1: "f32[2][1]cpu"):
                compiled = torch.compile(mod, backend="inductor")
                compiled(torch.randn(4, 4, device="cuda"))
-                # Verify runtime + tensor meta artifact was logged
+                # Verify runtime estimates artifact was logged
-                self.assertIn(
+                self.assertIn('"inductor_tlparse_runtime"', self.buffer.getvalue())
                    '"inductor_runtime_and_tensor_meta"', self.buffer.getvalue()
                )
                payload_content = payload_buffer.getvalue().strip()
                if payload_content:
@ -1311,10 +1310,8 @@ def forward(self, x_1: "f32[2][1]cpu"):
                compiled = torch.compile(mod, backend="inductor")
                compiled(torch.randn(4, 4, device="cuda"))
-                # Verify artifact was logged
+                # Verify runtime estimates artifact was logged
-                self.assertIn(
+                self.assertIn('"inductor_tlparse_runtime"', self.buffer.getvalue())
                    '"inductor_runtime_and_tensor_meta"', self.buffer.getvalue()
                )
                payload_content = payload_buffer.getvalue().strip()
                if payload_content:
@ -1336,145 +1333,6 @@ def forward(self, x_1: "f32[2][1]cpu"):
        finally:
            dist.destroy_process_group()
    @requires_tlparse
    @requires_distributed()
    @requires_cuda_and_triton
    @torch._inductor.config.patch("fx_graph_cache", False)
    @torch._inductor.config.patch("log_tlparse", True)
    def test_tensor_metadata_logging_multiple_ops(self):
        # Compiles a module mixing a linear+relu with a functional all_reduce
        # under a fake process group, then checks which op "type" values appear
        # in the captured structured-trace payload.
        import torch.distributed as dist
        store = FakeStore()
        dist.init_process_group(backend="fake", rank=0, world_size=2, store=store)
        class Mixed(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.linear = torch.nn.Linear(4, 4)
            def forward(self, x):
                y = torch.relu(self.linear(x))
                y = torch.ops._c10d_functional.all_reduce.default(y, "sum", "0")
                y = torch.ops._c10d_functional.wait_tensor.default(y)
                return y + 1
        try:
            with self._setup_runtime_estimates_capture() as payload_buffer:
                torch._dynamo.reset()
                mod = Mixed().cuda()
                compiled = torch.compile(mod, backend="inductor")
                compiled(torch.randn(4, 4, device="cuda"))
                payload = payload_buffer.getvalue().strip()
                # NOTE(review): when no payload was captured, the type check
                # below is skipped and only assertParses() runs.
                if payload:
                    data = json.loads(payload)
                    # Distinct op types, sorted for a stable comparison.
                    types = sorted({op.get("type") for op in data.get("ops", [])})
                    self.assertExpectedInline(
                        str(types), """['collective', 'compute']"""
                    )
                self.assertParses()
        finally:
            # Always tear down the process group created above.
            dist.destroy_process_group()
    @requires_tlparse
    @torch._inductor.config.patch("log_tlparse", True)
    def test_tensor_metadata_logging(self):
        """Emit unified runtime+tensor-metadata artifact and assert a stable simplified JSON inline."""
        with self._setup_runtime_estimates_capture() as payload_buffer:
            def f(x):
                y = x.transpose(0, 1)
                z = y.mean(dim=0)
                w = z.to(torch.float16)
                return w
            compiled = torch.compile(f, backend="inductor", fullgraph=True)
            compiled(torch.ones(2, 3))
            # Verify artifact was logged
            self.assertIn('"inductor_runtime_and_tensor_meta"', self.buffer.getvalue())
            payload = payload_buffer.getvalue().strip()
            # NOTE(review): when no payload was captured, the inline check is
            # skipped and only assertParses() runs.
            if payload:
                data = json.loads(payload)
                ops = data.get("ops", [])
                # Reduce each op to its type plus shape/stride/dtype per
                # output; ops without outputs are dropped.
                simplified_ops = []
                for op in ops:
                    outs = [
                        {
                            "shape": out.get("shape", []),
                            "stride": out.get("stride", []),
                            "dtype": out.get("dtype", None),
                        }
                        for out in op.get("outputs", [])
                    ]
                    if outs:
                        simplified_ops.append(
                            {
                                "type": op.get("type", ""),
                                "outputs": outs,
                            }
                        )
                # Only the last simplified op is compared.
                self.assertExpectedInline(
                    {"ops": simplified_ops[-1:]} if simplified_ops else {"ops": []},
                    """{'ops': [{'type': 'compute', 'outputs': [{'shape': [2], 'stride': [1], 'dtype': 'float16'}]}]}""",
                )
            self.assertParses()
    @requires_tlparse
    @torch._inductor.config.patch("log_tlparse", True)
    def test_tensor_metadata_logging_dynamic_shapes(self):
        """Same as test_tensor_metadata_logging, but with dynamic shapes enabled to cover to_size_hints."""
        with self._setup_runtime_estimates_capture() as payload_buffer:
            def f(x):
                y = x.transpose(0, 1)
                z = y.mean(dim=0)
                w = z.to(torch.float16)
                return w
            # Compiled with dynamic=True (and without fullgraph=True).
            compiled = torch.compile(f, backend="inductor", dynamic=True)
            compiled(torch.ones(2, 3))
            # Verify artifact was logged
            self.assertIn('"inductor_runtime_and_tensor_meta"', self.buffer.getvalue())
            payload = payload_buffer.getvalue().strip()
            # NOTE(review): when no payload was captured, the inline check is
            # skipped and only assertParses() runs.
            if payload:
                data = json.loads(payload)
                ops = data.get("ops", [])
                # Reduce each op to its type plus shape/stride/dtype per
                # output; ops without outputs are dropped.
                simplified_ops = []
                for op in ops:
                    outs = [
                        {
                            "shape": out.get("shape", []),
                            "stride": out.get("stride", []),
                            "dtype": out.get("dtype", None),
                        }
                        for out in op.get("outputs", [])
                    ]
                    if outs:
                        simplified_ops.append(
                            {
                                "type": op.get("type", ""),
                                "outputs": outs,
                            }
                        )
                # Only the last simplified op is compared; here it is expected
                # to carry two outputs (float32 and float16).
                self.assertExpectedInline(
                    {"ops": simplified_ops[-1:]} if simplified_ops else {"ops": []},
                    (
                        "{'ops': [{'type': 'compute', 'outputs': ["
                        "{'shape': [2], 'stride': [1], 'dtype': 'float32'}, "
                        "{'shape': [2], 'stride': [1], 'dtype': 'float16'}]}]}"
                    ),
                )
            self.assertParses()
 if __name__ == "__main__":
    from torch._dynamo.test_case import run_tests
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestChainMap.test_iter_not_calling_getitem_on_maps
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestChainMap.test_iter_not_calling_getitem_on_maps
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestChainMap.test_missing
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestChainMap.test_missing
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestChainMap.test_new_child
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestChainMap.test_new_child
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestChainMap.test_order_preservation
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestChainMap.test_order_preservation
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestChainMap.test_ordering
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestChainMap.test_ordering
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_Mapping
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_Mapping
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_MutableMapping
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_MutableMapping
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_MutableMapping_subclass
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_MutableMapping_subclass
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_MutableSequence_mixins
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_MutableSequence_mixins
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_MutableSet
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_MutableSet
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_Sequence_mixins
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_Sequence_mixins
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_Set
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_Set
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_Set_from_iterable
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_Set_from_iterable
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_Set_interoperability_with_real_sets
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_Set_interoperability_with_real_sets
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_arithmetic_Set
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_arithmetic_Set
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_equality_Set
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_equality_Set
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_isdisjoint_Set
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_isdisjoint_Set
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_issue16373
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_issue16373
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_issue26915
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_issue26915
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_issue8750
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_issue8750
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_issue_4920
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_issue_4920
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_issue_5647
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCollectionABCs.test_issue_5647
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestCounter.test_copy_subclass
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestCounter.test_copy_subclass
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestNamedTuple.test_keyword_only_arguments
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestNamedTuple.test_keyword_only_arguments
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestNamedTuple.test_namedtuple_subclass_issue_24931
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestNamedTuple.test_namedtuple_subclass_issue_24931
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestNamedTuple.test_odd_sizes
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestNamedTuple.test_odd_sizes
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestOneTrickPonyABCs.test_Callable
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestOneTrickPonyABCs.test_Callable
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestOneTrickPonyABCs.test_Generator
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestOneTrickPonyABCs.test_Generator
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestOneTrickPonyABCs.test_direct_subclassing
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestOneTrickPonyABCs.test_direct_subclassing
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestOneTrickPonyABCs.test_registration
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestOneTrickPonyABCs.test_registration
--- a/test/dynamo_expected_failures/CPython313-test_collections-TestUserObjects.test_dict_missing
+++ b/test/dynamo_expected_failures/CPython313-test_collections-TestUserObjects.test_dict_missing
--- a/test/dynamo_expected_failures/CPython313-test_iter-TestCase.test_mutating_seq_class_exhausted_iter
+++ b/test/dynamo_expected_failures/CPython313-test_iter-TestCase.test_mutating_seq_class_exhausted_iter
--- a/test/dynamo_expected_failures/CPython313-test_itertools-TestBasicOps.test_filterfalse
+++ b/test/dynamo_expected_failures/CPython313-test_itertools-TestBasicOps.test_filterfalse
--- a/test/dynamo_expected_failures/CPython313-test_itertools-TestExamples.test_filterfalse
+++ b/test/dynamo_expected_failures/CPython313-test_itertools-TestExamples.test_filterfalse
--- a/test/dynamo_expected_failures/CPython313-test_itertools-TestVariousIteratorArgs.test_product
+++ b/test/dynamo_expected_failures/CPython313-test_itertools-TestVariousIteratorArgs.test_product
--- a/test/dynamo_expected_failures/CPython313-test_userdict-UserDictTest.test_keys
+++ b/test/dynamo_expected_failures/CPython313-test_userdict-UserDictTest.test_keys
--- a/test/dynamo_expected_failures/CPython313-test_userdict-UserDictTest.test_read
+++ b/test/dynamo_expected_failures/CPython313-test_userdict-UserDictTest.test_read
--- a/test/dynamo_expected_failures/CPython313-test_userlist-UserListTest.test_extendedslicing
+++ b/test/dynamo_expected_failures/CPython313-test_userlist-UserListTest.test_extendedslicing
--- a/Show More
+++ b/Show More
`@ -1 +1 @@`
	`02351a683668dd65bc82343e55245e308eb97b4e`	`f92ceca80df7a36194468665d62b0f791b1826c5`
`@ -1 +1 @@`
	`0fc8fa751a4321d6531467537ff77cf3c1c70260`	`0ca2393b47e72c4424a49aa3b32c7c5d0e378a72`
`@ -1 +1 @@`
	`a1c6ee92c85e8b0955c20892ed68f032a6015c09`	`095faec1e7b6cc47220181e74ae9cde2605f9b00`