Compare commits


1 Commit

Author SHA1 Message Date
57f9e88fbc test 2025-08-19 17:29:55 -07:00
188 changed files with 2626 additions and 5855 deletions

View File

@@ -92,7 +92,6 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libnccl.so.2",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
"/usr/local/cuda/lib64/libcudnn_graph.so.9",
@@ -210,6 +209,8 @@ if __name__ == "__main__":
# MAX_JOBS=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "
# nvshmem is broken for aarch64 see https://github.com/pytorch/pytorch/issues/160425
build_vars += "USE_NVSHMEM=OFF "
override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
desired_cuda = os.getenv("DESIRED_CUDA")

View File

@@ -64,10 +64,6 @@ FROM cuda as cuda12.9
RUN bash ./install_cuda.sh 12.9
ENV DESIRED_CUDA=12.9
FROM cuda as cuda13.0
RUN bash ./install_cuda.sh 13.0
ENV DESIRED_CUDA=13.0
FROM ${ROCM_IMAGE} as rocm
ENV PYTORCH_ROCM_ARCH="gfx900;gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
ADD ./common/install_mkl.sh install_mkl.sh
@@ -83,7 +79,6 @@ FROM base as all_cuda
COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6
COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8
COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9
COPY --from=cuda13.0 /usr/local/cuda-13.0 /usr/local/cuda-13.0
# Final step
FROM ${BASE_TARGET} as final

View File

@@ -168,7 +168,7 @@ case "$tag" in
TRITON=yes
;;
pytorch-linux-jammy-py3-clang12-onnx)
ANACONDA_PYTHON_VERSION=3.10
ANACONDA_PYTHON_VERSION=3.9
CLANG_VERSION=12
VISION=yes
ONNX=yes
@@ -288,6 +288,7 @@ case "$tag" in
GCC_VERSION=11
ACL=yes
VISION=yes
CONDA_CMAKE=yes
OPENBLAS=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific
@@ -298,6 +299,7 @@ case "$tag" in
GCC_VERSION=11
ACL=yes
VISION=yes
CONDA_CMAKE=yes
OPENBLAS=yes
# snadampal: skipping llvm src build install because the current version
# from pytorch/llvm:9.0.1 is x86 specific

View File

@@ -1,2 +0,0 @@
transformers==4.54.0
soxr==0.5.0

View File

@@ -0,0 +1 @@
v4.54.0

View File

@@ -1 +0,0 @@
v2.27.7-1

View File

@@ -10,7 +10,7 @@ else
arch_path='sbsa'
fi
NVSHMEM_VERSION=3.3.20
NVSHMEM_VERSION=3.3.9
function install_cuda {
version=$1
@@ -62,16 +62,14 @@ function install_nvshmem {
mkdir -p "${tmpdir}" && cd "${tmpdir}"
# nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
# This pattern is a lie, as it is not consistent across versions; for 3.3.9 it was cuda_ver-arch-nvshmem-ver
filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
suffix=".tar.xz"
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}${suffix}"
filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"
# download, unpack, install
wget -q "${url}"
tar xf "${filename}${suffix}"
cp -a "${filename}/include/"* /usr/local/cuda/include/
cp -a "${filename}/lib/"* /usr/local/cuda/lib64/
tar xf "${filename}.tar.gz"
cp -a "libnvshmem/include/"* /usr/local/cuda/include/
cp -a "libnvshmem/lib/"* /usr/local/cuda/lib64/
# cleanup
cd ..
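
For reference, the download-and-install flow as it reads after this hunk, in one piece; a minimal sketch assuming arch_path and dl_arch are resolved earlier in the script (only arch_path is visible in this hunk):

    nvshmem_version=3.3.9
    cuda_major_version=12
    filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
    url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"
    wget -q "${url}"
    tar xf "${filename}.tar.gz"
    # the 3.3.9 tarball unpacks into a plain libnvshmem/ directory
    cp -a "libnvshmem/include/"* /usr/local/cuda/include/
    cp -a "libnvshmem/lib/"* /usr/local/cuda/lib64/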
@@ -128,6 +126,74 @@ function install_129 {
ldconfig
}
function prune_124 {
echo "Pruning CUDA 12.4"
#####################################################################################
# CUDA 12.4 prune static libs
#####################################################################################
export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
if [[ -n "$OVERRIDE_GENCODE" ]]; then
export GENCODE=$OVERRIDE_GENCODE
fi
if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
fi
# all CUDA libs except CuDNN and CuBLAS
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
| xargs -I {} bash -c \
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
# prune CuDNN and CuBLAS
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
#####################################################################################
# CUDA 12.4 prune visual tools
#####################################################################################
export CUDA_BASE="/usr/local/cuda-12.4/"
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
}
function prune_126 {
echo "Pruning CUDA 12.6"
#####################################################################################
# CUDA 12.6 prune static libs
#####################################################################################
export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
if [[ -n "$OVERRIDE_GENCODE" ]]; then
export GENCODE=$OVERRIDE_GENCODE
fi
if [[ -n "$OVERRIDE_GENCODE_CUDNN" ]]; then
export GENCODE_CUDNN=$OVERRIDE_GENCODE_CUDNN
fi
# all CUDA libs except CuDNN and CuBLAS
ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
| xargs -I {} bash -c \
"echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
# prune CuDNN and CuBLAS
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
$NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
#####################################################################################
# CUDA 12.6 prune visual tools
#####################################################################################
export CUDA_BASE="/usr/local/cuda-12.6/"
rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.3.2 $CUDA_BASE/nsight-systems-2024.5.1/
}
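
Both prune_* functions shrink the toolkit by stripping device code for architectures outside GENCODE from every static library. A single hedged example of the underlying command, using the CUDA 12.6 paths from prune_126 and libcufft_static.a as an illustrative target (subset of gencode flags shown for brevity):

    export NVPRUNE="/usr/local/cuda-12.6/bin/nvprune"
    export CUDA_LIB_DIR="/usr/local/cuda-12.6/lib64"
    # prune in place: keep only the listed gencode targets
    $NVPRUNE -gencode arch=compute_80,code=sm_80 -gencode arch=compute_90,code=sm_90 \
        $CUDA_LIB_DIR/libcufft_static.a -o $CUDA_LIB_DIR/libcufft_static.a

The ls | grep pipeline above applies exactly this command to every .a except culibos, cudart, cudnn, cublas, and metis.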
function install_128 {
CUDNN_VERSION=9.8.0.87
echo "Installing CUDA 12.8.1 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
@@ -146,39 +212,18 @@ function install_128 {
ldconfig
}
function install_130 {
CUDNN_VERSION=9.12.0.46
NVSHMEM_VERSION=3.3.20
echo "Installing CUDA 13.0 and cuDNN ${CUDNN_VERSION} and NVSHMEM and NCCL and cuSparseLt-0.7.1"
# install CUDA 13.0 in the same container
install_cuda 13.0.0 cuda_13.0.0_580.65.06_linux
# cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
install_cudnn 13 $CUDNN_VERSION
install_nvshmem 13 $NVSHMEM_VERSION
CUDA_VERSION=13.0 bash install_nccl.sh
CUDA_VERSION=13.0 bash install_cusparselt.sh
ldconfig
}
# idiomatic parameter and option handling in sh
while test $# -gt 0
do
case "$1" in
12.4) install_124;
12.4) install_124; prune_124
;;
12.6|12.6.*) install_126;
12.6|12.6.*) install_126; prune_126
;;
12.8|12.8.*) install_128;
;;
12.9|12.9.*) install_129;
;;
13.0|13.0.*) install_130;
;;
*) echo "bad argument $1"; exit 1
;;
esac
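
Spelled out, the dispatch above is the classic sh argument loop the comment refers to; the shift that advances to the next argument is assumed here, since the hunk ends at esac:

    while test $# -gt 0
    do
        case "$1" in
            12.8|12.8.*) install_128
                ;;
            *) echo "bad argument $1"; exit 1
                ;;
        esac
        shift
    done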

View File

@@ -5,15 +5,7 @@ set -ex
# cuSPARSELt license: https://docs.nvidia.com/cuda/cusparselt/license.html
mkdir tmp_cusparselt && cd tmp_cusparselt
if [[ ${CUDA_VERSION:0:4} =~ "13" ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then
arch_path='x86_64'
fi
CUSPARSELT_NAME="libcusparse_lt-linux-${arch_path}-0.8.0.4_cuda13-archive"
curl --retry 3 -OLs https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-${arch_path}/${CUSPARSELT_NAME}.tar.xz
elif [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
if [[ ${CUDA_VERSION:0:4} =~ ^12\.[5-9]$ ]]; then
arch_path='sbsa'
export TARGETARCH=${TARGETARCH:-$(uname -m)}
if [ ${TARGETARCH} = 'amd64' ] || [ "${TARGETARCH}" = 'x86_64' ]; then

View File

@@ -5,7 +5,9 @@ set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
function install_huggingface() {
pip_install -r huggingface-requirements.txt
local commit
commit=$(get_pinned_commit huggingface)
pip_install "git+https://github.com/huggingface/transformers@${commit}"
}
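
get_pinned_commit comes from the common_utils.sh sourced above; a hypothetical stand-in to show the idea, assuming it simply prints the matching pin file (the Dockerfiles later in this diff copy ci_commit_pins/huggingface.txt into the build context):

    get_pinned_commit() {
        cat "${1}.txt"    # hypothetical: the real helper lives in common_utils.sh
    }
    commit=$(get_pinned_commit huggingface)    # yields v4.54.0 per the pin added above
    pip install "git+https://github.com/huggingface/transformers@${commit}"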
function install_timm() {
@@ -24,6 +26,9 @@ function install_torchbench() {
python install.py --continue_on_fail
# soxr comes from https://github.com/huggingface/transformers/pull/39429
pip install transformers==4.54.0 soxr==0.5.0
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd

View File

@@ -7,8 +7,6 @@ if [[ ${CUDA_VERSION:0:2} == "11" ]]; then
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu11.txt)
elif [[ ${CUDA_VERSION:0:2} == "12" ]]; then
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu12.txt)
elif [[ ${CUDA_VERSION:0:2} == "13" ]]; then
NCCL_VERSION=$(cat ci_commit_pins/nccl-cu13.txt)
else
echo "Unexpected CUDA_VERSION ${CUDA_VERSION}"
exit 1

View File

@@ -96,11 +96,11 @@ ARG ANACONDA_PYTHON_VERSION
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
# (optional) Install non-default Ninja version
ARG NINJA_VERSION

View File

@@ -56,10 +56,10 @@ RUN rm install_openssl.sh
ARG INDUCTOR_BENCHMARKS
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
# Install XPU Dependencies
ARG XPU_VERSION

View File

@@ -96,11 +96,11 @@ RUN rm install_openssl.sh
ARG INDUCTOR_BENCHMARKS
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
ARG TRITON
ARG TRITON_CPU

View File

@@ -174,15 +174,17 @@ checkout_install_torchbench() {
# to install and test other models
python install.py --continue_on_fail
fi
popd
pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt
# soxr comes from https://github.com/huggingface/transformers/pull/39429
pip install transformers==4.54.0 soxr==0.5.0
# https://github.com/pytorch/pytorch/issues/160689 to remove torchao because
# its current version 0.12.0 doesn't work with transformers 4.54.0
pip uninstall -y torchao
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
torchbench_setup_macos() {

View File

@@ -1 +1 @@
02351a683668dd65bc82343e55245e308eb97b4e
f92ceca80df7a36194468665d62b0f791b1826c5

View File

@@ -1 +1 @@
0fc8fa751a4321d6531467537ff77cf3c1c70260
0ca2393b47e72c4424a49aa3b32c7c5d0e378a72

View File

@@ -1 +1 @@
a1c6ee92c85e8b0955c20892ed68f032a6015c09
095faec1e7b6cc47220181e74ae9cde2605f9b00

View File

@@ -1,20 +0,0 @@
version: 2
updates:
# Update to the latest transformers version with dependabot
- package-ecosystem: "pip"
directory: "/.ci/docker/ci_commit_pins"
schedule:
interval: "daily"
target-branch: "main"
allow:
- dependency-name: "transformers"
commit-message:
prefix: "[Dependabot] Update"
include: "scope"
labels:
- "dependencies"
- "open source"
- "python"
- "topic: not user facing"
- "module: ci"
- "module: inductor"

View File

@@ -27,7 +27,6 @@ ciflow_push_tags:
- ciflow/trunk
- ciflow/unstable
- ciflow/xpu
- ciflow/vllm
- ciflow/torchbench
- ciflow/op-benchmark
- ciflow/pull

View File

@@ -54,7 +54,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@@ -71,7 +71,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
@@ -88,7 +88,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"

View File

@@ -96,13 +96,6 @@ on:
required: false
type: string
default: ""
build-external-packages:
description: |
If set, build the external packages and save their wheels as artifacts;
use a comma-separated list of packages to build, e.g. 'vllm,transformers'.
required: false
type: string
default: ""
secrets:
HUGGING_FACE_HUB_TOKEN:
@@ -363,26 +356,6 @@ jobs:
END_TIME=$(date +%s)
echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"
- name: Build external packages
id: build-external-packages
if: inputs.build-external-packages != '' && steps.build.outcome != 'skipped'
uses: ./.github/actions/build-external-packages
with:
build-targets: ${{ inputs.build-external-packages }}
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
cuda-arch-list: ${{ inputs.cuda-arch-list }}
output-dir: external
- name: Move external packages to dist
if: steps.build-external-packages.outputs.output_dir != '' && steps.build-external-packages.outcome != 'skipped'
shell: bash
run: |
src="${{ steps.build-external-packages.outputs.output_dir }}"
if [ -d "$src" ]; then
mkdir -p "dist/$(dirname "$src")"
mv "$src" "dist/$(dirname "$src")/"
fi
- name: Stop monitoring script
if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
shell: bash

View File

@@ -136,7 +136,7 @@ jobs:
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
"$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_json==0.6.7
"$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_sajson==0.6.7
"$VENV_PATH/bin/python3" -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

View File

@@ -36,7 +36,7 @@ jobs:
runs-on: linux.9xlarge.ephemeral
strategy:
matrix:
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "cuda13.0", "rocm6.3", "rocm6.4", "cpu"]
tag: ["cuda12.6", "cuda12.8", "cuda12.9", "rocm6.3", "rocm6.4", "cpu"]
steps:
- name: Build docker image
uses: pytorch/pytorch/.github/actions/binary-docker-build@main

View File

@@ -57,11 +57,6 @@ jobs:
echo "PT_RELEASE_FILE=pytorch-$tag_or_branch.tar.gz" >> "$GITHUB_ENV"
- name: Checkout optional submodules
run: python3 tools/optional_submodules.py
- name: Copy docs requirements for inclusion
run: |
# Replace symlink with actual file
rm docs/requirements.txt || true
cp .ci/docker/requirements-docs.txt docs/requirements.txt
- name: Create source distribution
run: |
# Create new folder with specified name so extracting the archive yields that

View File

@@ -132,7 +132,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -243,7 +243,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -354,7 +354,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -465,7 +465,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -576,7 +576,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -687,7 +687,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -798,7 +798,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@@ -909,7 +909,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -60,7 +60,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing

View File

@@ -127,7 +127,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
@@ -193,7 +193,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-test: # Testing
@@ -259,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-test: # Testing
@@ -719,7 +719,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_6-test: # Testing
@ -785,7 +785,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_8-test: # Testing
@ -851,7 +851,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing
@ -1311,7 +1311,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-test: # Testing
@ -1377,7 +1377,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-test: # Testing
@ -1508,7 +1508,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing
@ -1968,7 +1968,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_6-test: # Testing
@ -2034,7 +2034,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
@ -2100,7 +2100,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-test: # Testing
@ -2560,7 +2560,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_6-test: # Testing
@ -2626,7 +2626,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_8-test: # Testing
@ -2692,7 +2692,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-test: # Testing
@ -3152,7 +3152,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_6-test: # Testing
@ -3218,7 +3218,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_8-test: # Testing
@ -3284,7 +3284,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-test: # Testing
@ -3744,7 +3744,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_6-test: # Testing
@ -3810,7 +3810,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_8-test: # Testing
@ -3876,7 +3876,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_9-test: # Testing
@ -4336,7 +4336,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_6-test: # Testing
@ -4402,7 +4402,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_8-test: # Testing
@ -4468,7 +4468,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_9-test: # Testing
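
Each of the pinned nvidia-* requirements above carries a PEP 508 environment marker, so pip installs them only on x86_64 Linux and skips them elsewhere. A minimal sketch of how such a marker evaluates, using the packaging library; the marker string is copied verbatim from the lines above:

import torch  # not needed for markers; shown only to match the doc's Python context
from packaging.markers import Marker

marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
print(marker.evaluate())  # True only in an x86_64 Linux interpreter
# evaluate against an explicit (partial) environment, e.g. an aarch64 box:
print(marker.evaluate({"platform_system": "Linux", "platform_machine": "aarch64"}))  # False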

View File

@ -93,7 +93,7 @@ jobs:
script: |
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running mypy"
ADDITIONAL_LINTRUNNER_ARGS="--take MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--take MYPY --all-files" .github/scripts/lintrunner.sh
lintrunner-noclang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -111,9 +111,9 @@ jobs:
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running all other linters"
if [ "$CHANGED_FILES" = '*' ]; then
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY --all-files" .github/scripts/lintrunner.sh
else
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY ${CHANGED_FILES}" .github/scripts/lintrunner.sh
fi
quick-checks:

View File

@ -156,13 +156,13 @@ jobs:
sync-tag: asan-test
secrets: inherit
linux-jammy-py3_10-clang12-onnx-build:
name: linux-jammy-py3.10-clang12-onnx
linux-jammy-py3_9-clang12-onnx-build:
name: linux-jammy-py3.9-clang12-onnx
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build-environment: linux-jammy-py3.10-clang12-onnx
build-environment: linux-jammy-py3.9-clang12-onnx
docker-image-name: ci-image:pytorch-linux-jammy-py3-clang12-onnx
test-matrix: |
{ include: [
@ -171,16 +171,16 @@ jobs:
]}
secrets: inherit
linux-jammy-py3_10-clang12-onnx-test:
name: linux-jammy-py3.10-clang12-onnx
linux-jammy-py3_9-clang12-onnx-test:
name: linux-jammy-py3.9-clang12-onnx
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-jammy-py3_10-clang12-onnx-build
- linux-jammy-py3_9-clang12-onnx-build
- target-determination
with:
build-environment: linux-jammy-py3.10-clang12-onnx
docker-image: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_10-clang12-onnx-build.outputs.test-matrix }}
build-environment: linux-jammy-py3.9-clang12-onnx
docker-image: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-jammy-py3_9-clang12-onnx-build.outputs.test-matrix }}
secrets: inherit
linux-jammy-py3_9-clang12-build:

View File

@ -1,45 +0,0 @@
name: vllm-test
on:
push:
tags:
- ciflow/vllm/*
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
torch-build-sm89:
name: sm89-vllm-test
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-additional-packages: "vision audio torchao"
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
cuda-arch-list: '8.9'
runner: linux.24xlarge.memory
test-matrix: |
{ include: [
{ config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit

.gitignore vendored
View File

@ -32,7 +32,6 @@ coverage.xml
aten/build/
aten/src/ATen/Config.h
aten/src/ATen/cuda/CUDAConfig.h
aten/src/ATen/hip/HIPConfig.h
benchmarks/.data
caffe2/cpp_test/
dist/

View File

@ -121,7 +121,7 @@ inline int64_t legacy_cat_wrap_dim_symint(
const std::vector<std::vector<c10::SymInt>>& tensor_sizes) {
for (auto& sizes : tensor_sizes) {
if (sizes.size() == 1) {
if (TORCH_GUARD_OR_FALSE(sizes[0].sym_eq(0))) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[0].sym_eq(0))) {
continue;
}
}
@ -135,7 +135,7 @@ inline int64_t legacy_cat_wrap_dim(
const MaterializedITensorListRef& tensors) {
for (const Tensor& tensor : tensors) {
if (tensor.dim() == 1) {
if (TORCH_GUARD_OR_FALSE(tensor.sym_sizes()[0].sym_eq(0))) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(tensor.sym_sizes()[0].sym_eq(0))) {
continue;
}
}
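
Both macros decide the symbolic comparison size == 0 under dynamic shapes; they differ in how an undecidable case is resolved. TORCH_GUARD_SIZE_OBLIVIOUS evaluates the SymBool while ignoring the usual 0/1-specialization hints, whereas TORCH_GUARD_OR_FALSE never installs a guard and simply answers false when the expression cannot be decided statically. A rough Python-level analogue using the exported helpers (a sketch of the semantics, not the C++ code path itself):

import torch
from torch.fx.experimental.symbolic_shapes import guard_or_false

def is_legacy_empty_1d(t):
    # mirrors the loop above: a 1-D tensor whose (possibly symbolic) size
    # is 0 is treated as the legacy empty placeholder and skipped by cat
    return t.dim() == 1 and guard_or_false(t.size(0) == 0)

print(is_legacy_empty_1d(torch.empty(0)))  # True
print(is_legacy_empty_1d(torch.empty(3)))  # False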

View File

@ -1847,12 +1847,8 @@ int get_scale_mode(ScalingType scaling_type, ScalarType scale_dtype, bool use_fa
switch (scaling_type) {
case ScalingType::BlockWise1x32:
TORCH_CHECK(scale_dtype == kFloat8_e8m0fnu);
#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000)
#ifdef USE_ROCM
return HIPBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0;
#else
#if CUDA_VERSION >= 12080
return CUBLASLT_MATMUL_MATRIX_SCALE_VEC32_UE8M0;
#endif // USE_ROCM
#else
TORCH_CHECK(false, "scaled_gemm with `torch.float8_e8m0fnu` scales of 1x32 blocks is only supported for CUDA 12.8 and above");
#endif // if CUDA_VERSION >= 12080
@ -1950,26 +1946,12 @@ void scaled_gemm(
// hipblaslt supported row-wise before cublas, and did so in their own way (via
// the SCALE_POINTERs), but then migrated to match how cublas does it (via
// the SCALE_MODEs). Here we check for this early custom mode.
bool use_rowwise = (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise);
#if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
if (use_rowwise) {
if (mat1_scaling_type == ScalingType::RowWise && mat2_scaling_type == ScalingType::RowWise) {
matmulDescA = HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER_VEC_EXT;
matmulDescB = HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER_VEC_EXT;
}
else if (mat1_scale_dtype == kFloat8_e8m0fnu && mat2_scale_dtype == kFloat8_e8m0fnu) {
#if ROCM_VERSION >= 70000
if (at::detail::getCUDAHooks().isGPUArch({"gfx950"})) {
// TODO: add constraints based on hipblaslt internals
TORCH_CHECK((m % 32 == 0) && (n % 32 == 0) && (k % 32 == 0),
"Matrix dimensions must be multiples of 32 for MX format. "
"Got m=", m, ", n=", n, ", k=", k);
}
#endif
}
#else
// rowwise isn't supported using cublaslt or older hipblaslt
TORCH_INTERNAL_ASSERT(use_rowwise == false, "rowwise scaled_gemm not supported with blaslt");
#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
#endif // if defined(USE_ROCM) && !defined(HIPBLASLT_OUTER_VEC) && defined(HIPBLASLT_VEC_EXT)
computeDesc.setAttribute(matmulDescA, mat1_scale_ptr);
computeDesc.setAttribute(matmulDescB, mat2_scale_ptr);
if (result_scale_ptr != nullptr) {
@ -2008,16 +1990,15 @@ void scaled_gemm(
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, CUBLASLT_EPILOGUE_BIAS);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, ScalarTypeToCudaDataType(bias_dtype));
}
// For other data types, use the get_scale_mode function based on scaling type
// The SCALE_MODE attrs only exist in cuBLAS 12.8+/ROCm 7.0 or in recent hipblaslt,
// but we must invoke get_scale_mode anyways to trigger the version checks.
// Note that AMD/ROCm follows OCP Spec 1.0, which is different from NVIDIA's implementation. See get_scale_mode() for details.
[[maybe_unused]] int a_scale_mode = get_scale_mode(mat1_scaling_type, mat1_scale_dtype, use_fast_accum);
[[maybe_unused]] int b_scale_mode = get_scale_mode(mat2_scaling_type, mat2_scale_dtype, use_fast_accum);
#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000 && defined(HIPBLASLT_OUTER_VEC))
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, a_scale_mode);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode);
#endif // if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && ROCM_VERSION >= 70000 && defined(HIPBLASLT_OUTER_VEC))
// The SCALE_MODE attrs only exist in cuBLAS 12.8+ or in recent hipblaslt,
// but we must invoke get_scale_mode anyways to trigger the version checks.
[[maybe_unused]] int a_scale_mode = get_scale_mode(mat1_scaling_type, mat1_scale_dtype, use_fast_accum);
[[maybe_unused]] int b_scale_mode = get_scale_mode(mat2_scaling_type, mat2_scale_dtype, use_fast_accum);
#if CUDA_VERSION >= 12080 || (defined(USE_ROCM) && defined(HIPBLASLT_OUTER_VEC))
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_MODE, a_scale_mode);
computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_MODE, b_scale_mode);
#endif
CuBlasLtMatmulPreference preference;
auto ltworkspace = CublasLtWorkspace();

View File

@ -90,7 +90,7 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type)
case c10::ScalarType::Float8_e5m2fnuz:
return HIP_R_8F_E5M2_FNUZ;
#endif
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080) || (defined(USE_ROCM) && ROCM_VERSION >= 70000)
#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12080)
case c10::ScalarType::Float4_e2m1fn_x2:
return CUDA_R_4F_E2M1;
#endif

View File

@ -85,15 +85,6 @@ constexpr hipDataType HipDataTypeFor<c10::Float8_e8m0fnu>() {
return static_cast<hipDataType>(500);
}
template <>
constexpr hipDataType HipDataTypeFor<c10::Float4_e2m1fn_x2>() {
#if ROCM_VERSION >= 70000
return HIP_R_4F_E2M1;
#else
return static_cast<hipDataType>(33);
#endif
}
template <typename T>
int GetBatchFromParams(const GemmParams<T>* params) {
return 1;

View File

@ -1283,35 +1283,15 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
if (use_fast_accum) {
TORCH_CHECK(mat1.scalar_type() != ScalarType::Float4_e2m1fn_x2 && mat2.scalar_type() != ScalarType::Float4_e2m1fn_x2, "`use_fast_accum` is not supported when `mat1` or `mat2` tensors have the `Float4_e2m1fn_x2` dtype.");
}
#ifdef USE_ROCM
if (mat1.scalar_type() == ScalarType::Float4_e2m1fn_x2 || mat2.scalar_type() == ScalarType::Float4_e2m1fn_x2) {
TORCH_CHECK(ROCM_VERSION >= 70000, "Float4_e2m1fn_x2 is only supported for ROCm 7.0 and above");
}
if (mat1.scalar_type() == ScalarType::Float8_e5m2 || mat2.scalar_type() == ScalarType::Float8_e5m2) {
TORCH_CHECK(ROCM_VERSION >= 60500, "Float8_e5m2 is only supported for ROCm 6.5 and above");
}
if (mat1.scalar_type() == ScalarType::Float8_e4m3fn || mat2.scalar_type() == ScalarType::Float8_e4m3fn) {
TORCH_CHECK(ROCM_VERSION >= 60500, "Float8_e4m3fn is only supported for ROCm 6.5 and above");
}
#endif
if (bias) {
TORCH_CHECK(out.scalar_type() != kFloat,
"Bias is not supported when out_dtype is set to Float32");
TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 ||
bias->scalar_type() == ScalarType::Half,
"Bias must be BFloat16 or Half, but got ", bias->scalar_type());
TORCH_CHECK((out.scalar_type() != kFloat &&
out.scalar_type() != ScalarType::BFloat16) ||
bias->scalar_type() == ScalarType::BFloat16,
"Bias must be BFloat16 to compute ", out.scalar_type(),
" output, but got ", bias->scalar_type());
TORCH_CHECK(out.scalar_type() != ScalarType::Half ||
bias->scalar_type() == ScalarType::Half,
"Bias must be Float16 to compute ", out.scalar_type(),
" output, but got ", bias->scalar_type());
TORCH_CHECK(out.scalar_type() != kFloat, "Bias is not supported when out_dtype is set to Float32");
TORCH_CHECK(bias->scalar_type() == ScalarType::BFloat16 || bias->scalar_type() == ScalarType::Half,
"Bias must be either Half or BFloat16, but got ", bias->scalar_type());
TORCH_CHECK((out.scalar_type() != kFloat && out.scalar_type() != ScalarType::BFloat16) ||
bias->scalar_type() == ScalarType::BFloat16,
"Bias must be BFloat16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type());
TORCH_CHECK(out.scalar_type() != ScalarType::Half || bias->scalar_type() == ScalarType::Half,
"Bias must be Float16 to compute ", out.scalar_type(), " output, but got ", bias->scalar_type());
}
{
auto bias_ = bias.value_or(Tensor());
@ -1373,22 +1353,6 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16,
"hipblaslt rowwise _scaled_mm only supports BFloat16 output but got ", out.scalar_type());
}
else if (scaling_choice_a == ScalingType::BlockWise1x32 && scaling_choice_b == ScalingType::BlockWise1x32) {
#if ROCM_VERSION >= 70000
TORCH_CHECK(at::detail::getCUDAHooks().isGPUArch({"gfx950"}),
"Block-wise scaling for Float8_e8m0fnu is only supported on gfx950");
TORCH_CHECK(mat1.size(0) % 32 == 0 && mat1.size(1) % 32 == 0 &&
mat2.size(0) % 32 == 0 && mat2.size(1) % 32 == 0,
"Matrix dimensions must be multiples of 32 for block-wise scaling");
TORCH_CHECK(out.scalar_type() == ScalarType::BFloat16 ||
out.scalar_type() == ScalarType::Half,
"Block-wise scaling only supports BFloat16 or Half output types");
#else
TORCH_CHECK(false, "Block-wise scaling for Float8_e8m0fnu requires ROCm 7.0 or later");
#endif
}
#endif
cublasCommonArgs args(mat1, mat2, out, scale_a, scale_b, scale_result, scaling_choice_a, scaling_choice_b);
@ -1466,14 +1430,12 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2,
params.k = args.k;
params.a = args.mata->data_ptr();
params.a_scale_ptr = args.scale_mata_ptr;
params.a_scale_dtype = args.scale_mata_dtype.value();
params.lda = args.lda;
params.a_dtype = args.mata->scalar_type();
params.a_scale_dtype = args.scale_mata_dtype.value();
params.a_scaling_type = args.scaling_mata_type.value();
params.b = args.matb->data_ptr();
params.b_scale_ptr = args.scale_matb_ptr;
params.b_scale_dtype = args.scale_matb_dtype.value();
params.ldb = args.ldb;
params.b_dtype = args.matb->scalar_type();
params.b_scale_dtype = args.scale_matb_dtype.value();

View File

@ -19,7 +19,9 @@ struct GridSamplerOffsets {
static GridSamplerOffsets find_grid_sampler_offsets(
constant int32_t* output_sizes,
constant int32_t* output_strides,
constant int32_t* input_sizes,
constant int32_t* input_strides,
constant int32_t* grid_sizes,
constant int32_t* grid_strides,
int32_t sampler_dims,
uint tid) {
@ -276,13 +278,16 @@ kernel void grid_sampler(
auto output_strides = params.output_strides.data();
auto input_sizes = params.input_sizes.data();
auto input_strides = params.input_strides.data();
auto grid_sizes = params.grid_sizes.data();
auto grid_strides = params.grid_strides.data();
auto sampler_dims = params.sampler_dims;
auto offsets = find_grid_sampler_offsets(
output_sizes,
output_strides,
input_sizes,
input_strides,
grid_sizes,
grid_strides,
sampler_dims,
tid);
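
This kernel backs torch.nn.functional.grid_sample on MPS; passing grid_sizes and grid_strides into the offset helper lets it index a grid whose memory layout differs from the output's instead of assuming matching strides. For orientation, a minimal sketch of the op it implements, with a deliberately non-contiguous grid:

import torch
import torch.nn.functional as F

inp = torch.randn(1, 3, 8, 8, device="mps")
# expand() yields a stride-0 dimension, so the grid's strides differ from a
# contiguous layout and exercise the separate grid_strides path
grid = (torch.rand(1, 1, 5, 2, device="mps") * 2 - 1).expand(1, 5, 5, 2)
out = F.grid_sample(inp, grid, mode="bilinear", align_corners=False)
print(out.shape)  # torch.Size([1, 3, 5, 5])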

View File

@ -456,7 +456,7 @@ static Tensor std_var_common_impl_mps(const Tensor& input_t,
errMessage += ": reduction dim must be in the range of input shape";
for (const auto dim : dim_value) {
auto wrap_dim = maybe_wrap_dim(dim, num_input_dims);
TORCH_CHECK(wrap_dim < (num_input_dims ? num_input_dims : 1), errMessage.c_str())
TORCH_CHECK(wrap_dim < static_cast<decltype(wrap_dim)>(input_shape.size()), errMessage.c_str())
}
}

View File

@ -243,6 +243,12 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x
} else {
softmax_fa_t = at::empty({ 0, 0, 0, 0 }, opts);
}
at::Tensor atomic_counter;
if (is_causal) {
atomic_counter = at::zeros({1}, opts.dtype(at::kInt));
}
auto [needs_swa, window_left, window_right] = calculate_swa(window_size_left,
window_size_right,
seqlen_q,
@ -256,14 +262,6 @@ mha_fwd_aot(const at::Tensor &q, // batch_size x seqlen_q x num_heads x
constexpr bool uses_swa = false;
#endif
// SWA in AOTriton Kernels is treated as "Generalized Causal masks"
is_causal = is_causal || uses_swa;
at::Tensor atomic_counter;
if (is_causal) {
atomic_counter = at::zeros({1}, opts.dtype(at::kInt));
}
hipError_t err; // TODO: Error handling
using aotriton::v2::flash::attn_fwd;
using sdp::aotriton_adapter::mk_aotensor;
@ -457,9 +455,6 @@ mha_varlen_fwd_aot(const at::Tensor &q, // total_q x num_heads x head_size, tot
constexpr bool uses_swa = false;
#endif
// SWA in AOTriton Kernels is treated as "Generalized Causal masks"
is_causal = is_causal || needs_swa;
auto [seed_t, offset_t, philox_state, use_philox_state] =
prepare_philox_arguments(p_dropout, batch_size * num_heads * 32);
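
As the hunks show, the atomic counter is only materialized when the kernel runs causally (with SWA folded into is_causal as a generalized causal mask). From Python, that corresponds to causal flash attention through SDPA; a minimal sketch, assuming a build whose flash backend is available (AOTriton on ROCm):

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

q = torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.float16)
k, v = torch.randn_like(q), torch.randn_like(q)
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):  # force the flash path
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)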

View File

@ -4190,7 +4190,7 @@ def run(runner, args, original_dir=None):
nonlocal marked
for i, s in enumerate(t.size()):
if s == batch_size:
torch._dynamo.maybe_mark_dynamic(t, i)
torch._dynamo.mark_dynamic(t, i)
marked = True
break
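
The two markers differ in strictness: torch._dynamo.mark_dynamic hard-asserts that the dimension compiles as dynamic and raises if the compiler specializes it, while maybe_mark_dynamic (the line being replaced here) is only a hint that tolerates specialization. A small sketch of both:

import torch

def double(x):
    return x * 2

t = torch.randn(8, 3)
torch._dynamo.mark_dynamic(t, 0)          # hard assert: dim 0 must stay dynamic
# torch._dynamo.maybe_mark_dynamic(t, 0)  # soft hint: specialization is allowed
out = torch.compile(double)(t)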

View File

@ -370,7 +370,6 @@ class HuggingfaceRunner(BenchmarkRunner):
return name in [
"ElectraForQuestionAnswering",
"MegatronBertForQuestionAnswering",
"GPT2ForSequenceClassification",
]
def _get_model_cls_and_config(self, model_name):

View File

@ -631,7 +631,6 @@ libtorch_nativert_sources = [
"torch/nativert/kernels/NativeKernels.cpp",
"torch/nativert/kernels/GeneratedStaticDispatchKernels.cpp",
"torch/nativert/kernels/GeneratedNativeStaticDispatchKernels.cpp",
"torch/nativert/graph/passes/SubgraphRewriter.cpp",
]
torch_mobile_tracer_sources = [

View File

@ -10,7 +10,7 @@ filelock
fsspec>=0.8.5
hypothesis
jinja2
lintrunner ; platform_machine != "s390x" and platform_machine != "riscv64"
lintrunner ; platform_machine != "s390x"
networkx>=2.5.1
optree>=0.13.0
psutil

View File

@ -36,7 +36,6 @@ set(NATIVERT_TEST_SRCS
${TORCH_ROOT}/torch/nativert/kernels/AutoFunctionalizeKernel.cpp
${TORCH_ROOT}/torch/nativert/kernels/CallTorchBindKernel.cpp
${TORCH_ROOT}/torch/nativert/kernels/HigherOrderKernel.cpp
${TORCH_ROOT}/torch/nativert/graph/passes/SubgraphRewriter.cpp
)
add_executable(test_nativert

View File

@ -288,16 +288,6 @@ void boxed_empty_like(StableIValue* stack, uint64_t num_args, uint64_t num_outpu
stack[0] = from(res);
}
bool my_is_cpu(Tensor t) {
return t.is_cpu();
}
void boxed_my_is_cpu(StableIValue* stack, uint64_t num_args, uint64_t num_outputs) {
auto res = my_is_cpu(to<Tensor>(stack[0]));
stack[0] = from(res);
}
Tensor fill_infinity(Tensor t) {
auto value = std::numeric_limits<float>::infinity();
return fill_(t, value);
@ -354,7 +344,6 @@ STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeExplicitAutograd, m) {
m.impl("my_transpose", &boxed_my_transpose);
m.impl("my_empty_like", &boxed_empty_like);
m.impl("fill_infinity", &boxed_fill_infinity);
m.impl("my_is_cpu", &boxed_my_is_cpu);
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CompositeImplicitAutograd, m) {
@ -373,8 +362,6 @@ void boxed_my_zero_(StableIValue* stack, uint64_t num_args, uint64_t num_outputs
STABLE_TORCH_LIBRARY_FRAGMENT(libtorch_agnostic, m) {
m.def("my_zero_(Tensor(a!) t) -> Tensor(a!)");
m.def("my_is_cpu(Tensor t) -> bool");
}
STABLE_TORCH_LIBRARY_IMPL(libtorch_agnostic, CPU, m) {

View File

@ -51,19 +51,6 @@ def my_abs(t) -> Tensor:
return torch.ops.libtorch_agnostic.my_abs.default(t)
def my_is_cpu(t) -> bool:
"""
Returns is_cpu on the input tensor.
Args:
t: any Tensor
Returns:
a bool
"""
return torch.ops.libtorch_agnostic.my_is_cpu.default(t)
def my_ones_like(tensor, device) -> Tensor:
"""
Returns a new Tensor like the input tensor, but with all ones

View File

@ -209,13 +209,6 @@ if not IS_WINDOWS:
self.assertEqual(id(out), id(t))
self.assertEqual(out, torch.zeros_like(t))
def test_my_is_cpu(self, device):
import libtorch_agnostic
t = torch.rand(2, 7, device=device)
out = libtorch_agnostic.ops.my_is_cpu(t)
self.assertEqual(out, t.is_cpu)
def test_fill_infinity(self, device):
import libtorch_agnostic

View File

@ -1,5 +1,5 @@
diff --git a/test/dynamo/cpython/3_13/test_collections.py b/test/dynamo/cpython/3_13/test_collections.py
index cafc44007d1..4571e5a14fd 100644
index cafc44007d1..1ee548abc7d 100644
--- a/test/dynamo/cpython/3_13/test_collections.py
+++ b/test/dynamo/cpython/3_13/test_collections.py
@@ -1,3 +1,23 @@
@ -35,21 +35,7 @@ index cafc44007d1..4571e5a14fd 100644
def _superset_test(self, a, b):
self.assertGreaterEqual(
set(dir(a)),
@@ -73,9 +93,10 @@ class TestUserObjects(unittest.TestCase):
self._copy_test(obj)
def test_dict_missing(self):
- class A(UserDict):
- def __missing__(self, key):
- return 456
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class A(UserDict):
+ def __missing__(self, key):
+ return 456
self.assertEqual(A()[123], 456)
# get() ignores __missing__ on dict
self.assertIs(A().get(123), None)
@@ -85,7 +106,7 @@ class TestUserObjects(unittest.TestCase):
@@ -85,7 +105,7 @@ class TestUserObjects(unittest.TestCase):
### ChainMap (helper class for configparser and the string module)
################################################################################
@ -58,69 +44,7 @@ index cafc44007d1..4571e5a14fd 100644
def test_basics(self):
c = ChainMap()
@@ -172,9 +193,10 @@ class TestChainMap(unittest.TestCase):
self.assertTrue(ChainMap({}, {1:2}))
def test_missing(self):
- class DefaultChainMap(ChainMap):
- def __missing__(self, key):
- return 999
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class DefaultChainMap(ChainMap):
+ def __missing__(self, key):
+ return 999
d = DefaultChainMap(dict(a=1, b=2), dict(b=20, c=30))
for k, v in dict(a=1, b=2, c=30, d=999).items():
self.assertEqual(d[k], v) # check __getitem__ w/missing
@@ -206,13 +228,14 @@ class TestChainMap(unittest.TestCase):
('i', 9999), ('j', 0)])
def test_iter_not_calling_getitem_on_maps(self):
- class DictWithGetItem(UserDict):
- def __init__(self, *args, **kwds):
- self.called = False
- UserDict.__init__(self, *args, **kwds)
- def __getitem__(self, item):
- self.called = True
- UserDict.__getitem__(self, item)
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class DictWithGetItem(UserDict):
+ def __init__(self, *args, **kwds):
+ self.called = False
+ UserDict.__init__(self, *args, **kwds)
+ def __getitem__(self, item):
+ self.called = True
+ UserDict.__getitem__(self, item)
d = DictWithGetItem(a=1)
c = ChainMap(d)
@@ -237,15 +260,16 @@ class TestChainMap(unittest.TestCase):
self.assertIs(m, d.maps[0])
# Use a different map than a dict
- class lowerdict(dict):
- def __getitem__(self, key):
- if isinstance(key, str):
- key = key.lower()
- return dict.__getitem__(self, key)
- def __contains__(self, key):
- if isinstance(key, str):
- key = key.lower()
- return dict.__contains__(self, key)
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class lowerdict(dict):
+ def __getitem__(self, key):
+ if isinstance(key, str):
+ key = key.lower()
+ return dict.__getitem__(self, key)
+ def __contains__(self, key):
+ if isinstance(key, str):
+ key = key.lower()
+ return dict.__contains__(self, key)
c = ChainMap()
c['a'] = 1
@@ -315,7 +339,7 @@ class TestChainMap(unittest.TestCase):
@@ -315,7 +335,7 @@ class TestChainMap(unittest.TestCase):
TestNT = namedtuple('TestNT', 'x y z') # type used for pickle tests
@ -129,19 +53,7 @@ index cafc44007d1..4571e5a14fd 100644
def test_factory(self):
Point = namedtuple('Point', 'x y')
@@ -666,8 +690,9 @@ class TestNamedTuple(unittest.TestCase):
NT = namedtuple('NT', ['abc', 'def'], False, True)
def test_namedtuple_subclass_issue_24931(self):
- class Point(namedtuple('_Point', ['x', 'y'])):
- pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class Point(namedtuple('_Point', ['x', 'y'])):
+ pass
a = Point(3, 4)
self.assertEqual(a._asdict(), OrderedDict([('x', 3), ('y', 4)]))
@@ -722,21 +747,26 @@ class TestNamedTuple(unittest.TestCase):
@@ -722,7 +742,7 @@ class TestNamedTuple(unittest.TestCase):
### Abstract Base Classes
################################################################################
@ -150,750 +62,7 @@ index cafc44007d1..4571e5a14fd 100644
def validate_abstract_methods(self, abc, *names):
methodstubs = dict.fromkeys(names, lambda s, *args: 0)
# everything should work when all required methods are present
- C = type('C', (abc,), methodstubs)
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ C = type('C', (abc,), methodstubs)
C()
+ # Dynamo raises a hard error here that we can't easily capture
+ # Commenting this part as this would also fail in eager if a user
+ # attempts to run the same code
+
# instantiation should fail if a required method is missing
- for name in names:
- stubs = methodstubs.copy()
- del stubs[name]
- C = type('C', (abc,), stubs)
- self.assertRaises(TypeError, C, name)
+ # for name in names:
+ # stubs = methodstubs.copy()
+ # del stubs[name]
+ # C = type('C', (abc,), stubs)
+ # self.assertRaises(TypeError, C, name)
def validate_isinstance(self, abc, name):
stub = lambda s, *args: 0
@@ -981,19 +1011,21 @@ class TestOneTrickPonyABCs(ABCTestCase):
for x in samples:
self.assertIsInstance(x, Iterable)
self.assertTrue(issubclass(type(x), Iterable), repr(type(x)))
- # Check direct subclassing
- class I(Iterable):
- def __iter__(self):
- return super().__iter__()
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ # Check direct subclassing
+ class I(Iterable):
+ def __iter__(self):
+ return super().__iter__()
self.assertEqual(list(I()), [])
self.assertFalse(issubclass(str, I))
self.validate_abstract_methods(Iterable, '__iter__')
self.validate_isinstance(Iterable, '__iter__')
- # Check None blocking
- class It:
- def __iter__(self): return iter([])
- class ItBlocked(It):
- __iter__ = None
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ # Check None blocking
+ class It:
+ def __iter__(self): return iter([])
+ class ItBlocked(It):
+ __iter__ = None
self.assertTrue(issubclass(It, Iterable))
self.assertTrue(isinstance(It(), Iterable))
self.assertFalse(issubclass(ItBlocked, Iterable))
@@ -1023,32 +1055,35 @@ class TestOneTrickPonyABCs(ABCTestCase):
self.assertTrue(issubclass(Sequence, Reversible), repr(Sequence))
self.assertFalse(issubclass(Mapping, Reversible), repr(Mapping))
self.assertFalse(issubclass(MutableMapping, Reversible), repr(MutableMapping))
- # Check direct subclassing
- class R(Reversible):
- def __iter__(self):
- return iter(list())
- def __reversed__(self):
- return iter(list())
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ # Check direct subclassing
+ class R(Reversible):
+ def __iter__(self):
+ return iter(list())
+ def __reversed__(self):
+ return iter(list())
self.assertEqual(list(reversed(R())), [])
self.assertFalse(issubclass(float, R))
self.validate_abstract_methods(Reversible, '__reversed__', '__iter__')
- # Check reversible non-iterable (which is not Reversible)
- class RevNoIter:
- def __reversed__(self): return reversed([])
- class RevPlusIter(RevNoIter):
- def __iter__(self): return iter([])
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ # Check reversible non-iterable (which is not Reversible)
+ class RevNoIter:
+ def __reversed__(self): return reversed([])
+ class RevPlusIter(RevNoIter):
+ def __iter__(self): return iter([])
self.assertFalse(issubclass(RevNoIter, Reversible))
self.assertFalse(isinstance(RevNoIter(), Reversible))
self.assertTrue(issubclass(RevPlusIter, Reversible))
self.assertTrue(isinstance(RevPlusIter(), Reversible))
- # Check None blocking
- class Rev:
- def __iter__(self): return iter([])
- def __reversed__(self): return reversed([])
- class RevItBlocked(Rev):
- __iter__ = None
- class RevRevBlocked(Rev):
- __reversed__ = None
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ # Check None blocking
+ class Rev:
+ def __iter__(self): return iter([])
+ def __reversed__(self): return reversed([])
+ class RevItBlocked(Rev):
+ __iter__ = None
+ class RevRevBlocked(Rev):
+ __reversed__ = None
self.assertTrue(issubclass(Rev, Reversible))
self.assertTrue(isinstance(Rev(), Reversible))
self.assertFalse(issubclass(RevItBlocked, Reversible))
@@ -1082,15 +1117,16 @@ class TestOneTrickPonyABCs(ABCTestCase):
self.assertTrue(issubclass(Set, Collection), repr(Set))
self.assertTrue(issubclass(MutableSet, Collection), repr(MutableSet))
self.assertTrue(issubclass(Sequence, Collection), repr(MutableSet))
- # Check direct subclassing
- class Col(Collection):
- def __iter__(self):
- return iter(list())
- def __len__(self):
- return 0
- def __contains__(self, item):
- return False
- class DerCol(Col): pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ # Check direct subclassing
+ class Col(Collection):
+ def __iter__(self):
+ return iter(list())
+ def __len__(self):
+ return 0
+ def __contains__(self, item):
+ return False
+ class DerCol(Col): pass
self.assertEqual(list(iter(Col())), [])
self.assertFalse(issubclass(list, Col))
self.assertFalse(issubclass(set, Col))
@@ -1102,44 +1138,48 @@ class TestOneTrickPonyABCs(ABCTestCase):
self.validate_abstract_methods(Collection, '__len__', '__iter__',
'__contains__')
# Check sized container non-iterable (which is not Collection) etc.
- class ColNoIter:
- def __len__(self): return 0
- def __contains__(self, item): return False
- class ColNoSize:
- def __iter__(self): return iter([])
- def __contains__(self, item): return False
- class ColNoCont:
- def __iter__(self): return iter([])
- def __len__(self): return 0
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class ColNoIter:
+ def __len__(self): return 0
+ def __contains__(self, item): return False
+ class ColNoSize:
+ def __iter__(self): return iter([])
+ def __contains__(self, item): return False
+ class ColNoCont:
+ def __iter__(self): return iter([])
+ def __len__(self): return 0
self.assertFalse(issubclass(ColNoIter, Collection))
self.assertFalse(isinstance(ColNoIter(), Collection))
self.assertFalse(issubclass(ColNoSize, Collection))
self.assertFalse(isinstance(ColNoSize(), Collection))
self.assertFalse(issubclass(ColNoCont, Collection))
self.assertFalse(isinstance(ColNoCont(), Collection))
- # Check None blocking
- class SizeBlock:
- def __iter__(self): return iter([])
- def __contains__(self): return False
- __len__ = None
- class IterBlock:
- def __len__(self): return 0
- def __contains__(self): return True
- __iter__ = None
+
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ # Check None blocking
+ class SizeBlock:
+ def __iter__(self): return iter([])
+ def __contains__(self): return False
+ __len__ = None
+ class IterBlock:
+ def __len__(self): return 0
+ def __contains__(self): return True
+ __iter__ = None
self.assertFalse(issubclass(SizeBlock, Collection))
self.assertFalse(isinstance(SizeBlock(), Collection))
self.assertFalse(issubclass(IterBlock, Collection))
self.assertFalse(isinstance(IterBlock(), Collection))
- # Check None blocking in subclass
- class ColImpl:
- def __iter__(self):
- return iter(list())
- def __len__(self):
- return 0
- def __contains__(self, item):
- return False
- class NonCol(ColImpl):
- __contains__ = None
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ # Check None blocking in subclass
+ class ColImpl:
+ def __iter__(self):
+ return iter(list())
+ def __len__(self):
+ return 0
+ def __contains__(self, item):
+ return False
+ class NonCol(ColImpl):
+ __contains__ = None
self.assertFalse(issubclass(NonCol, Collection))
self.assertFalse(isinstance(NonCol(), Collection))
@@ -1162,30 +1202,32 @@ class TestOneTrickPonyABCs(ABCTestCase):
self.assertTrue(issubclass(type(x), Iterator), repr(type(x)))
self.validate_abstract_methods(Iterator, '__next__', '__iter__')
- # Issue 10565
- class NextOnly:
- def __next__(self):
- yield 1
- return
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ # Issue 10565
+ class NextOnly:
+ def __next__(self):
+ yield 1
+ return
self.assertNotIsInstance(NextOnly(), Iterator)
def test_Generator(self):
- class NonGen1:
- def __iter__(self): return self
- def __next__(self): return None
- def close(self): pass
- def throw(self, typ, val=None, tb=None): pass
-
- class NonGen2:
- def __iter__(self): return self
- def __next__(self): return None
- def close(self): pass
- def send(self, value): return value
-
- class NonGen3:
- def close(self): pass
- def send(self, value): return value
- def throw(self, typ, val=None, tb=None): pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class NonGen1:
+ def __iter__(self): return self
+ def __next__(self): return None
+ def close(self): pass
+ def throw(self, typ, val=None, tb=None): pass
+
+ class NonGen2:
+ def __iter__(self): return self
+ def __next__(self): return None
+ def close(self): pass
+ def send(self, value): return value
+
+ class NonGen3:
+ def close(self): pass
+ def send(self, value): return value
+ def throw(self, typ, val=None, tb=None): pass
non_samples = [
None, 42, 3.14, 1j, b"", "", (), [], {}, set(),
@@ -1194,18 +1236,19 @@ class TestOneTrickPonyABCs(ABCTestCase):
self.assertNotIsInstance(x, Generator)
self.assertFalse(issubclass(type(x), Generator), repr(type(x)))
- class Gen:
- def __iter__(self): return self
- def __next__(self): return None
- def close(self): pass
- def send(self, value): return value
- def throw(self, typ, val=None, tb=None): pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class Gen:
+ def __iter__(self): return self
+ def __next__(self): return None
+ def close(self): pass
+ def send(self, value): return value
+ def throw(self, typ, val=None, tb=None): pass
- class MinimalGen(Generator):
- def send(self, value):
- return value
- def throw(self, typ, val=None, tb=None):
- super().throw(typ, val, tb)
+ class MinimalGen(Generator):
+ def send(self, value):
+ return value
+ def throw(self, typ, val=None, tb=None):
+ super().throw(typ, val, tb)
def gen():
yield 1
@@ -1228,15 +1271,17 @@ class TestOneTrickPonyABCs(ABCTestCase):
mgen.throw, ValueError, ValueError("huhu"))
self.assertRaises(StopIteration, mgen.throw, StopIteration())
- class FailOnClose(Generator):
- def send(self, value): return value
- def throw(self, *args): raise ValueError
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class FailOnClose(Generator):
+ def send(self, value): return value
+ def throw(self, *args): raise ValueError
self.assertRaises(ValueError, FailOnClose().close)
- class IgnoreGeneratorExit(Generator):
- def send(self, value): return value
- def throw(self, *args): pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class IgnoreGeneratorExit(Generator):
+ def send(self, value): return value
+ def throw(self, *args): pass
self.assertRaises(RuntimeError, IgnoreGeneratorExit().close)
@@ -1379,15 +1424,17 @@ class TestOneTrickPonyABCs(ABCTestCase):
def test_direct_subclassing(self):
for B in Hashable, Iterable, Iterator, Reversible, Sized, Container, Callable:
- class C(B):
- pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class C(B):
+ pass
self.assertTrue(issubclass(C, B))
self.assertFalse(issubclass(int, C))
def test_registration(self):
for B in Hashable, Iterable, Iterator, Reversible, Sized, Container, Callable:
- class C:
- __hash__ = None # Make sure it isn't hashable by default
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class C:
+ __hash__ = None # Make sure it isn't hashable by default
self.assertFalse(issubclass(C, B), B.__name__)
B.register(C)
self.assertTrue(issubclass(C, B))
@@ -1423,13 +1470,14 @@ class TestCollectionABCs(ABCTestCase):
self.assertIsInstance(sample(), Set)
self.assertTrue(issubclass(sample, Set))
self.validate_abstract_methods(Set, '__contains__', '__iter__', '__len__')
- class MySet(Set):
- def __contains__(self, x):
- return False
- def __len__(self):
- return 0
- def __iter__(self):
- return iter([])
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class MySet(Set):
+ def __contains__(self, x):
+ return False
+ def __len__(self):
+ return 0
+ def __iter__(self):
+ return iter([])
self.validate_comparison(MySet())
def test_hash_Set(self):
@@ -1448,15 +1496,16 @@ class TestCollectionABCs(ABCTestCase):
self.assertTrue(hash(a) == hash(b))
def test_isdisjoint_Set(self):
- class MySet(Set):
- def __init__(self, itr):
- self.contents = itr
- def __contains__(self, x):
- return x in self.contents
- def __iter__(self):
- return iter(self.contents)
- def __len__(self):
- return len([x for x in self.contents])
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class MySet(Set):
+ def __init__(self, itr):
+ self.contents = itr
+ def __contains__(self, x):
+ return x in self.contents
+ def __iter__(self):
+ return iter(self.contents)
+ def __len__(self):
+ return len([x for x in self.contents])
s1 = MySet((1, 2, 3))
s2 = MySet((4, 5, 6))
s3 = MySet((1, 5, 6))
@@ -1464,15 +1513,16 @@ class TestCollectionABCs(ABCTestCase):
self.assertFalse(s1.isdisjoint(s3))
def test_equality_Set(self):
- class MySet(Set):
- def __init__(self, itr):
- self.contents = itr
- def __contains__(self, x):
- return x in self.contents
- def __iter__(self):
- return iter(self.contents)
- def __len__(self):
- return len([x for x in self.contents])
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class MySet(Set):
+ def __init__(self, itr):
+ self.contents = itr
+ def __contains__(self, x):
+ return x in self.contents
+ def __iter__(self):
+ return iter(self.contents)
+ def __len__(self):
+ return len([x for x in self.contents])
s1 = MySet((1,))
s2 = MySet((1, 2))
s3 = MySet((3, 4))
@@ -1486,15 +1536,16 @@ class TestCollectionABCs(ABCTestCase):
self.assertNotEqual(s2, s3)
def test_arithmetic_Set(self):
- class MySet(Set):
- def __init__(self, itr):
- self.contents = itr
- def __contains__(self, x):
- return x in self.contents
- def __iter__(self):
- return iter(self.contents)
- def __len__(self):
- return len([x for x in self.contents])
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class MySet(Set):
+ def __init__(self, itr):
+ self.contents = itr
+ def __contains__(self, x):
+ return x in self.contents
+ def __iter__(self):
+ return iter(self.contents)
+ def __len__(self):
+ return len([x for x in self.contents])
s1 = MySet((1, 2, 3))
s2 = MySet((3, 4, 5))
s3 = s1 & s2
@@ -1516,28 +1567,29 @@ class TestCollectionABCs(ABCTestCase):
def test_issue_4920(self):
# MutableSet.pop() method did not work
- class MySet(MutableSet):
- __slots__=['__s']
- def __init__(self,items=None):
- if items is None:
- items=[]
- self.__s=set(items)
- def __contains__(self,v):
- return v in self.__s
- def __iter__(self):
- return iter(self.__s)
- def __len__(self):
- return len(self.__s)
- def add(self,v):
- result=v not in self.__s
- self.__s.add(v)
- return result
- def discard(self,v):
- result=v in self.__s
- self.__s.discard(v)
- return result
- def __repr__(self):
- return "MySet(%s)" % repr(list(self))
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class MySet(MutableSet):
+ __slots__=['__s']
+ def __init__(self,items=None):
+ if items is None:
+ items=[]
+ self.__s=set(items)
+ def __contains__(self,v):
+ return v in self.__s
+ def __iter__(self):
+ return iter(self.__s)
+ def __len__(self):
+ return len(self.__s)
+ def add(self,v):
+ result=v not in self.__s
+ self.__s.add(v)
+ return result
+ def discard(self,v):
+ result=v in self.__s
+ self.__s.discard(v)
+ return result
+ def __repr__(self):
+ return "MySet(%s)" % repr(list(self))
items = [5,43,2,1]
s = MySet(items)
r = s.pop()
@@ -1563,24 +1615,25 @@ class TestCollectionABCs(ABCTestCase):
def test_issue16373(self):
# Recursion error comparing comparable and noncomparable
# Set instances
- class MyComparableSet(Set):
- def __contains__(self, x):
- return False
- def __len__(self):
- return 0
- def __iter__(self):
- return iter([])
- class MyNonComparableSet(Set):
- def __contains__(self, x):
- return False
- def __len__(self):
- return 0
- def __iter__(self):
- return iter([])
- def __le__(self, x):
- return NotImplemented
- def __lt__(self, x):
- return NotImplemented
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class MyComparableSet(Set):
+ def __contains__(self, x):
+ return False
+ def __len__(self):
+ return 0
+ def __iter__(self):
+ return iter([])
+ class MyNonComparableSet(Set):
+ def __contains__(self, x):
+ return False
+ def __len__(self):
+ return 0
+ def __iter__(self):
+ return iter([])
+ def __le__(self, x):
+ return NotImplemented
+ def __lt__(self, x):
+ return NotImplemented
cs = MyComparableSet()
ncs = MyNonComparableSet()
@@ -1591,13 +1644,14 @@ class TestCollectionABCs(ABCTestCase):
def test_issue26915(self):
# Container membership test should check identity first
- class CustomSequence(Sequence):
- def __init__(self, seq):
- self._seq = seq
- def __getitem__(self, index):
- return self._seq[index]
- def __len__(self):
- return len(self._seq)
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class CustomSequence(Sequence):
+ def __init__(self, seq):
+ self._seq = seq
+ def __getitem__(self, index):
+ return self._seq[index]
+ def __len__(self):
+ return len(self._seq)
nan = float('nan')
obj = support.NEVER_EQ
@@ -1622,30 +1676,31 @@ class TestCollectionABCs(ABCTestCase):
def test_Set_from_iterable(self):
"""Verify _from_iterable overridden to an instance method works."""
- class SetUsingInstanceFromIterable(MutableSet):
- def __init__(self, values, created_by):
- if not created_by:
- raise ValueError('created_by must be specified')
- self.created_by = created_by
- self._values = set(values)
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class SetUsingInstanceFromIterable(MutableSet):
+ def __init__(self, values, created_by):
+ if not created_by:
+ raise ValueError('created_by must be specified')
+ self.created_by = created_by
+ self._values = set(values)
- def _from_iterable(self, values):
- return type(self)(values, 'from_iterable')
+ def _from_iterable(self, values):
+ return type(self)(values, 'from_iterable')
- def __contains__(self, value):
- return value in self._values
+ def __contains__(self, value):
+ return value in self._values
- def __iter__(self):
- yield from self._values
+ def __iter__(self):
+ yield from self._values
- def __len__(self):
- return len(self._values)
+ def __len__(self):
+ return len(self._values)
- def add(self, value):
- self._values.add(value)
+ def add(self, value):
+ self._values.add(value)
- def discard(self, value):
- self._values.discard(value)
+ def discard(self, value):
+ self._values.discard(value)
impl = SetUsingInstanceFromIterable([1, 2, 3], 'test')
@@ -1678,20 +1733,21 @@ class TestCollectionABCs(ABCTestCase):
def test_Set_interoperability_with_real_sets(self):
# Issue: 8743
- class ListSet(Set):
- def __init__(self, elements=()):
- self.data = []
- for elem in elements:
- if elem not in self.data:
- self.data.append(elem)
- def __contains__(self, elem):
- return elem in self.data
- def __iter__(self):
- return iter(self.data)
- def __len__(self):
- return len(self.data)
- def __repr__(self):
- return 'Set({!r})'.format(self.data)
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class ListSet(Set):
+ def __init__(self, elements=()):
+ self.data = []
+ for elem in elements:
+ if elem not in self.data:
+ self.data.append(elem)
+ def __contains__(self, elem):
+ return elem in self.data
+ def __iter__(self):
+ return iter(self.data)
+ def __len__(self):
+ return len(self.data)
+ def __repr__(self):
+ return 'Set({!r})'.format(self.data)
r1 = set('abc')
r2 = set('bcd')
@@ -1846,13 +1902,14 @@ class TestCollectionABCs(ABCTestCase):
self.assertTrue(issubclass(sample, Mapping))
self.validate_abstract_methods(Mapping, '__contains__', '__iter__', '__len__',
'__getitem__')
- class MyMapping(Mapping):
- def __len__(self):
- return 0
- def __getitem__(self, i):
- raise IndexError
- def __iter__(self):
- return iter(())
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class MyMapping(Mapping):
+ def __len__(self):
+ return 0
+ def __getitem__(self, i):
+ raise IndexError
+ def __iter__(self):
+ return iter(())
self.validate_comparison(MyMapping())
self.assertRaises(TypeError, reversed, MyMapping())
@@ -1860,7 +1917,7 @@ class TestCollectionABCs(ABCTestCase):
for sample in [dict]:
self.assertIsInstance(sample(), MutableMapping)
self.assertTrue(issubclass(sample, MutableMapping))
- self.validate_abstract_methods(MutableMapping, '__contains__', '__iter__', '__len__',
+ self.validate_abstract_methods(MutableMapping, '__iter__', '__len__',
'__getitem__', '__setitem__', '__delitem__')
def test_MutableMapping_subclass(self):
@@ -1903,15 +1960,16 @@ class TestCollectionABCs(ABCTestCase):
'__getitem__')
def test_Sequence_mixins(self):
- class SequenceSubclass(Sequence):
- def __init__(self, seq=()):
- self.seq = seq
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class SequenceSubclass(Sequence):
+ def __init__(self, seq=()):
+ self.seq = seq
- def __getitem__(self, index):
- return self.seq[index]
+ def __getitem__(self, index):
+ return self.seq[index]
- def __len__(self):
- return len(self.seq)
+ def __len__(self):
+ return len(self.seq)
# Compare Sequence.index() behavior to (list|str).index() behavior
def assert_index_same(seq1, seq2, index_args):
@@ -1983,24 +2041,25 @@ class TestCollectionABCs(ABCTestCase):
def test_MutableSequence_mixins(self):
# Test the mixins of MutableSequence by creating a minimal concrete
# class inherited from it.
- class MutableSequenceSubclass(MutableSequence):
- def __init__(self):
- self.lst = []
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class MutableSequenceSubclass(MutableSequence):
+ def __init__(self):
+ self.lst = []
- def __setitem__(self, index, value):
- self.lst[index] = value
+ def __setitem__(self, index, value):
+ self.lst[index] = value
- def __getitem__(self, index):
- return self.lst[index]
+ def __getitem__(self, index):
+ return self.lst[index]
- def __len__(self):
- return len(self.lst)
+ def __len__(self):
+ return len(self.lst)
- def __delitem__(self, index):
- del self.lst[index]
+ def __delitem__(self, index):
+ del self.lst[index]
- def insert(self, index, value):
- self.lst.insert(index, value)
+ def insert(self, index, value):
+ self.lst.insert(index, value)
mss = MutableSequenceSubclass()
mss.append(0)
@@ -2059,7 +2118,7 @@ class CounterSubclassWithGet(Counter):
@@ -2059,7 +2079,7 @@ class CounterSubclassWithGet(Counter):
self.called = True
return Counter.get(self, key, default)
@ -902,19 +71,7 @@ index cafc44007d1..4571e5a14fd 100644
def test_basics(self):
c = Counter('abcaba')
@@ -2225,8 +2284,9 @@ class TestCounter(unittest.TestCase):
check(Counter(words))
def test_copy_subclass(self):
- class MyCounter(Counter):
- pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class MyCounter(Counter):
+ pass
c = MyCounter('slartibartfast')
d = c.copy()
self.assertEqual(d, c)
@@ -2402,10 +2462,5 @@ class TestCounter(unittest.TestCase):
@@ -2402,10 +2422,5 @@ class TestCounter(unittest.TestCase):
self.assertFalse(Counter(a=2, b=1, c=0) > Counter('aab'))
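
The patch above (and the rendered file that follows) keeps wrapping class-body definitions in torch._dynamo.set_fullgraph(fullgraph=False): Dynamo cannot trace dynamic class creation, so the context manager locally permits a graph break where fullgraph tracing would otherwise raise. The recurring pattern, condensed:

import torch
from collections import UserDict

with torch._dynamo.set_fullgraph(fullgraph=False):  # tolerate a graph break here
    class A(UserDict):
        def __missing__(self, key):
            return 456

assert A()[123] == 456  # __missing__ still fires on lookup misses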

View File

@ -93,10 +93,9 @@ class TestUserObjects(__TestCase):
self._copy_test(obj)
def test_dict_missing(self):
with torch._dynamo.set_fullgraph(fullgraph=False):
class A(UserDict):
def __missing__(self, key):
return 456
class A(UserDict):
def __missing__(self, key):
return 456
self.assertEqual(A()[123], 456)
# get() ignores __missing__ on dict
self.assertIs(A().get(123), None)
@ -193,10 +192,9 @@ class TestChainMap(__TestCase):
self.assertTrue(ChainMap({}, {1:2}))
def test_missing(self):
with torch._dynamo.set_fullgraph(fullgraph=False):
class DefaultChainMap(ChainMap):
def __missing__(self, key):
return 999
class DefaultChainMap(ChainMap):
def __missing__(self, key):
return 999
d = DefaultChainMap(dict(a=1, b=2), dict(b=20, c=30))
for k, v in dict(a=1, b=2, c=30, d=999).items():
self.assertEqual(d[k], v) # check __getitem__ w/missing
@ -228,14 +226,13 @@ class TestChainMap(__TestCase):
('i', 9999), ('j', 0)])
def test_iter_not_calling_getitem_on_maps(self):
with torch._dynamo.set_fullgraph(fullgraph=False):
class DictWithGetItem(UserDict):
def __init__(self, *args, **kwds):
self.called = False
UserDict.__init__(self, *args, **kwds)
def __getitem__(self, item):
self.called = True
UserDict.__getitem__(self, item)
class DictWithGetItem(UserDict):
def __init__(self, *args, **kwds):
self.called = False
UserDict.__init__(self, *args, **kwds)
def __getitem__(self, item):
self.called = True
UserDict.__getitem__(self, item)
d = DictWithGetItem(a=1)
c = ChainMap(d)
@ -260,16 +257,15 @@ class TestChainMap(__TestCase):
self.assertIs(m, d.maps[0])
# Use a different map than a dict
with torch._dynamo.set_fullgraph(fullgraph=False):
class lowerdict(dict):
def __getitem__(self, key):
if isinstance(key, str):
key = key.lower()
return dict.__getitem__(self, key)
def __contains__(self, key):
if isinstance(key, str):
key = key.lower()
return dict.__contains__(self, key)
class lowerdict(dict):
def __getitem__(self, key):
if isinstance(key, str):
key = key.lower()
return dict.__getitem__(self, key)
def __contains__(self, key):
if isinstance(key, str):
key = key.lower()
return dict.__contains__(self, key)
c = ChainMap()
c['a'] = 1
@ -690,9 +686,8 @@ class TestNamedTuple(__TestCase):
NT = namedtuple('NT', ['abc', 'def'], False, True)
def test_namedtuple_subclass_issue_24931(self):
with torch._dynamo.set_fullgraph(fullgraph=False):
class Point(namedtuple('_Point', ['x', 'y'])):
pass
class Point(namedtuple('_Point', ['x', 'y'])):
pass
a = Point(3, 4)
self.assertEqual(a._asdict(), OrderedDict([('x', 3), ('y', 4)]))
@ -753,20 +748,15 @@ class ABCTestCase(__TestCase):
methodstubs = dict.fromkeys(names, lambda s, *args: 0)
# everything should work when all required methods are present
with torch._dynamo.set_fullgraph(fullgraph=False):
C = type('C', (abc,), methodstubs)
C = type('C', (abc,), methodstubs)
C()
# Dynamo raises a hard error here that we can't easily capture
# Commenting this part as this would also fail in eager if a user
# attempts to run the same code
# instantiation should fail if a required method is missing
# for name in names:
# stubs = methodstubs.copy()
# del stubs[name]
# C = type('C', (abc,), stubs)
# self.assertRaises(TypeError, C, name)
for name in names:
stubs = methodstubs.copy()
del stubs[name]
C = type('C', (abc,), stubs)
self.assertRaises(TypeError, C, name)
def validate_isinstance(self, abc, name):
stub = lambda s, *args: 0
@ -1011,21 +1001,19 @@ class TestOneTrickPonyABCs(ABCTestCase):
for x in samples:
self.assertIsInstance(x, Iterable)
self.assertTrue(issubclass(type(x), Iterable), repr(type(x)))
with torch._dynamo.set_fullgraph(fullgraph=False):
# Check direct subclassing
class I(Iterable):
def __iter__(self):
return super().__iter__()
# Check direct subclassing
class I(Iterable):
def __iter__(self):
return super().__iter__()
self.assertEqual(list(I()), [])
self.assertFalse(issubclass(str, I))
self.validate_abstract_methods(Iterable, '__iter__')
self.validate_isinstance(Iterable, '__iter__')
with torch._dynamo.set_fullgraph(fullgraph=False):
# Check None blocking
class It:
def __iter__(self): return iter([])
class ItBlocked(It):
__iter__ = None
# Check None blocking
class It:
def __iter__(self): return iter([])
class ItBlocked(It):
__iter__ = None
self.assertTrue(issubclass(It, Iterable))
self.assertTrue(isinstance(It(), Iterable))
self.assertFalse(issubclass(ItBlocked, Iterable))
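A standalone illustration of the "None blocking" convention these assertions exercise: assigning None to a required method tells the ABC's structural subclass check that the protocol is explicitly unsupported.

from collections.abc import Iterable

class It:
    def __iter__(self): return iter([])

class ItBlocked(It):
    __iter__ = None  # signals "not iterable" to the structural check

assert issubclass(It, Iterable)
assert not issubclass(ItBlocked, Iterable)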
@ -1055,35 +1043,32 @@ class TestOneTrickPonyABCs(ABCTestCase):
self.assertTrue(issubclass(Sequence, Reversible), repr(Sequence))
self.assertFalse(issubclass(Mapping, Reversible), repr(Mapping))
self.assertFalse(issubclass(MutableMapping, Reversible), repr(MutableMapping))
with torch._dynamo.set_fullgraph(fullgraph=False):
# Check direct subclassing
class R(Reversible):
def __iter__(self):
return iter(list())
def __reversed__(self):
return iter(list())
# Check direct subclassing
class R(Reversible):
def __iter__(self):
return iter(list())
def __reversed__(self):
return iter(list())
self.assertEqual(list(reversed(R())), [])
self.assertFalse(issubclass(float, R))
self.validate_abstract_methods(Reversible, '__reversed__', '__iter__')
with torch._dynamo.set_fullgraph(fullgraph=False):
# Check reversible non-iterable (which is not Reversible)
class RevNoIter:
def __reversed__(self): return reversed([])
class RevPlusIter(RevNoIter):
def __iter__(self): return iter([])
# Check reversible non-iterable (which is not Reversible)
class RevNoIter:
def __reversed__(self): return reversed([])
class RevPlusIter(RevNoIter):
def __iter__(self): return iter([])
self.assertFalse(issubclass(RevNoIter, Reversible))
self.assertFalse(isinstance(RevNoIter(), Reversible))
self.assertTrue(issubclass(RevPlusIter, Reversible))
self.assertTrue(isinstance(RevPlusIter(), Reversible))
with torch._dynamo.set_fullgraph(fullgraph=False):
# Check None blocking
class Rev:
def __iter__(self): return iter([])
def __reversed__(self): return reversed([])
class RevItBlocked(Rev):
__iter__ = None
class RevRevBlocked(Rev):
__reversed__ = None
# Check None blocking
class Rev:
def __iter__(self): return iter([])
def __reversed__(self): return reversed([])
class RevItBlocked(Rev):
__iter__ = None
class RevRevBlocked(Rev):
__reversed__ = None
self.assertTrue(issubclass(Rev, Reversible))
self.assertTrue(isinstance(Rev(), Reversible))
self.assertFalse(issubclass(RevItBlocked, Reversible))
@ -1117,16 +1102,15 @@ class TestOneTrickPonyABCs(ABCTestCase):
self.assertTrue(issubclass(Set, Collection), repr(Set))
self.assertTrue(issubclass(MutableSet, Collection), repr(MutableSet))
self.assertTrue(issubclass(Sequence, Collection), repr(MutableSet))
with torch._dynamo.set_fullgraph(fullgraph=False):
# Check direct subclassing
class Col(Collection):
def __iter__(self):
return iter(list())
def __len__(self):
return 0
def __contains__(self, item):
return False
class DerCol(Col): pass
# Check direct subclassing
class Col(Collection):
def __iter__(self):
return iter(list())
def __len__(self):
return 0
def __contains__(self, item):
return False
class DerCol(Col): pass
self.assertEqual(list(iter(Col())), [])
self.assertFalse(issubclass(list, Col))
self.assertFalse(issubclass(set, Col))
@ -1138,48 +1122,44 @@ class TestOneTrickPonyABCs(ABCTestCase):
self.validate_abstract_methods(Collection, '__len__', '__iter__',
'__contains__')
# Check sized container non-iterable (which is not Collection) etc.
with torch._dynamo.set_fullgraph(fullgraph=False):
class ColNoIter:
def __len__(self): return 0
def __contains__(self, item): return False
class ColNoSize:
def __iter__(self): return iter([])
def __contains__(self, item): return False
class ColNoCont:
def __iter__(self): return iter([])
def __len__(self): return 0
class ColNoIter:
def __len__(self): return 0
def __contains__(self, item): return False
class ColNoSize:
def __iter__(self): return iter([])
def __contains__(self, item): return False
class ColNoCont:
def __iter__(self): return iter([])
def __len__(self): return 0
self.assertFalse(issubclass(ColNoIter, Collection))
self.assertFalse(isinstance(ColNoIter(), Collection))
self.assertFalse(issubclass(ColNoSize, Collection))
self.assertFalse(isinstance(ColNoSize(), Collection))
self.assertFalse(issubclass(ColNoCont, Collection))
self.assertFalse(isinstance(ColNoCont(), Collection))
with torch._dynamo.set_fullgraph(fullgraph=False):
# Check None blocking
class SizeBlock:
def __iter__(self): return iter([])
def __contains__(self): return False
__len__ = None
class IterBlock:
def __len__(self): return 0
def __contains__(self): return True
__iter__ = None
# Check None blocking
class SizeBlock:
def __iter__(self): return iter([])
def __contains__(self): return False
__len__ = None
class IterBlock:
def __len__(self): return 0
def __contains__(self): return True
__iter__ = None
self.assertFalse(issubclass(SizeBlock, Collection))
self.assertFalse(isinstance(SizeBlock(), Collection))
self.assertFalse(issubclass(IterBlock, Collection))
self.assertFalse(isinstance(IterBlock(), Collection))
with torch._dynamo.set_fullgraph(fullgraph=False):
# Check None blocking in subclass
class ColImpl:
def __iter__(self):
return iter(list())
def __len__(self):
return 0
def __contains__(self, item):
return False
class NonCol(ColImpl):
__contains__ = None
# Check None blocking in subclass
class ColImpl:
def __iter__(self):
return iter(list())
def __len__(self):
return 0
def __contains__(self, item):
return False
class NonCol(ColImpl):
__contains__ = None
self.assertFalse(issubclass(NonCol, Collection))
self.assertFalse(isinstance(NonCol(), Collection))
@ -1202,32 +1182,30 @@ class TestOneTrickPonyABCs(ABCTestCase):
self.assertTrue(issubclass(type(x), Iterator), repr(type(x)))
self.validate_abstract_methods(Iterator, '__next__', '__iter__')
with torch._dynamo.set_fullgraph(fullgraph=False):
# Issue 10565
class NextOnly:
def __next__(self):
yield 1
return
# Issue 10565
class NextOnly:
def __next__(self):
yield 1
return
self.assertNotIsInstance(NextOnly(), Iterator)
def test_Generator(self):
with torch._dynamo.set_fullgraph(fullgraph=False):
class NonGen1:
def __iter__(self): return self
def __next__(self): return None
def close(self): pass
def throw(self, typ, val=None, tb=None): pass
class NonGen1:
def __iter__(self): return self
def __next__(self): return None
def close(self): pass
def throw(self, typ, val=None, tb=None): pass
class NonGen2:
def __iter__(self): return self
def __next__(self): return None
def close(self): pass
def send(self, value): return value
class NonGen2:
def __iter__(self): return self
def __next__(self): return None
def close(self): pass
def send(self, value): return value
class NonGen3:
def close(self): pass
def send(self, value): return value
def throw(self, typ, val=None, tb=None): pass
class NonGen3:
def close(self): pass
def send(self, value): return value
def throw(self, typ, val=None, tb=None): pass
non_samples = [
None, 42, 3.14, 1j, b"", "", (), [], {}, set(),
@ -1236,19 +1214,18 @@ class TestOneTrickPonyABCs(ABCTestCase):
self.assertNotIsInstance(x, Generator)
self.assertFalse(issubclass(type(x), Generator), repr(type(x)))
with torch._dynamo.set_fullgraph(fullgraph=False):
class Gen:
def __iter__(self): return self
def __next__(self): return None
def close(self): pass
def send(self, value): return value
def throw(self, typ, val=None, tb=None): pass
class Gen:
def __iter__(self): return self
def __next__(self): return None
def close(self): pass
def send(self, value): return value
def throw(self, typ, val=None, tb=None): pass
class MinimalGen(Generator):
def send(self, value):
return value
def throw(self, typ, val=None, tb=None):
super().throw(typ, val, tb)
class MinimalGen(Generator):
def send(self, value):
return value
def throw(self, typ, val=None, tb=None):
super().throw(typ, val, tb)
def gen():
yield 1
@ -1271,17 +1248,15 @@ class TestOneTrickPonyABCs(ABCTestCase):
mgen.throw, ValueError, ValueError("huhu"))
self.assertRaises(StopIteration, mgen.throw, StopIteration())
with torch._dynamo.set_fullgraph(fullgraph=False):
class FailOnClose(Generator):
def send(self, value): return value
def throw(self, *args): raise ValueError
class FailOnClose(Generator):
def send(self, value): return value
def throw(self, *args): raise ValueError
self.assertRaises(ValueError, FailOnClose().close)
with torch._dynamo.set_fullgraph(fullgraph=False):
class IgnoreGeneratorExit(Generator):
def send(self, value): return value
def throw(self, *args): pass
class IgnoreGeneratorExit(Generator):
def send(self, value): return value
def throw(self, *args): pass
self.assertRaises(RuntimeError, IgnoreGeneratorExit().close)
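For context on the two assertions above: the Generator mixin's close() calls self.throw(GeneratorExit), suppresses GeneratorExit/StopIteration, lets any other exception escape, and reports a swallowed GeneratorExit as a RuntimeError. A minimal sketch:

from collections.abc import Generator

class Swallow(Generator):
    def send(self, value): return value
    def throw(self, *args): pass  # silently ignores GeneratorExit

try:
    Swallow().close()
except RuntimeError as exc:
    print("close() raised:", exc)  # generator ignored GeneratorExit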
@ -1424,17 +1399,15 @@ class TestOneTrickPonyABCs(ABCTestCase):
def test_direct_subclassing(self):
for B in Hashable, Iterable, Iterator, Reversible, Sized, Container, Callable:
with torch._dynamo.set_fullgraph(fullgraph=False):
class C(B):
pass
class C(B):
pass
self.assertTrue(issubclass(C, B))
self.assertFalse(issubclass(int, C))
def test_registration(self):
for B in Hashable, Iterable, Iterator, Reversible, Sized, Container, Callable:
with torch._dynamo.set_fullgraph(fullgraph=False):
class C:
__hash__ = None # Make sure it isn't hashable by default
class C:
__hash__ = None # Make sure it isn't hashable by default
self.assertFalse(issubclass(C, B), B.__name__)
B.register(C)
self.assertTrue(issubclass(C, B))
@ -1470,14 +1443,13 @@ class TestCollectionABCs(ABCTestCase):
self.assertIsInstance(sample(), Set)
self.assertTrue(issubclass(sample, Set))
self.validate_abstract_methods(Set, '__contains__', '__iter__', '__len__')
with torch._dynamo.set_fullgraph(fullgraph=False):
class MySet(Set):
def __contains__(self, x):
return False
def __len__(self):
return 0
def __iter__(self):
return iter([])
class MySet(Set):
def __contains__(self, x):
return False
def __len__(self):
return 0
def __iter__(self):
return iter([])
self.validate_comparison(MySet())
def test_hash_Set(self):
@ -1496,16 +1468,15 @@ class TestCollectionABCs(ABCTestCase):
self.assertTrue(hash(a) == hash(b))
def test_isdisjoint_Set(self):
with torch._dynamo.set_fullgraph(fullgraph=False):
class MySet(Set):
def __init__(self, itr):
self.contents = itr
def __contains__(self, x):
return x in self.contents
def __iter__(self):
return iter(self.contents)
def __len__(self):
return len([x for x in self.contents])
class MySet(Set):
def __init__(self, itr):
self.contents = itr
def __contains__(self, x):
return x in self.contents
def __iter__(self):
return iter(self.contents)
def __len__(self):
return len([x for x in self.contents])
s1 = MySet((1, 2, 3))
s2 = MySet((4, 5, 6))
s3 = MySet((1, 5, 6))
@ -1513,16 +1484,15 @@ class TestCollectionABCs(ABCTestCase):
self.assertFalse(s1.isdisjoint(s3))
def test_equality_Set(self):
with torch._dynamo.set_fullgraph(fullgraph=False):
class MySet(Set):
def __init__(self, itr):
self.contents = itr
def __contains__(self, x):
return x in self.contents
def __iter__(self):
return iter(self.contents)
def __len__(self):
return len([x for x in self.contents])
class MySet(Set):
def __init__(self, itr):
self.contents = itr
def __contains__(self, x):
return x in self.contents
def __iter__(self):
return iter(self.contents)
def __len__(self):
return len([x for x in self.contents])
s1 = MySet((1,))
s2 = MySet((1, 2))
s3 = MySet((3, 4))
@ -1536,16 +1506,15 @@ class TestCollectionABCs(ABCTestCase):
self.assertNotEqual(s2, s3)
def test_arithmetic_Set(self):
with torch._dynamo.set_fullgraph(fullgraph=False):
class MySet(Set):
def __init__(self, itr):
self.contents = itr
def __contains__(self, x):
return x in self.contents
def __iter__(self):
return iter(self.contents)
def __len__(self):
return len([x for x in self.contents])
class MySet(Set):
def __init__(self, itr):
self.contents = itr
def __contains__(self, x):
return x in self.contents
def __iter__(self):
return iter(self.contents)
def __len__(self):
return len([x for x in self.contents])
s1 = MySet((1, 2, 3))
s2 = MySet((3, 4, 5))
s3 = s1 & s2
@ -1567,29 +1536,28 @@ class TestCollectionABCs(ABCTestCase):
def test_issue_4920(self):
# MutableSet.pop() method did not work
with torch._dynamo.set_fullgraph(fullgraph=False):
class MySet(MutableSet):
__slots__=['__s']
def __init__(self,items=None):
if items is None:
items=[]
self.__s=set(items)
def __contains__(self,v):
return v in self.__s
def __iter__(self):
return iter(self.__s)
def __len__(self):
return len(self.__s)
def add(self,v):
result=v not in self.__s
self.__s.add(v)
return result
def discard(self,v):
result=v in self.__s
self.__s.discard(v)
return result
def __repr__(self):
return "MySet(%s)" % repr(list(self))
class MySet(MutableSet):
__slots__=['__s']
def __init__(self,items=None):
if items is None:
items=[]
self.__s=set(items)
def __contains__(self,v):
return v in self.__s
def __iter__(self):
return iter(self.__s)
def __len__(self):
return len(self.__s)
def add(self,v):
result=v not in self.__s
self.__s.add(v)
return result
def discard(self,v):
result=v in self.__s
self.__s.discard(v)
return result
def __repr__(self):
return "MySet(%s)" % repr(list(self))
items = [5,43,2,1]
s = MySet(items)
r = s.pop()
@ -1615,25 +1583,24 @@ class TestCollectionABCs(ABCTestCase):
def test_issue16373(self):
# Recursion error comparing comparable and noncomparable
# Set instances
with torch._dynamo.set_fullgraph(fullgraph=False):
class MyComparableSet(Set):
def __contains__(self, x):
return False
def __len__(self):
return 0
def __iter__(self):
return iter([])
class MyNonComparableSet(Set):
def __contains__(self, x):
return False
def __len__(self):
return 0
def __iter__(self):
return iter([])
def __le__(self, x):
return NotImplemented
def __lt__(self, x):
return NotImplemented
class MyComparableSet(Set):
def __contains__(self, x):
return False
def __len__(self):
return 0
def __iter__(self):
return iter([])
class MyNonComparableSet(Set):
def __contains__(self, x):
return False
def __len__(self):
return 0
def __iter__(self):
return iter([])
def __le__(self, x):
return NotImplemented
def __lt__(self, x):
return NotImplemented
cs = MyComparableSet()
ncs = MyNonComparableSet()
@ -1644,14 +1611,13 @@ class TestCollectionABCs(ABCTestCase):
def test_issue26915(self):
# Container membership test should check identity first
with torch._dynamo.set_fullgraph(fullgraph=False):
class CustomSequence(Sequence):
def __init__(self, seq):
self._seq = seq
def __getitem__(self, index):
return self._seq[index]
def __len__(self):
return len(self._seq)
class CustomSequence(Sequence):
def __init__(self, seq):
self._seq = seq
def __getitem__(self, index):
return self._seq[index]
def __len__(self):
return len(self._seq)
nan = float('nan')
obj = support.NEVER_EQ
@ -1676,31 +1642,30 @@ class TestCollectionABCs(ABCTestCase):
def test_Set_from_iterable(self):
"""Verify _from_iterable overridden to an instance method works."""
with torch._dynamo.set_fullgraph(fullgraph=False):
class SetUsingInstanceFromIterable(MutableSet):
def __init__(self, values, created_by):
if not created_by:
raise ValueError('created_by must be specified')
self.created_by = created_by
self._values = set(values)
class SetUsingInstanceFromIterable(MutableSet):
def __init__(self, values, created_by):
if not created_by:
raise ValueError('created_by must be specified')
self.created_by = created_by
self._values = set(values)
def _from_iterable(self, values):
return type(self)(values, 'from_iterable')
def _from_iterable(self, values):
return type(self)(values, 'from_iterable')
def __contains__(self, value):
return value in self._values
def __contains__(self, value):
return value in self._values
def __iter__(self):
yield from self._values
def __iter__(self):
yield from self._values
def __len__(self):
return len(self._values)
def __len__(self):
return len(self._values)
def add(self, value):
self._values.add(value)
def add(self, value):
self._values.add(value)
def discard(self, value):
self._values.discard(value)
def discard(self, value):
self._values.discard(value)
impl = SetUsingInstanceFromIterable([1, 2, 3], 'test')
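Background for this test, sketched with a hypothetical TaggedSet: the Set mixins (e.g. __and__) build their results through _from_iterable, which is a classmethod by default, so overriding it (here as an instance method) lets derived sets carry state:

from collections.abc import Set

class TaggedSet(Set):
    def __init__(self, values, tag='direct'):
        self._values, self.tag = set(values), tag
    def _from_iterable(self, values):  # instance-method override
        return TaggedSet(values, tag='derived')
    def __contains__(self, v): return v in self._values
    def __iter__(self): return iter(self._values)
    def __len__(self): return len(self._values)

r = TaggedSet([1, 2, 3]) & TaggedSet([2, 3])
assert r.tag == 'derived'  # the result was built via _from_iterable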
@ -1733,21 +1698,20 @@ class TestCollectionABCs(ABCTestCase):
def test_Set_interoperability_with_real_sets(self):
# Issue: 8743
with torch._dynamo.set_fullgraph(fullgraph=False):
class ListSet(Set):
def __init__(self, elements=()):
self.data = []
for elem in elements:
if elem not in self.data:
self.data.append(elem)
def __contains__(self, elem):
return elem in self.data
def __iter__(self):
return iter(self.data)
def __len__(self):
return len(self.data)
def __repr__(self):
return 'Set({!r})'.format(self.data)
class ListSet(Set):
def __init__(self, elements=()):
self.data = []
for elem in elements:
if elem not in self.data:
self.data.append(elem)
def __contains__(self, elem):
return elem in self.data
def __iter__(self):
return iter(self.data)
def __len__(self):
return len(self.data)
def __repr__(self):
return 'Set({!r})'.format(self.data)
r1 = set('abc')
r2 = set('bcd')
@ -1902,14 +1866,13 @@ class TestCollectionABCs(ABCTestCase):
self.assertTrue(issubclass(sample, Mapping))
self.validate_abstract_methods(Mapping, '__contains__', '__iter__', '__len__',
'__getitem__')
with torch._dynamo.set_fullgraph(fullgraph=False):
class MyMapping(Mapping):
def __len__(self):
return 0
def __getitem__(self, i):
raise IndexError
def __iter__(self):
return iter(())
class MyMapping(Mapping):
def __len__(self):
return 0
def __getitem__(self, i):
raise IndexError
def __iter__(self):
return iter(())
self.validate_comparison(MyMapping())
self.assertRaises(TypeError, reversed, MyMapping())
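A standalone sketch of what the Mapping ABC supplies for free once __getitem__, __len__ and __iter__ exist, which is what the validate_comparison call above relies on: containment, get(), the view methods and equality all come from mixins.

from collections.abc import Mapping

class OneKey(Mapping):
    def __len__(self): return 1
    def __iter__(self): return iter(('k',))
    def __getitem__(self, key):
        if key == 'k':
            return 42
        raise KeyError(key)

m = OneKey()
assert 'k' in m and m.get('missing', 0) == 0  # mixin __contains__/get
assert dict(m.items()) == {'k': 42}           # mixin items() view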
@ -1917,7 +1880,7 @@ class TestCollectionABCs(ABCTestCase):
for sample in [dict]:
self.assertIsInstance(sample(), MutableMapping)
self.assertTrue(issubclass(sample, MutableMapping))
self.validate_abstract_methods(MutableMapping, '__iter__', '__len__',
self.validate_abstract_methods(MutableMapping, '__contains__', '__iter__', '__len__',
'__getitem__', '__setitem__', '__delitem__')
def test_MutableMapping_subclass(self):
@ -1960,16 +1923,15 @@ class TestCollectionABCs(ABCTestCase):
'__getitem__')
def test_Sequence_mixins(self):
with torch._dynamo.set_fullgraph(fullgraph=False):
class SequenceSubclass(Sequence):
def __init__(self, seq=()):
self.seq = seq
class SequenceSubclass(Sequence):
def __init__(self, seq=()):
self.seq = seq
def __getitem__(self, index):
return self.seq[index]
def __getitem__(self, index):
return self.seq[index]
def __len__(self):
return len(self.seq)
def __len__(self):
return len(self.seq)
# Compare Sequence.index() behavior to (list|str).index() behavior
def assert_index_same(seq1, seq2, index_args):
@ -2041,25 +2003,24 @@ class TestCollectionABCs(ABCTestCase):
def test_MutableSequence_mixins(self):
# Test the mixins of MutableSequence by creating a minimal concrete
# class inherited from it.
with torch._dynamo.set_fullgraph(fullgraph=False):
class MutableSequenceSubclass(MutableSequence):
def __init__(self):
self.lst = []
class MutableSequenceSubclass(MutableSequence):
def __init__(self):
self.lst = []
def __setitem__(self, index, value):
self.lst[index] = value
def __setitem__(self, index, value):
self.lst[index] = value
def __getitem__(self, index):
return self.lst[index]
def __getitem__(self, index):
return self.lst[index]
def __len__(self):
return len(self.lst)
def __len__(self):
return len(self.lst)
def __delitem__(self, index):
del self.lst[index]
def __delitem__(self, index):
del self.lst[index]
def insert(self, index, value):
self.lst.insert(index, value)
def insert(self, index, value):
self.lst.insert(index, value)
mss = MutableSequenceSubclass()
mss.append(0)
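A compact sketch of the mixin surface this test exercises: once __getitem__, __setitem__, __delitem__, __len__ and insert exist, MutableSequence supplies append, extend, pop, remove, reverse, += and membership for free.

from collections.abc import MutableSequence

class MiniList(MutableSequence):
    def __init__(self): self.lst = []
    def __getitem__(self, i): return self.lst[i]
    def __setitem__(self, i, v): self.lst[i] = v
    def __delitem__(self, i): del self.lst[i]
    def __len__(self): return len(self.lst)
    def insert(self, i, v): self.lst.insert(i, v)

m = MiniList()
m.append(1); m.extend([2, 3]); m += [4]  # all inherited mixin methods
assert list(m) == [1, 2, 3, 4] and m.pop() == 4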
@ -2284,9 +2245,8 @@ class TestCounter(__TestCase):
check(Counter(words))
def test_copy_subclass(self):
with torch._dynamo.set_fullgraph(fullgraph=False):
class MyCounter(Counter):
pass
class MyCounter(Counter):
pass
c = MyCounter('slartibartfast')
d = c.copy()
self.assertEqual(d, c)
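The test boils down to this standalone check: Counter.copy() constructs the copy via self.__class__, so both the subclass type and the contents survive.

from collections import Counter

class MyCounter(Counter):
    pass

c = MyCounter('slartibartfast')
d = c.copy()
assert d == c and type(d) is MyCounter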

View File

@ -1,5 +1,5 @@
diff --git a/test/dynamo/cpython/3_13/test_itertools.py b/test/dynamo/cpython/3_13/test_itertools.py
index 7d5ba727389..8d462284884 100644
index 7d5ba727389..d15d83a2184 100644
--- a/test/dynamo/cpython/3_13/test_itertools.py
+++ b/test/dynamo/cpython/3_13/test_itertools.py
@@ -1,3 +1,25 @@
@ -151,7 +151,7 @@ index 7d5ba727389..8d462284884 100644
_, g = next(it)
next(it)
next(it)
@@ -1002,29 +1015,30 @@ class TestBasicOps(unittest.TestCase):
@@ -1002,27 +1015,29 @@ class TestBasicOps(unittest.TestCase):
self.assertEqual(list(filter(None, [0,1,0,2,0])), [1,2])
self.assertEqual(list(filter(bool, [0,1,0,2,0])), [1,2])
self.assertEqual(take(4, filter(isEven, count())), [0,2,4,6])
@ -198,24 +198,8 @@ index 7d5ba727389..8d462284884 100644
+ # c = filter(isEven, range(6))
+ # self.pickletest(proto, c)
- @pickle_deprecated
@pickle_deprecated
def test_filterfalse(self):
self.assertEqual(list(filterfalse(isEven, range(6))), [1,3,5])
self.assertEqual(list(filterfalse(None, [0,1,0,2,0])), [0,0,0])
@@ -1034,9 +1048,10 @@ class TestBasicOps(unittest.TestCase):
self.assertRaises(TypeError, filterfalse, lambda x:x)
self.assertRaises(TypeError, filterfalse, lambda x:x, range(6), 7)
self.assertRaises(TypeError, filterfalse, isEven, 3)
- self.assertRaises(TypeError, next, filterfalse(range(6), range(6)))
- for proto in range(pickle.HIGHEST_PROTOCOL + 1):
- self.pickletest(proto, filterfalse(isEven, range(6)))
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ self.assertRaises(TypeError, next, filterfalse(range(6), range(6)))
+ for proto in range(pickle.HIGHEST_PROTOCOL + 1):
+ self.pickletest(proto, filterfalse(isEven, range(6)))
def test_zip(self):
# XXX This is rather silly now that builtin zip() calls zip()...
@@ -1047,8 +1062,8 @@ class TestBasicOps(unittest.TestCase):
self.assertEqual(take(3,zip('abcdef', count())), lzip('abcdef', range(3)))
self.assertEqual(list(zip('abcdef')), lzip('abcdef'))

View File

@ -1039,6 +1039,7 @@ class TestBasicOps(__TestCase):
# c = filter(isEven, range(6))
# self.pickletest(proto, c)
@pickle_deprecated
def test_filterfalse(self):
self.assertEqual(list(filterfalse(isEven, range(6))), [1,3,5])
self.assertEqual(list(filterfalse(None, [0,1,0,2,0])), [0,0,0])
@ -1048,10 +1049,9 @@ class TestBasicOps(__TestCase):
self.assertRaises(TypeError, filterfalse, lambda x:x)
self.assertRaises(TypeError, filterfalse, lambda x:x, range(6), 7)
self.assertRaises(TypeError, filterfalse, isEven, 3)
with torch._dynamo.set_fullgraph(fullgraph=False):
self.assertRaises(TypeError, next, filterfalse(range(6), range(6)))
for proto in range(pickle.HIGHEST_PROTOCOL + 1):
self.pickletest(proto, filterfalse(isEven, range(6)))
self.assertRaises(TypeError, next, filterfalse(range(6), range(6)))
for proto in range(pickle.HIGHEST_PROTOCOL + 1):
self.pickletest(proto, filterfalse(isEven, range(6)))
def test_zip(self):
# XXX This is rather silly now that builtin zip() calls zip()...

View File

@ -1742,83 +1742,6 @@ class GraphModule(torch.nn.Module):
opt_f = torch.compile(f, backend="eager")
opt_f(torch.randn(2, 2))
# Regression test to make sure dynamo won't crash on these kwargs.
def test_sdpa_kernel_ctx_manager_kwargs(self):
backends = [torch.nn.attention.SDPBackend.MATH]
@torch._dynamo.allow_in_graph
def check_backend_state_is_modified():
self.assertEqual(
set(torch.nn.attention._cur_sdpa_kernel_backends()),
set(backends),
)
def f(x):
with torch.nn.attention.sdpa_kernel(backends=backends, set_priority=True):
x = x + 1
check_backend_state_is_modified()
x = x + 1
return x
opt_f = torch.compile(f, backend="eager")
opt_f(torch.randn(2, 2))
# Regression test to make sure dynamo won't graph break on calling functions
# decorated with a special context manager.
def test_sdpa_kernel_ctx_manager_as_decorator(self):
SDPA_BACKEND_PRIORITY = [
torch.nn.attention.SDPBackend.MATH,
torch.nn.attention.SDPBackend.EFFICIENT_ATTENTION,
torch.nn.attention.SDPBackend.FLASH_ATTENTION,
]
@torch.nn.attention.sdpa_kernel(
backends=SDPA_BACKEND_PRIORITY, set_priority=True
)
def scaled_dot_product_attention(q, k, v, *args, **kwargs):
return torch.nn.functional.scaled_dot_product_attention(
q, k, v, *args, **kwargs
)
def f(x):
return scaled_dot_product_attention(x, x, x)
opt_f = torch.compile(f, backend="eager", fullgraph=True)
x = torch.rand(16, 16, 64, 256, dtype=torch.float16)
ref = f(x)
res = opt_f(x)
self.assertEqual(ref, res)
# Regression test to make sure the value of set_priority is used correctly.
def test_sdpa_kernel_ctx_manager_set_priority(self):
backends = [torch.nn.attention.SDPBackend.MATH]
default_priority = torch._C._get_sdp_priority_order()
@torch._dynamo.allow_in_graph
def check_backend_priority(changed: bool):
self.assertEqual(
changed,
torch._C._get_sdp_priority_order() != default_priority,
)
def f(x):
with torch.nn.attention.sdpa_kernel(backends=backends, set_priority=True):
x = x + 1
check_backend_priority(changed=True)
x = x + 1
with torch.nn.attention.sdpa_kernel(backends=backends, set_priority=False):
x = x + 1
check_backend_priority(changed=False)
x = x + 1
return x
opt_f = torch.compile(f, backend="eager")
opt_f(torch.randn(2, 2))
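For reference, a minimal sketch of the sdpa_kernel API the removed tests exercised (per the tests above, set_priority=True additionally treats the backend list as a priority order):

import torch
from torch.nn.attention import SDPBackend, sdpa_kernel

q = k = v = torch.rand(2, 4, 8, 16)
with sdpa_kernel(backends=[SDPBackend.MATH], set_priority=True):
    out = torch.nn.functional.scaled_dot_product_attention(q, k, v)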
def test_torch_profiler_use_after_with_block(self):
counters.clear()

View File

@ -310,12 +310,6 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
itertools.permutations(filter(lambda x: True, [1, 2]))
return a
@make_test
def test_itertools_filterfalse_basic(a, b):
for x in itertools.filterfalse(lambda x: x > 0, [-0.5, 0, 0.5]):
a += x
return a
@make_test
def test_itertools_chain(a, b):
v = a
@ -568,11 +562,6 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
args = [a, b]
return sub(*args)
@make_test
def test_tuple_map(a, b):
t = tuple(map(torch.sin, [a, b]))
return t[0] + t[1]
def test_size_tuple_add(self):
def fn():
size = torch.Size([])
@ -2027,21 +2016,6 @@ class FunctionTests(torch._dynamo.test_case.TestCase):
tmp = mytuple(a, xy=b)
return mytuple(tmp.x, tmp[1], tmp.xy + b)
@make_test
def test_namedtuple_replace(a, b):
mytuple = collections.namedtuple("mytuple", ["x", "y"])
t = mytuple(a, b)
t._replace(x=b)
return t.x + t.y
@make_test
def test_namedtuple_fields(a, b):
mytuple = collections.namedtuple("mytuple", ["x", "y"])
if mytuple._fields == ("x", "y"):
return a + b
else:
return a - b
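The two removed tests lean on standard namedtuple helpers; a standalone sketch of their semantics (note that _replace returns a new tuple rather than mutating, which is why t._replace(x=b) above left t unchanged):

import collections

mytuple = collections.namedtuple("mytuple", ["x", "y"])
t = mytuple(1, 2)
assert t._replace(x=10) == mytuple(10, 2)
assert t == mytuple(1, 2)            # _replace did not mutate t
assert mytuple._fields == ("x", "y")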
class MyNamedTuple(NamedTuple):
first: torch.Tensor
second: torch.Tensor

View File

@ -4,16 +4,13 @@ import contextlib
import torch
import torch.fx
from torch._dynamo.graph_deduplication import apply_graph_deduplication
from torch._dynamo.graph_utils import _detect_cycles
from torch._dynamo.output_graph import FakeRootModule
from torch._dynamo.test_case import TestCase
from torch._dynamo.testing import (
AotEagerAndRecordGraphs,
extract_graph_and_tracker,
normalize_gm,
)
from torch.compiler import allow_in_graph
from torch.utils._ordered_set import OrderedSet
@ -1109,104 +1106,6 @@ def forward(self, L_x_ : torch.Tensor, L_y_ : torch.Tensor):
""",
)
def test_tuple_return(self):
@allow_in_graph
def tuple_return(x, y):
return x, y
def inner_fn(x, y):
x0 = x + x + 1
y0 = y + y + 1
return tuple_return(x0, y0)
def fn(x0, x1, x2, y0, y1, y2):
x0 = inner_fn(x0, y0)
x1 = inner_fn(x1, y1)
x2 = inner_fn(x2, y2)
return x0, x1, x2
fn_opt = torch.compile(fn, fullgraph=True)
inps = [torch.rand(10, 10) for _ in range(6)]
result_compiled = fn_opt(*inps)
result_eager = fn(*inps)
self.assertEqual(result_compiled, result_eager)
def test_tuple_inputs(self):
with (
torch._dynamo.config.patch("use_graph_deduplication", False),
torch._dynamo.config.patch("track_nodes_for_deduplication", True),
):
def inner(x, y):
x0, x1 = torch.split(x, 5)
return x0 + x1 + y
def fn(x, y):
o1 = inner(x, y)
o2 = inner(x, y)
o3 = inner(x, y)
o4 = inner(x, y)
return o1.sum() + o2.sum() + o3.sum() + o4.sum()
graph, tracker = extract_graph_and_tracker(
fn, torch.rand(10, 10), torch.rand(5, 10)
)
class MockOutputGraph:
def __init__(self):
self.graph = graph
self.region_tracker = tracker
self.nn_modules = FakeRootModule({})
def install_subgraph(self, name, subgraph):
return ""
splits = [
n
for n in graph.nodes
if n.op == "call_function" and n.target == torch.split
]
for split in splits:
tracker.node_to_duplicates.pop(split)
apply_graph_deduplication(MockOutputGraph())
self.assertExpectedInline(
graph,
"""\
graph():
%_unnamed : [num_users=4] = get_attr[target=]
%l_x_ : torch.Tensor [num_users=4] = placeholder[target=L_x_]
%l_y_ : torch.Tensor [num_users=4] = placeholder[target=L_y_]
%split : [num_users=2] = call_function[target=torch.functional.split](args = (%l_x_, 5), kwargs = {})
%x0 : [num_users=1] = call_function[target=operator.getitem](args = (%split, 0), kwargs = {})
%x1 : [num_users=1] = call_function[target=operator.getitem](args = (%split, 1), kwargs = {})
%split_1 : [num_users=2] = call_function[target=torch.functional.split](args = (%l_x_, 5), kwargs = {})
%x0_1 : [num_users=1] = call_function[target=operator.getitem](args = (%split_1, 0), kwargs = {})
%x1_1 : [num_users=1] = call_function[target=operator.getitem](args = (%split_1, 1), kwargs = {})
%split_2 : [num_users=2] = call_function[target=torch.functional.split](args = (%l_x_, 5), kwargs = {})
%x0_2 : [num_users=1] = call_function[target=operator.getitem](args = (%split_2, 0), kwargs = {})
%x1_2 : [num_users=1] = call_function[target=operator.getitem](args = (%split_2, 1), kwargs = {})
%split_3 : [num_users=2] = call_function[target=torch.functional.split](args = (%l_x_, 5), kwargs = {})
%x0_3 : [num_users=1] = call_function[target=operator.getitem](args = (%split_3, 0), kwargs = {})
%x1_3 : [num_users=1] = call_function[target=operator.getitem](args = (%split_3, 1), kwargs = {})
%invoke_subgraph : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%_unnamed, , %x0, %x1, %l_y_), kwargs = {})
%getitem_8 : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph, 0), kwargs = {})
%sum_1 : [num_users=1] = call_method[target=sum](args = (%getitem_8,), kwargs = {})
%invoke_subgraph_1 : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%_unnamed, , %x0_1, %x1_1, %l_y_), kwargs = {})
%getitem_9 : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph_1, 0), kwargs = {})
%sum_2 : [num_users=1] = call_method[target=sum](args = (%getitem_9,), kwargs = {})
%add_8 : [num_users=1] = call_function[target=operator.add](args = (%sum_1, %sum_2), kwargs = {})
%invoke_subgraph_2 : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%_unnamed, , %x0_2, %x1_2, %l_y_), kwargs = {})
%getitem_10 : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph_2, 0), kwargs = {})
%sum_3 : [num_users=1] = call_method[target=sum](args = (%getitem_10,), kwargs = {})
%add_9 : [num_users=1] = call_function[target=operator.add](args = (%add_8, %sum_3), kwargs = {})
%invoke_subgraph_3 : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%_unnamed, , %x0_3, %x1_3, %l_y_), kwargs = {})
%getitem_11 : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph_3, 0), kwargs = {})
%sum_4 : [num_users=1] = call_method[target=sum](args = (%getitem_11,), kwargs = {})
%add_10 : [num_users=1] = call_function[target=operator.add](args = (%add_9, %sum_4), kwargs = {})
return (add_10,)""",
)
def test_param_transfer_to_submodule(self):
def inner_fn(x, y):
return x + y + y + x

View File

@ -9,6 +9,28 @@ from torch._dynamo.testing import extract_graph_and_tracker
from torch.utils._pytree import tree_map
def get_nodes_by_name(graph, names):
nodes = []
for node in graph.nodes:
if node.name in names:
nodes.append(node)
return nodes
unique_ind = 0
def track_same_nodes(names, graph, region_tracker):
global unique_ind
unique_ind += 1
# find nodes in graph with names and track them
# as if they were at the same code location
nodes = get_nodes_by_name(graph, names)
for node in nodes:
region_tracker.track_node("x", unique_ind, node)
class GraphRegionTrackerTests(TestCase):
def setUp(self):
self.exit_stack = contextlib.ExitStack()

View File

@ -1205,45 +1205,6 @@ class TagSafetyChecks(RecursiveDictTagTests):
with install_guard_manager_testing_hook(hook):
opt_fn(torch.randn(4, 4))
def test_nn_module_tag_overridden_getattr_safe(self):
class Baz(torch.nn.Module, metaclass=abc.ABCMeta):
def __init__(self):
super().__init__()
self.norm = 2
def __getattr__(self, key):
if key == "a":
return 5
return super().__getattr__(key)
def forward(self, x):
return x + self.a + self.norm
baz = Baz()
def fn(x):
x = x + baz(x)
return x
try:
from .utils import install_guard_manager_testing_hook
except ImportError:
from utils import install_guard_manager_testing_hook
def hook(guard_wrapper, f_locals, builder):
from torch._dynamo.source import LocalSource
baz_source = LocalSource("baz")
# Check tagness of baz
baz_mgr = builder.get_guard_manager_from_source(baz_source)
self.assertTrue(baz_mgr.is_tag_safe())
self.assertTrue(baz_mgr.is_tag_safe_root())
opt_fn = torch.compile(fn, backend="eager", fullgraph=True)
with install_guard_manager_testing_hook(hook):
opt_fn(torch.randn(4, 4))
class RecursiveDictGuardTests(RecursiveDictTagTests):
def test_disabling(self):

View File

@ -261,7 +261,6 @@ class TestGuardSerialization(torch._inductor.test_case.TestCase):
def _test_serialization(self, guard_type, fn, *args, **kwargs):
# kwargs might contain a callable that generates kwargs
torch._dynamo.reset()
kwarg_gen_fn = kwargs.get("_gen_fn", None)
if kwarg_gen_fn is not None:
kwargs = kwarg_gen_fn()
@ -347,7 +346,7 @@ class TestGuardSerialization(torch._inductor.test_case.TestCase):
self._frame_state.f_code,
tracer.output,
guard_filter_fn=guard_filter_fn,
save_guards=True,
guards_serialization_mode="save",
)
guards_state = check_fn_manager.guards_state
self._cached_guards_state = guards_state
@ -358,6 +357,7 @@ class TestGuardSerialization(torch._inductor.test_case.TestCase):
check_fn_manager = CheckFunctionManager(
self._frame_state.f_code,
guards_state.output_graph,
guards_serialization_mode="load",
shape_code_parts=guards_state.shape_code_parts,
runtime_global_scope=self._frame_state.f_globals,
)
@ -1180,6 +1180,7 @@ class TestGuardSerialization(torch._inductor.test_case.TestCase):
check_fn_manager = CheckFunctionManager(
self._cached_f_code,
guards_state.output_graph,
guards_serialization_mode="load",
shape_code_parts=guards_state.shape_code_parts,
)
loaded = check_fn_manager.guard_manager

View File

@ -1705,17 +1705,16 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
if hasattr(packed, "b"):
b = packed.b + 1
c = packed[2]
d = len(packed._fields)
return a + b + c + d
return a + b + c
v1 = torch.Tensor([1])
v2 = torch.Tensor([2])
v3 = torch.Tensor([3])
cnts = torch._dynamo.testing.CompileCounter()
opt_fn = torch.compile(fn, backend=cnts)
self.assertEqual(opt_fn(MyTuple(v1, v2, v3))[0], 10)
self.assertEqual(opt_fn(MyTuple(v1, v2, v3))[0], 7)
self.assertEqual(cnts.frame_count, 1)
self.assertEqual(cnts.op_count, 4)
self.assertEqual(cnts.op_count, 3)
def test_namedtuple3(self):
def fn(x, packed):
@ -1962,31 +1961,6 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
self.assertEqual(exp, act)
def test_class_binop(self):
class Foo:
def __init__(self, x):
self.x = x
def __add__(self, other):
return Foo(self.x + other.x)
def fn(a, b):
return a + b
x = torch.randn(2)
a, b = Foo(x), Foo(x + 1)
cnts = torch._dynamo.testing.CompileCounter()
opt_fn = torch.compile(fn, backend=cnts)
self.assertEqual(opt_fn(a, b).x, 2 * x + 1)
self.assertEqual(cnts.frame_count, 1)
self.assertEqual(cnts.op_count, 1)
def fn(a, b):
return a - b
opt_fn = torch.compile(fn, backend=cnts, fullgraph=True)
self.assertRaises(torch._dynamo.exc.Unsupported, opt_fn, a, b)
def test_user_getattr1(self):
class MyConfig(dict):
def __getattr__(self, name):
@ -8572,6 +8546,7 @@ utils_device.CURRENT_DEVICE == None""".split("\n"):
guard_manager = torch._dynamo.guards.CheckFunctionManager(
foo.__code__,
guards_state.output_graph,
guards_serialization_mode="load",
shape_code_parts=guards_state.shape_code_parts,
runtime_global_scope=new_globals,
).guard_manager

View File

@ -16,7 +16,7 @@ from torch._dynamo.package import CompilePackage, DiskDynamoStore, DynamoCache
from torch._dynamo.precompile_context import PrecompileContext
from torch._dynamo.testing import reduce_to_scalar_loss
from torch._functorch import config as functorch_config
from torch._inductor.mock_cache import global_stats, PatchCaches
from torch._inductor.mock_cache import global_stats, PatchCaches, Stats
from torch._inductor.runtime.runtime_utils import cache_dir
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
@ -452,33 +452,27 @@ def add(x, y):
def fn(x, y):
return x.sin() + y
arg1 = torch.randn(32, 32, device=device)
arg2 = torch.randn(32, 32, device=device)
arg1 = torch.randn(3, 3, device=device)
arg2 = torch.randn(3, 3, device=device)
expected = fn(arg1, arg2).clone()
with PatchCaches():
compiled_fn1 = torch.compile(fn, mode="max-autotune")
result = compiled_fn1(arg1, arg2).clone()
self.assertEqual(expected, result)
self.assertEqual(global_stats.autotune_local.num_get_miss, 1)
self.assertEqual(global_stats.autotune_local, Stats(1, 0, 1))
DynamoCache.clear()
total_frames = torch._dynamo.convert_frame.FRAME_COUNTER
self._save_and_reload(
expected_backends=1, expected_dynamo=1, expected_autotune=1
)
# During save, we check the autotune cache another time, and now it should hit
self.assertEqual(global_stats.autotune_local.num_get_hit, 1)
compiled_fn1 = torch.compile(fn, mode="max-autotune")
with torch.compiler.set_stance("fail_on_recompile"):
result1 = compiled_fn1(arg1, arg2).clone()
self.assertEqual(expected, result1)
self.assertEqual(torch._dynamo.convert_frame.FRAME_COUNTER, total_frames)
# No new hits or misses
# Unfortunately, we don't *actually* know how many puts there will be, because
# it's possible the best autotune config was found by coordesc.
self.assertEqual(global_stats.autotune_local.num_get_hit, 1)
self.assertEqual(global_stats.autotune_local.num_get_miss, 1)
self.assertEqual(global_stats.autotune_local, Stats(2, 1, 1))
@parametrize("device", ("cpu", "cuda", "xpu"))
@torch._dynamo.config.patch(caching_precompile=True)

View File

@ -362,74 +362,6 @@ def run(cnt):
write_load_and_run(path2)
self.assertEqual(cnts.frame_count, 1)
@torch._dynamo.config.patch(
automatic_dynamic_remote_pgo=True, automatic_dynamic_local_pgo=False
)
def test_sticky_pgo_read_write(self):
cnts = CompileCounter()
@torch.compile(backend=cnts, fullgraph=True)
def f(x, y):
return x * 2, y * 3
def t(x, y):
return torch.randn(x, y)
with mock_cache.PatchCaches():
# we pretend to disable the default remote cache by keying different job ids per run
with torch.compiler.config.patch(job_id="a"):
f(t(2, 2), t(2, 2))
f(t(2, 4), t(2, 2))
self.assertEqual(cnts.frame_count, 2)
# first, test that we're not reading from the local/default remote cache;
# we should recompile when x wobbles
self.reset()
cnts.clear()
with torch.compiler.config.patch(
job_id="b", pgo_extra_write_key="sticky_0"
):
f(t(2, 2), t(2, 2))
f(t(2, 4), t(2, 2))
self.assertEqual(cnts.frame_count, 2)
# now with the extra sticky_0 key, we start with dynamic x;
# no recompiles
self.reset()
cnts.clear()
with torch.compiler.config.patch(job_id="c", pgo_extra_read_key="sticky_0"):
f(t(2, 2), t(2, 2))
f(t(2, 4), t(2, 2))
self.assertEqual(cnts.frame_count, 1)
# last test: wobble y and write to sticky_1 key
self.reset()
cnts.clear()
with torch.compiler.config.patch(
job_id="d", pgo_extra_write_key="sticky_1"
):
f(t(2, 2), t(2, 2))
f(t(2, 2), t(2, 4))
f(t(2, 2), t(4, 4))
self.assertEqual(cnts.frame_count, 3)
# start using default remote PGO, create run that wobbles y
self.reset()
cnts.clear()
f(t(2, 2), t(2, 2))
f(t(2, 4), t(2, 2))
f(t(4, 2), t(2, 2))
# with default remote (dynamic x) + extra remote (dynamic y),
# we should be able to wobble x & y with no recompiles.
self.reset()
cnts.clear()
with torch.compiler.config.patch(pgo_extra_read_key="sticky_1"):
f(t(2, 2), t(2, 2))
f(t(2, 4), t(4, 2))
f(t(4, 2), t(2, 4))
self.assertEqual(cnts.frame_count, 1)
if __name__ == "__main__":
from torch._dynamo.test_case import run_tests

View File

@ -66,7 +66,6 @@ from torch.testing._internal.common_utils import (
parametrize,
serialTest,
skipIfHpu,
skipIfRocm,
skipIfWindows,
TEST_WITH_ROCM,
)
@ -7406,7 +7405,6 @@ class ReproTestsDevice(torch._dynamo.test_case.TestCase):
out = f_compiled(x, s0, s1, s2)
self.assertEqual(out_ref, out)
@skipIfRocm
@unittest.skipIf(not PLATFORM_SUPPORTS_FP8, "requires gpu with fp8 support")
@requires_cuda
def test_partitioner_saves_weights_for_bw(self):

View File

@ -28,6 +28,7 @@ from torch.testing._internal.triton_utils import requires_cuda_and_triton
if torch.distributed.is_available():
from torch.testing._internal.distributed.fake_pg import FakeStore
HAS_TLPARSE = shutil.which("tlparse") is not None
requires_tlparse = unittest.skipUnless(HAS_TLPARSE, "requires tlparse")
requires_distributed = functools.partial(
@ -1197,13 +1198,13 @@ def forward(self, x_1: "f32[2][1]cpu"):
@contextmanager
def _setup_runtime_estimates_capture(self):
"""Helper to turn on and capture the combined 'inductor_runtime_and_tensor_meta' structured trace."""
"""Helper to turn on and capture the 'inductor_tlparse_runtime' structured trace."""
payload_buffer = io.StringIO()
payload_handler = logging.StreamHandler(payload_buffer)
payload_handler.setLevel(logging.DEBUG)
payload_handler.setFormatter(StructuredTracePayloadFormatter())
payload_handler.addFilter(
StructuredTraceTestingFilter("inductor_runtime_and_tensor_meta")
StructuredTraceTestingFilter("inductor_tlparse_runtime")
)
trace_log.addHandler(payload_handler)
try:
@ -1244,10 +1245,8 @@ def forward(self, x_1: "f32[2][1]cpu"):
compiled = torch.compile(mod, backend="inductor")
compiled(torch.randn(4, 4, device="cuda"))
# Verify runtime + tensor meta artifact was logged
self.assertIn(
'"inductor_runtime_and_tensor_meta"', self.buffer.getvalue()
)
# Verify runtime estimates artifact was logged
self.assertIn('"inductor_tlparse_runtime"', self.buffer.getvalue())
payload_content = payload_buffer.getvalue().strip()
if payload_content:
@ -1311,10 +1310,8 @@ def forward(self, x_1: "f32[2][1]cpu"):
compiled = torch.compile(mod, backend="inductor")
compiled(torch.randn(4, 4, device="cuda"))
# Verify artifact was logged
self.assertIn(
'"inductor_runtime_and_tensor_meta"', self.buffer.getvalue()
)
# Verify runtime estimates artifact was logged
self.assertIn('"inductor_tlparse_runtime"', self.buffer.getvalue())
payload_content = payload_buffer.getvalue().strip()
if payload_content:
@ -1336,145 +1333,6 @@ def forward(self, x_1: "f32[2][1]cpu"):
finally:
dist.destroy_process_group()
@requires_tlparse
@requires_distributed()
@requires_cuda_and_triton
@torch._inductor.config.patch("fx_graph_cache", False)
@torch._inductor.config.patch("log_tlparse", True)
def test_tensor_metadata_logging_multiple_ops(self):
import torch.distributed as dist
store = FakeStore()
dist.init_process_group(backend="fake", rank=0, world_size=2, store=store)
class Mixed(torch.nn.Module):
def __init__(self):
super().__init__()
self.linear = torch.nn.Linear(4, 4)
def forward(self, x):
y = torch.relu(self.linear(x))
y = torch.ops._c10d_functional.all_reduce.default(y, "sum", "0")
y = torch.ops._c10d_functional.wait_tensor.default(y)
return y + 1
try:
with self._setup_runtime_estimates_capture() as payload_buffer:
torch._dynamo.reset()
mod = Mixed().cuda()
compiled = torch.compile(mod, backend="inductor")
compiled(torch.randn(4, 4, device="cuda"))
payload = payload_buffer.getvalue().strip()
if payload:
data = json.loads(payload)
types = sorted({op.get("type") for op in data.get("ops", [])})
self.assertExpectedInline(
str(types), """['collective', 'compute']"""
)
self.assertParses()
finally:
dist.destroy_process_group()
@requires_tlparse
@torch._inductor.config.patch("log_tlparse", True)
def test_tensor_metadata_logging(self):
"""Emit unified runtime+tensor-metadata artifact and assert a stable simplified JSON inline."""
with self._setup_runtime_estimates_capture() as payload_buffer:
def f(x):
y = x.transpose(0, 1)
z = y.mean(dim=0)
w = z.to(torch.float16)
return w
compiled = torch.compile(f, backend="inductor", fullgraph=True)
compiled(torch.ones(2, 3))
# Verify artifact was logged
self.assertIn('"inductor_runtime_and_tensor_meta"', self.buffer.getvalue())
payload = payload_buffer.getvalue().strip()
if payload:
data = json.loads(payload)
ops = data.get("ops", [])
simplified_ops = []
for op in ops:
outs = [
{
"shape": out.get("shape", []),
"stride": out.get("stride", []),
"dtype": out.get("dtype", None),
}
for out in op.get("outputs", [])
]
if outs:
simplified_ops.append(
{
"type": op.get("type", ""),
"outputs": outs,
}
)
self.assertExpectedInline(
{"ops": simplified_ops[-1:]} if simplified_ops else {"ops": []},
"""{'ops': [{'type': 'compute', 'outputs': [{'shape': [2], 'stride': [1], 'dtype': 'float16'}]}]}""",
)
self.assertParses()
@requires_tlparse
@torch._inductor.config.patch("log_tlparse", True)
def test_tensor_metadata_logging_dynamic_shapes(self):
"""Same as test_tensor_metadata_logging, but with dynamic shapes enabled to cover to_size_hints."""
with self._setup_runtime_estimates_capture() as payload_buffer:
def f(x):
y = x.transpose(0, 1)
z = y.mean(dim=0)
w = z.to(torch.float16)
return w
compiled = torch.compile(f, backend="inductor", dynamic=True)
compiled(torch.ones(2, 3))
# Verify artifact was logged
self.assertIn('"inductor_runtime_and_tensor_meta"', self.buffer.getvalue())
payload = payload_buffer.getvalue().strip()
if payload:
data = json.loads(payload)
ops = data.get("ops", [])
simplified_ops = []
for op in ops:
outs = [
{
"shape": out.get("shape", []),
"stride": out.get("stride", []),
"dtype": out.get("dtype", None),
}
for out in op.get("outputs", [])
]
if outs:
simplified_ops.append(
{
"type": op.get("type", ""),
"outputs": outs,
}
)
self.assertExpectedInline(
{"ops": simplified_ops[-1:]} if simplified_ops else {"ops": []},
(
"{'ops': [{'type': 'compute', 'outputs': ["
"{'shape': [2], 'stride': [1], 'dtype': 'float32'}, "
"{'shape': [2], 'stride': [1], 'dtype': 'float16'}]}]}"
),
)
self.assertParses()
if __name__ == "__main__":
from torch._dynamo.test_case import run_tests

Some files were not shown because too many files have changed in this diff.