Compare commits

94 Commits

Author SHA1 Message Date
9220409522 Remove unused test code
ghstack-source-id: 8d6fad8d8f59a12a1711649cdd4558f23025a45c
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160823
2025-08-16 11:23:52 -07:00
2603e40be5 [inductor] TLParse tensor metadata logging + test (#160132)
Summary:
- Add TLParse artifact logging per op with output tensor shape, stride, and dtype for cross-rank aggregation.
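
For context, a minimal sketch of what per-op metadata logging of this kind can look like with `torch._logging.trace_structured` (the artifact name and payload layout below are illustrative assumptions, not necessarily what this PR emits):
```python
import json
import torch
from torch._logging import trace_structured

def log_output_metadata(op_name: str, out: torch.Tensor) -> None:
    # Emit a structured tlparse artifact describing one op's output tensor.
    trace_structured(
        "artifact",
        metadata_fn=lambda: {"name": "op_tensor_metadata", "encoding": "json"},
        payload_fn=lambda: json.dumps(
            {
                "op": op_name,
                "shape": list(out.shape),
                "stride": list(out.stride()),
                "dtype": str(out.dtype),
            }
        ),
    )
```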

Testing:
- Add test to verify structure and contents of the tlparse artifact

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160132
Approved by: https://github.com/xmfan
ghstack dependencies: #160260
2025-08-16 16:37:18 +00:00
8fe4b3f848 [BE][CI] move MYPYSTRICT linter from lintrunner-noclang to lintrunner-mypy (#160806)
Like `MYPY`, linter `MYPYSTRICT` will need `--all-files` too.

See also:

- https://github.com/pytorch/pytorch/pull/160652#issuecomment-3193390813

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160806
Approved by: https://github.com/seemethere
2025-08-16 16:15:22 +00:00
cff6def7f4 [MTIA] add correct name for CFF in tlparse (#160599)
Differential Revision: D80201622

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160599
Approved by: https://github.com/bdhirsh
2025-08-16 14:58:03 +00:00
e444cd24d4 Remove guard_size_oblivious from default contiguity python check, and add aten.sym_is_contiguous. (#159197)
This might cause some new DDEs on call sites that do not use is_contiguous_or_false() or sym_is_contiguous(),
but we want to find those call sites and handle them properly, by explicitly calling is_contiguous_or_false() rather than is_contiguous() when appropriate.
I had to fix one issue after removing the implicit size-oblivious reasoning; here is the context.

In https://github.com/pytorch/pytorch/pull/157472 we defined sym_is_contiguous as the function that computes contiguity for dynamic shapes in C++. It returns a symbolic expression representing contiguity and is guaranteed not to throw a DDE.

When people call is_contiguous, we do sym_is_contiguous().guard_bool().
When people call is_contiguous_or_false, we do sym_is_contiguous().guard_or_false().
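
As a rough illustration of the difference, here is a minimal sketch using the Python helpers in torch.fx.experimental.symbolic_shapes (the C++ entry points in this PR differ, but the guard semantics are the same):
```python
from torch.fx.experimental.symbolic_shapes import ShapeEnv, guard_or_false

shape_env = ShapeEnv()
u0 = shape_env.create_unbacked_symint()  # a data-dependent ("unbacked") size

cond = u0 == 1                # a SymBool with no hint available
print(guard_or_false(cond))   # False: falls back instead of guarding
# bool(cond) would raise a data-dependent error (DDE) here, which is the
# failure mode is_contiguous() can hit and is_contiguous_or_false() avoids.
```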

One path that was not handled well was this one:
```
c10::SymBool TensorImpl::sym_is_contiguous_custom(
    at::MemoryFormat memory_format) const {
  if (C10_UNLIKELY(matches_python_custom(SizesStridesPolicy::CustomStrides))) {
    return pyobj_slot_.load_pyobj_interpreter()->is_contiguous(
        this, memory_format);
  }

  return sym_is_contiguous_default(memory_format);
}
```
Namely, if we call sym_is_contiguous_custom and matches_python_custom(SizesStridesPolicy::CustomStrides) returns true, then we used to call is_contiguous(this, memory_format).

This went through load_pyobj_interpreter and ended up calling the Python is_contiguous, which used implicit size-oblivious reasoning.
Once we removed that implicit size-oblivious reasoning, the right thing is to call
return pyobj_slot_.load_pyobj_interpreter()->sym_is_contiguous(this, memory_format);
otherwise we would get a DDE even if the caller is using sym_is_contiguous.

So I had to define it for the pyinterpreter, and then override it for nested tensors.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159197
Approved by: https://github.com/ezyang
2025-08-16 09:15:58 +00:00
a84541c73f Update transformers version automatically with Dependabot (#160635)
My proposal here is to use GitHub Dependabot to make sure that the `transformers` version used in CI is always up-to-date. To achieve this, this PR does 2 things:

1. Pin the `transformers` version across all CI jobs in a single place, `.ci/docker/ci_commit_pins/huggingface.txt`.  This file is now a regular pip requirements file instead of a pinned-commit text file.  There isn't any need to pin `transformers` to a specific commit, and the file already refers to a stable version, `v4.54.0`
2. Create `.github/dependabot.yml` to configure the bot to update `transformers` automatically when there is a new version.  The configured labels will ensure that the right reviewers from torch.compile and Dev Infra are notified.  I'm not sure how to test this out in a PR, but it feels OK to land and test this in main.  If this works, we should see a PR updating `v4.54.0` to the current latest, `v4.55.0`

### Reference
https://docs.github.com/en/code-security/dependabot/working-with-dependabot/dependabot-options-reference
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160635
Approved by: https://github.com/ZainRizvi
2025-08-16 05:53:39 +00:00
114813ca77 Fix mypy errors: PyTreeSpec inheritance (#160652)
Fixes #160650.

I added a type-ignore comment to the `LeafSpec` class inheritance in `torch/utils/_cxx_pytree.py` to handle `PyTreeSpec` being marked as final in optree's type stubs.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160652
Approved by: https://github.com/Skylion007
2025-08-16 05:14:11 +00:00
11b6ceb7b4 [ONNX] Default to dynamo export (#159646)
Set dynamo=True and enable fallback.

1. Implemented the compatible behavior where BytesIO objects are accepted as `f`
2. Updated tests to explicitly set dynamo=False
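
A small sketch of the compatibility behavior in item 1, assuming a trivial model and the ONNX export dependencies installed (illustrative usage, not from the PR):
```python
import io
import torch

model = torch.nn.Linear(4, 2)
buf = io.BytesIO()
torch.onnx.export(model, (torch.randn(1, 4),), buf, dynamo=True)
print(f"wrote {len(buf.getvalue())} bytes of ONNX")
```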

#151693

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159646
Approved by: https://github.com/titaiwangms
2025-08-16 04:48:58 +00:00
fb7e60ba7a [Dynamo][Hierarchical Compile] Flatten tuple outputs in graph dedupe pass (#158811)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158811
Approved by: https://github.com/anijain2305
ghstack dependencies: #158810
2025-08-16 04:45:31 +00:00
f89186e910 [audio hash update] update the pinned audio hash (#160797)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160797
Approved by: https://github.com/pytorchbot
2025-08-16 04:26:59 +00:00
10eb83734f [vllm hash update] update the pinned vllm hash (#160699)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned vllm hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160699
Approved by: https://github.com/pytorchbot
2025-08-16 04:26:55 +00:00
75ea93484c [vllm test] add vllm.yml and additional package (#160698)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160698
Approved by: https://github.com/huydhn
ghstack dependencies: #160116
2025-08-16 04:24:20 +00:00
45c2c7a5fc Fix the wrong dataclasses_json monitoring dep in MacOS test (#160796)
This was a typo; the dep should be `dataclasses_json`: https://github.com/pytorch/pytorch/actions/runs/17000197828/job/48200676725#step:10:23
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160796
Approved by: https://github.com/yangw-dev
2025-08-16 04:00:31 +00:00
b74c7cd335 Add kernel stack traces tlparse dump (#160608) (#160779)
Summary:

as title

This is requested by the Zoomer team so they can add stack trace information to the profiler result.

Test Plan:
```
buck run mode/dev-nosan fbcode//caffe2/test/inductor:provenance_tracing -- -r  stack_traces
```

Differential Revision: D80050233

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160779
Approved by: https://github.com/angelayi
2025-08-16 03:12:38 +00:00
b7ca502f29 [ROCm][Windows] Add hipcc compatibility flags to cpp_extension.py. (#159790)
This is a similar change to https://github.com/pytorch/pytorch/pull/153986, this time adding flags to the hipcc command under `cpp_extension.py`.

The `-Wno-ignored-attributes` flag in particular avoids about 200MB of warning spam when building torchvision, like these:
```
In file included from D:\b\vision_main\torchvision\csrc\ops\hip\deform_conv2d_kernel.hip:72:
In file included from D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\ATen/ATen.h:13:
In file included from D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\ATen/Functions.h:386:
In file included from D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\ATen/ops/_sparse_softmax.h:21:
D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\ATen/ops/_sparse_softmax_ops.h:18:8: warning: __declspec attribute 'dllimport' is not supported [-Wignored-attributes]
   18 | struct TORCH_API _sparse_softmax_int {
      |        ^~~~~~~~~
D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\torch/headeronly/macros/Export.h💯19: note: expanded from macro 'TORCH_API'
  100 | #define TORCH_API C10_IMPORT
      |                   ^~~~~~~~~~
D:\projects\TheRock\external-builds\pytorch\.venv\Lib\site-packages\torch\include\torch/headeronly/macros/Export.h:53:31: note: expanded from macro 'C10_IMPORT'
   53 | #define C10_IMPORT __declspec(dllimport)
      |                               ^~~~~~~~~
```

The `-fms-extensions` flag just seems beneficial to include: https://clang.llvm.org/docs/MSVCCompatibility.html.

See also this downstream issue where these changes were tested: https://github.com/ROCm/TheRock/issues/910.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159790
Approved by: https://github.com/jeffdaily
2025-08-16 02:20:49 +00:00
7bd4cfaef4 [BE] Update nvshmem dependency to 3.3.20 (#160458)
Which is manylinux2_28 compatible, even on the aarch64 platform.

Archive contents and the URL pattern changed quite drastically between 3.3.9 and 3.3.20, but hopefully it still works.
Packages `libnvshmem_host.so.3` into the gigantic aarch64+CUDA wheel.
Should fix https://github.com/pytorch/pytorch/issues/160425
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160458
Approved by: https://github.com/Skylion007, https://github.com/kwen2501, https://github.com/nWEIdia, https://github.com/atalman, https://github.com/tinglvv
2025-08-16 02:00:57 +00:00
c015e53d37 Revert "[BE] Update nvshem dependency to 3.3.20 (#160458)"
This reverts commit e0488d9f00865fb56c931580c80e099771c6285e.

Reverted https://github.com/pytorch/pytorch/pull/160458 on behalf of https://github.com/wdvr due to need to rerun workflow generation (failing workflow-checks) ([comment](https://github.com/pytorch/pytorch/pull/160458#issuecomment-3193133706))
2025-08-16 01:47:42 +00:00
65dc4df74d unify broadcast_shapes functions and avoid duplicates (#160251)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160251
Approved by: https://github.com/jingsh, https://github.com/ColinPeppler
ghstack dependencies: #160250
2025-08-16 00:54:32 +00:00
c03809e8a5 guard_or_false cat ops (#160250)
Keep existing unbacked semantics unchanged; just use guard_or_false instead of guard_size_oblivious.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160250
Approved by: https://github.com/ColinPeppler, https://github.com/jingsh
2025-08-16 00:54:31 +00:00
e0488d9f00 [BE] Update nvshmem dependency to 3.3.20 (#160458)
Which is manylinux2_28 compatible, even on the aarch64 platform.

Archive contents and the URL pattern changed quite drastically between 3.3.9 and 3.3.20, but hopefully it still works.
Packages `libnvshmem_host.so.3` into the gigantic aarch64+CUDA wheel.
Should fix https://github.com/pytorch/pytorch/issues/160425
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160458
Approved by: https://github.com/Skylion007, https://github.com/kwen2501, https://github.com/nWEIdia, https://github.com/atalman, https://github.com/tinglvv
2025-08-16 00:50:13 +00:00
f782c790df migrate more simple gso checks (#160253)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160253
Approved by: https://github.com/bobrenjc93
2025-08-16 00:15:24 +00:00
16ce2c15fa Add python 3.14 support to linux aarch64 builds (#160788)
Related to https://github.com/pytorch/pytorch/issues/156856
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160788
Approved by: https://github.com/malfet
2025-08-16 00:03:21 +00:00
0d28d12b11 Fix typo packing libnvshmem into libtorch (#160778)
Fix typo after https://github.com/pytorch/pytorch/pull/160465
Fixes: https://github.com/pytorch/pytorch/issues/160762

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160778
Approved by: https://github.com/Camyll, https://github.com/malfet, https://github.com/ZainRizvi, https://github.com/Skylion007
2025-08-15 23:43:02 +00:00
838f22c57d Do not incorrectly chain each of the strings as iterables (#160709)
Signed-off-by: Edward Yang <ezyang@meta.com>

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160709
Approved by: https://github.com/Skylion007, https://github.com/fduwjj
2025-08-15 23:22:24 +00:00
387fe847ab [cuDNN][SDPA] Introduce TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 (#155958)
Opt-in for now, but basically uses the variable-sequence length/ragged path for the common case of BSHD layout to avoid recompiling for different sequence lengths.

Built on top of #149282

Tested using a primitive fuzzer; it seems at least as stable as the default path (with recompilation) on B200 (50000+ cases tested without any failures)
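
A hedged usage sketch (requires a CUDA build with cuDNN; the env var should be set before the first SDPA call):
```python
import os
os.environ["TORCH_CUDNN_SDPA_AVOID_RECOMPILE"] = "1"  # opt in (this PR)

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# (batch, heads, seq_len, head_dim) as F.scaled_dot_product_attention expects
q, k, v = (torch.randn(2, 8, 128, 64, device="cuda", dtype=torch.bfloat16)
           for _ in range(3))
with sdpa_kernel(SDPBackend.CUDNN_ATTENTION):
    out = F.scaled_dot_product_attention(q, k, v)
```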

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155958
Approved by: https://github.com/drisspg
2025-08-15 21:59:18 +00:00
40311e2ec1 [AOTInductor] ABI-Compatibility for RecordFunction. (#159842)
Summary:
Previously, our implementation for RecordFunction injected ATen into
codegen, which breaks the ABI contract for AOTInductor.

c10::IValue is added to call the full record function. The extension to
more profiling info will come in later PRs.

Test Plan:
Included in commit.

Differential Revision: [D79622071](https://our.internmc.facebook.com/intern/diff/D79622071)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159842
Approved by: https://github.com/desertfire
2025-08-15 21:45:47 +00:00
8ca8b6053c [inductor][while_loop][be] improve the readability of output handling (#160374)
The logic doesn't change, but this makes it easier to read and modify.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160374
Approved by: https://github.com/zou3519
ghstack dependencies: #160548
2025-08-15 20:13:12 +00:00
ff86509a06 [map] filter none gradients and add autograd inductor tests (#160548)
Will filter the None outputs in autograd backward for other HOPs as follow-ups

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160548
Approved by: https://github.com/zou3519
2025-08-15 20:13:12 +00:00
fa75ba9303 Change IR node's stack traces to return a set of stack traces only (#160701)
Summary: There can be excessive stack trace output in TORCH_LOGS="+inductor" when a single line of code corresponds to many post grad nodes, e.g. `self.multihead_attn(x, x, x)`. In that case, we'll see the same stack trace many times in the IR node, spamming the output log, so we change the IR node to return a set of stack traces.

Test Plan:
CI

Differential Revision: D80310549

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160701
Approved by: https://github.com/angelayi
2025-08-15 19:31:59 +00:00
b78968b4d1 Support next(iterator, default) (#159483)
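For context, a tiny example of the pattern this enables under torch.compile (illustrative usage; fullgraph=True asserts there is no graph break on `next`):
```python
import torch

@torch.compile(fullgraph=True)
def first_or_zeros(xs, like):
    # next() with a default: returns the first element, or the default
    # when the iterator is exhausted.
    return next(iter(xs), torch.zeros_like(like))

x = torch.randn(3)
print(first_or_zeros([x + 1], x))  # x + 1
print(first_or_zeros([], x))       # zeros
```
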
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159483
Approved by: https://github.com/mlazos
ghstack dependencies: #159365, #159366, #159368
2025-08-15 19:08:21 +00:00
e5621b4d8b Fixes for collections.Counter (#159368)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159368
Approved by: https://github.com/mlazos
ghstack dependencies: #159365, #159366
2025-08-15 19:08:21 +00:00
2542e71f3f Change mutation type of MutableMappingVariable to AttributeMutationNew (#159366)
Also add MutableMappingVariable to `call_or_` / `call_ior`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159366
Approved by: https://github.com/zou3519
ghstack dependencies: #159365
2025-08-15 19:08:21 +00:00
0242d40fa5 Enable trace through the collections module (#159365)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159365
Approved by: https://github.com/zou3519
2025-08-15 19:08:21 +00:00
17de899709 Add py3.14 to macos arm64 (#160593)
Related to https://github.com/pytorch/pytorch/issues/156856

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160593
Approved by: https://github.com/malfet, https://github.com/Skylion007
2025-08-15 18:52:10 +00:00
25d0d8b0a3 [inductor] Fix propagating torch.utils._sympy.functions.Identity in IndexPropagation (#155504)
Fixes https://github.com/pytorch/pytorch/issues/160535

The index may contain `torch.utils._sympy.functions.Identity`. When we call `SymPyOps.index_expr`, if the value is a sympy.Expr containing Identity, `TypedExpr(value, dtype)` will fail. So when we unwrap arguments, we expand the sympy expression to unwrap Identity.
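
A rough sketch of the unwrapping idea, assuming (per the description above) that sympy expansion removes `Identity`; the construction below is illustrative:
```python
import sympy
from torch.utils._sympy.functions import Identity

s0 = sympy.Symbol("s0")
expr = Identity(s0 + 1) * 2   # an index expression wrapped in Identity
print(sympy.expand(expr))     # Identity unwrapped, e.g. 2*s0 + 2
```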

Test Plan:
buck run @mode/dev-nosan //caffe2/test/inductor:test_aot_inductor -- -r test_sym_expr_indexing

Differential Revision: D76308640

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155504
Approved by: https://github.com/eellison
2025-08-15 18:38:23 +00:00
c6d697ff52 port 2 distributed pipeline test files for Intel GPU (#159140)
This is another PR to port distributed pipeline tests for Intel GPU; the other PR is https://github.com/pytorch/pytorch/pull/159033.
In this PR, we port two test files for Intel GPU.
We enable Intel GPU with the following methods, trying our best to keep the original code style:

1. instantiate_device_type_tests()
2. skip the case on XPU due to an accuracy gap introduced by oneDNN non-determinism

Pull Request resolved: https://github.com/pytorch/pytorch/pull/159140
Approved by: https://github.com/guangyey, https://github.com/d4l3k, https://github.com/H-Huang
2025-08-15 18:29:50 +00:00
30d2f98daa Revert "[cutlass backend] re-add pip cutlass path (#160180)"
This reverts commit d556586448f3caab85673c7da0978fe31c7748f7.

Reverted https://github.com/pytorch/pytorch/pull/160180 on behalf of https://github.com/atalman due to broke macos nightly ([comment](https://github.com/pytorch/pytorch/pull/160180#issuecomment-3192311552))
2025-08-15 18:00:41 +00:00
8780d28c65 raise exception in case of errors in memory reordering (#160455)
This PR introduces two checks in the memory reordering pass to catch graph issues before performing the reordering. For situations not covered by these checks, the reordering pass might fail, and an exception will be thrown in that case.

This addresses https://github.com/pytorch/pytorch/issues/159568

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160455
Approved by: https://github.com/eellison
2025-08-15 17:31:55 +00:00
da8f48d88f [associative_scan] support gen_schema for associative_scan (#158883)
In-place mutation may create inter-loop dependencies that break the parallelism we rely on for associative_scan, so we ban input mutations.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158883
Approved by: https://github.com/zou3519
ghstack dependencies: #154193, #158965, #158863, #158864
2025-08-15 17:28:44 +00:00
cb9e2092a8 [scan] support gen_schema for scan (#158864)
We don't want to allow scan's combine_fn to mutate its inputs. The semantics of the mutation can be confusing. For example:
```python
def combine_fn(init, x):
    init.add_(x)  # illustrative in-place mutation of the carry
    return init
```
If combine_fn mutates init, only the first iteration actually mutates init; the rest of the iterations mutate the previous carry, which is an intermediate result. This is a weird semantic because the only observable mutation is of init, which could be done outside of combine_fn.

If combine_fn mutates x, where x is a slice of the scanned inputs (i.e. xs), the pattern is more meaningful, but we've not seen any use case for it yet.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158864
Approved by: https://github.com/zou3519
ghstack dependencies: #154193, #158965, #158863
2025-08-15 17:28:44 +00:00
f6bf1573fc [while_loop] support gen_schema for while_loop (#158863)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158863
Approved by: https://github.com/zou3519
ghstack dependencies: #154193, #158965
2025-08-15 17:28:34 +00:00
82a18423be [BE] create an empty shape_env for check_input_alias_and_mutation_return_outputs (#158965)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158965
Approved by: https://github.com/zou3519
ghstack dependencies: #154193
2025-08-15 17:28:20 +00:00
3fe3c23d4e [cond] support gen_schema for cond (#154193)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/154193
Approved by: https://github.com/zou3519
2025-08-15 17:28:13 +00:00
052c441cf4 Add logging for when inbuilt_inline_nn_modules will help with ID_MATCH guard triggered recompiles (#160592)
We add logging for when an ID_MATCH guard is added at a place where inbuilt_inline_nn_modules would inline it. This is done with the aim of tagging recompiles that could be avoided by setting the inbuilt_inline_nn_modules flag.
It will help us log and track the flag's adoption and potentially quantify the savings in the number of recompiles.
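
In recent PyTorch this corresponds to the `torch._dynamo.config.inline_inbuilt_nn_modules` flag; a minimal sketch of enabling it:
```python
import torch
import torch._dynamo.config as dynamo_config

# Inlining nn.Module instances avoids the ID_MATCH guards (and the
# recompiles) that this new logging is meant to tag.
dynamo_config.inline_inbuilt_nn_modules = True

@torch.compile
def forward(mod, x):
    return mod(x)

print(forward(torch.nn.Linear(4, 4), torch.randn(2, 4)))
```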

Differential Revision: D80075975

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160592
Approved by: https://github.com/anijain2305
2025-08-15 17:09:39 +00:00
b26d2a9464 [ez] Make NUMA signpost parameters JSON serializable (#160710)
# Context
Broader context in #160163.

In order for the _utils_internal version of signpost_event to do proper logging, its parameters argument needs to be json serializable.

# This PR
Convert `NumaOptions` to serializable form before inputting to `signpost_event`.

# Test Plan
## Automated
Added tests `$ pytest test/test_numa_binding.py`.

## Manual
See [D80317206](https://www.internalfb.com/diff/D80317206).

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160710
Approved by: https://github.com/kiukchung
2025-08-15 16:52:43 +00:00
6382302990 [MPS] Add grid_sampler_3d for MPS (#160541)
This PR adds support for `grid_sampler_3d` for MPS with "bilinear" interpolation.

NOTE: "nearest" interpolation is not yet supported

Fixes #159882
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160541
Approved by: https://github.com/malfet
2025-08-15 16:19:25 +00:00
80dd05e31e Disable flaky cpp test RecordDebugHandles.Basic (#160577)
Test is flaky and sometimes hangs in CI

Here's an example of the failure:
https://github.com/pytorch/pytorch/actions/runs/16946153494/job/48027937663
```

2025-08-13T20:54:00.1223688Z ==================================== RERUNS ====================================
2025-08-13T20:54:00.1224156Z ___________________________ RecordDebugHandles.Basic ___________________________
2025-08-13T20:54:00.1224682Z [gw2] linux -- Python 3.13.5 /opt/conda/envs/py_3.13/bin/python3.13
2025-08-13T20:54:00.1225568Z Internal Error: calling /opt/conda/envs/py_3.13/lib/python3.13/site-packages/torch/bin/test_jit for test RecordDebugHandles.Basic failed (returncode=-6):
2025-08-13T20:54:00.1226430Z CUDA not available. Disabling CUDA and MultiCUDA tests
2025-08-13T20:54:00.1226988Z Note: Google Test filter = RecordDebugHandles.Basic-*_CUDA:*_MultiCUDA
2025-08-13T20:54:00.1227450Z [==========] Running 1 test from 1 test suite.
2025-08-13T20:54:00.1227792Z [----------] Global test environment set-up.
2025-08-13T20:54:00.1228145Z [----------] 1 test from RecordDebugHandles
2025-08-13T20:54:00.1228492Z [ RUN      ] RecordDebugHandles.Basic
2025-08-13T20:54:00.1228822Z [       OK ] RecordDebugHandles.Basic (1 ms)
2025-08-13T20:54:00.1229204Z [----------] 1 test from RecordDebugHandles (1 ms total)
2025-08-13T20:54:00.1229501Z
2025-08-13T20:54:00.1229666Z [----------] Global test environment tear-down
2025-08-13T20:54:00.1230033Z [==========] 1 test from 1 test suite ran. (1 ms total)
2025-08-13T20:54:00.1230355Z [  PASSED  ] 1 test.
2025-08-13T20:54:00.1230727Z terminate called after throwing an instance of 'std::system_error'
2025-08-13T20:54:00.1231154Z   what():  Invalid argument
2025-08-13T20:54:00.1231416Z unknown file:0: C++ failure
2025-08-13T20:54:00.1231788Z ------------------------------ Captured c++ call -------------------------------
2025-08-13T20:54:00.1232262Z CUDA not available. Disabling CUDA and MultiCUDA tests
2025-08-13T20:54:00.1232745Z Note: Google Test filter = RecordDebugHandles.Basic-*_CUDA:*_MultiCUDA
2025-08-13T20:54:00.1233199Z [==========] Running 1 test from 1 test suite.
2025-08-13T20:54:00.1233557Z [----------] Global test environment set-up.
2025-08-13T20:54:00.1233915Z [----------] 1 test from RecordDebugHandles
2025-08-13T20:54:00.1234247Z [ RUN      ] RecordDebugHandles.Basic
2025-08-13T20:54:00.1234590Z [       OK ] RecordDebugHandles.Basic (1 ms)
2025-08-13T20:54:00.1235020Z [----------] 1 test from RecordDebugHandles (1 ms total)
2025-08-13T20:54:00.1235304Z
2025-08-13T20:54:00.1235431Z [----------] Global test environment tear-down
2025-08-13T20:54:00.1235793Z [==========] 1 test from 1 test suite ran. (1 ms total)
2025-08-13T20:54:00.1236126Z [  PASSED  ] 1 test.
2025-08-13T20:54:00.1236481Z terminate called after throwing an instance of 'std::system_error'
2025-08-13T20:54:00.1236906Z   what():  Invalid argument
2025-08-13T20:54:00.1237287Z ___________________________ RecordDebugHandles.Basic ___________________________
2025-08-13T20:54:00.1237800Z [gw2] linux -- Python 3.13.5 /opt/conda/envs/py_3.13/bin/python3.13
2025-08-13T20:54:00.1238686Z Internal Error: calling /opt/conda/envs/py_3.13/lib/python3.13/site-packages/torch/bin/test_jit for test RecordDebugHandles.Basic failed (returncode=-6):
2025-08-13T20:54:00.1239551Z CUDA not available. Disabling CUDA and MultiCUDA tests
2025-08-13T20:54:00.1240048Z Note: Google Test filter = RecordDebugHandles.Basic-*_CUDA:*_MultiCUDA
2025-08-13T20:54:00.1240495Z [==========] Running 1 test from 1 test suite.
2025-08-13T20:54:00.1240848Z [----------] Global test environment set-up.
2025-08-13T20:54:00.1241199Z [----------] 1 test from RecordDebugHandles
2025-08-13T20:54:00.1241542Z [ RUN      ] RecordDebugHandles.Basic
2025-08-13T20:54:00.1241871Z [       OK ] RecordDebugHandles.Basic (1 ms)
2025-08-13T20:54:00.1242249Z [----------] 1 test from RecordDebugHandles (1 ms total)
2025-08-13T20:54:00.1242503Z
2025-08-13T20:54:00.1242641Z [----------] Global test environment tear-down
2025-08-13T20:54:00.1242993Z [==========] 1 test from 1 test suite ran. (19 ms total)
2025-08-13T20:54:00.1243329Z [  PASSED  ] 1 test.
2025-08-13T20:54:00.1243697Z terminate called after throwing an instance of 'std::system_error'
2025-08-13T20:54:00.1244113Z   what():  Invalid argument
2025-08-13T20:54:00.1244392Z unknown file:0: C++ failure
2025-08-13T20:54:00.1244759Z ------------------------------ Captured c++ call -------------------------------
2025-08-13T20:54:00.1245235Z CUDA not available. Disabling CUDA and MultiCUDA tests
2025-08-13T20:54:00.1283768Z ============== 1 failed, 568 passed, 2 rerun in 115.57s (0:01:55) ==============
```

Here's an example of the hang:
https://github.com/pytorch/pytorch/actions/runs/16942186826/job/48015238944
Logs aren't super helpful other than stating that it took a long time.  Usually this file takes <2min to run
```
2025-08-13T18:43:24.6586481Z [gw0] [ 97%] PASSED [1.4119s] ../../../../../opt/conda/envs/py_3.13/lib/python3.13/site-packages/torch/bin/test_jit::PyTorch/LiteInterpreterDynamicTypeTestFixture::Conformance/8
2025-08-13T18:43:24.6587278Z [gw1] [ 97%] PASSED [1.4866s] ../../../../../opt/conda/envs/py_3.13/lib/python3.13/site-packages/torch/bin/test_jit::PyTorch/LiteInterpreterDynamicTypeTestFixture::Conformance/9 Command took >30min, returning 124
2025-08-13T18:43:24.6587288Z
2025-08-13T18:43:24.6587632Z FINISHED PRINTING LOG FILE of cpp/test_jit 1/1 (test/test-reports/cpp.test_jit_1.1_c259e5a152845991_.log)
2025-08-13T18:43:24.6587639Z
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160577
Approved by: https://github.com/huydhn
2025-08-15 15:59:21 +00:00
9df07ecfbe Revert "[inductor] dont reuse buffers if it affects peak (#145883) (#159530)"
This reverts commit 3be70dc30e893b552fc0f23ca06cd8f7949b6d08.

Reverted https://github.com/pytorch/pytorch/pull/159530 on behalf of https://github.com/clee2000 due to a newly added test failing internally (D80316528). Probably just a targets change, but also IMO the tests should go into a testcase class from common or inductor utils; while I'm pretty sure CI can run the globally defined ones, there's some CI-related functionality on the testcase class that CI benefits from ([comment](https://github.com/pytorch/pytorch/pull/159530#issuecomment-3191947506))
2025-08-15 15:49:04 +00:00
846963fa9b Revert "[Inductor] addmm + activation function fusion (#158137)"
This reverts commit b9d7de3a094598c3dc0dd52e57bce30eb684c9d8.

Reverted https://github.com/pytorch/pytorch/pull/158137 on behalf of https://github.com/malfet due to Broke inductor torchbench, see 663da17b62/1 ([comment](https://github.com/pytorch/pytorch/pull/158137#issuecomment-3191841298))
2025-08-15 15:34:09 +00:00
663da17b62 Update torch-xpu-ops commit pin (#160062)
Update the torch-xpu-ops commit to [77cc792cd265179745d335579d233e6d4f9a2667](77cc792cd2), which includes:

- Ensures that the XPU cache is cleared before creating tensors during the test
- Add unused variable warning
- Fix test_linalg and test_torch issue with bf32_on_and_off updates
- Fix deterministic indexing with broadcast
- Fix dist.gather with noncontiguous tensor
- Improve accuracy of index put deterministic kernel
- Add a generated-file dependency to avoid building before generation
- Optimize embedding bag

Fixes #160661

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160062
Approved by: https://github.com/EikanWang
2025-08-15 15:27:24 +00:00
e299926f72 [ONNX] Fix doc typo for symbolic_multi_out (#160702)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160702
Approved by: https://github.com/justinchuby
2025-08-15 14:34:42 +00:00
bbd11c4f23 Uninstall torchao on MPS benchmark (#160724)
Fixes https://github.com/pytorch/pytorch/issues/160689

The current torchao 0.12.0 doesn't work with transformers 4.54.0 and ends up with this error:

```
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/transformers/models/albert/modeling_albert.py", line 37, in <module>
    from ...modeling_utils import PreTrainedModel
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/transformers/modeling_utils.py", line 51, in <module>
    from torchao.quantization import Int4WeightOnlyConfig
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/__init__.py", line 41, in <module>
    from torchao.quantization import (
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/quantization/__init__.py", line 6, in <module>
    from .autoquant import (
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/quantization/autoquant.py", line 11, in <module>
    from torchao.dtypes import (
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/dtypes/__init__.py", line 1, in <module>
    from . import affine_quantized_tensor_ops
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/dtypes/affine_quantized_tensor_ops.py", line 38, in <module>
    from torchao.dtypes.uintx.dyn_int8_act_int4_wei_cpu_layout import (
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/dtypes/uintx/__init__.py", line 7, in <module>
    from .dyn_int8_act_int4_wei_cpu_layout import (
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/dtypes/uintx/dyn_int8_act_int4_wei_cpu_layout.py", line 320, in <module>
    from ...prototype.inductor.fx_passes import register_da8w4_concat_linear_cpu_pass
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/prototype/inductor/fx_passes/__init__.py", line 2, in <module>
    from .int8_sdpa_fusion import _int8_sdpa_init
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/prototype/inductor/fx_passes/int8_sdpa_fusion.py", line 22, in <module>
    from ..int8_sdpa_lowering import register_int8_sdpa  # noqa: F401
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/ec2-user/runner/_work/_temp/venv-3.12-1755212960/lib/python3.12/site-packages/torchao/prototype/inductor/int8_sdpa_lowering.py", line 6, in <module>
    from torch._inductor.kernel.flex_attention import construct_strides, maybe_realize
ModuleNotFoundError: No module named 'torch._inductor.kernel.flex_attention'
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160724
Approved by: https://github.com/malfet
2025-08-15 13:55:39 +00:00
eaa5d9d3d3 Introduce OpInfo test for testing export on fake device (#160694)
Summary: Prepare for the upcoming diffs for exporting on fake cuda device.

Test Plan:
test

Differential Revision: D80304225

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160694
Approved by: https://github.com/dolpm
2025-08-15 07:26:28 +00:00
a7c75ae976 [dde] use sym_or when checking normalized shape in layer_norm (#160683)
Use `sym_eq` to check equality on tuples of ints/SymInts

### DDE
```
torch._dynamo.exc.UserError: Could not guard on data-dependent expression Eq(u0, u1) (unhinted: Eq(u0, u1)).  (Size-like symbols: u1, u0)

Caused by: return torch.nn.functional.layer_norm(  # test/inductor/test_unbacked_symints.py:527 in fn (_refs/__init__.py:3292 in native_layer_norm)
```
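
A minimal sketch of the kind of check this switches to, assuming `sym_eq` from `torch.fx.experimental.symbolic_shapes` compares tuples elementwise and returns a (possibly symbolic) bool instead of guarding eagerly:
```python
import torch
from torch.fx.experimental.symbolic_shapes import sym_eq

normalized_shape = (4, 8)
input_tail = (4, 8)  # with SymInts, this comparison stays symbolic
torch._check(sym_eq(normalized_shape, input_tail))  # deferred check, no DDE
```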

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160683
Approved by: https://github.com/bobrenjc93
2025-08-15 06:56:00 +00:00
f7ad69f59c [dynamic shapes] handle Max(*,1) for inductor layout contiguity (#160578)
Differential Revision: D80214882

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160578
Approved by: https://github.com/ZixinYang, https://github.com/bobrenjc93
2025-08-15 06:10:18 +00:00
4cae9cf2df Update triton xpu commit to support python 3.14 (#160183)
Follow-up to PR #159725
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160183
Approved by: https://github.com/EikanWang, https://github.com/atalman
2025-08-15 05:41:17 +00:00
7710800865 [3/3][ghstack][vllm ci build setup]vllm build workflow (#160116)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160116
Approved by: https://github.com/huydhn
2025-08-15 05:35:46 +00:00
aa99e0958f Separate provenance tracking to different levels (#160383)
Summary: as title. We've received requests from various parties who are interested in turning on provenance tracking by default. In this PR, we prepare to turn on, by default, the parts of provenance tracking that don't have too much overhead.

- Change `provenance_tracking` config to `provenance_tracking_level`
- turn on the following provenance tracking by default when `basic_provenance_tracking`=True
    - `set_kernel_post_grad_provenance_tracing` for kernels; this adds a mapping between Triton kernels and post_grad nodes
    - `dump_inductor_provenance_info` if we're dumping the tlparse log
    - `get_graph_provenance_json` and dump `create_mapping_pre_post_grad_nodes`. This creates a mapping between pre_grad and post_grad nodes. Since we're not turning on provenance tracking in GraphTransformObserver by default, the mapping here may be incomplete/limited.
    - add stack trace from post grad nodes to inductor IR nodes
    - add exception swallowing for all functions above

Test Plan:
CI

Differential Revision: D80031559

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160383
Approved by: https://github.com/angelayi
2025-08-15 04:59:35 +00:00
3fc7a95176 [audio hash update] update the pinned audio hash (#160485)
This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/.github/workflows/nightly.yml).
Update the pinned audio hash.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160485
Approved by: https://github.com/pytorchbot
2025-08-15 04:27:49 +00:00
858fb80b9b [PT2]: Add Static Dispatch Kernel for wrapped_fbgemm_linear_fp16_weight (#160451)
Summary: Add static dispatch kernel for wrapped_fbgemm_linear_fp16_weight. This optimization should improve perf for all Ads DSNN models using Sigmoid.

Test Plan:
```
MODEL_TYPE=dpa_product_first_ctr_model
MODEL_ENTITY_ID=892669089
SNAPSHOT_ID=37
OTHER_MODEL_ENTITY_ID=892669089
OTHER_SNAPSHOT_ID=36

MODULES=(mix prepare_float_features object user)
SUFFIXES=(.predictor.local .predictor.precompute.prepare_float_features .predictor.precompute.remote_object_only .predictor.precompute.remote_request_only)

for i in "${!MODULES[@]}"; do
MODULE=${MODULES[i]}
SUFFIX=${SUFFIXES[i]}
buck2 run mode/opt caffe2/torch/fb/model_transform/fx2trt/packaging:load_net_predictor -- --loadMode=BenchmarkAB --inputNetFile=/data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID}/${MODEL_ENTITY_ID}_${SNAPSHOT_ID}${SUFFIX} --otherNetFile=/data/users/$USER/models/${OTHER_MODEL_ENTITY_ID}/${OTHER_SNAPSHOT_ID}/${OTHER_MODEL_ENTITY_ID}_${OTHER_SNAPSHOT_ID}${SUFFIX} --moduleName=${MODULE} --submodToDevice "" --benchmarkDontRebatchSamples=true --doNotRandomizeSampleInputs=true
```

Before: P1900475429
I0810 19:29:22.782902 2717337 load_net_predictor_lib.cpp:1807] Average latency A: 0.0843 ms
I0810 19:29:22.782905 2717337 load_net_predictor_lib.cpp:1807] Average latency B: 0.0989 ms

After: P1900825771
I0811 15:42:34.866408 2311279 load_net_predictor_lib.cpp:1807] Average latency A: 0.0854 ms
I0811 15:42:34.866411 2311279 load_net_predictor_lib.cpp:1807] Average latency B: 0.092 ms

Still has some regression but the gap is smaller...

Reviewed By: henryoier, muchulee8

Differential Revision: D80042054

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160451
Approved by: https://github.com/henryoier
2025-08-15 04:06:17 +00:00
55061c9602 [PT2]: Add Static Dispatch Kernel for scale_gradient (#160454)
Summary: Add Static Dispatch Kernel for scale_gradient

Test Plan:
```
MODEL_TYPE=dpa_product_first_ctr_model
MODEL_ENTITY_ID=892669089
SNAPSHOT_ID=37
OTHER_MODEL_ENTITY_ID=892669089
OTHER_SNAPSHOT_ID=36

MODULES=(mix prepare_float_features object user)
SUFFIXES=(.predictor.local .predictor.precompute.prepare_float_features .predictor.precompute.remote_object_only .predictor.precompute.remote_request_only)

for i in "${!MODULES[@]}"; do
MODULE=${MODULES[i]}
SUFFIX=${SUFFIXES[i]}
buck2 run mode/opt caffe2/torch/fb/model_transform/fx2trt/packaging:load_net_predictor -- --loadMode=BenchmarkAB --inputNetFile=/data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID}/${MODEL_ENTITY_ID}_${SNAPSHOT_ID}${SUFFIX} --otherNetFile=/data/users/$USER/models/${OTHER_MODEL_ENTITY_ID}/${OTHER_SNAPSHOT_ID}/${OTHER_MODEL_ENTITY_ID}_${OTHER_SNAPSHOT_ID}${SUFFIX} --moduleName=${MODULE} --submodToDevice "" --benchmarkDontRebatchSamples=true --doNotRandomizeSampleInputs=true
```

Reviewed By: henryoier

Differential Revision: D80062244

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160454
Approved by: https://github.com/henryoier
2025-08-15 03:42:39 +00:00
214d04833a [PT2]: Add Static Dispatch Kernel for fmod.Scalar (#160654)
Summary: Add static dispatch for torch.ops.aten.fmod.Scalar. Found this missing in user/object nets for DSNN models.

Test Plan:
```
MODEL_TYPE=dpa_product_first_ctr_model
MODEL_ENTITY_ID=892669089
SNAPSHOT_ID=36
MODULE=user
SUFFIX=.predictor.precompute.remote_request_only

buck2 run mode/opt caffe2/torch/fb/model_transform/fx2trt/packaging:load_net_predictor -- --loadMode=BenchmarkByOp --inputNetFile=/data/users/$USER/models/${MODEL_ENTITY_ID}/${SNAPSHOT_ID}/${MODEL_ENTITY_ID}_${SNAPSHOT_ID}${SUFFIX} --moduleName=${MODULE} --submodToDevice="" --benchmarkEnableProfiling=true --benchmarkDontRebatchSamples=true --doNotRandomizeSampleInputs=true --benchmarkNumIterations=1000
```

Object tower: P1904347784
User tower: P1904348406

Differential Revision: D80238495

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160654
Approved by: https://github.com/henryoier
2025-08-15 03:11:48 +00:00
9c5601ecc3 [NVIDIA] Refactor Family Blackwell Support codegen (#156176)
With the legacy driver (nvgpu) used for CUDA 12.9, Thor was operating with SM 10.1.
This changes to SM 11.0 when the newer driver model (OpenRM), which is intended for CUDA 13.0, is introduced.
Thor: SM 10.1 --> SM 11.0
Spark: SM 12.1
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156176
Approved by: https://github.com/ezyang
2025-08-15 02:51:26 +00:00
5b9ad951f8 [BE][Docker] Do not install cuda:11.8 (#160695)
As CUDA-11.8 binary are no longer produced by CD
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160695
Approved by: https://github.com/huydhn
2025-08-15 02:23:04 +00:00
4d5f92aa39 typing tvm.py (#160369)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160369
Approved by: https://github.com/Skylion007
ghstack dependencies: #160362, #160363, #160364, #160365, #160366, #160367, #160368
2025-08-15 02:09:31 +00:00
39ca0ce0c8 Type backend torchxla (#160368)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160368
Approved by: https://github.com/Skylion007
ghstack dependencies: #160362, #160363, #160364, #160365, #160366, #160367
2025-08-15 02:09:31 +00:00
d52bb67ac3 typing registry.py (#160367)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160367
Approved by: https://github.com/Skylion007
ghstack dependencies: #160362, #160363, #160364, #160365, #160366
2025-08-15 02:09:31 +00:00
05b9b63fb6 typing inductor and placeholder backends (#160366)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160366
Approved by: https://github.com/Skylion007
ghstack dependencies: #160362, #160363, #160364, #160365
2025-08-15 02:09:31 +00:00
453cfa5153 typing distributed.py (#160365)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160365
Approved by: https://github.com/StrongerXi
ghstack dependencies: #160362, #160363, #160364
2025-08-15 02:09:31 +00:00
9faca5f260 typing debugging.py (#160364)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160364
Approved by: https://github.com/Skylion007
ghstack dependencies: #160362, #160363
2025-08-15 02:09:31 +00:00
6fe6dd9fdc Type cudagraphs.py (#160363)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160363
Approved by: https://github.com/StrongerXi
ghstack dependencies: #160362
2025-08-15 02:09:31 +00:00
f82c7eed84 Typing for common.py (#160362)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160362
Approved by: https://github.com/Skylion007
2025-08-15 02:09:31 +00:00
25ccc4716e [Inductor] [Triton] Apply feedback to Enable padded stride support (#160614)
Summary:
This is an issue I noticed while fixing tests for TMA store: the triton.language.make_tensor_descriptor call hardcodes the shape information as the stride, which is not necessarily correct.

In particular, it's legal to have a stride bigger than the shape (e.g. padded to a size). A good example of this usage would be allocating a tensor whose row stride is always a multiple of 16 and just padding the result so TMA is legal.
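
A concrete illustration of such a padded layout in plain PyTorch (the padding-to-16 choice is just an example):
```python
import torch

storage = torch.empty(100 * 112)
t = storage.as_strided((100, 100), (112, 1))  # row stride 112 > row length 100
print(t.shape, t.stride())  # torch.Size([100, 100]) (112, 1)
```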

This is redo of https://github.com/pytorch/pytorch/pull/160493 because I broke this accidentally trying to land internally first instead of merging through Github directly.

Test Plan:
Tested with `buck2 run mode/opt-split-dwarf mode/inplace -c fbcode.nvcc_arch=h100 caffe2/test/inductor:max_autotune 2>&1 | tee ~/test_logs.log` and confirmed all max autotune tests passed.

Differential Revision: D80224578

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160614
Approved by: https://github.com/eellison
2025-08-15 02:06:14 +00:00
d387a48c38 [generator] Raise StopIteration(value) with value from the return stmt (#157152)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157152
Approved by: https://github.com/zou3519
ghstack dependencies: #157148
2025-08-15 01:42:40 +00:00
831e85104a [contextlib] Fixes for CPython contextlib tests (#157148)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157148
Approved by: https://github.com/zou3519
2025-08-15 01:42:40 +00:00
211c98859a [inductor][triton] Update triton_builtin handling after triton # 7239 (#160658)
https://github.com/triton-lang/triton/pull/7239 will search for a _semantic kwarg in the signature of the function before passing in this kwarg. To fix this in Inductor:

1. explicitly take a _semantic kwarg
2. remove the functools.wraps around the wrapper function, which was causing inspect.signature to return the signature of the wrapped function (instead of the signature of the wrapper, which does contain the _semantic arg)
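
A generic sketch of the fix (illustrative, not Inductor's actual code): the wrapper declares _semantic explicitly and deliberately avoids functools.wraps, so inspect.signature reports the wrapper's own signature and the _semantic kwarg is visible:
```python
import inspect

def make_wrapper(fn):
    def wrapper(*args, _semantic=None, **kwargs):  # explicit _semantic kwarg
        # No functools.wraps(fn) here: the signature must reflect the
        # wrapper itself, not the wrapped function.
        return fn(*args, **kwargs)
    return wrapper

w = make_wrapper(lambda x: x)
print("_semantic" in inspect.signature(w).parameters)  # True
```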

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160658
Approved by: https://github.com/PaulZhang12, https://github.com/njriasan
2025-08-15 00:39:24 +00:00
dae7710bf2 [cuda][cupy] Improve cupy device placement when device is provided with explicit index (#158529)
resubmit https://github.com/pytorch/pytorch/pull/158320 , fixing a potential bug when device index is not specified explicitly.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158529
Approved by: https://github.com/ezyang
2025-08-15 00:27:42 +00:00
dc194a3096 Test multiprocessing spawn timing fix (#160672)
Submitting PR to fix #160511.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160672
Approved by: https://github.com/mikaylagawarecki
2025-08-15 00:11:55 +00:00
4051b42c29 [ROCm] hipify needs specific header mappings (#160675)
Fixes #160579.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160675
Approved by: https://github.com/ScottTodd, https://github.com/jeffdaily

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
2025-08-15 00:09:04 +00:00
eb0eaa67e1 [BE][ci] Increase frequency of cutlass backend ci (#160656)
* increase frequency from every 24 hours to every 12 hours
* automatically enable it if cutlass backend files are touched.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160656
Approved by: https://github.com/eellison
2025-08-14 23:44:55 +00:00
98373e5ad2 [doc] AOTI debugging guide (#160430)
Folded from https://discuss.pytorch.org/t/a-beginners-guide-to-debugging-aot-inductor-cuda-illegal-memory-access/222188

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160430
Approved by: https://github.com/angelayi
2025-08-14 23:42:17 +00:00
371eacb2ae [Dynamo][Hierarchical Compile] Refactor for tuple flattening (#158810)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/158810
Approved by: https://github.com/StrongerXi
2025-08-14 22:45:44 +00:00
3650989e6e Revert "[cutlass] fix dictionary iteration error (#160552)"
This reverts commit 29d20d49f0b7f4e362e1cefdcdc4b5659969312c.

Reverted https://github.com/pytorch/pytorch/pull/160552 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/160552#issuecomment-3189940880))
2025-08-14 21:41:28 +00:00
3be70dc30e [inductor] dont reuse buffers if it affects peak (#145883) (#159530)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/159530
Approved by: https://github.com/eellison
2025-08-14 21:14:36 +00:00
47a1db823d [triton_heuristics] Optimize the triton launcher in pt2 (#160000)
Summary:

(Original author: Xu Zhao. Commandeered by David to land this since it is relatively urgent)

We observed ~10us PT2-Triton launch overhead regression after pin update.

Before Triton pin-update:
 {F1980557238}

After Triton pin-update:
 {F1980557240}

The root cause is that https://github.com/pytorch/pytorch/pull/145051 added `_get_args_with_constexprs` to the cubin launcher caller function, which is on the critical path.

The motivation for `_get_args_with_constexprs` was that between triton 3.2 and triton 3.3, the convention for calling Triton kernels (at the level that non-static-cuda-launcher inductor integrates) changed. Previously, the callable did not take constexpr arguments as parameters; after 3.3, it does. With pointwise/reduction kernels, we don't know the constexpr values until after autotuning occurs; so `_get_args_with_constexprs` would inject constexprs into the arguments list before calling the Triton kernel. The fix (in this PR) is to instead inject the constexpr args into the launcher string - this avoids the cost of sorting/reordering arguments which previously occurred upon execution of each kernel.

Note that static_cuda_launcher.py does not require constants to be passed to the cubin launcher (e96c7c4bb0/torch/_inductor/runtime/static_cuda_launcher.py (L220)), so there is no need to pass constexprs to the generated launcher code.
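
A simplified illustration of the idea (generic Python, not the actual Inductor codegen): the constexpr values are baked into the launcher source once at compile time, so nothing is sorted or injected per call:
```python
def make_launcher_src(runtime_args, constexprs):
    # e.g. {"BLOCK": 128} becomes a literal in the generated call site
    const_kwargs = ", ".join(f"{k}={v!r}" for k, v in constexprs.items())
    args = ", ".join(runtime_args)
    return (
        f"def launcher({args}, kernel):\n"
        f"    return kernel({args}, {const_kwargs})\n"
    )

print(make_launcher_src(["in_ptr", "out_ptr", "numel"], {"BLOCK": 128}))
```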

The new launcher code needs to work on three cases:
- StaticallyLaunchedCudaKernel
- triton.compile.CompiledKernel
- AOTInductor

Analysis: https://docs.google.com/document/d/1PHaSmx2w59K8qpjw5_qzKWShfEgptf_Zpv_DL7YxiWU/edit?tab=t.0

Test Plan:
Before:
```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only BERT_pytorch --performance --backend=inductor --training --amp --disable-cudagraphs

1.893x
```

```

$ buck2 run mode/opt //pytorch/tritonbench:run -- --op launch_latency
  x_val    nop_python_function-walltime    nop_triton_kernel-walltime    nop_triton_compiled_kernel_run-walltime    nop_inductor_kernel-walltime    nop_inductor_kernel_cudagraph-walltime
-------  ------------------------------  ----------------------------  -----------------------------------------  ------------------------------  ----------------------------------------
      0                      0.00760921                       1.80298                                   0.623282                         5.25024                                  0.203722
     19                      0.00799885                       4.78223                                   1.00226                          5.8213                                   0.239084
average                      0.00780403                       3.29261                                   0.812769                         5.53577                                  0.221403
```

After:

```
buck2 run mode/opt //pytorch/tritonbench:run -- --op launch_latency
  x_val    nop_python_function-walltime    nop_triton_kernel-walltime    nop_triton_compiled_kernel_run-walltime    nop_inductor_kernel-walltime    nop_inductor_kernel_cudagraph-walltime
-------  ------------------------------  ----------------------------  -----------------------------------------  ------------------------------  ----------------------------------------
      0                      0.00747067                       1.92589                                   0.726509                         4.35459                                  0.204205
     19                      0.00747823                       7.36852                                   1.26241                          6.28208                                  0.239278
average                      0.00747445                       4.6472                                    0.994459                         5.31834                                  0.221741
```

```
$ buck2 run mode/opt //pytorch/benchmark:pt2 -- --only BERT_pytorch --performance --backend=inductor --training --amp --disable-cudagraphs

1.985x
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160000
Approved by: https://github.com/jansel, https://github.com/mlazos

Co-authored-by: Xu Zhao <xzhao9@meta.com>
2025-08-14 21:04:08 +00:00
eac2d9d695 Revert "appending the pythonpath (#160219)"
This reverts commit 1d80d697a269234b47ec7ede192faf3bb9b159e3.

Reverted https://github.com/pytorch/pytorch/pull/160219 on behalf of https://github.com/clee2000 due to broke inductor? [GH job link](https://github.com/pytorch/pytorch/actions/runs/16970222746/job/48108262003) [HUD commit link](1d80d697a2) ([comment](https://github.com/pytorch/pytorch/pull/160219#issuecomment-3189850381))
2025-08-14 20:58:14 +00:00
3fe19a7a0a [Test Fix] Delete dynamo skipfile for OpenMP test_one_thread (#160562)
Fixes #120648

During issue scrubbing I could not repro these failing tests, so I am re-enabling them to close out the issue.

### Test
Original repro command:
```
 PYTORCH_TEST_WITH_DYNAMO=1 pytest test/test_openmp.py -v -k test_one_thread
```

Now results in
```
platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 -- /home/lucaskabela/.conda/envs/pytorch-3.12/bin/python3.12
cachedir: .pytest_cache
hypothesis profile 'default'
rootdir: /home/lucaskabela/pytorch
configfile: pytest.ini
plugins: hypothesis-6.138.0
collected 2 items / 1 deselected / 1 selected
Running 1 items in this shard

test/test_openmp.py::TestOpenMP_ParallelFor::test_one_thread PASSED [3.6874s]                                                       [100%]

===================================================== 1 passed, 1 deselected in 6.07s =====================================================
```

And:
```
PYTORCH_TEST_WITH_DYNAMO=1 python test/test_openmp.py TestOpenMP_ParallelFor.test_one_thread
```
```
PYTORCH_TEST_WITH_DYNAMO=1 python test/test_sort_and_select.py TestSortAndSelectCPU.test_sort_overflow_cpu_int16
```

Both result in:
```
.
----------------------------------------------------------------------
Ran 1 test in 0.003s
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160562
Approved by: https://github.com/zou3519
2025-08-14 20:55:59 +00:00
4a90dc0c1f Update checkpoint warning to target PyTorch 2.9 (#160643)
Fixes #160534

Updates the warning in torch.utils.checkpoint to state that starting in PyTorch 2.9, calling checkpoint without explicitly passing use_reentrant will raise an exception. Follows the guidance from the issue discussion.
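
For example, passing the argument explicitly (use_reentrant=False is the generally recommended value):
```python
import torch
from torch.utils.checkpoint import checkpoint

x = torch.randn(8, 8, requires_grad=True)
y = checkpoint(lambda t: t.sin().cos(), x, use_reentrant=False)
y.sum().backward()
```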

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160643
Approved by: https://github.com/soulitzer
2025-08-14 20:53:17 +00:00
1fc683cf17 [Inductor] Allow indexing a flexible layout for extract_input_node_reduction_ranges (#160645)
Differential Revision: D79831747

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160645
Approved by: https://github.com/eellison
2025-08-14 20:43:35 +00:00
b9d7de3a09 [Inductor] addmm + activation function fusion (#158137)
This PR implements a post_grad pass to fuse activation(add + mm).

Something similar was previously done in #106912 but was reverted for performance reasons; it was replaced with a pass that unfuses the activation and add from addmm/addmm_activation and lets Inductor handle the fusion.

However, since then the cuBLAS team has made a lot of perf improvements here. I will update this post with more benchmarks, but preliminary benchmarks show good results.

Perf dashboard:
<img width="3371" height="1240" alt="Screenshot from 2025-08-07 13-41-35" src="https://github.com/user-attachments/assets/d44d6205-b33a-4a20-9f0f-d9db176b3738" />

ReLU works with both training and inference, but GELU only works in inference mode due to a fundamental limitation: GELU's derivative depends on its input and ReLU's doesn't. I don't think this is fixable with the current addmm_activation API.
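
As a numeric sanity check of the fused op this pass targets (the `_addmm_activation` call and its `use_gelu` kwarg are taken from the graphs below; the op is a private ATen operator):
```python
import torch

bias, mat1, mat2 = torch.randn(4), torch.randn(2, 3), torch.randn(3, 4)

fused = torch.ops.aten._addmm_activation(bias, mat1, mat2)  # ReLU epilogue
ref = torch.relu(torch.addmm(bias, mat1, mat2))
print(torch.allclose(fused, ref))  # True

# GELU epilogue (tanh approximation); inference only, per the note above
fused_gelu = torch.ops.aten._addmm_activation(bias, mat1, mat2, use_gelu=True)
```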

Graph module before and after this pass

Relu(addmm)
```
graph():
    %primals_1 : [num_users=1] = placeholder[target=primals_1]
    %primals_2 : [num_users=2] = placeholder[target=primals_2]
    %primals_3 : [num_users=2] = placeholder[target=primals_3]
    %addmm : [num_users=1] = call_function[target=torch.ops.aten.addmm.default](args = (%primals_1, %primals_3, %primals_2), kwargs = {})
    %relu : [num_users=2] = call_function[target=torch.ops.aten.relu.default](args = (%addmm,), kwargs = {})
    %le : [num_users=1] = call_function[target=torch.ops.aten.le.Scalar](args = (%relu, 0), kwargs = {})
    %permute_1 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%primals_3, [1, 0]), kwargs = {})
    return (relu, primals_2, le, permute_1)
graph():
    %primals_1 : [num_users=1] = placeholder[target=primals_1]
    %primals_2 : [num_users=2] = placeholder[target=primals_2]
    %primals_3 : [num_users=2] = placeholder[target=primals_3]
    %_addmm_activation_default : [num_users=2] = call_function[target=torch.ops.aten._addmm_activation.default](args = (%primals_1, %primals_3, %primals_2), kwargs = {})
    %le : [num_users=1] = call_function[target=torch.ops.aten.le.Scalar](args = (%_addmm_activation_default, 0), kwargs = {})
    %permute_1 : [num_users=1] = call_function[target=torch.ops.aten.permute.default](args = (%primals_3, [1, 0]), kwargs = {})
    return (_addmm_activation_default, primals_2, le, permute_1)
```
Gelu (addmm)
```
graph():
    %arg0_1 : [num_users=1] = placeholder[target=arg0_1]
    %arg1_1 : [num_users=1] = placeholder[target=arg1_1]
    %arg2_1 : [num_users=1] = placeholder[target=arg2_1]
    %addmm : [num_users=4] = call_function[target=torch.ops.aten.addmm.default](args = (%arg0_1, %arg2_1, %arg1_1), kwargs = {})
    %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%addmm, %addmm), kwargs = {})
    %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul, %addmm), kwargs = {})
    %mul_2 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_1, 0.044715), kwargs = {})
    %add : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%addmm, %mul_2), kwargs = {})
    %mul_3 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%add, 0.7978845608028654), kwargs = {})
    %mul_4 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%addmm, 0.5), kwargs = {})
    %tanh : [num_users=1] = call_function[target=torch.ops.aten.tanh.default](args = (%mul_3,), kwargs = {})
    %add_1 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%tanh, 1), kwargs = {})
    %mul_5 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%mul_4, %add_1), kwargs = {})
    return (mul_5,)
graph():
    %arg0_1 : [num_users=1] = placeholder[target=arg0_1]
    %arg1_1 : [num_users=1] = placeholder[target=arg1_1]
    %arg2_1 : [num_users=1] = placeholder[target=arg2_1]
    %_addmm_activation_default : [num_users=1] = call_function[target=torch.ops.aten._addmm_activation.default](args = (%arg0_1, %arg2_1, %arg1_1), kwargs = {use_gelu: True})
    return (_addmm_activation_default,)
```
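
For reference, the decomposed graph above is the tanh approximation of GELU, `gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))`, with `sqrt(2/pi) ≈ 0.7978845608028654` showing up as the constant in `mul_3`; the pass matches this whole sequence and collapses it into a single `_addmm_activation(..., use_gelu=True)` call.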

Benchmark setup:
- NGC PyTorch 25.06 container
- cuBLAS version: 12.9.1.4
- torch.compile run with dynamic=False and max_autotune

H100
```
Testing with M=1024, N=1024, K=1024, dtype=bfloat16
============================================================
Average Time per Iteration (cublas):	 0.0107 ms
Average Time per Iteration (torch compile):	 0.0296 ms

============================================================
Testing with M=2048, N=2048, K=2048, dtype=bfloat16
============================================================
Average Time per Iteration (cublas):	 0.0262 ms
Average Time per Iteration (torch compile):	 0.0327 ms

============================================================
Testing with M=4096, N=4096, K=4096, dtype=bfloat16
============================================================
Average Time per Iteration (cublas):	 0.1763 ms
Average Time per Iteration (torch compile):	 0.2457 ms

============================================================
Testing with M=8192, N=8192, K=8192, dtype=bfloat16
============================================================
Average Time per Iteration (cublas):	 1.5280 ms
Average Time per Iteration (torch compile):	 1.9437 ms
```

A100
```
############################################################
Testing with dtype: float16
############################################################

============================================================
Testing with M=1024, N=1024, K=1024, dtype=float16
============================================================
Average Time per Iteration (cublas):	 0.0313 ms
Average Time per Iteration (torch compile):	 0.0643 ms

============================================================
Testing with M=2048, N=2048, K=2048, dtype=float16
============================================================
Average Time per Iteration (cublas):	 0.1149 ms
Average Time per Iteration (torch compile):	 0.1255 ms

============================================================
Testing with M=4096, N=4096, K=4096, dtype=float16
============================================================
Average Time per Iteration (cublas):	 0.6297 ms
Average Time per Iteration (torch compile):	 0.7547 ms

============================================================
Testing with M=8192, N=8192, K=8192, dtype=float16
============================================================
Average Time per Iteration (cublas):	 4.3821 ms
Average Time per Iteration (torch compile):	 5.0740 ms
```

Script
```py
import torch
torch.manual_seed(0)

warmup, numrun = 10, 100

sizes = [1024, 2048, 4096, 8192]
dtypes = [torch.float16, torch.bfloat16, torch.float32]

device = torch.device("cuda")

for dtype in dtypes:
    dtype_name = str(dtype).split('.')[-1]
    print(f"\n{'#'*60}")
    print(f"Testing with dtype: {dtype_name}")
    print(f"{'#'*60}")

    for size in sizes:
        M, N, K = size, size, size
        print(f"\n{'='*60}")
        print(f"Testing with M={M}, N={N}, K={K}, dtype={dtype_name}")
        print(f"{'='*60}")

        A = torch.randn(M, K, device=device, dtype=dtype)
        B = torch.randn(K, N, device=device, dtype=dtype)
        C = torch.randn(M, device=device, dtype=dtype)

        def func1():
            # cuBLAS fused-epilogue path via torch._addmm_activation
            return torch._addmm_activation(C, A, B, use_gelu=True)

        def func2():
            # Unfused reference: add + mm + tanh-approx gelu, left for torch.compile to fuse
            return torch.nn.functional.gelu(torch.add(C, torch.mm(A, B)), approximate="tanh")

        func2_compiled = torch.compile(
            func2,
            dynamic=False,
            options={
                "force_disable_caches": True,
                "max_autotune": True,
                "max_autotune_gemm": True,
                "max_autotune_gemm_backends": "TRITON",
                "autotune_fallback_to_aten": False,
            }
        )

        for _ in range(warmup): func1()
        torch.cuda.synchronize(device=device)

        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        total_time_ms = 0.0
        start_event.record()
        for _ in range(numrun): func1()
        end_event.record()
        torch.cuda.synchronize(device=device)
        total_time_ms += start_event.elapsed_time(end_event)
        avg_time_ms = total_time_ms / numrun

        print(f"Average Time per Iteration (cublas):\t {avg_time_ms:.4f} ms")

        for _ in range(warmup): func2_compiled()
        torch.cuda.synchronize(device=device)

        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        total_time_ms = 0.0
        start_event.record()
        for _ in range(numrun): func2_compiled()
        end_event.record()
        torch.cuda.synchronize(device=device)
        total_time_ms += start_event.elapsed_time(end_event)
        avg_time_ms = total_time_ms / numrun

        print(f"Average Time per Iteration (torch compile):\t {avg_time_ms:.4f} ms")
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/158137
Approved by: https://github.com/eellison
2025-08-14 20:41:38 +00:00
1028c5e2d5 [Dynamo] Add CPython default dict tests (#155263)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/155263
Approved by: https://github.com/zou3519
2025-08-14 20:22:22 +00:00
19b4283884 Typo correction in variable name uninitalized_val in resize() function (#160636)
Fixes #160633

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160636
Approved by: https://github.com/mikaylagawarecki, https://github.com/Skylion007
2025-08-14 20:11:43 +00:00
8d6d324631 [Dynamo][Hierarchical-Compile] Don't allow node duplicates to be added (#160605)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/160605
Approved by: https://github.com/StrongerXi
2025-08-14 20:02:10 +00:00
fdfd69bb05 Set PYTHONHOME for inductor subprocesses using torch (#160008)
This is needed for subprocesses that are trying to call back into torch functionality, i.e. anything that's also setting `PYTHONPATH`.  If they're part of an application that bundles the Python runtime, then they should use the bundled runtime to keep their view of the world consistent.

There are more `sys.executable` subprocesses in torch/ but it seems like they're fine.

A previous PR at https://github.com/pytorch/pytorch/pull/159382 was reverted because it caused macOS jobs on GitHub to time out: inductor subprocesses were scheduling C++ compilation tasks that failed to find the Python.h header, because they were running in venvs and were now looking for the CPython headers inside the venv, where they do not exist. This PR gates the new behavior to internal builds only.
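
A rough sketch of the idea (illustrative only; the real change lives in the inductor subprocess-launching code, and using `sys.base_prefix` as the bundled-runtime root is an assumption):

```py
import os
import subprocess
import sys

env = dict(os.environ)
# A child that imports torch should see the same interpreter world as the
# parent: the same module search path and, for applications that bundle
# the Python runtime, the same home so stdlib/header lookup stays consistent.
env["PYTHONPATH"] = os.pathsep.join(sys.path)
env["PYTHONHOME"] = sys.base_prefix  # assumption: bundled runtime root

subprocess.run([sys.executable, "-c", "import torch"], env=env, check=True)
```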

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160008
Approved by: https://github.com/aorenste
2025-08-14 19:57:14 +00:00
259 changed files with 5927 additions and 2573 deletions

View File

@ -92,6 +92,7 @@ def package_cuda_wheel(wheel_path, desired_cuda) -> None:
"/usr/local/cuda/lib64/libnccl.so.2",
"/usr/local/cuda/lib64/libnvJitLink.so.12",
"/usr/local/cuda/lib64/libnvrtc.so.12",
"/usr/local/cuda/lib64/libnvshmem_host.so.3",
"/usr/local/cuda/lib64/libcudnn_adv.so.9",
"/usr/local/cuda/lib64/libcudnn_cnn.so.9",
"/usr/local/cuda/lib64/libcudnn_graph.so.9",
@ -209,8 +210,6 @@ if __name__ == "__main__":
# MAX_JOB=5 is not required for CPU backend (see commit 465d98b)
if enable_cuda:
build_vars += "MAX_JOBS=5 "
# nvshmem is broken for aarch64 see https://github.com/pytorch/pytorch/issues/160425
build_vars += "USE_NVSHMEM=OFF "
override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
desired_cuda = os.getenv("DESIRED_CUDA")

View File

@ -76,7 +76,6 @@ ADD ./common/install_mnist.sh install_mnist.sh
RUN bash ./install_mnist.sh
FROM base as all_cuda
COPY --from=cuda11.8 /usr/local/cuda-11.8 /usr/local/cuda-11.8
COPY --from=cuda12.6 /usr/local/cuda-12.6 /usr/local/cuda-12.6
COPY --from=cuda12.8 /usr/local/cuda-12.8 /usr/local/cuda-12.8
COPY --from=cuda12.9 /usr/local/cuda-12.9 /usr/local/cuda-12.9

View File

@ -0,0 +1,2 @@
transformers==4.54.0
soxr==0.5.0

View File

@ -1 +0,0 @@
v4.54.0

View File

@ -1 +1 @@
ae324eeac8e102a2b40370e341460f3791353398
0958dc9b2bb815e428f721f9da599dab0dc1c5d7

View File

@ -10,7 +10,7 @@ else
arch_path='sbsa'
fi
NVSHMEM_VERSION=3.3.9
NVSHMEM_VERSION=3.3.20
function install_cuda {
version=$1
@ -62,14 +62,16 @@ function install_nvshmem {
mkdir -p "${tmpdir}" && cd "${tmpdir}"
# nvSHMEM license: https://docs.nvidia.com/nvshmem/api/sla.html
filename="libnvshmem_cuda${cuda_major_version}-linux-${arch_path}-${nvshmem_version}"
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}.tar.gz"
# This pattern is a lie as it is not consistent across versions, for 3.3.9 it was cuda_ver-arch-nvshmem-ver
filename="libnvshmem-linux-${arch_path}-${nvshmem_version}_cuda${cuda_major_version}-archive"
suffix=".tar.xz"
url="https://developer.download.nvidia.com/compute/redist/nvshmem/${nvshmem_version}/builds/cuda${cuda_major_version}/txz/agnostic/${dl_arch}/${filename}${suffix}"
# download, unpack, install
wget -q "${url}"
tar xf "${filename}.tar.gz"
cp -a "libnvshmem/include/"* /usr/local/cuda/include/
cp -a "libnvshmem/lib/"* /usr/local/cuda/lib64/
tar xf "${filename}${suffix}"
cp -a "${filename}/include/"* /usr/local/cuda/include/
cp -a "${filename}/lib/"* /usr/local/cuda/lib64/
# cleanup
cd ..

View File

@ -5,9 +5,7 @@ set -ex
source "$(dirname "${BASH_SOURCE[0]}")/common_utils.sh"
function install_huggingface() {
local version
commit=$(get_pinned_commit huggingface)
pip_install "git+https://github.com/huggingface/transformers@${commit}"
pip_install -r huggingface-requirements.txt
}
function install_timm() {
@ -26,9 +24,6 @@ function install_torchbench() {
python install.py --continue_on_fail
# soxr comes from https://github.com/huggingface/transformers/pull/39429
pip install transformers==4.54.0 soxr==0.5.0
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd

View File

@ -96,11 +96,11 @@ ARG ANACONDA_PYTHON_VERSION
ENV ANACONDA_PYTHON_VERSION=$ANACONDA_PYTHON_VERSION
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
# (optional) Install non-default Ninja version
ARG NINJA_VERSION

View File

@ -56,10 +56,10 @@ RUN rm install_openssl.sh
ARG INDUCTOR_BENCHMARKS
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/timm.txt timm.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt
# Install XPU Dependencies
ARG XPU_VERSION

View File

@ -96,11 +96,11 @@ RUN rm install_openssl.sh
ARG INDUCTOR_BENCHMARKS
COPY ./common/install_inductor_benchmark_deps.sh install_inductor_benchmark_deps.sh
COPY ./common/common_utils.sh common_utils.sh
COPY ci_commit_pins/huggingface.txt huggingface.txt
COPY ci_commit_pins/huggingface-requirements.txt huggingface-requirements.txt
COPY ci_commit_pins/timm.txt timm.txt
COPY ci_commit_pins/torchbench.txt torchbench.txt
RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt torchbench.txt
RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface-requirements.txt torchbench.txt
ARG TRITON
ARG TRITON_CPU

View File

@ -62,7 +62,7 @@ class VllmBuildParameters:
)
# OUTPUT_DIR: where docker buildx (local exporter) will write artifacts
output_dir: Path = env_path_field("OUTPUT_DIR", "shared")
output_dir: Path = env_path_field("OUTPUT_DIR", "external/vllm")
# --- Build args ----------------------------------------------------------
target_stage: str = env_str_field("TARGET_STAGE", "export-wheels")

View File

@ -134,7 +134,7 @@ if [[ $CUDA_VERSION == 12* ]]; then
"/usr/local/cuda/lib64/libnvrtc-builtins.so"
"/usr/local/cuda/lib64/libcufile.so.0"
"/usr/local/cuda/lib64/libcufile_rdma.so.1"
"/usr/local/cuda/lib64/libnvshem_host.so.3"
"/usr/local/cuda/lib64/libnvshmem_host.so.3"
"/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12"
"/usr/local/cuda/extras/CUPTI/lib64/libnvperf_host.so"
)

View File

@ -174,13 +174,15 @@ checkout_install_torchbench() {
# to install and test other models
python install.py --continue_on_fail
fi
popd
# soxr comes from https://github.com/huggingface/transformers/pull/39429
pip install transformers==4.54.0 soxr==0.5.0
pip install -r .ci/docker/ci_commit_pins/huggingface-requirements.txt
# https://github.com/pytorch/pytorch/issues/160689 to remove torchao because
# its current version 0.12.0 doesn't work with transformers 4.54.0
pip uninstall -y torchao
echo "Print all dependencies after TorchBench is installed"
python -mpip freeze
popd
}
torchbench_setup_macos() {

View File

@ -1701,7 +1701,7 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then
fi
elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper* ]]; then
install_torchvision
PYTHONPATH=/torchbench:$PYTHONPATH test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
PYTHONPATH=/torchbench test_inductor_cpp_wrapper_shard "$SHARD_NUMBER"
if [[ "$SHARD_NUMBER" -eq "1" ]]; then
test_inductor_aoti
fi

View File

@ -133,6 +133,25 @@ EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
RENAME_WHEEL=true
case $desired_python in
3.14t)
echo "Using 3.14 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="=2.1.0"
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
RENAME_WHEEL=false
;;
3.14)
echo "Using 3.14t deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"
PYYAML_PINNED_VERSION=">=6.0.1"
NUMPY_PINNED_VERSION="=2.1.0"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
RENAME_WHEEL=false
;;
3.13t)
echo "Using 3.13 deps"
SETUPTOOLS_PINNED_VERSION=">=70.1.0"

View File

@ -0,0 +1,80 @@
# .github/workflows/build-external.yml
name: Build External packages
description: build external packages for PyTorch
inputs:
cuda-arch-list:
description: TORCH_CUDA_ARCH_LIST (e.g., "8.0;8.9;9.0")
type: string
required: true
default: ""
docker-image:
description: Base image to use
type: string
required: true
build-targets:
description: Build targets
type: string
required: true
torch-wheel-dir:
description: Directory to built torch wheel
type: string
required: false
default: dist
output-dir:
description: Directory to store build artifact
default: external
type: string
required: false
outputs:
build_time:
description: "Total build time in seconds"
value: ${{ steps.build-external.outputs.build_time }}
output_dir:
description: "Directory where build artifact is stored"
value: ${{ steps.build-external.outputs.output_dir }}
runs:
using: composite
steps:
- name: Build external packages in sequence
id: build-external
env:
SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2
SCCACHE_REGION: us-east-1
TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }}
BASE_IMAGE: ${{ inputs.docker-image }}
BUILD_TARGETS: ${{ inputs.build-targets }}
PARENT_OUTPUT_DIR: ${{ inputs.output-dir}}
shell: bash
run: |
set -euo pipefail
python3 --version
docker images
START_TIME=$(date +%s)
(
cd .ci/lumen_cli
python3 -m pip install -e .
)
MAX_JOBS="$(nproc --ignore=6)"
export MAX_JOBS
# Split the comma-separated list and build each target
IFS=',' read -ra TARGETS <<< "$BUILD_TARGETS"
for target in "${TARGETS[@]}"; do
OUTPUT_DIR="$PARENT_OUTPUT_DIR/$target"
export OUTPUT_DIR
echo "Building external package: $target in directory $OUTPUT_DIR"
python3 -m cli.run build external "$target"
done
END_TIME=$(date +%s)
{
echo "build_time=$((END_TIME - START_TIME))"
if [ -d "$PARENT_OUTPUT_DIR" ]; then
echo "output_dir=$PARENT_OUTPUT_DIR"
fi
} >> "$GITHUB_OUTPUT"

View File

@ -1 +1 @@
bdb88e1d66f272cad72156c90ac8428ca61a601c
02351a683668dd65bc82343e55245e308eb97b4e

View File

@ -1 +1 @@
0ca2393b47e72c4424a49aa3b32c7c5d0e378a72
070da660c1bf9e7a7be8b9efeff4b06f91c7342f

.github/dependabot.yml
View File

@ -0,0 +1,20 @@
version: 2
updates:
# Update to the latest transformers version with dependabot
- package-ecosystem: "pip"
directory: "/.ci/docker/ci_commit_pins"
schedule:
interval: "daily"
target-branch: "main"
allow:
- dependency-name: "transformers"
commit-message:
prefix: "[Dependabot] Update"
include: "scope"
labels:
- "dependencies"
- "open source"
- "python"
- "topic: not user facing"
- "module: ci"
- "module: inductor"

View File

@ -27,6 +27,7 @@ ciflow_push_tags:
- ciflow/trunk
- ciflow/unstable
- ciflow/xpu
- ciflow/vllm
- ciflow/torchbench
- ciflow/op-benchmark
- ciflow/pull

View File

@ -54,7 +54,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -71,7 +71,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -88,7 +88,7 @@ PYTORCH_EXTRA_INSTALL_REQUIREMENTS = {
"nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | "
"nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'"
@ -315,7 +315,7 @@ def generate_wheels_matrix(
if gpu_arch_type == "cpu-s390x" and python_version == "3.13t":
continue
# TODO: Enable python 3.14 on non linux OSes
if os != "linux" and (
if os not in ["linux", "linux-aarch64", "macos-arm64"] and (
python_version == "3.14" or python_version == "3.14t"
):
continue

View File

@ -110,12 +110,33 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v

View File

@ -96,6 +96,13 @@ on:
required: false
type: string
default: ""
build-external-packages:
description: |
If set, builds the given external packages and saves their wheels as artifacts.
Use a comma-separated list of packages to build, e.g. 'vllm,transformers'.
required: false
type: string
default: ""
secrets:
HUGGING_FACE_HUB_TOKEN:
@ -356,6 +363,26 @@ jobs:
END_TIME=$(date +%s)
echo "build_time=$((END_TIME - START_TIME))" >> "$GITHUB_OUTPUT"
- name: Build external packages
id: build-external-packages
if: inputs.build-external-packages != '' && steps.build.outcome != 'skipped'
uses: ./.github/actions/build-external-packages
with:
build-targets: ${{ inputs.build-external-packages }}
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}
cuda-arch-list: ${{ inputs.cuda-arch-list }}
output-dir: external
- name: Move external packages to dist
if: steps.build-external-packages.outputs.output_dir != '' && steps.build-external-packages.outcome != 'skipped'
shell: bash
run: |
src="${{ steps.build-external-packages.outputs.output_dir }}"
if [ -d "$src" ]; then
mkdir -p "dist/$(dirname "$src")"
mv "$src" "dist/$(dirname "$src")/"
fi
- name: Stop monitoring script
if: ${{ always() && steps.monitor-script.outputs.monitor-script-pid }}
shell: bash

View File

@ -136,7 +136,7 @@ jobs:
MONITOR_LOG_INTERVAL: ${{ inputs.monitor-log-interval }}
MONITOR_DATA_COLLECT_INTERVAL: ${{ inputs.monitor-data-collect-interval }}
run: |
"$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_sajson==0.6.7
"$VENV_PATH/bin/python3" -m pip install psutil==5.9.8 dataclasses_json==0.6.7
"$VENV_PATH/bin/python3" -m tools.stats.monitor --log-interval "$MONITOR_LOG_INTERVAL" --data-collect-interval "$MONITOR_DATA_COLLECT_INTERVAL" > usage_log.txt 2>&1 &
echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}"

View File

@ -132,7 +132,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_9-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -243,7 +243,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_10-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -354,7 +354,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_11-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -465,7 +465,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_12-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -576,7 +576,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -687,7 +687,7 @@ jobs:
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_13t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
@ -712,3 +712,225 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cpu-aarch64-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14-cpu-aarch64-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cpu-aarch64-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cpu-aarch64-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cpu-aarch64
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14"
build_name: manywheel-py3_14-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cpu-aarch64-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cpu-aarch64-test: # Testing
if: ${{ github.repository_owner == 'pytorch' }}
needs:
- manywheel-py3_14t-cpu-aarch64-build
- get-label-type
uses: ./.github/workflows/_binary-test-linux.yml
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cpu-aarch64
build_environment: linux-aarch64-binary-manywheel
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.2xlarge
ALPINE_IMAGE: "arm64v8/alpine"
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cpu-aarch64-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cpu-aarch64-test
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu-aarch64
DOCKER_IMAGE: manylinux2_28_aarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cpu-aarch64
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cpu-aarch64
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
manywheel-py3_14t-cuda-aarch64-12_9-build:
if: ${{ github.repository_owner == 'pytorch' }}
uses: ./.github/workflows/_binary-build-linux.yml
needs: get-label-type
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
runs_on: linux.arm64.m7g.4xlarge.ephemeral
ALPINE_IMAGE: "arm64v8/alpine"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
build_environment: linux-aarch64-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
timeout-minutes: 420
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda-aarch64-12_9-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: manywheel-py3_14t-cuda-aarch64-12_9-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: manywheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cu129
GPU_ARCH_VERSION: 12.9-aarch64
GPU_ARCH_TYPE: cuda-aarch64
DOCKER_IMAGE: manylinuxaarch64-builder
DOCKER_IMAGE_TAG_PREFIX: cuda12.9
DESIRED_PYTHON: "3.14t"
build_name: manywheel-py3_14t-cuda-aarch64-12_9
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml

View File

@ -60,7 +60,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing

@@ -127,7 +127,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_6
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_6-test: # Testing
@@ -193,7 +193,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_8
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_8-test: # Testing
@@ -259,7 +259,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_9-cuda12_9
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_9-cuda12_9-test: # Testing
@@ -719,7 +719,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_6
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_6-test: # Testing
@@ -785,7 +785,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_8
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_8-test: # Testing
@@ -851,7 +851,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_10-cuda12_9
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_10-cuda12_9-test: # Testing
@@ -1311,7 +1311,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_6
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_6-test: # Testing
@@ -1377,7 +1377,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_8
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_8-test: # Testing
@@ -1508,7 +1508,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_11-cuda12_9
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_11-cuda12_9-test: # Testing
@@ -1968,7 +1968,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_6
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_6-test: # Testing
@@ -2034,7 +2034,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_8
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_8-test: # Testing
@@ -2100,7 +2100,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_12-cuda12_9
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_12-cuda12_9-test: # Testing
@@ -2560,7 +2560,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_6
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_6-test: # Testing
@@ -2626,7 +2626,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_8
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_8-test: # Testing
@@ -2692,7 +2692,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13-cuda12_9
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13-cuda12_9-test: # Testing
@@ -3152,7 +3152,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_6
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_6-test: # Testing
@@ -3218,7 +3218,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_8
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_8-test: # Testing
@@ -3284,7 +3284,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_13t-cuda12_9
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_13t-cuda12_9-test: # Testing
@@ -3744,7 +3744,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_6
build_environment: linux-binary-manywheel
- PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
+ PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_6-test: # Testing
@ -3810,7 +3810,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_8-test: # Testing
@ -3876,7 +3876,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14-cuda12_9-test: # Testing
@ -4336,7 +4336,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_6
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.6.80; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.6.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.0.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.7.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.1.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.4.2; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.6.77; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.6.85; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.11.1.6; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_6-test: # Testing
@ -4402,7 +4402,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_8
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.8.4.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.3.3.83; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.9.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.3.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.8.90; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.8.93; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.13.1.3; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_8-test: # Testing
@ -4468,7 +4468,7 @@ jobs:
runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
build_name: manywheel-py3_14t-cuda12_9
build_environment: linux-binary-manywheel
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.9; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==9.10.2.21; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.9.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.4.1.4; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.10.19; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.7.5.82; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.5.10.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparselt-cu12==0.7.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.27.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvshmem-cu12==3.3.20; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.9.79; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.9.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufile-cu12==1.14.1.1; platform_system == 'Linux' and platform_machine == 'x86_64'
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
manywheel-py3_14t-cuda12_9-test: # Testing
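The pins above gate each wheel's extra requirements on PEP 508 environment markers. As a quick local illustration (not part of the workflow; assumes the third-party packaging library is installed), the same marker expression can be evaluated directly:

from packaging.markers import Marker

marker = Marker("platform_system == 'Linux' and platform_machine == 'x86_64'")
print(marker.evaluate())  # True on Linux/x86_64 hosts, False elsewhere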


@ -115,12 +115,33 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -239,12 +260,33 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -363,12 +405,33 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -487,12 +550,33 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -611,12 +695,33 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -735,12 +840,33 @@ jobs:
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
if [[ $DESIRED_PYTHON == "3.13t" ]]; then
conda create -yn "test_conda_env" python="3.13" python-freethreading -c conda-forge
SMOKE_TEST_PARAMS="--torch-compile-check disabled"
else
conda create -yn "test_conda_env" python="$DESIRED_PYTHON"
fi
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
@ -774,3 +900,293 @@ jobs:
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
wheel-py3_14-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.14"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
# shellcheck disable=SC2129
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda manually; setup-miniconda changes PATH in a way that breaks the Ruby steps we run later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
# Build
USE_PYTORCH_METAL_EXPORT=1
USE_COREML_DELEGATE=1
TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}"
export USE_PYTORCH_METAL_EXPORT
export USE_COREML_DELEGATE
export TORCH_PACKAGE_NAME
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS}
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: wheel-py3_14-cpu
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
wheel-py3_14-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: wheel-py3_14-cpu-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
DESIRED_PYTHON: "3.14"
build_name: wheel-py3_14-cpu
use_s3: False
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml
wheel-py3_14t-cpu-build:
if: ${{ github.repository_owner == 'pytorch' }}
runs-on: macos-14-xlarge
timeout-minutes: 240
env:
PYTORCH_ROOT: ${{ github.workspace }}/pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
SKIP_ALL_TESTS: 1
DESIRED_PYTHON: "3.14t"
steps:
# NOTE: These environment variables are put here so that they can be applied on every job equally
# They are also here because setting them at a workflow level doesn't give us access to the
# runner.temp variable, which we need.
- name: Populate binary env
shell: bash
run: |
# shellcheck disable=SC2129
echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# shellcheck disable=SC2129
echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}"
- name: Install conda and dependencies
run: |
# Install conda manually; setup-miniconda changes PATH in a way that breaks the Ruby steps we run later on
curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh"
chmod +x "${RUNNER_TEMP}/conda.sh"
/bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda"
echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}"
if [ -d "/Applications/Xcode_14.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
elif [ -d "/Applications/Xcode_13.3.1.app" ]; then
echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}"
fi
- name: Checkout PyTorch
uses: actions/checkout@v4
with:
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
submodules: recursive
path: pytorch
show-progress: false
- name: Clean PyTorch checkout
run: |
# Remove any artifacts from the previous checkouts
git clean -fxd
working-directory: pytorch
- name: Populate binary env
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
"${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh"
- name: Build PyTorch binary
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR"
# Build
USE_PYTORCH_METAL_EXPORT=1
USE_COREML_DELEGATE=1
TORCH_PACKAGE_NAME="${TORCH_PACKAGE_NAME//-/_}"
export USE_PYTORCH_METAL_EXPORT
export USE_COREML_DELEGATE
export TORCH_PACKAGE_NAME
"${PYTORCH_ROOT}/.ci/wheel/build_wheel.sh"
- name: Test PyTorch wheel
run: |
# shellcheck disable=SC1091
source "${RUNNER_TEMP}/anaconda/bin/activate"
set -eux -o pipefail
# shellcheck disable=SC1090
source "${BINARY_ENV_FILE:-/Users/distiller/project/env}"
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
pip uninstall -y "$TORCH_PACKAGE_NAME" || true
# Create new "clean" conda environment for testing
SMOKE_TEST_PARAMS=""
EXTRA_CONDA_INSTALL_FLAGS=""
CONDA_ENV_CREATE_FLAGS=""
# shellcheck disable=SC2153
case $DESIRED_PYTHON in
3.14t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.14)
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge/label/python_rc -c conda-forge"
desired_python="3.14.0rc1"
;;
3.13t)
CONDA_ENV_CREATE_FLAGS="python-freethreading"
EXTRA_CONDA_INSTALL_FLAGS="-c conda-forge"
desired_python="3.13"
;;
*)
# shellcheck disable=SC2153
desired_python=${DESIRED_PYTHON}
;;
esac
# shellcheck disable=SC2086
conda create -yn "test_conda_env" python="$desired_python" ${CONDA_ENV_CREATE_FLAGS} ${EXTRA_CONDA_INSTALL_FLAGS}
conda activate test_conda_env
pip install "$PYTORCH_FINAL_PACKAGE_DIR"/*.whl numpy -v
# shellcheck disable=SC2086
python "${PYTORCH_ROOT}/.ci/pytorch/smoke_test/smoke_test.py" --package torchonly ${SMOKE_TEST_PARAMS}
- uses: actions/upload-artifact@v4.4.0
if: always()
with:
name: wheel-py3_14t-cpu
retention-days: 14
if-no-files-found: error
path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}"
wheel-py3_14t-cpu-upload: # Uploading
if: ${{ github.repository_owner == 'pytorch' }}
permissions:
id-token: write
contents: read
needs: wheel-py3_14t-cpu-build
with:
PYTORCH_ROOT: /pytorch
PACKAGE_TYPE: wheel
# TODO: This is a legacy variable that we eventually want to get rid of in
# favor of GPU_ARCH_VERSION
DESIRED_CUDA: cpu
GPU_ARCH_TYPE: cpu
DOCKER_IMAGE: manylinux2_28-builder
DOCKER_IMAGE_TAG_PREFIX: cpu
DESIRED_PYTHON: "3.14t"
build_name: wheel-py3_14t-cpu
use_s3: False
secrets:
github-token: ${{ secrets.GITHUB_TOKEN }}
uses: ./.github/workflows/_binary-upload.yml


@ -4,9 +4,12 @@ on:
pull_request:
paths:
- .github/workflows/h100-cutlass-backend.yml
- torch/_inductor/codegen/cuda/**
- test/inductor/test_cutlass_backend.py
- test/inductor/test_cutlass_evt.py
workflow_dispatch:
schedule:
- cron: 22 9 * * * # every 24 hours about 2:22am PDT
- cron: 22 9,21 * * * # every 12 hours
push:
tags:
- ciflow/h100-cutlass-backend/*


@ -93,7 +93,7 @@ jobs:
script: |
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running mypy"
ADDITIONAL_LINTRUNNER_ARGS="--take MYPY --all-files" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--take MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
lintrunner-noclang:
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@ -111,9 +111,9 @@ jobs:
CHANGED_FILES="${{ needs.get-changed-files.outputs.changed-files }}"
echo "Running all other linters"
if [ "$CHANGED_FILES" = '*' ]; then
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY --all-files" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT --all-files" .github/scripts/lintrunner.sh
else
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY ${CHANGED_FILES}" .github/scripts/lintrunner.sh
ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT,MYPY,MYPYSTRICT ${CHANGED_FILES}" .github/scripts/lintrunner.sh
fi
quick-checks:

.github/workflows/vllm.yml (new file, 45 lines)

@ -0,0 +1,45 @@
name: vllm-test
on:
push:
tags:
- ciflow/vllm/*
workflow_dispatch:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}
cancel-in-progress: true
permissions:
id-token: write
contents: read
jobs:
get-label-type:
name: get-label-type
uses: pytorch/pytorch/.github/workflows/_runner-determinator.yml@main
if: ${{ (github.event_name != 'schedule' || github.repository == 'pytorch/pytorch') && github.repository_owner == 'pytorch' }}
with:
triggering_actor: ${{ github.triggering_actor }}
issue_owner: ${{ github.event.pull_request.user.login || github.event.issue.user.login }}
curr_branch: ${{ github.head_ref || github.ref_name }}
curr_ref_type: ${{ github.ref_type }}
opt_out_experiments: lf
torch-build-sm89:
name: sm89-vllm-test
uses: ./.github/workflows/_linux-build.yml
needs: get-label-type
with:
build-additional-packages: "vision audio torchao"
build-external-packages: "vllm"
build-environment: linux-jammy-cuda12.8-py3.12-gcc11-sm89
docker-image-name: ci-image:pytorch-linux-jammy-cuda12.8-cudnn9-py3.12-gcc11-vllm
cuda-arch-list: '8.9'
runner: linux.24xlarge.memory
test-matrix: |
{ include: [
{ config: "vllm_basic_correctness_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
{ config: "vllm_basic_models_test", shard: 1, num_shards: 1, runner: "${{ needs.get-label-type.outputs.label-type }}linux.g6.4xlarge.experimental.nvidia.gpu" },
]}
secrets: inherit


@ -121,7 +121,7 @@ inline int64_t legacy_cat_wrap_dim_symint(
const std::vector<std::vector<c10::SymInt>>& tensor_sizes) {
for (auto& sizes : tensor_sizes) {
if (sizes.size() == 1) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[0].sym_eq(0))) {
if (TORCH_GUARD_OR_FALSE(sizes[0].sym_eq(0))) {
continue;
}
}
@ -135,7 +135,7 @@ inline int64_t legacy_cat_wrap_dim(
const MaterializedITensorListRef& tensors) {
for (const Tensor& tensor : tensors) {
if (tensor.dim() == 1) {
if (TORCH_GUARD_SIZE_OBLIVIOUS(tensor.sym_sizes()[0].sym_eq(0))) {
if (TORCH_GUARD_OR_FALSE(tensor.sym_sizes()[0].sym_eq(0))) {
continue;
}
}
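This hunk swaps a size-oblivious guard for TORCH_GUARD_OR_FALSE, which treats an undecidable symbolic comparison as false instead of raising a data-dependent error. A rough Python-side analogue, assuming the guard_or_false helper from torch.fx.experimental.symbolic_shapes (present in recent builds; treat as illustrative):

import torch
from torch.fx.experimental.symbolic_shapes import guard_or_false

def skip_empty_1d(t: torch.Tensor) -> bool:
    # Mirrors legacy_cat_wrap_dim: skip a 1-D tensor only when its sole
    # dimension is provably zero; an undecidable symbolic size yields
    # False rather than an error.
    return t.dim() == 1 and guard_or_false(t.size(0) == 0)

print(skip_empty_1d(torch.empty(0)))  # True
print(skip_empty_1d(torch.ones(3)))   # False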


@ -411,7 +411,8 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) {
Tensor fbgemm_linear_fp16_weight_fp32_activation(
const Tensor& input,
const Tensor& packed_weight,
const std::optional<Tensor>& bias) {
const std::optional<Tensor>& bias,
at::Tensor& output) {
TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated "
"and will be removed in a future PyTorch release.")
@ -436,9 +437,11 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(
// NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions)
const int64_t M = size_to_dim_(input.dim() - 1, input.sizes());
const int64_t N = packed_weight_fp16.numCols();
std::vector<int64_t> output_size = input.sizes().vec();
output_size.back() = N;
Tensor output = at::empty(output_size, input.options().dtype(at::kFloat));
// Resize output Tensor
output.resize_(output_size);
// Call the fp16 gemm interface
fbgemm::cblas_gemm_compute(
@ -460,6 +463,14 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(
return output;
}
Tensor fbgemm_linear_fp16_weight_fp32_activation(
const Tensor& input,
const Tensor& packed_weight,
const std::optional<Tensor>& bias) {
at::Tensor output = at::empty({0}, input.options().dtype(at::kFloat));
return at::native::fbgemm_linear_fp16_weight_fp32_activation(input, packed_weight, bias, output);
}
Tensor fbgemm_linear_fp16_weight(
const Tensor& input,
const Tensor& packed_weight,
@ -468,6 +479,15 @@ Tensor fbgemm_linear_fp16_weight(
input, packed_weight, bias);
}
Tensor fbgemm_linear_fp16_weight(
const Tensor& input,
const Tensor& packed_weight,
const Tensor& bias,
at::Tensor& output) {
return at::native::fbgemm_linear_fp16_weight_fp32_activation(
input, packed_weight, bias, output);
}
#else // USE_FBGEMM
Tensor fbgemm_linear_int8_weight_fp32_activation(
@ -554,6 +574,21 @@ Tensor fbgemm_pack_gemm_matrix_fp16(const Tensor& weight) {
false, "This PyTorch installation was not built with FBGEMM operators");
}
Tensor fbgemm_linear_fp16_weight_fp32_activation(
const Tensor& input,
const Tensor& packed_weight,
const std::optional<Tensor>& bias,
at::Tensor& output) {
TORCH_WARN_ONCE("fbgemm_linear_fp16_weight_fp32_activation is deprecated "
"and will be removed in a future PyTorch release.")
// We make a strong guarantee that models using these operators will have the
// same numerics across different machines. Therefore, we do not provide a
// fallback path and rather fail loudly if we cannot run FBGEMM.
TORCH_CHECK(
false, "This PyTorch installation was not built with FBGEMM operators");
}
Tensor fbgemm_linear_fp16_weight_fp32_activation(
const Tensor& input,
const Tensor& packed_weight,
@ -568,6 +603,21 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation(
false, "This PyTorch installation was not built with FBGEMM operators");
}
Tensor fbgemm_linear_fp16_weight(
const Tensor& input,
const Tensor& packed_weight,
const Tensor& bias,
at::Tensor& output) {
TORCH_WARN_ONCE("fbgemm_linear_fp16_weight is deprecated "
"and will be removed in a future PyTorch release.")
// We make a strong guarantee that models using these operators will have the
// same numerics across different machines. Therefore, we do not provide a
// fallback path and rather fail loudly if we cannot run FBGEMM.
TORCH_CHECK(
false, "This PyTorch installation was not built with FBGEMM operators");
}
Tensor fbgemm_linear_fp16_weight(
const Tensor& input,
const Tensor& packed_weight,


@ -18,6 +18,7 @@
#include <ATen/ops/is_set_to_native.h>
#include <ATen/ops/size_native.h>
#include <ATen/ops/stride_native.h>
#include <ATen/ops/sym_is_contiguous_native.h>
#include <ATen/ops/sym_numel_native.h>
#include <ATen/ops/sym_size_native.h>
#include <ATen/ops/sym_storage_offset_native.h>
@ -57,6 +58,12 @@ c10::SymInt sym_size(const Tensor& self, int64_t dim) {
return self.sym_size(dim);
}
c10::SymBool sym_is_contiguous(
const Tensor& self,
c10::MemoryFormat memory_format) {
return self.sym_is_contiguous(memory_format);
}
c10::SymInt sym_stride(const Tensor& self, int64_t dim) {
return self.sym_stride(dim);
}
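The new native wrapper simply forwards to Tensor::sym_is_contiguous. For ordinary eager tensors the observable behavior matches the usual contiguity query under a memory format; the SymBool return type only matters under symbolic shapes, where the answer may be undecided rather than a concrete bool. A plain-eager illustration:

import torch

t = torch.randn(2, 3, 4, 5)
assert t.is_contiguous(memory_format=torch.contiguous_format)
assert not t.is_contiguous(memory_format=torch.channels_last)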


@ -148,6 +148,56 @@ namespace fe = cudnn_frontend;
#define MAX_MHA_DIM 4
// Whether we will use ragged offsets in the dense (non-nested) path
// to avoid recompilation
bool use_ragged_in_dense(
const Tensor& q,
const Tensor& k,
const Tensor& v,
const Tensor& o,
bool has_bias) {
static bool flag =
c10::utils::check_env("TORCH_CUDNN_SDPA_AVOID_RECOMPILE") == true;
if (!flag) {
return flag;
}
TORCH_WARN_ONCE(
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 is currently experimental. "
"Please report any issues to https://github.com/pytorch/pytorch/issues.");
if (has_bias) {
TORCH_WARN_ONCE(
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works without bias."
"Consider using the is_causal hint instead of bias for causal masking."
"Falling back to regular dense case, which may trigger excessive recompilation.");
return !has_bias;
}
bool all_bshd = q.dim() == 4 && q.transpose(1, 2).is_contiguous() &&
k.dim() == 4 && k.transpose(1, 2).is_contiguous() && v.dim() == 4 &&
v.transpose(1, 2).is_contiguous() && o.dim() == 4 &&
o.transpose(1, 2).is_contiguous();
if (!all_bshd) {
TORCH_WARN_ONCE(
"TORCH_CUDNN_SDPA_AVOID_RECOMPILE=1 only works with Q, K, V, and output in BSHD memory layout,"
"e.g., Q, K, V must be allocated with torch.randn((B, S, H, D).transpose(1, 2)."
"Falling back to regualr dense case, which may trigger excessive recompilation.");
}
return all_bshd;
}
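The all_bshd test above requires Q, K, V, and the output to be 4-D with BSHD-contiguous storage viewed as BHSD. A small sketch of tensors that pass and fail it (shapes illustrative):

import torch

B, H, S, D = 2, 4, 128, 64
q = torch.randn(B, S, H, D).transpose(1, 2)    # BSHD storage, BHSD view
assert q.dim() == 4 and q.transpose(1, 2).is_contiguous()  # passes all_bshd

p = torch.randn(B, H, S, D)                    # BHSD-contiguous allocation
assert not p.transpose(1, 2).is_contiguous()   # fails the all_bshd check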
// Round dim up to the next power of two (0 maps to 1) by smearing the
// highest set bit into all lower bits, then incrementing.
int roundup_power2(int dim) {
if (!dim) {
return 1;
}
dim--;
dim |= dim >> 1;
dim |= dim >> 2;
dim |= dim >> 4;
dim |= dim >> 8;
dim |= dim >> 16;
dim++;
return dim;
}
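A quick Python cross-check of the bit-smearing loop above (same contract: 0 maps to 1, otherwise round up to the next power of two):

def roundup_power2(n: int) -> int:
    return 1 if n == 0 else 1 << (n - 1).bit_length()

assert [roundup_power2(n) for n in (0, 1, 5, 64, 65)] == [1, 1, 8, 64, 128]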
struct MHAParams {
c10::DeviceIndex device_id;
fe::DataType_t dataType;
@ -171,6 +221,7 @@ struct MHAParams {
// might be redundant if we take 0 dim/stride
// as signaling no-bias
bool has_attn_bias;
bool use_ragged;
};
void setMHAParams(
@ -228,6 +279,20 @@ void setMHAParams(
std::copy(k.strides().begin(), k.strides().end(), params.k_stride.begin());
std::copy(v.sizes().begin(), v.sizes().end(), params.v_dim.begin());
std::copy(v.strides().begin(), v.strides().end(), params.v_stride.begin());
bool use_ragged = use_ragged_in_dense(q, k, v, q, params.has_attn_bias);
params.use_ragged = use_ragged;
if (use_ragged) {
// ignore the batch (B) stride in the BSHD (THD) avoid-recompile path
params.q_stride[0] = INT_MAX;
params.k_stride[0] = INT_MAX;
params.v_stride[0] = INT_MAX;
// fix seqlen to rounded value
params.s_q = roundup_power2(params.s_q);
params.s_kv = roundup_power2(params.s_kv);
params.q_dim[2] = roundup_power2(params.q_dim[2]);
params.k_dim[2] = roundup_power2(params.k_dim[2]);
params.v_dim[2] = roundup_power2(params.v_dim[2]);
}
// uninit is OK as the struct is memset 0'd
if (params.has_attn_bias) {
std::copy(
@ -277,15 +342,29 @@ struct MHACacheKeyWrapper : ParamsWrapper<MHAParams> {
template <typename T, typename KeyType>
struct MHAGraphCache {
std::unordered_map<KeyType, T, ParamsWrapperHash<KeyType>> engine_cache;
int count = 0;
int hits = 0;
// no mutexes here as caches are now thread local for v8, can also return a
// pointer to the Execution Plan if we know it will not be invalidated by
// another thread
T* find(const KeyType& key) {
static bool flag =
c10::utils::check_env("TORCH_CUDNN_SDPA_CACHE_DEBUG") == true;
if (flag && count) {
TORCH_WARN(
"SDPA Cache Called ",
count,
" times. Hit rate: ",
100 * hits / count,
"%");
}
count++;
auto it = engine_cache.find(key);
if (it == engine_cache.end()) {
return nullptr;
}
hits++;
return &(it->second);
}
@ -402,6 +481,25 @@ auto build_graph(
.set_is_inference(return_softmaxstats == false)
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
auto SEQ_LEN_Q_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEQ_LEN_Q)
.set_name("Seq_q")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto SEQ_LEN_KV_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEQ_LEN_KV)
.set_name("Seq_kv")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
scaled_dot_product_flash_attention_options.set_seq_len_q(SEQ_LEN_Q_)
.set_seq_len_kv(SEQ_LEN_KV_)
.set_padding_mask(true);
}
if (dropout_probability != 0.0f) {
auto seed = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEED)
@ -425,23 +523,11 @@ auto build_graph(
dropout_probability, seed, offset);
}
auto Q_ = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_uid(Q)
.set_name("Q")
.set_dim(q.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec())));
fe::graph::Tensor_attributes().set_uid(Q).set_name("Q"));
auto K_ = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_uid(K)
.set_name("K")
.set_dim(k.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec())));
fe::graph::Tensor_attributes().set_uid(K).set_name("K"));
auto V_ = mha_graph->tensor(
fe::graph::Tensor_attributes()
.set_uid(V)
.set_name("V")
.set_dim(v.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec())));
fe::graph::Tensor_attributes().set_uid(V).set_name("V"));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
bias =
@ -455,12 +541,90 @@ auto build_graph(
auto [O_, Stats] =
mha_graph->sdpa(Q_, K_, V_, scaled_dot_product_flash_attention_options);
O_->set_uid(O);
O_->set_output(true).set_dim(o.sizes().vec()).set_stride(o.strides().vec());
O_->set_uid(O).set_output(true);
if (Stats) {
Stats->set_uid(LSE);
Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT);
Stats->set_uid(LSE)
.set_output(true)
.set_data_type(fe::DataType_t::FLOAT)
.set_stride(softmaxstats.strides().vec());
}
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
auto RAG_Q_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_Q_OFF)
.set_name("cum_seq_q")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_K_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_K_OFF)
.set_name("cum_seq_k")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_V_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_V_OFF)
.set_name("cum_seq_v")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_O_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_O_OFF)
.set_name("cum_seq_o")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_STATS_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_LSE_OFF)
.set_name("cum_seq_stats")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
O_->set_ragged_offset(RAG_O_OFF_);
Q_->set_ragged_offset(RAG_Q_OFF_);
K_->set_ragged_offset(RAG_K_OFF_);
V_->set_ragged_offset(RAG_V_OFF_);
auto qsizevec = q.sizes().vec();
auto ksizevec = k.sizes().vec();
auto vsizevec = v.sizes().vec();
auto osizevec = o.sizes().vec();
qsizevec[2] = roundup_power2(qsizevec[2]);
ksizevec[2] = roundup_power2(ksizevec[2]);
vsizevec[2] = roundup_power2(vsizevec[2]);
osizevec[2] = roundup_power2(osizevec[2]);
// We already checked for BSHD contiguity; set fake strides here because
// cuDNN complains if, e.g., a ragged dim's stride is smaller than a
// non-ragged one's: consider an HBSD tensor where H is 1.
Q_->set_dim(qsizevec).set_stride(
{INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1});
K_->set_dim(ksizevec).set_stride(
{INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1});
V_->set_dim(vsizevec).set_stride(
{INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1});
O_->set_dim(osizevec).set_stride(
{INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1});
if (Stats) {
Stats->set_ragged_offset(RAG_STATS_OFF_);
auto statssizevec = softmaxstats.sizes().vec();
statssizevec[2] = roundup_power2(statssizevec[2]);
Stats->set_dim(statssizevec);
}
} else {
Q_->set_dim(q.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(q.sizes(), q.strides().vec()));
K_->set_dim(k.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(k.sizes(), k.strides().vec()));
V_->set_dim(v.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(v.sizes(), v.strides().vec()));
O_->set_dim(o.sizes().vec())
.set_stride(fixSizeOneDimStrideSDPA(o.sizes(), o.strides().vec()));
if (Stats) {
Stats->set_dim(softmaxstats.sizes().vec());
}
}
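The hard-coded stride tuples in the ragged branch encode a BHSD view over BSHD-contiguous storage, with the batch stride replaced by INT_MAX once the ragged offsets take over batch indexing. A quick check of the non-batch entries (shapes illustrative):

import torch

B, S, H, D = 2, 16, 4, 8
q = torch.randn(B, S, H, D).transpose(1, 2)    # (B, H, S, D) view
assert q.stride() == (S * H * D, D, H * D, 1)  # matches {_, D, H*D, 1} above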
AT_CUDNN_FRONTEND_CHECK(mha_graph->validate());
@ -566,7 +730,7 @@ auto build_graph_nestedtensor(
auto q_strides = q.strides();
auto k_strides = k.strides();
auto v_strides = v.strides();
// NB: cuDNN API shape is transposed
// NB: cuDNN API shape is transposed: we pass it nominally as HTD
constexpr int strideidx0 = 1;
constexpr int strideidx1 = 0;
constexpr int strideidx2 = 2;
@ -724,21 +888,32 @@ auto build_graph_backward(
.set_name("CUDNN_SDPA_BACKWARD")
.set_causal_mask(is_causal)
.set_attn_scale(attn_scale);
auto Q_ = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(Q)
.set_name("Q")
.set_dim(q.sizes().vec())
.set_stride(q.strides().vec()));
auto K_ = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(K)
.set_name("K")
.set_dim(k.sizes().vec())
.set_stride(k.strides().vec()));
auto V_ = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(V)
.set_name("V")
.set_dim(v.sizes().vec())
.set_stride(v.strides().vec()));
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
auto SEQ_LEN_Q_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEQ_LEN_Q)
.set_name("Seq_q")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto SEQ_LEN_KV_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(SEQ_LEN_KV)
.set_name("Seq_kv")
.set_dim({b, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
sdpa_backward_options.set_seq_len_q(SEQ_LEN_Q_)
.set_seq_len_kv(SEQ_LEN_KV_)
.set_padding_mask(true);
}
auto Q_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(Q).set_name("Q"));
auto K_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(K).set_name("K"));
auto V_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(V).set_name("V"));
std::optional<std::shared_ptr<fe::graph::Tensor_attributes>> bias;
if (attn_bias.has_value()) {
bias =
@ -770,31 +945,108 @@ auto build_graph_backward(
: fe::DataType_t::INT64));
sdpa_backward_options.set_dropout(dropout_probability, seed, offset);
}
auto O_ = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(O)
.set_name("O")
.set_dim(o.sizes().vec())
.set_stride(o.strides().vec()));
auto O_ = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(O).set_name("O"));
auto Stats = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(LSE)
.set_name("Stats")
.set_dim(softmaxstats.sizes().vec())
.set_stride(softmaxstats.strides().vec())
.set_data_type(fe::DataType_t::FLOAT));
auto Do = mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(DO)
.set_name("DO")
.set_dim(dO.sizes().vec())
.set_stride(dO.strides().vec()));
auto Do = mha_graph->tensor(
fe::graph::Tensor_attributes().set_uid(DO).set_name("DO"));
auto [Dq, Dk, Dv] = mha_graph->sdpa_backward(
Q_, K_, V_, O_, Do, Stats, sdpa_backward_options);
Dq->set_uid(DQ);
Dq->set_output(true).set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec());
Dk->set_uid(DK);
Dk->set_output(true).set_dim(dK.sizes().vec()).set_stride(dK.strides().vec());
Dv->set_uid(DV);
Dv->set_output(true).set_dim(dV.sizes().vec()).set_stride(dV.strides().vec());
Dq->set_uid(DQ).set_output(true);
Dk->set_uid(DK).set_output(true);
Dv->set_uid(DV).set_output(true);
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
auto RAG_Q_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_Q_OFF)
.set_name("cum_seq_q")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_K_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_K_OFF)
.set_name("cum_seq_k")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_V_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_V_OFF)
.set_name("cum_seq_v")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_O_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_O_OFF)
.set_name("cum_seq_o")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
auto RAG_STATS_OFF_ =
mha_graph->tensor(fe::graph::Tensor_attributes()
.set_uid(RAG_LSE_OFF)
.set_name("cum_seq_stats")
.set_dim({b + 1, 1, 1, 1})
.set_stride({1, 1, 1, 1})
.set_data_type(fe::DataType_t::INT32));
O_->set_ragged_offset(RAG_O_OFF_);
Q_->set_ragged_offset(RAG_Q_OFF_);
K_->set_ragged_offset(RAG_K_OFF_);
V_->set_ragged_offset(RAG_V_OFF_);
Dq->set_ragged_offset(RAG_Q_OFF_);
Dk->set_ragged_offset(RAG_K_OFF_);
Dv->set_ragged_offset(RAG_V_OFF_);
Do->set_ragged_offset(RAG_O_OFF_);
auto qsizevec = q.sizes().vec();
auto ksizevec = k.sizes().vec();
auto vsizevec = v.sizes().vec();
auto osizevec = o.sizes().vec();
qsizevec[2] = roundup_power2(qsizevec[2]);
ksizevec[2] = roundup_power2(ksizevec[2]);
vsizevec[2] = roundup_power2(vsizevec[2]);
osizevec[2] = roundup_power2(osizevec[2]);
// see corresponding section in the forward about the hardcoding
// of strides here
Q_->set_dim(qsizevec).set_stride(
{INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1});
K_->set_dim(ksizevec).set_stride(
{INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1});
V_->set_dim(vsizevec).set_stride(
{INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1});
O_->set_dim(osizevec).set_stride(
{INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1});
// strides should be identical to their forward (non-gradient) counterparts
Dq->set_dim(qsizevec).set_stride(
{INT_MAX, qsizevec[3], qsizevec[1] * qsizevec[3], 1});
Dk->set_dim(ksizevec).set_stride(
{INT_MAX, ksizevec[3], ksizevec[1] * ksizevec[3], 1});
Dv->set_dim(vsizevec).set_stride(
{INT_MAX, vsizevec[3], vsizevec[1] * vsizevec[3], 1});
Do->set_dim(osizevec).set_stride(
{INT_MAX, osizevec[3], osizevec[1] * osizevec[3], 1});
Stats->set_ragged_offset(RAG_STATS_OFF_);
auto statssizevec = softmaxstats.sizes().vec();
statssizevec[2] = roundup_power2(statssizevec[2]);
Stats->set_dim(statssizevec);
} else {
O_->set_dim(o.sizes().vec()).set_stride(o.strides().vec());
Q_->set_dim(q.sizes().vec()).set_stride(q.strides().vec());
K_->set_dim(k.sizes().vec()).set_stride(k.strides().vec());
V_->set_dim(v.sizes().vec()).set_stride(v.strides().vec());
Dq->set_dim(dQ.sizes().vec()).set_stride(dQ.strides().vec());
Dk->set_dim(dK.sizes().vec()).set_stride(dK.strides().vec());
Dv->set_dim(dV.sizes().vec()).set_stride(dV.strides().vec());
Do->set_dim(dO.sizes().vec()).set_stride(dO.strides().vec());
Stats->set_dim(softmaxstats.sizes().vec());
}
AT_CUDNN_FRONTEND_CHECK(mha_graph->validate());
AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle));
AT_CUDNN_FRONTEND_CHECK(
@ -1066,6 +1318,47 @@ void run_cudnn_SDP_fprop(
Tensor& o,
Tensor& dropoutseed,
Tensor& dropoutoffset) {
// do nothing if we got 0-element tensors
if (!q.numel() || !k.numel() || !v.numel()) {
return;
}
Tensor seqlen_q, seqlen_kv;
Tensor rag_off_q, rag_off_k, rag_off_v, rag_off_o, rag_off_lse;
if (!o.defined()) {
// q is passed to us in BHSD dim order
alloc_with_matching_layout(q, o, {b, h, s_q, d_v});
}
bool use_ragged = use_ragged_in_dense(q, k, v, o, attn_bias.has_value());
if (return_softmaxstats && !softmaxstats.defined()) {
// TODO(eqy): investigate why cuDNN doesn't like BSH layout softmaxstats
if (!use_ragged) {
softmaxstats = at::empty({b, h, s_q, 1}, q.options().dtype(kFloat));
} else {
softmaxstats =
at::empty({b, s_q, h, 1}, q.options().dtype(kFloat)).transpose(1, 2);
}
}
if (use_ragged) {
seqlen_q = at::full({b, 1, 1, 1}, s_q, q.options().dtype(kInt));
seqlen_kv = at::full({b, 1, 1, 1}, s_kv, q.options().dtype(kInt));
auto cum_seqlen_q = at::full({b + 1, 1, 1, 1}, s_q, q.options().dtype(kInt))
.cumsum(0, kInt)
.add_(-s_q);
auto cum_seqlen_kv =
at::full({b + 1, 1, 1, 1}, s_kv, q.options().dtype(kInt))
.cumsum(0, kInt)
.add_(-s_kv);
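// cum_seqlen_* are exclusive prefix sums ([0, s, 2s, ...]): full() writes s
// into b + 1 slots, cumsum() yields [s, 2s, ..., (b + 1)s], and add_(-s)
// shifts the sequence back. Multiplying by the sequence stride below turns
// each batch's starting row into an element offset for the ragged layout.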
rag_off_q = cum_seqlen_q.mul(q.stride(-2));
rag_off_k = cum_seqlen_kv.mul(k.stride(-2));
rag_off_v = cum_seqlen_kv.mul(v.stride(-2));
rag_off_o = cum_seqlen_q.mul(o.stride(-2));
if (return_softmaxstats) {
rag_off_lse = cum_seqlen_q.mul(softmaxstats.stride(-2));
}
}
const auto dprops = at::cuda::getCurrentDeviceProperties();
auto _dropoutseed = dropoutseed;
auto _dropoutoffset = dropoutoffset;
@ -1076,21 +1369,10 @@ void run_cudnn_SDP_fprop(
}
cudnnHandle_t handle = getCudnnHandle();
if (!o.defined()) {
// q is passed to us in BHSD dim order
alloc_with_matching_layout(q, o, {b, h, s_q, d_v});
}
if (return_softmaxstats && !softmaxstats.defined()) {
// TODO(eqy): verify that this is correct
softmaxstats = at::empty({b, h, s_q}, q.options().dtype(kFloat));
}
// do nothing if we got 0-element tensors
if (!q.numel() || !k.numel() || !v.numel()) {
return;
}
// NB: The key initialization will round up sequence length, stride data etc.
// if use_ragged_in_dense is enabled (to allow multiple sequence lengths to
// reuse the same cached value/graph)
auto key = MHACacheKeyWrapper(
b,
h,
@ -1147,6 +1429,17 @@ void run_cudnn_SDP_fprop(
variant_pack[SEED] = _dropoutseed.data_ptr();
variant_pack[OFFSET] = _dropoutoffset.data_ptr();
}
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
variant_pack[SEQ_LEN_Q] = seqlen_q.data_ptr();
variant_pack[SEQ_LEN_KV] = seqlen_kv.data_ptr();
variant_pack[RAG_Q_OFF] = rag_off_q.data_ptr();
variant_pack[RAG_K_OFF] = rag_off_k.data_ptr();
variant_pack[RAG_V_OFF] = rag_off_v.data_ptr();
variant_pack[RAG_O_OFF] = rag_off_o.data_ptr();
if (return_softmaxstats) {
variant_pack[RAG_LSE_OFF] = rag_off_lse.data_ptr();
}
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);
@ -1278,6 +1571,9 @@ void run_cudnn_SDP_bprop(
!softmaxstats.numel()) {
return;
}
Tensor seqlen_q, seqlen_kv;
Tensor rag_off_q, rag_off_k, rag_off_v, rag_off_o, rag_off_lse;
auto dprops = at::cuda::getCurrentDeviceProperties();
auto _dropoutseed = dropoutseed;
auto _dropoutoffset = dropoutoffset;
@ -1304,10 +1600,28 @@ void run_cudnn_SDP_bprop(
"with matching strides...");
#else
const auto innermost_dO_stride = dO.strides()[dO.strides().size() - 1];
if (innermost_dO_stride != 1) {
if (innermost_dO_stride != 1 ||
use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
permute_to_matching_layout(o, dO_);
}
#endif
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
seqlen_q = at::full({b, 1, 1, 1}, s_q, q.options().dtype(kInt));
seqlen_kv = at::full({b, 1, 1, 1}, s_kv, q.options().dtype(kInt));
auto cum_seqlen_q = at::full({b + 1, 1, 1, 1}, s_q, q.options().dtype(kInt))
.cumsum(0, kInt)
.add_(-s_q);
auto cum_seqlen_kv =
at::full({b + 1, 1, 1, 1}, s_kv, q.options().dtype(kInt))
.cumsum(0, kInt)
.add_(-s_kv);
rag_off_q = cum_seqlen_q.mul(q.stride(-2));
rag_off_k = cum_seqlen_kv.mul(k.stride(-2));
rag_off_v = cum_seqlen_kv.mul(v.stride(-2));
rag_off_o = cum_seqlen_q.mul(o.stride(-2));
rag_off_lse = cum_seqlen_q.mul(softmaxstats.stride(-2));
}
cudnnHandle_t handle = getCudnnHandle();
auto key = MHACacheKeyWrapper(
b,
@ -1372,6 +1686,16 @@ void run_cudnn_SDP_bprop(
if (attn_bias.has_value()) {
variant_pack[BIAS] = attn_bias.value().data_ptr();
}
if (use_ragged_in_dense(q, k, v, o, attn_bias.has_value())) {
variant_pack[SEQ_LEN_Q] = seqlen_q.data_ptr();
variant_pack[SEQ_LEN_KV] = seqlen_kv.data_ptr();
variant_pack[RAG_Q_OFF] = rag_off_q.data_ptr();
variant_pack[RAG_K_OFF] = rag_off_k.data_ptr();
variant_pack[RAG_V_OFF] = rag_off_v.data_ptr();
variant_pack[RAG_O_OFF] = rag_off_o.data_ptr();
variant_pack[RAG_LSE_OFF] = rag_off_lse.data_ptr();
}
auto workspace_size = mha_graph->get_workspace_size();
auto workspace_ptr =
c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size);

View File

@ -0,0 +1,25 @@
#pragma once
#include <c10/metal/common.h>
#ifdef __METAL__
enum class GridSamplerInterpolation { Bilinear, Nearest, Bicubic };
enum class GridSamplerPadding { Zeros, Border, Reflection };
#else
#include <ATen/native/GridSamplerUtils.h>
using at::native::GridSamplerInterpolation;
using at::native::GridSamplerPadding;
#endif
template <unsigned N = 5, typename idx_type_t = int32_t>
struct GridSamplerParams {
int32_t sampler_dims;
::c10::metal::array<idx_type_t, N> output_sizes;
::c10::metal::array<idx_type_t, N> output_strides;
::c10::metal::array<idx_type_t, N> input_sizes;
::c10::metal::array<idx_type_t, N> input_strides;
::c10::metal::array<idx_type_t, N> grid_sizes;
::c10::metal::array<idx_type_t, N> grid_strides;
GridSamplerInterpolation interpolation_mode;
GridSamplerPadding padding_mode;
bool align_corners;
};

View File

@ -0,0 +1,329 @@
#include <ATen/native/mps/kernels/GridSampler.h>
#include <c10/metal/utils.h>
#include <metal_array>
#include <metal_stdlib>
using namespace metal;
using namespace c10::metal;
struct GridSamplerOffsets {
int32_t output;
int32_t input;
int32_t grid;
GridSamplerOffsets() : output(0), input(0), grid(0) {}
};
// Find offsets into the tensors that this thread will operate on,
// based on the thread ID.
static GridSamplerOffsets find_grid_sampler_offsets(
constant int32_t* output_sizes,
constant int32_t* output_strides,
constant int32_t* input_sizes,
constant int32_t* input_strides,
constant int32_t* grid_sizes,
constant int32_t* grid_strides,
int32_t sampler_dims,
uint tid) {
auto dims = sampler_dims + 2;
auto output_idx = static_cast<int32_t>(tid);
GridSamplerOffsets offsets;
for (auto dim = dims - 1; dim >= 0; dim--) {
auto dim_idx = output_idx % output_sizes[dim];
output_idx = output_idx / output_sizes[dim];
// Select the output element that this thread will calculate.
// output shape:
// 2 sampler dims: (N, C, Hout, Wout)
// 3 sampler dims: (N, C, Dout, Hout, Wout)
offsets.output += output_strides[dim] * dim_idx;
// Select the batch and channel for the input.
// input shape:
// 2 sampler dims: (N, C, Hin, Win)
// 3 sampler dims: (N, C, Din, Hin, Win)
if (dim < 2) {
offsets.input += input_strides[dim] * dim_idx;
}
// Select the grid coordinates for the output element.
// grid shape:
// 2 sampler dims: (N, Hout, Wout, 2)
// 3 sampler dims: (N, Dout, Hout, Wout, 3)
if (dim == 0) {
offsets.grid += grid_strides[dim] * dim_idx;
} else if (dim >= 2) {
offsets.grid += grid_strides[dim - 1] * dim_idx;
}
}
return offsets;
}
// Mod function which gives positive output when `a` is negative
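// e.g. mod(-3, 5) == 2, whereas -3 % 5 == -3 under C-style truncation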
static int32_t mod(int32_t a, int32_t b) {
auto r = a % b;
return r + (r < 0 ? b : 0);
}
// Sentinel index value to indicate zero padding
constant int32_t IDX_ZERO = -1;
// Apply padding to an index into the input
static int32_t pad_input_index(
int32_t idx,
int32_t input_size,
GridSamplerPadding padding_mode,
bool align_corners) {
int32_t idx_padded = idx;
if (padding_mode == GridSamplerPadding::Zeros) {
idx_padded = (idx < 0) ? IDX_ZERO : idx_padded;
idx_padded = (idx >= input_size) ? IDX_ZERO : idx_padded;
} else if (padding_mode == GridSamplerPadding::Border) {
idx_padded = (idx < 0) ? 0 : idx_padded;
idx_padded = (idx >= input_size) ? input_size - 1 : idx_padded;
} else if (padding_mode == GridSamplerPadding::Reflection) {
auto scale_length = align_corners ? (input_size - 1) : input_size;
auto idx_mod = mod(idx, scale_length);
auto idx_mod_reverse = (input_size - 1) - idx_mod;
bool is_reverse = (abs(idx - idx_mod) / scale_length) % 2 == 1;
idx_padded = is_reverse ? idx_mod_reverse : idx_mod;
}
return idx_padded;
}
template <int32_t dims, typename T>
T get_tensor_val(
constant T* input,
constant int32_t* input_strides,
int32_t indices[dims]) {
bool found_idx_zero = false;
int32_t offset = 0;
for (auto dim = 0; dim < dims; dim++) {
auto idx = indices[dim];
found_idx_zero = found_idx_zero || (idx == IDX_ZERO);
offset += (found_idx_zero ? 0 : idx) * input_strides[dim];
}
return found_idx_zero ? 0 : input[offset];
}
// This function performs 3D linear interpolation for one value. One way to
// think of how this works is to imagine a unit cube where each corner of the
// cube has one scalar value associated with it. Inside the cube, the values
// change linearly, so the gradient is constant. The values associated with each
// corner are given by the `input`, indexed at all eight different combinations
// of the `left_indices` and `right_indices`. Given a 3D coordinate anywhere
// within the cube, specified by the `scales` argument, we must calculate the
// value associated with that position.
template <typename T>
T interpolate_linear_3d(
constant T* input,
constant int32_t* input_strides,
int32_t left_indices[3],
int32_t right_indices[3],
opmath_t<T> scales[3]) {
int32_t a_idx[3] = {left_indices[0], left_indices[1], left_indices[2]};
int32_t b_idx[3] = {left_indices[0], left_indices[1], right_indices[2]};
int32_t c_idx[3] = {left_indices[0], right_indices[1], left_indices[2]};
int32_t d_idx[3] = {left_indices[0], right_indices[1], right_indices[2]};
int32_t e_idx[3] = {right_indices[0], left_indices[1], left_indices[2]};
int32_t f_idx[3] = {right_indices[0], left_indices[1], right_indices[2]};
int32_t g_idx[3] = {right_indices[0], right_indices[1], left_indices[2]};
int32_t h_idx[3] = {right_indices[0], right_indices[1], right_indices[2]};
auto a =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, a_idx));
auto b =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, b_idx));
auto c =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, c_idx));
auto d =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, d_idx));
auto e =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, e_idx));
auto f =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, f_idx));
auto g =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, g_idx));
auto h =
static_cast<opmath_t<T>>(get_tensor_val<3>(input, input_strides, h_idx));
auto scale0_right = scales[0];
auto scale1_right = scales[1];
auto scale2_right = scales[2];
auto scale0_left = 1 - scale0_right;
auto scale1_left = 1 - scale1_right;
auto scale2_left = 1 - scale2_right;
return static_cast<T>(
scale0_left * scale1_left * scale2_left * a +
scale0_left * scale1_left * scale2_right * b +
scale0_left * scale1_right * scale2_left * c +
scale0_left * scale1_right * scale2_right * d +
scale0_right * scale1_left * scale2_left * e +
scale0_right * scale1_left * scale2_right * f +
scale0_right * scale1_right * scale2_left * g +
scale0_right * scale1_right * scale2_right * h);
}
// Calculates a single output element.
// `input` shape:
// 2 sampler dims: (Hin, Win)
// 3 sampler dims: (Din, Hin, Win)
// `coords` values:
// 2 sampler dims: (Wcoord, Hcoord)
// 3 sampler dims: (Wcoord, Hcoord, Dcoord)
template <typename T>
void grid_sampler_single_element(
device T* output,
constant T* input,
constant T* coords,
int32_t dims,
constant int32_t* input_sizes,
constant int32_t* input_strides,
GridSamplerInterpolation interpolation_mode,
GridSamplerPadding padding_mode,
bool align_corners) {
int32_t left_indices[3];
int32_t right_indices[3];
opmath_t<T> scales[3];
// For each dimension, find the pair of indices in the corresponding dimension
// of `input` which surround the grid coordinate in that dimension. We'll do
// this by mapping different coordinate spaces onto each other. There are
// basically three different coordinate spaces to keep in mind:
//
// * aligned grid space
// - `-1` refers to the leftmost input value.
// - `1` refers to the rightmost input value.
//
// * unaligned grid space
// - `-1` refers to the midpoint between the leftmost input value and
// a padding value to the left of that.
// - `1` refers to the midpoint between the rightmost input value and
// a padding value to the right of that.
//
// * input index space
// - `n` refers to the n-th value of the input.
// - `0` refers to the leftmost input value.
// - `N-1` refers to the rightmost input value.
//
// If `align_corners == False`, then the coordinates are in unaligned grid
// space, and we will map them onto aligned grid space. If `align_corners ==
// True`, then the coordinates are already in aligned grid space.
//
// Then we will map unaligned grid space onto input index space, making it
// relatively simple to find the two input indices that surround the
// coordinate.
for (auto coord_dim = 0; coord_dim < dims; coord_dim++) {
auto input_dim = dims - coord_dim - 1;
auto input_size = input_sizes[input_dim];
auto coord = static_cast<opmath_t<T>>(coords[coord_dim]);
// Interpret nan as -1
coord = isnan(coord) ? -1 : coord;
if (!align_corners) {
// Map unaligned grid space to aligned grid space
auto corner_alignment_factor = static_cast<opmath_t<T>>(input_size) /
static_cast<opmath_t<T>>(input_size - 1);
coord = coord * corner_alignment_factor;
}
// Map aligned grid space to input index space
coord = (coord + 1) * (static_cast<opmath_t<T>>(input_size - 1) / 2);
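// Note: for align_corners == false, the two steps compose to the standard
// unaligned mapping: coord * N/(N-1) followed by (coord + 1) * (N-1)/2
// equals ((coord + 1) * N - 1) / 2, where N is the input size.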
// Get the input indices surrounding the coordinate, apply padding to them,
// and obtain the scaling factor between the two for interpolation.
auto left_idx = static_cast<int32_t>(floor(coord));
auto right_idx = static_cast<int32_t>(ceil(coord));
left_indices[input_dim] =
pad_input_index(left_idx, input_size, padding_mode, align_corners);
right_indices[input_dim] =
pad_input_index(right_idx, input_size, padding_mode, align_corners);
auto scale = coord - left_idx;
if (interpolation_mode == GridSamplerInterpolation::Nearest) {
// TODO: For some reason, rounding the scale to 0 or 1 and then using
// linear interpolation seems to work perfectly with zero padding mode,
// but we get flaky failures with border and reflection padding modes.
// Need to investigate and fix it.
scale = (scale <= 0.5) ? 0 : 1;
}
scales[input_dim] = scale;
}
// Now that we have the bounding indices and scale factor for each dimension
// of the input, we can interpolate.
if (dims == 3) {
*output = interpolate_linear_3d(
input, input_strides, left_indices, right_indices, scales);
}
}
template <typename T>
kernel void grid_sampler(
device T* output [[buffer(0)]],
constant T* input [[buffer(1)]],
constant T* grid [[buffer(2)]],
constant GridSamplerParams<5>& params [[buffer(3)]],
uint tid [[thread_position_in_grid]]) {
auto output_sizes = params.output_sizes.data();
auto output_strides = params.output_strides.data();
auto input_sizes = params.input_sizes.data();
auto input_strides = params.input_strides.data();
auto grid_sizes = params.grid_sizes.data();
auto grid_strides = params.grid_strides.data();
auto sampler_dims = params.sampler_dims;
auto offsets = find_grid_sampler_offsets(
output_sizes,
output_strides,
input_sizes,
input_strides,
grid_sizes,
grid_strides,
sampler_dims,
tid);
output += offsets.output;
input += offsets.input;
auto coords = grid + offsets.grid;
input_sizes += 2;
input_strides += 2;
auto interpolation_mode = params.interpolation_mode;
auto padding_mode = params.padding_mode;
auto align_corners = params.align_corners;
grid_sampler_single_element(
output,
input,
coords,
sampler_dims,
input_sizes,
input_strides,
interpolation_mode,
padding_mode,
align_corners);
}
#define REGISTER_GRID_SAMPLER_OP(DTYPE) \
template [[host_name("grid_sampler_" #DTYPE)]] \
kernel void grid_sampler<DTYPE>( \
device DTYPE * output [[buffer(0)]], \
constant DTYPE * input [[buffer(1)]], \
constant DTYPE * grid [[buffer(2)]], \
constant GridSamplerParams<5> & params [[buffer(3)]], \
uint tid [[thread_position_in_grid]]);
REGISTER_GRID_SAMPLER_OP(float);
REGISTER_GRID_SAMPLER_OP(half);
REGISTER_GRID_SAMPLER_OP(bfloat);

View File

@ -1,7 +1,10 @@
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
#include <ATen/mps/MPSProfiler.h>
#include <ATen/native/GridSamplerUtils.h>
#include <ATen/native/Pool.h>
#include <ATen/native/mps/MPSGraphVenturaOps.h>
#include <ATen/native/mps/OperationUtils.h>
#include <ATen/native/mps/kernels/GridSampler.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -9,9 +12,17 @@
#else
#include <ATen/ops/grid_sampler_2d.h>
#include <ATen/ops/grid_sampler_2d_native.h>
#include <ATen/ops/grid_sampler_3d_native.h>
#endif
namespace at::native {
#ifndef PYTORCH_JIT_COMPILE_SHADERS
static auto& lib = mps::MetalShaderLibrary::getBundledLibrary();
#else
#include <ATen/native/mps/GridSampler_metallib.h>
#endif
namespace mps {
static void grid_sampler_2d_mps_impl(Tensor& output,
const Tensor& input,
@ -120,6 +131,96 @@ static void grid_sampler_2d_mps_impl(Tensor& output,
runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder);
}
}
static void grid_sampler_template(Tensor& output,
const Tensor& input,
const Tensor& grid,
int64_t _interpolation_mode,
int64_t _padding_mode,
bool align_corners,
int32_t sampler_dims,
const std::string& op_name) {
check_grid_sampler_common(input, grid);
switch (sampler_dims) {
case 2:
check_grid_sampler_2d(input, grid);
break;
case 3:
check_grid_sampler_3d(input, grid, _interpolation_mode);
break;
default:
TORCH_INTERNAL_ASSERT(false, "Only 2D and 3D sampling are supported, but got: ", sampler_dims);
}
TORCH_CHECK(input.scalar_type() == grid.scalar_type(),
"expected input and grid to have the same type, but got ",
input.scalar_type(),
" and ",
grid.scalar_type());
auto interpolation_mode = static_cast<GridSamplerInterpolation>(_interpolation_mode);
auto padding_mode = static_cast<GridSamplerPadding>(_padding_mode);
switch (interpolation_mode) {
case GridSamplerInterpolation::Bilinear:
break;
case GridSamplerInterpolation::Nearest:
TORCH_CHECK(false, op_name, ": Unsupported Nearest interpolation");
break;
case GridSamplerInterpolation::Bicubic:
TORCH_CHECK(false, op_name, ": Unsupported Bicubic interpolation");
break;
default:
TORCH_CHECK(false, op_name, ": Unrecognised interpolation mode: ", _interpolation_mode);
}
switch (padding_mode) {
case GridSamplerPadding::Zeros:
case GridSamplerPadding::Border:
case GridSamplerPadding::Reflection:
break;
default:
TORCH_CHECK(false, op_name, ": Unrecognised Padding Mode: ", _padding_mode);
}
auto input_size = input.sizes();
auto grid_size = grid.sizes();
output.resize_({input_size[0], input_size[1], grid_size[1], grid_size[2], grid_size[3]}, MemoryFormat::Contiguous);
auto dims = input.dim();
GridSamplerParams<5> params;
params.sampler_dims = sampler_dims;
params.padding_mode = padding_mode;
params.interpolation_mode = interpolation_mode;
params.align_corners = align_corners;
for (const auto dim : c10::irange(dims)) {
params.output_sizes[dim] = safe_downcast<int32_t, int64_t>(output.size(dim));
params.output_strides[dim] = safe_downcast<int32_t, int64_t>(output.stride(dim));
params.input_sizes[dim] = safe_downcast<int32_t, int64_t>(input.size(dim));
params.input_strides[dim] = safe_downcast<int32_t, int64_t>(input.stride(dim));
params.grid_sizes[dim] = safe_downcast<int32_t, int64_t>(grid.size(dim));
params.grid_strides[dim] = safe_downcast<int32_t, int64_t>(grid.stride(dim));
}
auto num_threads = output.numel();
MPSStream* mpsStream = getCurrentMPSStream();
dispatch_sync_with_rethrow(mpsStream->queue(), ^() {
@autoreleasepool {
id<MTLComputeCommandEncoder> computeEncoder = mpsStream->commandEncoder();
auto pso = lib.getPipelineStateForFunc("grid_sampler_" + scalarToMetalTypeString(input));
getMPSProfiler().beginProfileKernel(pso, op_name, {input, grid});
[computeEncoder setComputePipelineState:pso];
mtl_setArgs(computeEncoder, output, input, grid, params);
mtl_dispatch1DJob(computeEncoder, pso, num_threads);
getMPSProfiler().endProfileKernel(pso);
}
});
}
} // namespace mps
Tensor grid_sampler_2d_mps(const Tensor& input,
@ -135,4 +236,21 @@ Tensor grid_sampler_2d_mps(const Tensor& input,
return output;
}
Tensor grid_sampler_3d_mps(const Tensor& input,
const Tensor& grid,
int64_t interpolation_mode,
int64_t padding_mode,
bool align_corners) {
auto output = at::empty({0}, input.options(), MemoryFormat::Contiguous);
mps::grid_sampler_template(output,
input,
grid,
interpolation_mode,
padding_mode,
align_corners,
/*sampler_dims=*/3,
/*op_name=*/"grid_sampler_3d");
return output;
}
} // namespace at::native

View File

@ -2931,6 +2931,7 @@
dispatch:
CPU: grid_sampler_3d_cpu
CUDA: grid_sampler_3d_cuda
MPS: grid_sampler_3d_mps
autogen: grid_sampler_3d.out
# `grid_sampler_3d_backward` takes in `output_mask` to optimize performance for
@ -3447,8 +3448,12 @@
- func: fbgemm_linear_fp16_weight_fp32_activation(Tensor input, Tensor packed_weight, Tensor? bias) -> Tensor
- func: fbgemm_linear_fp16_weight_fp32_activation.out(Tensor input, Tensor packed_weight, Tensor? bias, Tensor(a!) output) -> Tensor
- func: fbgemm_linear_fp16_weight(Tensor input, Tensor packed_weight, Tensor bias) -> Tensor
- func: fbgemm_linear_fp16_weight.out(Tensor input, Tensor packed_weight, Tensor bias, Tensor(a!) output) -> Tensor
- func: fbgemm_pack_quantized_matrix(Tensor input) -> Tensor
- func: fbgemm_pack_quantized_matrix.KN(Tensor input, int K, int N) -> Tensor
@ -5504,6 +5509,13 @@
tags: core
manual_cpp_binding: True
- func: sym_is_contiguous(Tensor self, MemoryFormat memory_format=contiguous_format) -> SymBool
variants: function
device_check: NoCheck
device_guard: False
tags: core
manual_cpp_binding: True
- func: sym_numel(Tensor self) -> SymInt
variants: function
device_check: NoCheck

View File

@ -260,7 +260,7 @@ std::tuple<Tensor, Tensor, Tensor> _cudnn_attention_backward(
attn_bias_ /*const std::optional<Tensor>& attn_bias*/,
out /*const Tensor& o*/,
grad_out/*const Tensor& dO*/,
logsumexp.unsqueeze(-1)/*const Tensor& softmaxstats*/,
logsumexp/*const Tensor& softmaxstats*/,
dq/*Tensor& dQ*/,
dk/*Tensor& dK*/,
dv/*Tensor& dV*/,

View File

@ -313,8 +313,15 @@ void TensorImpl::throw_data_ptr_access_error() const {
c10::SymBool TensorImpl::sym_is_contiguous_custom(
at::MemoryFormat memory_format) const {
if (C10_UNLIKELY(matches_python_custom(SizesStridesPolicy::CustomStrides))) {
return pyobj_slot_.load_pyobj_interpreter()->is_contiguous(
this, memory_format);
// To reduce BC breakage and avoid having to introduce sym_is_contiguous
// everywhere, call is_contiguous when the tensor does not have symbolic
// sizes/strides.
if (C10_UNLIKELY(has_symbolic_sizes_strides_)) {
return pyobj_slot_.load_pyobj_interpreter()->sym_is_contiguous(
this, memory_format);
} else {
return pyobj_slot_.load_pyobj_interpreter()->is_contiguous(
this, memory_format);
}
}
return sym_is_contiguous_default(memory_format);

View File

@ -60,6 +60,10 @@ struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
bool is_contiguous(const TensorImpl* self, at::MemoryFormat) const override {
PANIC(is_contiguous);
}
c10::SymBool sym_is_contiguous(const TensorImpl* self, at::MemoryFormat)
const override {
PANIC(sym_is_contiguous);
}
bool is_strides_like(const TensorImpl* self, at::MemoryFormat)
const override {
PANIC(is_strides_like);

View File

@ -168,6 +168,9 @@ struct C10_API PyInterpreterVTable {
virtual bool is_contiguous(const TensorImpl* self, at::MemoryFormat)
const = 0;
virtual c10::SymBool sym_is_contiguous(
const TensorImpl* self,
at::MemoryFormat) const = 0;
virtual bool is_strides_like(const TensorImpl* self, at::MemoryFormat)
const = 0;
virtual bool is_non_overlapping_and_dense(const TensorImpl* self) const = 0;

Binary file not shown.


View File

@ -202,6 +202,7 @@ Below are some useful tools for debugging AOT Inductor.
logging
torch.compiler_aot_inductor_minifier
torch.compiler_aot_inductor_debugging_guide
```
To enable runtime checks on inputs, set the environment variable `AOTI_RUNTIME_CHECK_INPUTS` to 1. This will raise a `RuntimeError` if the inputs to the compiled model differ in size, data type, or strides from those used during export.
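As a minimal sketch of the check in action (a toy module; this assumes the `torch._inductor.aoti_compile_and_package` and `aoti_load_package` entry points available in recent releases):
```python
import os

# Per the AOTInductor debugging guide, this flag takes effect at codegen
# time, so set it before compiling.
os.environ["AOTI_RUNTIME_CHECK_INPUTS"] = "1"

import torch


class M(torch.nn.Module):
    def forward(self, x):
        return x.relu()


ep = torch.export.export(M(), (torch.randn(4, 8),))
pkg = torch._inductor.aoti_compile_and_package(ep)  # path to the .pt2 package
compiled = torch._inductor.aoti_load_package(pkg)

compiled(torch.randn(4, 8))  # OK: matches export-time size/dtype/strides
compiled(torch.randn(2, 8))  # expected to raise RuntimeError under the checks
```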

View File

@ -0,0 +1,73 @@
# AOTInductor Debugging Guide
If you encounter CUDA illegal memory access (IMA) errors while using [AOT Inductor](./torch.compiler_aot_inductor.md), this guide provides a systematic approach to debug such errors. AOT Inductor is part of the PT2 stack, similar to torch.compile, but it produces a compilation artifact that can work in a C++ environment. CUDA illegal memory errors can happen non-deterministically and even appear transient at times.
At a high level, there are three main steps to debugging CUDA IMA errors:
- **Sanity checks**: Use basic debugging flags to catch common issues before diving deeper.
- **Pinpoint the CUDA IMA**: Make the error deterministic and identify the problematic kernel.
- **Identify problematic kernels**: Use intermediate value debugging to inspect kernel inputs and outputs.
## Step 1: Sanity Checks
Before diving deep into reliably reproducing the error, try out some existing debugging flags:
```bash
AOTI_RUNTIME_CHECK_INPUTS=1
TORCHINDUCTOR_NAN_ASSERTS=1
```
These flags take effect at compilation time (more precisely, at codegen time):
- `AOTI_RUNTIME_CHECK_INPUTS=1` checks if the inputs satisfy the same set of guards used during compilation. See {ref}`torch.compiler_troubleshooting` for more details.
- `TORCHINDUCTOR_NAN_ASSERTS=1` adds codegen before and after each Inductor's kernel to check for NaN.
## Step 2: Pinpoint the CUDA IMA
One hard part is that CUDA IMA errors can be non-deterministic. They can happen at different locations, and sometimes not happen at all (though that just means the numerics are silently incorrect). With the following two flags, we can trigger the error deterministically:
```bash
PYTORCH_NO_CUDA_MEMORY_CACHING=1
CUDA_LAUNCH_BLOCKING=1
```
These flags take effect at runtime:
- `PYTORCH_NO_CUDA_MEMORY_CACHING=1` disables PyTorch's caching allocator, which normally allocates bigger buffers than immediately needed and reuses them to reduce the number of allocations. This reuse is usually why CUDA illegal memory access errors are non-deterministic.
![How PyTorch's caching allocator can mask CUDA illegal memory access errors](./_static/img/aoti_debugging_guide/cuda_ima_cca.png)
*Figure: How PyTorch's caching allocator can mask CUDA illegal memory access errors*
- `CUDA_LAUNCH_BLOCKING=1` forces the kernels to launch one at a time. Without this, we would get the famous "CUDA kernel errors might be asynchronously reported at some other API call" warning since kernels are launched asynchronously.
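As a sketch of combining these runtime flags (`repro.py` stands in for whatever script reproduces the failure):
```python
import os
import subprocess
import sys

# Run the repro with caching disabled and synchronous launches so the
# illegal access is reported deterministically, at the offending kernel.
env = dict(
    os.environ,
    PYTORCH_NO_CUDA_MEMORY_CACHING="1",
    CUDA_LAUNCH_BLOCKING="1",
)
subprocess.run([sys.executable, "repro.py"], env=env, check=True)
```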
## Step 3: Identify Problematic Kernels with Intermediate Value Debugger
The AOTI Intermediate Value Debugger can help pinpoint the problematic kernel and get information about the inputs and outputs of said kernel.
First, use:
```bash
AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=3
```
This flag takes effect at compilation time and, at runtime, prints each kernel as it is launched. Together with the previous flags, this lets us know which kernel was launched right before the error happened.
However, the kernel where the error surfaces is not necessarily the problematic one: an earlier kernel may have produced incorrect outputs that only trigger the failure later. The natural next step is therefore to inspect the inputs to the failing kernel:
```bash
AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT="triton_poi_fused_add_ge_logical_and_logical_or_lt_231,_add_position_embeddings_kernel_5" AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER=2
```
The `AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT` environment variable takes a comma-separated list of the kernel names you want to inspect. If the inputs to a kernel are not as expected, move on to inspecting the kernel that produced the bad input.
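The same pattern works for the Step 3 flags (a sketch; `compile_and_run.py` is a placeholder, and the kernel names are the illustrative ones from above):
```python
import os
import subprocess
import sys

# Recompile and rerun with the intermediate value printer restricted to
# the suspect kernels, so only their inputs and outputs are dumped.
env = dict(
    os.environ,
    AOT_INDUCTOR_DEBUG_INTERMEDIATE_VALUE_PRINTER="2",
    AOT_INDUCTOR_FILTERED_KERNELS_TO_PRINT=(
        "triton_poi_fused_add_ge_logical_and_logical_or_lt_231,"
        "_add_position_embeddings_kernel_5"
    ),
)
subprocess.run([sys.executable, "compile_and_run.py"], env=env, check=True)
```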
## Additional Debugging Tools
### Logging and Tracing
- **tlparse / TORCH_TRACE**: Provides the complete generated output code for inspection and records the set of guards used. See {ref}`tlparse / TORCH_TRACE <tlparse-torch-trace>` for more details.
- **TORCH_LOGS**: Use `TORCH_LOGS="+inductor,output_code"` to see more PT2 internal logs. See {ref}`TORCH_LOGS <torch-logs>` for more details.
- **TORCH_SHOW_CPP_STACKTRACES**: Set `TORCH_SHOW_CPP_STACKTRACES=1` to include C++ frames in error stack traces.
### Common Sources of Issues
- [**Dynamic shapes**](./torch.compiler_dynamic_shapes.md): Historically a source of many IMAs. Pay special attention when debugging dynamic shape scenarios.
- **Custom ops**: Especially when implemented in C++ and used with dynamic shapes; the meta function may need to be SymInt-ified.

View File

@ -192,6 +192,8 @@ For more information on dynamic shapes, see [The dynamic shapes manual](https://
## Logging Tools
(tlparse-torch-trace)=
### tlparse / TORCH_TRACE
`tlparse` / `TORCH_TRACE` are a pair of tools that produce compilation reports that look like this:
@ -252,6 +254,8 @@ Here are some insights you can gain from a `tlparse`:
For example, you can look at the high-level generated FX graph or the generated Triton code.
- Is there relevant information for a particular frame? You can find these in `compilation_metrics`.
(torch-logs)=
### TORCH_LOGS
You can use the `TORCH_LOGS` environment variable to selectively enable parts of the `torch.compile` stack to log.
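The same selection can also be made programmatically (a sketch using `torch._logging.set_logs`, which mirrors the environment variable):
```python
import logging

import torch._logging

# Equivalent to TORCH_LOGS="+inductor,output_code": verbose inductor
# logging plus the generated output code artifact.
torch._logging.set_logs(inductor=logging.DEBUG, output_code=True)
```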

View File

@ -2709,6 +2709,7 @@ TEST(ProfilerDisableInCallbackTest, Basic) {
}
TEST(RecordDebugHandles, Basic) {
GTEST_SKIP() << "Test is flaky and sometimes hangs on CI. ";
// Enable the profiler in this thread
const std::set<torch::autograd::profiler::ActivityType> activities(
{torch::autograd::profiler::ActivityType::CPU});

View File

@ -10,7 +10,10 @@ from torch.distributed.pipelining._backward import (
stage_backward_input,
stage_backward_weight,
)
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_device_type import (
instantiate_device_type_tests,
skipXPUIf,
)
from torch.testing._internal.common_utils import run_tests, TestCase
@ -19,6 +22,7 @@ batch_size = 256
class StageBackwardTests(TestCase):
@skipXPUIf(True, "https://github.com/intel/torch-xpu-ops/issues/1682")
def test_stage_backward(self, device):
# MLP as a stage module
mod = MLPModule(d_hid).to(device)
@ -93,6 +97,7 @@ class StageBackwardTests(TestCase):
# Check that the weight gradients were not updated
self.assertEqual(p.grad, None)
@skipXPUIf(True, "https://github.com/intel/torch-xpu-ops/issues/1682")
def test_stage_backward_weight(self, device):
# MLP as a stage module
mod = MLPModule(d_hid).to(device)
@ -133,6 +138,7 @@ class StageBackwardTests(TestCase):
print(f"Gradient test failed for {name}: {p.grad} vs {ref_p.grad}")
raise
@skipXPUIf(True, "https://github.com/intel/torch-xpu-ops/issues/1682")
def test_stage_backward_weight_multiple_iters(self, device):
# MLP as a stage module
mod = MLPModule(d_hid).to(device)
@ -223,7 +229,9 @@ class StageBackwardTests(TestCase):
devices = ["cpu", "cuda", "hpu", "xpu"]
instantiate_device_type_tests(StageBackwardTests, globals(), only_for=devices)
instantiate_device_type_tests(
StageBackwardTests, globals(), only_for=devices, allow_xpu=True
)
if __name__ == "__main__":
run_tests()

View File

@ -9,7 +9,10 @@ from torch.distributed.pipelining.microbatch import (
split_args_kwargs_into_chunks,
TensorChunkSpec,
)
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_device_type import (
instantiate_device_type_tests,
skipXPUIf,
)
from torch.testing._internal.common_utils import run_tests, TestCase
@ -56,6 +59,7 @@ class MicrobatchTests(TestCase):
torch.testing.assert_close(merged_kwargs, kwargs)
print("Microbatch test passed")
@skipXPUIf(True, "https://github.com/intel/torch-xpu-ops/issues/1682")
def test_chunk_spec(self, device):
mod = ModelWithKwargs().to(device)
batch_size = ModelWithKwargs.DEFAULT_BATCH_SIZE
@ -84,12 +88,15 @@ class MicrobatchTests(TestCase):
ref = mod(x, y)
out = pipe(x, y)[0]
torch.testing.assert_close(out, ref)
print(f"equivalence test passed {torch.sum(out)} ref {torch.sum(ref)}")
devices = ["cpu", "cuda", "hpu", "xpu"]
instantiate_device_type_tests(MicrobatchTests, globals(), only_for=devices)
instantiate_device_type_tests(
MicrobatchTests, globals(), only_for=devices, allow_xpu=True
)
if __name__ == "__main__":
run_tests()

View File

@ -0,0 +1,110 @@
# Owner(s): ["oncall: distributed"]
# To run:
# python test/distributed/test_cupy_as_tensor.py
from dataclasses import dataclass
import torch
from torch.multiprocessing.reductions import reduce_tensor
from torch.testing._internal.common_distributed import MultiProcContinousTest
from torch.testing._internal.common_utils import (
requires_cuda_p2p_access,
run_tests,
skipIfRocm,
)
# So that tests are written in device-agnostic way
device_type = "cuda"
device_module = torch.get_device_module(device_type)
@dataclass
class CupyWrapper:
data_ptr: int
size_in_bytes: int
@property
def __cuda_array_interface__(self):
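# Minimal v3 CUDA array interface: expose the raw pointer as a 1-D
# buffer of unsigned bytes ("|u1"); the False flag marks it writable.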
return {
"shape": (self.size_in_bytes,),
"typestr": "|u1",
"data": (self.data_ptr, False),
"version": 3,
}
def from_buffer(
data_ptr: int, size_in_bytes: int, device: str, dtype: torch.dtype
) -> torch.Tensor:
data = torch.as_tensor(CupyWrapper(data_ptr, size_in_bytes), device=device).view(
dtype
)
assert data.data_ptr() == data_ptr
return data
@requires_cuda_p2p_access()
class CupyAsTensorTest(MultiProcContinousTest):
@classmethod
def backend_str(cls):
return "gloo"
def _init_device(self) -> None:
# need to use vmm api to test it,
# see https://forums.developer.nvidia.com/t/inconsistent-behavior-of-cudapointergetattributes-between-cudamalloc-ipc-and-vmm-based-ipc/339025/5 # noqa: B950
torch.cuda.memory._set_allocator_settings("expandable_segments:True")
# init and pin the process to the device
device_module.set_device(self.device)
torch.empty(1, device=self.device)
@property
def device(self) -> torch.device:
return torch.device(device_type, self.rank)
@skipIfRocm
def test_cupy_as_tensor(self) -> None:
"""
Test that torch.as_tensor works for cupy array interface
with zero-copy when the pointer is p2p-shared across processes.
"""
self._init_device()
tensor: torch.Tensor
if self.rank == 1:
# it seems only errors from non-zero ranks are caught by this test
tensor = torch.randn(2333, device=self.device)
tensor_meta = reduce_tensor(tensor)
torch.distributed.broadcast_object_list([tensor_meta], src=1)
else:
recv_list = [None]
torch.distributed.broadcast_object_list(recv_list, src=1)
tensor_meta = recv_list[0]
func, args = tensor_meta
args = list(args)
args[6] = self.rank
ipc_tensor = func(*args)
tensor = from_buffer(
ipc_tensor.data_ptr(),
ipc_tensor.numel() * ipc_tensor.element_size(),
self.device,
ipc_tensor.dtype,
)
torch.distributed.barrier()
if self.rank == 1:
tensor.fill_(1)
device_module.synchronize()
torch.distributed.barrier()
assert tensor.allclose(torch.ones_like(tensor))
torch.distributed.barrier()
@classmethod
def tearDownClass(cls):
torch.cuda.memory._set_allocator_settings("expandable_segments:False")
super().tearDownClass()
if __name__ == "__main__":
run_tests()

View File

@ -1,5 +1,5 @@
diff --git a/test/dynamo/cpython/3_13/test_contextlib.py b/test/dynamo/cpython/3_13/test_contextlib.py
index cf651959803..51fd083b112 100644
index cf651959803..256a824932d 100644
--- a/test/dynamo/cpython/3_13/test_contextlib.py
+++ b/test/dynamo/cpython/3_13/test_contextlib.py
@@ -1,3 +1,57 @@
@ -60,7 +60,7 @@ index cf651959803..51fd083b112 100644
"""Unit tests for contextlib.py, and other context managers."""
import io
@@ -14,7 +68,7 @@ from test.support.testcase import ExceptionIsLikeMixin
@@ -14,60 +68,67 @@ from test.support.testcase import ExceptionIsLikeMixin
import weakref
@ -68,8 +68,81 @@ index cf651959803..51fd083b112 100644
+class TestAbstractContextManager(__TestCase):
def test_enter(self):
class DefaultEnter(AbstractContextManager):
@@ -67,7 +121,7 @@ class TestAbstractContextManager(unittest.TestCase):
- class DefaultEnter(AbstractContextManager):
- def __exit__(self, *args):
- super().__exit__(*args)
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class DefaultEnter(AbstractContextManager):
+ def __exit__(self, *args):
+ super().__exit__(*args)
manager = DefaultEnter()
self.assertIs(manager.__enter__(), manager)
def test_slots(self):
- class DefaultContextManager(AbstractContextManager):
- __slots__ = ()
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class DefaultContextManager(AbstractContextManager):
+ __slots__ = ()
- def __exit__(self, *args):
- super().__exit__(*args)
+ def __exit__(self, *args):
+ super().__exit__(*args)
with self.assertRaises(AttributeError):
DefaultContextManager().var = 42
def test_exit_is_abstract(self):
- class MissingExit(AbstractContextManager):
- pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class MissingExit(AbstractContextManager):
+ pass
with self.assertRaises(TypeError):
MissingExit()
def test_structural_subclassing(self):
- class ManagerFromScratch:
- def __enter__(self):
- return self
- def __exit__(self, exc_type, exc_value, traceback):
- return None
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class ManagerFromScratch:
+ def __enter__(self):
+ return self
+ def __exit__(self, exc_type, exc_value, traceback):
+ return None
self.assertTrue(issubclass(ManagerFromScratch, AbstractContextManager))
- class DefaultEnter(AbstractContextManager):
- def __exit__(self, *args):
- super().__exit__(*args)
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class DefaultEnter(AbstractContextManager):
+ def __exit__(self, *args):
+ super().__exit__(*args)
self.assertTrue(issubclass(DefaultEnter, AbstractContextManager))
- class NoEnter(ManagerFromScratch):
- __enter__ = None
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class NoEnter(ManagerFromScratch):
+ __enter__ = None
self.assertFalse(issubclass(NoEnter, AbstractContextManager))
- class NoExit(ManagerFromScratch):
- __exit__ = None
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class NoExit(ManagerFromScratch):
+ __exit__ = None
self.assertFalse(issubclass(NoExit, AbstractContextManager))
@ -78,7 +151,81 @@ index cf651959803..51fd083b112 100644
def test_contextmanager_plain(self):
state = []
@@ -396,7 +450,7 @@ def woohoo():
@@ -115,8 +176,9 @@ class ContextManagerTestCase(unittest.TestCase):
self.assertEqual(frames[0].line, '1/0')
# Repeat with RuntimeError (which goes through a different code path)
- class RuntimeErrorSubclass(RuntimeError):
- pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class RuntimeErrorSubclass(RuntimeError):
+ pass
try:
with f():
@@ -128,8 +190,9 @@ class ContextManagerTestCase(unittest.TestCase):
self.assertEqual(frames[0].name, 'test_contextmanager_traceback')
self.assertEqual(frames[0].line, 'raise RuntimeErrorSubclass(42)')
- class StopIterationSubclass(StopIteration):
- pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class StopIterationSubclass(StopIteration):
+ pass
for stop_exc in (
StopIteration('spam'),
@@ -169,9 +232,9 @@ class ContextManagerTestCase(unittest.TestCase):
ctx.__enter__()
with self.assertRaises(RuntimeError):
ctx.__exit__(TypeError, TypeError("foo"), None)
- if support.check_impl_detail(cpython=True):
- # The "gen" attribute is an implementation detail.
- self.assertFalse(ctx.gen.gi_suspended)
+ # if support.check_impl_detail(cpython=True):
+ # # The "gen" attribute is an implementation detail.
+ # self.assertFalse(ctx.gen.gi_suspended)
def test_contextmanager_trap_no_yield(self):
@contextmanager
@@ -191,9 +254,9 @@ class ContextManagerTestCase(unittest.TestCase):
ctx.__enter__()
with self.assertRaises(RuntimeError):
ctx.__exit__(None, None, None)
- if support.check_impl_detail(cpython=True):
- # The "gen" attribute is an implementation detail.
- self.assertFalse(ctx.gen.gi_suspended)
+ # if support.check_impl_detail(cpython=True):
+ # # The "gen" attribute is an implementation detail.
+ # self.assertFalse(ctx.gen.gi_suspended)
def test_contextmanager_non_normalised(self):
@contextmanager
@@ -230,8 +293,9 @@ class ContextManagerTestCase(unittest.TestCase):
def woohoo():
yield
- class StopIterationSubclass(StopIteration):
- pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class StopIterationSubclass(StopIteration):
+ pass
for stop_exc in (StopIteration('spam'), StopIterationSubclass('spam')):
with self.subTest(type=type(stop_exc)):
@@ -344,8 +408,9 @@ def woohoo():
self.assertEqual(target, (11, 22, 33, 44))
def test_nokeepref(self):
- class A:
- pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class A:
+ pass
@contextmanager
def woohoo(a, b):
@@ -396,7 +461,7 @@ def woohoo():
self.assertEqual(depth, 0)
@ -87,16 +234,48 @@ index cf651959803..51fd083b112 100644
@support.requires_docstrings
def test_instance_docs(self):
@@ -430,7 +484,7 @@ class ClosingTestCase(unittest.TestCase):
@@ -407,9 +472,10 @@ class ClosingTestCase(unittest.TestCase):
def test_closing(self):
state = []
- class C:
- def close(self):
- state.append(1)
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class C:
+ def close(self):
+ state.append(1)
x = C()
self.assertEqual(state, [])
with closing(x) as y:
@@ -418,9 +484,10 @@ class ClosingTestCase(unittest.TestCase):
def test_closing_error(self):
state = []
- class C:
- def close(self):
- state.append(1)
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class C:
+ def close(self):
+ state.append(1)
x = C()
self.assertEqual(state, [])
with self.assertRaises(ZeroDivisionError):
@@ -430,16 +497,17 @@ class ClosingTestCase(unittest.TestCase):
self.assertEqual(state, [1])
-class NullcontextTestCase(unittest.TestCase):
+class NullcontextTestCase(__TestCase):
def test_nullcontext(self):
class C:
pass
@@ -439,7 +493,7 @@ class NullcontextTestCase(unittest.TestCase):
- class C:
- pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class C:
+ pass
c = C()
with nullcontext(c) as c_in:
self.assertIs(c_in, c)
@ -105,7 +284,7 @@ index cf651959803..51fd083b112 100644
def testWithOpen(self):
tfn = tempfile.mktemp()
@@ -457,7 +511,7 @@ class FileContextTestCase(unittest.TestCase):
@@ -457,7 +525,7 @@ class FileContextTestCase(unittest.TestCase):
finally:
os_helper.unlink(tfn)
@ -114,7 +293,7 @@ index cf651959803..51fd083b112 100644
def boilerPlate(self, lock, locked):
self.assertFalse(locked())
@@ -520,7 +574,7 @@ class mycontext(ContextDecorator):
@@ -520,7 +588,7 @@ class mycontext(ContextDecorator):
return self.catch
@ -123,7 +302,95 @@ index cf651959803..51fd083b112 100644
@support.requires_docstrings
def test_instance_docs(self):
@@ -680,7 +734,7 @@ class TestContextDecorator(unittest.TestCase):
@@ -584,13 +652,14 @@ class TestContextDecorator(unittest.TestCase):
def test_decorating_method(self):
context = mycontext()
- class Test(object):
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class Test(object):
- @context
- def method(self, a, b, c=None):
- self.a = a
- self.b = b
- self.c = c
+ @context
+ def method(self, a, b, c=None):
+ self.a = a
+ self.b = b
+ self.c = c
# these tests are for argument passing when used as a decorator
test = Test()
@@ -612,11 +681,12 @@ class TestContextDecorator(unittest.TestCase):
def test_typo_enter(self):
- class mycontext(ContextDecorator):
- def __unter__(self):
- pass
- def __exit__(self, *exc):
- pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class mycontext(ContextDecorator):
+ def __unter__(self):
+ pass
+ def __exit__(self, *exc):
+ pass
with self.assertRaisesRegex(TypeError, 'the context manager'):
with mycontext():
@@ -624,11 +694,12 @@ class TestContextDecorator(unittest.TestCase):
def test_typo_exit(self):
- class mycontext(ContextDecorator):
- def __enter__(self):
- pass
- def __uxit__(self, *exc):
- pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class mycontext(ContextDecorator):
+ def __enter__(self):
+ pass
+ def __uxit__(self, *exc):
+ pass
with self.assertRaisesRegex(TypeError, 'the context manager.*__exit__'):
with mycontext():
@@ -636,19 +707,20 @@ class TestContextDecorator(unittest.TestCase):
def test_contextdecorator_as_mixin(self):
- class somecontext(object):
- started = False
- exc = None
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class somecontext(object):
+ started = False
+ exc = None
- def __enter__(self):
- self.started = True
- return self
+ def __enter__(self):
+ self.started = True
+ return self
- def __exit__(self, *exc):
- self.exc = exc
+ def __exit__(self, *exc):
+ self.exc = exc
- class mycontext(somecontext, ContextDecorator):
- pass
+ class mycontext(somecontext, ContextDecorator):
+ pass
context = mycontext()
@context
@@ -680,7 +752,7 @@ class TestContextDecorator(unittest.TestCase):
self.assertEqual(state, [1, 'something else', 999])
@ -132,7 +399,164 @@ index cf651959803..51fd083b112 100644
exit_stack = None
@support.requires_docstrings
@@ -1141,7 +1195,7 @@ class TestBaseExitStack:
@@ -745,13 +817,14 @@ class TestBaseExitStack:
self.assertIsNone(exc_type)
self.assertIsNone(exc)
self.assertIsNone(exc_tb)
- class ExitCM(object):
- def __init__(self, check_exc):
- self.check_exc = check_exc
- def __enter__(self):
- self.fail("Should not be called!")
- def __exit__(self, *exc_details):
- self.check_exc(*exc_details)
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class ExitCM(object):
+ def __init__(self, check_exc):
+ self.check_exc = check_exc
+ def __enter__(self):
+ self.fail("Should not be called!")
+ def __exit__(self, *exc_details):
+ self.check_exc(*exc_details)
with self.exit_stack() as stack:
stack.push(_expect_ok)
self.assertIs(stack._exit_callbacks[-1][1], _expect_ok)
@@ -770,11 +843,12 @@ class TestBaseExitStack:
1/0
def test_enter_context(self):
- class TestCM(object):
- def __enter__(self):
- result.append(1)
- def __exit__(self, *exc_details):
- result.append(3)
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class TestCM(object):
+ def __enter__(self):
+ result.append(1)
+ def __exit__(self, *exc_details):
+ result.append(3)
result = []
cm = TestCM()
@@ -789,14 +863,15 @@ class TestBaseExitStack:
self.assertEqual(result, [1, 2, 3, 4])
def test_enter_context_errors(self):
- class LacksEnterAndExit:
- pass
- class LacksEnter:
- def __exit__(self, *exc_info):
- pass
- class LacksExit:
- def __enter__(self):
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class LacksEnterAndExit:
pass
+ class LacksEnter:
+ def __exit__(self, *exc_info):
+ pass
+ class LacksExit:
+ def __enter__(self):
+ pass
with self.exit_stack() as stack:
with self.assertRaisesRegex(TypeError, 'the context manager'):
@@ -877,32 +952,33 @@ class TestBaseExitStack:
def test_exit_exception_chaining_reference(self):
# Sanity check to make sure that ExitStack chaining matches
# actual nested with statements
- class RaiseExc:
- def __init__(self, exc):
- self.exc = exc
- def __enter__(self):
- return self
- def __exit__(self, *exc_details):
- raise self.exc
-
- class RaiseExcWithContext:
- def __init__(self, outer, inner):
- self.outer = outer
- self.inner = inner
- def __enter__(self):
- return self
- def __exit__(self, *exc_details):
- try:
- raise self.inner
- except:
- raise self.outer
-
- class SuppressExc:
- def __enter__(self):
- return self
- def __exit__(self, *exc_details):
- type(self).saved_details = exc_details
- return True
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class RaiseExc:
+ def __init__(self, exc):
+ self.exc = exc
+ def __enter__(self):
+ return self
+ def __exit__(self, *exc_details):
+ raise self.exc
+
+ class RaiseExcWithContext:
+ def __init__(self, outer, inner):
+ self.outer = outer
+ self.inner = inner
+ def __enter__(self):
+ return self
+ def __exit__(self, *exc_details):
+ try:
+ raise self.inner
+ except:
+ raise self.outer
+
+ class SuppressExc:
+ def __enter__(self):
+ return self
+ def __exit__(self, *exc_details):
+ type(self).saved_details = exc_details
+ return True
try:
with RaiseExc(IndexError):
@@ -957,8 +1033,9 @@ class TestBaseExitStack:
# Ensure ExitStack chaining matches actual nested `with` statements
# regarding explicit __context__ = None.
- class MyException(Exception):
- pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class MyException(Exception):
+ pass
@contextmanager
def my_cm():
@@ -1096,7 +1173,8 @@ class TestBaseExitStack:
stack.callback(int)
def test_instance_bypass(self):
- class Example(object): pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class Example(object): pass
cm = Example()
cm.__enter__ = object()
cm.__exit__ = object()
@@ -1108,8 +1186,9 @@ class TestBaseExitStack:
def test_dont_reraise_RuntimeError(self):
# https://bugs.python.org/issue27122
- class UniqueException(Exception): pass
- class UniqueRuntimeError(RuntimeError): pass
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class UniqueException(Exception): pass
+ class UniqueRuntimeError(RuntimeError): pass
@contextmanager
def second():
@@ -1141,7 +1220,7 @@ class TestBaseExitStack:
self.assertIs(exc.__cause__, exc.__context__)
@ -141,7 +565,7 @@ index cf651959803..51fd083b112 100644
exit_stack = ExitStack
callback_error_internal_frames = [
('__exit__', 'raise exc'),
@@ -1149,7 +1203,7 @@ class TestExitStack(TestBaseExitStack, unittest.TestCase):
@@ -1149,7 +1228,7 @@ class TestExitStack(TestBaseExitStack, unittest.TestCase):
]
@ -150,7 +574,7 @@ index cf651959803..51fd083b112 100644
redirect_stream = None
orig_stream = None
@@ -1206,19 +1260,19 @@ class TestRedirectStream:
@@ -1206,19 +1285,19 @@ class TestRedirectStream:
self.assertEqual(s, "Hello World!\n")
@ -173,7 +597,7 @@ index cf651959803..51fd083b112 100644
@support.requires_docstrings
def test_instance_docs(self):
@@ -1315,7 +1369,7 @@ class TestSuppress(ExceptionIsLikeMixin, unittest.TestCase):
@@ -1315,7 +1394,7 @@ class TestSuppress(ExceptionIsLikeMixin, unittest.TestCase):
)
@ -182,7 +606,7 @@ index cf651959803..51fd083b112 100644
def make_relative_path(self, *parts):
return os.path.join(
os.path.dirname(os.path.realpath(__file__)),
@@ -1331,6 +1385,7 @@ class TestChdir(unittest.TestCase):
@@ -1331,6 +1410,7 @@ class TestChdir(unittest.TestCase):
self.assertEqual(os.getcwd(), target)
self.assertEqual(os.getcwd(), old_cwd)
@ -190,7 +614,7 @@ index cf651959803..51fd083b112 100644
def test_reentrant(self):
old_cwd = os.getcwd()
target1 = self.make_relative_path('data')
@@ -1363,4 +1418,4 @@ class TestChdir(unittest.TestCase):
@@ -1363,4 +1443,4 @@ class TestChdir(unittest.TestCase):
if __name__ == "__main__":

View File

@ -71,52 +71,59 @@ import weakref
class TestAbstractContextManager(__TestCase):
def test_enter(self):
class DefaultEnter(AbstractContextManager):
def __exit__(self, *args):
super().__exit__(*args)
with torch._dynamo.set_fullgraph(fullgraph=False):
class DefaultEnter(AbstractContextManager):
def __exit__(self, *args):
super().__exit__(*args)
manager = DefaultEnter()
self.assertIs(manager.__enter__(), manager)
def test_slots(self):
class DefaultContextManager(AbstractContextManager):
__slots__ = ()
with torch._dynamo.set_fullgraph(fullgraph=False):
class DefaultContextManager(AbstractContextManager):
__slots__ = ()
def __exit__(self, *args):
super().__exit__(*args)
def __exit__(self, *args):
super().__exit__(*args)
with self.assertRaises(AttributeError):
DefaultContextManager().var = 42
def test_exit_is_abstract(self):
class MissingExit(AbstractContextManager):
pass
with torch._dynamo.set_fullgraph(fullgraph=False):
class MissingExit(AbstractContextManager):
pass
with self.assertRaises(TypeError):
MissingExit()
def test_structural_subclassing(self):
class ManagerFromScratch:
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
return None
with torch._dynamo.set_fullgraph(fullgraph=False):
class ManagerFromScratch:
def __enter__(self):
return self
def __exit__(self, exc_type, exc_value, traceback):
return None
self.assertTrue(issubclass(ManagerFromScratch, AbstractContextManager))
class DefaultEnter(AbstractContextManager):
def __exit__(self, *args):
super().__exit__(*args)
with torch._dynamo.set_fullgraph(fullgraph=False):
class DefaultEnter(AbstractContextManager):
def __exit__(self, *args):
super().__exit__(*args)
self.assertTrue(issubclass(DefaultEnter, AbstractContextManager))
class NoEnter(ManagerFromScratch):
__enter__ = None
with torch._dynamo.set_fullgraph(fullgraph=False):
class NoEnter(ManagerFromScratch):
__enter__ = None
self.assertFalse(issubclass(NoEnter, AbstractContextManager))
class NoExit(ManagerFromScratch):
__exit__ = None
with torch._dynamo.set_fullgraph(fullgraph=False):
class NoExit(ManagerFromScratch):
__exit__ = None
self.assertFalse(issubclass(NoExit, AbstractContextManager))
@ -169,8 +176,9 @@ class ContextManagerTestCase(__TestCase):
self.assertEqual(frames[0].line, '1/0')
# Repeat with RuntimeError (which goes through a different code path)
class RuntimeErrorSubclass(RuntimeError):
pass
with torch._dynamo.set_fullgraph(fullgraph=False):
class RuntimeErrorSubclass(RuntimeError):
pass
try:
with f():
@ -182,8 +190,9 @@ class ContextManagerTestCase(__TestCase):
self.assertEqual(frames[0].name, 'test_contextmanager_traceback')
self.assertEqual(frames[0].line, 'raise RuntimeErrorSubclass(42)')
class StopIterationSubclass(StopIteration):
pass
with torch._dynamo.set_fullgraph(fullgraph=False):
class StopIterationSubclass(StopIteration):
pass
for stop_exc in (
StopIteration('spam'),
@ -223,9 +232,9 @@ class ContextManagerTestCase(__TestCase):
ctx.__enter__()
with self.assertRaises(RuntimeError):
         ctx.__exit__(TypeError, TypeError("foo"), None)
-        if support.check_impl_detail(cpython=True):
-            # The "gen" attribute is an implementation detail.
-            self.assertFalse(ctx.gen.gi_suspended)
+        # if support.check_impl_detail(cpython=True):
+        #     # The "gen" attribute is an implementation detail.
+        #     self.assertFalse(ctx.gen.gi_suspended)

     def test_contextmanager_trap_no_yield(self):
         @contextmanager
@@ -245,9 +254,9 @@ class ContextManagerTestCase(__TestCase):
         ctx.__enter__()
         with self.assertRaises(RuntimeError):
             ctx.__exit__(None, None, None)
-        if support.check_impl_detail(cpython=True):
-            # The "gen" attribute is an implementation detail.
-            self.assertFalse(ctx.gen.gi_suspended)
+        # if support.check_impl_detail(cpython=True):
+        #     # The "gen" attribute is an implementation detail.
+        #     self.assertFalse(ctx.gen.gi_suspended)

     def test_contextmanager_non_normalised(self):
         @contextmanager
@@ -284,8 +293,9 @@ class ContextManagerTestCase(__TestCase):
         def woohoo():
             yield

-        class StopIterationSubclass(StopIteration):
-            pass
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class StopIterationSubclass(StopIteration):
+                pass

         for stop_exc in (StopIteration('spam'), StopIterationSubclass('spam')):
             with self.subTest(type=type(stop_exc)):
@@ -398,8 +408,9 @@ def woohoo():
         self.assertEqual(target, (11, 22, 33, 44))

     def test_nokeepref(self):
-        class A:
-            pass
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class A:
+                pass

         @contextmanager
         def woohoo(a, b):
@@ -461,9 +472,10 @@ class ClosingTestCase(__TestCase):
     def test_closing(self):
         state = []
-        class C:
-            def close(self):
-                state.append(1)
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class C:
+                def close(self):
+                    state.append(1)
         x = C()
         self.assertEqual(state, [])
         with closing(x) as y:
@@ -472,9 +484,10 @@ class ClosingTestCase(__TestCase):
     def test_closing_error(self):
         state = []
-        class C:
-            def close(self):
-                state.append(1)
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class C:
+                def close(self):
+                    state.append(1)
         x = C()
         self.assertEqual(state, [])
         with self.assertRaises(ZeroDivisionError):
@@ -486,8 +499,9 @@ class ClosingTestCase(__TestCase):
 class NullcontextTestCase(__TestCase):
     def test_nullcontext(self):
-        class C:
-            pass
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class C:
+                pass
         c = C()
         with nullcontext(c) as c_in:
             self.assertIs(c_in, c)
@@ -638,13 +652,14 @@ class TestContextDecorator(__TestCase):
     def test_decorating_method(self):
         context = mycontext()

-        class Test(object):
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class Test(object):

-            @context
-            def method(self, a, b, c=None):
-                self.a = a
-                self.b = b
-                self.c = c
+                @context
+                def method(self, a, b, c=None):
+                    self.a = a
+                    self.b = b
+                    self.c = c

         # these tests are for argument passing when used as a decorator
         test = Test()
@@ -666,11 +681,12 @@ class TestContextDecorator(__TestCase):
     def test_typo_enter(self):
-        class mycontext(ContextDecorator):
-            def __unter__(self):
-                pass
-            def __exit__(self, *exc):
-                pass
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class mycontext(ContextDecorator):
+                def __unter__(self):
+                    pass
+                def __exit__(self, *exc):
+                    pass

         with self.assertRaisesRegex(TypeError, 'the context manager'):
             with mycontext():
@@ -678,11 +694,12 @@ class TestContextDecorator(__TestCase):
     def test_typo_exit(self):
-        class mycontext(ContextDecorator):
-            def __enter__(self):
-                pass
-            def __uxit__(self, *exc):
-                pass
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class mycontext(ContextDecorator):
+                def __enter__(self):
+                    pass
+                def __uxit__(self, *exc):
+                    pass

         with self.assertRaisesRegex(TypeError, 'the context manager.*__exit__'):
             with mycontext():
@@ -690,19 +707,20 @@ class TestContextDecorator(__TestCase):
     def test_contextdecorator_as_mixin(self):
-        class somecontext(object):
-            started = False
-            exc = None
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class somecontext(object):
+                started = False
+                exc = None

-            def __enter__(self):
-                self.started = True
-                return self
+                def __enter__(self):
+                    self.started = True
+                    return self

-            def __exit__(self, *exc):
-                self.exc = exc
+                def __exit__(self, *exc):
+                    self.exc = exc

-        class mycontext(somecontext, ContextDecorator):
-            pass
+            class mycontext(somecontext, ContextDecorator):
+                pass

         context = mycontext()
         @context
@@ -799,13 +817,14 @@ class _TestBaseExitStack:
             self.assertIsNone(exc_type)
             self.assertIsNone(exc)
             self.assertIsNone(exc_tb)
-        class ExitCM(object):
-            def __init__(self, check_exc):
-                self.check_exc = check_exc
-            def __enter__(self):
-                self.fail("Should not be called!")
-            def __exit__(self, *exc_details):
-                self.check_exc(*exc_details)
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class ExitCM(object):
+                def __init__(self, check_exc):
+                    self.check_exc = check_exc
+                def __enter__(self):
+                    self.fail("Should not be called!")
+                def __exit__(self, *exc_details):
+                    self.check_exc(*exc_details)
         with self.exit_stack() as stack:
             stack.push(_expect_ok)
             self.assertIs(stack._exit_callbacks[-1][1], _expect_ok)
@@ -824,11 +843,12 @@ class _TestBaseExitStack:
             1/0

     def test_enter_context(self):
-        class TestCM(object):
-            def __enter__(self):
-                result.append(1)
-            def __exit__(self, *exc_details):
-                result.append(3)
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class TestCM(object):
+                def __enter__(self):
+                    result.append(1)
+                def __exit__(self, *exc_details):
+                    result.append(3)

         result = []
         cm = TestCM()
@@ -843,14 +863,15 @@ class _TestBaseExitStack:
         self.assertEqual(result, [1, 2, 3, 4])

     def test_enter_context_errors(self):
-        class LacksEnterAndExit:
-            pass
-        class LacksEnter:
-            def __exit__(self, *exc_info):
-                pass
-        class LacksExit:
-            def __enter__(self):
-                pass
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class LacksEnterAndExit:
+                pass
+            class LacksEnter:
+                def __exit__(self, *exc_info):
+                    pass
+            class LacksExit:
+                def __enter__(self):
+                    pass

         with self.exit_stack() as stack:
             with self.assertRaisesRegex(TypeError, 'the context manager'):
@@ -931,32 +952,33 @@ class _TestBaseExitStack:
     def test_exit_exception_chaining_reference(self):
         # Sanity check to make sure that ExitStack chaining matches
         # actual nested with statements
-        class RaiseExc:
-            def __init__(self, exc):
-                self.exc = exc
-            def __enter__(self):
-                return self
-            def __exit__(self, *exc_details):
-                raise self.exc
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class RaiseExc:
+                def __init__(self, exc):
+                    self.exc = exc
+                def __enter__(self):
+                    return self
+                def __exit__(self, *exc_details):
+                    raise self.exc

-        class RaiseExcWithContext:
-            def __init__(self, outer, inner):
-                self.outer = outer
-                self.inner = inner
-            def __enter__(self):
-                return self
-            def __exit__(self, *exc_details):
-                try:
-                    raise self.inner
-                except:
-                    raise self.outer
+            class RaiseExcWithContext:
+                def __init__(self, outer, inner):
+                    self.outer = outer
+                    self.inner = inner
+                def __enter__(self):
+                    return self
+                def __exit__(self, *exc_details):
+                    try:
+                        raise self.inner
+                    except:
+                        raise self.outer

-        class SuppressExc:
-            def __enter__(self):
-                return self
-            def __exit__(self, *exc_details):
-                type(self).saved_details = exc_details
-                return True
+            class SuppressExc:
+                def __enter__(self):
+                    return self
+                def __exit__(self, *exc_details):
+                    type(self).saved_details = exc_details
+                    return True

         try:
             with RaiseExc(IndexError):
@@ -1011,8 +1033,9 @@ class _TestBaseExitStack:
         # Ensure ExitStack chaining matches actual nested `with` statements
         # regarding explicit __context__ = None.
-        class MyException(Exception):
-            pass
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class MyException(Exception):
+                pass

         @contextmanager
         def my_cm():
@@ -1150,7 +1173,8 @@ class _TestBaseExitStack:
             stack.callback(int)

     def test_instance_bypass(self):
-        class Example(object): pass
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class Example(object): pass
         cm = Example()
         cm.__enter__ = object()
         cm.__exit__ = object()
@@ -1162,8 +1186,9 @@ class _TestBaseExitStack:
     def test_dont_reraise_RuntimeError(self):
         # https://bugs.python.org/issue27122
-        class UniqueException(Exception): pass
-        class UniqueRuntimeError(RuntimeError): pass
+        with torch._dynamo.set_fullgraph(fullgraph=False):
+            class UniqueException(Exception): pass
+            class UniqueRuntimeError(RuntimeError): pass

         @contextmanager
         def second():
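The change repeated throughout this file is one mechanical pattern: class definitions created inside test bodies are wrapped in torch._dynamo.set_fullgraph(fullgraph=False), which lets Dynamo fall back to a graph break on the class body instead of failing fullgraph compilation. A minimal sketch of the pattern, with an illustrative class:

import torch

# The wrapped region may graph-break; outside it, fullgraph behavior applies.
with torch._dynamo.set_fullgraph(fullgraph=False):
    class C:
        def close(self):
            pass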


@@ -0,0 +1,98 @@
diff --git a/test/dynamo/cpython/3_13/test_defaultdict.py b/test/dynamo/cpython/3_13/test_defaultdict.py
index bdbe9b81e8f..d55f1dc54c6 100644
--- a/test/dynamo/cpython/3_13/test_defaultdict.py
+++ b/test/dynamo/cpython/3_13/test_defaultdict.py
@@ -1,3 +1,60 @@
+# ======= BEGIN Dynamo patch =======
+# Owner(s): ["module: dynamo"]
+
+# ruff: noqa
+# flake8: noqa
+
+# Test copied from
+# https://raw.githubusercontent.com/python/cpython/refs/tags/v3.13.5/Lib/test/test_defaultdict.py
+
+import sys
+import torch
+import torch._dynamo.test_case
+import unittest
+from torch._dynamo.test_case import CPythonTestCase
+from torch.testing._internal.common_utils import (
+ run_tests,
+)
+
+__TestCase = CPythonTestCase
+
+
+# redirect import statements
+import sys
+import importlib.abc
+
+redirect_imports = (
+ "test.mapping_tests",
+ "test.typinganndata",
+ "test.test_grammar",
+ "test.test_math",
+ "test.test_iter",
+ "test.typinganndata.ann_module",
+)
+
+class RedirectImportFinder(importlib.abc.MetaPathFinder):
+ def find_spec(self, fullname, path, target=None):
+ # Check if the import is the problematic one
+ if fullname in redirect_imports:
+ try:
+ # Attempt to import the standalone module
+ name = fullname.removeprefix("test.")
+ r = importlib.import_module(name)
+ # Redirect the module in sys.modules
+ sys.modules[fullname] = r
+ # Return a module spec from the found module
+ return importlib.util.find_spec(name)
+ except ImportError:
+ return None
+ return None
+
+# Add the custom finder to sys.meta_path
+sys.meta_path.insert(0, RedirectImportFinder())
+
+
+# ======= END DYNAMO PATCH =======
+
+
"""Unit tests for collections.defaultdict."""
import copy
@@ -9,7 +66,7 @@ from collections import defaultdict
def foobar():
return list
-class TestDefaultDict(unittest.TestCase):
+class TestDefaultDict(__TestCase):
def test_basic(self):
d1 = defaultdict()
@@ -127,11 +184,12 @@ class TestDefaultDict(unittest.TestCase):
def test_recursive_repr(self):
# Issue2045: stack overflow when default_factory is a bound method
- class sub(defaultdict):
- def __init__(self):
- self.default_factory = self._factory
- def _factory(self):
- return []
+ with torch._dynamo.set_fullgraph(fullgraph=False):
+ class sub(defaultdict):
+ def __init__(self):
+ self.default_factory = self._factory
+ def _factory(self):
+ return []
d = sub()
self.assertRegex(repr(d),
r"sub\(<bound method .*sub\._factory "
@@ -187,4 +245,4 @@ class TestDefaultDict(unittest.TestCase):
i |= None
if __name__ == "__main__":
- unittest.main()
+ run_tests()


@@ -0,0 +1,248 @@
# ======= BEGIN Dynamo patch =======
# Owner(s): ["module: dynamo"]
# ruff: noqa
# flake8: noqa
# Test copied from
# https://raw.githubusercontent.com/python/cpython/refs/tags/v3.13.5/Lib/test/test_defaultdict.py
import sys
import torch
import torch._dynamo.test_case
import unittest
from torch._dynamo.test_case import CPythonTestCase
from torch.testing._internal.common_utils import (
run_tests,
)
__TestCase = CPythonTestCase
# redirect import statements
import sys
import importlib.abc
redirect_imports = (
"test.mapping_tests",
"test.typinganndata",
"test.test_grammar",
"test.test_math",
"test.test_iter",
"test.typinganndata.ann_module",
)
class RedirectImportFinder(importlib.abc.MetaPathFinder):
def find_spec(self, fullname, path, target=None):
# Check if the import is the problematic one
if fullname in redirect_imports:
try:
# Attempt to import the standalone module
name = fullname.removeprefix("test.")
r = importlib.import_module(name)
# Redirect the module in sys.modules
sys.modules[fullname] = r
# Return a module spec from the found module
return importlib.util.find_spec(name)
except ImportError:
return None
return None
# Add the custom finder to sys.meta_path
sys.meta_path.insert(0, RedirectImportFinder())
# ======= END DYNAMO PATCH =======
"""Unit tests for collections.defaultdict."""
import copy
import pickle
import unittest
from collections import defaultdict
def foobar():
return list
class TestDefaultDict(__TestCase):
def test_basic(self):
d1 = defaultdict()
self.assertEqual(d1.default_factory, None)
d1.default_factory = list
d1[12].append(42)
self.assertEqual(d1, {12: [42]})
d1[12].append(24)
self.assertEqual(d1, {12: [42, 24]})
d1[13]
d1[14]
self.assertEqual(d1, {12: [42, 24], 13: [], 14: []})
self.assertTrue(d1[12] is not d1[13] is not d1[14])
d2 = defaultdict(list, foo=1, bar=2)
self.assertEqual(d2.default_factory, list)
self.assertEqual(d2, {"foo": 1, "bar": 2})
self.assertEqual(d2["foo"], 1)
self.assertEqual(d2["bar"], 2)
self.assertEqual(d2[42], [])
self.assertIn("foo", d2)
self.assertIn("foo", d2.keys())
self.assertIn("bar", d2)
self.assertIn("bar", d2.keys())
self.assertIn(42, d2)
self.assertIn(42, d2.keys())
self.assertNotIn(12, d2)
self.assertNotIn(12, d2.keys())
d2.default_factory = None
self.assertEqual(d2.default_factory, None)
try:
d2[15]
except KeyError as err:
self.assertEqual(err.args, (15,))
else:
self.fail("d2[15] didn't raise KeyError")
self.assertRaises(TypeError, defaultdict, 1)
def test_missing(self):
d1 = defaultdict()
self.assertRaises(KeyError, d1.__missing__, 42)
d1.default_factory = list
self.assertEqual(d1.__missing__(42), [])
def test_repr(self):
d1 = defaultdict()
self.assertEqual(d1.default_factory, None)
self.assertEqual(repr(d1), "defaultdict(None, {})")
self.assertEqual(eval(repr(d1)), d1)
d1[11] = 41
self.assertEqual(repr(d1), "defaultdict(None, {11: 41})")
d2 = defaultdict(int)
self.assertEqual(d2.default_factory, int)
d2[12] = 42
self.assertEqual(repr(d2), "defaultdict(<class 'int'>, {12: 42})")
def foo(): return 43
d3 = defaultdict(foo)
self.assertTrue(d3.default_factory is foo)
d3[13]
self.assertEqual(repr(d3), "defaultdict(%s, {13: 43})" % repr(foo))
def test_copy(self):
d1 = defaultdict()
d2 = d1.copy()
self.assertEqual(type(d2), defaultdict)
self.assertEqual(d2.default_factory, None)
self.assertEqual(d2, {})
d1.default_factory = list
d3 = d1.copy()
self.assertEqual(type(d3), defaultdict)
self.assertEqual(d3.default_factory, list)
self.assertEqual(d3, {})
d1[42]
d4 = d1.copy()
self.assertEqual(type(d4), defaultdict)
self.assertEqual(d4.default_factory, list)
self.assertEqual(d4, {42: []})
d4[12]
self.assertEqual(d4, {42: [], 12: []})
# Issue 6637: Copy fails for empty default dict
d = defaultdict()
d['a'] = 42
e = d.copy()
self.assertEqual(e['a'], 42)
def test_shallow_copy(self):
d1 = defaultdict(foobar, {1: 1})
d2 = copy.copy(d1)
self.assertEqual(d2.default_factory, foobar)
self.assertEqual(d2, d1)
d1.default_factory = list
d2 = copy.copy(d1)
self.assertEqual(d2.default_factory, list)
self.assertEqual(d2, d1)
def test_deep_copy(self):
d1 = defaultdict(foobar, {1: [1]})
d2 = copy.deepcopy(d1)
self.assertEqual(d2.default_factory, foobar)
self.assertEqual(d2, d1)
self.assertTrue(d1[1] is not d2[1])
d1.default_factory = list
d2 = copy.deepcopy(d1)
self.assertEqual(d2.default_factory, list)
self.assertEqual(d2, d1)
def test_keyerror_without_factory(self):
d1 = defaultdict()
try:
d1[(1,)]
except KeyError as err:
self.assertEqual(err.args[0], (1,))
else:
self.fail("expected KeyError")
def test_recursive_repr(self):
# Issue2045: stack overflow when default_factory is a bound method
with torch._dynamo.set_fullgraph(fullgraph=False):
class sub(defaultdict):
def __init__(self):
self.default_factory = self._factory
def _factory(self):
return []
d = sub()
self.assertRegex(repr(d),
r"sub\(<bound method .*sub\._factory "
r"of sub\(\.\.\., \{\}\)>, \{\}\)")
def test_callable_arg(self):
self.assertRaises(TypeError, defaultdict, {})
def test_pickling(self):
d = defaultdict(int)
d[1]
for proto in range(pickle.HIGHEST_PROTOCOL + 1):
s = pickle.dumps(d, proto)
o = pickle.loads(s)
self.assertEqual(d, o)
def test_union(self):
i = defaultdict(int, {1: 1, 2: 2})
s = defaultdict(str, {0: "zero", 1: "one"})
i_s = i | s
self.assertIs(i_s.default_factory, int)
self.assertDictEqual(i_s, {1: "one", 2: 2, 0: "zero"})
self.assertEqual(list(i_s), [1, 2, 0])
s_i = s | i
self.assertIs(s_i.default_factory, str)
self.assertDictEqual(s_i, {0: "zero", 1: 1, 2: 2})
self.assertEqual(list(s_i), [0, 1, 2])
i_ds = i | dict(s)
self.assertIs(i_ds.default_factory, int)
self.assertDictEqual(i_ds, {1: "one", 2: 2, 0: "zero"})
self.assertEqual(list(i_ds), [1, 2, 0])
ds_i = dict(s) | i
self.assertIs(ds_i.default_factory, int)
self.assertDictEqual(ds_i, {0: "zero", 1: 1, 2: 2})
self.assertEqual(list(ds_i), [0, 1, 2])
with self.assertRaises(TypeError):
i | list(s.items())
with self.assertRaises(TypeError):
list(s.items()) | i
# We inherit a fine |= from dict, so just a few sanity checks here:
i |= list(s.items())
self.assertIs(i.default_factory, int)
self.assertDictEqual(i, {1: "one", 2: 2, 0: "zero"})
self.assertEqual(list(i), [1, 2, 0])
with self.assertRaises(TypeError):
i |= None
if __name__ == "__main__":
run_tests()
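The RedirectImportFinder shim at the top of this file is a generic importlib trick: a MetaPathFinder at the front of sys.meta_path that aliases "test.<name>" imports to standalone top-level modules. A self-contained sketch of the same mechanism, using example module names rather than the ones the patch targets:

import importlib
import importlib.abc
import importlib.util
import sys

class AliasFinder(importlib.abc.MetaPathFinder):
    def find_spec(self, fullname, path, target=None):
        # Redirect any "test.<name>" import to the top-level module <name>.
        if fullname.startswith("test."):
            name = fullname.removeprefix("test.")
            try:
                module = importlib.import_module(name)
            except ImportError:
                return None
            sys.modules[fullname] = module
            return importlib.util.find_spec(name)
        return None

sys.meta_path.insert(0, AliasFinder())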


@@ -1,5 +1,5 @@
 diff --git a/test/dynamo/cpython/3_13/test_generators.py b/test/dynamo/cpython/3_13/test_generators.py
-index e48d79d34f4..a48da0914b9 100644
+index 515fe7407f1..a48da0914b9 100644
 --- a/test/dynamo/cpython/3_13/test_generators.py
 +++ b/test/dynamo/cpython/3_13/test_generators.py
 @@ -1,3 +1,56 @@
@@ -105,7 +105,8 @@ index e48d79d34f4..a48da0914b9 100644
+ return self.val
+
+ # No __iter__ method
+
-class ModifyUnderlyingIterableTest(unittest.TestCase):
+ class C:
+
+ def __iter__(self):
@@ -113,8 +114,7 @@ index e48d79d34f4..a48da0914b9 100644
+
+ self.assertEqual([1,2], list(i for i in C()))
+
-class ModifyUnderlyingIterableTest(unittest.TestCase):
+
+class ModifyUnderlyingIterableTest(__TestCase):
iterables = [
range(0),
@@ -137,99 +137,16 @@ index e48d79d34f4..a48da0914b9 100644
def test_close_no_return_value(self):
def f():
-@@ -630,90 +706,7 @@ class GeneratorCloseTest(unittest.TestCase):
+@@ -630,7 +706,7 @@ class GeneratorCloseTest(unittest.TestCase):
self.assertIsNone(f_wr())
-# See https://github.com/python/cpython/issues/125723
-class GeneratorDeallocTest(unittest.TestCase):
- def test_frame_outlives_generator(self):
- def g1():
- a = 42
- yield sys._getframe()
-
- def g2():
- a = 42
- yield
-
- def g3(obj):
- a = 42
- obj.frame = sys._getframe()
- yield
-
- class ObjectWithFrame():
- def __init__(self):
- self.frame = None
-
- def get_frame(index):
- if index == 1:
- return next(g1())
- elif index == 2:
- gen = g2()
- next(gen)
- return gen.gi_frame
- elif index == 3:
- obj = ObjectWithFrame()
- next(g3(obj))
- return obj.frame
- else:
- return None
-
- for index in (1, 2, 3):
- with self.subTest(index=index):
- frame = get_frame(index)
- frame_locals = frame.f_locals
- self.assertIn('a', frame_locals)
- self.assertEqual(frame_locals['a'], 42)
-
- def test_frame_locals_outlive_generator(self):
- frame_locals1 = None
-
- def g1():
- nonlocal frame_locals1
- frame_locals1 = sys._getframe().f_locals
- a = 42
- yield
-
- def g2():
- a = 42
- yield sys._getframe().f_locals
-
- def get_frame_locals(index):
- if index == 1:
- nonlocal frame_locals1
- next(g1())
- return frame_locals1
- if index == 2:
- return next(g2())
- else:
- return None
-
- for index in (1, 2):
- with self.subTest(index=index):
- frame_locals = get_frame_locals(index)
- self.assertIn('a', frame_locals)
- self.assertEqual(frame_locals['a'], 42)
-
- def test_frame_locals_outlive_generator_with_exec(self):
- def g():
- a = 42
- yield locals(), sys._getframe().f_locals
-
- locals_ = {'g': g}
- for i in range(10):
- exec("snapshot, live_locals = next(g())", locals=locals_)
- for l in (locals_['snapshot'], locals_['live_locals']):
- self.assertIn('a', l)
- self.assertEqual(l['a'], 42)
-
-
-class GeneratorThrowTest(unittest.TestCase):
+class GeneratorThrowTest(__TestCase):
def test_exception_context_with_yield(self):
def f():
-@@ -812,7 +805,7 @@ class GeneratorThrowTest(unittest.TestCase):
+@@ -729,7 +805,7 @@ class GeneratorThrowTest(unittest.TestCase):
gen.throw(ValueError)
@@ -238,7 +155,7 @@ index e48d79d34f4..a48da0914b9 100644
def check_stack_names(self, frame, expected):
names = []
-@@ -861,7 +854,7 @@ class GeneratorStackTraceTest(unittest.TestCase):
+@@ -778,7 +854,7 @@ class GeneratorStackTraceTest(unittest.TestCase):
self.check_yield_from_example(call_throw)
@@ -247,7 +164,7 @@ index e48d79d34f4..a48da0914b9 100644
def test_generator_gi_yieldfrom(self):
def a():
self.assertEqual(inspect.getgeneratorstate(gen_b), inspect.GEN_RUNNING)
-@@ -2752,21 +2745,27 @@ test_generators just happened to be the test that drew these out.
+@@ -2669,21 +2745,27 @@ test_generators just happened to be the test that drew these out.
"""


@@ -1515,6 +1515,76 @@ class TestGeneratorThrow(GeneratorTestsBase):
         self._compile_check(fn)

+    def test_return_const_value_in_except_and_finally(self):
+        def whoo():
+            try:
+                yield 1
+            except ValueError:
+                return 2  # noqa: B901
+            finally:
+                return 3  # noqa: B012, SIM107, B901
+
+        def fn(t):
+            gen = whoo()
+            next(gen)
+            try:
+                gen.throw(ValueError)
+            except StopIteration as e:
+                assert e.args[0] == 3
+            except Exception as e:
+                raise AssertionError from e
+            return t.sin()
+
+        self._compile_check(fn)
+
+    def test_return_value_in_except_and_finally(self):
+        class Foo:
+            def __init__(self, x):
+                self.x = x
+
+        def whoo():
+            try:
+                yield 1
+            except ValueError:
+                return Foo(2)  # noqa: B901
+            finally:
+                return Foo(3)  # noqa: B012, SIM107, B901
+
+        def fn(t):
+            gen = whoo()
+            next(gen)
+            try:
+                gen.throw(ValueError)
+            except StopIteration as e:
+                assert e.args[0].x == 3
+            except Exception as e:
+                raise AssertionError from e
+            return t.sin()
+
+        self._compile_check(fn)
+
+    def test_return_None_in_except_and_finally(self):
+        def whoo():
+            try:
+                yield 1
+            except ValueError:
+                return 2  # noqa: B901
+            finally:
+                return  # noqa: B012, SIM107
+
+        def fn(t):
+            gen = whoo()
+            next(gen)
+            try:
+                gen.throw(ValueError)
+            except StopIteration as e:
+                assert len(e.args) == 0
+            except Exception as e:
+                raise AssertionError from e
+            return t.sin()
+
+        self._compile_check(fn)
+

 instantiate_parametrized_tests(GeneratorTests)
 instantiate_parametrized_tests(TestGeneratorSend)
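The three new tests pin down plain CPython semantics that the compiled path has to reproduce: when a generator is closed via throw(), a return inside finally overrides the return from the except branch, and the returned value surfaces as the StopIteration payload. A minimal eager-mode sketch of the first case:

def whoo():
    try:
        yield 1
    except ValueError:
        return 2
    finally:
        return 3  # the finally return wins over the except-branch return

gen = whoo()
next(gen)
try:
    gen.throw(ValueError)
except StopIteration as e:
    assert e.args[0] == 3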


@@ -4,9 +4,7 @@ import contextlib

 import torch
 import torch.fx
 from torch._dynamo.graph_deduplication import apply_graph_deduplication
-from torch._dynamo.graph_utils import _detect_cycles
-from torch._dynamo.output_graph import FakeRootModule
 from torch._dynamo.test_case import TestCase
 from torch._dynamo.testing import (
     AotEagerAndRecordGraphs,
@@ -1131,82 +1129,6 @@ def forward(self, L_x_ : torch.Tensor, L_y_ : torch.Tensor):
         result_eager = fn(*inps)
         self.assertEqual(result_compiled, result_eager)

-    def test_tuple_inputs(self):
-        with (
-            torch._dynamo.config.patch("use_graph_deduplication", False),
-            torch._dynamo.config.patch("track_nodes_for_deduplication", True),
-        ):
-
-            def inner(x, y):
-                x0, x1 = torch.split(x, 5)
-                return x0 + x1 + y
-
-            def fn(x, y):
-                o1 = inner(x, y)
-                o2 = inner(x, y)
-                o3 = inner(x, y)
-                o4 = inner(x, y)
-                return o1.sum() + o2.sum() + o3.sum() + o4.sum()
-
-            graph, tracker = extract_graph_and_tracker(
-                fn, torch.rand(10, 10), torch.rand(5, 10)
-            )
-
-            class MockOutputGraph:
-                def __init__(self):
-                    self.graph = graph
-                    self.region_tracker = tracker
-                    self.nn_modules = FakeRootModule({})
-
-                def install_subgraph(self, name, subgraph):
-                    return ""
-
-            splits = [
-                n
-                for n in graph.nodes
-                if n.op == "call_function" and n.target == torch.split
-            ]
-            for split in splits:
-                tracker.node_to_duplicates.pop(split)
-
-            apply_graph_deduplication(MockOutputGraph())
-            self.assertExpectedInline(
-                graph,
-                """\
-graph():
-    %_unnamed : [num_users=4] = get_attr[target=]
-    %l_x_ : torch.Tensor [num_users=4] = placeholder[target=L_x_]
-    %l_y_ : torch.Tensor [num_users=4] = placeholder[target=L_y_]
-    %split : [num_users=2] = call_function[target=torch.functional.split](args = (%l_x_, 5), kwargs = {})
-    %x0 : [num_users=1] = call_function[target=operator.getitem](args = (%split, 0), kwargs = {})
-    %x1 : [num_users=1] = call_function[target=operator.getitem](args = (%split, 1), kwargs = {})
-    %split_1 : [num_users=2] = call_function[target=torch.functional.split](args = (%l_x_, 5), kwargs = {})
-    %x0_1 : [num_users=1] = call_function[target=operator.getitem](args = (%split_1, 0), kwargs = {})
-    %x1_1 : [num_users=1] = call_function[target=operator.getitem](args = (%split_1, 1), kwargs = {})
-    %split_2 : [num_users=2] = call_function[target=torch.functional.split](args = (%l_x_, 5), kwargs = {})
-    %x0_2 : [num_users=1] = call_function[target=operator.getitem](args = (%split_2, 0), kwargs = {})
-    %x1_2 : [num_users=1] = call_function[target=operator.getitem](args = (%split_2, 1), kwargs = {})
-    %split_3 : [num_users=2] = call_function[target=torch.functional.split](args = (%l_x_, 5), kwargs = {})
-    %x0_3 : [num_users=1] = call_function[target=operator.getitem](args = (%split_3, 0), kwargs = {})
-    %x1_3 : [num_users=1] = call_function[target=operator.getitem](args = (%split_3, 1), kwargs = {})
-    %invoke_subgraph : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%_unnamed, , %x0, %x1, %l_y_), kwargs = {})
-    %getitem_8 : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph, 0), kwargs = {})
-    %sum_1 : [num_users=1] = call_method[target=sum](args = (%getitem_8,), kwargs = {})
-    %invoke_subgraph_1 : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%_unnamed, , %x0_1, %x1_1, %l_y_), kwargs = {})
-    %getitem_9 : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph_1, 0), kwargs = {})
-    %sum_2 : [num_users=1] = call_method[target=sum](args = (%getitem_9,), kwargs = {})
-    %add_8 : [num_users=1] = call_function[target=operator.add](args = (%sum_1, %sum_2), kwargs = {})
-    %invoke_subgraph_2 : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%_unnamed, , %x0_2, %x1_2, %l_y_), kwargs = {})
-    %getitem_10 : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph_2, 0), kwargs = {})
-    %sum_3 : [num_users=1] = call_method[target=sum](args = (%getitem_10,), kwargs = {})
-    %add_9 : [num_users=1] = call_function[target=operator.add](args = (%add_8, %sum_3), kwargs = {})
-    %invoke_subgraph_3 : [num_users=1] = call_function[target=torch.ops.higher_order.invoke_subgraph](args = (%_unnamed, , %x0_3, %x1_3, %l_y_), kwargs = {})
-    %getitem_11 : [num_users=1] = call_function[target=operator.getitem](args = (%invoke_subgraph_3, 0), kwargs = {})
-    %sum_4 : [num_users=1] = call_method[target=sum](args = (%getitem_11,), kwargs = {})
-    %add_10 : [num_users=1] = call_function[target=operator.add](args = (%add_9, %sum_4), kwargs = {})
-    return (add_10,)""",
-            )

     def test_param_transfer_to_submodule(self):
         def inner_fn(x, y):
             return x + y + y + x


@@ -9,28 +9,6 @@ from torch._dynamo.testing import extract_graph_and_tracker
 from torch.utils._pytree import tree_map

-def get_nodes_by_name(graph, names):
-    nodes = []
-    for node in graph.nodes:
-        if node.name in names:
-            nodes.append(node)
-
-    return nodes
-
-
-unique_ind = 0
-
-
-def track_same_nodes(names, graph, region_tracker):
-    global unique_ind
-    unique_ind += 1
-    # find nodes in graph with names and track them
-    # as if they were at the same code location
-    nodes = get_nodes_by_name(graph, names)
-    for node in nodes:
-        region_tracker.track_node("x", unique_ind, node)
-
-
 class GraphRegionTrackerTests(TestCase):
     def setUp(self):
         self.exit_stack = contextlib.ExitStack()
@@ -378,6 +356,35 @@ class GraphRegionTrackerTests(TestCase):
         _sort_with_ref_region(index_to_rank, regions)
         self.assertExpectedInline(regions, """[[0, 2, 1], [1, 0, 2]]""")

+    def test_no_duplicate_tracking(self):
+        def inner_fn(x, y):
+            x0 = x + 1
+            y0 = y + 2
+            z = x0.sum() + y0.sum()
+            return z
+
+        def fn(x, y):
+            o0 = inner_fn(x, y)
+            o1 = torch.sin(y)
+            o2 = inner_fn(x, o1)
+            o3 = inner_fn(x, y)
+            o4 = o3 * o3
+            return o2 * o4 + o0
+
+        graph, tracker = extract_graph_and_tracker(
+            fn, torch.rand(10, 10), torch.ones(10, 20)
+        )
+        self.assertExpectedInline(
+            tracker.node_to_duplicates,
+            """{l_x_: [l_x_], x0: [x0, x0_1, x0_2], l_y_: [l_y_], y0: [y0, y0_1, y0_2], sum_1: \
+[sum_1, sum_3, sum_5], sum_2: [sum_2, sum_4, sum_6], z: [z, z_1, z_2], o1: [o1], x0_1: [x0, x0_1, x0_2], y0_1: [y0, y0_1, y0_2], \
+sum_3: [sum_1, sum_3, sum_5], sum_4: [sum_2, sum_4, sum_6], \
+z_1: [z, z_1, z_2], x0_2: [x0, x0_1, x0_2], y0_2: [y0, y0_1, y0_2], sum_5: [sum_1, sum_3, sum_5], sum_6: [sum_2, sum_4, sum_6], \
+z_2: [z, z_1, z_2], o4: [o4], mul_1: [mul_1], add_9: [add_9]}""",
+        )
+        key = next(iter(tracker.node_to_duplicates.keys()))
+        tracker.track_node(None, key)  # this will fail if the node is added again
+

 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
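A toy illustration (not Dynamo's actual bookkeeping) of the invariant test_no_duplicate_tracking asserts: nodes observed at the same code location share one duplicates list, and re-tracking an already-tracked node must not append it a second time:

from collections import defaultdict

groups: dict[str, list[str]] = defaultdict(list)

def track_node(location, node):
    # Skip nodes already tracked at this location.
    if node not in groups[location]:
        groups[location].append(node)

for node in ["x0", "x0_1", "x0_2", "x0"]:  # "x0" arrives twice
    track_node("inner_fn:1", node)

assert groups["inner_fn:1"] == ["x0", "x0_1", "x0_2"]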


@@ -4,9 +4,58 @@ from unittest.mock import patch

 import torch
 import torch._dynamo.test_case
 import torch._dynamo.testing
+from torch._dynamo import config as dc


 class RecompileTests(torch._dynamo.test_case.TestCase):
+    def test_inline_inbuilt_nn_modules_candidate(self):
+        def hook_flag_on(guard_manager, f_locals, builder):
+            self.assertTrue(
+                "[inline-inbuilt-nn-modules-candidate]" not in str(guard_manager)
+            )
+
+        def hook_flag_off(guard_manager, f_locals, builder):
+            self.assertTrue(
+                "[inline-inbuilt-nn-modules-candidate]" in str(guard_manager)
+            )
+
+        class SubMod(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(2, 2)
+
+            @torch.compile(backend="eager")
+            def forward(self, x):
+                return self.linear(x)
+
+        class Mod(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.sm1 = SubMod()
+                self.sm2 = SubMod()
+
+            def forward(self, x):
+                return self.sm1(x) + self.sm2(x)
+
+        try:
+            from .utils import install_guard_manager_testing_hook
+        except ImportError:
+            from utils import install_guard_manager_testing_hook
+
+        with (
+            install_guard_manager_testing_hook(hook_flag_on),
+            dc.patch(inline_inbuilt_nn_modules=True),
+        ):
+            mod = Mod()
+            mod(torch.randn(2, 2))
+
+        with (
+            install_guard_manager_testing_hook(hook_flag_off),
+            dc.patch(inline_inbuilt_nn_modules=False),
+        ):
+            mod = Mod()
+            mod(torch.randn(2, 2))
+
     def test_automatic_dynamic_reduce_recompiles(self):
         # Test the counterfactual, lots of recompiles without this config
         def foo(x, y):


@@ -25,10 +25,10 @@ from torch.testing._internal.common_utils import find_free_port
-from torch.testing._internal.triton_utils import requires_cuda_and_triton
+requires_cuda_and_triton = unittest.skipUnless(HAS_CUDA, "requires cuda")

 if torch.distributed.is_available():
     from torch.testing._internal.distributed.fake_pg import FakeStore

 HAS_TLPARSE = shutil.which("tlparse") is not None
 requires_tlparse = unittest.skipUnless(HAS_TLPARSE, "requires tlparse")
 requires_distributed = functools.partial(
@@ -1198,13 +1198,13 @@ def forward(self, x_1: "f32[2][1]cpu"):
     @contextmanager
     def _setup_runtime_estimates_capture(self):
-        """Helper to turn on and capture the 'inductor_tlparse_runtime' structured trace."""
+        """Helper to turn on and capture the combined 'inductor_runtime_and_tensor_meta' structured trace."""
         payload_buffer = io.StringIO()
         payload_handler = logging.StreamHandler(payload_buffer)
         payload_handler.setLevel(logging.DEBUG)
         payload_handler.setFormatter(StructuredTracePayloadFormatter())
         payload_handler.addFilter(
-            StructuredTraceTestingFilter("inductor_tlparse_runtime")
+            StructuredTraceTestingFilter("inductor_runtime_and_tensor_meta")
         )
         trace_log.addHandler(payload_handler)
         try:
@@ -1245,8 +1245,10 @@ def forward(self, x_1: "f32[2][1]cpu"):
                 compiled = torch.compile(mod, backend="inductor")
                 compiled(torch.randn(4, 4, device="cuda"))

-                # Verify runtime estimates artifact was logged
-                self.assertIn('"inductor_tlparse_runtime"', self.buffer.getvalue())
+                # Verify runtime + tensor meta artifact was logged
+                self.assertIn(
+                    '"inductor_runtime_and_tensor_meta"', self.buffer.getvalue()
+                )

                 payload_content = payload_buffer.getvalue().strip()
                 if payload_content:
@@ -1310,8 +1312,10 @@ def forward(self, x_1: "f32[2][1]cpu"):
                 compiled = torch.compile(mod, backend="inductor")
                 compiled(torch.randn(4, 4, device="cuda"))

-                # Verify runtime estimates artifact was logged
-                self.assertIn('"inductor_tlparse_runtime"', self.buffer.getvalue())
+                # Verify artifact was logged
+                self.assertIn(
+                    '"inductor_runtime_and_tensor_meta"', self.buffer.getvalue()
+                )

                 payload_content = payload_buffer.getvalue().strip()
                 if payload_content:
@@ -1333,6 +1337,145 @@ def forward(self, x_1: "f32[2][1]cpu"):
         finally:
             dist.destroy_process_group()

+    @requires_tlparse
+    @requires_distributed()
+    @requires_cuda_and_triton
+    @torch._inductor.config.patch("fx_graph_cache", False)
+    @torch._inductor.config.patch("log_tlparse", True)
+    def test_tensor_metadata_logging_multiple_ops(self):
+        import torch.distributed as dist
+
+        store = FakeStore()
+        dist.init_process_group(backend="fake", rank=0, world_size=2, store=store)
+
+        class Mixed(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.linear = torch.nn.Linear(4, 4)
+
+            def forward(self, x):
+                y = torch.relu(self.linear(x))
+                y = torch.ops._c10d_functional.all_reduce.default(y, "sum", "0")
+                y = torch.ops._c10d_functional.wait_tensor.default(y)
+                return y + 1
+
+        try:
+            with self._setup_runtime_estimates_capture() as payload_buffer:
+                torch._dynamo.reset()
+                mod = Mixed().cuda()
+                compiled = torch.compile(mod, backend="inductor")
+                compiled(torch.randn(4, 4, device="cuda"))
+
+                payload = payload_buffer.getvalue().strip()
+                if payload:
+                    data = json.loads(payload)
+                    types = sorted({op.get("type") for op in data.get("ops", [])})
+                    self.assertExpectedInline(
+                        str(types), """['collective', 'compute']"""
+                    )
+            self.assertParses()
+        finally:
+            dist.destroy_process_group()
+
+    @requires_tlparse
+    @torch._inductor.config.patch("log_tlparse", True)
+    def test_tensor_metadata_logging(self):
+        """Emit unified runtime+tensor-metadata artifact and assert a stable simplified JSON inline."""
+        with self._setup_runtime_estimates_capture() as payload_buffer:
+
+            def f(x):
+                y = x.transpose(0, 1)
+                z = y.mean(dim=0)
+                w = z.to(torch.float16)
+                return w
+
+            compiled = torch.compile(f, backend="inductor", fullgraph=True)
+            compiled(torch.ones(2, 3))
+
+            # Verify artifact was logged
+            self.assertIn('"inductor_runtime_and_tensor_meta"', self.buffer.getvalue())
+
+            payload = payload_buffer.getvalue().strip()
+            if payload:
+                data = json.loads(payload)
+                ops = data.get("ops", [])
+                simplified_ops = []
+                for op in ops:
+                    outs = [
+                        {
+                            "shape": out.get("shape", []),
+                            "stride": out.get("stride", []),
+                            "dtype": out.get("dtype", None),
+                        }
+                        for out in op.get("outputs", [])
+                    ]
+                    if outs:
+                        simplified_ops.append(
+                            {
+                                "type": op.get("type", ""),
+                                "outputs": outs,
+                            }
+                        )
+                self.assertExpectedInline(
+                    {"ops": simplified_ops[-1:]} if simplified_ops else {"ops": []},
+                    """{'ops': [{'type': 'compute', 'outputs': [{'shape': [2], 'stride': [1], 'dtype': 'float16'}]}]}""",
+                )
+        self.assertParses()
+
+    @requires_tlparse
+    @torch._inductor.config.patch("log_tlparse", True)
+    def test_tensor_metadata_logging_dynamic_shapes(self):
+        """Same as test_tensor_metadata_logging, but with dynamic shapes enabled to cover to_size_hints."""
+        with self._setup_runtime_estimates_capture() as payload_buffer:
+
+            def f(x):
+                y = x.transpose(0, 1)
+                z = y.mean(dim=0)
+                w = z.to(torch.float16)
+                return w
+
+            compiled = torch.compile(f, backend="inductor", dynamic=True)
+            compiled(torch.ones(2, 3))
+
+            # Verify artifact was logged
+            self.assertIn('"inductor_runtime_and_tensor_meta"', self.buffer.getvalue())
+
+            payload = payload_buffer.getvalue().strip()
+            if payload:
+                data = json.loads(payload)
+                ops = data.get("ops", [])
+                simplified_ops = []
+                for op in ops:
+                    outs = [
+                        {
+                            "shape": out.get("shape", []),
+                            "stride": out.get("stride", []),
+                            "dtype": out.get("dtype", None),
+                        }
+                        for out in op.get("outputs", [])
+                    ]
+                    if outs:
+                        simplified_ops.append(
+                            {
+                                "type": op.get("type", ""),
+                                "outputs": outs,
+                            }
+                        )
+                self.assertExpectedInline(
+                    {"ops": simplified_ops[-1:]} if simplified_ops else {"ops": []},
+                    (
+                        "{'ops': [{'type': 'compute', 'outputs': ["
+                        "{'shape': [2], 'stride': [1], 'dtype': 'float32'}, "
+                        "{'shape': [2], 'stride': [1], 'dtype': 'float16'}]}]}"
+                    ),
+                )
+        self.assertParses()
+

 if __name__ == "__main__":
     from torch._dynamo.test_case import run_tests
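Both tensor-metadata tests post-process the logged JSON payload with the same reduction; standalone, the loop is equivalent to this snippet (the sample payload is invented for illustration):

import json

payload = json.dumps(
    {"ops": [{"type": "compute", "runtime_us": 12,
              "outputs": [{"shape": [2], "stride": [1], "dtype": "float16"}]}]}
)

data = json.loads(payload)
simplified_ops = [
    {
        "type": op.get("type", ""),
        "outputs": [
            {
                "shape": out.get("shape", []),
                "stride": out.get("stride", []),
                "dtype": out.get("dtype", None),
            }
            for out in op.get("outputs", [])
        ],
    }
    for op in data.get("ops", [])
    if op.get("outputs")  # drop ops without output tensors
]
assert simplified_ops == [
    {"type": "compute",
     "outputs": [{"shape": [2], "stride": [1], "dtype": "float16"}]}
]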


@@ -500,6 +500,7 @@ class TestDynamoTimed(TestCase):
         'inductor_fx_remote_cache_hit_keys': None,
         'inductor_fx_remote_cache_miss_count': None,
         'inductor_fx_remote_cache_miss_keys': None,
+        'inline_inbuilt_nn_modules_candidate': False,
         'is_forward': True,
         'is_runtime': False,
         'joint_graph_pass_time_us': 0,
@@ -583,6 +584,7 @@ class TestDynamoTimed(TestCase):
         'inductor_fx_remote_cache_hit_keys': None,
         'inductor_fx_remote_cache_miss_count': None,
         'inductor_fx_remote_cache_miss_keys': None,
+        'inline_inbuilt_nn_modules_candidate': False,
         'is_forward': True,
         'is_runtime': False,
         'joint_graph_pass_time_us': 0,
@@ -677,6 +679,7 @@ class TestDynamoTimed(TestCase):
         'inductor_fx_remote_cache_hit_keys': None,
         'inductor_fx_remote_cache_miss_count': None,
         'inductor_fx_remote_cache_miss_keys': None,
+        'inline_inbuilt_nn_modules_candidate': False,
         'is_forward': False,
         'is_runtime': False,
         'joint_graph_pass_time_us': None,
@@ -760,6 +763,7 @@ class TestDynamoTimed(TestCase):
         'inductor_fx_remote_cache_hit_keys': None,
         'inductor_fx_remote_cache_miss_count': None,
         'inductor_fx_remote_cache_miss_keys': None,
+        'inline_inbuilt_nn_modules_candidate': False,
         'is_forward': False,
         'is_runtime': False,
         'joint_graph_pass_time_us': None,

Some files were not shown because too many files have changed in this diff.